In [1]:
import numpy as np
import pandas as pd

import shutil

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from torch.nn.modules import Module
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor, Resize
import torchvision.transforms as transforms
import torchvision.models as models

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import tqdm.notebook as t

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
BATCH_SIZE = 16
EPOCHS = 20
DEVICE = 'cuda'

In [3]:
main_path = '../input/wonders-of-the-world-image-classification/Wonders of World/Wonders of World'
dir_list = os.listdir(main_path)

for num, path in enumerate(dir_list):
    print(f'{path} : {len(os.listdir(os.path.join(main_path, path)))}')

great_wall_of_china : 392
chichen_itza : 340
statue_of_liberty : 238
christ_the_reedemer : 323
roman_colosseum : 394
burj_khalifa : 390
pyramids_of_giza : 372
taj_mahal : 158
venezuela_angel_falls : 251
eiffel_tower : 391
machu_pichu : 393
stonehenge : 204


In [4]:
def get_train_val_image_list(main_path, train_size, val_size):
    train_set = dict()
    val_set = dict()
    
    dir_list = os.listdir(main_path)
    
    for path in t.tqdm_notebook(dir_list):
        file_list = os.listdir(os.path.join(main_path, path))

        total_files = len(file_list)
        train_list = file_list[:train_size]
        val_list = file_list[train_size:train_size+val_size]
        
        train_set[path] = [os.path.join(main_path, path,n) for n in t.tqdm_notebook(train_list)]
        val_set[path] = [os.path.join(main_path, path,n) for n in t.tqdm_notebook(val_list)]
        
        
    return train_set, val_set
    
def get_class_list(train_set):
    idx_to_classes = dict()
    classes_to_idx = dict()
    
    for count, key in enumerate(train_set.keys()):
        idx_to_classes[count] = key
        classes_to_idx[key] = count
        
    return idx_to_classes, classes_to_idx
def get_average_height_width(train_set):
    total_files = 0
    total_height = 0
    total_width = 0
    
    for key, img_list in t.tqdm_notebook(train_set.items()):
        for img in img_list:
            total_width += img.shape[0]
            total_height += img.shape[1]
            total_files += 1
            
    return int(total_width / total_files), int(total_height / total_files)

In [5]:
train_list, val_list = get_train_val_image_list(main_path, train_size=150, val_size=8)
idx_to_classes, classes_to_idx = get_class_list(train_list)

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
def copy_files(dirname, dest, img_list):
    if os.path.isdir(os.path.join(dest, dirname)) == False:
        os.mkdir(os.path.join(dest, dirname))
    
    for key, img_path_list in img_list.items():
        if os.path.isdir(os.path.join(dest,dirname, key)) == False:
            os.mkdir(os.path.join(dest,dirname, key))
        for path in img_path_list:
            shutil.copy(path,os.path.join(dest,dirname, key))
        

In [7]:
copy_files('Train', './', train_list)
copy_files('Valid', './', val_list)

In [8]:
# tr_avg_width, tr_avg_height = get_average_height_width(train_list)
# val_avg_width, val_avg_height = get_average_height_width(val_list)
# print(f'Train - Width(Avg) : {tr_avg_width}, Height(Avg) : {tr_avg_height}')
# print(f'Val - Width(Avg) : {val_avg_width}, Height(Avg) : {val_avg_height}')

In [9]:
class WoWDataset(Dataset):
    def __init__(self, data, num_classes ):
        self.data = data
        self.num_classes = num_classes
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        
        img = self.data[idx][0]
        label = self.data[idx][1]
        
        o_label = [0] * self.num_classes
        o_label[label] = 1
        r = transforms.Resize(size=(298,211))
        o_label = torch.tensor(o_label, dtype=torch.float)
        return r(img), o_label

In [10]:
train_data = ImageFolder('./Train', transform = ToTensor())
val_data = ImageFolder('./Valid', transform = ToTensor())

In [11]:
train_dataset = WoWDataset(train_data, len(list(idx_to_classes.keys())))
val_dataset = WoWDataset(val_data, len(list(idx_to_classes.keys())))

In [12]:
train_dl = DataLoader(train_dataset, BATCH_SIZE, pin_memory=True, shuffle=True)
val_dl =DataLoader(val_dataset, BATCH_SIZE, pin_memory=True)

In [13]:
class WoWModel(Module):
    def __init__(self, num_channels):
        super(WoWModel, self).__init__()
        
        self.loss_fn = nn.BCELoss(reduction='mean')

        self.transform1 = torch.nn.Sequential(
            transforms.RandomRotation(75),
            transforms.RandomGrayscale(),
            transforms.RandomHorizontalFlip()
        )
        
        self.convnext_large = models.convnext_large(pretrained=True)
        self.flatten1 = nn.Flatten()
        self.linear1 = nn.Linear(1000,1000)
        self.linear2 = nn.Linear(1000,1000)
        self.linear3 = nn.Linear(1000,12)
        self.sigmoid1 = nn.Sigmoid()
        
    def forward(self, x, y):

        output = x
        output = self.transform1(output)
        output = self.convnext_large(output)
        output = self.linear1(output)
        output = self.linear2(output)
        output = self.linear3(output)
        output = self.sigmoid1(output)

        loss = self.loss_fn(output, y)
        
        return loss, output
    
    def predict(self, x):
        
        output = x
        output = self.convnext_large(output) 
        output = self.linear1(output)
        output = self.linear2(output)
        output = self.linear3(output)
        output = self.sigmoid1(output)

        return np.argmax(output.detach().numpy(), axis=1)
    
    def train_using_train_dl(self, optimizer, train_dl, device):
        
        self.train()
        
        losses = []
        acc = []
        pre = []
        re = []
        f1 = []
        
        total = len(train_dl)

        for idx , data in enumerate(train_dl):
            image, label = data
            image = image.to(device)
            label = label.to(device)
            
            optimizer.zero_grad()
            
            loss, output = self(image, label)
            losses.append(loss.item())
            loss.backward()
            
            optimizer.step()
            
            acc.append(accuracy_score(np.argmax(label.cpu(), axis=1), np.argmax(output.cpu().detach().numpy(), axis=1)))
            pre.append(precision_score(np.argmax(label.cpu(), axis=1), np.argmax(output.cpu().detach().numpy(), axis=1), average='macro', zero_division=1))
            re.append(recall_score(np.argmax(label.cpu(), axis=1), np.argmax(output.cpu().detach().numpy(), axis=1), average='macro', zero_division=1))
            f1.append(f1_score(np.argmax(label.cpu(), axis=1), np.argmax(output.cpu().detach().numpy(), axis=1), average='macro', zero_division=1))
            
            print(f'{idx} / {total} - Loss : {sum(losses) / len(losses):0.4f}, Accuracy : {sum(acc) / len(acc):0.4f}, F1 : {sum(f1) / len(f1):0.4f} P : {sum(pre) / len(pre):0.4f}, R : {sum(re)/ len(re):0.4f}', end='\r')
            
        return sum(losses) / len(losses) , sum(acc) / len(acc), sum(f1) / len(f1), sum(pre) / len(pre), sum(re) / len(re)
    
    def evaluate_using_val_dl(self, val_dl, device):
        
        self.eval()
        
        losses = []
        acc = []
        pre = []
        re = []
        f1 = []
        total = len(val_dl)
        
        with torch.no_grad():
            
            for idx, data in enumerate(val_dl):
                image, label = data
                image = image.to(device)
                label = label.to(device)

                loss, output = self(image, label)
                losses.append(loss.item())

                acc.append(accuracy_score(np.argmax(label.cpu(), axis=1), np.argmax(output.cpu().detach().numpy(), axis=1)))
                pre.append(precision_score(np.argmax(label.cpu(), axis=1), np.argmax(output.cpu().detach().numpy(), axis=1), average='macro', zero_division=1))
                re.append(recall_score(np.argmax(label.cpu(), axis=1), np.argmax(output.cpu().detach().numpy(), axis=1), average='macro', zero_division=1))
                f1.append(f1_score(np.argmax(label.cpu(), axis=1), np.argmax(output.cpu().detach().numpy(), axis=1), average='macro', zero_division=1))

                print(f'{idx} / {total} - Val_Loss : {sum(losses) / len(losses):0.4f}, Val_Accuracy : {sum(acc) / len(acc):0.4f}, Val_F1 : {sum(f1) / len(f1):0.4f}, Val_P : {sum(pre) / len(pre):0.4f}, Val_R : {sum(re)/ len(re):0.4f}', end='\r')
            print('', end='\r')
        return sum(losses) / len(losses) , sum(acc) / len(acc), sum(f1) / len(f1), sum(pre) / len(pre), sum(re) / len(re)

In [14]:
model = WoWModel(num_channels=3)
model.to(DEVICE)

Downloading: "https://download.pytorch.org/models/convnext_large-ea097f82.pth" to /root/.cache/torch/hub/checkpoints/convnext_large-ea097f82.pth


  0%|          | 0.00/755M [00:00<?, ?B/s]

WoWModel(
  (loss_fn): BCELoss()
  (transform1): Sequential(
    (0): RandomRotation(degrees=[-75.0, 75.0], interpolation=nearest, expand=False, fill=0)
    (1): RandomGrayscale(p=0.1)
    (2): RandomHorizontalFlip(p=0.5)
  )
  (convnext_large): ConvNeXt(
    (features): Sequential(
      (0): ConvNormActivation(
        (0): Conv2d(3, 192, kernel_size=(4, 4), stride=(4, 4))
        (1): LayerNorm2d((192,), eps=1e-06, elementwise_affine=True)
      )
      (1): Sequential(
        (0): CNBlock(
          (block): Sequential(
            (0): Conv2d(192, 192, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=192)
            (1): Permute()
            (2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
            (3): Linear(in_features=192, out_features=768, bias=True)
            (4): GELU()
            (5): Linear(in_features=768, out_features=192, bias=True)
            (6): Permute()
          )
          (stochastic_depth): StochasticDepth(p=0.0, mode=row)
        

In [15]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, eps=0.1)

In [16]:
def train_model(epochs, model, optimizer, train_dl, val_dl, device):
    
    history = {
                'train_loss' : [],
                'val_loss' : [],
                'train_accuracy' : [],
                'val_accuracy' : [],
                'F1' : [],
                'val_F1' : [],
                'P' :[],
                'val_P':[],
                'Recall' : [],
                'val_Recall' : []
              }
    
    for epoch in range(epochs):
        print(f'EPOCH : {epoch + 1} / {epochs}')
        
        train_loss, train_acc, train_f1, train_pre, train_re = model.train_using_train_dl(optimizer, train_dl, device)
        val_loss, val_acc, val_f1, val_pre, val_re = model.evaluate_using_val_dl(val_dl, device)
        
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['train_accuracy'].append(train_acc)
        history['val_accuracy'].append(val_acc)
        history['F1'].append(train_f1)
        history['val_F1'].append(val_f1)
        history['P'].append(train_pre)
        history['val_P'].append(val_pre)
        history['Recall'].append(train_re)
        history['val_Recall'].append(val_re)
        
        print(f'Loss : {train_loss:0.4f}, Accuracy : {train_acc:0.4f}, F1 : {train_f1:0.4f}, P :{train_pre:0.4f}, R : {train_re:0.4f}')
        print(f'Val_Loss : {val_loss:0.4f}, Val_Accuracy : {val_acc:0.4f}, Val_F1 : {val_f1:0.4f}, Val_P :{val_pre:0.4f}, Val_R : {val_re:0.4f}')

    return history
        
        

In [17]:
train_model(EPOCHS, model, optimizer, train_dl, val_dl, DEVICE)

EPOCH : 1 / 20
Loss : 0.6109, Accuracy : 0.1366, F1 : 0.0835, P :0.5528, R : 0.2356
Val_Loss : 0.4681, Val_Accuracy : 0.1146, Val_F1 : 0.0619, Val_P :0.3840, Val_R : 0.6441
EPOCH : 2 / 20
Loss : 0.3772, Accuracy : 0.1289, F1 : 0.0662, P :0.6584, R : 0.2183
Val_Loss : 0.3088, Val_Accuracy : 0.1562, Val_F1 : 0.1079, Val_P :0.4472, Val_R : 0.6111
EPOCH : 3 / 20
Loss : 0.2999, Accuracy : 0.1394, F1 : 0.0727, P :0.5986, R : 0.2470
Val_Loss : 0.2868, Val_Accuracy : 0.1562, Val_F1 : 0.0912, Val_P :0.4426, Val_R : 0.6236
EPOCH : 4 / 20
Loss : 0.2880, Accuracy : 0.1488, F1 : 0.0895, P :0.5173, R : 0.2752
Val_Loss : 0.2849, Val_Accuracy : 0.1458, Val_F1 : 0.0709, Val_P :0.2994, Val_R : 0.7094
EPOCH : 5 / 20
Loss : 0.2854, Accuracy : 0.1831, F1 : 0.1200, P :0.4969, R : 0.2956
Val_Loss : 0.2822, Val_Accuracy : 0.2188, Val_F1 : 0.1030, Val_P :0.3365, Val_R : 0.7383
EPOCH : 6 / 20
Loss : 0.2844, Accuracy : 0.1908, F1 : 0.1325, P :0.4495, R : 0.3206
Val_Loss : 0.2824, Val_Accuracy : 0.3125, Val_F1 : 

{'train_loss': [0.6109277186668025,
  0.3772444094704316,
  0.2998670305298493,
  0.2880085820117883,
  0.2854010278144769,
  0.2843675117577072,
  0.28364595190613673,
  0.28285270190871925,
  0.2821515877689936,
  0.28150986411930184,
  0.2803085878887008,
  0.27917493207264793,
  0.2782210900192767,
  0.27701098940013785,
  0.2759127416441926,
  0.2751634996549218,
  0.27380993645803064,
  0.2723854732724418,
  0.2711119290474242,
  0.2696743599608936],
 'val_loss': [0.4681355108817418,
  0.3088085154692332,
  0.286820391813914,
  0.284920593102773,
  0.2822144528230031,
  0.28241464992364246,
  0.28110351661841076,
  0.2815433194239934,
  0.27924155195554096,
  0.2782149612903595,
  0.27649391690889996,
  0.27605869869391125,
  0.27553897599379223,
  0.2733260641495387,
  0.270658274491628,
  0.27076280613740283,
  0.2696342021226883,
  0.2692180375258128,
  0.2653445452451706,
  0.2657722781101863],
 'train_accuracy': [0.13661504424778761,
  0.1288716814159292,
  0.139380530973451