In [1]:
import os
import random
import time
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tempfile import TemporaryDirectory

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, random_split

import torchvision
from torchvision import datasets, models, transforms

import wandb

# Let cuDNN benchmark convolution algorithms and reuse the fastest one
# (inputs in this notebook are resized to a fixed 224x224, so shapes repeat).
cudnn.benchmark = True
# Switch matplotlib to interactive mode.
plt.ion()

# Authenticate with Weights & Biases WITHOUT embedding the secret in the notebook.
# wandb.login() picks the key up from the WANDB_API_KEY environment variable
# (e.g. exposed through Kaggle secrets) or from an existing ~/.netrc entry.
# SECURITY: an API key was previously hardcoded on this line and has therefore
# been exposed to anyone with access to the notebook -- it should be rotated.
wandb.login()


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [2]:
# Custom Dataset class
class CustomDataset(Dataset):
    """Image dataset read from a directory tree of the form ``data_dir/<label>/<image>``.

    Each immediate sub-directory of ``data_dir`` is one class, and its name is
    converted with ``int()`` and used directly as the target label (it is NOT
    re-indexed to ``0..n_classes-1``).

    Params
    --------
        data_dir (str): root directory containing one sub-directory per class
        transform (callable, optional): applied to the RGB PIL image in ``__getitem__``

    Attributes
    --------
        classes (list of str): sorted class directory names
        image_paths (list of str): path of every sample
        labels (list of int): integer label of every sample, aligned with ``image_paths``
    """

    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.classes = self._find_classes()
        self.image_paths, self.labels = self._load_data()

    def _find_classes(self):
        """Return the sorted names of the sub-directories of ``data_dir``."""
        return sorted(
            entry for entry in os.listdir(self.data_dir)
            if os.path.isdir(os.path.join(self.data_dir, entry))
        )

    def _load_data(self):
        """Collect ``(image_paths, labels)`` for every regular file below each class directory.

        Raises
        --------
            ValueError: if a class directory that contains files is not named with an integer
        """
        image_paths = []
        labels = []
        for class_name in self.classes:
            class_dir = os.path.join(self.data_dir, class_name)
            # Sort so the sample order is reproducible: os.listdir order is arbitrary.
            candidates = (os.path.join(class_dir, name) for name in sorted(os.listdir(class_dir)))
            # Skip nested directories; they cannot be opened as images later on.
            files = [path for path in candidates if os.path.isfile(path)]
            if not files:
                continue
            try:
                label = int(class_name)
            except ValueError as exc:
                raise ValueError(
                    f"Class directory {class_dir!r} must be named with an integer label"
                ) from exc
            image_paths.extend(files)
            labels.extend([label] * len(files))
        return image_paths, labels

    def __len__(self):
        """Number of samples found under ``data_dir``."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``; ``image`` is transformed if a transform was given."""
        img_path = self.image_paths[idx]
        label = self.labels[idx]  # already an int, see _load_data
        # The context manager closes the underlying file handle deterministically;
        # convert() returns a new, fully loaded image that outlives it.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label

In [3]:
# Hyperparameters

# Mini-batch size used by both dataloaders; lower it if the hardware runs out of memory
batch_size = 64

# Optimisation settings
# num_classes sizes the classifier head. Labels are the integer folder names,
# so presumably this must exceed the largest label id -- TODO confirm.
num_classes = 2139
learning_rate = 1e-3
num_epochs = 100

# StepLR settings: period of the schedule and multiplicative factor
step_size = 7
gamma = 0.1


In [4]:
# Dataset folders (Kaggle input mount); train and val live side by side
dataset_root = '/kaggle/input/wb-recognition-dataset/wb_recognition_dataset'
train_dir = f'{dataset_root}/train'
val_dir = f'{dataset_root}/val'

In [5]:
# Preprocessing pipelines, one independent Compose per phase.
# Both phases apply the same steps: resize to 224x224, convert to a tensor,
# then normalise each channel (presumably the ImageNet statistics that go
# with the pretrained backbone -- TODO confirm).
data_transforms = {
    phase: transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    for phase in ('train', 'val')
}

In [6]:
# One dataset per phase, each built from its own folder and transform
phase_dirs = {'train': train_dir, 'val': val_dir}
image_datasets = {
    phase: CustomDataset(folder, data_transforms[phase])
    for phase, folder in phase_dirs.items()
}

# Batch iterators; only the training split is shuffled
dataloaders = {
    phase: DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=(phase == 'train'),
        num_workers=4,
        pin_memory=True,
    )
    for phase, dataset in image_datasets.items()
}

# Number of samples in each split
dataset_sizes = {phase: len(dataset) for phase, dataset in image_datasets.items()}

In [7]:
# Report how many images and class folders each split contains
for phase in ('train', 'val'):
    print(f'Number of images in {phase}: ', dataset_sizes[phase])
    print(f'Number of labels in {phase}: ', len(image_datasets[phase].classes))

Number of images in train:  56813
Number of labels in train:  2130
Number of images in val:  1392
Number of labels in val:  595


In [8]:
# pip install efficientnet-pytorch

In [9]:
# Backbone: ResNet-18 with torchvision's default pretrained weights.
# (An EfficientNet-b0 from the efficientnet_pytorch package was tried as an
# alternative: EfficientNet.from_pretrained('efficientnet-b0').)
model = models.resnet18(weights='DEFAULT')

# Swap the final fully connected layer for one with num_classes outputs
num_ftrs = model.fc.in_features
model.fc = nn.Linear(in_features=num_ftrs, out_features=num_classes)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 105MB/s] 


In [11]:
# Objective and optimiser: cross-entropy loss minimised with SGD + momentum
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9)

# Multiply the learning rate by `gamma` every `step_size` scheduler steps
# (with gamma = 0.1 the rate drops to 10% of its previous value each time)
scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

In [12]:
# Pick the first CUDA device when one is available, otherwise stay on the CPU,
# then move the model's parameters there
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
model.to(device)
print(device)

cuda:0


In [13]:
# Print the model's layer structure as a visual sanity check
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [14]:
def train_model(model, criterion, optimizer, dataloaders, model_last_pth, model_best_pth, num_epochs, scheduler=None):
    """Train a PyTorch Model

    Runs a training pass followed by a validation pass every epoch, prints the
    per-phase loss/accuracy, logs them to Weights & Biases and checkpoints the
    model state dict.

    Params
    --------
        model (PyTorch model): cnn to train; batches are moved to the device its parameters live on
        criterion (PyTorch loss): objective to minimize
        optimizer (PyTorch optimizier): optimizer that updates the model parameters
        dataloaders (dict of PyTorch dataloader): dataloaders keyed by 'train' and 'val'
        model_last_pth, model_best_pth (str ending in '.pt'): file paths for the state dict
            saved after every epoch and for the one with the lowest validation loss
        num_epochs (int): maximum number of training epochs
        scheduler (PyTorch lr scheduler, optional): stepped once per epoch after the
            training phase. When omitted, a module-level variable named ``scheduler``
            is used if one exists (this is how the function originally found it).

    Returns
    --------
        model (PyTorch model): the same model object, holding the weights of the LAST
            epoch. The lowest-validation-loss weights are only stored at ``model_best_pth``.
    """
    # Backward compatibility: existing callers do not pass a scheduler and rely on the global.
    if scheduler is None:
        scheduler = globals().get('scheduler')

    # Use the model's own device instead of depending on a global `device` variable.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Min validation loss (np.Inf no longer exists in NumPy 2.0, so use the builtin float)
    valid_loss_min = float('inf')

    # Main loop
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1} / {num_epochs}')
        print('-' * 10)

        # Go through training and validation phase each epoch
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                model.train()
            else:
                model.eval()

            # Keep track of loss, corrects and samples seen each epoch
            running_loss = 0.0
            running_corrects = 0
            num_samples = 0

            for inputs, labels in dataloaders[phase]:
                inputs, labels = inputs.to(device), labels.to(device)

                # Gradients are only tracked (and cleared) while training
                with torch.set_grad_enabled(is_training):
                    if is_training:
                        optimizer.zero_grad()

                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backpropagation and update parameters
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # Accumulate as plain Python numbers so the metrics are not tensors
                batch_count = inputs.size(0)
                running_loss += loss.item() * batch_count
                running_corrects += torch.sum(preds == labels).item()
                num_samples += batch_count

            # Step the scheduler once per epoch, after the training phase
            if is_training and scheduler is not None:
                scheduler.step()

            # Average over the samples actually iterated in this phase
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects / num_samples

            # Keep a separate checkpoint of the weights with the lowest validation loss
            if phase == 'val':
                if epoch_loss < valid_loss_min:
                    valid_loss_min = epoch_loss
                    torch.save(model.state_dict(), model_best_pth)

            print(f'{phase}\t Loss: {epoch_loss:.4f}\t Accuracy: {epoch_acc:.4f}')

            wandb.log({f'{phase}_loss': epoch_loss, f'{phase}_acc': epoch_acc})

        # Save model every epoch
        torch.save(model.state_dict(), model_last_pth)

    return model

In [18]:
# Where this run writes its checkpoints: the latest epoch and the best validation loss
working_dir = '/kaggle/working'
model_last_pth = f'{working_dir}/resnet18-imagenet-01-last.pt'
model_best_pth = f'{working_dir}/resnet18-imagenet-01-best.pt'

# Previously trained checkpoint used to initialise the model before training
saved_model_path = '/kaggle/input/resnet18-imagenet-01/pytorch/resnet18-imagenet-01/1/resnet18-ImageNet-0.pt'

In [20]:
wandb.init(project='ImageProcessing-project', sync_tensorboard=True)

# try/finally guarantees the W&B run is closed even if training is interrupted or fails.
try:
    # Load saved model. This is deliberately best-effort: on any failure the message
    # is printed and training continues from the weights the model already has.
    try:
        # map_location lets a checkpoint saved on GPU be loaded in a CPU-only session.
        model.load_state_dict(torch.load(saved_model_path, map_location=device))
        print('Loaded saved model successfully')
    except FileNotFoundError:
        print('File not found')
    except Exception as e:
        print(f'An error occurred: {e}')

    print()

    model.to(device)

    model = train_model(model, criterion, optimizer, dataloaders, model_last_pth, model_best_pth, num_epochs)
finally:
    wandb.finish()

2024-05-21 12:32:58.663982: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 12:32:58.664093: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 12:32:58.800825: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mntl09092004[0m ([33mtunglam994[0m). Use [1m`wandb login --relogin`[0m to force relogin


Loaded saved model successfully

Epoch 1 / 100
----------
train	 Loss: 0.8591	 Accuracy: 0.8170
val	 Loss: 0.6853	 Accuracy: 0.8448
Epoch 2 / 100
----------
train	 Loss: 0.3425	 Accuracy: 0.9341
val	 Loss: 0.5662	 Accuracy: 0.8728
Epoch 3 / 100
----------
train	 Loss: 0.2100	 Accuracy: 0.9659
val	 Loss: 0.5119	 Accuracy: 0.8865
Epoch 4 / 100
----------
train	 Loss: 0.1385	 Accuracy: 0.9825
val	 Loss: 0.4781	 Accuracy: 0.8894
Epoch 5 / 100
----------
train	 Loss: 0.0954	 Accuracy: 0.9914
val	 Loss: 0.4564	 Accuracy: 0.8922
Epoch 6 / 100
----------
train	 Loss: 0.0703	 Accuracy: 0.9955
val	 Loss: 0.4423	 Accuracy: 0.9009
Epoch 7 / 100
----------
train	 Loss: 0.0563	 Accuracy: 0.9970
val	 Loss: 0.4415	 Accuracy: 0.9001
Epoch 8 / 100
----------
train	 Loss: 0.0453	 Accuracy: 0.9979
val	 Loss: 0.4314	 Accuracy: 0.8987
Epoch 9 / 100
----------
train	 Loss: 0.0437	 Accuracy: 0.9979
val	 Loss: 0.4285	 Accuracy: 0.8994
Epoch 10 / 100
----------
train	 Loss: 0.0425	 Accuracy: 0.9980
val	 Loss: 0

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_acc,▁▇██████████████████████████████████████
train_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▆█▇█▇▇██████████▇█████▇██████████▇▇███▇
val_loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
train_acc,0.99803
train_loss,0.03934
val_acc,0.90014
val_loss,0.42239


In [None]:
# Load saved model
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on GPU also loads in a CPU-only session.
# NOTE(review): this path differs from model_last_pth / model_best_pth, which are the
# files the training cell writes -- confirm this is the intended checkpoint.
model.load_state_dict(torch.load('/kaggle/working/resnet18-ImageNet-0.pt', map_location=device))