CS5242 Final Project : Model Training Notebook
===
> Transfer learning and fine-tuning of models pre-trained on the ImageNet dataset

*Murat Shagirov*

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from os import path

import matplotlib.pyplot as plt
# for plotting figures (report)
import matplotlib
plt.style.use('ggplot')
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (15,5) # use larger for presentation
matplotlib.rcParams['font.size']= 9 # use 14 for presentation

In [3]:
from nn import train_model # model training function

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from datautils import LoadTrainingData
from torch.utils.data import DataLoader
from torchvision import models, utils, transforms as T

from datautils import BatchUnnorm, Unnorm

# Pick the CUDA device when one is available, otherwise fall back to the CPU,
# and fix the tensor dtype used by the image transforms below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype = torch.float32
print(f'device: {device}\ndtype: {dtype}')

# Transforms
unnorm = Unnorm() # unnormalize a single RGB image
unnormb = BatchUnnorm() # unnormalize batch of images

toPIL = T.ToPILImage()

img_size = 512

# Per-channel mean/std shared by the training and validation pipelines
# (the usual ImageNet statistics expected by the pre-trained torchvision models).
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Training pipeline: light augmentation (small rotation, random resized crop to
# img_size x img_size, random flips) followed by tensor conversion + normalization.
transform = T.Compose([T.ToPILImage(),
                       T.RandomRotation((-3,3)),
                       T.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
                       T.RandomHorizontalFlip(),
                       T.RandomVerticalFlip(),
                       T.ToTensor(),
                       T.ConvertImageDtype(dtype),
                       T.Normalize(mean=imagenet_mean, std=imagenet_std)])
# Validation pipeline: deterministic resize only, then the same normalization.
# NOTE(review): T.Resize with an int scales the shorter side to img_size and keeps
# the aspect ratio -- this assumes the source images share one shape; confirm.
val_transform = T.Compose([T.ToPILImage(),
                           T.Resize(img_size),
                           T.ToTensor(),
                           T.ConvertImageDtype(dtype),
                           T.Normalize(mean=imagenet_mean, std=imagenet_std)])

# Paths to training dataset and labels (before Train/Val split)
train_csv = path.join('./datasets','train_label.csv')
train_data_path = path.join('./datasets','train_image','train_image')

# Seed the RNGs for consistency between runs. NumPy is seeded as before
# (presumably it drives the split inside LoadTrainingData -- verify there);
# torch is seeded as well because DataLoader shuffling, the initialisation of new
# layers and (in current torchvision releases) the random transforms use torch's RNG.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# split the original training data into 80% / 20% train/val datasets
datasets = LoadTrainingData(train_csv, train_data_path, transform=transform,
                            split=True, train_percent=80, val_transform=val_transform)

print(f"Training dataset: {len(datasets['train'])} samples.",
      f"\nValidation dataset: {len(datasets['val'])} samples.")


device: cuda
dtype: torch.float32
Training dataset: 931 samples. 
Validation dataset: 233 samples.


## Finetuning

- Fine-tuning resnet18 seems faster: validation accuracy generally reaches >85-90% within 5 epochs.
- Using resnet18's conv layers as a frozen feature extractor results in very slow training (but no overfitting): training and validation accuracies both increase slowly (>80% after 5 epochs, 92% after 100 epochs).

- resnet18: 94.85% (25 epochs)
- resnet34: (25 epochs)
- resnet50: 95.28% (25 epochs)
- densenet121: so far 25*3 epochs-->97.4249%
- resnext50_32x4d + fc: 25*1 epochs-->96.5665% (afterwards ValLoss converges to ~96%)

In [10]:
num_epochs = 25

bsize_train = 4 # batch sizes
bsize_val = 4

lr = 0.001 # learning rate

# Architecture to fine-tune, given as the name of its torchvision constructor.
# It is kept in `model_name` because the checkpoint-saving cell below builds its
# file names from that variable.
model_name = 'resnext50_32x4d'

# Download the ImageNet pre-trained model
model_ft = getattr(models, model_name)(pretrained=True, progress=False)

# # for transfer learning freeze (disable grads for early layers)
# for param in model_ft.parameters():
#     param.requires_grad = False

# Replace the final fully connected layer with a new 3-output head.
# (For models whose head is `classifier` rather than `fc`, use
#  num_ftrs = model_ft.classifier.in_features and
#  model_ft.classifier = nn.Linear(num_ftrs, 3) instead.)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 3)


model_ft = model_ft.to(device)

# Finetune all parameters
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(model_ft.parameters(), lr=lr, momentum=0.9)
# optimizer_ft = torch.optim.Adam(model_ft.fc.parameters(), lr=lr)

# LR schedule: StepLR multiplies the learning rate by gamma=0.1 once every
# `step_size` calls to scheduler.step(). step_size is 5 * (number of training
# samples), so the decay interval measured in epochs depends on whether
# train_model steps the scheduler per epoch or per batch -- TODO confirm in nn.train_model.
# exp_lr_scheduler = None
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=5*len(datasets['train']), gamma=0.1)
# exp_lr_scheduler = lr_scheduler.CyclicLR(optimizer_ft, 10**-9, 10**-4, step_size_up=5, step_size_down=10)

# Prepare dataloaders (only the training split is shuffled)
data_loaders = {'train' : DataLoader(datasets['train'], batch_size=bsize_train, shuffle=True, num_workers=0),
                'val'   : DataLoader(datasets['val'],  batch_size=bsize_val, shuffle=False, num_workers=0)}

In [11]:
# Train with the configuration from the previous cell; train_model returns the
# model object and the per-epoch curve data plotted below.
best_model, curve_data = train_model(
    model_ft, optimizer_ft, data_loaders,
    num_epochs=num_epochs, loss_func=criterion,
    scheduler=exp_lr_scheduler, device=device, return_best=True,
)

# Train/val curves side by side: loss on the left, accuracy on the right.
t = np.arange(curve_data['total_epochs'])
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
panels = (('Loss', 'trainLosses', 'valLosses'),
          ('Accuracy', 'trainAccs', 'valAccs'))
for ax, (title, train_key, val_key) in zip(axes, panels):
    ax.plot(t, curve_data[train_key], label='Train')
    ax.plot(t, curve_data[val_key], label='Val')
    ax.set_title(title)
    ax.legend()

plt.show()

Epoch 0/24 --- 

KeyboardInterrupt: 

In [None]:
# Optional: restore previously saved weights instead of retraining.
# # --> model.load_state_dict(torch.load(PATH))
# # --> model.eval()
# best_model.load_state_dict(torch.load(f'./resnet18_e{20+num_epochs}_ft.pkl'))
# best_model.eval()

In [8]:
# Tag for this run: fine-tuning ('ft') at the current input resolution.
train_method = f'ft_{img_size}px'

# The weights go to the external data directory, while the learning curves are
# written next to the notebook. `model_name` is not assigned in this cell and
# must already exist in the kernel.
save_dir = '../../dataDIR/cs5242/'
run_tag = f'{model_name}_{train_method}'
weights_path = path.join(save_dir, f'{run_tag}.pkl')
torch.save(best_model.state_dict(), weights_path)
torch.save(curve_data, f'./{run_tag}_curves.pkl')

In [29]:
# from datautils import BatchUnnorm, Unnorm

# unnorm = Unnorm() # unnormalize a single RGB image
# unnormb = BatchUnnorm() # unnormalize batch of images

# for k, sample in enumerate(data_loaders['train']):
#     print(f'Sample {k}: x {sample["image"].shape}; Labels {sample["label"].tolist()}')
#     grid = utils.make_grid(unnormb(sample['image']), padding=4, pad_value=1.)
#     plt.imshow(grid.numpy().transpose((1, 2, 0)))
#     plt.title(f'Labels:{sample["label"].tolist()}')
#     plt.axis(False)
#     plt.show()
#     break

## Transfer Learning

In [13]:
num_epochs = 25

bsize_train = 4 # batch sizes
bsize_val = 4

lr = 0.001 # learning rate

# Download ImageNet pre-trained model from torchhub
model_ft = models.resnet18(pretrained=True,progress=False)

# Transfer learning: freeze every pre-trained parameter so the backbone acts as a
# fixed feature extractor (the replacement head created below is trainable).
for param in model_ft.parameters():
    param.requires_grad = False

# Put the network in evaluation mode.
# NOTE(review): if train_model calls model.train() during its training phase this
# setting is undone -- confirm in nn.train_model.
model_ft.eval()

num_ftrs = model_ft.fc.in_features

# New classification head with 3 outputs; freshly created layers require grad.
model_ft.fc = nn.Sequential(nn.Linear(num_ftrs, 3))

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Only the parameters of the new head (model_ft.fc) are handed to the optimizer.
optimizer_ft = optim.SGD(model_ft.fc.parameters(), lr=lr, momentum=0.9)
# optimizer_ft = torch.optim.Adam(model_ft.fc.parameters(), lr=lr)

# LR schedule: StepLR multiplies the learning rate by gamma=0.1 once every
# `step_size` calls to scheduler.step(). step_size is 5 * (number of training
# samples), so the decay interval measured in epochs depends on whether
# train_model steps the scheduler per epoch or per batch -- TODO confirm in nn.train_model.
# exp_lr_scheduler = None
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=5*len(datasets['train']), gamma=0.1)
# exp_lr_scheduler = lr_scheduler.CyclicLR(optimizer_ft, 10**-9, 10**-4,
#                                          step_size_up=5, step_size_down=10)

# Prepare dataloaders (only the training split is shuffled)
data_loaders = {'train' : DataLoader(datasets['train'], batch_size=bsize_train, shuffle=True, num_workers=0),
                'val'   : DataLoader(datasets['val'],  batch_size=bsize_val, shuffle=False, num_workers=0)}

SGD (momentum 0.9), lr=0.01: >90% (25 epochs)<br>
SGD (same settings), lr=0.01, step LR decay every 20 epochs: 92% (100 epochs)<br>

In [15]:
# Train with the configuration from the previous cell; train_model returns the
# model object and the per-epoch curve data plotted below.
best_model, curve_data = train_model(
    model_ft, optimizer_ft, data_loaders,
    num_epochs=num_epochs, loss_func=criterion,
    scheduler=exp_lr_scheduler, device=device, return_best=True,
)

# Train/val curves side by side: loss on the left, accuracy on the right.
t = np.arange(curve_data['total_epochs'])
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
panels = (('Loss', 'trainLosses', 'valLosses'),
          ('Accuracy', 'trainAccs', 'valAccs'))
for ax, (title, train_key, val_key) in zip(axes, panels):
    ax.plot(t, curve_data[train_key], label='Train')
    ax.plot(t, curve_data[val_key], label='Val')
    ax.set_title(title)
    ax.legend()

plt.show()

Epoch 0/24 --- train Loss: 0.3247 Acc: 0.8797 || val Loss: 0.2816 Acc: 0.8841 || 46s
Epoch 1/24 --- train Loss: 0.3497 Acc: 0.8614 || val Loss: 0.3211 Acc: 0.8841 || 91s
Epoch 2/24 --- train Loss: 0.3693 Acc: 0.8507 || val Loss: 0.2610 Acc: 0.8927 || 134s
Epoch 3/24 --- train Loss: 0.3083 Acc: 0.8711 || val Loss: 0.2874 Acc: 0.8841 || 180s
Epoch 4/24 --- train Loss: 0.3639 Acc: 0.8582 || val Loss: 0.2648 Acc: 0.8927 || 224s
Epoch 5/24 --- train Loss: 0.3447 Acc: 0.8647 || val Loss: 0.2645 Acc: 0.8755 || 270s
Epoch 6/24 --- train Loss: 0.3264 Acc: 0.8561 || val Loss: 0.2949 Acc: 0.8884 || 313s
Epoch 7/24 --- train Loss: 0.3209 Acc: 0.8722 || val Loss: 0.3208 Acc: 0.8712 || 358s
Epoch 8/24 --- train Loss: 0.3710 Acc: 0.8475 || val Loss: 0.3011 Acc: 0.8798 || 402s
Epoch 9/24 --- train Loss: 0.3425 Acc: 0.8679 || val Loss: 0.3040 Acc: 0.8927 || 446s
Epoch 10/24 --- train Loss: 0.3435 Acc: 0.8561 || val Loss: 0.3068 Acc: 0.8712 || 490s
Epoch 11/24 --- train Loss: 0.3230 Acc: 0.8636 || val L

KeyboardInterrupt: 

In [None]:
# num_epochs = 50

# # re-enable grads for fine-tuning
# for param in model_ft.parameters():
#     param.requires_grad = True
# # Observe that all parameters are being optimized
# optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.00001, momentum=0.9)
# # exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=5, gamma=0.1)
# exp_lr_scheduler = lr_scheduler.CyclicLR(optimizer_ft, 10**-9, 10**-4,
#                                          step_size_up=5, step_size_down=10)

# best_model, curve_data  = train_model(model_ft, optimizer_ft, data_loaders, num_epochs=num_epochs,
#                          loss_func=criterion, scheduler=exp_lr_scheduler, device=device, return_best=True)

# plt.figure(figsize=[20,8])
# t = np.arange(curve_data['total_epochs'])
# plt.subplot(121)
# plt.plot(t,curve_data['trainLosses'],label='Train')
# plt.plot(t,curve_data['valLosses'],label='Val')
# plt.title('Loss'); plt.legend()

# plt.subplot(122)
# plt.plot(t,curve_data['trainAccs'],label='Train')
# plt.plot(t,curve_data['valAccs'],label='Val')
# plt.title('Accuracy'); plt.legend()

# plt.show()

In [None]:
# Tag for this run: transfer learning ('tr', frozen backbone + new head).
train_method = 'tr'
save_dir = '../../dataDIR/cs5242/'
# The transfer-learning setup cell above loads models.resnet18, so the saved files
# are labelled with that architecture; keep this in sync if the model is changed.
arch_name = 'resnet18'
# Weights go to the external data directory; the curves are written next to the notebook.
weights_path = path.join(save_dir, f'{arch_name}_{train_method}_V1.pkl')
torch.save(best_model.state_dict(), weights_path)
torch.save(curve_data, f'./{arch_name}_{train_method}_curvesV1.pkl')