In [None]:
import numpy as np
import torch

import matplotlib.pyplot as plt

In [None]:
import torchvision
import torchvision.transforms as transforms

In [None]:
import torch.nn as nn
import torch.optim as optim

from torchvision import models

import copy

In [3]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


### Dataset

In [4]:
# TRANSFORMS

# Training pipeline: random crop (augmentation) resized to 224x224, then
# tensor conversion and per-channel normalisation.
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

# RandomResizedCrop takes a portion of the image from a random location
# (instead of shrinking it unequally along different dimensions) and resizes
# that portion to the requested size.
# Here the 32x32 CIFAR-10 images are blown up to 224x224.
# Images are normalised like any other data: one mean and one SD per channel
# (3 channels). With mean 0.5 and SD 0.5, inputs in [0, 1] map to [-1, 1].
# Ideally we should compute the mean and SD of the training data and use those
# values; they are often published on the dataset website.


# Test pipeline: deterministic resize only. Random augmentation belongs in
# training; applying a random crop at test time would make the reported
# accuracy vary from run to run and score the model on partial images.
transform_test = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

# In general train and test use different transforms:
# data augmentation (e.g. random crop, horizontal flip) in train, but not in test.

# The test data must be normalised with the same mean and SD as the train data.


# download=True fetches the archive into ./data only when it is not already
# there, so the notebook also runs on a fresh machine.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True,
                                        transform=transform_train)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True,
                                       transform=transform_test)

### Utils

In [1]:
# function to find accuracy - given dataloader, model
def evaluation(dataloader, model, device=None):
    """Return the classification accuracy (in percent) of `model` on `dataloader`.

    Args:
        dataloader: iterable yielding `(inputs, labels)` batches.
        model: the network to evaluate; it must already live on the device
            the batches will be moved to.
        device: device to move each batch to. When omitted, the device of the
            model's first parameter is used (CPU if the model has no parameters).

    Returns:
        `100 * correct / total` as a float, or `0.0` when the dataloader
        yields no samples.

    The model is switched to eval mode for the duration of the call and then
    put back into whatever mode (train/eval) it was in before, so calling this
    between training epochs does not silently leave the model in eval mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    total, correct = 0, 0
    try:
        # no gradients are needed anywhere in evaluation
        with torch.no_grad():
            for inputs, labels in dataloader:
                # move data to device (in batches)
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                # predicted class = index of the largest score along dim 1
                _, pred = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (pred == labels).sum().item()
    finally:
        # restore the caller's train/eval mode even if a batch raised
        model.train(was_training)

    if total == 0:
        return 0.0
    return 100 * correct / total

# Model (ResNet - Pretrained weights transferred)

In [31]:
# Build torchvision's ResNet-18 and request its pretrained weights
# (the checkpoint is downloaded to the torch hub cache on first use).
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` - confirm against the installed version before changing.
resnet = models.resnet18(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [32]:
# Print the module tree to inspect the layer names and shapes
# (the final `fc` layer is the one replaced further down).
print(resnet)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

#### "BasicBlock" - is the 2 layer, identity skip building block.

The basic difference is in the forward function: f(x) + x is returned.  

see source.   

Better to implement it as a reusable block class (an `nn.Module`), since it repeats.

In [33]:
# freeze parameters
for param in resnet.parameters():
    param.requires_grad = False
    
# replace last layer
in_features = resnet.fc.in_features
resnet.fc = nn.Linear(in_features, num_classes)

# check
for param in resnet.parameters():
    if param.requires_grad:
        print(param.shape)

## Train

In [None]:
# dataloader
batch_size = 16
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False)

# build model
resnet = resnet.to(device)

# loss function
loss_fn = nn.CrossEntropyLoss()

# build optim
opt = optim.SGD(resnet.parameters(), lr=0.01)

#########################################################################################

loss_per_epoch_arr = []
num_epochs = 1

min_loss = 1000

# number of batches in dataset
num_steps = np.ceil(50000/batch_size)

for epoch in range(num_epochs):

    for i, data in enumerate(trainloader, 0):
        
        # data
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        opt.zero_grad() # reset gradients
        
        # forward pass
        outputs = vgg(inputs)
        
        # loss
        loss = loss_fn(outputs, labels)
        
        # compute gradients
        loss.backward()
        
        # update parameters
        opt.step()
        
        # save one with min loss.
        if min_loss > loss.item():
            min_loss = loss.item()
            # using copy.deepcopy
            # state_dict - access to all parameters.
            best_model = copy.deepcopy(vgg.state_dict())
            print('Min loss %0.2f' % min_loss)
        
        # DELETING CURRENT BATCH TO SAVE MEMORY - AS ITS NOT NEEDED AGAIN
        del inputs, labels, outputs
        torch.cuda.empty_cache()
        
        # print loss every 100 steps
        if i % 100 == 0:
            print('Step: %d/%d, Loss: %0.2f' % (i, num_steps, loss.item()) )
        
    loss_per_epoch_arr.append(loss.item())
    
    # Evaluation after each epoch      
    # no grad is there in evaluation fucntion. 
    # can add here also to be safe.
    print('Epoch: %d/%d, Test acc: %0.2f, Train acc: %0.2f' % (
        epoch, num_epochs, 
        evaluation(testloader, vgg), evaluation(trainloader, resnet)))
    
    
plt.plot(loss_per_epoch_arr)
plt.show()

In [None]:
# load checkpoint
# Restore the lowest-loss weights snapshot captured during training.
resnet.load_state_dict(best_model)

# Accuracy (%) on the train split first, then on the test split.
train_acc = evaluation(trainloader, resnet)
test_acc = evaluation(testloader, resnet)
print(train_acc, test_acc)

It is better to resume training from the min-loss checkpoint, as that checkpoint need not be the best model.