# Training ResNet on CIFAR-10
In this project, you will use the ResNet model to perform image classification on CIFAR-10. CIFAR-10 originally contains 60K images from 10 categories. We split it into 45K/5K/10K images to serve as the train/validation/test sets. We only release the ground-truth labels of the training/validation datasets to you.

## Step 0: Set up the ResNet model


In [None]:
# import necessary dependencies
import argparse
import os, sys
import time
import datetime
from tqdm import tqdm_notebook as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Swish (a.k.a. SiLU) activation: swish(x) = x * sigmoid(x),
# where sigmoid(x) = 1 / (1 + e^-x)
def swish(x):
    """Apply the Swish activation element-wise.

    ``torch.sigmoid`` is used instead of spelling out ``1 / (1 + exp(-x))``:
    in the hand-written form ``exp(-x)`` overflows to ``inf`` for large
    negative inputs, and the backward pass then evaluates ``0 * inf``,
    producing NaN gradients.

    Args:
        x: input tensor.

    Returns:
        Tensor with the same shape as ``x`` containing ``x * sigmoid(x)``.
    """
    return x * torch.sigmoid(x)

In [None]:
# ResNet-20-style network for CIFAR-10 (n=3 residual blocks per stage)
class ResNetCIFAR10(nn.Module):
    """ResNet-20-style classifier for 3-channel images and 10 classes.

    Layout: a 3x3 stem convolution, then three stages of three residual
    blocks each (16, 32 and 64 channels), global average pooling and a
    linear classifier. The first block of stages 2 and 3 halves the spatial
    resolution with a stride-2 convolution.

    Two deviations from the textbook ResNet are kept deliberately, because
    changing them would alter the parameter names/shapes stored in existing
    checkpoints:
      * every shortcut is a learned 1x1 convolution rather than a plain
        identity connection;
      * the second batch norm of each block runs after the shortcut has
        been added (conv -> add -> BN -> ReLU).

    Global pooling is adaptive, so inputs larger than 32x32 are accepted as
    well; for 32x32 inputs it averages the same 8x8 map as a fixed 8x8 pool.
    """

    def __init__(self):
        super(ResNetCIFAR10, self).__init__()
        # stem: 3x3 conv, 3 -> 16 channels, resolution unchanged
        self.conv1 = nn.Conv2d(3, 16, 3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=16)

        # ---- stage 1: 16 channels, full resolution (32x32 for CIFAR) ----
        # block 1
        self.block1conv1 = nn.Conv2d(16, 16, 3, stride=1, padding=1)
        self.block1bn1 = nn.BatchNorm2d(num_features=16)
        self.block1conv2 = nn.Conv2d(16, 16, 3, stride=1, padding=1)
        self.block1bn2 = nn.BatchNorm2d(num_features=16)
        # shortcut: 1x1 conv, 16 -> 16 channels
        self.block1resconv1 = nn.Conv2d(16, 16, 1, stride=1)

        # block 2
        self.block1conv3 = nn.Conv2d(16, 16, 3, stride=1, padding=1)
        self.block1bn3 = nn.BatchNorm2d(num_features=16)
        self.block1conv4 = nn.Conv2d(16, 16, 3, stride=1, padding=1)
        self.block1bn4 = nn.BatchNorm2d(num_features=16)
        self.block1resconv2 = nn.Conv2d(16, 16, 1, stride=1)

        # block 3
        self.block1conv5 = nn.Conv2d(16, 16, 3, stride=1, padding=1)
        self.block1bn5 = nn.BatchNorm2d(num_features=16)
        self.block1conv6 = nn.Conv2d(16, 16, 3, stride=1, padding=1)
        self.block1bn6 = nn.BatchNorm2d(num_features=16)
        self.block1resconv3 = nn.Conv2d(16, 16, 1, stride=1)

        # ---- stage 2: 32 channels, half resolution (16x16 for CIFAR) ----
        # block 1: stride 2 on the first conv does the subsampling
        self.block2conv1 = nn.Conv2d(16, 32, 3, stride=2, padding=1)
        self.block2bn1 = nn.BatchNorm2d(num_features=32)
        self.block2conv2 = nn.Conv2d(32, 32, 3, stride=1, padding=1)
        self.block2bn2 = nn.BatchNorm2d(num_features=32)
        # shortcut: 1x1 conv, 16 -> 32 channels; stride 2 to match the
        # subsampled main path
        self.block2resconv1 = nn.Conv2d(16, 32, 1, stride=2)

        # block 2
        self.block2conv3 = nn.Conv2d(32, 32, 3, stride=1, padding=1)
        self.block2bn3 = nn.BatchNorm2d(num_features=32)
        self.block2conv4 = nn.Conv2d(32, 32, 3, stride=1, padding=1)
        self.block2bn4 = nn.BatchNorm2d(num_features=32)
        self.block2resconv2 = nn.Conv2d(32, 32, 1, stride=1)

        # block 3
        self.block2conv5 = nn.Conv2d(32, 32, 3, stride=1, padding=1)
        self.block2bn5 = nn.BatchNorm2d(num_features=32)
        self.block2conv6 = nn.Conv2d(32, 32, 3, stride=1, padding=1)
        self.block2bn6 = nn.BatchNorm2d(num_features=32)
        self.block2resconv3 = nn.Conv2d(32, 32, 1, stride=1)

        # ---- stage 3: 64 channels, quarter resolution (8x8 for CIFAR) ----
        # block 1: stride 2 on the first conv does the subsampling
        self.block3conv1 = nn.Conv2d(32, 64, 3, stride=2, padding=1)
        self.block3bn1 = nn.BatchNorm2d(num_features=64)
        self.block3conv2 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.block3bn2 = nn.BatchNorm2d(num_features=64)
        # shortcut: 1x1 conv, 32 -> 64 channels; stride 2 to match the
        # subsampled main path
        self.block3resconv1 = nn.Conv2d(32, 64, 1, stride=2)

        # block 2
        self.block3conv3 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.block3bn3 = nn.BatchNorm2d(num_features=64)
        self.block3conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.block3bn4 = nn.BatchNorm2d(num_features=64)
        self.block3resconv2 = nn.Conv2d(64, 64, 1, stride=1)

        # block 3
        self.block3conv5 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.block3bn5 = nn.BatchNorm2d(num_features=64)
        self.block3conv6 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.block3bn6 = nn.BatchNorm2d(num_features=64)
        self.block3resconv3 = nn.Conv2d(64, 64, 1, stride=1)

        # global average pooling down to 1x1 regardless of the feature-map
        # size (has no parameters, so the state_dict is unaffected)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        # 64 pooled features -> 10 class logits
        self.FC = nn.Linear(64*1*1, 10)

    @staticmethod
    def _residual(x, conv_a, bn_a, conv_b, bn_b, shortcut):
        """Run one residual block: conv -> BN -> ReLU -> conv -> add -> BN -> ReLU."""
        out = conv_b(F.relu(bn_a(conv_a(x))))
        # the 1x1 shortcut conv matches channels/stride of the main path
        return F.relu(bn_b(out + shortcut(x)))

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: tensor of shape (batch, 3, H, W).

        Returns:
            Tensor of shape (batch, 10) with unnormalised class scores.
        """
        # stem: conv -> BN -> ReLU
        out = F.relu(self.bn1(self.conv1(x)))

        # stage 1 (16 channels)
        out = self._residual(out, self.block1conv1, self.block1bn1,
                             self.block1conv2, self.block1bn2, self.block1resconv1)
        out = self._residual(out, self.block1conv3, self.block1bn3,
                             self.block1conv4, self.block1bn4, self.block1resconv2)
        out = self._residual(out, self.block1conv5, self.block1bn5,
                             self.block1conv6, self.block1bn6, self.block1resconv3)

        # stage 2 (32 channels, first block subsamples)
        out = self._residual(out, self.block2conv1, self.block2bn1,
                             self.block2conv2, self.block2bn2, self.block2resconv1)
        out = self._residual(out, self.block2conv3, self.block2bn3,
                             self.block2conv4, self.block2bn4, self.block2resconv2)
        out = self._residual(out, self.block2conv5, self.block2bn5,
                             self.block2conv6, self.block2bn6, self.block2resconv3)

        # stage 3 (64 channels, first block subsamples)
        out = self._residual(out, self.block3conv1, self.block3bn1,
                             self.block3conv2, self.block3bn2, self.block3resconv1)
        out = self._residual(out, self.block3conv3, self.block3bn3,
                             self.block3conv4, self.block3bn4, self.block3resconv2)
        out = self._residual(out, self.block3conv5, self.block3bn5,
                             self.block3conv6, self.block3bn6, self.block3resconv3)

        # global average pooling, flatten to (batch, 64), then classify
        out = self.avgpool(out)
        out = torch.flatten(out, 1)
        out = self.FC(out)

        return out

### Here is a sanity check to verify the implementation of ResNet:


In [None]:
#############################################
# Sanity check for ResNetCIFAR10: push a random batch through the network
# and confirm that the logits have the expected shape.

# pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using GPU' if device == 'cuda' else 'using CPU')

# build the model and place it on the chosen device
net = ResNetCIFAR10().to(device)

# 5 random RGB "images" of 32x32 pixels, i.e. the CIFAR-10 input format
data = torch.randn(5, 3, 32, 32).to(device)
out = net(data)

# one row per input image, one column per category
assert tuple(out.shape) == (5, 10)
print("Forward pass successful")

#############################################

using CPU
Forward pass successful


## Step 1: Set up preprocessing functions
Preprocessing is very important as discussed in the lecture.
You will need to write preprocessing functions with the help of *torchvision.transforms* in this step.
You can find a helpful tutorial/API reference [here](https://pytorch.org/vision/stable/transforms.html).

### Question (b)
For the question, you need to:
1. Complete the preprocessing code below.
2. **In the PDF report**, briefly describe what preprocessing operations you used and what are the purposes of them.

Hint: 
1. Only two operations are necessary to complete the basic preprocessing here.
2. The raw input read from the dataset will be PIL images.
3. Data augmentation operations are not mandatory, but feel free to incorporate them if you want.
4. Reference value for mean/std of CIFAR-10 images (assuming the pixel values are within [0,1]): mean (RGB-format): (0.4914, 0.4822, 0.4465), std (RGB-format): (0.2023, 0.1994, 0.2010)

In [None]:
# useful libraries
import torchvision
import torchvision.transforms as transforms

#############################################
# Preprocessing pipelines.
#
# Shared by both pipelines:
#   * ToTensor   - converts the PIL image to a tensor, which Normalize requires
#   * Normalize  - standardises each channel with the reference CIFAR-10
#                  mean/std so inputs are roughly zero-mean / unit-variance,
#                  which makes optimisation easier
# Training only (data augmentation against overfitting):
#   * RandomCrop - pads 4 pixels on every side, then crops back to 32x32
#   * RandomHorizontalFlip - mirrors images at random so the model does not
#                  see exactly the same image every epoch
_CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
_CIFAR10_STD = (0.2023, 0.1994, 0.2010)

transform_train = transforms.Compose([
    transforms.RandomCrop(size=(32, 32), padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR10_MEAN, _CIFAR10_STD),
])

# validation data is never augmented: tensor conversion and normalisation only
transform_val = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR10_MEAN, _CIFAR10_STD),
])
#############################################

## Step 2: Set up dataset and dataloader

### Question (c)
Set up the train/val datasets and dataloaders that are to be used during the training. Check out the [official API](https://pytorch.org/docs/stable/data.html) for more information about **torch.utils.data.DataLoader**.

Here, you need to:
1. Complete the code below.

In [None]:
# do NOT change these
from tools.dataset import CIFAR10
from torch.utils.data import DataLoader

# a few arguments, do NOT change these
DATA_ROOT = "./data"
TRAIN_BATCH_SIZE = 128
VAL_BATCH_SIZE = 100

#############################################
# Datasets: each split gets its own preprocessing pipeline
# (augmentation for training, plain normalisation for validation).
train_set = CIFAR10(root=DATA_ROOT, mode='train', download=True,
                    transform=transform_train)
val_set = CIFAR10(root=DATA_ROOT, mode='val', download=True,
                  transform=transform_val)

# Dataloaders: only the training batches are shuffled (helps convergence);
# the validation order is kept fixed.
train_loader = DataLoader(train_set, batch_size=TRAIN_BATCH_SIZE,
                          shuffle=True, num_workers=4)
val_loader = DataLoader(val_set, batch_size=VAL_BATCH_SIZE,
                        shuffle=False, num_workers=4)
#############################################

Downloading https://www.dropbox.com/s/s8orza214q45b23/cifar10_trainval_F22.zip?dl=1 to ./data/cifar10_trainval_F22.zip


0it [00:00, ?it/s]

Extracting ./data/cifar10_trainval_F22.zip to ./data
Files already downloaded and verified
Using downloaded and verified file: ./data/cifar10_trainval_F22.zip
Extracting ./data/cifar10_trainval_F22.zip to ./data
Files already downloaded and verified


  cpuset_checked))


## Step 3: Instantiate your ResNet model and deploy it to GPU devices.
### Question (d)
You may want to deploy your model to GPU device for efficient training. Please assign your model to GPU if possible. If you are training on a machine without GPUs, please deploy your model to CPUs.

Here, you need to:
1. Complete the code below.
2. **In the PDF report**, briefly describe how you verify that your model is indeed deployed on GPU. (Hint: check $\texttt{nvidia-smi}$.)

In [None]:
# specify the device for computation
#############################################
# Instantiate the network and move it to the GPU when PyTorch can see one,
# otherwise keep it on the CPU.
model = ResNetCIFAR10()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Verify the placement by asking the model itself where its parameters live.
# (Querying another framework such as TensorFlow only tells us whether a GPU
# exists, not whether this PyTorch model is actually on it.)
param_device = next(model.parameters()).device
print("Model parameters are on: %s" % param_device)
if param_device.type == 'cuda':
    # human-readable name of the GPU, comparable with the nvidia-smi listing
    print("GPU: %s" % torch.cuda.get_device_name(param_device))

#############################################

''

## Step 4: Set up the loss function and optimizer
Loss function/objective function is used to provide "feedback" for the neural networks. Typically, we use multi-class cross-entropy as the loss function for classification models. As for the optimizer, we will use SGD with momentum. 

### Question (e)
Here, you need to:
1. Set up the cross-entropy loss as the criterion. (Hint: there are implemented functions in **torch.nn**)
2. Specify a SGD optimizer with momentum. (Hint: there are implemented functions in **torch.optim**)

In [None]:
import torch.nn as nn
import torch.optim as optim

# hyperparameters, do NOT change right now
INITIAL_LR = 0.1    # initial learning rate
MOMENTUM = 0.9      # momentum for optimizer
REG = 1e-4          # regularization strength

#############################################
# Loss: multi-class cross-entropy, placed on the same device as the model.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

# Optimizer: SGD with momentum; weight_decay adds an L2 penalty on the weights.
optimizer = optim.SGD(
    model.parameters(),
    lr=INITIAL_LR,
    momentum=MOMENTUM,
    weight_decay=REG,
)
#############################################

## Step 5: Start the training process.

### Question (f)/(g)
Congratulations! You have completed all of the previous steps and it is time to train our neural network.

Here you need to:
1. Complete the training code.
2. Actually perform the training.

Hint: Training a neural network usually repeats the following 4 steps: 

**i) Get a batch of data from the dataloader and copy it to your device (GPU).**

**ii) Do a forward pass to get the outputs from the neural network and compute the loss. Be careful about your inputs to the loss function: are the inputs required to be the logits or softmax probabilities?**

**iii) Do a backward pass (back-propagation) to compute gradients of all weights with respect to the loss.**

**iv) Update the model weights with the optimizer.**

You will also need to compute the accuracy of training/validation samples to track your model's performance over each epoch (the accuracy should be increasing as you train for more and more epochs).


In [None]:
# some hyperparameters
# total number of training epochs
# *** SET TO 5 FOR TESTING, CHANGE BACK TO 30 LATER ***
EPOCHS = 5

# the folder where the trained model is saved
CHECKPOINT_FOLDER = "./saved_model"

# start the training/validation process
# the process should take about 5 minutes on a GTX 1070-Ti
# if the code is written efficiently.
best_val_acc = 0
current_learning_rate = INITIAL_LR

# learning rate schedule: multiply the rate by DECAY every DECAY_EPOCHS epochs
DECAY_EPOCHS = 40
DECAY = 0.1

print("==> Training starts!")
print("="*50)
for i in range(0, EPOCHS):
    # step the learning rate schedule (never on the very first epoch)
    if i % DECAY_EPOCHS == 0 and i != 0:
        current_learning_rate = current_learning_rate * DECAY
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_learning_rate
        print("Current learning rate has decayed to %f" %current_learning_rate)

    # train mode: batch norm uses batch statistics and updates running stats
    model.train()

    print("Epoch %d:" %i)
    # running counters for the training accuracy / loss of this epoch
    total_examples = 0
    correct_examples = 0
    train_loss = 0.0

    # Train the model for 1 epoch.
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # copy the batch to the device the model lives on
        inputs = inputs.to(device)
        targets = targets.to(device)

        # forward pass; the criterion is nn.CrossEntropyLoss, which applies
        # the softmax itself, so it is fed the raw logits
        output = model(inputs)
        curr_loss = criterion(output, targets)

        # very first batch of the run: report the loss before any update
        if i == 0 and batch_idx == 0:
            print("Initial loss: %.4f" % curr_loss.item())

        # accumulate a plain float; adding the loss tensor itself would keep
        # every batch's autograd history referenced for the whole epoch
        train_loss += curr_loss.item()

        # backward pass and parameter update
        optimizer.zero_grad()
        curr_loss.backward()
        optimizer.step()

        # predicted category = index of the largest logit per sample
        prediction = output.argmax(dim=1)
        correct_examples += prediction.eq(targets).sum().item()
        total_examples += inputs.shape[0]

    avg_loss = train_loss / len(train_loader)
    avg_acc = correct_examples / total_examples
    print("Training loss: %.4f, Training accuracy: %.4f" %(avg_loss, avg_acc))

    # Validate on the validation dataset
    # eval mode: batch norm switches to its running statistics
    model.eval()

    # running counters for the validation accuracy / loss of this epoch
    total_examples = 0
    correct_examples = 0
    val_loss = 0.0

    # disable gradient during validation, which can save GPU memory
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(val_loader):
            # copy the batch to the device the model lives on
            inputs = inputs.to(device)
            targets = targets.to(device)

            # forward pass and loss
            output = model(inputs)
            val_loss += criterion(output, targets).item()

            # predicted category = index of the largest logit per sample
            prediction = output.argmax(dim=1)
            correct_examples += prediction.eq(targets).sum().item()
            total_examples += inputs.shape[0]

    avg_loss = val_loss / len(val_loader)
    avg_acc = correct_examples / total_examples
    print("Validation loss: %.4f, Validation accuracy: %.4f" % (avg_loss, avg_acc))

    # save a checkpoint whenever the validation accuracy improves
    if avg_acc > best_val_acc:
        best_val_acc = avg_acc
        os.makedirs(CHECKPOINT_FOLDER, exist_ok=True)
        print("Saving ...")
        # save the weights of the network being trained (`model`), not the
        # separate `net` instance created for the forward-pass sanity check
        state = {'state_dict': model.state_dict(),
                 'epoch': i,
                 'lr': current_learning_rate}
        torch.save(state, os.path.join(CHECKPOINT_FOLDER, 'resnet.pth'))

    print('')

print("="*50)
print(f"==> Optimization finished! Best validation accuracy: {best_val_acc:.4f}")

In [None]:
import numpy as np

import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from tools.dataset import CIFAR10

DATA_ROOT = "./data"
BATCH_SIZE = 100

# Select the device in this cell as well, so inference also works on
# CPU-only machines and does not depend on an earlier cell having run.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# same preprocessing as validation: tensor conversion + normalisation only
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

test_set = CIFAR10(
    root=DATA_ROOT, 
    mode='test', 
    download=True,
    transform=transform_test
)

# do NOT shuffle your test data loader!!!!!!!!!!!!!!!!
# otherwise the order of samples will be messed up
# and your test accuracy is likely to drop to random guessing level
test_loader = DataLoader(
    test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=1)

#########################################################
# use your model to generate predictions on test data
# and save the results into variable "results"
# "results" should be either a numpy array or a torch tensor with length of 10000

# initialize a resnet and load trained weights
net = ResNetCIFAR10()
# map_location lets a checkpoint that was saved on a GPU load on a CPU machine
state_dict = torch.load('./saved_model/resnet.pth', map_location=device) # change the path to your own checkpoint file
net.load_state_dict(state_dict['state_dict'])
net.to(device)

# remember to switch to eval mode whenever you are making inference
net.eval()

results = []
with torch.no_grad():
    # NOTE(review): the loop treats each item as a bare image batch, i.e. the
    # test split is assumed to yield no labels - confirm against tools.dataset
    for x in test_loader:
        # move the batch to the same device as the model (was hard-coded CUDA)
        results.append(
            net(x.to(device)).argmax(1)
        )

# convert results to numpy array
results = torch.cat(results).cpu().numpy()
assert len(results) == 10000

#########################################################
# write one "Id,Label" row per test sample, in loader order
with open('predictions.csv', 'w') as fp:
    fp.write("Id,Label\n")
    for i, label in enumerate(results):
        fp.write("%d,%d\n" %(i, label))

Using downloaded and verified file: ./data/cifar10_test_F22.zip
Extracting ./data/cifar10_test_F22.zip to ./data
Files already downloaded and verified
