## Import Libraries

In [58]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
import torch.nn.functional as F  


import torchvision.datasets as dset
import torchvision.transforms as T

import matplotlib.pyplot as plt

import numpy as np

## Import Data and Set up Training Batches

In [60]:
NUM_TRAIN = 49000 # Number of training samples out of 50000 total samples

# Normalize Data
transform = T.Compose([
                T.ToTensor(),
                T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) # Mean and Std Vectors
            ])

# Set up a Data Loader + Sampler combination for Batch Training

cifar10_train = dset.CIFAR10('./datasets', train=True, download=True,
                             transform=transform)
loader_train = DataLoader(cifar10_train, batch_size=64, 
                          sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))

cifar10_val = dset.CIFAR10('./datasets', train=True, download=True,
                           transform=transform)
loader_val = DataLoader(cifar10_val, batch_size=64, 
                        sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN, 50000)))

cifar10_test = dset.CIFAR10('./datasets', train=False, download=True, 
                            transform=transform)
loader_test = DataLoader(cifar10_test, batch_size=64)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


## Set up Training Device and Global Variables

In [11]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device:", device)

Using device: cuda


In [12]:
print_every = 100 # How often we print the train loss
dtype = torch.float32 # Data type to use

## Custom Pytorch NNs

In [15]:
def flatten(x):
    N = x.shape[0] # read in N, C, H, W
    return x.view(N, -1)  # "flatten" the C * H * W values into a single vector per image

# Test flatten

x = torch.arange(12).view(2, 1, 3, 2)
print(f"Before flattening: {x}")
print(f"After flattening: {flatten(x)}")



Before flattening: tensor([[[[ 0,  1],
          [ 2,  3],
          [ 4,  5]]],


        [[[ 6,  7],
          [ 8,  9],
          [10, 11]]]])
After flattening: tensor([[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11]])


### Two-Layer Network

In [19]:
def two_layer_fc(x, params):
    """
    A fully-connected neural networks; the architecture is:
    NN is fully connected -> ReLU -> fully connected layer.
    
    The input to the network will be a minibatch of data, of shape
    (N, d1, ..., dM) where d1 * ... * dM = D. The hidden layer will have H units,
    and the output layer will produce scores for C classes.
    
    Inputs:
    - x: A PyTorch Tensor of shape (N, d1, ..., dM) giving a minibatch of
      input data.
    - params: A list [w1, w2] of PyTorch Tensors giving weights for the network;
      w1 has shape (D, H) and w2 has shape (H, C).
    
    Returns:
    - scores: A PyTorch Tensor of shape (N, C) giving classification scores for
      the input data x.
    """
    
    # first we flatten the image
    x = flatten(x)  # shape: [batch_size, C x H x W]
    
    w1, w2 = params
    
    # Forward pass: compute predicted y using operations on Tensors. Since w1 and
    # w2 have requires_grad=True, operations involving these Tensors will cause
    # PyTorch to build a computational graph, allowing automatic computation of
    # gradients. Since we are no longer implementing the backward pass by hand we
    # don't need to keep references to intermediate values.
    # you can also use `.clamp(min=0)`, equivalent to F.relu()
    x = F.relu(x.mm(w1))
    x = x.mm(w2)
    return x

In [22]:
# Test Output (should be [64, 10])

hidden_layer_size = 42
x = torch.zeros((64, 50), dtype=dtype)  # Minibatch size 64, Feature dimension 50
w1 = torch.zeros((50, hidden_layer_size), dtype=dtype)
w2 = torch.zeros((hidden_layer_size, 10), dtype=dtype)
scores = two_layer_fc(x, [w1, w2])
print(scores.size())  # 

torch.Size([64, 10])


### Three-Layer Convolutional Network

In [20]:
def three_layer_convnet(x, params):
    """
    1. A convolutional layer (with bias) with `channel_1` filters, each with shape `KW1 x KH1`, and zero-padding of two
    2. ReLU nonlinearity
    3. A convolutional layer (with bias) with `channel_2` filters, each with shape `KW2 x KH2`, and zero-padding of one
    4. ReLU nonlinearity
    5. Fully-connected layer with bias, producing scores for C classes.
    
    Performs the forward pass of a three-layer convolutional network with the
    architecture defined above.

    Inputs:
    - x: A PyTorch Tensor of shape (N, 3, H, W) giving a minibatch of images
    - params: A list of PyTorch Tensors giving the weights and biases for the
      network; contains the following:
      - conv_w1: PyTorch Tensor of shape (channel_1, 3, KH1, KW1) giving weights
        for the first convolutional layer
      - conv_b1: PyTorch Tensor of shape (channel_1,) giving biases for the first
        convolutional layer
      - conv_w2: PyTorch Tensor of shape (channel_2, channel_1, KH2, KW2) giving
        weights for the second convolutional layer
      - conv_b2: PyTorch Tensor of shape (channel_2,) giving biases for the second
        convolutional layer
      - fc_w: PyTorch Tensor giving weights for the fully-connected layer
      - fc_b: PyTorch Tensor giving biases for the fully-connected layer
    
    Returns:
    - scores: PyTorch Tensor of shape (N, C) giving classification scores for x
    """
    conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b = params
    scores = None

    x = F.conv2d(x, weight=conv_w1, bias=conv_b1, padding=2)
    x = F.relu(x)
    x = F.conv2d(x, weight=conv_w2, bias=conv_b2, padding=1)
    x = F.relu(x)
    x = flatten(x)
    scores = x.mm(fc_w) + fc_b

    return scores

In [23]:
# Test Output (should be [64, 10])

x = torch.zeros((64, 3, 32, 32), dtype=dtype)  # Minibatch size 64, image size [3, 32, 32]

conv_w1 = torch.zeros((6, 3, 5, 5), dtype=dtype)  # [out_channel, in_channel, kernel_H, kernel_W]
conv_b1 = torch.zeros((6,))  # out_channel
conv_w2 = torch.zeros((9, 6, 3, 3), dtype=dtype)  # [out_channel, in_channel, kernel_H, kernel_W]
conv_b2 = torch.zeros((9,))  # out_channel

# Calculating the shape of the tensor after two conv layers, before the fully-connected layer
fc_w = torch.zeros((9 * 32 * 32, 10))
fc_b = torch.zeros(10)

scores = three_layer_convnet(x, [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b])
print(scores.size())

torch.Size([64, 10])


### Custom Util Functions

In [31]:
def random_weight(shape, device, dtype):
    """
    Create random Tensors for weights
    We use Kaiming normalization: sqrt(2 / fan_in)
    """
    
    if len(shape) == 2:  # FC weight
        fan_in = shape[0]
    else:
        fan_in = np.prod(shape[1:]) # conv weight [out_channel, in_channel, kH, kW]
    # randn is standard normal distribution generator. 
    w = torch.randn(shape, device=device, dtype=dtype) * np.sqrt(2. / fan_in)
    w.requires_grad = True # Used for backward pass
    return w

def zero_weight(shape, device, dtype):
    """
    Create Tensors with zeros for weights
    """
    
    return torch.zeros(shape, device=device, dtype=dtype, requires_grad=True)

In [33]:
# create a weight of shape [3 x 5]
random_weight((3, 5), device, dtype)

tensor([[-1.9002,  0.3946, -0.0219,  0.6299,  0.5653],
        [ 0.6567, -1.3923, -1.4814, -0.0067,  0.5334],
        [-0.8200,  1.6313, -0.0149, -0.8094, -0.0763]], device='cuda:0',
       requires_grad=True)

In [35]:
def check_accuracy(loader, model_fn, params, device, dtype):
    """
    Check the accuracy of a classification model.
    
    Inputs:
    - loader: A DataLoader for the data split we want to check
    - model_fn: A function that performs the forward pass of the model,
      with the signature scores = model_fn(x, params)
    - params: List of PyTorch Tensors giving parameters of the model
    
    Returns: Nothing, but prints the accuracy of the model
    """
    split = 'val' if loader.dataset.train else 'test'
    print('Checking accuracy on the %s set' % split)
    num_correct, num_samples = 0, 0
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)  
            y = y.to(device=device, dtype=torch.int64)
            scores = model_fn(x, params)
            _, preds = scores.max(1)
            num_correct += (preds == y).sum()
            num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))

In [36]:
def train_model(model_fn, params, learning_rate, device, dtype):
    """
    Train a model on CIFAR-10.
    
    Inputs:
    - model_fn: A Python function that performs the forward pass of the model.
      It should have the signature scores = model_fn(x, params) where x is a
      PyTorch Tensor of image data, params is a list of PyTorch Tensors giving
      model weights, and scores is a PyTorch Tensor of shape (N, C) giving
      scores for the elements in x.
    - params: List of PyTorch Tensors giving weights for the model
    - learning_rate: Python scalar giving the learning rate to use for SGD
    
    Returns: Nothing
    """
    for t, (x, y) in enumerate(loader_train):
        # Move the data to the proper device (GPU or CPU)
        x = x.to(device=device, dtype=dtype)
        y = y.to(device=device, dtype=torch.long)

        # Forward pass: compute scores and loss
        scores = model_fn(x, params)
        loss = F.cross_entropy(scores, y)

        loss.backward()

        # Update parameters. We don't want to backpropagate through the
        # parameter updates, so we scope the updates under a torch.no_grad()
        # context manager to prevent a computational graph from being built.
        with torch.no_grad():
            for w in params:
                w -= learning_rate * w.grad

                # Manually zero the gradients after running the backward pass
                w.grad.zero_()

        if t % print_every == 0:
            print('Iteration %d, loss = %.4f' % (t, loss.item()))
            check_accuracy(loader_val, model_fn, params, device, dtype)
            print()

### Training the Models

In [42]:
# Training the Two-Layer Net (No hyperparameter tuning)

hidden_layer_size = 4000
learning_rate = 1e-2

# We need to explicitly allocate tensors for the fully connected weights, `w1` and `w2`
w1 = random_weight((3 * 32 * 32, hidden_layer_size), device, dtype)
w2 = random_weight((hidden_layer_size, 10), device, dtype)

train_model(two_layer_fc, [w1, w2], learning_rate, device, dtype)

Iteration 0, loss = 3.7549
Checking accuracy on the val set
Got 140 / 1000 correct (14.00%)

Iteration 100, loss = 2.2964
Checking accuracy on the val set
Got 367 / 1000 correct (36.70%)

Iteration 200, loss = 1.9493
Checking accuracy on the val set
Got 370 / 1000 correct (37.00%)

Iteration 300, loss = 1.7003
Checking accuracy on the val set
Got 409 / 1000 correct (40.90%)

Iteration 400, loss = 2.1938
Checking accuracy on the val set
Got 352 / 1000 correct (35.20%)

Iteration 500, loss = 1.8675
Checking accuracy on the val set
Got 433 / 1000 correct (43.30%)

Iteration 600, loss = 1.9911
Checking accuracy on the val set
Got 391 / 1000 correct (39.10%)

Iteration 700, loss = 1.5363
Checking accuracy on the val set
Got 432 / 1000 correct (43.20%)



In [41]:
# Training the Three-Layer ConvNet (No hyperparameter tuning)

# 1. Convolutional layer (with bias) with 32 5x5 filters, with zero-padding of 2
# 2. ReLU
# 3. Convolutional layer (with bias) with 16 3x3 filters, with zero-padding of 1
# 4. ReLU
# 5. Fully-connected layer (with bias) to compute scores for 10 classes

learning_rate = 3e-3

channel_1 = 32
channel_2 = 16

conv_w1 = random_weight((channel_1, 3, 5, 5), device, dtype)
conv_b1 = random_weight((channel_1,), device, dtype)
conv_w2 = random_weight((channel_2, channel_1, 3, 3), device, dtype)
conv_b2 = random_weight((channel_2,), device, dtype)
fc_w = random_weight((channel_2*32*32, 10), device, dtype)
fc_b = random_weight((10,), device, dtype)

params = [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b]
train_model(three_layer_convnet, params, learning_rate, device, dtype)

Iteration 0, loss = 5.0166
Checking accuracy on the val set
Got 124 / 1000 correct (12.40%)

Iteration 100, loss = 2.0025
Checking accuracy on the val set
Got 351 / 1000 correct (35.10%)

Iteration 200, loss = 1.5985
Checking accuracy on the val set
Got 402 / 1000 correct (40.20%)

Iteration 300, loss = 1.4957
Checking accuracy on the val set
Got 436 / 1000 correct (43.60%)

Iteration 400, loss = 1.4707
Checking accuracy on the val set
Got 448 / 1000 correct (44.80%)

Iteration 500, loss = 1.6312
Checking accuracy on the val set
Got 460 / 1000 correct (46.00%)

Iteration 600, loss = 1.5946
Checking accuracy on the val set
Got 463 / 1000 correct (46.30%)

Iteration 700, loss = 1.4478
Checking accuracy on the val set
Got 477 / 1000 correct (47.70%)



## Pytorch Module Models

### Two-Layer Network

In [44]:
class TwoLayerFC(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # assign layer objects to class attributes
        self.fc1 = nn.Linear(input_size, hidden_size)
        # nn.init package contains convenient initialization methods
        # http://pytorch.org/docs/master/nn.html#torch-nn-init 
        nn.init.kaiming_normal_(self.fc1.weight)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        nn.init.kaiming_normal_(self.fc2.weight)
    
    def forward(self, x):
        # forward always defines connectivity
        x = flatten(x)
        scores = self.fc2(F.relu(self.fc1(x)))
        return scores

In [45]:
# Test Output (should be [64, 10])

input_size = 50
x = torch.zeros((64, input_size), dtype=dtype)  # Minibatch size 64, feature dimension 50
model = TwoLayerFC(input_size, 42, 10)
scores = model(x)
print(scores.size())  

torch.Size([64, 10])


### Three-Layer Convolutional Network

In [48]:
class ThreeLayerConvNet(nn.Module):
    def __init__(self, in_channel, channel_1, channel_2, num_classes):
        """
        1. Convolutional layer with `channel_1` 5x5 filters with zero-padding of 2
        2. ReLU
        3. Convolutional layer with `channel_2` 3x3 filters with zero-padding of 1
        4. ReLU
        5. Fully-connected layer to `num_classes` classes
        """
        
        super().__init__()
       
        self.fc1 = nn.Conv2d(in_channels=3, out_channels=channel_1, kernel_size=5, padding=2)
        nn.init.kaiming_normal_(self.fc1.weight) # Same initialization as before
        self.fc2 = nn.Conv2d(in_channels=channel_1, out_channels=channel_2, kernel_size=3, padding=1)
        nn.init.kaiming_normal_(self.fc2.weight)        
        self.fc3 = nn.Linear(channel_2*32*32, num_classes)
        nn.init.kaiming_normal_(self.fc3.weight) 

    def forward(self, x):
        scores = None
        
        x = F.relu(self.fc2(F.relu(self.fc1(x))))
        x = flatten(x)
        scores = self.fc3(x)
        
        return scores

In [49]:
# Test Output (should be [64, 10])

x = torch.zeros((64, 3, 32, 32), dtype=dtype)  # minibatch size 64, image size [3, 32, 32]
model = ThreeLayerConvNet(in_channel=3, channel_1=12, channel_2=8, num_classes=10)
scores = model(x)
print(scores.size())

torch.Size([64, 10])


### Util Functions

In [51]:
def check_accuracy_module(loader, model, device, dtype):
    if loader.dataset.train:
        print('Checking accuracy on validation set')
    else:
        print('Checking accuracy on test set')   
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)
            scores = model(x)
            _, preds = scores.max(1)
            num_correct += (preds == y).sum()
            num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))

In [53]:
def train_model_module(model, optimizer, device, dtype, epochs=1):
    """
    Train a model on CIFAR-10 using the PyTorch Module API.
    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    
    Returns: Nothing, but prints model accuracies during training.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        for t, (x, y) in enumerate(loader_train):
            model.train()  # put model to training mode
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            optimizer.zero_grad()

            loss.backward()

            optimizer.step()

            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))
                check_accuracy_module(loader_val, model, device, dtype)
                print()

### Training the Models

In [54]:
# Training the Two-Layer Net (No hyperparameter tuning)

hidden_layer_size = 4000
learning_rate = 1e-2
model = TwoLayerFC(3 * 32 * 32, hidden_layer_size, 10)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

train_model_module(model, optimizer, device, dtype)

Iteration 0, loss = 3.6557
Checking accuracy on validation set
Got 160 / 1000 correct (16.00)

Iteration 100, loss = 2.3441
Checking accuracy on validation set
Got 313 / 1000 correct (31.30)

Iteration 200, loss = 2.1471
Checking accuracy on validation set
Got 371 / 1000 correct (37.10)

Iteration 300, loss = 2.3162
Checking accuracy on validation set
Got 382 / 1000 correct (38.20)

Iteration 400, loss = 2.0473
Checking accuracy on validation set
Got 381 / 1000 correct (38.10)

Iteration 500, loss = 1.6748
Checking accuracy on validation set
Got 452 / 1000 correct (45.20)

Iteration 600, loss = 1.7866
Checking accuracy on validation set
Got 435 / 1000 correct (43.50)

Iteration 700, loss = 1.8146
Checking accuracy on validation set
Got 373 / 1000 correct (37.30)



In [55]:
# Training the Three-Layer ConvNet (No hyperparameter tuning)

learning_rate = 3e-3
channel_1 = 32
channel_2 = 16

model = None
optimizer = None

model = ThreeLayerConvNet(in_channel=3, channel_1=channel_1, channel_2=channel_2, num_classes=10)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)


train_model_module(model, optimizer, device, dtype)

Iteration 0, loss = 3.7216
Checking accuracy on validation set
Got 113 / 1000 correct (11.30)

Iteration 100, loss = 1.9634
Checking accuracy on validation set
Got 335 / 1000 correct (33.50)

Iteration 200, loss = 1.8439
Checking accuracy on validation set
Got 400 / 1000 correct (40.00)

Iteration 300, loss = 1.7141
Checking accuracy on validation set
Got 421 / 1000 correct (42.10)

Iteration 400, loss = 1.7226
Checking accuracy on validation set
Got 436 / 1000 correct (43.60)

Iteration 500, loss = 1.9288
Checking accuracy on validation set
Got 453 / 1000 correct (45.30)

Iteration 600, loss = 1.7896
Checking accuracy on validation set
Got 465 / 1000 correct (46.50)

Iteration 700, loss = 1.5000
Checking accuracy on validation set
Got 448 / 1000 correct (44.80)



### Sequential Model

In [56]:
# We need to wrap `flatten` function in a module in order to stack it in nn.Sequential
class Flatten(nn.Module):
    def forward(self, x):
        return flatten(x)
    
channel_1 = 32
channel_2 = 16
learning_rate = 1e-2

model = None
optimizer = None

# Same Three-Layer ConvNet Model
model = nn.Sequential(nn.Conv2d(in_channels=3, out_channels=channel_1, kernel_size=5, padding=2), 
                      nn.ReLU(), 
                      nn.Conv2d(in_channels=channel_1, out_channels=channel_2, kernel_size=3, padding=1),
                      nn.ReLU(), 
                      Flatten(),
                      nn.Linear(channel_2*32*32, 10)
        )
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

train_model_module(model, optimizer, device, dtype)

Iteration 0, loss = 2.2929
Checking accuracy on validation set
Got 120 / 1000 correct (12.00)

Iteration 100, loss = 2.0204
Checking accuracy on validation set
Got 334 / 1000 correct (33.40)

Iteration 200, loss = 1.6405
Checking accuracy on validation set
Got 409 / 1000 correct (40.90)

Iteration 300, loss = 1.4548
Checking accuracy on validation set
Got 426 / 1000 correct (42.60)

Iteration 400, loss = 1.5407
Checking accuracy on validation set
Got 476 / 1000 correct (47.60)

Iteration 500, loss = 1.5166
Checking accuracy on validation set
Got 492 / 1000 correct (49.20)

Iteration 600, loss = 1.6643
Checking accuracy on validation set
Got 503 / 1000 correct (50.30)

Iteration 700, loss = 1.5494
Checking accuracy on validation set
Got 519 / 1000 correct (51.90)

