# CS-GY 9223-D: Deep Learning Homework 1
Due on Tuesday, 19th February 2019, 11:55 PM

This homework can be done in pairs. Everyone must submit on NYU Classes individually.

Write down the UNIs (NetIDs) of your group (if applicable)

Member 1: Matthew Avallone, mva271

Member 2: Victor Zheng, vz365

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision.transforms import transforms
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

## Data Loading

In [0]:
import torch
import torchvision
from torchvision.transforms import transforms

Downloading the Datasets

In [0]:
valid_size = 0.1 # 10% of training data

train_transform = transforms.Compose(
    [transforms.RandomCrop(size=[32,32], padding=4), # adding some data augmentation
     transforms.RandomHorizontalFlip(), transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=train_transform)
validset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)



Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


Loading the Datasets

In [0]:
from torch.utils.data import DataLoader

num_train = len(trainset)
indices = list(range(num_train))
split = int(np.floor(valid_size * num_train))
np.random.shuffle(indices) # selecting random examples for training/validation

train_idx, valid_idx = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

trainloader = DataLoader(trainset,batch_size=64, sampler=train_sampler)
validloader = DataLoader(validset,batch_size=64, sampler=valid_sampler)
testloader = DataLoader(testset, batch_size=64, shuffle=False)
print(len(trainloader.dataset), len(validloader.dataset)*valid_size, len(testloader.dataset))

50000 5000.0 10000


## Model Architecture

In [0]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        #network architecture
        #Input is originally 32x32 RGB images (3 channels)
        
#         self.conv1 = nn.Conv2d(3, 12, 3, stride=1, padding=1)
#         self.conv2 = nn.Conv2d(12, 12, 3, stride=1, padding=1)
#         self.conv3 = nn.Conv2d(12, 24, 3, stride=1, padding=1)
#         self.conv4 = nn.Conv2d(24, 24, 3, stride=1, padding=1)
#         self.pool = nn.MaxPool2d(kernel_size=2)
#         self.dp = nn.Dropout(p=0.5)
#         self.bn1 = nn.BatchNorm2d(12)
#         self.bn2 = nn.BatchNorm2d(24)
#         self.activ = nn.ReLU()          
#         self.fc = nn.Linear(16 * 16 * 24, 10)
#       Could not exceed 70% test accuracy --> added more layers, filters and regularization

        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv4 = nn.Conv2d(64, 64, 3, padding=1)   
        self.conv5 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv6 = nn.Conv2d(128, 128, 3, padding=1)
        
        self.pool = nn.MaxPool2d(kernel_size=2)
        
        self.dp1 = nn.Dropout(p=0.2)
        self.dp2 = nn.Dropout(p=0.5) 
        
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        
        self.activ = nn.ReLU()       
        
        self.fc = nn.Linear(4 * 4 * 128, 10)
        
    def forward(self,x):
        #forward pass
        #x is the input
        x = self.conv1(x)
        x = self.activ(x)
#         x = self.bn1(x) batch norm layers caused dropoff in valid accuracy
        x = self.conv2(x)
        x = self.activ(x)
        x = self.bn1(x)        
        x = self.pool(x)
        x = self.dp1(x)

        x = self.conv3(x)
        x = self.activ(x)
#         x = self.bn2(x)
        x = self.conv4(x)
        x = self.activ(x)
        x = self.bn2(x)
        x = self.pool(x)
        x = self.dp1(x)

        x = self.conv5(x)
        x = self.activ(x)
#         x = self.bn3(x)
        x = self.conv6(x)
        x = self.activ(x)
        x = self.bn3(x)
        x = self.pool(x)
        
        x = x.view(-1, 4 * 4 * 128)

        x = self.fc(x)
#         x = self.dp2(x) training accuracy could not exceed ~40% when used

        return F.log_softmax(x)


## **Defining the Optimizer and Loss Function**

In [0]:
model = CNN()
model.cuda() # Moving the CNN onto a GPU

# loss function
loss_function = nn.CrossEntropyLoss()
 
#optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.00001) # generated better training accuracy than SGD and RMSprop
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# optimizer = optim.RMSprop(model.parameters(), lr=0.001, weight_decay=1e-6)

# adaptive learning rate
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.3) # adaptive step size (lr=0.0003 when we reach epoch 25)

## **Training the Model**

In [0]:
batch_size = 64
num_epochs = 50
model.train()

i = 0
for epoch in range(num_epochs):
    running_loss = 0.0

    for X, y in trainloader:

        #get the inputs
        X, y = X.cuda(), y.cuda()

        #please ensure that the data is in the appropriate tensor form as required.
        # zero the parameter gradients
        # forward pass
        # backward pass
        # optimize the weights
        # print statistics during training
        
        optimizer.zero_grad()
        output = model(X)
        loss = loss_function(output, y)
        loss.backward()
        running_loss = loss.item()
        optimizer.step()
        y_hat = output.data.max(1)[1]
        accuracy = np.sum(y_hat.cpu().numpy()==y.cpu().numpy()) / batch_size*100
        
        
        if i % 700 == 0:
            model.eval()
            correct = 0.0
            for images, labels in validloader:
                with torch.no_grad():
                    images, labels = images.cuda(), labels.cuda()
                    # Predict classes using images from the test set
                    outputs = model(images)
                    y_pred = outputs.data.max(1)[1]
                    correct += y_pred.eq(labels.data).sum()
                    
            print('Epoch: {}\tTrain Step: {}\tloss: {:.3f}\tTrain Accuracy: {:.1f}\tValid Accuracy: {:.1f}'.format(epoch, i, loss.item(), accuracy, 100.0*correct / (len(validloader.dataset)*valid_size)))
            model.train()

        i+= 1





Epoch: 0	Train Step: 0	loss: 2.381	Train Accuracy: 15.6	Valid Accuracy: 9.0
Epoch: 0	Train Step: 700	loss: 0.975	Train Accuracy: 57.8	Valid Accuracy: 64.0
Epoch: 1	Train Step: 1400	loss: 0.726	Train Accuracy: 70.3	Valid Accuracy: 72.0
Epoch: 2	Train Step: 2100	loss: 0.869	Train Accuracy: 71.9	Valid Accuracy: 73.0
Epoch: 3	Train Step: 2800	loss: 0.832	Train Accuracy: 73.4	Valid Accuracy: 77.0
Epoch: 4	Train Step: 3500	loss: 0.933	Train Accuracy: 59.4	Valid Accuracy: 79.0
Epoch: 5	Train Step: 4200	loss: 0.827	Train Accuracy: 76.6	Valid Accuracy: 80.0
Epoch: 6	Train Step: 4900	loss: 0.640	Train Accuracy: 75.0	Valid Accuracy: 80.0
Epoch: 7	Train Step: 5600	loss: 0.776	Train Accuracy: 73.4	Valid Accuracy: 81.0
Epoch: 8	Train Step: 6300	loss: 0.750	Train Accuracy: 73.4	Valid Accuracy: 82.0
Epoch: 9	Train Step: 7000	loss: 0.468	Train Accuracy: 79.7	Valid Accuracy: 82.0
Epoch: 10	Train Step: 7700	loss: 0.639	Train Accuracy: 79.7	Valid Accuracy: 84.0
Epoch: 11	Train Step: 8400	loss: 0.432	Train

## **Testing The Model**

In [0]:
model.eval()
correct = 0.0
test_pred = torch.LongTensor().cuda()
        
for X, y in testloader:
    with torch.no_grad():
        X, y = X.cuda(), y.cuda()
        # Predict classes using images from the test set
        outputs = model(X)
        y_pred = outputs.data.max(1)[1]
        test_pred = torch.cat((test_pred, y_pred), dim=0)
      
        correct += y_pred.eq(y.data).sum()


ans2 = test_pred.cpu().numpy()
print(ans2.shape)

# saving the test results
np.save('./ans2-mva271-vz365', ans2)
print('Test Accuracy: {:.1f}%'.format(100.0*correct / len(testloader.dataset)))

#predict on the test data
#save the predictions to ans2-uni.py



(10000,)
Test Accuracy: 88.0%


## **Discussion**

**Model Architecture** 

The optimal model presented needed 6 convolutional layers, 6 activation layers, 1 fully-connected layer and 1 max-pooling layer. At first, we tried using only 4 convolutional/activation layers (with the rest the same) and could not surpase 70% on the validation data. The activation function was chosen to be ReLU since this is most commonly used with good results. Regularization was also used in the form of 3 instances of batch normalization and 2 instances of droput. We experimented with adding more regularization (commented out in the code) but found that the combination used provided the best results. A log-softmax function is used in the end since the model returns a probalitily for each class and adding a log function to the softmax makes the calculation more stable (turns the division operation inside softmax into subtraction).

**Optimizer and Loss Function**

The loss function chosen was categorical cross-entropy, since this is commonly used for multiclass classification. Three optimizers were tried, with Adam and RMSprop showing similar and superior results than SGD. We ultimately decided to go with Adam as it increased validation accuracy by 1% over RMSprop. The learning rate was set to default value and weight decay was added as it was seen in several online examples. A weight decay value of 1e-5 and 1e-6 were attempted with no obvious performance differences between the two. An adaptive learning rate was implemented to help the model converge at higher epochs. The step size factor, gamma, was chosen to be 0.3 (next_lr = 30% of current_lr) randomly and seemed to yield sufficient results. We also tried gamma=0.1 and found slight decrease in validation accuracy. 

**Data**

The input data was split into 3 sets: training, validation and test. The validation set was selected randomly from 10% of the training set. The training data was augmented with random cropping and horizontal flipping and all of the data was normalized. This was done to increase the number of examples and helped improve the accuracy of the model by about 20%. The data was broken down into batches of 64 examples, with 50 epochs performed. More epochs were attempted with no improvement to the accuracy.

