In [4]:
!bash ../download_data.sh

--2020-01-12 11:36:37--  https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 170498071 (163M) [application/x-gzip]
Saving to: ‘data/input_data.tar.gz’


2020-01-12 11:36:41 (50.6 MB/s) - ‘data/input_data.tar.gz’ saved [170498071/170498071]

cifar-10-batches-py/
cifar-10-batches-py/data_batch_4
cifar-10-batches-py/readme.html
cifar-10-batches-py/test_batch
cifar-10-batches-py/data_batch_3
cifar-10-batches-py/batches.meta
cifar-10-batches-py/data_batch_2
cifar-10-batches-py/data_batch_5
cifar-10-batches-py/data_batch_1


In [1]:
import sys
sys.path.append('..')

In [2]:
%matplotlib inline 
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from functools import reduce
import torch.optim as optim
from torch.utils.data import TensorDataset
from matplotlib import pyplot as plt

## Preprocessing Modules

* DataLoader: Class that can be run to load the training data for Cifar-10 and runs preprocessing + data augmentation


        argument
            input_dir - directory of your inputs
            data_augmentation_functions - a list of functions that can be applied to an individual image
                                          for augmentation. if left blank, images won't be augmented
            peprocessing_functions - a list of functions that can be applied to an entire
                                     np array of images that can be used for dataprep and will
                                     be run on both the TRAIN and TEST SET, if left blank
                                     images won't be augmented
        returns
            torch datasets for:
                training data
                test data
                label encodings
        usage:
            data_loader = DataLoader(input_dir, data_augmentation_functions, preprocessing_functions)
            train_data, test_data, label_encodings = data_loader.execute()

* Transformation Functions: a series of simple data transformation functions that I implemented
  * In this case I include:
    * min_max_scaling
    * random rotation
    * horizontal flip


In [3]:
from preprocessing.DataLoader import DataLoader 
from preprocessing.TransformationFunctions import min_max_scaling, rotate_random, flip_horizontal 

## Model Training Functions
* I'm too lazy to explain this rn but I'll add docs later


In [4]:
def plot_evaluation_stats(losses):
    """
        prints out the train vs validation loss, error rate, and accuracy over time
        --- this might only print out properly on google collab
        argument - 
            losses: pd dataframe of evaluation stats at each epoch
        returns
            none
    """
    losses[['train_loss', 'validation_loss']].plot(kind='line', title='train vs validation loss by epoch')

    #it can be more interpretable to look at error rate, as opposed to accuracy to compare with the loss values 
    losses['error rate'] = 1 - losses['accuracy']
   
    losses[['error rate']].plot(kind='line', title='error rate by epoch')
    losses[['accuracy']].plot(kind='line', title='accuracy by epoch')

def train_model(net, 
                train_data, 
                test_data,
                batch_size = 128,
                n_epochs = 20, 
                learning_rate = .001, 
                weight_decay = None,
                optim_method = optim.SGD,
                cuda = False
                ):
    """
        trains a pytorch model and returns a dataframe of the training_loss,
        validation_loss, and accuracy at each epoch

        arguments - 
            net: pytorch nn object
            train_data: training data
            test_data: test data
            batch_size: batch size 
            n_epochs: number of epochs
            learning_rate: learning rate for optimizer
            optim_method: optimizer function 
        returns
            training loss: list of training loss at each epoch
            validation loss: list of validation loss at each epoch
            validation accuracy: list of accuracy on the validation set 


    """
    if cuda:
        net.cuda()
    
    training_loss, validation_loss, accuracy_list = [], [], []
    
    train_loader = get_data_loader(train_data, batch_size)
    test_loader = get_data_loader(test_data, batch_size)

    n_batches = len(train_loader)
    loss, optimizer = get_loss_and_optimizer(net, learning_rate, optim_method, weight_decay)
    
    for epoch in range(n_epochs):
        running_loss = 0
        total_train_loss = 0
        val_loss = 0

        for i, data in enumerate(train_loader, 0):
            
            
            inputs, labels = data
  
  
            if cuda:
                inputs, labels = inputs.cuda(), labels.cuda()
            
            optimizer.zero_grad()
            outputs = net(inputs)
            loss_size = loss(outputs, labels)
            loss_size.backward()
            optimizer.step()
            
            
            running_loss += loss_size.data.item() * batch_size
            total_train_loss += loss_size.data.item() 
            
            if i % n_batches//4 == 0 and i != 0: 
                print('Iteration {0}: \n  \
                       Running Loss Is {1}'
                      .format(i, running_loss/(100 * batch_size)))
                running_loss = 0
            
        accuracy = 0 
        with torch.no_grad():
          for inputs, labels in test_loader: 
              if cuda:   
                  inputs, labels = inputs.cuda(), labels.cuda()
              val_outputs = net(inputs)
              val_loss += loss(val_outputs, labels).data.item() 
              accuracy += (torch.max(val_outputs, 1).indices == labels).sum().item()

        accuracy = accuracy/len(test_data)
        train_loss = total_train_loss/(len(train_data)/batch_size)
        val_loss = val_loss/(len(test_data)/batch_size)
        print(val_loss)
        print("Epoch {0}, \n \
                Train Loss: {1} \n \
                Validation Loss: {2} \
                Accuracy: {3}".format(str(epoch), train_loss, val_loss, accuracy))
        accuracy_list.append(accuracy)
        validation_loss.append(val_loss)
        training_loss.append(train_loss)
        running_loss = 0

                

    return pd.DataFrame(data=[training_loss, validation_loss, accuracy_list],
                        index=['train_loss', 'validation_loss', 'accuracy']).T




def get_loss_and_optimizer(architecture, learning_rate, optim_method, weight_decay):
    """
        returns the loss and optimizer objects for pytorch to use in the model

        arugment - 
            architecute: pytorch nn object
            learning_rate: learning rate
            optim_method: optimizer function
        return
            loss: loss object
            optimizer: optimizer object
    """
    loss = torch.nn.CrossEntropyLoss()
    
    optimizer = optim_method(architecture.parameters(), lr = learning_rate, weight_decay=0)

    return loss, optimizer

def get_data_loader(dataset, batch_size):
    """
        converts a dataset into a data loader object
        argument - 
            dataset: pytorch dataset object
            batch_size: batch_size
        return 
            data loader object
    """
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

## Improved Cifar-10 Model

* From the graph above, we can see that our vanilla convnet has overfit on the training data. To fix this, we will modify the preprocessing of our data and change our model architecture + optimization function. 

### Preprocessing changes
1. Data augmentation functions to add more training data
2. Normalization to scale the training data

### Model architecture changes
1. Batch Normalization
  * Normalizes each channel of the batch, generally you do this before a point of nonlinearity. 
2. Dropout
  * Randomly zeros out weights in training to keep certain features from getting too large, forcing the model to learn from different sets of features. I will be using this on the feed forward layers of the neural network. 

(Additionally, I utilized the nn.Sequential to make the model more architecture easier to read)

### Notes
For future improvement I can look into
* Additional preprocessing functions
  * tensorflow/pytorch have an extension CV library that can be used to improve performance.
* Variable learning rate, such as utilizing the adam optimizer or a circular learning rate
* Hyperparameter Tuning
  * A simple way to do this is to use a randomized grid search + cross validation and a fair amount of compute. If I were looking to go further I could look into bayesian or bandit based tuning methods.


In [5]:
class CNNNorm(torch.nn.Module):
    def __init__(self, num_classes=10):
        super(CNNNorm, self).__init__()


        self.conv1 = nn.Sequential(nn.Conv2d(3, 32, 3),
                                   nn.BatchNorm2d(32),
                                   nn.ReLU(True),
                                   nn.Conv2d(32, 64, 3),
                                   nn.BatchNorm2d(64),
                                   nn.ReLU(True))
        
        self.conv2 = nn.Sequential(nn.Conv2d(64, 128, 3),
                                   nn.BatchNorm2d(128),
                                   nn.ReLU(True),
                                   nn.Conv2d(128, 128, 3),
                                   nn.BatchNorm2d(128),
                                   nn.ReLU(True),
                                   )
          
        self.conv3 = nn.Sequential(
                                   nn.Conv2d(128, 256, 3),
                                   nn.BatchNorm2d(256),
                                   nn.ReLU(True),
                                   nn.Conv2d(256, 256, 3),
                                   nn.BatchNorm2d(256),
                                   nn.ReLU(True),
                                   )
          
        self.fc = nn.Sequential(
                                nn.Linear(1024, 1024),
                                nn.Dropout(p=0.5),
                                nn.Linear(1024, 1024),
                                nn.Dropout(p=0.5),
                                nn.Linear(1024, 256),
                                nn.Dropout(p=0.5),
                                nn.Linear(256, 10)
                                )

                                     
        
        self.pool = nn.MaxPool2d(2,2, padding=1)
        
   
        
    def forward(self, x):

        x = self.pool(self.conv1(x))
        x = self.pool(self.conv2(x))
        x = self.pool(self.conv3(x))

        x = x.view(x.size(0), -1)
        x = self.fc(x)

        # Don't need to run (f.softMax because BCEloss will run softmax while calculating)
        return x
    


In [None]:
data_loader = DataLoader('../data/cifar-10-batches-py/', [flip_horizontal])
train, test, labels = data_loader.execute()

model = CNNNorm()

refined_convnet_params = {'batch_size': 20,
                          'n_epochs': 50, 
                          'learning_rate': .001,
                          'optim_method': optim.AdamW,
                          'weight_decay': 5e-6
                          }
      
losses = train_model(model, train, test, **refined_convnet_params)




Iteration 1: 
                         Running Loss Is 0.04858360528945923
Iteration 2: 
                         Running Loss Is 0.021649088859558106
Iteration 3: 
                         Running Loss Is 0.023021183013916015
