# Two-layer neural network to identify digits

In [1]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from torch.utils.data import Dataset
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Load the MNIST train and test splits.
#
# transforms.Compose chains several transforms into one callable:
#   ToTensor()  - convert the image to a torch tensor
#   Normalize() - normalise with the given mean and std
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

train_ds = datasets.MNIST('../data', train=True, download=True,
                          transform=mnist_transform)
test_ds = datasets.MNIST('../data', train=False, download=True,
                         transform=mnist_transform)

In [5]:
# Settings shared by the train and test loaders.
batch_size = 64  # drop to something small (e.g. 5) for quick testing
kwargs = dict(
    num_workers=1,    # how many subprocesses to use for data loading
    pin_memory=True,  # copy tensors into CUDA pinned memory before returning them
)

# Only the training split is shuffled; the test split keeps its stored order.
train_loader = torch.utils.data.DataLoader(
    train_ds, batch_size=batch_size, shuffle=True, **kwargs
)
test_loader = torch.utils.data.DataLoader(
    test_ds, batch_size=batch_size, shuffle=False, **kwargs
)

In [83]:
def show(img, title=None):
    """Render `img` in grayscale without interpolation.

    A title is added to the current axes only when `title` is not None.
    """
    plt.imshow(img, cmap="gray", interpolation='none')
    if title is None:
        return
    plt.title(title)

In [84]:
# define the architecture
def get_model(M = 300):
    """Build the two-layer fully connected classifier.

    Architecture: Linear(28*28 -> M) -> ReLU -> Linear(M -> 10).

    Args:
        M: number of units in the hidden layer.

    Returns:
        An nn.Sequential placed on the GPU when CUDA is available, otherwise
        on the CPU (the original unconditional `.cuda()` crashed on machines
        without CUDA).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = nn.Sequential(nn.Linear(28*28, M),
                        nn.ReLU(),
                        nn.Linear(M, 10))
    return net.to(device)

In [9]:
def train_model(train_loader, test_loader, num_epochs, model, optimizer):
    """Train `model` for `num_epochs` epochs, evaluating after every epoch.

    Each batch is flattened to rows of 28*28 values, moved to the device the
    model's parameters live on, and optimised against the cross-entropy loss.
    After every epoch the sample-weighted mean training loss and the metrics
    returned by `model_accuracy_loss(model, test_loader)` are printed.

    Args:
        train_loader: iterable of (images, labels) batches used for training.
        test_loader: iterable of (images, labels) batches used for evaluation.
        num_epochs: number of passes over `train_loader`; must be >= 1.
        model: the network to train (updated in place).
        optimizer: optimizer built over `model`'s parameters.

    Returns:
        (val_acc, val_loss, train_loss) from the final epoch.

    Raises:
        ValueError: if `num_epochs` is less than 1 (there would be no metrics
            to return).
    """
    if num_epochs < 1:
        raise ValueError('num_epochs must be at least 1, got %r' % (num_epochs,))

    # Follow the model's device instead of hard-coding .cuda(), so the loop
    # works for both CPU and GPU models.
    device = next(model.parameters()).device

    for epoch in range(num_epochs):
        # Re-enter training mode every epoch: the evaluation at the end of the
        # previous epoch may have switched the model to eval mode, which
        # matters for layers such as dropout.
        model.train()
        sum_loss = 0.0
        total = 0
        for images, labels in train_loader:
            batch = images.shape[0]  # size of the batch
            images = images.view(-1, 28*28).to(device)
            labels = labels.to(device)

            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = model(images)
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            total += batch
            # .item() extracts the Python float; indexing a 0-dim tensor with
            # [0] (the old `loss.data[0]`) raises on current PyTorch.
            sum_loss += batch * loss.item()

        train_loss = sum_loss/total
        print('Epoch [%d/%d], Loss: %.4f' %(epoch+1, num_epochs, train_loss))
        val_acc, val_loss = model_accuracy_loss(model, test_loader)
        print('Epoch [%d/%d], Valid Accuracy: %.4f, Valid Loss: %.4f' %(epoch+1, num_epochs, val_acc, val_loss))
    return val_acc, val_loss, train_loss

In [7]:
def model_accuracy_loss(model, test_loader):
    """Evaluate `model` on every batch of `test_loader`.

    The model is switched to eval mode (and left in it). Each batch is
    flattened to rows of 28*28 values and moved to the device the model's
    parameters live on. Gradient tracking is disabled during evaluation.

    Args:
        model: the network to evaluate.
        test_loader: iterable of (images, labels) batches.

    Returns:
        (accuracy, loss): accuracy as a percentage in [0, 100] and the
        sample-weighted mean cross-entropy loss, both as Python floats.

    Raises:
        ValueError: if `test_loader` yields no samples.
    """
    model.eval()
    device = next(model.parameters()).device
    correct = 0
    sum_loss = 0.0
    total = 0
    # No gradients are needed for evaluation; skipping them avoids building
    # autograd graphs for every batch.
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.view(-1, 28*28).to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, pred = torch.max(outputs, 1)  # index of the largest logit per row
            loss = F.cross_entropy(outputs, labels)
            n = labels.size(0)
            # Weight by batch size so a smaller final batch is not over-counted.
            sum_loss += n * loss.item()
            total += n
            # .item() keeps `correct` a Python int, so the percentage below is
            # a true float rather than an integer tensor.
            correct += pred.eq(labels).sum().item()
    if total == 0:
        raise ValueError('test_loader yielded no samples')
    return 100.0 * correct / total, sum_loss / total

# Model performance under changes to the hyperparameters of the neural net

### Changing learning rate

In [None]:
# Coarse learning-rate sweep: a fresh model is trained for 10 epochs with Adam
# at each rate, and the final-epoch validation accuracy is recorded.
learning_rates = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
val_accuracy = []
for learning_rate in learning_rates:
    print('learning rate: '+str(learning_rate))
    net = get_model()
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)
    final_metrics = train_model(train_loader, test_loader, num_epochs=10,
                                model=net, optimizer=optimizer)
    val_accuracy.append(final_metrics[0])  # (val_acc, val_loss, train_loss)

In [47]:
# Results: validation accuracy for each learning rate in the coarse sweep.
coarse_lr_results = {
    'learning_rate': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
    'validation_accuracy': val_accuracy,
}
pd.DataFrame(coarse_lr_results)

Unnamed: 0,learning_rate,validation_accuracy
0,1.0,9.86
1,0.1,12.68
2,0.01,95.68
3,0.001,97.68
4,0.0001,97.55
5,1e-05,92.89


In [None]:
# Finer learning-rate sweep between 0.0001 and 0.001, again training a fresh
# model for 10 epochs with Adam at each rate.
learning_rates = [0.0003, 0.0005, 0.0007, 0.0009]
val_accuracy = []
for learning_rate in learning_rates:
    print('learning rate: '+str(learning_rate))
    net = get_model()
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)
    final_metrics = train_model(train_loader, test_loader, num_epochs=10,
                                model=net, optimizer=optimizer)
    val_accuracy.append(final_metrics[0])  # (val_acc, val_loss, train_loss)

In [49]:
# Results: validation accuracy for each learning rate in the finer sweep.
fine_lr_results = {
    'learning_rate': [0.0003, 0.0005, 0.0007, 0.0009],
    'validation_accuracy': val_accuracy,
}
pd.DataFrame(fine_lr_results)

Unnamed: 0,learning_rate,validation_accuracy
0,0.0003,98.09
1,0.0005,98.0
2,0.0007,98.0
3,0.0009,97.86


The best validation accuracy (98.09) is obtained with a learning rate of 0.0003.

### Number of neurons in the hidden layer

In [None]:
# Hidden-layer width sweep at a fixed learning rate: one fresh model per width,
# 10 epochs of Adam each, recording the final-epoch validation accuracy.
M = [10, 50, 100, 300, 1000, 2000]
learning_rate = 0.01
val_accuracy = []
for hidden_size in M:
    print('hidden layer size: '+str(hidden_size))
    net = get_model(M=hidden_size)
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)
    final_metrics = train_model(train_loader, test_loader, num_epochs=10,
                                model=net, optimizer=optimizer)
    val_accuracy.append(final_metrics[0])  # (val_acc, val_loss, train_loss)

In [53]:
# Results: validation accuracy per hidden-layer width.
# Note: the 'no_hidden_layers' column holds the width of the single hidden
# layer (the values of M), not a count of layers.
hidden_size_results = {
    'no_hidden_layers': M,
    'validation_accuracy': val_accuracy,
}
pd.DataFrame(hidden_size_results)

Unnamed: 0,no_hidden_layers,validation_accuracy
0,10,91.66
1,50,94.99
2,100,95.63
3,300,95.61
4,1000,95.52
5,2000,95.64


The highest validation accuracy is obtained when the hidden layer has 2000 units. However, validation accuracy does not improve significantly beyond a layer size of 100, and there is no evidence of overfitting at the larger sizes either. This indicates that increasing the layer size beyond 100 is not useful.

### Weight decay

In [None]:
# Weight-decay sweep: M=300, Adam at a fixed learning rate, 20 epochs per
# setting. The final-epoch metrics of every run are kept for the table below.
decay = [0.0, 0.0001, 0.001, 0.01, 0.1, 0.3]
learning_rate = 0.001
val_accuracy, val_los, trn_los = [], [], []

for weight_decay in decay:
    print('decay parameter : '+str(weight_decay))
    net = get_model(M=300)
    optimizer = optim.Adam(net.parameters(), lr=learning_rate,
                           weight_decay=weight_decay)
    final_metrics = train_model(train_loader, test_loader, num_epochs=20,
                                model=net, optimizer=optimizer)
    # train_model returns (val_acc, val_loss, train_loss), in that order.
    for history, value in zip((val_accuracy, val_los, trn_los), final_metrics):
        history.append(value)

In [58]:
# Results: final-epoch metrics for each weight-decay setting.
weight_decay_results = {
    'decay_parameter': decay,
    'validation_accuracy': val_accuracy,
    'validation_loss': val_los,
    'training_loss': trn_los,
}
pd.DataFrame(weight_decay_results)

Unnamed: 0,decay_parameter,training_loss,validation_accuracy,validation_loss
0,0.0,0.006596,98.14,0.112253
1,0.0001,0.016828,96.82,0.117563
2,0.001,0.051107,97.65,0.073913
3,0.01,0.156216,96.01,0.146521
4,0.1,0.460008,89.99,0.435906
5,0.3,0.810344,85.99,0.78426


The model achieves its best validation accuracy (98.14) with no weight decay, although a decay of 0.001 gives the lowest validation loss (0.0739).

### Dropout

In [10]:
def get_model_v2(M = 300, p=0):
    """Build the two-layer classifier with optional dropout on the hidden layer.

    Architecture: Linear(28*28 -> M) -> ReLU -> [Dropout(p)] -> Linear(M -> 10).
    The dropout layer is only inserted when `p > 0`.

    Args:
        M: number of units in the hidden layer.
        p: dropout probability passed to nn.Dropout.

    Returns:
        An nn.Sequential placed on the GPU when CUDA is available, otherwise
        on the CPU (the original unconditional `.cuda()` crashed on machines
        without CUDA).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    modules = [nn.Linear(28*28, M), nn.ReLU()]
    if p > 0:
        modules.append(nn.Dropout(p))
    modules.append(nn.Linear(M, 10))
    return nn.Sequential(*modules).to(device)

In [None]:
# Dropout sweep: M=300, Adam at a fixed learning rate, 20 epochs per setting.
# The final-epoch metrics of every run are kept for the table below.
dropout = [0.2, 0.3, 0.4, 0.6, 0.8, 1]
learning_rate = 0.001
val_accuracy, val_los, trn_los = [], [], []

for drop_prob in dropout:
    print('dropout: '+ str(drop_prob))
    net = get_model_v2(M=300, p=drop_prob)
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)
    final_metrics = train_model(train_loader, test_loader, num_epochs=20,
                                model=net, optimizer=optimizer)
    # train_model returns (val_acc, val_loss, train_loss), in that order.
    for history, value in zip((val_accuracy, val_los, trn_los), final_metrics):
        history.append(value)

In [12]:
# Results: final-epoch metrics for each dropout probability.
dropout_results = {
    'dropout': dropout,
    'validation_accuracy': val_accuracy,
    'validation_loss': val_los,
    'training_loss': trn_los,
}
pd.DataFrame(dropout_results)

Unnamed: 0,dropout,training_loss,validation_accuracy,validation_loss
0,0.2,0.023907,98.14,0.088596
1,0.3,0.030965,98.28,0.082948
2,0.4,0.044091,98.29,0.079834
3,0.6,0.089218,98.21,0.078871
4,0.8,0.229248,97.27,0.102745
5,1.0,2.301304,9.84,2.326252


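A dropout of 0.4 gives the best validation accuracy (98.29). The model in 3.2 with M=300 only achieves a validation accuracy of 95.61, so dropout does appear to help improve the validation accuracy. Note, however, that the run in 3.2 used a learning rate of 0.01 for 10 epochs, whereas the dropout runs use a learning rate of 0.001 for 20 epochs; the comparable run without dropout (weight decay 0.0 in the weight-decay experiment) reached 98.14, so the gain attributable to dropout itself is small.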