# HWK7 PROBLEM 7

### Your goal is to train a convnet with multiple layers on fashion-MNIST and to obtain the lowest error rate possible on the test set. Try various hyperparameter (number of layers, hidden_sizes, etc...). Good luck!

In [102]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import randint
import utils
import time

In [103]:
device= torch.device("cuda")
#device= torch.device("cpu")
print(device)

cuda


In [104]:
train_data=torch.load('../../data/fashion-mnist/train_data.pt')
train_label=torch.load('../../data/fashion-mnist/train_label.pt')
test_data=torch.load('../../data/fashion-mnist/test_data.pt')
test_label=torch.load('../../data/fashion-mnist/test_label.pt')

print(train_data.size())
print(test_data.size())

torch.Size([60000, 28, 28])
torch.Size([10000, 28, 28])


In [105]:
mean= train_data.mean()
std= train_data.std()
print(mean)
print(std)

tensor(0.2860)
tensor(0.3530)


In [106]:
class ten_layer_convnet(nn.Module):

    def __init__(self):

        super(ten_layer_convnet, self).__init__()

        # block 1:         1 x 28 x 28 --> 64 x 14 x 14        
        self.conv1a = nn.Conv2d(1,   64,  kernel_size=3, padding=1 )
        self.conv1b = nn.Conv2d(64,  64,  kernel_size=3, padding=1 )
        self.pool1  = nn.MaxPool2d(2,2)

        # block 2:         64 x 14 x 14 --> 128 x 7 x 7
        self.conv2a = nn.Conv2d(64,  128, kernel_size=3, padding=1 )
        self.conv2b = nn.Conv2d(128, 128, kernel_size=3, padding=1 )
        self.pool2  = nn.MaxPool2d(7,7)

        # linear layers:   128 x 2 x 2 --> 512 --> 1024 --> 1024 --> 10
        self.linear1 = nn.Linear(512, 1024)
        self.linear2 = nn.Linear(1024,1024)
        self.linear3 = nn.Linear(1024, 10)


    def forward(self, x):

        # block 1:         1 x 28 x 28 --> 64 x 14 x 14
        x = self.conv1a(x)
        x = F.relu(x)
        x = self.conv1b(x)
        x = F.relu(x)
        x = self.pool1(x)
        # block 2:         64 x 14 x 14 --> 128 x 7 x 7
        x = self.conv2a(x)
        x = F.relu(x)
        x = self.conv2b(x)
        x = F.relu(x)
        x = self.pool2(x)

        # linear layers:   128 x 2 x 2 --> 512 --> 1024 --> 1024 --> 10
        x = x.view(-1, 512)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x) 
        
        return x

In [107]:
net=ten_layer_convnet()

print(net)

ten_layer_convnet(
  (conv1a): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv1b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2a): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool2): MaxPool2d(kernel_size=7, stride=7, padding=0, dilation=1, ceil_mode=False)
  (linear1): Linear(in_features=512, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=1024, bias=True)
  (linear3): Linear(in_features=1024, out_features=10, bias=True)
)


In [108]:
net = net.to(device)

mean=mean.to(device)

std=std.to(device)

In [109]:
criterion = nn.CrossEntropyLoss()
my_lr=0.25 
bs= 100

In [110]:
def eval_on_test_set():

    running_error=0
    num_batches=0
    
    with torch.no_grad():    

        for i in range(0,1000,bs):

            minibatch_data =  test_data[i:i+bs]
            minibatch_label= test_label[i:i+bs]

            minibatch_data=minibatch_data.to(device)
            minibatch_label=minibatch_label.to(device)

            inputs = (minibatch_data - mean)/std

            scores=net(inputs.view(bs,1, 28,28) ) 

            error = utils.get_error( scores , minibatch_label)

            running_error += error.item()

            num_batches+=1

    total_error = running_error/num_batches
    print( 'error rate on test set =', total_error*100 ,'percent')

In [111]:
start=time.time()

for epoch in range(1,20):
    
    # divide the learning rate by 2 at epoch 10, 14 and 18
    if epoch==10 or epoch == 14 or epoch==18:
        my_lr = my_lr / 2
    
    # create a new optimizer at the beginning of each epoch: give the current learning rate.   
    optimizer=torch.optim.SGD( net.parameters() , lr=my_lr )
        
    # set the running quatities to zero at the beginning of the epoch
    running_loss=0
    running_error=0
    num_batches=0
    
    # set the order in which to visit the image from the training set
    shuffled_indices=torch.randperm(60000)
 
    for count in range(0,60000,bs):
    
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch       
        indices=shuffled_indices[count:count+bs]
        minibatch_data =  train_data[indices]
        minibatch_label=  train_label[indices]
        
        # send them to the gpu
        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
        
        # normalize the minibatch 
        inputs = (minibatch_data - mean)/std
        
        # tell Pytorch to start tracking all operations that will be done on "inputs"
        inputs.requires_grad_()

        # forward the minibatch through the net 
        scores=net( inputs.view(bs, 1, 28,28) ) 

        # Compute the average of the losses of the data points in the minibatch
        loss =  criterion( scores , minibatch_label) 
        
        # backward pass to compute dL/dU, dL/dV and dL/dW   
        loss.backward()

        # do one step of stochastic gradient descent: U=U-lr(dL/dU), V=V-lr(dL/dU), ...
        optimizer.step()
        
        # COMPUTE STATS
        num_batches+=1
        with torch.no_grad():
            running_loss += loss.item()
            error = utils.get_error( scores , minibatch_label)
            running_error += error.item()  
              
    
    # compute stats for the full training set
    total_loss = running_loss/num_batches
    total_error = running_error/num_batches
    elapsed = (time.time()-start)/60
    

    print('epoch=',epoch, '\t time=', elapsed,'min','\t lr=', my_lr  ,'\t loss=', total_loss , '\t error=', total_error*100 ,'percent')
    eval_on_test_set() 
    print(' ')

epoch= 1 	 time= 0.32942428986231487 min 	 lr= 0.25 	 loss= 0.9330022667100032 	 error= 34.996668219566345 percent
error rate on test set = 19.200002551078796 percent
 
epoch= 2 	 time= 0.6638630509376526 min 	 lr= 0.25 	 loss= 0.3477637661496798 	 error= 12.950002123912174 percent
error rate on test set = 11.200001835823059 percent
 
epoch= 3 	 time= 0.9985509792963664 min 	 lr= 0.25 	 loss= 0.27775863132129114 	 error= 10.543335417906444 percent
error rate on test set = 10.300002694129944 percent
 
epoch= 4 	 time= 1.3314709583918254 min 	 lr= 0.25 	 loss= 0.23802630470444758 	 error= 8.756668448448181 percent
error rate on test set = 10.000001192092896 percent
 
epoch= 5 	 time= 1.6647847652435304 min 	 lr= 0.25 	 loss= 0.210462295897305 	 error= 7.883335173130035 percent
error rate on test set = 9.900001883506775 percent
 
epoch= 6 	 time= 1.9971109986305238 min 	 lr= 0.25 	 loss= 0.18798616617918015 	 error= 7.045001824696858 percent
error rate on test set = 8.400001525878906 perc