## Assignment 2 Deep Learning - CNNs

_Magnus Caesar_

In [1]:
import numpy as np
import torch
import torch.utils.data as data
from load_func import load_mnist_func
import matplotlib.pyplot as plt
import time
np.random.seed(42)


### Exercise 1. Multi-layer fully connected neural network
Implement exactly the same network as for assignment 1.

1. Compare the performance
2. Compare speed
3. Learning curve plot

The first network from assignment 1 is a linear model. The input dimension is 784 (28x28 pixels) and the output dimension is 10. The second network is multilayered. The one I implemented had 2 hidden layers á 100 neurons each, which the same necessary input and output dimensions.

In [3]:
# Load the data
X_train, Y_train, X_test, Y_test = load_mnist_func("station")

Poopdogs
Reading MNIST: stationary


In [57]:
# Loading the data into pytorch's domain

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32)

# Create TensorDataset.
train_dataset = data.TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = data.TensorDataset(X_test_tensor, Y_test_tensor)

**Base network and forward**

Basic class for the network.

In [58]:
class SeqNetwork(torch.nn.Module):
    def __init__(self, num_hidden_layers, input_dim=784, output_dim=10, hidden_dim=100):
        super().__init__()
        self.layers = torch.nn.ModuleList([]) # Ska ha lista med lager / moduler

        # If no hidden layers
        if num_hidden_layers == 0: 
            self.layers.append(torch.nn.Linear(input_dim, output_dim, dtype=torch.float32))

        # If hidden layers layers
        # TODO
        # this might have some logic flaws
        else:
            # First layer
            self.layers.append(torch.nn.Linear(input_dim, hidden_dim))
            self.layers.append(torch.nn.ReLU())
            
            # Add extra layers
            for _ in range(num_hidden_layers):
                self.layers.append(torch.nn.Linear(hidden_dim, hidden_dim))
                self.layers.append(torch.nn.ReLU())
            
            self.layers.append(torch.nn.Linear(hidden_dim, output_dim))
        

    def forward(self, x):

        for l in self.layers:
            x = l(x)

        return x
        

**Computing loss**

This is the exact same "compute_loss" function used in lab 2.

In [59]:
def compute_loss(model, batch):
    # forward pass and loss function

    inp_data, labels = batch # Read current batch

    output = model(inp_data) # Get output from model
    loss = torch.nn.functional.cross_entropy(output, labels)
    acc = (output.argmax(dim=1) == labels.argmax(dim=1)).float().mean() # Check accuracy

    return loss, acc

**Optimizing**

The optimizer is SGD (as opposed to ADAM which comes later). PyTorch supports momentum, but that wasn't used in lab 1 and so won't be used here either.

In [60]:
model = SeqNetwork(0)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0)

num_epochs = 50
batch_size = 100 # size used in lab1

train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=64)



# Utilize mini-batch gradient descent
# After each batch: collect training cost and accuracy
# After each epoch (all batches: full data set)
# Epochs training accuracy is the average
# After each epoch: calculate test cost and accuracy

# These four lists are requested in the assignment
# Every batch
train_accuracies = []
train_costs = []

# Every epoch
test_costs = []
test_accuracies = []

# Same training structure as from lab 2
for epoch in range(num_epochs):
    model.train()
    print(f"Entering epoch {epoch+1} out of {num_epochs}.")

    for b in train_loader:
        optimizer.zero_grad()
        loss, train_acc = compute_loss(model, b)
        loss.backward()
        optimizer.step()

        # Appending training metrics
        train_costs.append(loss.item())
        train_accuracies.append(train_acc)
    
    # Appending test metrics
    temp_costs = []
    temp_acc = []
    for b in test_loader:
        loss, acc = compute_loss(model, b)
        temp_costs.append(loss.item())
        temp_acc.append(acc)
    
    test_costs.append(np.mean(temp_costs))
    test_accuracies.append(np.mean(temp_acc))

    print(f"Current testing loss: {test_costs[-1]}")
    print(f"Current test accuracy: {test_accuracies[-1]}")
    print()




AttributeError: module 'sympy' has no attribute 'printing'

**Producing plots**

In [None]:
#plots

### Exercise 2. Multilayer convolutional neural network
CNN using PyTorch with SGD and cross-entropy loss. Reach 98% accuracy.

1. How many learnable weights does the network contain? Compare with previous exercise.
2. Learning curve plot.

**Base class CNN**

In [9]:
# Creating new datasets with dimensions tailored for CNN networks
train_dataset_cnn = data.TensorDataset(X_train_tensor.view(60000, 1, 28, 28), Y_train_tensor)
test_dataset_cnn = data.TensorDataset(X_test_tensor.view(10000, 1, 28, 28), Y_test_tensor)

In [11]:
class ConvNetwork(torch.nn.Module):

    def __init__(self):
        super().__init__()

        self.conv1 = torch.nn.Conv2d(1, 8, kernel_size=3, stride=1, padding=1)
        self.relu = torch.nn.ReLU()
        self.pool = torch.nn.MaxPool2d(2, stride=2)
        self.conv2 = torch.nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1)
        #relu
        #pool
        self.conv3 = torch.nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        #relu
        self.flatten = torch.nn.Flatten()
        self.fullyconnected = torch.nn.Linear(32*7*7, 10)
        #self.softm = torch.nn.Softmax(dim=1)

        self.components = torch.nn.ModuleList([self.conv1,
                                               self.relu,
                                               self.pool,
                                               self.conv2,
                                               self.relu,
                                               self.pool,
                                               self.conv3,
                                               self.relu,
                                               self.flatten,
                                               self.fullyconnected])
    
    def forward(self, x):

        for l in self.components:
            x = l(x)

        return x

**Optimizing and training**

In [52]:
def train(model, traindata, testdata, num_epochs=40, batch_size=100, optmzr="sgd"):
    """
    Train model.
    Input
        model:              a subclass of torch.nn.Module to be trained
        traindata:          self-explanatory. Must be of type torch.TensorDataset
        testdata:           see above
        num_epochs:         # epochs trained [int]
        batch_size:         # batches [int]
        optmzr:             optimizer. Select 'sgd' or 'adam' [string]
    Returns
        x_axis_train:       torch.arange(correct size)
        x_axis_test:        torch.arange(test size)
        train_costs:        [size 60000 / batches * num_epochs]
        train_accuracies:   [size 60000 / batches * num_epochs]
        test_costs:         [size num_epochs]. Average test cost every epoch
        test_accuracies:    [size num_epochs]. Also average across epoch
        total_time:         Time spent training the network
        avgspeed:           Average time / epoch
    Ex. 
    x_tr, x_te, trc, tra, tec, tac = train(model, train_dataset_cnn, test_dataset_cnn)
    """

    
    # Optimizer
    if optmzr == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0)
    
    if optmzr == "adam":
        optimizer = torch.optim.Adam(model.parameters()) # Standard parameters in Adam

    # Load data
    train_loader = data.DataLoader(traindata, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(testdata, batch_size=batch_size)
    
    # Lists for later plotting
    train_costs = []
    train_accuracies = []
    test_costs = []
    test_accuracies = []


    # For time tracking
    starttime = time.time()

    # Same training structure as from lab 2
    for epoch in range(0, num_epochs):
        model.train()
        print(f"Entering epoch {epoch+1} out of {num_epochs}.")

        for b in train_loader:
            # b[0][0][0] är en bild. Ser bra ut: kan plottas

            # print(b[0].size())
            # plt.imshow(b[0][4][0])

            # Training
            optimizer.zero_grad()
            loss, train_acc = compute_loss(model, b)
            loss.backward()
            optimizer.step()

            # Appending training metrics
            train_costs.append(loss.item())
            train_accuracies.append(train_acc)
            
        # Appending test metrics
        temp_costs = []
        temp_acc = []
        
        for b in test_loader:
            loss, acc = compute_loss(model, b)
            temp_costs.append(loss.item())
            temp_acc.append(acc)
        
        test_costs.append(np.mean(temp_costs))
        test_accuracies.append(np.mean(temp_acc))

        # Time related
        t0 = time.time()
        t_tot = t0-starttime
        avgspeed = t_tot / (epoch+1)
        est_time_remaining = (avgspeed) * (num_epochs - epoch+1)

        # Statistics
        print(f"Current testing loss:      {test_costs[-1]:>10.4f}")
        print(f"Current test accuracy:     {test_accuracies[-1]:>10.2f}")
        print(f"Ellapsed time:             {round(t0 - starttime, 2):>10.2f} seconds.")
        print(f"Estimated time remaining:  {round(est_time_remaining, 2):>10.2f} seconds.")
        print()
    
    tend = time.time()
    total_time = t_end-starttime
    print(f"Final results:")
    print(f"Total time elapsed: {round(total_time, 3)} s")
    print(f"Average speed: {round(avgspeed, 3)} s / epoch")

    # x-axis for plotting
    x_axis_train = torch.arange(len(train_costs)) # For every actual iteration
    x_axis_test = torch.arange(len(test_costs)) # Average for every epoch

    return x_axis_train, x_axis_test, train_costs, train_accuracies, test_costs, test_accuracies, total_time, avgspeed

In [56]:
model = ConvNetwork()

num_epochs = 30
batch_size = 64 # size used in lab1

trc, tra, tec, tac = train(model, train_dataset_cnn, test_dataset_cnn, num_epochs=num_epochs, batch_size=batch_size)

AttributeError: module 'sympy' has no attribute 'printing'

In [None]:
# Plot: regular learning curve plot of the CNN: see first page of HA2

### Exercise 3: swapping the order of max pooling and the activation function

1. How does this affect the models performance? Final accuracy?
2. Swapping ReLU to tanh: differences? Time taken, final accuarcy?

In [None]:
class SwapedConvNetwork(torch.nn.Module):
    # Easier to create a new class than to fiddle in the old one
    def __init__(self, activator: str):
        super().__init__()

        # Reused layers
        if activator.lower() == "relu":
            self.activator = torch.nn.ReLU()
        if activator.lower() == "tanh":
            self.activator == torch.nn.Tanh()
        
        self.pool = torch.nn.MaxPool2d(2, stride=2)

        # Full laid out structure
        self.conv1 = torch.nn.Conv2d(1, 8, kernel_size=3, stride=1, padding=1)
        # Pool
        # Activator
        self.conv2 = torch.nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1)
        # Pool
        # Activator
        self.conv3 = torch.nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        # Activator
        self.flatten = torch.nn.Flatten()
        self.fullyconnected = torch.nn.Linear(32*7*7, 10)
        #self.softm = torch.nn.Softmax(dim=1)

        self.components = torch.nn.ModuleList([self.conv1,
                                               self.pool,
                                               self.activator,
                                               self.conv2,
                                               self.pool,
                                               self.activator,
                                               self.conv3,
                                               self.activator,
                                               self.flatten,
                                               self.fullyconnected])
    
    def forward(self, x):

        for l in self.components:
            x = l(x)

        return x



In [None]:
model = SwapedConvNetwork("relu")

x_train, x_test, trc2, tra2, tec2, tac2, time, avgspeed = train(model, train_dataset_cnn, test_dataset_cnn, batch_size=64, optmzr="sgd")

Entering epoch 1 out of 40.
Current testing loss:          2.2707
Current test accuracy:           0.26
Ellapsed time:                  14.47 seconds.
Estimated time remaining:      593.40 seconds.

Entering epoch 2 out of 40.
Current testing loss:          0.5702
Current test accuracy:           0.81
Ellapsed time:                  28.42 seconds.
Estimated time remaining:      568.48 seconds.

Entering epoch 3 out of 40.
Current testing loss:          0.3917
Current test accuracy:           0.87
Ellapsed time:                  42.62 seconds.
Estimated time remaining:      554.06 seconds.

Entering epoch 4 out of 40.
Current testing loss:          0.3504
Current test accuracy:           0.88
Ellapsed time:                  57.47 seconds.
Estimated time remaining:      545.99 seconds.

Entering epoch 5 out of 40.
Current testing loss:          0.2375
Current test accuracy:           0.93
Ellapsed time:                  72.20 seconds.
Estimated time remaining:      534.29 seconds.

Enter

In [None]:
# Compare with ex2:
# 3a
# Longer or faster training time?
# Final accuracy?
# Learning curve plot: both models next to each other!

# 3b: ReLU -> tanh
# longer or faster training time?
# Final accuracy?
# learning curve plot !

# 3c conclusions?

### Exercise 4: swapping SGD for ADAM

Are good results obtained faster? Compare with SGD; provide learning curve plot.

In [None]:
model_ex4_sgd = SwapedConvNetwork("relu")
model_ex4_adam = SwapedConvNetwork("relu")

num_epochs = 30

sgd_stats = train(model_ex4_sgd, train_dataset_cnn, test_dataset_cnn, optmzr="sgd")
adam_stats = train(model_ex4_adam, train_dataset_cnn, test_dataset_cnn, optmzr="adam")

In [None]:
# ex4:
# learning curve plot of adam vs sgd here

### Exercise 5: residual connection

Does this improve performance?

Use the same structure as before but replace each conv+act pair with a block of two similar parts with a residual connection over each such block.

So x -> conv1 -> relu = f(x) ==> x -> conv1 -> relu -> conv1 -> relu = g(x) + x 

In [53]:
class ResidualBlock(torch.nn.Module):
    def __init__(self, channels, activator):
        super().__init__()
        if activator.lower() == "relu":
            self.activator = torch.nn.ReLU()
        if activator.lower() == "tanh":
            self.activator = torch.nn.Tanh()

        self.conv = torch.nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        identity = x  # Save input for residual connection
        out = self.conv(x)
        out = self.activator(out)
        out = self.conv(out)
        out = self.activator(out)
        out += identity  # Add residual connection
        return out

class ResConvNetwork(torch.nn.Module):
    def __init__(self, activator: str):
        super().__init__()

        # Reused layers
        if activator.lower() == "relu":
            self.activator = torch.nn.ReLU()
        if activator.lower() == "tanh":
            self.activator = torch.nn.Tanh()
        
        self.initial_conv = torch.nn.Conv2d(1, 8, kernel_size=3, stride=1, padding=1)
        self.resblock1 = ResidualBlock(8, "relu") # blir det inte lite konstigt med dimensionerna här?
        self.conv2 = torch.nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1)
        self.resblock2 = ResidualBlock(16, "relu")
        self.conv3 = torch.nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.resblock3 = ResidualBlock(32, "relu")

        self.flatten = torch.nn.Flatten()
        self.fullyconnected = torch.nn.Linear(32*7*7, 10)

        self.components = torch.nn.ModuleList([self.initial_conv,
                                               self.resblock1,
                                               self.conv2,
                                               self.resblock2,
                                               self.conv3,
                                               self.resblock3,
                                               self.flatten,
                                               self.fullyconnected])
    
    def forward(self, x):
        for l in self.components:
            x = l(x)
        
        return x

In [55]:
resnet_model = ResConvNetwork("tanh")

x_train, x_test, trc2, tra2, tec2, tac2, time, avgspeed = train(resnet_model, train_dataset_cnn, test_dataset_cnn, batch_size=64, optmzr="sgd")

AttributeError: module 'sympy' has no attribute 'printing'

### Exercise 6: CNN with three variations

Todo

1. Swapa SGD mot ADAM och se till att det funkar
2. Implementera residual connection och kolla vad som sker
3. CNN with three variations:
    1. Regularization
    2. Hur olika djup påverkar träningshastigheten?
    3. Hur olika learning rates påverkar träningshastigheten?

Misc: på något vis få ut x-axel från train() så att man kan plotta flera samtidigt