# Task 1
## Acquire the data

Here we load the Sign Language MNIST dataset for classification via a Multilayer Perceptron (MLP) and a Convolutional Neural Network (CNN). The dataset is normalized and split into training, validation, and testing sets for use with the models.

In [None]:
# download_datasets()

# -*- coding: utf-8 -*-
"""COMP_551_A3_Shareable.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1RxfnoXX-1MvlNM93d7GC-68PWB-QbOhL
"""

import gdown

def download_datasets() -> None:
    """Download the Sign Language MNIST train/test CSV files from Google Drive.

    The files are written to ``../data/``, which is where the loading code in
    this file reads them from. The directory is created when it is missing.
    """
    import os  # local import so this definition does not rely on a later cell

    train_data_path = "https://drive.google.com/uc?id=1Q6Iefx7rWFSpHhuQJ7ygPHHTTxMwLYth"
    test_data_path = "https://drive.google.com/uc?id=1M6aIcBXmRKRR1go9kKpDA9aGq_Uy0mz0"

    # Same locations that pd.read_csv is pointed at further down.
    output_train = '../data/sign_mnist_train.csv'
    output_test = '../data/sign_mnist_test.csv'

    os.makedirs('../data', exist_ok=True)

    gdown.download(train_data_path, output_train)
    gdown.download(test_data_path, output_test)

In [None]:
# from google.colab import drive
import pandas as pd

# Relative paths of the Sign Language MNIST CSV files.
train, test = '../data/sign_mnist_train.csv', '../data/sign_mnist_test.csv'

# Read both splits into pandas DataFrames (train first, then test).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train, test))

# Quick inspection of the training frame: first rows, column summary, shape.
print(df_train.head())
df_train.info()
df_train.shape
df_train.head(10)

# Separate labels from dataset and create validation set

from sklearn.model_selection import train_test_split

import numpy as np

# The 'label' column is the target; every other column is a feature.
y_train, x_train = df_train['label'], df_train.drop(columns=['label'])
y_test, x_test = df_test['label'], df_test.drop(columns=['label'])

# Hold out 20% of the training rows for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Convert everything to NumPy arrays; feature values are divided by 255.
x_train, x_val, x_test = (
    np.asarray(features) / 255 for features in (x_train, x_val, x_test)
)
y_train, y_val, y_test = (
    np.asarray(targets) for targets in (y_train, y_val, y_test)
)

print("y_train shape:", y_train.shape)
print("x_train shape:", x_train.shape)

print("y_val shape:", y_val.shape)
print("x_val shape:", x_val.shape)

print("y_test shape:", y_test.shape)
print("x_test shape:", x_test.shape)

# Distinct label ids present in the training split and their range.
unique_labels_y = np.unique(y_train)
print(unique_labels_y.min())
print(unique_labels_y.max())

# A one-hot encoding indexed by label id needs `max id + 1` columns.
# `len(unique) + 1` only equals that when exactly one id in 0..max is unused,
# so derive the width from the largest id instead.
num_classes_y = int(unique_labels_y.max()) + 1
print("Number of unique labels in y_train:", len(unique_labels_y))
print("Number of classes (max label + 1):", num_classes_y)

# Task 2
## Implement MLP

Here we define a multilayer perceptron (MLP) with mini-batch gradient descent optimization for training the network. The framework comprises several classes representing different types of neural network layers. The NeuralNetLayer serves as the base class, defining common attributes like gradients, parameters, lambda coefficients for regularization, and flags for L2 penalty normalization. Subclasses include LinearLayer for fully connected layers with random weight initialization and L2 regularization, ReLULayer implementing the ReLU activation function, SoftmaxOutputLayer for the output layer with softmax activation in classification tasks, SigmoidLayer implementing the sigmoid activation function, and LeakyReLULayer for the Leaky ReLU activation function. Each layer class provides methods for forward propagation (forward) and backpropagation (backward).

Additionally, the MiniBatchMLP class represents the multi-layer perceptron model and utilizes the forward and backward methods of individual layers to perform forward and backward passes through the network. It also includes functionality to calculate the number of hidden layers in the network and compute regularization based on L2 norms of parameters. The MiniBatchGradientDescentOptimizer class implements mini-batch gradient descent optimization for updating the network parameters. It uses a specified learning rate (lr) for parameter updates based on gradients computed during backpropagation, updating parameters for each layer based on the mean gradient of the mini-batch.

In [None]:
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Define the NeuralNetLayer base class and its subclasses
class NeuralNetLayer:
    """Base class shared by every layer of the network.

    Subclasses override ``forward`` and ``backward``. The attributes set here
    are the defaults every layer starts with: no cached gradient, no trainable
    parameters, and a zero, un-normalised L2 penalty.
    """

    def __init__(self):
        # Regularisation settings; a zero coefficient means no penalty.
        self.lambda_coef, self.normalize_l2_penalty = 0, False
        # Layers with trainable state replace these with real values.
        self.parameters = self.gradient = None

    def forward(self, x):
        """Compute the layer output for ``x``; must be overridden."""
        raise NotImplementedError

    def backward(self, gradient):
        """Propagate ``gradient`` back through the layer; must be overridden."""
        raise NotImplementedError

class LinearLayer(NeuralNetLayer):
    """Fully connected layer computing ``x @ w.T + b`` with an optional L2 weight penalty.

    Args:
        input_size: number of input features per sample.
        output_size: number of output units.
        lambda_coef: coefficient of the L2 penalty added to the weight gradient.
        normalize_l2_penalty: flag stored on the layer; it is read by
            ``MiniBatchMLP.regularization`` and not used inside this class.

    Attributes:
        w: weights of shape ``(output_size, input_size)``, standard-normal init.
        b: biases of shape ``(output_size,)``, standard-normal init.
        parameters: ``[w, b]``; the optimizer updates these arrays in place.
        gradient: ``[dw, db]`` set by ``backward``; both keep a leading batch
            axis because the optimizer averages over axis 0.
    """

    def __init__(self, input_size, output_size, lambda_coef=0, normalize_l2_penalty=False):
        super().__init__()
        self.ni = input_size
        self.no = output_size
        self.w = np.random.randn(output_size, input_size)
        self.b = np.random.randn(output_size)
        self.cur_input = None
        self.lambda_coef = lambda_coef
        self.normalize_l2_penalty = normalize_l2_penalty
        self.parameters = [self.w, self.b]

    def forward(self, x):
        """Return the affine map of ``x`` (shape ``(batch, input_size)``)."""
        self.cur_input = x
        # A plain matrix product always yields (batch, output_size). The
        # previous batched product followed by an unqualified squeeze() also
        # dropped the batch axis for a single-sample batch (and the unit axis
        # for a one-unit layer), which broke the layers downstream.
        return x @ self.w.T + self.b

    def backward(self, gradient):
        """Store parameter gradients and return the gradient w.r.t. the input.

        Args:
            gradient: upstream gradient of shape ``(batch, output_size)``.
        """
        assert self.cur_input is not None, "Must call forward before backward"
        # Per-sample outer products, shape (batch, output_size, input_size).
        # They are NOT divided by the batch size here: the optimizer already
        # takes the mean over axis 0, so dividing again scaled the weight
        # update by 1/batch relative to the bias update.
        dw = gradient[:, :, None] @ self.cur_input[:, None, :]
        # Add the gradient of the L2 penalty term (broadcast over the batch).
        # NOTE(review): the loss term in MiniBatchMLP.regularization is
        # lambda * sum(w**2), whose exact gradient is 2 * lambda * w; the
        # existing lambda * w is kept so the penalty strength is unchanged.
        dw += (self.lambda_coef * self.w)
        db = gradient
        self.gradient = [dw, db]
        return gradient.dot(self.w)

class ReLULayer(NeuralNetLayer):
    """ReLU activation: keeps positive inputs and replaces the rest with zero.

    ``forward`` caches the element-wise derivative (1 where the input is
    positive, 0 elsewhere) in ``self.gradient`` for use by ``backward``.
    """

    def forward(self, x):
        positive = x > 0
        self.gradient = np.where(positive, 1.0, 0.0)
        return np.maximum(0, x)

    def backward(self, gradient):
        local_derivative = self.gradient
        assert local_derivative is not None, "Must call forward before backward"
        # Chain rule: upstream gradient times the cached local derivative.
        return gradient * local_derivative

class SoftmaxOutputLayer(NeuralNetLayer):
    """Output layer turning each row of scores into a probability distribution.

    ``forward`` caches the probabilities; ``backward`` takes the target
    distribution (not an upstream gradient) and returns ``probs - target``.
    """

    def __init__(self):
        super().__init__()
        self.cur_probs = None

    def forward(self, x):
        """Return the row-wise softmax of ``x`` (classes on the last axis)."""
        # Softmax is unchanged by subtracting a per-row constant, so shift each
        # row by its maximum: the largest exponent is then exp(0) = 1. This
        # cannot overflow, and the row sum is at least 1 so it cannot be 0/0.
        # Clipping at 706 did not give that guarantee: a row with several
        # clipped entries still overflowed when summed.
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        probs = exps / np.sum(exps, axis=-1, keepdims=True)
        self.cur_probs = probs
        return probs

    def backward(self, target):
        """Return ``probs - target`` using the probabilities cached by ``forward``."""
        assert self.cur_probs is not None, "Must call forward before backward"
        return self.cur_probs - target

class SigmoidLayer(NeuralNetLayer):
    """Sigmoid activation layer.

    ``forward`` caches the element-wise derivative ``s * (1 - s)`` in
    ``self.gradient`` for use by ``backward``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Evaluate the sigmoid once and reuse it for both the output and the
        # cached derivative.
        activated = self.sigmoid(x)
        self.gradient = activated * (1 - activated)
        return activated

    def backward(self, gradient):
        assert self.gradient is not None, "Must call forward before backward"
        return gradient * self.gradient

    def sigmoid(self, z):
        """Return ``1 / (1 + exp(-z))`` element-wise without overflowing."""
        z = np.asarray(z)
        # exp(-|z|) lies in (0, 1], so neither branch can overflow; for
        # negative z the algebraically equal form e / (1 + e) is used.
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

class LeakyReLULayer(NeuralNetLayer):
    """Leaky ReLU activation with a fixed slope of 0.01 for non-positive inputs.

    ``forward`` caches the element-wise derivative (1 where the input is
    positive, 0.01 elsewhere) in ``self.gradient`` for use by ``backward``.
    """

    def forward(self, x):
        slopes = np.where(x > 0, 1.0, 0.01)
        self.gradient = slopes
        return self.leaky_relu(x)

    def backward(self, gradient):
        slopes = self.gradient
        assert slopes is not None, "Must call forward before backward"
        return gradient * slopes

    def leaky_relu(self, z):
        """Return ``z`` where it exceeds ``0.01 * z``, otherwise ``0.01 * z``."""
        scaled = 0.01 * z
        return np.maximum(scaled, z)

In [None]:
# Define the MiniBatchMLP and MiniBatchGradientDescentOptimizer classes
class MiniBatchMLP:
    """Sequential container that chains layers for forward and backward passes.

    The layers are given positionally and stored, in order, as a tuple in
    ``self.layers``.
    """

    def __init__(self, *args):
        self.layers = args

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        out = x
        for stage in self.layers:
            out = stage.forward(out)
        return out

    def n_hidden(self):
        """Return the number of ``LinearLayer`` instances minus one.

        For the models built in this file, a network with k hidden layers
        contains k + 1 linear layers, hence the subtraction.
        """
        return sum(isinstance(stage, LinearLayer) for stage in self.layers) - 1

    def backward(self, target):
        """Run ``backward`` through the layers in reverse, starting from ``target``."""
        signal = target
        for stage in reversed(self.layers):
            signal = stage.backward(signal)
        return signal

    def regularization(self):
        """Return the L2 penalty over the first parameter of every parameterised layer.

        The coefficient and the normalisation flag are both read from the
        first layer only. When the flag is set, the penalty is divided by the
        total number of penalised entries to keep the loss from becoming huge.
        """
        weights = [
            stage.parameters[0]
            for stage in self.layers
            if stage.parameters is not None
        ]
        l2_reg = sum(np.sum(np.square(w)) for w in weights)
        total_params = sum(np.prod(w.shape) for w in weights)

        first = self.layers[0]
        penalty = first.lambda_coef * l2_reg
        if not first.normalize_l2_penalty:
            return penalty
        return penalty / total_params

class MiniBatchGradientDescentOptimizer:
    """Plain gradient descent over the parameterised layers of a network.

    Args:
        net: object exposing a ``layers`` sequence.
        lr: learning rate applied to every update.
    """

    def __init__(self, net, lr):
        self.net, self.lr = net, lr

    def step(self):
        """Update every layer that has parameters, walking from last to first."""
        for layer in reversed(self.net.layers):
            if layer.parameters is None:
                continue
            self.update(layer.parameters, layer.gradient)

    def update(self, params, gradient):
        """Subtract ``lr`` times the batch-mean gradient from each parameter, in place."""
        for param, grad in zip(params, gradient):
            # Axis 0 of each gradient is the batch axis.
            param -= self.lr * grad.mean(axis=0)


In [None]:
# Assuming y_train is a numpy array of non-negative integer label ids
unique_labels = np.unique(y_train)

# One-hot rows are indexed by label id, so the width must be `max id + 1`.
# `len(unique_labels) + 1` only equals that when exactly one id in 0..max is
# unused, so derive the width from the largest id instead.
num_classes = int(unique_labels.max()) + 1
print("Number of unique labels:", len(unique_labels))
print("Number of classes (max label + 1):", num_classes)

# Identity lookup over every valid class index; indexing it raises KeyError
# for any label outside 0..num_classes-1.
label_to_int = {label: label for label in range(num_classes)}

# Convert labels to integers using the dictionary
int_labels = np.array([label_to_int[label] for label in y_train])

# Create one-hot encoded labels using the correct size
labels = np.eye(num_classes)[int_labels]

print(labels.shape)

Here we define the training loop function `train_mlp` for handling mini-batches in training the MLP model. It begins by preprocessing the data, including converting labels to integers using a dictionary mapping and one-hot encoding for classification tasks. The training loop iteratively updates the MLP's parameters using mini-batch gradient descent optimization (`MiniBatchGradientDescentOptimizer`). Within each training step, the data is shuffled, and batches are processed to compute predictions, calculate the loss (cross-entropy), perform backpropagation, and update the model's parameters. Training progress is monitored through printing epoch-wise information such as loss, training accuracy, validation accuracy, and test accuracy. The loop also stores losses and accuracies for visualization and analysis.

In [None]:
# Define the training loop function to handle mini-batches

unique_labels = np.unique(y_train)

# train_mlp reads `num_classes` as the width of its one-hot targets, which are
# indexed by label id, so the width must be `max id + 1`.
# `len(unique_labels) + 1` only equals that when exactly one id in 0..max is
# unused.
num_classes = int(unique_labels.max()) + 1
print("Number of unique labels:", len(unique_labels))
print("Number of classes (max label + 1):", num_classes)

# Identity lookup over every valid class index; indexing it raises KeyError
# for any label outside 0..num_classes-1.
label_to_int = {label: label for label in range(num_classes)}

# Convert labels to integers using the dictionary
int_labels = np.array([label_to_int[label] for label in y_train])

### Version of train_mlp that I have been using

def _mlp_accuracy(mlp, x, y):
    """Return the fraction of rows of ``x`` whose arg-max prediction equals ``y``."""
    predictions = np.argmax(mlp.forward(x), axis=1)
    return (y == predictions).sum() / len(y)


def train_mlp(mlp, hidden_unit, data_x, data_y, x_val, y_val, x_test, y_test, steps=100, batch_size=128, lr=1e-3):
    """Train ``mlp`` with mini-batch gradient descent and track per-epoch metrics.

    Args:
        mlp: network exposing ``forward``, ``backward``, ``regularization``
            and ``n_hidden``.
        hidden_unit: hidden-layer width; only used in the progress message.
        data_x, data_y: training features and integer labels.
        x_val, y_val: validation features and integer labels.
        x_test, y_test: test features and integer labels.
        steps: number of epochs.
        batch_size: number of samples per mini-batch.
        lr: learning rate handed to the optimizer (default matches the
            previously hard-coded 1e-3).

    Returns:
        ``(losses, train_acc, val_acc, test_acc)``: four lists with one entry
        per epoch.

    The one-hot width is read from the module-level ``num_classes``.
    """
    losses, train_acc, val_acc, test_acc = [], [], [], []

    print(f"Training MLP with {mlp.n_hidden()} hidden layers and {hidden_unit} hidden units...")

    opt = MiniBatchGradientDescentOptimizer(mlp, lr=lr)
    for step in tqdm(range(steps)):
        # Shuffle data for each epoch. The seed depends on the epoch: a
        # constant seed would reproduce the same permutation every epoch, so
        # the batches would never change. Runs stay reproducible.
        inputs, labels = shuffle(data_x, data_y, random_state=42 + step)
        batch_losses = []
        for batch_start in range(0, len(inputs), batch_size):
            batch_end = batch_start + batch_size
            x_batch = inputs[batch_start:batch_end]
            # One-hot encoding of labels
            y_batch = np.eye(num_classes)[labels[batch_start:batch_end]]

            # forward pass
            predictions_batch = mlp.forward(x_batch)
            # Cross entropy (1e-5 guards against log(0)) plus the L2 penalty
            loss_batch = -(y_batch * np.log(predictions_batch + 1e-5)).sum(axis=-1).mean() + mlp.regularization()
            batch_losses.append(loss_batch)
            # propagate error back & update parameters
            mlp.backward(y_batch)
            opt.step()

        step_loss = np.array(batch_losses).mean()
        losses.append(step_loss)

        # Accuracy on the full train / validation / test sets after this epoch
        accuracy_train = _mlp_accuracy(mlp, data_x, data_y)
        accuracy_eval = _mlp_accuracy(mlp, x_val, y_val)
        accuracy_test = _mlp_accuracy(mlp, x_test, y_test)

        print('Epoch: {} - Train Loss: {}'.format(step + 1, step_loss))
        print('Epoch: {} - Train Accuracy: {}'.format(step + 1, accuracy_train))
        print('Epoch: {} - Val Accuracy: {}'.format(step + 1, accuracy_eval))
        print('Epoch: {} - Test Accuracy: {}'.format(step + 1, accuracy_test))

        train_acc.append(accuracy_train)
        val_acc.append(accuracy_eval)
        test_acc.append(accuracy_test)

    return losses, train_acc, val_acc, test_acc

# Task 3
## Task 3.1

For this task we created three separate instances of the MLP model: One with no hidden layer, one with a single hidden layer, and one with two hidden layers.  For the models with hidden layers, the ReLU activation function is used.  All versions use a softmax function for output classification. The results are stored in a DataFrame and saved to CSV files for further analysis.

In [None]:
# Define the models and their configurations
import os

hidden_units = [32, 64, 128, 256]
batch_sizes = [2, 16, 32, 64]

num_input = 784
num_output = 25

# Make sure the output directory exists before any training starts, so a
# finished run cannot fail at the final to_csv call.
os.makedirs('../results', exist_ok=True)

for batch_size in batch_sizes:

    for hidden_unit in hidden_units:

        mlp_no_hidden = MiniBatchMLP(
            LinearLayer(num_input, num_output),
            SoftmaxOutputLayer()
        )
        mlp_single_hidden = MiniBatchMLP(
            LinearLayer(num_input, hidden_unit),
            ReLULayer(),
            LinearLayer(hidden_unit, num_output),
            SoftmaxOutputLayer()
        )
        # Task 3.1 uses ReLU for every hidden layer; the Leaky-ReLU variant of
        # this architecture is trained separately in Task 3.2.
        mlp_two_hidden = MiniBatchMLP(
            LinearLayer(num_input, hidden_unit),
            ReLULayer(),
            LinearLayer(hidden_unit, hidden_unit),
            ReLULayer(),
            LinearLayer(hidden_unit, num_output),
            SoftmaxOutputLayer()
        )

        mlps = [mlp_no_hidden, mlp_single_hidden, mlp_two_hidden]

        # Train the models and write one CSV of per-epoch metrics per model
        for mlp in mlps:
            losses, train_accuracies, val_accuracies, test_accuracies = train_mlp(mlp, hidden_unit, x_train, y_train, x_val, y_val, x_test, y_test, batch_size=batch_size)
            frame = pd.DataFrame()
            frame['loss'] = losses
            frame['train_acc'] = train_accuracies
            frame['val_acc'] = val_accuracies
            frame['test_acc'] = test_accuracies
            frame.to_csv('../results/mlp_accuracies_{}_hidden_{}_{}.csv'.format(mlp.n_hidden(), hidden_unit, batch_size), index=False)

## Task 3.2

For this task, we take the two-hidden-layer architecture from Task 3.1 and create two new implementations, one with sigmoid activation functions and one with Leaky-ReLU activation functions. Task 3.1 established a best batch size of 32, which is used here and for Task 3.3 as well. The results are stored in a DataFrame and saved to CSV files for further analysis.

In [None]:
import os

# Make sure the output directory exists before any training starts.
os.makedirs('../results', exist_ok=True)

for batch_size in [32]:
    for hidden_unit in hidden_units:

        leaky_mlp_two_hidden = MiniBatchMLP(
            LinearLayer(num_input, hidden_unit),
            LeakyReLULayer(),
            LinearLayer(hidden_unit, hidden_unit),
            LeakyReLULayer(),
            LinearLayer(hidden_unit, num_output),
            SoftmaxOutputLayer()
        )

        sigmoid_mlp_two_hidden = MiniBatchMLP(
            LinearLayer(num_input, hidden_unit),
            SigmoidLayer(),
            LinearLayer(hidden_unit, hidden_unit),
            SigmoidLayer(),
            LinearLayer(hidden_unit, num_output),
            SoftmaxOutputLayer()
        )

        names = ['leaky', 'sigmoid']
        # Train the models
        for mlp, name in zip([leaky_mlp_two_hidden, sigmoid_mlp_two_hidden], names):
            # Train and report on the model selected by this loop iteration.
            # Passing `mlp_two_hidden` here re-trained the leftover Task 3.1
            # model twice and never touched the leaky/sigmoid networks.
            losses, train_accuracies, val_accuracies, test_accuracies = train_mlp(mlp, hidden_unit, x_train, y_train, x_val, y_val, x_test, y_test, batch_size=batch_size)
            frame = pd.DataFrame()
            frame['loss'] = losses
            frame['train_acc'] = train_accuracies
            frame['val_acc'] = val_accuracies
            frame['test_acc'] = test_accuracies
            frame.to_csv('../results/{}_mlp_accuracies_{}_hidden_{}_{}.csv'.format(name, mlp.n_hidden(), hidden_unit, batch_size), index=False)

## Task 3.3

For this task we implement the two-hidden-layer model using ReLU activation. The training code loops over various values of the weight-penalty hyperparameter (lambda_coef). For each value we train twice, once with the L2 penalty term of the loss normalized by the number of weights and once without (the `normalize_l2_penalty` flag), to examine how the penalty strength interacts with that normalization. The results are stored in a DataFrame and saved to CSV files for further analysis.

In [None]:
# Weight-penalty coefficients (lambda_coef) swept in this experiment
lambdas = [1e-4, 1e-5, 1e-3, 0.1, 0.3, 0.5, 0.7]

for batch_size in [32]:
    for hidden_unit in hidden_units:
        for lambda_ in lambdas:
            # Each coefficient is run with the normalize_l2_penalty flag off, then on.
            regularize = [False, True]
            for regularize_ in regularize:
                # Two ReLU hidden layers; every linear layer gets the same
                # coefficient and flag.
                relu_mlp_two_hidden = MiniBatchMLP(
                    LinearLayer(num_input, hidden_unit, lambda_, regularize_),
                    ReLULayer(),
                    LinearLayer(hidden_unit, hidden_unit, lambda_, regularize_),
                    ReLULayer(),
                    LinearLayer(hidden_unit, num_output, lambda_, regularize_),
                    SoftmaxOutputLayer()
                )

                losses, train_accuracies, val_accuracies, test_accuracies = train_mlp(
                    relu_mlp_two_hidden, hidden_unit,
                    x_train, y_train, x_val, y_val, x_test, y_test,
                    batch_size=batch_size,
                )

                # One row per epoch, one column per tracked metric.
                frame = pd.DataFrame({
                    'loss': losses,
                    'train_acc': train_accuracies,
                    'val_acc': val_accuracies,
                    'test_acc': test_accuracies,
                })
                frame.to_csv('../results/{}_mlp_accuracies_{}_hidden_{}_{}_reg_{}.csv'.format(lambda_, relu_mlp_two_hidden.n_hidden(), hidden_unit, batch_size, regularize_), index=False)

## Task 3.4
<!--
For this task we design a Convolutional Neural Network (CNN) using tools from the PyTorch library.  It is designed with three convolutional layers and two fully-connected (dense) layers.  We implement the CNN using the adaptive moment estimation (Adam) optimization method with cross entropy loss across a wide variety of dense layer sizes within different batch sizes to test for an ideal configuration. -->

For this task we design a Convolutional Neural Network (CNN) using PyTorch. The CNN architecture comprises three convolutional layers and two fully-connected (dense) layers. The optimization is performed using the Adam optimizer along with cross-entropy loss. The code conducts experiments with different batch sizes and dense layer sizes to find an optimal configuration.

We begin by importing necessary libraries and defining the CNN architecture. The BasicCNN class initializes the model with specified parameters such as dense size, number of filters, kernel size, and image size. The CustomDataset class is used to preprocess and load the dataset for training, validation, and testing.

The training process involves iterating over various batch sizes and dense layer sizes. Within each iteration, the model is trained using the train_model function, which performs forward and backward passes, updates model weights, and computes training, validation, and test metrics. The results are stored in a DataFrame and saved to CSV files for further analysis.

In [None]:
import torch
from torch.utils.data import Dataset, Subset, DataLoader, SubsetRandomSampler
import random
import numpy as np
from typing import Tuple, List, Dict, Any
from numpy import array
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import os
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelBinarizer

num_gpus = [i for i in range(torch.cuda.device_count())]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BasicCNN(nn.Module):
    """Three unpadded, stride-1 convolutions followed by two dense layers.

    Args:
        dense_size: size of the fully connected hidden layer.
        channels: number of input channels.
        n_filters: number of filters in every convolution.
        kernel_size: side of the square convolution kernel.
        num_classes: number of output classes.
        image_size: side of the (square) input images.

    Raises:
        ValueError: if the image is too small for three convolutions with
            the given kernel size.
    """

    def __init__(self, dense_size, channels=1, n_filters=32, kernel_size=4,
                 num_classes=25, image_size=28):
        super(BasicCNN, self).__init__()
        self.conv1 = nn.Conv2d(channels, n_filters, kernel_size=kernel_size)
        self.conv2 = nn.Conv2d(n_filters, n_filters, kernel_size=kernel_size)
        self.conv3 = nn.Conv2d(n_filters, n_filters, kernel_size=kernel_size)
        self.activation = nn.ReLU()

        # Each unpadded stride-1 convolution shrinks the side by
        # kernel_size - 1, so after three of them the feature map has this
        # side. For the defaults (28, 4) this is 19, the same value the
        # previous expression produced; for any other configuration the old
        # expression gave a size that did not match the flattened features.
        feature_side = image_size - 3 * (kernel_size - 1)
        if feature_side <= 0:
            raise ValueError(
                "image_size={} is too small for three convolutions with kernel_size={}".format(
                    image_size, kernel_size))

        self.dense = nn.Linear(n_filters * feature_side * feature_side, dense_size)
        self.classifier = nn.Linear(dense_size, num_classes)

    def forward(self, x):
        """Return class scores for a batch shaped (bs, img_size, img_size, channels)."""
        # Swap axes 1 and 3 to put channels second. This also exchanges the
        # two spatial axes: (bs, H, W, C) --> (bs, C, W, H).
        x = torch.transpose(x, 1, 3)

        x = self.conv1(x)
        x = self.activation(x)

        x = self.conv2(x)
        x = self.activation(x)

        x = self.conv3(x)
        x = self.activation(x)

        x = torch.flatten(x, 1)

        x = self.dense(x)
        x = self.activation(x)

        out = self.classifier(x)

        return out

class CustomDataset(torch.utils.data.Dataset):
    """Dataset built from a DataFrame with a 'label' column plus pixel columns.

    The pixel columns are reshaped to ``(N, 28, 28, 1)``, divided by 255 and
    stored as float32; the labels are kept as read from the 'label' column.
    """

    def __init__(self, dataframe):
        self.labels = dataframe['label'].values
        self.data = dataframe.drop(columns=['label']).values
        self.normalize_dataset()

    def normalize_dataset(self):
        """Reshape the flat pixel rows into images and scale them by 1/255."""
        images = self.data.reshape(-1, 28, 28, 1)
        self.data = (images / 255).astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        """Return the ``(image, label)`` pair at position ``i`` as tensors."""
        image = torch.tensor(self.data[i])
        label = torch.tensor(self.labels[i])
        return image, label


def train_model(model: BasicCNN, dataloaders: Dict,
                batch_size: int, criterion: torch.nn.CrossEntropyLoss,
                optimizer: torch.optim.Optimizer, num_epochs: int, lr: float,
                device: torch.device, dense_size: int) -> Tuple[List, List, List, List, float, float]:
    """Train ``model``, keep the checkpoint with the lowest validation loss, and test it.

    Args:
        model: network to train.
        dataloaders: dict with 'train', 'val' and 'test' data loaders.
        batch_size, lr, dense_size: only used to build the checkpoint file name.
        criterion: loss function.
        optimizer: optimizer already bound to ``model``'s parameters.
        num_epochs: number of epochs to run.
        device: device the batches are moved to.

    Returns:
        ``(train_loss, train_acc, eval_loss, eval_acc, test_loss, test_acc)``.
        The first four are per-epoch lists; the last two come from
        ``test_model`` run on the best checkpoint.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of os.mkdir.
    os.makedirs('../models', exist_ok=True)

    # Start from +inf so the first validation epoch always writes a
    # checkpoint. A finite sentinel (1000) left nothing to load when every
    # validation loss exceeded it.
    best_loss = float('inf')
    train_loss, train_acc, eval_loss, eval_acc = [], [], [], []
    path = '../models/basic_cnn_{}_{}_{}_{}.pt'.format(batch_size, num_epochs, lr, dense_size)

    for epoch in range(num_epochs):

        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0.0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Gradients are tracked only in the training phase.
                with torch.set_grad_enabled(phase == 'train'):

                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Weight the batch loss by the batch size so that dividing by
                # the dataset length below gives a per-sample average.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            if phase == 'train':
                train_loss.append(epoch_loss)
                train_acc.append(epoch_acc.item())
            else:
                eval_loss.append(epoch_loss)
                eval_acc.append(epoch_acc.item())

            print('{} loss: {:.4f} acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'val' and epoch_loss < best_loss:
                # selecting the weights with lowest eval loss
                best_loss = epoch_loss
                torch.save(model.state_dict(), path)

    # Restore the best checkpoint before measuring test performance.
    checkpoints = torch.load(path)
    model.load_state_dict(checkpoints)

    test_loss, test_acc = test_model(model, dataloaders, criterion, device)

    print('Best val loss: {:4f}'.format(best_loss))
    print('Test loss: {:4f}'.format(test_loss))
    print('Test Accuracy: {:4f}'.format(test_acc))
    del model
    torch.cuda.empty_cache()

    return train_loss, train_acc, eval_loss, eval_acc, test_loss, test_acc

def test_model(model: nn.Module, dataloaders: Dict,
               criterion: torch.nn.CrossEntropyLoss,
               device: torch.device) -> Tuple[float, float]:
    """Evaluate ``model`` on ``dataloaders['test']``.

    Args:
        model: network to evaluate; it is switched to eval mode.
        dataloaders: dict whose 'test' entry is a data loader exposing ``.dataset``.
        criterion: loss function returning the mean loss of a batch.
        device: device the batches are moved to.

    Returns:
        ``(test_loss, test_acc)``: the per-sample average loss and the
        fraction of correctly classified samples, both as Python floats.
    """
    model.eval()
    # Keep the running total separate from the per-batch loss. Reusing one
    # name for both overwrote the total on every batch, so the reported loss
    # came from the last batch only.
    running_loss = 0.0
    running_corrects = 0
    with torch.no_grad():
        for inputs, labels in dataloaders['test']:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            batch_loss = criterion(outputs, labels)
            preds = outputs.argmax(dim=1)
            # Weight by batch size so the final division gives a per-sample mean.
            running_loss += batch_loss.item() * inputs.size(0)
            running_corrects += (preds == labels).sum().item()
    n_samples = len(dataloaders['test'].dataset)
    return running_loss / n_samples, running_corrects / n_samples


# The name matches pytest's default `test_*` pattern; mark it so test runners
# do not try to collect this evaluation helper as a test case.
test_model.__test__ = False

def build_model(dense_size: int, number_of_gpus: List, device: torch.device) -> BasicCNN:
    """Create a ``BasicCNN`` with the given dense size and move it to ``device``.

    Args:
        dense_size: size of the network's fully connected hidden layer.
        number_of_gpus: list of GPU indices; more than one entry triggers the
            multi-GPU branch.
        device: device the model is moved to before it is returned.
    """
    net = BasicCNN(dense_size)
    gpu_count = len(number_of_gpus)
    if gpu_count > 1:
        print("Let's use", gpu_count, "GPUs!")
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, number_of_gpus))
        # NOTE(review): the DataParallel wrapper is dropped again right away
        # via `.module`, so the returned model is the plain BasicCNN. Confirm
        # whether multi-GPU execution was actually intended.
        net = torch.nn.DataParallel(net, device_ids=number_of_gpus).module
    return net.to(device)


In [None]:
## Slightly different loading and handling of training/testing/validation data

import pandas as pd
from sklearn.model_selection import train_test_split
from utils import *
from torch.utils.data import DataLoader
import torch.optim as optim
import gdown

def download_datasets() -> None:
    """Download the Sign Language MNIST train/test CSV files into ``../data/``.

    The target directory is created first when it does not exist yet.
    """
    import os  # local import so the function works regardless of cell order

    train_data_path = "https://drive.google.com/uc?id=1Q6Iefx7rWFSpHhuQJ7ygPHHTTxMwLYth"
    test_data_path = "https://drive.google.com/uc?id=1M6aIcBXmRKRR1go9kKpDA9aGq_Uy0mz0"

    output_train = '../data/sign_mnist_train.csv'
    output_test = '../data/sign_mnist_test.csv'

    # Ensure the destination exists before anything is written into it.
    os.makedirs('../data', exist_ok=True)

    gdown.download(train_data_path, output_train)
    gdown.download(test_data_path, output_test)

train_path = '../data/sign_mnist_train.csv'
test_path = '../data/sign_mnist_test.csv'

# Download when either CSV is missing. Checking only the training file left a
# missing test file to fail in read_csv below.
if not (os.path.exists(train_path) and os.path.exists(test_path)):
    download_datasets()

train_frame = pd.read_csv(train_path)
test_frame = pd.read_csv(test_path)

# Hold out 10% of the training rows; the validation split is used to select
# the best weights.
real_train_frame, valid_frame = train_test_split(train_frame, test_size=0.1, random_state=42)

train_dataset = CustomDataset(real_train_frame)
valid_dataset = CustomDataset(valid_frame)
test_dataset = CustomDataset(test_frame)


def run_training() -> None:
    """Run the CNN grid over batch sizes and dense-layer sizes.

    For every batch size, one model per dense size is built, trained and
    tested. The collected metrics are written to
    ``../results/cnn_experiments_bs_<batch_size>.csv``.

    Reads the module-level ``train_dataset``, ``valid_dataset``,
    ``test_dataset``, ``num_gpus`` and ``device``.
    """
    import os  # local import so the function works regardless of cell order

    batch_sizes = [2, 4, 8, 16, 32, 64, 128, 256, 1024, 2048, 4096]
    dense_sizes = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
    num_epochs, lr = 100, 1e-4

    # Weight decay derived from the two constants and the training-set size;
    # none of these change between experiments.
    p, l_squared = 0.5, 0.5
    weight_decay = ((1 - p) * l_squared) / len(train_dataset)
    criterion = nn.CrossEntropyLoss()

    # Create the output directory up front so a long run cannot fail at the
    # very end when the CSV is written.
    os.makedirs('../results', exist_ok=True)

    for batch_size in batch_sizes:

        # The training loader reshuffles every epoch, as the MLP training
        # loop in this file does; validation and test keep a fixed order.
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

        experiment_dataloader_dict = {'train': train_dataloader, 'val': valid_dataloader,
                                      'test': test_dataloader}
        train_losses, train_accs, eval_losses, eval_accs, test_losses, test_accs = [], [], [], [], [], []

        for dense_size in dense_sizes:
            print('============== Experiment for bs: {} and dense_size: {} =============='.format(batch_size, dense_size))
            model = build_model(dense_size, num_gpus, device=device)
            optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
            train_loss, train_acc, eval_loss, eval_acc, test_loss, test_acc = train_model(
                model, experiment_dataloader_dict, optimizer=optimizer,
                batch_size=batch_size, criterion=criterion, num_epochs=num_epochs, lr=lr,
                device=device, dense_size=dense_size)

            train_losses.append(train_loss)
            eval_losses.append(eval_loss)
            test_losses.append(test_loss)

            train_accs.append(train_acc)
            eval_accs.append(eval_acc)
            test_accs.append(test_acc)

        # One row per dense size. The train/eval cells hold the per-epoch
        # lists returned by train_model.
        results_frame = pd.DataFrame()
        results_frame['dense_sizes'] = dense_sizes
        results_frame['train_loss'] = train_losses
        results_frame['train_acc'] = train_accs

        results_frame['eval_loss'] = eval_losses
        results_frame['eval_acc'] = eval_accs

        results_frame['test_loss'] = test_losses
        results_frame['test_acc'] = test_accs

        results_frame.to_csv('../results/cnn_experiments_bs_{}.csv'.format(batch_size), index=False)

run_training()