# **HW3: Required Submissions:**
1.  Submit  colab/jupyter notebooks.
2. You can do everything in one notebook or multiple notebooks
2. Pdf version of the notebooks (HWs will not be graded if pdf version is not provided.
3. **The notebooks and pdf files should have the output.**
4. **Name files as follows : FirstName_file1_hw3, FirstName_file2_h3**
5. You are not allowed to use PyTorch Lightning for this HW.

# Q1 Load CIFAR10 dataset (1 point)

- Load CIFAR 10 dataset from PyTorch datasets
- Create train/valid/test datasets and dataloaders
- Apply appropraite Transformations
- Create a smaller subset of 50 train images

In [None]:
# Importing the necessary libraries
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import random

from datetime import datetime
from pathlib import Path
import plotly.io as pio
pio.renderers.default = 'colab'

In [None]:
# Import random function
import random

# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data_folder = Path('/content/drive/MyDrive/Data/DL')

In [None]:
lecture_folder = Path('/content/drive/MyDrive/Data/Models/HW3')

In [None]:
# Install wandb and update it to the latest version
%%capture
!pip install wandb --upgrade

In [None]:
# Import wandb
import wandb

# Login to W&B
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Load dataset and necessary data loaders and transformations
# Transform to convert images to pytorch tensors and normalize the data
trans = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.473363,), (0.251568925))])

train_full = torchvision.datasets.CIFAR10(root=data_folder,
                                              train=True, 
                                              transform=trans,
                                              download=True)
trainset, validset = torch.utils.data.random_split(train_full, [40000, 10000], generator=torch.Generator().manual_seed(42) )
testset  = torchvision.datasets.CIFAR10(root=data_folder,
                                              train=False, 
                                              transform=trans,
                                              download=True)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


Files already downloaded and verified
Files already downloaded and verified


In [None]:
# Check length of original train and valid datasets
len(trainset),len(validset)

(40000, 10000)

In [None]:
# Create subset of 50 images
# n sample points
train_sample_size = 50
valid_sample_size = 10

# Getting n random indices
train_subset_indices = random.sample(range(0, len(trainset)), train_sample_size)
valid_subset_indices = random.sample(range(0, len(testset)), valid_sample_size)

# Getting subset of dataset
train_subset = torch.utils.data.Subset(trainset, train_subset_indices)
valid_subset = torch.utils.data.Subset(validset, valid_subset_indices)

In [None]:
# Shape of training data
train_full.data.shape

(50000, 32, 32, 3)

In [None]:
len(trainset.dataset)

50000

In [None]:
# Shape of testing data
testset.data.shape

(10000, 32, 32, 3)

In [None]:
# check the max value of inputs
train_full.data.max()

255

In [None]:
# check the min value of inputs
train_full.data.min()

0

In [None]:
# check the min value of inputs
train_full.data.mean()/255

0.4733630004850899

In [None]:
# check the min value of inputs
train_full.data.std()/255

0.2515689250632208

In [None]:
# Check smaller subset
len(train_subset)

50

In [None]:
check_loader = torch.utils.data.DataLoader(trainset, batch_size = 32, shuffle = True)

In [None]:
len(check_loader)

1250

In [None]:
1250*32

40000

In [None]:
# check imputs and outputs for transformations
for input, target in check_loader:
  print(f'shape of inputs is :{input.shape}')
  print(f'\nmax input value  :{input.max()}')
  print(f'\nmin input value  :{input.min()}')
  print(f'\nmean input value  :{input.mean()}')
  print(f'\nstd input value  :{input.std()}')
  print(f'\nshape of targets is :{target.shape}')
   
  break

shape of inputs is :torch.Size([32, 3, 32, 32])

max input value  :2.0934102535247803

min input value  :-1.881643533706665

mean input value  :-0.0411696620285511

std input value  :1.0160225629806519

shape of targets is :torch.Size([32])


In [None]:
def get_cifar10_labels(labels):  
    """ 
    Function to generate labels.
    Input: numerical labels
    Output: actual string labels
    """

    # Create a list of labels
    text_labels = ['plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

    # Return text_labels according to numerical values
    return [text_labels[int(i)] for i in labels]

# Q2 Overfit a three layer network (1 point)
Try a three-layer network ( 2 hidden layers and one output layer) with 100 units in each hidden layer. Tweak the learning rate and weight initialization to overfit the smaller subset and achieve 100% training accuracy within 20 epochs. No regularization method should be used in this step. Use SGD as optimizer. You will use 50 images from train dataset and  complete validation dataset for this question.  You will use ReLU activation as non linearity in your model. Use  batch_size of 25. 

In [None]:
# Define custom model using nn.Module()
class CustomDeepNetwork(nn.Module):
  def __init__(self,  output_dim, h_sizes, dprob, non_linearity, batch_norm):
        
    super().__init__()

    self.h_sizes = h_sizes
    self.non_linearity = non_linearity
    self.batch_norm = batch_norm
    self.dprob = dprob
    self.output_dim = output_dim

    # Initialize hidden layers  
    self.hidden = nn.ModuleList()
    self.dropout = nn.ModuleList()
    self.batchnorm = nn.ModuleList()

    for k in range(len(h_sizes)-1):
      self.hidden.append(nn.Linear(self.h_sizes[k], h_sizes[k+1]))
      self.dropout.append(nn.Dropout(p=dprob[k]))

      if self.batch_norm:
        self.batchnorm.append(nn.BatchNorm1d(self.h_sizes[k+1], momentum=0.9))
      
    
    self.output_layer = nn.Linear(self.h_sizes[-1], output_dim)
    self.flatten = nn.Flatten()
    
    ## it is better to use nn.functional.relu in the forward function
    # self.relu = nn.ReLU()

  def forward(self,x):
    x = self.flatten(x)

    for  k in range(len(self.h_sizes)-1):
      x =  self.non_linearity(self.hidden[k](x))
      if self.batch_norm:
        x = self.batchnorm[k](x)
      x= self.dropout[k](x)

    x = self.output_layer(x)
    # we are not using softmax function in the forward passs
    # nn.crossentropy loss (which we will use to define our loss) combines  nn.LogSoftmax() and nn.NLLLoss() in one single class
    return x  

In [None]:
# Training data epochs
def train(train_loader, model, optimizer, loss_function, log_batch, log_interval):

  """ 
  Function for training the model in each epoch
  Input: iterator for train dataset, initial weights and bias, epochs, learning rate.
  Output: final weights, bias, train loss, train accuracy
  """
  # initilalize variables as global
  # these counts will be updated every epoch
  global example_ct_train
  global batch_ct_train

  # Training Loop loop
  # Initialize train_loss at the he start of the epoch
  running_train_loss = 0
  running_train_correct = 0
  
  # put the model in training mode
  model.train()

  # Iterate on batches from the dataset using train_loader
  for input, targets in train_loader:
    
    # move inputs and outputs to GPUs
    input = input.to(device)
    targets = targets.to(device)

    # Forward pass
    output = model(input)
    loss = loss_function(output, targets)

    # Correct prediction
    y_pred = torch.argmax(output, dim = 1)
    correct = torch.sum(y_pred == targets)

    example_ct_train +=  len(targets)
    batch_ct_train += 1

    # set gradients to zero 
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # Update parameters using their gradient
    optimizer.step()
          
    # Add train loss of a batch 
    running_train_loss += loss.item()

    # Add Corect counts of a batch
    running_train_correct += correct

    # log batch loss and accuracy
    if log_batch:
      if ((batch_ct_train + 1) % log_interval) == 0:
        wandb.log({f"Train Batch Loss  :": loss})
        wandb.log({f"Train Batch Acc :": correct/len(targets)})

  
  # Calculate mean train loss for the whole dataset for a particular epoch
  train_loss = running_train_loss/len(train_loader)

  # Calculate accuracy for the whole dataset for a particular epoch
  train_acc = running_train_correct/len(train_loader.dataset)

  return train_loss, train_acc

In [None]:
# Validation data epochs
def valid(loader, model, optimizer, loss_function, log_batch, log_interval):

  """ 
  Function for training the model and plotting the graph for train & valid loss vs epoch.
  Input: iterator for train dataset, initial weights and bias, epochs, learning rate, batch size.
  Output: final weights, bias and train loss and valid loss for each epoch.
  """

  # initilalize variables as global
  # these counts will be updated every epoch
  global example_ct_valid
  global batch_ct_valid

  # Validation loop
  running_valid_loss = 0
  running_valid_correct = 0
  
  # put the model in evaluation mode
  model.eval()

  with torch.no_grad():
    for input,targets in loader:

      # move inputs and outputs to GPUs
      input = input.to(device)
      targets = targets.to(device)

      # Forward pass
      output = model(input)
      loss = loss_function(output,targets)

      # Correct Predictions
      y_pred = torch.argmax(output, dim = 1)
      correct = torch.sum(y_pred == targets)

      # count of images and batches
      example_ct_valid +=  len(targets)
      batch_ct_valid += 1

      # Add valid loss of a batch 
      running_valid_loss += loss.item()

      # Add correct count for each batch
      running_valid_correct += correct

      # log batch loss and accuracy
      if log_batch:
        if ((batch_ct_valid + 1) % log_interval) == 0:
          wandb.log({f"Valid Batch Loss  :": loss})
          wandb.log({f"Valid Batch Accuracy :": correct/len(targets)})


    # Calculate mean valid loss for the whole dataset for a particular epoch
    valid_loss = running_valid_loss/len(valid_loader)

    # Calculate accuracy for the whole dataset for a particular epoch
    valid_acc = running_valid_correct/len(valid_loader.dataset)
    
  return valid_loss, valid_acc

In [None]:
# Model Training
def train_loop(train_loader, valid_loader, model, loss_function, optimizer, epochs, device,
               file_model):

  '''
  model: specify your model for training
  criterion: loss function 
  optimizer: optimizer like SGD , ADAM etc.
  train loader: function to carete batches for training data
  loader : function to create batches for valid data set
  file_model : specify file name for saving your model. This way we can upload the model weights from file. We will not to run model again.
  

  '''
  # Create lists to store train and valid loss at each epoch

  train_loss_history = []
  valid_loss_history = []
  train_acc_history = []
  valid_acc_history = []
  delta = 0
  best_score = None
  valid_loss_min = np.Inf


  # Iterate for the given number of epochs
  for epoch in range(epochs):
    t0 = datetime.now()
    # Get train loss and accuracy for one epoch

    train_loss, train_acc = train(train_loader, model, optimizer, loss_function, 
                                  wandb.config.log_batch, wandb.config.log_interval)
    valid_loss, valid_acc = valid(valid_loader, model, optimizer, loss_function,
                                    wandb.config.log_batch, wandb.config.log_interval)

    dt = datetime.now() - t0

    # Save history of the Losses and accuracy
    train_loss_history.append(train_loss)
    train_acc_history.append(train_acc)
    valid_loss_history.append(valid_loss)
    valid_acc_history.append(valid_acc)

    score = -valid_loss
    if best_score is None:
      best_score=score
      print(f'Validation loss has decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving Model...')
      torch.save(model.state_dict(), file_model)
      valid_loss_min = valid_loss

    elif score < best_score + delta:
      print(f'Validation loss has not decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Not Saving Model...')
    
    else:
      best_score = score
      print(f'Validation loss has decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model...')
      torch.save(model.state_dict(), file_model)
      valid_loss_min = valid_loss

    # Log the train and valid loss to W&B
    wandb.log({f"Train epoch Loss :": train_loss, f"Valid epoch Loss :": valid_loss })
    wandb.log({f"Train epoch Acc :": train_acc, f"Valid epoch Acc :": valid_acc})



    # Print the train loss and accuracy for given number of epochs, batch size and number of samples
    print(f'Epoch : {epoch+1} / {epochs}')
    print(f'Time to complete {epoch+1} is {dt}')
    print(f'Train Loss: {train_loss : .4f} | Train Accuracy: {train_acc * 100 : .4f}%')
    print(f'Valid Loss: {valid_loss : .4f} | Valid Accuracy: {valid_acc * 100 : .4f}%')
    print()

  return train_loss_history, train_acc_history, valid_loss_history, valid_acc_history


In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 20,
    output_dim = 10,
    h_sizes = [3072] + [100] * 2,
    batch_norm = False,
    dprob = [0] * 2,
    batch_size = 25,
    learning_rate = 0.06,
    dataset="CIFAR10",
    architecture="MLP",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q2.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )

non_linearity = F.relu 


In [None]:
# Initilialize wandb
wandb.init(name = 'Q2', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train epoch Acc :,▁▅▆█████████████████▁▄▅▅▅▆▆▆▆▆▇▇▇▇▇█████
Train epoch Loss :,█▅▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂
Valid epoch Acc :,▅▁▄▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▂▄▅▅▅▅▆▇▇▇▇▇▇██▇█▇▇█
Valid epoch Loss :,▅█▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇███▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃

0,1
Train epoch Acc :,0.96
Train epoch Loss :,0.36357
Valid epoch Acc :,0.2021
Valid epoch Loss :,2.57485


In [None]:
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subset, batch_size=wandb.config.batch_size, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size, shuffle = False)

# cross entropy loss function
loss_function = nn.CrossEntropyLoss()

# device 
model = CustomDeepNetwork(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

#model.apply(init_weights) # No special initialization defaults to lecun's

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.SGD(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7fef10e2e8d0>]

In [None]:
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Validation loss has decreased (inf --> 2.288256). Saving Model...
Epoch : 1 / 20
Time to complete 1 is 0:00:06.371410
Train Loss:  2.3013 | Train Accuracy:  6.0000%
Valid Loss:  2.2883 | Valid Accuracy:  14.3000%

Validation loss has decreased (2.288256 --> 2.275278). Saving model...
Epoch : 2 / 20
Time to complete 2 is 0:00:06.407159
Train Loss:  2.1567 | Train Accuracy:  44.0000%
Valid Loss:  2.2753 | Valid Accuracy:  16.5100%

Validation loss has decreased (2.275278 --> 2.267970). Saving model...
Epoch : 3 / 20
Time to complete 3 is 0:00:06.337209
Train Loss:  2.0259 | Train Accuracy:  54.0000%
Valid Loss:  2.2680 | Valid Accuracy:  17.4600%

Validation loss has not decreased (2.267970 --> 2.273416). Not Saving Model...
Epoch : 4 / 20
Time to complete 4 is 0:00:06.402390
Train Loss:  1.8775 | Train Accuracy:  62.0000%
Valid Loss:  2.2734 | Valid Accuracy:  16.8800%

Validation loss has not decreased (2.267970 --> 2.284541). Not Saving Model...
Epoch : 5 / 20
Time to complete 5 is 0:

# Q3 Overfit a five layer network (1 Point)
Craete a five-layer network ( 4 hidden layers and one output layer) with 100 units on each layer to overfit smaller subset. Here also you will have to adjust the learning rate and weight initialization  to achieve 100% training accuracy within 20 epochs. Use SGD as optimizer. You will use 50 images from train dataset and  all the images from the validation dataset for this question.  You will use ReLU activation as non linearity in your model. Use batch_size of 25.

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 20,
    output_dim = 10,
    h_sizes = [3072] + [100] * 4,
    batch_norm = False,
    dprob = [0] * 4,
    batch_size = 25,
    learning_rate = 0.03,
    dataset="CIFAR10",
    architecture="MLP",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q3.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )

non_linearity = F.relu 

In [None]:
# Initilialize wandb
wandb.init(name = 'Q3', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train epoch Acc :,▁▁▁▁▁▁▁▁▁
Train epoch Loss :,█▇▆▅▄▃▂▂▁
Valid epoch Acc :,▁▃▆▃▂▆███
Valid epoch Loss :,▁▂▃▄▅▆▇▇█

0,1
Train epoch Acc :,1.0
Train epoch Loss :,0.20469
Valid epoch Acc :,0.1924
Valid epoch Loss :,2.65076


In [None]:
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subset, batch_size=wandb.config.batch_size, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.CrossEntropyLoss()

# device 
model = CustomDeepNetwork(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.xavier_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.SGD(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

In [None]:
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Validation loss has decreased (inf --> 2.399706). Saving Model...
Epoch : 1 / 20
Time to complete 1 is 0:00:02.246074
Train Loss:  2.3154 | Train Accuracy:  16.0000%
Valid Loss:  2.3997 | Valid Accuracy:  10.9700%

Validation loss has not decreased (2.399706 --> 2.413553). Not Saving Model...
Epoch : 2 / 20
Time to complete 2 is 0:00:02.168589
Train Loss:  1.9217 | Train Accuracy:  40.0000%
Valid Loss:  2.4136 | Valid Accuracy:  11.1700%

Validation loss has not decreased (2.399706 --> 2.422621). Not Saving Model...
Epoch : 3 / 20
Time to complete 3 is 0:00:02.168676
Train Loss:  1.6545 | Train Accuracy:  46.0000%
Valid Loss:  2.4226 | Valid Accuracy:  12.4200%

Validation loss has not decreased (2.399706 --> 2.442618). Not Saving Model...
Epoch : 4 / 20
Time to complete 4 is 0:00:02.217401
Train Loss:  1.4738 | Train Accuracy:  52.0000%
Valid Loss:  2.4426 | Valid Accuracy:  13.0800%

Validation loss has not decreased (2.399706 --> 2.442037). Not Saving Model...
Epoch : 5 / 20
Time to

# Q4 : Optimizers (2 Points)

Train a six-layer network ( 5 hidden layers and one output layer) with SGD, SGD+momentum. You will use 4000 images from train dataset and all the images from the validation dataset for this question. You will use ReLU activation as non linearity in your model.  Use  batch_size of 100 and learning rate of around 5e-02. Which one converges faster? 


In [None]:
# Create 4000 image subset of training dataset
# n sample points
train_sample_size4 = 4000

# Getting n random indices
train_subset_indices4 = random.sample(range(0, len(trainset)), train_sample_size4)

# Getting subset of dataset
train_subset4 = torch.utils.data.Subset(trainset, train_subset_indices4)

In [None]:
len(train_subset4)

4000

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 30,
    output_dim = 10,
    h_sizes = [3072] + [100] * 5,
    batch_norm = False,
    dprob = [0] * 5,
    batch_size = 100,
    learning_rate = 5e-02,
    dataset="CIFAR10",
    architecture="MLP",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q4.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )

non_linearity = F.relu 

In [None]:
# Initilialize wandb
wandb.init(name = 'Q4', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train epoch Acc :,▁▂▃▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇██
Train epoch Loss :,█▇▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▁▁▁
Valid epoch Acc :,▁▄▆▅▇▇▆█▆▆█▆▇▄█▅▅▆▅▇
Valid epoch Loss :,▃▂▂▂▁▁▁▁▂▃▂▄▃▇▃▇▆▆█▇

0,1
Train epoch Acc :,0.7925
Train epoch Loss :,0.64691
Valid epoch Acc :,0.3809
Valid epoch Loss :,2.44434


In [None]:
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subset4, batch_size=wandb.config.batch_size, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.CrossEntropyLoss()

# device 
model = CustomDeepNetwork(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.xavier_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.SGD(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

In [None]:
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Validation loss has decreased (inf --> 2.000381). Saving Model...
Epoch : 1 / 30
Time to complete 1 is 0:00:03.045798
Train Loss:  2.1246 | Train Accuracy:  22.3750%
Valid Loss:  2.0004 | Valid Accuracy:  27.7900%

Validation loss has decreased (2.000381 --> 1.874531). Saving model...
Epoch : 2 / 30
Time to complete 2 is 0:00:02.971482
Train Loss:  1.8977 | Train Accuracy:  31.3250%
Valid Loss:  1.8745 | Valid Accuracy:  33.1300%

Validation loss has decreased (1.874531 --> 1.828926). Saving model...
Epoch : 3 / 30
Time to complete 3 is 0:00:03.015825
Train Loss:  1.7679 | Train Accuracy:  37.5000%
Valid Loss:  1.8289 | Valid Accuracy:  35.1500%

Validation loss has not decreased (1.828926 --> 1.871571). Not Saving Model...
Epoch : 4 / 30
Time to complete 4 is 0:00:02.964783
Train Loss:  1.6578 | Train Accuracy:  41.5000%
Valid Loss:  1.8716 | Valid Accuracy:  33.5400%

Validation loss has decreased (1.828926 --> 1.771366). Saving model...
Epoch : 5 / 30
Time to complete 5 is 0:00:02.9

In [None]:
# Initilialize wandb
wandb.init(name = 'Q4_2', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train epoch Acc :,▁▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇██▇▇██▇
Train epoch Loss :,█▇▇▆▆▆▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▁▁▂▂▁▁▂
Valid epoch Acc :,▁▄▆▅▇▇▆█▆▆█▆▇▄█▅▅▆▅▇▇▇▇▅▆▆▅█▆▃
Valid epoch Loss :,▂▂▁▂▁▁▁▁▁▂▂▂▂▄▂▄▄▄▅▄▄▅▆█▇█▆▇▆▃

0,1
Train epoch Acc :,0.85975
Train epoch Loss :,0.60281
Valid epoch Acc :,0.3122
Valid epoch Loss :,2.27421


In [None]:
# For SGD with momentum
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subset4, batch_size=wandb.config.batch_size, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.CrossEntropyLoss()

# device 
model = CustomDeepNetwork(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.xavier_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.SGD(model.parameters(), lr = wandb.config.learning_rate, momentum=0.9)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

In [None]:
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Validation loss has decreased (inf --> 1.925511). Saving Model...
Epoch : 1 / 30
Time to complete 1 is 0:00:03.544200
Train Loss:  2.0623 | Train Accuracy:  23.8750%
Valid Loss:  1.9255 | Valid Accuracy:  29.5100%

Validation loss has decreased (1.925511 --> 1.832888). Saving model...
Epoch : 2 / 30
Time to complete 2 is 0:00:03.086463
Train Loss:  1.8567 | Train Accuracy:  32.7500%
Valid Loss:  1.8329 | Valid Accuracy:  33.5200%

Validation loss has not decreased (1.832888 --> 1.853909). Not Saving Model...
Epoch : 3 / 30
Time to complete 3 is 0:00:03.086779
Train Loss:  1.7209 | Train Accuracy:  37.5500%
Valid Loss:  1.8539 | Valid Accuracy:  34.9300%

Validation loss has not decreased (1.832888 --> 1.868482). Not Saving Model...
Epoch : 4 / 30
Time to complete 4 is 0:00:03.060086
Train Loss:  1.6212 | Train Accuracy:  42.2250%
Valid Loss:  1.8685 | Valid Accuracy:  34.5500%

Validation loss has not decreased (1.832888 --> 1.835842). Not Saving Model...
Epoch : 5 / 30
Time to complet

The model using the SGD optimizer with momentum converged faster than the model using the SGD optimizer without momentum.

# Q5 : Regularization (2 Points)

In this question, you will add dropout layer. Add dropout after every ReLU non-linearity.

You will now train following two-layer networks:

1. Hidden size 256, dropout = 0
2. Hidden size 512, dropout = 0
3. Hidden size 512, dropout = 0.5

You will use 20,000 images from train dataset and all the images from the validation dataset for this question. You will use ReLU activation as non linearity in your model. Use batch_size of 100 and learning rate of around 5e-03. In this experiment, you will use Adam optimizer. Further train model for 100 epochs and use batch size of 512.
Which model gave better accuracy on validation datset - smaller model with no regularization or a bigger model with regularization? 

In [None]:
# Create subset of 20,000 images from training dataset
# n sample points
train_sample_size20 = 20000

# Getting n random indices
train_subset_indices20 = random.sample(range(0, len(trainset)), train_sample_size20)

# Getting subset of dataset
train_subset20 = torch.utils.data.Subset(trainset, train_subset_indices20)

In [None]:
len(train_subset20)

20000

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 100,
    output_dim = 10,
    h_sizes = [3072] + [256],
    batch_norm = True,
    dprob = [0],
    batch_size = 100,
    learning_rate = 5e-03,
    dataset="CIFAR10",
    architecture="MLP",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q5_1_1.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu 

In [None]:
# Initilialize wandb
wandb.init(name = 'Q5', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train epoch Acc :,▁▂▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇█▇▇█▇█▇█████▇████████
Train epoch Loss :,█▆▅▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁▂▁▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁
Valid epoch Acc :,▁▄▅▇█▆▆▇▇▇▅█▆▅█▆▇▅▅▆▇▅▅▆▆▅▆▅▆▆▅▆▇▅▆▅▆▆▅▆
Valid epoch Loss :,▂▁▁▁▁▁▂▂▂▂▃▃▃▄▃▄▄▅▅▄▄▆▆▅▆▆▆▇▇▇▇▇▆▇▇██▇▇▇

0,1
Train epoch Acc :,0.95815
Train epoch Loss :,0.13724
Valid epoch Acc :,0.448
Valid epoch Loss :,3.9234


In [None]:
# For Adam optimizer
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subset20, batch_size=wandb.config.batch_size, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.CrossEntropyLoss()

# device 
model = CustomDeepNetwork(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7f8be968ccd0>]

In [None]:
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Validation loss has decreased (inf --> 1.696490). Saving Model...
Epoch : 1 / 100
Time to complete 1 is 0:00:22.901535
Train Loss:  1.8804 | Train Accuracy:  35.7150%
Valid Loss:  1.6965 | Valid Accuracy:  40.9600%

Validation loss has decreased (1.696490 --> 1.629888). Saving model...
Epoch : 2 / 100
Time to complete 2 is 0:00:23.198470
Train Loss:  1.5996 | Train Accuracy:  44.2150%
Valid Loss:  1.6299 | Valid Accuracy:  43.2700%

Validation loss has decreased (1.629888 --> 1.563060). Saving model...
Epoch : 3 / 100
Time to complete 3 is 0:00:23.228773
Train Loss:  1.4932 | Train Accuracy:  47.7500%
Valid Loss:  1.5631 | Valid Accuracy:  45.4000%

Validation loss has decreased (1.563060 --> 1.559669). Saving model...
Epoch : 4 / 100
Time to complete 4 is 0:00:23.174432
Train Loss:  1.4095 | Train Accuracy:  50.7550%
Valid Loss:  1.5597 | Valid Accuracy:  46.3100%

Validation loss has not decreased (1.559669 --> 1.588669). Not Saving Model...
Epoch : 5 / 100
Time to complete 5 is 0:00

Now, will train the model with 100 epochs and a batch size of 512 for the first two-layer network.

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 100,
    output_dim = 10,
    h_sizes = [3072] + [256],
    batch_norm = True,
    dprob = [0],
    batch_size = 512,
    learning_rate = 5e-03,
    dataset="CIFAR10",
    architecture="MLP",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q5_1_2.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu 

In [None]:
# Initilialize wandb
wandb.init(name = 'Q5_1_1', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train epoch Acc :,▁▂▃▄▄▅▅▅▆▆▆▇▇▇▇▇▇▇▇█████████████████████
Train epoch Loss :,█▆▆▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Valid epoch Acc :,▁▅▇▆█▇█▇▆▇▅▇▅▅▆▆▆▆▅▅▅▅▅▆▆▄▅▅▆▅▅▆▅▅▆▅▅▇▅▆
Valid epoch Loss :,▁▁▁▁▁▁▁▂▂▂▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▆▇▇▇▇▇█▇█▇██

0,1
Train epoch Acc :,0.9671
Train epoch Loss :,0.10439
Valid epoch Acc :,0.4695
Valid epoch Loss :,4.71729


In [None]:
# For Adam optimizer
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subset20, batch_size=wandb.config.batch_size, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.CrossEntropyLoss()

# device 
model = CustomDeepNetwork(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7f8be418be10>]

In [None]:
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Validation loss has decreased (inf --> 1.753011). Saving Model...
Epoch : 1 / 100
Time to complete 1 is 0:00:12.123238
Train Loss:  2.1432 | Train Accuracy:  31.2100%
Valid Loss:  1.7530 | Valid Accuracy:  38.7700%

Validation loss has decreased (1.753011 --> 1.723279). Saving model...
Epoch : 2 / 100
Time to complete 2 is 0:00:11.766354
Train Loss:  1.6762 | Train Accuracy:  41.0650%
Valid Loss:  1.7233 | Valid Accuracy:  41.3200%

Validation loss has decreased (1.723279 --> 1.642208). Saving model...
Epoch : 3 / 100
Time to complete 3 is 0:00:11.733672
Train Loss:  1.5986 | Train Accuracy:  44.5550%
Valid Loss:  1.6422 | Valid Accuracy:  42.6200%

Validation loss has decreased (1.642208 --> 1.601442). Saving model...
Epoch : 4 / 100
Time to complete 4 is 0:00:11.673638
Train Loss:  1.5204 | Train Accuracy:  46.7000%
Valid Loss:  1.6014 | Valid Accuracy:  44.6700%

Validation loss has not decreased (1.601442 --> 1.655050). Not Saving Model...
Epoch : 5 / 100
Time to complete 5 is 0:00

Second two-layer neural network model with Hidden size 512 and dropout = 0

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 100,
    output_dim = 10,
    h_sizes = [3072] + [512],
    batch_norm = True,
    dprob = [0],
    batch_size = 100,
    learning_rate = 5e-03,
    dataset="CIFAR10",
    architecture="MLP",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q4\5_2_1.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu 

In [None]:
# Initilialize wandb
wandb.init(name = 'Q5_2', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train epoch Acc :,▁▂▃▄▄▅▅▅▆▆▇▇▇▇▇▇▇▇▇▇████████████████████
Train epoch Loss :,█▆▆▅▅▄▄▄▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Valid epoch Acc :,▁▅▇▇███▇▆▇▆▆▅▆▆▅▅▆▆▅▅▆▅▅▆▆▅▅▆▅▅▆▆▅▅▅▃▅▅▆
Valid epoch Loss :,▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▆▆▇▇▇▇█▇▇▇

0,1
Train epoch Acc :,0.96475
Train epoch Loss :,0.10491
Valid epoch Acc :,0.4659
Valid epoch Loss :,4.67282


In [None]:
# For Adam optimizer
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subset20, batch_size=wandb.config.batch_size, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.CrossEntropyLoss()

# device 
model = CustomDeepNetwork(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7f8be550b850>]

In [None]:
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Validation loss has decreased (inf --> 1.745676). Saving Model...
Epoch : 1 / 100
Time to complete 1 is 0:00:37.940330
Train Loss:  1.9397 | Train Accuracy:  35.1900%
Valid Loss:  1.7457 | Valid Accuracy:  39.4400%

Validation loss has decreased (1.745676 --> 1.625384). Saving model...
Epoch : 2 / 100
Time to complete 2 is 0:00:37.862800
Train Loss:  1.6284 | Train Accuracy:  42.9100%
Valid Loss:  1.6254 | Valid Accuracy:  44.0200%

Validation loss has not decreased (1.625384 --> 1.654846). Not Saving Model...
Epoch : 3 / 100
Time to complete 3 is 0:00:37.493685
Train Loss:  1.5199 | Train Accuracy:  46.7100%
Valid Loss:  1.6548 | Valid Accuracy:  43.0100%

Validation loss has not decreased (1.625384 --> 1.637864). Not Saving Model...
Epoch : 4 / 100
Time to complete 4 is 0:00:37.169047
Train Loss:  1.4160 | Train Accuracy:  50.8900%
Valid Loss:  1.6379 | Valid Accuracy:  45.1100%

Validation loss has decreased (1.625384 --> 1.534564). Saving model...
Epoch : 5 / 100
Time to complete 5

Now, the same model with batch_size=512 and epochs=100.

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 100,
    output_dim = 10,
    h_sizes = [3072] + [512],
    batch_norm = True,
    dprob = [0],
    batch_size = 512,
    learning_rate = 5e-03,
    dataset="CIFAR10",
    architecture="MLP",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q5_2_2.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu 

In [None]:
# Initilialize wandb
wandb.init(name = 'Q5_2_1', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train epoch Acc :,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇███████████████████████
Train epoch Loss :,█▆▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Valid epoch Acc :,▁▄▇███▇██▆█▆▇▆▇▇▇▇▇█▆▇▇▇▇▇▆▇▇▇▇▇▆▇▆▆▆▇▇▇
Valid epoch Loss :,▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇█▇█▇▇█

0,1
Train epoch Acc :,0.96395
Train epoch Loss :,0.11513
Valid epoch Acc :,0.4686
Valid epoch Loss :,4.97335


In [None]:
# For Adam optimizer
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subset20, batch_size=wandb.config.batch_size, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.CrossEntropyLoss()

# device 
model = CustomDeepNetwork(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7f8be3a35b90>]

In [None]:
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Validation loss has decreased (inf --> 1.816538). Saving Model...
Epoch : 1 / 100
Time to complete 1 is 0:00:16.275166
Train Loss:  2.2951 | Train Accuracy:  31.8300%
Valid Loss:  1.8165 | Valid Accuracy:  35.3500%

Validation loss has decreased (1.816538 --> 1.739346). Saving model...
Epoch : 2 / 100
Time to complete 2 is 0:00:16.022647
Train Loss:  1.6775 | Train Accuracy:  40.8800%
Valid Loss:  1.7393 | Valid Accuracy:  40.6000%

Validation loss has decreased (1.739346 --> 1.659620). Saving model...
Epoch : 3 / 100
Time to complete 3 is 0:00:16.162685
Train Loss:  1.5822 | Train Accuracy:  44.6150%
Valid Loss:  1.6596 | Valid Accuracy:  41.8500%

Validation loss has not decreased (1.659620 --> 1.707487). Not Saving Model...
Epoch : 4 / 100
Time to complete 4 is 0:00:16.022968
Train Loss:  1.5110 | Train Accuracy:  47.1250%
Valid Loss:  1.7075 | Valid Accuracy:  42.1400%

Validation loss has decreased (1.659620 --> 1.638553). Saving model...
Epoch : 5 / 100
Time to complete 5 is 0:00

Third model with Hidden size = 512 and dropout = 0.5.

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 100,
    output_dim = 10,
    h_sizes = [3072] + [512],
    batch_norm = True,
    dprob = [0.5],
    batch_size = 100,
    learning_rate = 5e-03,
    dataset="CIFAR10",
    architecture="MLP",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q5_3_1.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu 

In [None]:
# Initilialize wandb
wandb.init(name = 'Q5_3', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train epoch Acc :,▁▂▃▃▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇█▇▇▇▇███████▇████████
Train epoch Loss :,█▆▅▅▄▄▄▄▃▃▃▂▂▂▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁
Valid epoch Acc :,▁▅▇▆▇▇▆▇█▆▇▇▆▄▆█▆█▅▆▇▇▇▇▇▇▇▇▇▄▇▇██▇█▇▇▆▆
Valid epoch Loss :,▁▁▁▁▁▁▂▂▁▂▂▃▃▅▄▃▄▃▅▅▄▅▄▅▄▅▆▆▅▇▆▆▆▆█▆▇▇▇█

0,1
Train epoch Acc :,0.9627
Train epoch Loss :,0.12324
Valid epoch Acc :,0.446
Valid epoch Loss :,4.80493


In [None]:
# For Adam optimizer
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subset20, batch_size=wandb.config.batch_size, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.CrossEntropyLoss()

# device 
model = CustomDeepNetwork(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7f8be39cd850>]

In [None]:
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Validation loss has decreased (inf --> 1.712177). Saving Model...
Epoch : 1 / 100
Time to complete 1 is 0:00:37.490904
Train Loss:  2.1788 | Train Accuracy:  30.6300%
Valid Loss:  1.7122 | Valid Accuracy:  39.0100%

Validation loss has decreased (1.712177 --> 1.711150). Saving model...
Epoch : 2 / 100
Time to complete 2 is 0:00:37.099912
Train Loss:  1.8133 | Train Accuracy:  36.5400%
Valid Loss:  1.7111 | Valid Accuracy:  40.8500%

Validation loss has decreased (1.711150 --> 1.599598). Saving model...
Epoch : 3 / 100
Time to complete 3 is 0:00:37.271258
Train Loss:  1.7003 | Train Accuracy:  40.4550%
Valid Loss:  1.5996 | Valid Accuracy:  43.7900%

Validation loss has decreased (1.599598 --> 1.547552). Saving model...
Epoch : 4 / 100
Time to complete 4 is 0:00:37.261238
Train Loss:  1.6326 | Train Accuracy:  42.1600%
Valid Loss:  1.5476 | Valid Accuracy:  45.8900%

Validation loss has decreased (1.547552 --> 1.534583). Saving model...
Epoch : 5 / 100
Time to complete 5 is 0:00:37.5207

Finally, training the third model with 100 epochs and a batch size of 512.

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 100,
    output_dim = 10,
    h_sizes = [3072] + [512],
    batch_norm = True,
    dprob = [0.5],
    batch_size = 512,
    learning_rate = 5e-03,
    dataset="CIFAR10",
    architecture="MLP",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q5_3_2.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu 

In [None]:
# Initilialize wandb
wandb.init(name = 'Q5_3', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train epoch Acc :,▁▂▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇████████
Train epoch Loss :,█▆▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
Valid epoch Acc :,▁▄▅▆▆▇▆▇█▇▇▇▇████▇█▇█▇██▇▇▇█▇▇▇▇█▇▇▇█▇▇▇
Valid epoch Loss :,▅▃▂▂▁▁▂▁▁▁▁▁▂▁▁▂▂▂▂▂▃▄▃▄▄▄▅▄▅▅▆▆▅▆▆▆▇▇▇█

0,1
Train epoch Acc :,0.79625
Train epoch Loss :,0.59442
Valid epoch Acc :,0.5058
Valid epoch Loss :,1.93203


In [None]:
# For Adam optimizer
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subset20, batch_size=wandb.config.batch_size, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.CrossEntropyLoss()

# device 
model = CustomDeepNetwork(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7f8be395aa90>]

In [None]:
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Validation loss has decreased (inf --> 1.809185). Saving Model...
Epoch : 1 / 100
Time to complete 1 is 0:00:16.082093
Train Loss:  2.5759 | Train Accuracy:  26.8400%
Valid Loss:  1.8092 | Valid Accuracy:  35.3300%

Validation loss has decreased (1.809185 --> 1.699606). Saving model...
Epoch : 2 / 100
Time to complete 2 is 0:00:15.600139
Train Loss:  1.9195 | Train Accuracy:  33.9050%
Valid Loss:  1.6996 | Valid Accuracy:  40.8000%

Validation loss has decreased (1.699606 --> 1.669424). Saving model...
Epoch : 3 / 100
Time to complete 3 is 0:00:15.549427
Train Loss:  1.7765 | Train Accuracy:  37.5750%
Valid Loss:  1.6694 | Valid Accuracy:  41.7700%

Validation loss has decreased (1.669424 --> 1.621998). Saving model...
Epoch : 4 / 100
Time to complete 4 is 0:00:15.644916
Train Loss:  1.7036 | Train Accuracy:  39.6000%
Valid Loss:  1.6220 | Valid Accuracy:  42.8000%

Validation loss has decreased (1.621998 --> 1.605964). Saving model...
Epoch : 5 / 100
Time to complete 5 is 0:00:15.5111

Overall, bigger models with regularization tend to give better accuracy and generalization to the validation dataset than smaller models with no regularization.

# Q6 Batch Norm and SELU (3 Points)
Generate training and test datasets for a binary classiﬁcation problem using Fashion-MNIST with class 1 being a combination of sneaker and pullover and class 0 being the combination of sandal and shirt categories. 
- Train the model using Logistic regression. Report train and test loss.
- Train a Neural Network with one hidden layer (100 neurons). Use Logistic loss function, Adam optimizer and Relu activation for hidden layer.  First overfit a small sample to check erroers and get idea of learning rate. Then train on complete dataset. Add regularization (dropout or weight decay)if needed.
- Now add another hidden layer (50 Neurons). Adjust the learning rate if you have to. Add regularization (dropout or weight decay) if needed.
- Now try adding Batch Normalization and compare the train and test loss : Is it converging faster than before? Does it produce a better model? How does it affect training speed? **Do not use dropout with batch normalization.**
- Try replacing Batch Normalization with SELU, and make the necessary adjustments to ensure the network self-normalizes (i.e., standardize the input features, use LeCun normal initialization, make sure the DNN contains only a sequence of dense layers). Compare the results with Batch Normalization. **For SELU if you are using dropout then use alpha dropout.** Alpha dropout make sure that network is self normalized.


In [None]:
# Generate train and test datasets
# Transform to convert images to pytorch tensors and normalize the data
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.2860,), (0.3530,))])
train_full = torchvision.datasets.FashionMNIST(root=data_folder,
                                              train=True, 
                                              transform=trans,
                                              download=True)
trainset, validset = torch.utils.data.random_split(train_full, [50000, 10000], generator=torch.Generator().manual_seed(42) )
testset  = torchvision.datasets.FashionMNIST(root=data_folder,
                                              train=False, 
                                              transform=trans,
                                              download=True)

In [None]:
# Check targets to change to binary targets
# check target values for train set
train_full.targets.shape

torch.Size([60000])

In [None]:
# Unique Target values
train_full.targets.unique()

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
# Filter data for observations with only target values that are 2 = pullover, 7 = sneaker, 5 = sandal, 6 = shirt
idx = (train_full.targets==7) | (train_full.targets==2) | (train_full.targets==5) | (train_full.targets==6)
train_full.targets = train_full.targets[idx]
train_full.data = train_full.data[idx]

In [None]:
train_full.targets.unique()

tensor([2, 5, 6, 7])

In [None]:
len(train_full)

24000

In [None]:
idx = (testset.targets==7) | (testset.targets==2) | (testset.targets==5) | (testset.targets==6)
testset.targets = testset.targets[idx]
testset.data = testset.data[idx]

In [None]:
testset.targets.unique()

tensor([2, 5, 6, 7])

In [None]:
len(testset)

4000

In [None]:
# Map target classes: sneaker and pullover -> class 1, sandal and shirt -> class 0
train_full.targets[train_full.targets == 2] = 1
train_full.targets[train_full.targets == 7] = 1
train_full.targets[train_full.targets == 5] = 0
train_full.targets[train_full.targets == 6] = 0

In [None]:
testset.targets[testset.targets == 2] = 1
testset.targets[testset.targets == 7] = 1
testset.targets[testset.targets == 5] = 0
testset.targets[testset.targets == 6] = 0

In [None]:
train_full.targets.unique()

tensor([0, 1])

In [None]:
train_full.targets = train_full.targets.unsqueeze(-1)

In [None]:
testset.targets.unique()

tensor([0, 1])

In [None]:
testset.targets = testset.targets.unsqueeze(-1)

Logistic Regression

In [None]:
# Initializing the batch size
batch_size = 256

# Creating data loader for train set
train_loader = torch.utils.data.DataLoader(dataset = train_full, 
                                            batch_size = batch_size, 
                                            shuffle = True)

# Creating data loader for test set
test_loader = torch.utils.data.DataLoader(dataset = testset, 
                                          batch_size = batch_size, 
                                          shuffle = False)


In [None]:
# Initialize a sequential model with 2 layers: flatten and linear 
num_inputs = 784
num_outputs = 1

model = nn.Sequential(nn.Flatten(),
                      nn.Linear(in_features=num_inputs, out_features= num_hidden),
                      nn.Sigmoid())

In [None]:
# loss function
logloss = nn.NLLLoss()

In [None]:
def train(train_loader, model, optimizer, loss_function):

  """ 
  Function for training the model in each epoch
  Input: iterator for train dataset, initial weights and bias, epochs, learning rate.
  Output: final weights, bias, train loss, train accuracy
  """


  # Training Loop loop
  # Initialize train_loss at the he strat of the epoch
  running_train_loss = 0
  running_train_correct = 0
  
  # put the model in training mode
  model.train()

  # Iterate on batches from the dataset using train_loader
  for input, targets in train_loader:
    
    # move inputs and outputs to GPUs
    input = input.to(device)
    targets = targets.to(device)

    # Forward pass
    output = model(input)
    loss = loss_function(output, targets)

    # Correct prediction
    y_pred = torch.argmax(output, dim = 1)
    correct = torch.sum(y_pred == targets)

    # set gradients to zero 
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # Update parameters using their gradient
    optimizer.step()
          
    # Add train loss of a batch 
    running_train_loss += loss.item()

    # Add Corect counts of a batch
    running_train_correct += correct
  
  # Calculate mean train loss for the whole dataset for a particular epoch
  train_loss = running_train_loss/len(train_loader)

  # Calculate accuracy for the whole dataset for a particular epoch
  train_acc = running_train_correct/len(train_loader.dataset)

  return train_loss, train_acc

In [None]:
def validate(test_loader, model, loss_function):

  """ 
  Function for training the model and plotting the graph for train & test loss vs epoch.
  Input: iterator for train dataset, initial weights and bias, epochs, learning rate, batch size.
  Output: final weights, bias and train loss and test loss for each epoch.
  """

  # Validation loop
  # Initialize train_loss at the he strat of the epoch
  running_test_loss = 0
  running_test_correct = 0
  
  # put the model in evaluation mode
  model.eval()

  with torch.no_grad():
    for input,targets in test_loader:

      # move inputs and outputs to GPUs
      input = input.to(device)
      targets = targets.to(device)

      # Forward pass
      output = model(input)
      loss = logloss(output,targets)

      # Correct Predictions
      y_pred = torch.argmax(output, dim = 1)
      correct = torch.sum(y_pred == targets)

      # Add test loss of a batch 
      running_test_loss += loss.item()

      # Add correct count for each batch
      running_test_correct += correct

    # Calculate mean test loss for the whole dataset for a particular epoch
    test_loss = running_test_loss/len(test_loader)

    # Calculate accuracy for the whole dataset for a particular epoch
    test_acc = running_test_correct/len(test_loader.dataset)
    
  return test_loss, test_acc

In [None]:
# device 
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model = model
# this step is optional
# nn.Linear does this step, we are overriding it
for layer in model:
  if isinstance(layer, nn.Linear):
   torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
   torch.nn.init.zeros_(layer.bias)

# put model to GPUs
model.to(device)

# Initialize number of epochs, learning rate and batch size
epochs = 10

learning_rate = 0.05

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

loss_function = logloss

In [None]:
# Create lists to store train and test loss at each epoch
train_loss_history = []
test_loss_history = []
train_acc_history = []
test_acc_history = []

# Iterate for the given number of epochs
for epoch in range(epochs):

  # Get train loss and accuracy for one epoch
  train_loss, train_acc = train(train_loader, model, optimizer, loss_function)
  test_loss, test_acc   = validate(test_loader, model, loss_function)

  # Save history of the Losses and accuracy
  train_loss_history.append(train_loss)
  train_acc_history.append(train_acc)

  test_loss_history.append(test_loss)
  test_acc_history.append(test_acc)

  # Log the train and test loss to W&B
  wandb.log({f"Train Loss :": train_loss})
  wandb.log({f"Train Acc :": train_acc})

  wandb.log({f"Test Loss :": test_loss})
  wandb.log({f"Test Acc :": test_acc})


  # Print the train loss and accuracy for given number of epochs, batch size and number of samples
  print(f'Epoch : {epoch+1} / {epochs}')
  print(f'Train Loss: {train_loss : .4f} | Train Accuracy: {train_acc * 100 : .4f}%')
  print(f'Test Loss: {test_loss : .4f} | Test Accuracy: {test_acc * 100 : .4f}%')
  print()


Epoch : 1 / 10
Train Loss: -0.9973 | Train Accuracy:  71.6667%
Test Loss: -0.9978 | Test Accuracy:  71.7250%

Epoch : 2 / 10
Train Loss: -0.9983 | Train Accuracy:  71.6833%
Test Loss: -0.9985 | Test Accuracy:  71.8500%

Epoch : 3 / 10
Train Loss: -0.9987 | Train Accuracy:  71.6875%
Test Loss: -0.9988 | Test Accuracy:  71.9000%

Epoch : 4 / 10
Train Loss: -0.9990 | Train Accuracy:  71.7625%
Test Loss: -0.9990 | Test Accuracy:  72.1500%

Epoch : 5 / 10
Train Loss: -0.9992 | Train Accuracy:  72.0875%
Test Loss: -0.9991 | Test Accuracy:  72.0250%

Epoch : 6 / 10
Train Loss: -0.9993 | Train Accuracy:  72.0667%
Test Loss: -0.9992 | Test Accuracy:  72.2250%

Epoch : 7 / 10
Train Loss: -0.9994 | Train Accuracy:  72.1500%
Test Loss: -0.9993 | Test Accuracy:  72.3250%

Epoch : 8 / 10
Train Loss: -0.9994 | Train Accuracy:  72.2042%
Test Loss: -0.9994 | Test Accuracy:  72.2250%

Epoch : 9 / 10
Train Loss: -0.9995 | Train Accuracy:  72.1292%
Test Loss: -0.9994 | Test Accuracy:  72.0000%

Epoch : 10

In [None]:
def get_acc_pred(data_loader, model):
  """ 
  Function to get predictions for a given test set and calculate accuracy.
  Input: Iterator to the test set.
  Output: Prections and Accuracy for test set.
  """
  with torch.no_grad():
    # Array to store predicted labels
    predictions = torch.Tensor()
    predictions = predictions.to(device)

    # Array to store actual labels
    y = torch.Tensor()
    y = y.to(device)
    # Iterate over batches from test set
    for input, targets in data_loader:
      
      # move inputs and outputs to GPUs
      input = input.to(device)
      targets = targets.to(device)

      # Calculated the predicted labels
      output = model(input)

      # Choose the label with maximum probability
      indices = torch.argmax(output, dim = 1)

      # Add the predicted labels to the array
      predictions = torch.cat((predictions, indices)) 

      # Add the actual labels to the array
      y = torch.cat((y, targets)) 

    # Check for complete dataset if actual and predicted labels are same or not
    # Calculate accuracy
    acc = (predictions == y).float().mean()

  # Return array containing predictions and accuracy
  return predictions, acc
  

In [None]:
# Get the prediction and accuracy for the train dataset
predictions_train, acc_train = get_acc_pred(train_loader, model)

In [None]:
# Get the prediction and accuracy for the test dataset
predictions, acc_test = get_acc_pred(test_loader, model)

In [None]:
# Print Accuracy for train dataset
print(acc_train * 100)

# Print Accuracy for test dataset
print(acc_test * 100)

tensor(72.0958)
tensor(71.8500)


Train neural network.

In [None]:
# Define custom model using nn.Module()
class LogisticRegression(nn.Module):
  def __init__(self,  output_dim, h_sizes, dprob, non_linearity, batch_norm):
        
    super().__init__()

    self.h_sizes = h_sizes
    self.non_linearity = non_linearity
    self.batch_norm = batch_norm
    self.dprob = dprob
    self.output_dim = output_dim

    # Initialize hidden layers  
    self.hidden = nn.ModuleList()
    self.dropout = nn.ModuleList()
    self.batchnorm = nn.ModuleList()

    for k in range(len(h_sizes)-1):
      self.hidden.append(nn.Linear(self.h_sizes[k], h_sizes[k+1]))
      self.dropout.append(nn.Dropout(p=dprob[k]))

      if self.batch_norm:
        self.batchnorm.append(nn.BatchNorm1d(self.h_sizes[k+1], momentum=0.9))
      
    
    self.output_layer = nn.Linear(self.h_sizes[-1], output_dim)
    self.flatten = nn.Flatten()
    
    ## it is better to use nn.functional.relu in the forward function
    # self.relu = nn.ReLU()
    # for logistic regression, will use sigmoid activation function

  def forward(self,x):
    x = self.flatten(x)

    for  k in range(len(self.h_sizes)-1):
      x =  self.non_linearity(self.hidden[k](x))
      if self.batch_norm:
        x = self.batchnorm[k](x)
      x= self.dropout[k](x)

    x = self.output_layer(x)
    # we are not using softmax function in the forward passs
    # nn.crossentropy loss (which we will use to define our loss) combines  nn.LogSoftmax() and nn.NLLLoss() in one single class
    return x  

In [None]:
# Training data epochs
def train(train_loader, model, optimizer, loss_function, log_batch, log_interval):

  """ 
  Function for training the model in each epoch
  Input: iterator for train dataset, initial weights and bias, epochs, learning rate.
  Output: final weights, bias, train loss, train accuracy
  """
  # initilalize variables as global
  # these counts will be updated every epoch
  global example_ct_train
  global batch_ct_train

  # Training Loop loop
  # Initialize train_loss at the he start of the epoch
  running_train_loss = 0
  running_train_correct = 0
  
  # put the model in training mode
  model.train()

  # Iterate on batches from the dataset using train_loader
  for input, targets in train_loader:
    
    # move inputs and outputs to GPUs
    input = input.to(device)
    targets = targets.to(device)

    # Forward pass
    output = model(input)
    loss = loss_function(output, targets)

    # Correct prediction
    y_pred = torch.argmax(output, dim = 1)
    correct = torch.sum(y_pred == targets)

    example_ct_train +=  len(targets)
    batch_ct_train += 1

    # set gradients to zero 
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # Update parameters using their gradient
    optimizer.step()
          
    # Add train loss of a batch 
    running_train_loss += loss.item()

    # Add Corect counts of a batch
    running_train_correct += correct

    # log batch loss and accuracy
    if log_batch:
      if ((batch_ct_train + 1) % log_interval) == 0:
        wandb.log({f"Train Batch Loss  :": loss})
        wandb.log({f"Train Batch Acc :": correct/len(targets)})

  
  # Calculate mean train loss for the whole dataset for a particular epoch
  train_loss = running_train_loss/len(train_loader)

  # Calculate accuracy for the whole dataset for a particular epoch
  train_acc = running_train_correct/len(train_loader.dataset)

  return train_loss, train_acc

In [None]:
# Test data epochs
def test(loader, model, optimizer, loss_function, log_batch, log_interval):

  """ 
  Function for training the model and plotting the graph for train & valid loss vs epoch.
  Input: iterator for train dataset, initial weights and bias, epochs, learning rate, batch size.
  Output: final weights, bias and train loss and valid loss for each epoch.
  """

  # initilalize variables as global
  # these counts will be updated every epoch
  global example_ct_test
  global batch_ct_test

  # Test loop
  running_test_loss = 0
  running_test_correct = 0
  
  # put the model in evaluation mode
  model.eval()

  with torch.no_grad():
    for input,targets in loader:

      # move inputs and outputs to GPUs
      input = input.to(device)
      targets = targets.to(device)

      # Forward pass
      output = model(input)
      loss = loss_function(output,targets)

      # Correct Predictions
      y_pred = torch.argmax(output, dim = 1)
      correct = torch.sum(y_pred == targets)

      # count of images and batches
      example_ct_test +=  len(targets)
      batch_ct_test += 1

      # Add test loss of a batch 
      running_test_loss += loss.item()

      # Add correct count for each batch
      running_test_correct += correct

      # log batch loss and accuracy
      if log_batch:
        if ((batch_ct_test + 1) % log_interval) == 0:
          wandb.log({f"Test Batch Loss  :": loss})
          wandb.log({f"Test Batch Accuracy :": correct/len(targets)})


    # Calculate mean test loss for the whole dataset for a particular epoch
    test_loss = running_test_loss/len(test_loader)

    # Calculate accuracy for the whole dataset for a particular epoch
    test_acc = running_test_correct/len(test_loader.dataset)
    
  return test_loss, test_acc

In [None]:
# Model Training
def train_loop(train_loader, test_loader, model, loss_function, optimizer, epochs, device,
               file_model):

  '''
  model: specify your model for training
  criterion: loss function 
  optimizer: optimizer like SGD , ADAM etc.
  train loader: function to carete batches for training data
  loader : function to create batches for valid data set
  file_model : specify file name for saving your model. This way we can upload the model weights from file. We will not to run model again.
  

  '''
  # Create lists to store train and valid loss at each epoch

  train_loss_history = []
  test_loss_history = []
  train_acc_history = []
  test_acc_history = []
  delta = 0
  best_score = None
  test_loss_min = np.Inf


  # Iterate for the given number of epochs
  for epoch in range(epochs):
    t0 = datetime.now()
    # Get train loss and accuracy for one epoch

    train_loss, train_acc = train(train_loader, model, optimizer, loss_function, 
                                  wandb.config.log_batch, wandb.config.log_interval)
    test_loss, test_acc = test(test_loader, model, optimizer, loss_function,
                                    wandb.config.log_batch, wandb.config.log_interval)

    dt = datetime.now() - t0

    # Save history of the Losses and accuracy
    train_loss_history.append(train_loss)
    train_acc_history.append(train_acc)
    test_loss_history.append(test_loss)
    test_acc_history.append(test_acc)

    score = -test_loss
    if best_score is None:
      best_score=score
      print(f'Testloss has decreased ({test_loss_min:.6f} --> {test_loss:.6f}). Saving Model...')
      torch.save(model.state_dict(), file_model)
      test_loss_min = test_loss

    elif score < best_score + delta:
      print(f'Test loss has not decreased ({test_loss_min:.6f} --> {test_loss:.6f}). Not Saving Model...')
    
    else:
      best_score = score
      print(f'Test loss has decreased ({test_loss_min:.6f} --> {test_loss:.6f}). Saving model...')
      torch.save(model.state_dict(), file_model)
      test_loss_min = test_loss

    # Log the train and test loss to W&B
    wandb.log({f"Train epoch Loss :": train_loss, f"Test epoch Loss :": test_loss })
    wandb.log({f"Train epoch Acc :": train_acc, f"Test epoch Acc :": test_acc})



    # Print the train loss and accuracy for given number of epochs, batch size and number of samples
    print(f'Epoch : {epoch+1} / {epochs}')
    print(f'Time to complete {epoch+1} is {dt}')
    print(f'Train Loss: {train_loss : .4f} | Train Accuracy: {train_acc * 100 : .4f}%')
    print(f'Test Loss: {test_loss : .4f} | Test Accuracy: {test_acc * 100 : .4f}%')
    print()

  return train_loss_history, train_acc_history, test_loss_history, test_acc_history


In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 30,
    output_dim = 2,
    h_sizes = [784] + [100],
    batch_norm = False,
    dprob = [0],
    batch_size = 500,
    learning_rate = 3e-04,
    dataset="FashionMNIST",
    architecture="LogisticRegression",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q6.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu 


In [None]:
# Initilialize wandb
wandb.init(name = 'Q6_overfit', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test epoch Acc :,▁▃▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
Test epoch Loss :,██▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
Train epoch Acc :,▁▄▆█████▇█████████████████████
Train epoch Loss :,██▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁

0,1
Test epoch Acc :,0.66
Test epoch Loss :,-1.82705
Train epoch Acc :,0.78
Train epoch Loss :,-1.84053


Run overfit model

In [None]:
# Create small dataset to overfit to check for errors
# n sample points
train_sample_sizemnist = 50

# Getting n random indices
train_subset_indicesmnist = random.sample(range(0, len(train_full)), train_sample_sizemnist)

# Getting subset of dataset
train_subsetmnist = torch.utils.data.Subset(train_full, train_subset_indicesmnist)


In [None]:
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_subsetmnist, batch_size=wandb.config.batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.NLLLoss()

# device 
model = LogisticRegression(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

# model.apply(init_weights) # Use lecun initialization when commented out

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7fc747498fd0>]

In [None]:
example_ct_train, batch_ct_train, example_ct_test, batch_ct_test = 0, 0, 0, 0
train_loss_history, train_acc_history, test_loss_history, test_acc_history = train_loop(train_loader, test_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Testloss has decreased (inf --> -0.162262). Saving Model...
Epoch : 1 / 30
Time to complete 1 is 0:00:00.996387
Train Loss:  0.0486 | Train Accuracy:  44.0000%
Test Loss: -0.1623 | Test Accuracy:  58.2750%

Test loss has decreased (-0.162262 --> -0.357822). Saving model...
Epoch : 2 / 30
Time to complete 2 is 0:00:00.912043
Train Loss: -0.1657 | Train Accuracy:  78.0000%
Test Loss: -0.3578 | Test Accuracy:  61.5750%

Test loss has decreased (-0.357822 --> -0.547364). Saving model...
Epoch : 3 / 30
Time to complete 3 is 0:00:00.906273
Train Loss: -0.3725 | Train Accuracy:  76.0000%
Test Loss: -0.5474 | Test Accuracy:  62.6000%

Test loss has decreased (-0.547364 --> -0.730985). Saving model...
Epoch : 4 / 30
Time to complete 4 is 0:00:00.941219
Train Loss: -0.5699 | Train Accuracy:  76.0000%
Test Loss: -0.7310 | Test Accuracy:  62.6750%

Test loss has decreased (-0.730985 --> -0.909168). Saving model...
Epoch : 5 / 30
Time to complete 5 is 0:00:00.921012
Train Loss: -0.7601 | Train Accu

Train on complete set.

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 100,
    output_dim = 2,
    h_sizes = [784] + [100],
    batch_norm = False,
    dprob = [0.9],
    batch_size = 500,
    learning_rate = 03e-04,
    dataset="FashionMNIST",
    architecture="LogisticRegression",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q6_2.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu 


In [None]:
# Initilialize wandb
wandb.init(name = 'Q6_all', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test epoch Acc :,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Test epoch Loss :,██████████████▇▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▁
Train epoch Acc :,▃▂█▃▄▄▄▃▃▄▂▄▂▃▄▃▄▇▅▃▂▃▃▃▃▄▂▄▆▃▄▃█▃▇▃▆▃▄▁
Train epoch Loss :,███████████████▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▁

0,1
Test epoch Acc :,0.5
Test epoch Loss :,-90573511852032.0
Train epoch Acc :,0.49562
Train epoch Loss :,-17649490810197.332


In [None]:
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_full, batch_size=wandb.config.batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.NLLLoss()

# device 
model = LogisticRegression(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

# model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7fc745cea810>]

In [None]:
example_ct_train, batch_ct_train, example_ct_test, batch_ct_test = 0, 0, 0, 0
train_loss_history, train_acc_history, test_loss_history, test_acc_history = train_loop(train_loader, test_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Testloss has decreased (inf --> -10.736485). Saving Model...
Epoch : 1 / 100
Time to complete 1 is 0:00:06.598694
Train Loss: -0.6961 | Train Accuracy:  53.1958%
Test Loss: -10.7365 | Test Accuracy:  74.8000%

Test loss has decreased (-10.736485 --> -34.426782). Saving model...
Epoch : 2 / 100
Time to complete 2 is 0:00:06.653757
Train Loss: -3.3760 | Train Accuracy:  55.7583%
Test Loss: -34.4268 | Test Accuracy:  71.9750%

Test loss has decreased (-34.426782 --> -77.349114). Saving model...
Epoch : 3 / 100
Time to complete 3 is 0:00:06.509384
Train Loss: -8.7554 | Train Accuracy:  55.7417%
Test Loss: -77.3491 | Test Accuracy:  62.8500%

Test loss has decreased (-77.349114 --> -142.540483). Saving model...
Epoch : 4 / 100
Time to complete 4 is 0:00:06.496724
Train Loss: -17.5828 | Train Accuracy:  55.0167%
Test Loss: -142.5405 | Test Accuracy:  57.3250%

Test loss has decreased (-142.540483 --> -230.822269). Saving model...
Epoch : 5 / 100
Time to complete 5 is 0:00:06.599226
Train Los

Another hidden layer with 50 neurons.

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 100,
    output_dim = 2,
    h_sizes = [784] + [100] + [50],
    batch_norm = False,
    dprob = [0.5]*2,
    batch_size = 500,
    learning_rate = 0.05,
    dataset="FashionMNIST",
    architecture="LogisticRegression",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q6_2.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu 


In [None]:
# Initilialize wandb
wandb.init(name = 'Q6_3', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test epoch Acc :,▅▂▅▅▆▆▄▃▂▃▁▄▂▅▅▄▅▆▃▄█▄▂▄▅▅▄▃▂▃▆▆█▄▃▆▅▅▂▃
Test epoch Loss :,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁
Train epoch Acc :,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train epoch Loss :,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁

0,1
Test epoch Acc :,0.48
Test epoch Loss :,-245.58443
Train epoch Acc :,0.5
Train epoch Loss :,-238.56694


In [None]:
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_full, batch_size=wandb.config.batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.NLLLoss()

# device 
model = LogisticRegression(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

# model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7fc745c4aed0>]

In [None]:
example_ct_train, batch_ct_train, example_ct_test, batch_ct_test = 0, 0, 0, 0
train_loss_history, train_acc_history, test_loss_history, test_acc_history = train_loop(train_loader, test_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Testloss has decreased (inf --> -23070893.250000). Saving Model...
Epoch : 1 / 100
Time to complete 1 is 0:00:06.680132
Train Loss: -854997.7921 | Train Accuracy:  49.8417%
Test Loss: -23070893.2500 | Test Accuracy:  50.0000%

Test loss has decreased (-23070893.250000 --> -394668844.000000). Saving model...
Epoch : 2 / 100
Time to complete 2 is 0:00:06.588501
Train Loss: -29058965.3646 | Train Accuracy:  50.6042%
Test Loss: -394668844.0000 | Test Accuracy:  50.0250%

Test loss has decreased (-394668844.000000 --> -1944520048.000000). Saving model...
Epoch : 3 / 100
Time to complete 3 is 0:00:06.543363
Train Loss: -199538805.5000 | Train Accuracy:  49.8083%
Test Loss: -1944520048.0000 | Test Accuracy:  50.0000%

Test loss has decreased (-1944520048.000000 --> -5758168512.000000). Saving model...
Epoch : 4 / 100
Time to complete 4 is 0:00:06.564232
Train Loss: -703132695.3333 | Train Accuracy:  50.0000%
Test Loss: -5758168512.0000 | Test Accuracy:  50.0000%

Test loss has decreased (-575

Adding batch normalization without dropout.

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 100,
    output_dim = 2,
    h_sizes = [784] + [100] + [50],
    batch_norm = True,
    dprob = [1]*2,
    batch_size = 500,
    learning_rate = .05,
    dataset="FashionMNIST",
    architecture="LogisticRegression",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q6_4.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.relu
   )

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu 


In [None]:
# Initilialize wandb
wandb.init(name = 'Q6_4', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test epoch Acc :,▁▃▃▃▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆██████████████
Test epoch Loss :,██████████████▇▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▁
Train epoch Acc :,▄▄▁▃▇▇▆▂▄▂▇▃▆▅▃▆▆█▅▃▄▄▇▅▄▃▆▆▄▅▅▄▄▃▅▅▄▃▅▄
Train epoch Loss :,███████████████▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▂▂▂▁

0,1
Test epoch Acc :,0.501
Test epoch Loss :,-31887724642304.0
Train epoch Acc :,0.50058
Train epoch Loss :,-837705981952.0


In [None]:
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_full, batch_size=wandb.config.batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.NLLLoss()

# device 
model = LogisticRegression(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

# model.apply(init_weights) 

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7fc745c0a110>]

In [None]:
example_ct_train, batch_ct_train, example_ct_test, batch_ct_test = 0, 0, 0, 0
train_loss_history, train_acc_history, test_loss_history, test_acc_history = train_loop(train_loader, test_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Testloss has decreased (inf --> -7.833074). Saving Model...
Epoch : 1 / 100
Time to complete 1 is 0:00:06.798885
Train Loss: -1.1867 | Train Accuracy:  50.0000%
Test Loss: -7.8331 | Test Accuracy:  48.3750%

Test loss has decreased (-7.833074 --> -9.886872). Saving model...
Epoch : 2 / 100
Time to complete 2 is 0:00:06.799739
Train Loss: -3.5845 | Train Accuracy:  50.0000%
Test Loss: -9.8869 | Test Accuracy:  48.5250%

Test loss has decreased (-9.886872 --> -10.428002). Saving model...
Epoch : 3 / 100
Time to complete 3 is 0:00:06.778381
Train Loss: -5.9824 | Train Accuracy:  50.0000%
Test Loss: -10.4280 | Test Accuracy:  47.8000%

Test loss has decreased (-10.428002 --> -17.144353). Saving model...
Epoch : 4 / 100
Time to complete 4 is 0:00:06.666521
Train Loss: -8.3805 | Train Accuracy:  50.0000%
Test Loss: -17.1444 | Test Accuracy:  48.0750%

Test loss has decreased (-17.144353 --> -18.832236). Saving model...
Epoch : 5 / 100
Time to complete 5 is 0:00:06.780414
Train Loss: -10.7785

The model converged faster with batch normalization. Adding batch normalization did result in a slightly better model.

Next, we replace batch normalization with SELU for self-normalization by ensuring that the data is standard normalized, the model keeps lecun's intialization and replaces the Relu function with the Selu function.

In [None]:
check_loader = torch.utils.data.DataLoader(train_full, batch_size = 32, shuffle = True)

In [None]:
# Check if standard normal mean = 0, std = 1; it is normalized already from the beginning

In [None]:
# check imputs and outputs 
for input, target in check_loader:
  print(f'shape of inputs is :{input.shape}')
  print(f'\nmax input value  :{input.max()}')
  print(f'\nmin input value  :{input.min()}')
  print(f'\nmean input value  :{input.mean()}')
  print(f'\nstd input value  :{input.std()}')
  print(f'\nshape of targets is :{target.shape}')
   
  break

shape of inputs is :torch.Size([32, 1, 28, 28])

max input value  :2.022662878036499

min input value  :-0.8101983666419983

mean input value  :-0.1585819125175476

std input value  :0.9130802750587463

shape of targets is :torch.Size([32])


In [None]:
# Define custom model using nn.Module(), make changes for Alpha dropout
class LogisticRegression(nn.Module):
  def __init__(self,  output_dim, h_sizes, dprob, non_linearity, batch_norm):
        
    super().__init__()

    self.h_sizes = h_sizes
    self.non_linearity = non_linearity
    self.batch_norm = batch_norm
    self.dprob = dprob
    self.output_dim = output_dim

    # Initialize hidden layers  
    self.hidden = nn.ModuleList()
    self.dropout = nn.ModuleList()
    self.batchnorm = nn.ModuleList()

    for k in range(len(h_sizes)-1):
      self.hidden.append(nn.Linear(self.h_sizes[k], h_sizes[k+1]))
      self.dropout.append(nn.AlphaDropout(p=dprob[k]))

      if self.batch_norm:
        self.batchnorm.append(nn.BatchNorm1d(self.h_sizes[k+1], momentum=0.9))
      
    
    self.output_layer = nn.Linear(self.h_sizes[-1], output_dim)
    self.flatten = nn.Flatten()
    
    ## it is better to use nn.functional.relu in the forward function
    # self.relu = nn.ReLU()
    # for logistic regression, will use sigmoid activation function

  def forward(self,x):
    x = self.flatten(x)

    for  k in range(len(self.h_sizes)-1):
      x =  self.non_linearity(self.hidden[k](x))
      if self.batch_norm:
        x = self.batchnorm[k](x)
      x= self.dropout[k](x)

    x = self.output_layer(x)
    # we are not using softmax function in the forward passs
    # nn.crossentropy loss (which we will use to define our loss) combines  nn.LogSoftmax() and nn.NLLLoss() in one single class
    return x  

In [None]:
# Metadata for model
hyperparameters = dict(
    epochs = 100,
    output_dim = 2,
    h_sizes = [784] + [100] + [50],
    batch_norm = False,
    dprob = [0.9]*2,
    batch_size = 500,
    learning_rate = 0.05,
    dataset="FashionMNIST",
    architecture="LogisticRegression",
    log_interval = 1,
    log_batch = False,
    file_model = lecture_folder/'Q6_5.pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    non_linearity=F.selu
   )

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.selu 


In [None]:
# Initilialize wandb
wandb.init(name = 'Q6_5', project = 'DL_Course_HW3', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Test epoch Acc :,█▃▁█▁
Test epoch Loss :,██▇▅▁
Train epoch Acc :,▃▅▅▁█
Train epoch Loss :,██▇▅▁

0,1
Test epoch Acc :,0.50025
Test epoch Loss :,-703.56653
Train epoch Acc :,0.50892
Train epoch Loss :,-13.43722


In [None]:
# Dataloader, loss function, model, optimizer, and weight initializer for overfitting
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(train_full, batch_size=wandb.config.batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size, shuffle = False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   shuffle = False)

# cross entropy loss function
loss_function = nn.NLLLoss()

# device 
model = LogisticRegression(wandb.config.output_dim, wandb.config.h_sizes, wandb.config.dprob, 
                              non_linearity, wandb.config.batch_norm)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

# model.apply(init_weights) # No initialization of weights (lecun's default) for selu activation

# put model to GPUs
model.to(wandb.config.device)

# Intialize stochiastic gradient descent optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = wandb.config.learning_rate)

In [None]:
wandb.watch(model, log = 'all', log_freq=1, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7fc7451fc550>]

In [None]:
example_ct_train, batch_ct_train, example_ct_test, batch_ct_test = 0, 0, 0, 0
train_loss_history, train_acc_history, test_loss_history, test_acc_history = train_loop(train_loader, test_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.file_model)

Testloss has decreased (inf --> -10608212.375000). Saving Model...
Epoch : 1 / 100
Time to complete 1 is 0:00:06.818862
Train Loss: -51657.2898 | Train Accuracy:  50.0458%
Test Loss: -10608212.3750 | Test Accuracy:  50.0000%

Test loss has decreased (-10608212.375000 --> -184031860.000000). Saving model...
Epoch : 2 / 100
Time to complete 2 is 0:00:06.796337
Train Loss: -1817409.9805 | Train Accuracy:  50.1167%
Test Loss: -184031860.0000 | Test Accuracy:  50.0000%

Test loss has decreased (-184031860.000000 --> -887887432.000000). Saving model...
Epoch : 3 / 100
Time to complete 3 is 0:00:07.037471
Train Loss: -12337819.3646 | Train Accuracy:  50.1333%
Test Loss: -887887432.0000 | Test Accuracy:  50.0250%

Test loss has decreased (-887887432.000000 --> -2560872064.000000). Saving model...
Epoch : 4 / 100
Time to complete 4 is 0:00:06.762171
Train Loss: -42619663.9167 | Train Accuracy:  49.7000%
Test Loss: -2560872064.0000 | Test Accuracy:  50.0250%

Test loss has decreased (-2560872064

Self-normalizing did result in a slightly better but similar result with batch normalization.