In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
from pathlib import Path
import os
from torch import nn
import torch.nn.functional as F
import random
import pandas as pd

# Q1 Load CIFAR10 dataset (1.5 point)

- Load CIFAR 10 dataset from PyTorch datasets
- Create train/valid/test datasets and dataloaders
- Apply appropraite Transformations
- Create a smaller subset of 50 train images

In [59]:
data_folder = Path('U:\OneDrive - The University of Texas at Dallas\\6382\Datasets')

trans1 = transforms.ToTensor()

# Transform to normalize the data
# The mean and std are based on train subset which we will create below
trans2 = transforms.Normalize((0.49, 0.482, 0.447), (0.247, 0.244, 0.262))
trans = transforms.Compose([trans1, trans2])

# Download the training_validation data (we will create two subsets - trainset and valset frpm this)
train_val_set = torchvision.datasets.CIFAR10(root = data_folder,  train = True, transform = trans, download = True)

# Download the testing data
testset = torchvision.datasets.CIFAR10(root = data_folder, train = False, transform = trans, download = True)

Files already downloaded and verified
Files already downloaded and verified


In [60]:
train_val_set.data.shape

(50000, 32, 32, 3)

In [4]:
def split_dataset(base_dataset, fraction, seed):
    split_a_size = int(fraction * len(base_dataset))
    split_b_size = len(base_dataset) - split_a_size
    return torch.utils.data.random_split(base_dataset, [split_a_size, split_b_size], generator=torch.Generator().manual_seed(seed)
    )

In [62]:
trainset, validset = split_dataset(train_val_set, 0.8, 42)

In [63]:
len(trainset), len(validset)

(40000, 10000)

# Q2 Overfit a three layer network (1.5 point)
Try a three-layer network ( 2 hidden layers and one output layer) with 100 units in each hidden layer. Tweak the learning rate and weight initialization to overfit the smaller subset and achieve 100% training accuracy within 20 epochs. No regularization method should be used in this step. Use SGD as optimizer. You will use 50 images from train dataset and  complete validation dataset for this question.  You will use ReLU activation as non linearity in your model. Use  batch_size of 25. 

In [8]:
class CustomNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, h_sizes, non_linearity, dropout):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.h_sizes = h_sizes
        self.non_linearity = non_linearity
        if dropout:
            self.dropout = dropout

        model_layers = [nn.Flatten()]

        for i, hidden_size in enumerate(self.h_sizes):
            model_layers.append(nn.Linear(input_dim, hidden_size))
            model_layers.append(self.non_linearity)
            if dropout:
                model_layers.append(nn.Dropout(p = self.dropout[i]))
            input_dim = hidden_size
        model_layers.append(nn.Linear(self.h_sizes[-1], self.output_dim))

        self.module_list = nn.ModuleList(model_layers)

    def forward(self, x):
        for layer in self.module_list:
            x = layer(x)
        return x            

In [5]:
def train(train_loader, loss_function, model, optimizer, log_batch, log_interval):
    
    global batch_ct_train
    
    running_train_loss = 0
    running_train_correct = 0

    model.train()

    for input_, targets in train_loader:
        input_ = input_.to(device)
        targets = targets.to(device)

        output = model.forward(input_)
        loss = loss_function(output, targets)

        ypred = torch.argmax(output, dim = 1)
        correct= torch.sum(ypred == targets)

        batch_ct_train += 1

        optimizer.zero_grad()
        loss.backward()

        optimizer.step()
        
        running_train_loss += loss.item()
        running_train_correct += correct

    train_loss = running_train_loss/len(train_loader)
    train_acc = running_train_correct/len(train_loader.dataset)

    return train_loss, train_acc

In [6]:
def validate(valid_loader, loss_function, model, log_batch, log_interval):
    """
    Function for validation of passed dataset
    """
    global batch_ct_valid

    running_val_loss = 0
    running_val_correct = 0

    model.eval()

    with torch.no_grad():
        for input_, targets in valid_loader:

            input_ = input_.to(device)
            targets = targets.to(device)

            output = model.forward(input_)
            loss = loss_function(output, targets)

            ypred = torch.argmax(output, dim = 1)
            correct = torch.sum(ypred == targets)

            batch_ct_valid += 1

            running_val_loss += loss.item()

            running_val_correct += correct

    val_loss = running_val_loss/len(valid_loader)

    val_acc = running_val_correct/len(valid_loader.dataset)

    return val_loss, val_acc    

In [7]:
def train_loop(train_loader, valid_loader, model, optimizer, loss_function, epochs, device, log_batch, log_interval):
    
    train_loss_history = []
    valid_loss_history = []
    train_acc_history = []
    valid_acc_history = []

    for epoch in range(epochs):
        train_loss, train_acc = train(train_loader, loss_function, model, optimizer, log_batch, log_interval)
        valid_loss, valid_acc = validate(valid_loader, loss_function, model, log_batch, log_interval)

        train_loss_history.append(train_loss)
        train_acc_history.append(train_acc)

        valid_loss_history.append(valid_loss)
        valid_acc_history.append(valid_acc)

        if ((epoch+1)%log_interval == 0):
            print(f'Epoch: {epoch+1}/{epochs}')
            print(f'Train Loss: {train_loss : .4f} | Train Accuracy: {train_acc * 100 : .4f}%')
            print(f'Valid Loss: {valid_loss : .4f} | Valid Accuracy: {valid_acc * 100 : .4f}%')
        torch.cuda.empty_cache()

    return train_loss_history, train_acc_history, valid_loss_history, valid_acc_history

In [12]:
print(trainset.dataset.data[trainset.indices].shape)

(40000, 32, 32, 3)


In [40]:
sample_size = int(len(trainset)/800)
sample_indices = random.sample(range(0, len(trainset)), sample_size)
small_sample = torch.utils.data.Subset(trainset,sample_indices)
small_train_sample = torch.utils.data.DataLoader(small_sample, batch_size= 25, shuffle = True)

valid_loader = torch.utils.data.DataLoader(validset, batch_size=25, shuffle = False)
test_loader = torch.utils.data.DataLoader(testset, batch_size=25,   shuffle = False)

len(small_train_sample)

2

In [8]:
def init_weights(layer):
  if type(layer) == nn.Linear:
    torch.nn.init.normal_(layer.weight, mean = 0, std = 1/784**0.5)
    torch.nn.init.zeros_(layer.bias)

### <font color = "red">Hyperparameters

In [15]:
SEED = 2345
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0')

epochs = 20
input_dim = 32*32*3
output_dim = 10
h_sizes = [100]*2
learning_rate = 0.1
log_interval = 2
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.ReLU()
dropout = []

model = CustomNetwork(input_dim, output_dim, h_sizes, non_linearity, dropout)

model.to(device)
model.apply(init_weights)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

In [18]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(small_train_sample, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 1/20
Train Loss:  2.2800 | Train Accuracy:  12.0000%
Valid Loss:  2.2913 | Valid Accuracy:  12.4600%
Epoch: 3/20
Train Loss:  2.0257 | Train Accuracy:  44.0000%
Valid Loss:  2.3020 | Valid Accuracy:  15.3000%
Epoch: 5/20
Train Loss:  1.7457 | Train Accuracy:  46.0000%
Valid Loss:  2.3547 | Valid Accuracy:  16.1100%
Epoch: 7/20
Train Loss:  1.4601 | Train Accuracy:  64.0000%
Valid Loss:  2.3876 | Valid Accuracy:  19.3000%
Epoch: 9/20
Train Loss:  1.1265 | Train Accuracy:  74.0000%
Valid Loss:  2.5439 | Valid Accuracy:  19.3500%
Epoch: 11/20
Train Loss:  0.8523 | Train Accuracy:  78.0000%
Valid Loss:  2.5771 | Valid Accuracy:  19.9900%
Epoch: 13/20
Train Loss:  0.6798 | Train Accuracy:  88.0000%
Valid Loss:  2.7813 | Valid Accuracy:  17.6200%
Epoch: 15/20
Train Loss:  0.4436 | Train Accuracy:  96.0000%
Valid Loss:  2.7892 | Valid Accuracy:  19.5200%
Epoch: 17/20
Train Loss:  0.3259 | Train Accuracy:  98.0000%
Valid Loss:  2.8863 | Valid Accuracy:  20.4400%
Epoch: 19/20
Train Loss:

# Q3 Overfit a five layer network (1.5 Point)
Craete a five-layer network ( 4 hidden layers and one output layer) with 100 units on each layer to overfit smaller subset. Here also you will have to adjust the learning rate and weight initialization  to achieve 100% training accuracy within 20 epochs. Use SGD as optimizer. You will use 50 images from train dataset and  all the images from the validation dataset for this question.  You will use ReLU activation as non linearity in your model. Use batch_size of 25.

### <font color = "red">Hyperparameters

In [49]:
def init_weight1(layer):
  if type(layer) == nn.Linear:
    torch.nn.init.kaiming_normal_(layer.weight)
    torch.nn.init.zeros_(layer.bias)

In [50]:
c

device = torch.device('cuda:0')

epochs = 20
input_dim = 32*32*3
output_dim = 10
h_sizes = [100]*4
learning_rate = 0.1
log_interval = 1
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.ReLU()
dropout = []

model = CustomNetwork(input_dim, output_dim, h_sizes, non_linearity, dropout)

model.to(device)
model.apply(init_weight1)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

In [51]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(small_train_sample, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 1/20
Train Loss:  3.0881 | Train Accuracy:  10.0000%
Valid Loss:  6.6599 | Valid Accuracy:  10.3000%
Epoch: 2/20
Train Loss:  5.0653 | Train Accuracy:  22.0000%
Valid Loss:  5.5363 | Valid Accuracy:  10.6600%
Epoch: 3/20
Train Loss:  4.1316 | Train Accuracy:  10.0000%
Valid Loss:  3.1171 | Valid Accuracy:  12.3600%
Epoch: 4/20
Train Loss:  2.0711 | Train Accuracy:  36.0000%
Valid Loss:  3.0004 | Valid Accuracy:  13.1200%
Epoch: 5/20
Train Loss:  1.4174 | Train Accuracy:  58.0000%
Valid Loss:  3.3269 | Valid Accuracy:  14.4300%
Epoch: 6/20
Train Loss:  1.2546 | Train Accuracy:  68.0000%
Valid Loss:  2.9818 | Valid Accuracy:  15.6000%
Epoch: 7/20
Train Loss:  0.8433 | Train Accuracy:  80.0000%
Valid Loss:  2.7549 | Valid Accuracy:  15.7200%
Epoch: 8/20
Train Loss:  0.4755 | Train Accuracy:  92.0000%
Valid Loss:  3.0308 | Valid Accuracy:  17.7200%
Epoch: 9/20
Train Loss:  0.2000 | Train Accuracy:  98.0000%
Valid Loss:  3.3236 | Valid Accuracy:  16.1900%
Epoch: 10/20
Train Loss:  0.

# Q4 : Optimizers (3 Points)

Train a six-layer network ( 5 hidden layers and one output layer) with (1) SGD, and (2) SGD+momentum. You will use 4000 images from train dataset and all the images from the validation dataset for this question. You will use ReLU activation as non linearity in your model.  Use  batch_size of 100 and learning rate of around 5e-02. Which one converges faster (SGD or SGD + momentum)? 


In [65]:
sample_size2 = int(len(trainset)/10)

sample_indices2 = random.sample(range(0, len(trainset)), sample_size2)

small_sample2 = torch.utils.data.Subset(trainset,sample_indices2)

train_sample2 = torch.utils.data.DataLoader(small_sample2, batch_size= 100, shuffle = True)
len(train_sample2)

40

### <font color = "red">Hyperparameters (without momentum)

In [66]:
SEED = 2345
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0')

epochs = 100
input_dim = 32*32*3
output_dim = 10
h_sizes = [100]*5
learning_rate = 0.05
log_interval = 10
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.ReLU()
dropout = []

model = CustomNetwork(input_dim, output_dim, h_sizes, non_linearity, dropout)

model.to(device)
model.apply(init_weights)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

In [67]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_sample2, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 10/100
Train Loss:  2.3009 | Train Accuracy:  10.6500%
Valid Loss:  2.3014 | Valid Accuracy:  9.7300%
Epoch: 20/100
Train Loss:  2.1910 | Train Accuracy:  16.5250%
Valid Loss:  2.1666 | Valid Accuracy:  17.4100%
Epoch: 30/100
Train Loss:  1.8780 | Train Accuracy:  26.7750%
Valid Loss:  1.9412 | Valid Accuracy:  24.8200%
Epoch: 40/100
Train Loss:  1.6220 | Train Accuracy:  35.8750%
Valid Loss:  2.0050 | Valid Accuracy:  26.8400%
Epoch: 50/100
Train Loss:  1.4061 | Train Accuracy:  44.5500%
Valid Loss:  2.3122 | Valid Accuracy:  28.0100%
Epoch: 60/100
Train Loss:  1.1730 | Train Accuracy:  55.2500%
Valid Loss:  2.1430 | Valid Accuracy:  32.8000%
Epoch: 70/100
Train Loss:  0.9273 | Train Accuracy:  65.6000%
Valid Loss:  2.5217 | Valid Accuracy:  34.3500%
Epoch: 80/100
Train Loss:  0.6478 | Train Accuracy:  76.4500%
Valid Loss:  2.8530 | Valid Accuracy:  36.2200%
Epoch: 90/100
Train Loss:  0.4356 | Train Accuracy:  85.1500%
Valid Loss:  3.5625 | Valid Accuracy:  34.6900%
Epoch: 100/

### <font color = "red">Hyperparameters (with momentum)

In [68]:
SEED = 2345
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0')

epochs = 50
input_dim = 32*32*3
output_dim = 10
h_sizes = [100]*5
learning_rate = 0.05
log_interval = 5
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.ReLU()
dropout = []

model = CustomNetwork(input_dim, output_dim, h_sizes, non_linearity, dropout)

model.to(device)
model.apply(init_weights)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate, momentum=0.9)

In [69]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_sample2, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 5/50
Train Loss:  2.0192 | Train Accuracy:  20.8500%
Valid Loss:  1.9719 | Valid Accuracy:  25.0500%
Epoch: 10/50
Train Loss:  1.6076 | Train Accuracy:  38.5000%
Valid Loss:  1.9304 | Valid Accuracy:  31.2300%
Epoch: 15/50
Train Loss:  1.2758 | Train Accuracy:  51.5750%
Valid Loss:  2.0242 | Valid Accuracy:  36.4700%
Epoch: 20/50
Train Loss:  0.9981 | Train Accuracy:  62.9250%
Valid Loss:  2.1326 | Valid Accuracy:  36.9000%
Epoch: 25/50
Train Loss:  0.7835 | Train Accuracy:  71.5250%
Valid Loss:  2.3320 | Valid Accuracy:  37.6200%
Epoch: 30/50
Train Loss:  0.6587 | Train Accuracy:  76.8750%
Valid Loss:  2.3244 | Valid Accuracy:  38.8400%
Epoch: 35/50
Train Loss:  0.5496 | Train Accuracy:  80.6000%
Valid Loss:  3.1094 | Valid Accuracy:  36.8800%
Epoch: 40/50
Train Loss:  0.5175 | Train Accuracy:  83.2000%
Valid Loss:  2.9977 | Valid Accuracy:  37.0700%
Epoch: 45/50
Train Loss:  0.4006 | Train Accuracy:  86.7000%
Valid Loss:  3.7445 | Valid Accuracy:  38.5200%
Epoch: 50/50
Train L

### <font color = "teal"> SGD with momentum converges much faster than without.

# Q5 : Regularization (3 Points)

In this question, you will add dropout layer. Add dropout afte every ReLU non-linearity.

You will now train following two-layer networks:

1. Hidden size 256, dropout = 0
2. Hidden size 512, dropout = 0
3. Hidden size 512, dropout = 0.5

You will use 20,000 images from train dataset and all the images from the validation dataset for this question. You will use ReLU activation as non linearity in your model. Use learning rate of around 5e-03. In this experiment, you will use Adam optimizer. Further train model for 100 epochs and use batch size of 512.
Which model gave better accuracy on validation datset - smaller model with no regularization or a bigger model with regularization? 

In [70]:
sample_size3 = int(len(trainset)/2)
sample_indices3 = random.sample(range(0, len(trainset)), sample_size3)
small_sample3 = torch.utils.data.Subset(trainset,sample_indices3)
small_train_sample3 = torch.utils.data.DataLoader(small_sample3, batch_size= 512, shuffle = True)

In [71]:
SEED = 2345
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0')

epochs = 100
input_dim = 32*32*3
output_dim = 10
h_sizes = [256]
learning_rate = 0.005
log_interval = 5
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.ReLU()
dropout = [0]

model = CustomNetwork(input_dim, output_dim, h_sizes, non_linearity, dropout)

model.to(device)
model.apply(init_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [72]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(small_train_sample3, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 5/100
Train Loss:  1.3899 | Train Accuracy:  52.1200%
Valid Loss:  1.6423 | Valid Accuracy:  45.3300%
Epoch: 10/100
Train Loss:  1.2638 | Train Accuracy:  58.0000%
Valid Loss:  1.9865 | Valid Accuracy:  42.7400%
Epoch: 15/100
Train Loss:  1.1076 | Train Accuracy:  63.8400%
Valid Loss:  2.0836 | Valid Accuracy:  44.9800%
Epoch: 20/100
Train Loss:  0.9230 | Train Accuracy:  70.1000%
Valid Loss:  2.3347 | Valid Accuracy:  43.0700%
Epoch: 25/100
Train Loss:  0.7534 | Train Accuracy:  76.0350%
Valid Loss:  3.2439 | Valid Accuracy:  41.0100%
Epoch: 30/100
Train Loss:  0.5858 | Train Accuracy:  81.2900%
Valid Loss:  2.8411 | Valid Accuracy:  45.2100%
Epoch: 35/100
Train Loss:  0.8358 | Train Accuracy:  78.4500%
Valid Loss:  4.1962 | Valid Accuracy:  39.9400%
Epoch: 40/100
Train Loss:  0.4722 | Train Accuracy:  85.8500%
Valid Loss:  3.5401 | Valid Accuracy:  46.1600%
Epoch: 45/100
Train Loss:  0.3682 | Train Accuracy:  89.0750%
Valid Loss:  4.1237 | Valid Accuracy:  45.3900%
Epoch: 50/1

In [73]:
SEED = 2345
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0')

epochs = 100
input_dim = 32*32*3
output_dim = 10
h_sizes = [512]
learning_rate = 0.005
log_interval = 5
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.ReLU()
dropout = [0]

model = CustomNetwork(input_dim, output_dim, h_sizes, non_linearity, dropout)

model.to(device)
model.apply(init_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [74]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(small_train_sample3, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 5/100
Train Loss:  1.3941 | Train Accuracy:  51.9400%
Valid Loss:  1.9041 | Valid Accuracy:  42.7300%
Epoch: 10/100
Train Loss:  1.3781 | Train Accuracy:  56.3650%
Valid Loss:  1.9422 | Valid Accuracy:  43.8900%
Epoch: 15/100
Train Loss:  1.1325 | Train Accuracy:  64.6000%
Valid Loss:  2.2450 | Valid Accuracy:  44.3700%
Epoch: 20/100
Train Loss:  0.9055 | Train Accuracy:  72.4000%
Valid Loss:  2.9507 | Valid Accuracy:  40.7800%
Epoch: 25/100
Train Loss:  0.6935 | Train Accuracy:  78.7200%
Valid Loss:  3.1072 | Valid Accuracy:  45.1600%
Epoch: 30/100
Train Loss:  0.8293 | Train Accuracy:  77.6550%
Valid Loss:  3.5602 | Valid Accuracy:  43.0300%
Epoch: 35/100
Train Loss:  1.0027 | Train Accuracy:  76.5750%
Valid Loss:  4.1970 | Valid Accuracy:  42.1500%
Epoch: 40/100
Train Loss:  0.4068 | Train Accuracy:  88.7100%
Valid Loss:  4.2521 | Valid Accuracy:  45.3400%
Epoch: 45/100
Train Loss:  1.1431 | Train Accuracy:  77.7600%
Valid Loss:  5.9092 | Valid Accuracy:  42.7700%
Epoch: 50/1

In [75]:
SEED = 2345
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0')

epochs = 100
input_dim = 32*32*3
output_dim = 10
h_sizes = [512]
learning_rate = 0.005
log_interval = 5
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.ReLU()
dropout = [0.5]

model = CustomNetwork(input_dim, output_dim, h_sizes, non_linearity, dropout)

model.to(device)
model.apply(init_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [76]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(small_train_sample3, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 5/100
Train Loss:  1.9227 | Train Accuracy:  38.1500%
Valid Loss:  1.9529 | Valid Accuracy:  38.0200%
Epoch: 10/100
Train Loss:  2.0525 | Train Accuracy:  38.5100%
Valid Loss:  1.9712 | Valid Accuracy:  40.3800%
Epoch: 15/100
Train Loss:  2.0771 | Train Accuracy:  40.4950%
Valid Loss:  2.2452 | Valid Accuracy:  40.1400%
Epoch: 20/100
Train Loss:  2.0477 | Train Accuracy:  42.0800%
Valid Loss:  2.0378 | Valid Accuracy:  40.9300%
Epoch: 25/100
Train Loss:  2.1668 | Train Accuracy:  42.3500%
Valid Loss:  2.0380 | Valid Accuracy:  41.6300%
Epoch: 30/100
Train Loss:  1.9293 | Train Accuracy:  44.7200%
Valid Loss:  2.1342 | Valid Accuracy:  41.2400%
Epoch: 35/100
Train Loss:  2.3188 | Train Accuracy:  42.2950%
Valid Loss:  1.9796 | Valid Accuracy:  42.5400%
Epoch: 40/100
Train Loss:  1.8121 | Train Accuracy:  47.9600%
Valid Loss:  2.1899 | Valid Accuracy:  40.5700%
Epoch: 45/100
Train Loss:  1.8985 | Train Accuracy:  47.1950%
Valid Loss:  2.1920 | Valid Accuracy:  43.5800%
Epoch: 50/1

# Q6 Batch Norm and SELU (4.5 Points)
Generate training and test datasets for a binary classiﬁcation problem using Fashion-MNIST with class 1 being a combination of sneaker and pullover and class 0 being the combination of sandal and shirt categories. 
- Train the model using Logistic regression (No Hidden Layers). Report train and test loss.
- Train a Neural Network with one hidden layer (100 neurons). Use Adam optimizer and Relu activation for hidden layer.  First overfit a small sample to check errors and get idea of learning rate. Then train on complete dataset. Add regularization (dropout or weight decay)if needed.
- Now add another hidden layer (50 Neurons). Adjust the learning rate if you have to. Add regularization (dropout or weight decay) if needed.
- Now try adding Batch Normalization and compare the train and test loss : Is it converging faster than before? Does it produce a better model? How does it affect training speed? **Do not use dropout with batch normalization.**
- Try replacing Batch Normalization with SELU, and make the necessary adjustments to ensure the network self-normalizes (i.e., standardize the input features, use LeCun normal initialization, make sure the DNN contains only a sequence of dense layers). Compare the results with Batch Normalization. **For SELU if you are using dropout then use alpha dropout.** Alpha dropout make sure that network is self normalized.


In [128]:
data_folder = Path('U:\OneDrive - The University of Texas at Dallas\\6382\Datasets')

trans1 = transforms.ToTensor()

# Transform to normalize the data
# The mean and std are based on train subset which we will create below
trans2 = transforms.Normalize(0, 0.5)
trans = transforms.Compose([trans1, trans2])

train_val_set = torchvision.datasets.FashionMNIST(root = data_folder, train = True, transform = trans, download=True)

testset = torchvision.datasets.FashionMNIST(root = data_folder, train = False, transform = trans, download=True)

In [129]:
idx = (train_val_set.targets==7) | (train_val_set.targets==2) | (train_val_set.targets==5) | (train_val_set.targets==6)
train_val_set.targets = train_val_set.targets[idx]
train_val_set.data = train_val_set.data[idx]

In [130]:
trainset, validset = split_dataset(train_val_set, 0.8, 45)

In [131]:
len(trainset), len(validset)

(19200, 4800)

In [132]:
[trainset.dataset.classes[i] for i in [int(i) for i in trainset.dataset.targets.unique()]]

['Pullover', 'Sandal', 'Shirt', 'Sneaker']

In [133]:
trainset.dataset.data.shape

torch.Size([24000, 28, 28])

In [134]:
train_loader = torch.utils.data.DataLoader(trainset, batch_size = 100, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=100, shuffle = False)
test_loader = torch.utils.data.DataLoader(testset, batch_size=100,   shuffle = False)

In [135]:
input_dim = 28*28
output_dim = 2
model = nn.Sequential(nn.Linear(input_dim, output_dim), nn.Softmax())

In [136]:
loss = nn.CrossEntropyLoss()
epochs = 20
optim = torch.optim.SGD(model.parameters(), lr = 0.1)

#### <font color = "blue"> Could not figure out logistic regression for subset of the dataset

In [9]:
data_folder = Path('U:\OneDrive - The University of Texas at Dallas\\6382\Datasets')

trans1 = transforms.ToTensor()

# Transform to normalize the data
# The mean and std are based on train subset which we will create below
trans2 = transforms.Normalize(mean=0, std=0.5)
trans = transforms.Compose([trans1, trans2])

train_val_set = torchvision.datasets.FashionMNIST(root = data_folder, train = True, transform = trans, download=True)

testset = torchvision.datasets.FashionMNIST(root = data_folder, train = False, transform = trans, download=True)

In [10]:
trainset, validset = split_dataset(train_val_set, 0.8, 45)

In [11]:
len(trainset), len(validset)

(48000, 12000)

In [17]:
class CustomNetwork2(nn.Module):
    def __init__(self, input_dim, output_dim, h_sizes, non_linearity, dropout, batch_norm):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.h_sizes = h_sizes
        self.non_linearity = non_linearity
        self.batch_norm = batch_norm
        if dropout:
            self.dropout = dropout
        

        model_layers = [nn.Flatten()]

        for i, hidden_size in enumerate(self.h_sizes):
            model_layers.append(nn.Linear(input_dim, hidden_size))
            model_layers.append(self.non_linearity)
            if dropout:
                model_layers.append(nn.Dropout(p = self.dropout[i]))
            if self.batch_norm:
                model_layers.append(nn.BatchNorm1d(hidden_size, momentum=0.9))
            input_dim = hidden_size
        model_layers.append(nn.Linear(self.h_sizes[-1], self.output_dim))

        self.module_list = nn.ModuleList(model_layers)

    def forward(self, x):
        for layer in self.module_list:
            x = layer(x)
        return x            
        

In [13]:
sample_size4 = int(len(trainset)/120)
sample_indices4 = random.sample(range(0, len(trainset)), sample_size4)
small_sample4 = torch.utils.data.Subset(trainset,sample_indices4)
small_train_sample4 = torch.utils.data.DataLoader(small_sample4, batch_size= 100, shuffle = True)

train_loader = torch.utils.data.DataLoader(trainset, batch_size = 100, shuffle = True)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=100, shuffle = False)
test_loader = torch.utils.data.DataLoader(testset, batch_size=100,   shuffle = False)

len(small_train_sample4)

4

In [18]:
SEED = 2345
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0')

epochs = 20
input_dim = 784
output_dim = 10
h_sizes = [100]
learning_rate = 0.01
log_interval = 1
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.ReLU()
dropout = []
batch_norm = False

model = CustomNetwork2(input_dim, output_dim, h_sizes, non_linearity, dropout, batch_norm)

model.to(device)
model.apply(init_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [101]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(small_train_sample4, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 1/20
Train Loss:  2.1682 | Train Accuracy:  30.0000%
Valid Loss:  1.8548 | Valid Accuracy:  50.5750%
Epoch: 2/20
Train Loss:  1.5023 | Train Accuracy:  54.0000%
Valid Loss:  1.1134 | Valid Accuracy:  53.5833%
Epoch: 3/20
Train Loss:  0.9468 | Train Accuracy:  66.2500%
Valid Loss:  1.0002 | Valid Accuracy:  63.3750%
Epoch: 4/20
Train Loss:  0.7971 | Train Accuracy:  69.7500%
Valid Loss:  0.8336 | Valid Accuracy:  68.1333%
Epoch: 5/20
Train Loss:  0.6451 | Train Accuracy:  75.2500%
Valid Loss:  0.7966 | Valid Accuracy:  70.4000%
Epoch: 6/20
Train Loss:  0.5971 | Train Accuracy:  78.2500%
Valid Loss:  0.7757 | Valid Accuracy:  71.7167%
Epoch: 7/20
Train Loss:  0.5165 | Train Accuracy:  81.0000%
Valid Loss:  0.7381 | Valid Accuracy:  72.5167%
Epoch: 8/20
Train Loss:  0.4518 | Train Accuracy:  82.7500%
Valid Loss:  0.7048 | Valid Accuracy:  74.1083%
Epoch: 9/20
Train Loss:  0.3935 | Train Accuracy:  83.7500%
Valid Loss:  0.6703 | Valid Accuracy:  75.0667%
Epoch: 10/20
Train Loss:  0.

In [144]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 1/20
Train Loss:  0.5375 | Train Accuracy:  80.6250%
Valid Loss:  0.4241 | Valid Accuracy:  84.2917%
Epoch: 2/20
Train Loss:  0.4216 | Train Accuracy:  84.8438%
Valid Loss:  0.4310 | Valid Accuracy:  84.7000%
Epoch: 3/20
Train Loss:  0.3913 | Train Accuracy:  85.8646%
Valid Loss:  0.3955 | Valid Accuracy:  86.1917%
Epoch: 4/20
Train Loss:  0.3726 | Train Accuracy:  86.4938%
Valid Loss:  0.3826 | Valid Accuracy:  85.6667%
Epoch: 5/20
Train Loss:  0.3631 | Train Accuracy:  86.6667%
Valid Loss:  0.3677 | Valid Accuracy:  86.4917%
Epoch: 6/20
Train Loss:  0.3521 | Train Accuracy:  87.0229%
Valid Loss:  0.4078 | Valid Accuracy:  84.9333%
Epoch: 7/20
Train Loss:  0.3478 | Train Accuracy:  87.2979%
Valid Loss:  0.3744 | Valid Accuracy:  86.3667%
Epoch: 8/20
Train Loss:  0.3398 | Train Accuracy:  87.5125%
Valid Loss:  0.3933 | Valid Accuracy:  86.3833%
Epoch: 9/20
Train Loss:  0.3375 | Train Accuracy:  87.6208%
Valid Loss:  0.4028 | Valid Accuracy:  86.5167%
Epoch: 10/20
Train Loss:  0.

In [148]:
SEED = 2345
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0')

epochs = 30
input_dim = 784
output_dim = 10
h_sizes = [100, 50]
learning_rate = 0.01
log_interval = 5
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.ReLU()
dropout = []
batch_norm = False

model = CustomNetwork2(input_dim, output_dim, h_sizes, non_linearity, dropout, batch_norm)

model.to(device)
model.apply(init_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [149]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 5/30
Train Loss:  0.3594 | Train Accuracy:  86.8458%
Valid Loss:  0.4109 | Valid Accuracy:  85.8083%
Epoch: 10/30
Train Loss:  0.3307 | Train Accuracy:  88.1521%
Valid Loss:  0.3843 | Valid Accuracy:  86.5583%
Epoch: 15/30
Train Loss:  0.3100 | Train Accuracy:  88.8229%
Valid Loss:  0.3830 | Valid Accuracy:  86.9667%
Epoch: 20/30
Train Loss:  0.2921 | Train Accuracy:  89.4646%
Valid Loss:  0.3771 | Valid Accuracy:  87.7167%
Epoch: 25/30
Train Loss:  0.2793 | Train Accuracy:  89.9042%
Valid Loss:  0.3999 | Valid Accuracy:  87.2583%
Epoch: 30/30
Train Loss:  0.2779 | Train Accuracy:  90.1875%
Valid Loss:  0.4013 | Valid Accuracy:  86.7000%


In [19]:
SEED = 2345
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0')

epochs = 30
input_dim = 784
output_dim = 10
h_sizes = [100, 50]
learning_rate = 0.01
log_interval = 5
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.ReLU()
dropout = []
batch_norm = True

model = CustomNetwork2(input_dim, output_dim, h_sizes, non_linearity, dropout, batch_norm)

model.to(device)
model.apply(init_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [20]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 5/30
Train Loss:  0.3259 | Train Accuracy:  87.9563%
Valid Loss:  0.9716 | Valid Accuracy:  86.7750%
Epoch: 10/30
Train Loss:  0.2793 | Train Accuracy:  89.6292%
Valid Loss:  0.5166 | Valid Accuracy:  87.1250%
Epoch: 15/30
Train Loss:  0.2499 | Train Accuracy:  90.7146%
Valid Loss:  0.3759 | Valid Accuracy:  87.3667%
Epoch: 20/30
Train Loss:  0.2229 | Train Accuracy:  91.5896%
Valid Loss:  0.4586 | Valid Accuracy:  88.5167%
Epoch: 25/30
Train Loss:  0.2066 | Train Accuracy:  92.2312%
Valid Loss:  0.6357 | Valid Accuracy:  88.9083%
Epoch: 30/30
Train Loss:  0.1885 | Train Accuracy:  92.8521%
Valid Loss:  0.6084 | Valid Accuracy:  88.0167%


<font color = "teal"> Batch normalization converges faster

In [21]:
SEED = 2345
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0')

epochs = 30
input_dim = 784
output_dim = 10
h_sizes = [100, 50]
learning_rate = 0.01
log_interval = 5
log_batch = True
loss_function = nn.CrossEntropyLoss()
non_linearity = nn.SELU()
dropout = []
batch_norm = False

def init_weight2(layer):
    if type(layer) == nn.Linear:
        torch.nn.init.xavier_uniform_(layer.weight)
        torch.nn.init.zeros_(layer.bias)

model = CustomNetwork2(input_dim, output_dim, h_sizes, non_linearity, dropout, batch_norm)

model.to(device)
model.apply(init_weight2)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [22]:
batch_ct_train, batch_ct_valid = 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(train_loader, valid_loader,
                                                                                                                                                        model, optimizer, 
                                                                                                                                                        loss_function, epochs,
                                                                                                                                                        device, log_batch, log_interval)

Epoch: 5/30
Train Loss:  0.3774 | Train Accuracy:  86.2354%
Valid Loss:  0.3860 | Valid Accuracy:  85.6917%
Epoch: 10/30
Train Loss:  0.3472 | Train Accuracy:  87.6063%
Valid Loss:  0.3653 | Valid Accuracy:  86.9750%
Epoch: 15/30
Train Loss:  0.3388 | Train Accuracy:  88.1500%
Valid Loss:  0.4589 | Valid Accuracy:  86.5333%
Epoch: 20/30
Train Loss:  0.3120 | Train Accuracy:  89.0583%
Valid Loss:  0.3951 | Valid Accuracy:  87.5417%
Epoch: 25/30
Train Loss:  0.2971 | Train Accuracy:  89.7625%
Valid Loss:  0.4035 | Valid Accuracy:  87.6750%
Epoch: 30/30
Train Loss:  0.2937 | Train Accuracy:  89.9042%
Valid Loss:  0.4673 | Valid Accuracy:  85.7333%
