In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd "/content/drive/MyDrive/deep_learning_hw/foundations-dl-hw1/"


/content/drive/MyDrive/deep_learning_hw/foundations-dl-hw1


In [3]:
import torch
import itertools
import pandas as pd

from src.train import train, MultipleTrainResults
from src.cifar10_dataset import trainloader, testloader
from src.models import BaselineNN

from src.utils import init_func__zero_mean_gaussian, get_sgd_optimizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

>> Initializing data with parameters: BATCH_SIZE=64, DATA_FRACTION=0.1
Files already downloaded and verified
Files already downloaded and verified


device(type='cuda')

In [4]:
def get_optimal_baseline_training_params(hidden_layer_dim: int = 256, 
                                         hidden_layers_count: int = 1,
                                         flattened_img_dim: int = 3072):
  model = BaselineNN(hidden_layer_dim = hidden_layer_dim, 
                     hidden_layers_count = hidden_layers_count,
                     flattened_img_dim = flattened_img_dim)
  optimizer = get_sgd_optimizer(model, lr=0.001, momentum=0.9)
  baseline_parameters = dict(
    model=model,
    optimizer=optimizer,
    init_func=init_func__zero_mean_gaussian(std=0.1),
    trainloader=trainloader,
    testloader=testloader,
    num_epochs=100,
  )

  return baseline_parameters

## [0] Run training sanity check

In [None]:
model = BaselineNN()
results = train(
    model=model,
    init_func=init_func__zero_mean_gaussian(std=1),
    optimizer=get_sgd_optimizer(model, lr=0.001, momentum=0.9),
    trainloader=trainloader,
    testloader=testloader,
    num_epochs=3,
    device=device
)

best_epoch, best_acc = results.test_accuracies.argmax(), results.test_accuracies.max()
print(f"Best test-set accuracy: {best_acc} in epoch {best_epoch}")

results.report()

## [1] Grid search for baseline HParams

In [7]:
# Full grid-search that was saved in CSV
"""
lrs = [0.001, 0.01, 0.1, 0.2, 0.3]
momentums = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
stds = [0.1, 0.5, 1.0]

grid_search_results = []  # params --> best-test-accuracy

for lr, momentum, std in itertools.product(lrs, momentums, stds):
    print(f">> Training with: lr={lr}, momentum={momentum}, std={std} --------------------------")
    model = BaselineNN()
    results = train(
      model=model,
      init_func=init_func__zero_mean_gaussian(std=std),
      optimizer=get_sgd_optimizer(model, lr=lr, momentum=momentum),
      trainloader=trainloader,
      testloader=testloader,
      num_epochs=50,
    )    
    results.get_accuracies_curve().show()

    grid_search_results.append(dict(lr=lr, momentum=momentum, std=std, 
                                    best_test_acc=results.test_accuracies.max(), 
                                    best_train_acc=results.train_accuracies.max()))
# Save results to CSV
pd.DataFrame(grid_search_results).to_csv('./report/q2-baseline-grid-search.csv')
"""

# Grid search results:
pd.read_csv('./report/q2-baseline-grid-search.csv')

Unnamed: 0.1,Unnamed: 0,lr,momentum,std,best_test_acc,best_train_acc
0,0,0.001,0.0,0.1,0.178125,0.145767
1,1,0.001,0.0,0.5,0.156836,0.139834
2,2,0.001,0.0,1.0,0.159180,0.141218
3,3,0.001,0.2,0.1,0.211523,0.183742
4,4,0.001,0.2,0.5,0.185547,0.179984
...,...,...,...,...,...,...
85,85,0.300,0.8,0.5,0.112891,0.113133
86,86,0.300,0.8,1.0,0.112891,0.110166
87,87,0.300,0.9,0.1,0.112891,0.110562
88,88,0.300,0.9,0.5,0.112891,0.109771


#### Optimal baseline hyper parameters found:

In [None]:
def get_optimal_baseline_training_params(hidden_layer_dim: int = 256, 
                                         hidden_layers_count: int = 1,
                                         flattened_img_dim: int = 3072):
  model = BaselineNN(hidden_layer_dim = hidden_layer_dim, 
                     hidden_layers_count = hidden_layers_count,
                     flattened_img_dim = flattened_img_dim)
  optimizer = get_sgd_optimizer(model, lr=0.001, momentum=0.9)
  baseline_parameters = dict(
    model=model,
    optimizer=optimizer,
    init_func=init_func__zero_mean_gaussian(std=0.1),
    trainloader=trainloader,
    testloader=testloader,
    num_epochs=100,
  )

  return baseline_parameters

results = train(**get_optimal_baseline_training_params())

results.report()

## [2] Optimization Impact

In [None]:
from src.utils import get_adam_optimizer

train_params = get_optimal_baseline_training_params()

# Override the optimizer of the model
for lr in [train_params['optimizer'].defaults['lr'], 0.005]: #[0.0001, 0.001, 0.01, 0.05]
  adam_opt = get_adam_optimizer(model=train_params['model'], lr=lr)
  train_params.update({'optimizer': adam_opt})

  print(train_params)
  results = train(**train_params)

  results.report()

{'model': BaselineNN(
  (model): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=3072, out_features=256, bias=True)
    (2): Dropout(p=0, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=10, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
), 'optimizer': Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
), 'init_func': <function init_func__zero_mean_gaussian.<locals>.func at 0x7efe4ebb1ca0>, 'trainloader': <torch.utils.data.dataloader.DataLoader object at 0x7efd7fd0b2b0>, 'testloader': <torch.utils.data.dataloader.DataLoader object at 0x7efd77a3f8b0>, 'num_epochs': 100}
>> Runs training of BaselineNN(
  (model): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=3072, out_features=256, bias=True)
    (2): Dropout(p=0, inplace=Fal

Train - Epoch 0: 100%|██████████| 79/79 [00:02<00:00, 33.61batch/s, accuracy=0, loss=4.76]
Evaluate - Epoch 0: 100%|██████████| 16/16 [00:00<00:00, 49.96batch/s, accuracy=0.2, loss=2.61]
Train - Epoch 1: 100%|██████████| 79/79 [00:01<00:00, 46.50batch/s, accuracy=0, loss=3.11]
Evaluate - Epoch 1: 100%|██████████| 16/16 [00:00<00:00, 49.18batch/s, accuracy=0.225, loss=4.15]
Train - Epoch 2: 100%|██████████| 79/79 [00:01<00:00, 47.29batch/s, accuracy=0.125, loss=3.65]
Evaluate - Epoch 2: 100%|██████████| 16/16 [00:00<00:00, 49.49batch/s, accuracy=0.15, loss=6.29]
Train - Epoch 3: 100%|██████████| 79/79 [00:01<00:00, 48.39batch/s, accuracy=0.25, loss=2.1]
Evaluate - Epoch 3: 100%|██████████| 16/16 [00:00<00:00, 51.38batch/s, accuracy=0.15, loss=4.35]
Train - Epoch 4: 100%|██████████| 79/79 [00:01<00:00, 47.93batch/s, accuracy=0.25, loss=2.43]
Evaluate - Epoch 4: 100%|██████████| 16/16 [00:00<00:00, 50.81batch/s, accuracy=0.125, loss=5.97]
Train - Epoch 5: 100%|██████████| 79/79 [00:02<00:

>> Accuracies Curves:


>> Losses Curves:


>> Optimization ended with:
   >> TRAIN-SET: best-accuracy=57.02136075949367, accuracy=57.02136075949367%, loss=1.2609955608090269
   >> TEST-SET: best-accuracy=36.34765625 accuracy=35.48828125%, loss=2.0410516262054443
{'model': BaselineNN(
  (model): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=3072, out_features=256, bias=True)
    (2): Dropout(p=0, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=10, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
), 'optimizer': Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.005
    maximize: False
    weight_decay: 0
), 'init_func': <function init_func__zero_mean_gaussian.<locals>.func at 0x7efe4ebb1ca0>, 'trainloader': <torch.utils.data.dataloader.DataLoader object at 0x7efd7fd0b2b0>, 'testloader': <torch.utils.data.dataloader.DataLoader object at 0x7efd77a

Train - Epoch 0: 100%|██████████| 79/79 [00:01<00:00, 44.79batch/s, accuracy=0.125, loss=2.37]
Evaluate - Epoch 0: 100%|██████████| 16/16 [00:00<00:00, 49.71batch/s, accuracy=0.15, loss=2.19]
Train - Epoch 1: 100%|██████████| 79/79 [00:01<00:00, 45.37batch/s, accuracy=0.125, loss=2.09]
Evaluate - Epoch 1: 100%|██████████| 16/16 [00:00<00:00, 49.63batch/s, accuracy=0.275, loss=2.15]
Train - Epoch 2: 100%|██████████| 79/79 [00:01<00:00, 46.62batch/s, accuracy=0.25, loss=1.88]
Evaluate - Epoch 2: 100%|██████████| 16/16 [00:00<00:00, 49.40batch/s, accuracy=0.175, loss=2.14]
Train - Epoch 3: 100%|██████████| 79/79 [00:01<00:00, 45.07batch/s, accuracy=0.125, loss=2.01]
Evaluate - Epoch 3: 100%|██████████| 16/16 [00:00<00:00, 48.87batch/s, accuracy=0.225, loss=2.4]
Train - Epoch 4: 100%|██████████| 79/79 [00:02<00:00, 34.85batch/s, accuracy=0.625, loss=1.8]
Evaluate - Epoch 4: 100%|██████████| 16/16 [00:00<00:00, 30.81batch/s, accuracy=0.225, loss=2.27]
Train - Epoch 5: 100%|██████████| 79/79

>> Accuracies Curves:


>> Losses Curves:


>> Optimization ended with:
   >> TRAIN-SET: best-accuracy=65.46677215189874, accuracy=61.90664556962025%, loss=1.0398219914375981
   >> TEST-SET: best-accuracy=40.3515625 accuracy=36.23046875%, loss=2.5415172651410103


## [3] Initialization Impact

In [None]:
from src.utils import init_func__xavier

train_params = get_optimal_baseline_training_params()
train_params.update({'init_func': init_func__xavier()})
print(train_params)

results = train(**train_params)
results.report()

{'model': BaselineNN(
  (model): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=3072, out_features=256, bias=True)
    (2): Dropout(p=0, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=10, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
), 'optimizer': SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.001
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0
), 'init_func': <function init_func__xavier.<locals>.func at 0x7f07e01ceaf0>, 'trainloader': <torch.utils.data.dataloader.DataLoader object at 0x7f0805fc3070>, 'testloader': <torch.utils.data.dataloader.DataLoader object at 0x7f0804d161c0>, 'num_epochs': 100}
>> Runs training of BaselineNN(
  (model): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=3072, out_features=256, bias=True)
    (2): Dropout(p=0, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=256, o

Train - Epoch 0: 100%|██████████| 79/79 [00:01<00:00, 44.50batch/s, accuracy=0.25, loss=1.96]
Evaluate - Epoch 0: 100%|██████████| 16/16 [00:00<00:00, 44.78batch/s, accuracy=0.275, loss=2.01]
Train - Epoch 1: 100%|██████████| 79/79 [00:02<00:00, 31.24batch/s, accuracy=0.125, loss=2.3]
Evaluate - Epoch 1: 100%|██████████| 16/16 [00:00<00:00, 27.70batch/s, accuracy=0.35, loss=1.97]
Train - Epoch 2: 100%|██████████| 79/79 [00:02<00:00, 34.03batch/s, accuracy=0.375, loss=1.96]
Evaluate - Epoch 2: 100%|██████████| 16/16 [00:00<00:00, 43.44batch/s, accuracy=0.325, loss=1.86]
Train - Epoch 3: 100%|██████████| 79/79 [00:01<00:00, 45.73batch/s, accuracy=0.375, loss=1.57]
Evaluate - Epoch 3: 100%|██████████| 16/16 [00:00<00:00, 43.07batch/s, accuracy=0.375, loss=1.85]
Train - Epoch 4: 100%|██████████| 79/79 [00:01<00:00, 45.66batch/s, accuracy=0.625, loss=1.34]
Evaluate - Epoch 4: 100%|██████████| 16/16 [00:00<00:00, 44.24batch/s, accuracy=0.3, loss=1.91]
Train - Epoch 5: 100%|██████████| 79/79 

>> Accuracies Curves:


>> Losses Curves:


>> Optimization ended with:
   >> TRAIN-SET: best-accuracy=72.80458860759494, accuracy=72.38924050632912%, loss=0.8939230321328852
   >> TEST-SET: best-accuracy=42.63671875 accuracy=41.50390625%, loss=1.7150256857275963


## [4] Regularization

### 4.1 Weight Decay

In [None]:
result_data = {}
train_params = get_optimal_baseline_training_params()

for decay in [0.001, 0.01, 0.1]:
  new_optimizer = get_sgd_optimizer(train_params['model'], lr=0.001, momentum=0.9, weight_decay=decay)
  train_params.update({'optimizer': new_optimizer})
  print(train_params)
  result_data[decay] = train(**train_params)

In [None]:
result_comparison = MultipleTrainResults(result_data)
result_comparison.get_accuracies_curve().show()
result_comparison.get_losses_curve().show()

### 4.2 Dropout

In [None]:
result_data = {}
train_params = get_optimal_baseline_training_params()

for dropout in [0.1, 0.2, 0.3, 0.4, 0.5]:
  model = BaselineNN(p_dropout = dropout)
  new_optimizer = get_sgd_optimizer(model, lr=0.001, momentum=0.9)
  train_params.update({'model': model, 'optimizer': new_optimizer})
  print(train_params)
  result_data[dropout] = train(**train_params)

In [None]:
result_comparison = MultipleTrainResults(result_data)
result_comparison.get_accuracies_curve().show()
result_comparison.get_losses_curve().show()

## [5] Preprocessing

In [5]:
from src.cifar10_dataset import trainloader_PCA, testloader_PCA

train_params = get_optimal_baseline_training_params(flattened_img_dim = 500)
train_params.update({'trainloader': trainloader_PCA, 'testloader': testloader_PCA})
print(train_params)

results = train(**train_params)
results.report()

model parameters:  <generator object Module.parameters at 0x7f66e0b88cf0>
{'model': BaselineNN(
  (model): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=500, out_features=256, bias=True)
    (2): Dropout(p=0, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=10, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
), 'optimizer': SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.001
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0
), 'init_func': <function init_func__zero_mean_gaussian.<locals>.func at 0x7f67b13e3b80>, 'trainloader': <torch.utils.data.dataloader.DataLoader object at 0x7f66e0b93c10>, 'testloader': <torch.utils.data.dataloader.DataLoader object at 0x7f66e0b93bb0>, 'num_epochs': 100}
>> Runs training of BaselineNN(
  (model): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=500, out_features=256, bias=True)
   

Train - Epoch 0: 100%|██████████| 79/79 [00:00<00:00, 118.01batch/s, accuracy=0.25, loss=2.27]
Evaluate - Epoch 0: 100%|██████████| 16/16 [00:00<00:00, 98.11batch/s, accuracy=0.225, loss=2.26]
Train - Epoch 1: 100%|██████████| 79/79 [00:00<00:00, 138.97batch/s, accuracy=0.375, loss=2.14]
Evaluate - Epoch 1: 100%|██████████| 16/16 [00:00<00:00, 97.32batch/s, accuracy=0.25, loss=2.26]
Train - Epoch 2: 100%|██████████| 79/79 [00:00<00:00, 145.60batch/s, accuracy=0.5, loss=2.02]
Evaluate - Epoch 2: 100%|██████████| 16/16 [00:00<00:00, 93.92batch/s, accuracy=0.275, loss=2.25]
Train - Epoch 3: 100%|██████████| 79/79 [00:00<00:00, 148.60batch/s, accuracy=0.625, loss=1.9]
Evaluate - Epoch 3: 100%|██████████| 16/16 [00:00<00:00, 98.75batch/s, accuracy=0.25, loss=2.24]
Train - Epoch 4: 100%|██████████| 79/79 [00:00<00:00, 134.89batch/s, accuracy=0.75, loss=1.79]
Evaluate - Epoch 4: 100%|██████████| 16/16 [00:00<00:00, 104.50batch/s, accuracy=0.25, loss=2.23]
Train - Epoch 5: 100%|██████████| 79/

>> Accuracies Curves:


>> Losses Curves:


>> Optimization ended with:
   >> TRAIN-SET: best-accuracy=91.07990506329115, accuracy=91.07990506329115%, loss=0.49854334245754195
   >> TEST-SET: best-accuracy=28.90625 accuracy=28.90625%, loss=2.929327055811882


## [6] Network Width

In [None]:
train_params = get_optimal_baseline_training_params()

result_data = {}

for dim in [6, 10, 12]:
  train_params = get_optimal_baseline_training_params(hidden_layer_dim = 2**dim)
  print(train_params)
  result_data[dim] = train(**train_params)

In [None]:
import pickle
with open("2_6_res.pkl", "wb") as f:
  pickle.dump(result_data, f)

In [None]:
result_comparison = MultipleTrainResults(result_data)
result_comparison.get_accuracies_curve().show()
result_comparison.get_losses_curve().show()

## [7] Network Depth

In [None]:
from src.utils import weights_init_normal, init_func__xavier

result_data = {}

for dim in [3,4,10]:
  train_params = get_optimal_baseline_training_params(hidden_layer_dim = 64, hidden_layers_count = dim)
  train_params.update({'init_func' : weights_init_normal(0.1)})
  print(train_params)
  result_data[dim] = train(**train_params)

In [None]:
import pickle
with open("2_7_res.pkl", "wb") as f:
  pickle.dump(result_data, f)

In [None]:
result_data = {}

import pickle
with open("2_7_res.pkl", "rb") as f:
  result_data = pickle.load(f)

result_comparison = MultipleTrainResults(result_data)
result_comparison.get_accuracies_curve().show()
result_comparison.get_losses_curve().show()