In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd "/content/drive/MyDrive/deep_learning_hw/foundations-dl-hw1/"

/content/drive/MyDrive/deep_learning_hw/foundations-dl-hw1


In [3]:
import torch
import itertools
import pandas as pd

from src.train import train, MultipleTrainResults
from src.cifar10_dataset import trainloader, testloader
from src.models import CNN, BaselineNN

from src.utils import init_func__zero_mean_gaussian, get_sgd_optimizer, weights_init_normal

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

>> Initializing data with parameters: BATCH_SIZE=64, DATA_FRACTION=0.1
Files already downloaded and verified
Files already downloaded and verified


device(type='cuda')

In [4]:
def get_optimal_baseline_training_params(filter_size_1: int = 64, 
                                         filter_size_2: int = 16,
                                         hidden_layers_count: int = 2,
                                         flattened_img_dim: int = 3072):
  model = CNN(filter_size_1 = filter_size_1, 
              filter_size_2 = filter_size_2,
              hidden_layers_count = hidden_layers_count, 
              flattened_img_dim = flattened_img_dim)
  optimizer = get_sgd_optimizer(model, lr=0.001, momentum=0.9)
  baseline_parameters = dict(
    model=model,
    optimizer=optimizer,
    init_func=weights_init_normal(std=0.1),
    trainloader=trainloader,
    testloader=testloader,
    num_epochs=100,
    device=device
  )

  return baseline_parameters

## [0] Run training sanity check

In [None]:
model = CNN()
results = train(
    model=model,
    trainloader=trainloader,
    testloader=testloader,
    optimizer=get_sgd_optimizer(model, lr=0.001, momentum=0.9),
    init_func=weights_init_normal(std=0.1),
    num_epochs=100,
    device=device
)

results.report()

## [1] Grid search for baseline HParams

In [None]:
# Full grid-search that was saved in CSV
"""
lrs = [0.001, 0.01, 0.1, 0.2, 0.3]
momentums = [0.9, 0.7, 0.5]
stds = [0.01, 0.05, 0.1, 0.5, 1.0]

grid_search_results = []  # params --> best-test-accuracy

for momentum, lr, std in itertools.product(momentums, lrs, stds):
    print(f">> Training with: lr={lr}, momentum={momentum}, std={std} --------------------------")
    model = CNN()
    results = train(
      model=model,
      init_func=init_func__zero_mean_gaussian(std=std),
      optimizer=get_sgd_optimizer(model, lr=lr, momentum=momentum),
      trainloader=trainloader,
      testloader=testloader,
      num_epochs=100,
      device=device
    )    
    results.get_accuracies_curve().show()

    grid_search_results.append(dict(lr=lr, momentum=momentum, std=std, 
                                    best_test_acc=results.test_accuracies.max(), 
                                    best_train_acc=results.train_accuracies.max()))
# Save results to CSV
pd.DataFrame(grid_search_results).to_csv('./report/q3-cnn-grid-search.csv')
"""

# Grid search results:
pd.read_csv('./report/q3-cnn-grid-search.csv')

Unnamed: 0.1,Unnamed: 0,lr,momentum,std,best_test_acc,best_train_acc
0,0,0.001,0.9,0.01,0.236111,0.194561
1,1,0.001,0.9,0.05,0.114583,0.116737
2,2,0.001,0.9,0.10,0.111979,0.116887
3,3,0.001,0.9,0.50,0.114583,0.118389
4,4,0.001,0.9,1.00,0.118056,0.123347
...,...,...,...,...,...,...
70,70,0.300,0.5,0.01,0.118056,0.116136
71,71,0.300,0.5,0.05,0.118056,0.117488
72,72,0.300,0.5,0.10,0.118056,0.117488
73,73,0.300,0.5,0.50,0.118056,0.116136


#### Optimal baseline hyper parameters found:

In [None]:
def get_optimal_baseline_training_params(hidden_layer_dim: int = 784, 
                                         hidden_layers_count: int = 1,
                                         flattened_img_dim: int = 3072):
  model = CNN(hidden_layer_dim = hidden_layer_dim, 
              hidden_layers_count = hidden_layers_count,
              flattened_img_dim = flattened_img_dim)
  optimizer = get_sgd_optimizer(model, lr=0.001, momentum=0.9)
  baseline_parameters = dict(
    model=model,
    optimizer=optimizer,
    init_func=weights_init_normal(std=0.1),
    trainloader=trainloader,
    testloader=testloader,
    num_epochs=100,
  )

  return baseline_parameters

results = train(**get_optimal_baseline_training_params())

results.report()

model parameters:  <generator object Module.parameters at 0x7f3627c10f90>
>> Runs training of CNN(
  (model): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): Dropout(p=0, inplace=False)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 2), stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 16, kernel_size=(3, 3), stride=(1, 1))
    (5): Dropout(p=0, inplace=False)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=(2, 2), stride=2, padding=1, dilation=1, ceil_mode=False)
    (8): Flatten(start_dim=1, end_dim=-1)
    (9): Linear(in_features=1024, out_features=784, bias=True)
    (10): Dropout(p=0, inplace=False)
    (11): ReLU()
    (12): Linear(in_features=784, out_features=10, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
) on device=cpu for 100 epochs.


Train - Epoch 0: 100%|██████████| 79/79 [00:08<00:00,  9.11batch/s, accuracy=0, loss=2.44]
Evaluate - Epoch 0: 100%|██████████| 16/16 [00:00<00:00, 19.53batch/s, accuracy=0.15, loss=2.32]
Train - Epoch 1: 100%|██████████| 79/79 [00:05<00:00, 13.33batch/s, accuracy=0.125, loss=2.27]
Evaluate - Epoch 1: 100%|██████████| 16/16 [00:01<00:00, 13.70batch/s, accuracy=0.075, loss=2.31]
Train - Epoch 2: 100%|██████████| 79/79 [00:06<00:00, 11.38batch/s, accuracy=0.25, loss=2.19]
Evaluate - Epoch 2: 100%|██████████| 16/16 [00:00<00:00, 19.53batch/s, accuracy=0.15, loss=2.24]
Train - Epoch 3: 100%|██████████| 79/79 [00:06<00:00, 12.40batch/s, accuracy=0.125, loss=2.34]
Evaluate - Epoch 3: 100%|██████████| 16/16 [00:01<00:00, 13.27batch/s, accuracy=0.2, loss=2.15]
Train - Epoch 4: 100%|██████████| 79/79 [00:06<00:00, 12.05batch/s, accuracy=0, loss=2.19]
Evaluate - Epoch 4: 100%|██████████| 16/16 [00:00<00:00, 19.03batch/s, accuracy=0.25, loss=2.13]
Train - Epoch 5: 100%|██████████| 79/79 [00:06<00

>> Accuracies Curves:


>> Losses Curves:


>> Optimization ended with:
   >> TRAIN-SET: best-accuracy=85.77927215189874, accuracy=84.55300632911393%, loss=0.4856751006615313
   >> TEST-SET: best-accuracy=47.03125 accuracy=45.9765625%, loss=2.06581549346447


## [2] Optimization Impact

In [None]:
from src.utils import get_adam_optimizer

train_params = get_optimal_baseline_training_params()
adam_opt = get_adam_optimizer(model=train_params['model'], lr=0.001)
train_params.update({'optimizer': adam_opt})

print(train_params)
results = train(**train_params)

results.report()

model parameters:  <generator object Module.parameters at 0x7f3609c96740>
{'model': CNN(
  (model): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): Dropout(p=0, inplace=False)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 2), stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 16, kernel_size=(3, 3), stride=(1, 1))
    (5): Dropout(p=0, inplace=False)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=(2, 2), stride=2, padding=1, dilation=1, ceil_mode=False)
    (8): Flatten(start_dim=1, end_dim=-1)
    (9): Linear(in_features=1024, out_features=784, bias=True)
    (10): Dropout(p=0, inplace=False)
    (11): ReLU()
    (12): Linear(in_features=784, out_features=10, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
), 'optimizer': Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_dec

Train - Epoch 0: 100%|██████████| 79/79 [00:07<00:00, 10.17batch/s, accuracy=0.125, loss=2.5]
Evaluate - Epoch 0: 100%|██████████| 16/16 [00:00<00:00, 18.78batch/s, accuracy=0.35, loss=1.89]
Train - Epoch 1: 100%|██████████| 79/79 [00:06<00:00, 12.19batch/s, accuracy=0.375, loss=1.92]
Evaluate - Epoch 1: 100%|██████████| 16/16 [00:01<00:00, 13.21batch/s, accuracy=0.4, loss=1.65]
Train - Epoch 2: 100%|██████████| 79/79 [00:06<00:00, 11.46batch/s, accuracy=0.875, loss=0.863]
Evaluate - Epoch 2: 100%|██████████| 16/16 [00:00<00:00, 19.18batch/s, accuracy=0.275, loss=1.8]
Train - Epoch 3: 100%|██████████| 79/79 [00:07<00:00, 10.97batch/s, accuracy=0.5, loss=1.42]
Evaluate - Epoch 3: 100%|██████████| 16/16 [00:01<00:00, 12.50batch/s, accuracy=0.45, loss=1.64]
Train - Epoch 4: 100%|██████████| 79/79 [00:06<00:00, 12.41batch/s, accuracy=0.5, loss=0.929]
Evaluate - Epoch 4: 100%|██████████| 16/16 [00:00<00:00, 19.00batch/s, accuracy=0.425, loss=1.67]
Train - Epoch 5: 100%|██████████| 79/79 [00

>> Accuracies Curves:


>> Losses Curves:


>> Optimization ended with:
   >> TRAIN-SET: best-accuracy=100.0, accuracy=100.0%, loss=1.0110494926427432e-05
   >> TEST-SET: best-accuracy=51.03515625 accuracy=50.9375%, loss=4.964302659034729


## [3] Initialization Impact

In [None]:
from src.utils import init_func__xavier

train_params = get_optimal_baseline_training_params()
train_params.update({'init_func': init_func__xavier()})
print(train_params)

results = train(**train_params)
results.report()

model parameters:  <generator object Module.parameters at 0x7f09d97a5580>
{'model': CNN(
  (model): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): Dropout(p=0, inplace=False)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 2), stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 16, kernel_size=(3, 3), stride=(1, 1))
    (5): Dropout(p=0, inplace=False)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=(2, 2), stride=2, padding=1, dilation=1, ceil_mode=False)
    (8): Flatten(start_dim=1, end_dim=-1)
    (9): Linear(in_features=1024, out_features=784, bias=True)
    (10): Dropout(p=0, inplace=False)
    (11): ReLU()
    (12): Linear(in_features=784, out_features=10, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
), 'optimizer': SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.001
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0
), 'init_func': <function init_func_

Train - Epoch 0: 100%|██████████| 79/79 [00:07<00:00, 10.69batch/s, accuracy=0.125, loss=2.27]
Evaluate - Epoch 0: 100%|██████████| 16/16 [00:01<00:00, 11.33batch/s, accuracy=0.125, loss=2.29]
Train - Epoch 1: 100%|██████████| 79/79 [00:06<00:00, 12.30batch/s, accuracy=0.125, loss=2.28]
Evaluate - Epoch 1: 100%|██████████| 16/16 [00:00<00:00, 17.09batch/s, accuracy=0.125, loss=2.26]
Train - Epoch 2: 100%|██████████| 79/79 [00:07<00:00, 10.02batch/s, accuracy=0.125, loss=2.18]
Evaluate - Epoch 2: 100%|██████████| 16/16 [00:00<00:00, 16.28batch/s, accuracy=0.2, loss=2.15]
Train - Epoch 3: 100%|██████████| 79/79 [00:06<00:00, 12.08batch/s, accuracy=0.125, loss=2.2]
Evaluate - Epoch 3: 100%|██████████| 16/16 [00:00<00:00, 16.94batch/s, accuracy=0.2, loss=2.13]
Train - Epoch 4: 100%|██████████| 79/79 [00:07<00:00, 10.03batch/s, accuracy=0.25, loss=1.88]
Evaluate - Epoch 4: 100%|██████████| 16/16 [00:00<00:00, 16.60batch/s, accuracy=0.25, loss=2.06]
Train - Epoch 5: 100%|██████████| 79/79 [0

>> Accuracies Curves:


>> Losses Curves:


>> Optimization ended with:
   >> TRAIN-SET: best-accuracy=99.42642405063292, accuracy=99.34731012658227%, loss=0.06364409465201293
   >> TEST-SET: best-accuracy=48.73046875 accuracy=46.484375%, loss=2.681158110499382


## [4] Regularization

### 4.1 Weight Decay

In [None]:
result_data = {}
train_params = get_optimal_baseline_training_params()

for decay in [0.001, 0.01, 0.1]:
  model = CNN()
  new_optimizer = get_sgd_optimizer(model, lr=0.001, momentum=0.9, weight_decay=decay)
  train_params.update({'model': model, 'optimizer': new_optimizer})
  print(train_params)
  result_data[decay] = train(**train_params)

In [None]:
result_comparison = MultipleTrainResults(result_data)
result_comparison.get_accuracies_curve().show()
result_comparison.get_losses_curve().show()

### 4.2 Dropout

In [None]:
result_data = {}
train_params = get_optimal_baseline_training_params()

for dropout in [0.1, 0.2, 0.3, 0.4, 0.5]:
  model = CNN(p_dropout = dropout)
  new_optimizer = get_sgd_optimizer(model, lr=0.001, momentum=0.9)
  train_params.update({'model': model, 'optimizer': new_optimizer})
  print(train_params)
  result_data[dropout] = train(**train_params)

In [None]:
result_comparison = MultipleTrainResults(result_data)
result_comparison.get_accuracies_curve().show()
result_comparison.get_losses_curve().show()

## [5] Preprocessing

In [5]:
from sklearn.decomposition import PCA
from src.cifar10_dataset import trainset_x, trainset_y, testset_x, testset_y

PCA_obj = PCA(whiten=True, n_components=300)
PCA_obj.fit(trainset_x)

In [6]:
# print(PCA_obj.transform(trainset_x).reshape(-1, 3, 10, 10).shape)
# print(next(iter(trainloader))[0].shape)

trainset_PCA = torch.utils.data.TensorDataset(torch.Tensor(PCA_obj.transform(trainset_x).reshape(-1, 3, 10, 10)),
                                              torch.Tensor(trainset_y).type(torch.LongTensor))
trainloader_PCA = torch.utils.data.DataLoader(trainset_PCA, batch_size=64, 
                                              shuffle=False, num_workers=2)

testset_PCA = torch.utils.data.TensorDataset(torch.Tensor(PCA_obj.transform(testset_x).reshape(-1, 3, 10, 10)),
                                              torch.Tensor(testset_y).type(torch.LongTensor))
testloader_PCA  = torch.utils.data.DataLoader(testset_PCA, batch_size=64, 
                                              shuffle=False, num_workers=2)

In [7]:
# from src.cifar10_dataset import trainloader_PCA, testloader_PCA

train_params = get_optimal_baseline_training_params(hidden_layers_count = -1)
train_params.update({'trainloader': trainloader_PCA, 'testloader': testloader_PCA})
print(train_params)

results = train(**train_params)
results.report()

model parameters:  <generator object Module.parameters at 0x7fd6ed02d4a0>
{'model': CNN(
  (model): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): Dropout(p=0, inplace=False)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 2), stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 16, kernel_size=(3, 3), stride=(1, 1))
    (5): Dropout(p=0, inplace=False)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=(2, 2), stride=2, padding=1, dilation=1, ceil_mode=False)
    (8): Flatten(start_dim=1, end_dim=-1)
    (9): Linear(in_features=64, out_features=784, bias=True)
    (10): Dropout(p=0, inplace=False)
    (11): ReLU()
    (12): Linear(in_features=784, out_features=10, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
), 'optimizer': SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.001
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0
), 'init_func': <function weights_init

Train - Epoch 0: 100%|██████████| 79/79 [00:02<00:00, 28.19batch/s, accuracy=0.375, loss=2.08]
Evaluate - Epoch 0: 100%|██████████| 16/16 [00:00<00:00, 63.84batch/s, accuracy=0.125, loss=2.33]
Train - Epoch 1: 100%|██████████| 79/79 [00:00<00:00, 106.46batch/s, accuracy=0.5, loss=1.91]
Evaluate - Epoch 1: 100%|██████████| 16/16 [00:00<00:00, 64.71batch/s, accuracy=0.125, loss=2.31]
Train - Epoch 2: 100%|██████████| 79/79 [00:00<00:00, 102.48batch/s, accuracy=0.875, loss=1.77]
Evaluate - Epoch 2: 100%|██████████| 16/16 [00:00<00:00, 49.33batch/s, accuracy=0.125, loss=2.3]
Train - Epoch 3: 100%|██████████| 79/79 [00:00<00:00, 86.20batch/s, accuracy=0.875, loss=1.66] 
Evaluate - Epoch 3: 100%|██████████| 16/16 [00:00<00:00, 51.15batch/s, accuracy=0.15, loss=2.29]
Train - Epoch 4: 100%|██████████| 79/79 [00:00<00:00, 88.24batch/s, accuracy=0.875, loss=1.58] 
Evaluate - Epoch 4: 100%|██████████| 16/16 [00:00<00:00, 47.10batch/s, accuracy=0.175, loss=2.28]
Train - Epoch 5: 100%|██████████| 7

>> Accuracies Curves:


>> Losses Curves:


>> Optimization ended with:
   >> TRAIN-SET: best-accuracy=82.67405063291139, accuracy=82.67405063291139%, loss=0.5997836001311676
   >> TEST-SET: best-accuracy=26.210937499999996 accuracy=26.210937499999996%, loss=3.3616655319929123


## [6] Network Width

In [None]:
train_params = get_optimal_baseline_training_params()

result_data = {}

for filter_size_1, filter_size_2 in [(256, 64), (512, 256)]:
  train_params = get_optimal_baseline_training_params(filter_size_1 = filter_size_1,
                                                      filter_size_2 = filter_size_2)
  print(train_params)
  result_data[f'{(filter_size_1, filter_size_2)}'] = train(**train_params)

In [None]:
import pickle
with open("3_6_res.pkl", "wb") as f:
  pickle.dump(result_data, f)

In [None]:
result_comparison = MultipleTrainResults(result_data)
result_comparison.get_accuracies_curve().show()
result_comparison.get_losses_curve().show()

## [7] Network Depth

In [None]:
result_data = {}

for k in [3,4,5]:
  train_params = get_optimal_baseline_training_params(hidden_layers_count = k)
  print(train_params)
  result_data[k] = train(**train_params)

In [None]:
import pickle
with open("3_7_res.pkl", "wb") as f:
  pickle.dump(result_data, f)

In [None]:
result_comparison = MultipleTrainResults(result_data)
result_comparison.get_accuracies_curve().show()
result_comparison.get_losses_curve().show()