In [1]:
import os
import time
import torch
import sys
import numpy as np
import torch.optim as optim
import itertools
import shutil
from shutil import copyfile
from torchvision import transforms as T
%matplotlib notebook

In [2]:
from models import SpecialFuseNetModel
from data_manager import rgbd_gradients_dataset, rgbd_gradients_dataloader
from train import FuseNetTrainer
from functions import make_ckpt_fname

In [3]:
# Prefer the GPU whenever CUDA is usable; otherwise everything runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

Using device: cpu


Set all hyper-parameter options and create every combination of them.

In [4]:
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
CWD                      = os.getcwd()
hyperparameters_filename = 'hyperparameters'
overfit_data_dir_path    = 'data/nyuv2_overfit'
normal_data_dir_path     = 'data/nyuv2'

# Set to True to train on the overfit dataset instead of the normal one.
OVERFITTING_TRAINING     = False

# ---------------------------------------------------------------------------
# Search grid: each list holds the candidate values of one option.
# ---------------------------------------------------------------------------
IMAGE_SIZE       = [(64, 64), (224, 224)]  # (448, 448) is a further candidate
BATCH_SIZE       = [4, 16, 32, 64]
NUM_WORKERS      = [4]

BETAS            = [(0.9, 0.99)]
LR               = [0.001]
MOMENTUM         = [0.9]
WEIGHT_DECAY     = [0.0005]

STEP_SIZE        = [1000]
GAMMA            = [0.1]

# Overfit runs do not need more than 50 epochs and MUST use a 0.5 split.
NUM_EPOCHS       = [50] if OVERFITTING_TRAINING else [400]
TRAIN_TEST_RATIO = [0.5] if OVERFITTING_TRAINING else [0.9]

DATASET_DIR = os.path.join(
    CWD, overfit_data_dir_path if OVERFITTING_TRAINING else normal_data_dir_path
)
print(DATASET_DIR)

# Cartesian product of all option lists. The position of each value inside a
# combination tuple follows the argument order below.
all_combintations = list(itertools.product(
    IMAGE_SIZE, TRAIN_TEST_RATIO, BATCH_SIZE, NUM_WORKERS,
    BETAS, LR, MOMENTUM, WEIGHT_DECAY,
    STEP_SIZE, GAMMA,
    NUM_EPOCHS,
))

/home/manor/cs236781-DeepLearning/project/master/data/nyuv2


In [5]:
# Build the train/test loaders and peek at one training batch to learn the
# tensor shapes the model must be constructed with.
dl_train, dl_test = rgbd_gradients_dataloader(
    root=DATASET_DIR,
    use_transforms=True,
    overfit_mode=OVERFITTING_TRAINING,
)

sample_batch = next(iter(dl_train))

# Each size is the tensor shape without its leading dimension
# (presumably the batch axis -- confirm against the data manager).
rgb_size, depth_size, grads_size = (
    tuple(sample_batch[field].shape[1:]) for field in ('rgb', 'depth', 'x')
)

Make sure that all stochastic elements are removed from the model in overfit mode.

In [6]:
# Sanity check: draw two mini-batches and compare them element-wise, then
# compare the model outputs on them. In overfit mode every line is expected to
# print True (no stochastic elements left in the pipeline).
# Each draw calls iter() again, so it comes from a brand-new iterator over
# dl_train rather than from one iterator advanced twice.
train_sample_batch1 = next(iter(dl_train))
train_sample_batch2 = next(iter(dl_train))

for label, key in (('RGB', 'rgb'), ('D', 'depth'), ('X', 'x'), ('Y', 'y')):
    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid otherwise.
    batch_diff = (train_sample_batch1[key] - train_sample_batch2[key]).cpu()
    print(f"Consecutive {label} mini-batchs equals: {not np.any(batch_diff.numpy())}")

# dropout_p=0 turns dropout off so the forward pass itself adds no randomness.
fusenetmodel = SpecialFuseNetModel(rgb_size=rgb_size, depth_size=depth_size, grads_size=grads_size,
                                   device=device, dropout_p=0)

# `device` may be CUDA; Tensor.numpy() only works on CPU tensors, so the
# detached outputs are moved to the CPU before they are compared.
# NOTE(review): the inputs are passed as-is; this assumes the model moves them
# to `device` itself -- confirm in SpecialFuseNetModel.
xy1 = fusenetmodel(train_sample_batch1['rgb'], train_sample_batch1['depth']).detach().cpu()
xy2 = fusenetmodel(train_sample_batch2['rgb'], train_sample_batch2['depth']).detach().cpu()
print(f"Outputs on consecutive mini-batchs equals: {not np.any((xy1-xy2).numpy())}")

Consecutive RGB mini-batchs equals: False
Consecutive D mini-batchs equals: False
Consecutive X mini-batchs equals: False
Consecutive Y mini-batchs equals: False
[I] - device=cpu
    - seed=42
    - dropout_p=0
    - optimizer=None
    - scheduler=None
[I] - Init SpecialFuseNet
    - warm start=True
    - BN momentum=0.1
    - dropout_p=0
[I] - Initialize Net.
    - Init type=xavier
    - Init gain=0.02

[I] - default optimizer set: SGD(lr=0.001,momentum=0.9,weight_decay=0.0005)
[I] - default scheduler set: StepSR(step_size=1000,gamma=0.1)
Outputs on consecutive mini-batchs equals: False


Loop over all the combinations, train a model for each one, and save both the model and its hyper-parameters file.

In [7]:
# Hyper-parameter sweep: for every combination, write the hyper-parameters
# file, train a fresh model, and store the checkpoint, the fit results and a
# copy of the hyper-parameters file side by side in the checkpoint folder.
checkpoint_folder    = 'checkpoints/'
hyperparameters_path = os.path.join(CWD, hyperparameters_filename + '.py')

# Create the output folder up front so that saving the results / copying the
# hyper-parameters file cannot fail on a missing directory after training.
os.makedirs(os.path.join(CWD, checkpoint_folder), exist_ok=True)

for combintation in all_combintations:
    # Positions follow the argument order used to build `all_combintations`.
    (image_size, train_test_ratio, batch_size, num_workers,
     betas, lr, momentum, weight_decay,
     step_size, gamma,
     num_epochs) = combintation

    # Render the file content once; it is both written to disk and echoed.
    hyperparameters_text = "\n".join([
        f"IMAGE_SIZE={image_size}",
        f"TRAIN_TEST_RATIO={train_test_ratio}",
        f"BATCH_SIZE={batch_size}",
        f"NUM_WORKERS={num_workers}",
        "",
        f"BETAS={betas}",
        f"LR={lr}",
        f"MOMENTUM={momentum}",
        f"WEIGHT_DECAY={weight_decay}",
        "",
        f"STEP_SIZE={step_size}",
        f"GAMMA={gamma}",
        "",
        f"NUM_EPOCHS={num_epochs}",
    ]) + "\n"

    # NOTE(review): apart from `num_epochs`, none of these values is passed
    # explicitly to the loader/model/trainer below; presumably the project
    # modules read this file -- confirm they re-read it on every iteration and
    # do not keep the values they saw at import time.
    with open(hyperparameters_path, "w") as hyperparameters_file:
        hyperparameters_file.write(hyperparameters_text)

    print("Current Hyper-Parameters:")
    print(hyperparameters_text)

    # Train
    # NOTE(review): rgb_size/depth_size/grads_size were measured once before
    # this loop; verify they are still valid when IMAGE_SIZE changes.
    model_kwargs = dict(rgb_size=rgb_size, depth_size=depth_size, grads_size=grads_size,
                        device=device)
    if OVERFITTING_TRAINING:
        # Overfit runs disable dropout; otherwise the model default is used.
        model_kwargs['dropout_p'] = 0
    fusenetmodel = SpecialFuseNetModel(**model_kwargs)
    trainer = FuseNetTrainer(model=fusenetmodel, device=device, num_epochs=num_epochs)

    dl_train,dl_test = rgbd_gradients_dataloader(root=DATASET_DIR, use_transforms=True,
                                                 overfit_mode=OVERFITTING_TRAINING)

    checkpoint_file_name = make_ckpt_fname(image_size, batch_size, betas, lr, momentum)
    if OVERFITTING_TRAINING:
        checkpoint_file_name += '_overfit'
    checkpoint_file      = os.path.join(CWD, checkpoint_folder, checkpoint_file_name)
    checkpoint_res_file  = os.path.join(CWD, checkpoint_folder, checkpoint_file_name+'_res')

    # Remove artifacts left by a previous run with the same name so this
    # combination starts from scratch.
    for stale_path in (checkpoint_file, checkpoint_res_file):
        if os.path.isfile(f'{stale_path}.pt'):
            print(f'[I] - {stale_path} exist')
            os.remove(f'{stale_path}.pt')

    res = trainer.fit(dl_train, dl_test, early_stopping=20, print_every=10, checkpoints=checkpoint_file)

    res.save(checkpoint_res_file)
    # Save a copy of the current hyper-parameters file next to the saved model.
    copyfile(hyperparameters_path,
             os.path.join(CWD,checkpoint_folder,checkpoint_file_name+'_'+hyperparameters_filename+'.py'))
    print("-----------------------------------------------------------------")

Current Hyper-Parameters:
IMAGE_SIZE=(64, 64)
TRAIN_TEST_RATIO=0.9
BATCH_SIZE=4
NUM_WORKERS=4

BETAS=(0.9, 0.99)
LR=0.001
MOMENTUM=0.9
WEIGHT_DECAY=0.0005

STEP_SIZE=1000
GAMMA=0.1

NUM_EPOCHS=400

[I] - device=cpu
    - seed=42
    - dropout_p=0.4
    - optimizer=None
    - scheduler=None
[I] - Init SpecialFuseNet
    - warm start=True
    - BN momentum=0.1
    - dropout_p=0.4
[I] - Initialize Net.
    - Init type=xavier
    - Init gain=0.02

[I] - default optimizer set: SGD(lr=0.001,momentum=0.9,weight_decay=0.0005)
[I] - default scheduler set: StepSR(step_size=1000,gamma=0.1)
--- EPOCH 1/400 ---
train_batch (0.099):   9%|▉         | 26/290 [00:27<04:42,  1.07s/it]


KeyboardInterrupt: 