In [1]:
import os
import time
import torch
import sys
import numpy as np
import torch.optim as optim
import itertools
import shutil
from shutil import copyfile
from torchvision import transforms as T
%matplotlib notebook

In [2]:
# from models import SpecialFuseNetModel
from models_chain import ModelsChain
# from data_manager import rgbd_gradients_dataset, rgbd_gradients_dataloader
from models_chain_data_manager import models_chain_dataset, models_chain_dataloader
# from train import FuseNetTrainer
from models_chains_train import ModelsChainTrainer
from functions import make_ckpt_fname

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

Using device: cuda


Set the candidate values for every hyper-parameter and build the list of all their combinations.

In [4]:
# --- Paths -------------------------------------------------------------------
CWD = os.getcwd()
hyperparameters_filename = 'hyperparameters'
overfit_data_dir_path = 'data/nyuv2_overfit'
normal_data_dir_path = 'data/nyuv2'

# Set to True to train on the overfit dataset instead of the normal one.
OVERFITTING_TRAINING = False

# --- Search grid: one list of candidate values per hyper-parameter -----------
# Data pipeline
IMAGE_SIZE = [(64, 64)]   # alternatives: (224, 224), (448, 448)
BATCH_SIZE = [16]         # alternatives: 4, 16, 32, 64
NUM_WORKERS = [4]

# Optimizer
BETAS = [(0.9, 0.99)]
LR = [0.001]
MOMENTUM = [0.99]         # alternative: 0.9
WEIGHT_DECAY = [0.0005]

# Learning-rate scheduler
STEP_SIZE = [1000]
GAMMA = [0.1]

# --- Settings that depend on the training mode -------------------------------
# Overfitting doesn't need more than 50 epochs.
NUM_EPOCHS = [50] if OVERFITTING_TRAINING else [400]
# MUST USE 0.5 FOR OVERFITTING
TRAIN_TEST_RATIO = [0.5] if OVERFITTING_TRAINING else [0.9]
DATASET_DIR = os.path.join(
    CWD, overfit_data_dir_path if OVERFITTING_TRAINING else normal_data_dir_path
)
print(DATASET_DIR)

# Cartesian product of the grid; the position of each value inside a
# combination tuple follows the argument order below.
all_combintations = list(itertools.product(
    IMAGE_SIZE, TRAIN_TEST_RATIO, BATCH_SIZE, NUM_WORKERS,
    BETAS, LR, MOMENTUM, WEIGHT_DECAY,
    STEP_SIZE, GAMMA,
    NUM_EPOCHS,
))

C:\Users\tomav\Documents\GitHub\cs236781-project\data/nyuv2


In [5]:
# Probe a single training batch to discover the per-sample tensor shapes that
# are later handed to the model constructor. The loader settings are read from
# the first entry of each hyper-parameter list so the probe cannot drift from
# the configuration cell (in particular, overfit mode requires
# TRAIN_TEST_RATIO == 0.5, which a hard-coded 0.9 would silently violate).
dl_train, dl_test = models_chain_dataloader(
    root=DATASET_DIR,
    batch_size=4,  # a small batch is enough for a shape probe
    num_workers=NUM_WORKERS[0],
    train_test_ratio=TRAIN_TEST_RATIO[0],
    image_size=IMAGE_SIZE[0],
    use_transforms=True,
    overfit_mode=OVERFITTING_TRAINING,
)
sample_batch = next(iter(dl_train))

# shape[1:] drops the first (presumably batch) dimension of every tensor.
rgb_rgb2d_size = tuple(sample_batch['rgb_rgb2d'].shape[1:])
rgb_size = tuple(sample_batch['rgb'].shape[1:])
depth_size = tuple(sample_batch['depth'].shape[1:])
grads_size = tuple(sample_batch['x'].shape[1:])

[I (models_chain_dataloader)] - root=C:\Users\tomav\Documents\GitHub\cs236781-project\data/nyuv2
                              - batch_size=4
                              - num_workers=4
                              - train_test_ratio=0.9
                              - image_size=(64, 64)
                              - use_transforms=True
                              - overfit_mode=False
                              - seed=42
                              - inference=None
                              - goto_pixel=True

[I (models_chain_dataset)] - root=C:\Users\tomav\Documents\GitHub\cs236781-project\data/nyuv2
                           - image_size=(64, 64)
                           - use_transforms=True
                           - overfit_mode=False
                           - goto_pixel=True

self.image_size (64, 64)
self.image_size_rgb2d (128, 128)
[I] - |self|=1449
[1305, 144]
[I (models_chain_dataloader)] - |Train Dataset|=1305, |Test Dataset|=144


Make sure that all stochastic elements are removed from the model in overfit mode.

In [6]:
# train_sample_batch1 = next(iter(dl_train))
# train_sample_batch2 = next(iter(dl_train))
# print(f"Consecutive RGB mini-batchs equals: {not np.any((train_sample_batch1['rgb']-train_sample_batch2['rgb']).numpy())}")
# print(f"Consecutive D mini-batchs equals: {not np.any((train_sample_batch1['depth']-train_sample_batch2['depth']).numpy())}")
# print(f"Consecutive X mini-batchs equals: {not np.any((train_sample_batch1['x']-train_sample_batch2['x']).numpy())}")
# print(f"Consecutive Y mini-batchs equals: {not np.any((train_sample_batch1['y']-train_sample_batch2['y']).numpy())}")
# fusenetmodel = ModelsChain(rgb_size=rgb_size,depth_size=depth_size,grads_size=grads_size,
#                                    device=device, dropout_p=0)
# # fusenetmodel = SpecialFuseNetModel(rgb_size=rgb_size,depth_size=depth_size,grads_size=grads_size,
# #                                    device=device, dropout_p=0)
# xy1 = fusenetmodel(train_sample_batch1['rgb'],train_sample_batch1['depth']).detach()
# xy2 = fusenetmodel(train_sample_batch2['rgb'],train_sample_batch2['depth']).detach()
# print(f"Outputs on consecutive mini-batchs equals: {not np.any((xy1-xy2).numpy())}")

Loop over all the hyper-parameter combinations: for each one, train the models chain and save both models (DenseDepth and SpecialFuseNet), their results, and the matching hyper-parameters file.

In [7]:
def remove_old_file(checkpoint_file, checkpoint_res_file):
    """Delete ``<path>.pt`` files left over from a previous run, if present.

    Args:
        checkpoint_file: Model checkpoint path, without the ``.pt`` suffix.
        checkpoint_res_file: Results file path, without the ``.pt`` suffix.
    """
    for path_stem in (checkpoint_file, checkpoint_res_file):
        if os.path.isfile(f'{path_stem}.pt'):
            print(f'[I] - {path_stem} exist')
            os.remove(f'{path_stem}.pt')


# One absolute path is used for writing and copying the hyper-parameters file,
# so both operations always refer to the same file.
hyperparameters_path = os.path.join(CWD, hyperparameters_filename + '.py')

# Create the output folder up front: otherwise a missing folder would only
# surface as an error when results are saved, after training has finished.
checkpoint_folder = 'checkpoints/'
os.makedirs(os.path.join(CWD, checkpoint_folder), exist_ok=True)

for combintation in all_combintations:
    # Order matches the itertools.product(...) call that built the grid.
    (image_size, train_test_ratio, batch_size, num_workers,
     betas, lr, momentum, weight_decay,
     step_size, gamma,
     num_epochs) = combintation

    # Record the current combination as an importable Python file.
    hyperparameters_text = "\n".join([
        f"IMAGE_SIZE={image_size}",
        f"TRAIN_TEST_RATIO={train_test_ratio}",
        f"BATCH_SIZE={batch_size}",
        f"NUM_WORKERS={num_workers}",
        "",
        f"BETAS={betas}",
        f"LR={lr}",
        f"MOMENTUM={momentum}",
        f"WEIGHT_DECAY={weight_decay}",
        "",
        f"STEP_SIZE={step_size}",
        f"GAMMA={gamma}",
        "",
        f"NUM_EPOCHS={num_epochs}",
    ]) + "\n"
    with open(hyperparameters_path, "w") as hyperparameters_file:
        hyperparameters_file.write(hyperparameters_text)

    print("Current Hyper-Parameters:")
    print(hyperparameters_text)

    # Train
    # NOTE(review): lr, betas, momentum, weight_decay, step_size and gamma are
    # recorded above but are not passed to ModelsChain / ModelsChainTrainer in
    # this cell. The saved run log reports the default SGD momentum=0.9 while
    # MOMENTUM=0.99 - confirm whether these values should be forwarded.
    # NOTE(review): rgb_size / depth_size / grads_size come from the earlier
    # probe cell, not from this combination's image_size - confirm they still
    # match when the grid contains more than one image size.
    # Dropout is disabled in overfit mode; otherwise the model default is used.
    model_kwargs = {'dropout_p': 0} if OVERFITTING_TRAINING else {}
    modelschainmodel = ModelsChain(rgb_size=rgb_size, depth_size=depth_size, grads_size=grads_size,
                                   device=device, **model_kwargs)
    trainer = ModelsChainTrainer(models_chain=modelschainmodel, device=device, num_epochs=num_epochs)
    dl_train, dl_test = models_chain_dataloader(root=DATASET_DIR, batch_size=batch_size, num_workers=num_workers,
                                                train_test_ratio=train_test_ratio, image_size=image_size,
                                                use_transforms=True, overfit_mode=OVERFITTING_TRAINING)

    # NOTE(review): only image_size, batch_size, betas, lr and momentum are
    # passed to make_ckpt_fname; combinations that differ only in the other
    # hyper-parameters presumably map to the same file names and would
    # overwrite each other - verify against make_ckpt_fname.
    checkpoint_file_name_densedepth = make_ckpt_fname(model_name='densedepth', image_size=image_size,
                                                      batch_size=batch_size, betas=betas, lr=lr,
                                                      momentum=momentum)
    checkpoint_file_name_special_fusenet = make_ckpt_fname(model_name='special_fusenet', image_size=image_size,
                                                           batch_size=batch_size, betas=betas, lr=lr,
                                                           momentum=momentum)
    if OVERFITTING_TRAINING:
        checkpoint_file_name_special_fusenet += '_overfit'
        checkpoint_file_name_densedepth += '_overfit'
    checkpoint_file_densedepth = os.path.join(CWD, checkpoint_folder, checkpoint_file_name_densedepth)
    checkpoint_res_file_densedepth = os.path.join(CWD, checkpoint_folder, checkpoint_file_name_densedepth + '_res')
    checkpoint_file_special_fusenet = os.path.join(CWD, checkpoint_folder, checkpoint_file_name_special_fusenet)
    checkpoint_res_file_special_fusenet = os.path.join(CWD, checkpoint_folder,
                                                       checkpoint_file_name_special_fusenet + '_res')

    # Remove outputs of a previous run that used the same file names.
    remove_old_file(checkpoint_file_densedepth, checkpoint_res_file_densedepth)
    remove_old_file(checkpoint_file_special_fusenet, checkpoint_res_file_special_fusenet)

    (res_densedepth, res_special_fusenet) = trainer.fit(
        dl_train=dl_train, dl_test=dl_test,
        checkpoints_densedepth=checkpoint_file_densedepth,
        checkpoints_special_fusenet=checkpoint_file_special_fusenet,
        early_stopping_densedepth=20, early_stopping_special_fusenet=20,
        print_every=10)

    res_densedepth.save(checkpoint_res_file_densedepth)
    res_special_fusenet.save(checkpoint_res_file_special_fusenet)

    # Save a copy of the current hyper-parameters file next to the saved model.
    copyfile(hyperparameters_path,
             os.path.join(CWD, checkpoint_folder,
                          checkpoint_file_name_special_fusenet + '_' + hyperparameters_filename + '.py'))
    print("-----------------------------------------------------------------")

Current Hyper-Parameters:
IMAGE_SIZE=(64, 64)
TRAIN_TEST_RATIO=0.9
BATCH_SIZE=16
NUM_WORKERS=4

BETAS=(0.9, 0.99)
LR=0.001
MOMENTUM=0.99
WEIGHT_DECAY=0.0005

STEP_SIZE=1000
GAMMA=0.1

NUM_EPOCHS=400

[I] - device=cuda
    - seed=42
    - dropout_p=0.4
    - optimizer=None
    - scheduler=None    - overfit_mode=False
[I] - Init SpecialFuseNet
    - warm start=True
    - BN momentum=0.1
    - dropout_p=0.4    - overfit_mode=False
[I] - Initialize Net.
    - Init type=xavier
    - Init gain=0.02

[I] - default optimizer set: SGD(lr=0.001,momentum=0.9,weight_decay=0.0005)
[I] - default scheduler set: StepSR(step_size=1000,gamma=0.1)
[I (ModelsChainTrainer)] - model=ModelsChain(
  (densedepth): DenseDepth(
    (encoder): Encoder(
      (original_model): DenseNet(
        (features): Sequential(
          (conv0): Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
          (norm0): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    

train_batch (Avg. Losses: DD 0.411, SFN 0.095: 100%|███████████████████████████████████| 82/82 [00:42<00:00,  1.94it/s]
test_batch (Avg. Losses: DD 0.391, SFN 0.108: 100%|██████████████████████████████████████| 9/9 [00:05<00:00,  1.56it/s]
train_batch (Avg. Losses: DD 0.359, SFN 0.079: 100%|███████████████████████████████████| 82/82 [00:42<00:00,  1.95it/s]
test_batch (Avg. Losses: DD 0.369, SFN 0.092: 100%|██████████████████████████████████████| 9/9 [00:05<00:00,  1.55it/s]
[I] - Saved DenseDepth checkpoint C:\Users\tomav\Documents\GitHub\cs236781-project\checkpoints/densedepth,img_size=64,64,batch_size=16,betas=0.9,0.99,lr=0.001,momentum=0.99.pt at epoch 2
[I] - Saved SpecialFuseNet checkpoint C:\Users\tomav\Documents\GitHub\cs236781-project\checkpoints/special_fusenet,img_size=64,64,batch_size=16,betas=0.9,0.99,lr=0.001,momentum=0.99.pt at epoch 2
train_batch (Avg. Losses: DD 0.330, SFN 0.071: 100%|███████████████████████████████████| 82/82 [00:41<00:00,  1.96it/s]
test_batch (Avg. 

test_batch (Avg. Losses: DD 0.349, SFN 0.043: 100%|██████████████████████████████████████| 9/9 [00:05<00:00,  1.56it/s]
[I] - Saved SpecialFuseNet checkpoint C:\Users\tomav\Documents\GitHub\cs236781-project\checkpoints/special_fusenet,img_size=64,64,batch_size=16,betas=0.9,0.99,lr=0.001,momentum=0.99.pt at epoch 22
train_batch (Avg. Losses: DD 0.264, SFN 0.046: 100%|███████████████████████████████████| 82/82 [00:44<00:00,  1.84it/s]
test_batch (Avg. Losses: DD 0.335, SFN 0.041: 100%|██████████████████████████████████████| 9/9 [00:05<00:00,  1.51it/s]
[I] - Saved DenseDepth checkpoint C:\Users\tomav\Documents\GitHub\cs236781-project\checkpoints/densedepth,img_size=64,64,batch_size=16,betas=0.9,0.99,lr=0.001,momentum=0.99.pt at epoch 23
[I] - Saved SpecialFuseNet checkpoint C:\Users\tomav\Documents\GitHub\cs236781-project\checkpoints/special_fusenet,img_size=64,64,batch_size=16,betas=0.9,0.99,lr=0.001,momentum=0.99.pt at epoch 23
train_batch (Avg. Losses: DD 0.263, SFN 0.048: 100%|██████

test_batch (Avg. Losses: DD 0.341, SFN 0.041: 100%|██████████████████████████████████████| 9/9 [00:05<00:00,  1.55it/s]
--- EPOCH 51/400 ---
train_batch (Avg. Losses: DD 0.256, SFN 0.038: 100%|███████████████████████████████████| 82/82 [00:43<00:00,  1.88it/s]
test_batch (Avg. Losses: DD 0.337, SFN 0.033: 100%|██████████████████████████████████████| 9/9 [00:05<00:00,  1.53it/s]
[I] - Saved SpecialFuseNet checkpoint C:\Users\tomav\Documents\GitHub\cs236781-project\checkpoints/special_fusenet,img_size=64,64,batch_size=16,betas=0.9,0.99,lr=0.001,momentum=0.99.pt at epoch 51
train_batch (Avg. Losses: DD 0.255, SFN 0.040: 100%|███████████████████████████████████| 82/82 [00:44<00:00,  1.86it/s]
test_batch (Avg. Losses: DD 0.343, SFN 0.035: 100%|██████████████████████████████████████| 9/9 [00:05<00:00,  1.51it/s]
train_batch (Avg. Losses: DD 0.254, SFN 0.039: 100%|███████████████████████████████████| 82/82 [00:43<00:00,  1.87it/s]
test_batch (Avg. Losses: DD 0.341, SFN 0.036: 100%|███████████

train_batch (Avg. Losses: DD nan, SFN 0.033: 100%|█████████████████████████████████████| 82/82 [00:14<00:00,  5.75it/s]
test_batch (Avg. Losses: DD 0.342, SFN 0.033: 100%|██████████████████████████████████████| 9/9 [00:06<00:00,  1.49it/s]
--- EPOCH 81/400 ---
train_batch (Avg. Losses: DD nan, SFN 0.034: 100%|█████████████████████████████████████| 82/82 [00:14<00:00,  5.63it/s]
test_batch (Avg. Losses: DD 0.344, SFN 0.035: 100%|██████████████████████████████████████| 9/9 [00:06<00:00,  1.49it/s]
train_batch (Avg. Losses: DD nan, SFN 0.035: 100%|█████████████████████████████████████| 82/82 [00:14<00:00,  5.59it/s]
test_batch (Avg. Losses: DD 0.345, SFN 0.033: 100%|██████████████████████████████████████| 9/9 [00:05<00:00,  1.51it/s]
train_batch (Avg. Losses: DD nan, SFN 0.034: 100%|█████████████████████████████████████| 82/82 [00:14<00:00,  5.60it/s]
test_batch (Avg. Losses: DD 0.346, SFN 0.033: 100%|██████████████████████████████████████| 9/9 [00:05<00:00,  1.51it/s]
train_batch (Avg. L

train_batch (Avg. Losses: DD nan, SFN 0.030: 100%|█████████████████████████████████████| 82/82 [00:14<00:00,  5.60it/s]
test_batch (Avg. Losses: DD 0.345, SFN 0.028: 100%|██████████████████████████████████████| 9/9 [00:06<00:00,  1.50it/s]
train_batch (Avg. Losses: DD nan, SFN 0.030: 100%|█████████████████████████████████████| 82/82 [00:14<00:00,  5.65it/s]
test_batch (Avg. Losses: DD 0.345, SFN 0.032: 100%|██████████████████████████████████████| 9/9 [00:06<00:00,  1.46it/s]
train_batch (Avg. Losses: DD nan, SFN 0.031: 100%|█████████████████████████████████████| 82/82 [00:14<00:00,  5.69it/s]
test_batch (Avg. Losses: DD 0.342, SFN 0.028: 100%|██████████████████████████████████████| 9/9 [00:06<00:00,  1.48it/s]
-----------------------------------------------------------------


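Disabled (commented-out) single-model variant of the loop above, using `SpecialFuseNetModel` and `FuseNetTrainer`: loops over all the combinations, trains, and saves the model and its hyper-parameters file.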

In [8]:
# for combintation in all_combintations:
#     image_size       = combintation[0]
#     train_test_ratio = combintation[1]
#     batch_size       = combintation[2]
#     num_workers      = combintation[3]

#     betas            = combintation[4]
#     lr               = combintation[5]
#     momentum         = combintation[6]
#     weight_decay     = combintation[7]

#     step_size        = combintation[8]
#     gamma            = combintation[9]

#     num_epochs       = combintation[10]
    
#     with open(hyperparameters_filename + '.py', "w") as hyperparameters_file:
#         print(f"IMAGE_SIZE={image_size}", file=hyperparameters_file)
#         print(f"TRAIN_TEST_RATIO={train_test_ratio}", file=hyperparameters_file)
#         print(f"BATCH_SIZE={batch_size}", file=hyperparameters_file)
#         print(f"NUM_WORKERS={num_workers}", file=hyperparameters_file)
#         print(f"", file=hyperparameters_file)
#         print(f"BETAS={betas}", file=hyperparameters_file)
#         print(f"LR={lr}", file=hyperparameters_file)
#         print(f"MOMENTUM={momentum}", file=hyperparameters_file)
#         print(f"WEIGHT_DECAY={weight_decay}", file=hyperparameters_file)
#         print(f"", file=hyperparameters_file)
#         print(f"STEP_SIZE={step_size}", file=hyperparameters_file)
#         print(f"GAMMA={gamma}", file=hyperparameters_file)
#         print(f"", file=hyperparameters_file)
#         print(f"NUM_EPOCHS={num_epochs}", file=hyperparameters_file)

#     print("Current Hyper-Parameters:")
#     with open(hyperparameters_filename + '.py', "r") as hyperparameters_file:
#         print(hyperparameters_file.read())
       
#     # Train
#     if OVERFITTING_TRAINING:
#         fusenetmodel = SpecialFuseNetModel(rgb_size=rgb_size,depth_size=depth_size,grads_size=grads_size,
#                                        device=device, dropout_p=0)
#     else:
#         fusenetmodel = SpecialFuseNetModel(rgb_size=rgb_size,depth_size=depth_size,grads_size=grads_size,
#                                        device=device)
#     trainer = FuseNetTrainer(model=fusenetmodel, device=device, num_epochs=num_epochs)
    
#     dl_train,dl_test = rgbd_gradients_dataloader(root=DATASET_DIR, use_transforms=True,
#                                                  overfit_mode=OVERFITTING_TRAINING)

#     checkpoint_folder    = 'checkpoints/'
#     checkpoint_file_name = make_ckpt_fname(image_size, batch_size, betas, lr, momentum)
#     if OVERFITTING_TRAINING:
#         checkpoint_file_name += '_overfit'
#     checkpoint_file      = os.path.join(CWD, checkpoint_folder, checkpoint_file_name)
#     checkpoint_res_file  = os.path.join(CWD, checkpoint_folder, checkpoint_file_name+'_res')
    
#     if os.path.isfile(f'{checkpoint_file}.pt'):
#         print(f'[I] - {checkpoint_file} exist')
#         os.remove(f'{checkpoint_file}.pt')
#     if os.path.isfile(f'{checkpoint_res_file}.pt'):
#         print(f'[I] - {checkpoint_res_file} exist')
#         os.remove(f'{checkpoint_res_file}.pt')
        
#     res = trainer.fit(dl_train, dl_test, early_stopping=20, print_every=10, checkpoints=checkpoint_file)
    
#     res.save(checkpoint_res_file)
#     # Save the current hyper-parameters file next to the current saved model.
# #     shutil.move(os.path.join(CWD,hyperparameters_filename+'.py'),
# #                 os.path.join(CWD,checkpoint_folder,hyperparameters_filename+'_'+checkpoint_file_name+'.py'))
#     copyfile(os.path.join(CWD,hyperparameters_filename+'.py'),
#              os.path.join(CWD,checkpoint_folder,checkpoint_file_name+'_'+hyperparameters_filename+'.py'))
#     print("-----------------------------------------------------------------")