In [1]:
# Re-import edited project modules automatically before each cell executes,
# so changes to the local .py files are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [3]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from nltk.translate.bleu_score import corpus_bleu

In [4]:
# ---------------------------------------------------------------------------
# Model hyper-parameters
# ---------------------------------------------------------------------------
encoder_dim = 2048  # encoder feature size (resnet101)
# Word-embedding size, attention linear-layer size and decoder RNN size
# all share the same width.
emb_dim = attention_dim = decoder_dim = 1000
dropout = 0.3

# Device used for the model and for PyTorch tensors: GPU when available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Only worth enabling when model inputs have a fixed size; otherwise it adds
# a lot of computational overhead.
cudnn.benchmark = True

# ---------------------------------------------------------------------------
# Training hyper-parameters
# ---------------------------------------------------------------------------
epochs = 2          # epochs to train for (unless early stopping triggers)
batch_size = 64
workers = 4
decoder_lr = 2e-4   # decoder learning rate
encoder_lr = 3e-4   # encoder learning rate, used when fine-tuning
fine_tune_encoder = False  # whether the encoder is fine-tuned
checkpoint = None   # path to a checkpoint to resume from, or None

In [5]:
# Experiment identifier. It is passed to training as 'data_name' and reused as
# the TensorBoard run directory name. The checkpoint file loaded later embeds
# this exact string, so keep the spelling in sync if it is ever changed.
DATA_NAME = 'testing_experiemnts_process'

# Dataset locations (kaggle paths; the local layout uses the same relative
# paths): the captions JSON file and the image directory.
DATA_JSON_PATH, IMGS_PATH = 'data.json', 'flickr/Images/'

In [6]:
# Build the vocabulary object from the captions JSON file; it is handed to
# training via t_params['vocab'] and to test_score for evaluation.
vocab = build_vocab(DATA_JSON_PATH)

100%|██████████| 40000/40000 [00:00<00:00, 372487.64it/s]


In [7]:
# Vocabulary size; logged later as the 'vocab_size' hyper-parameter.
vocab_len = len(vocab)
vocab_len

5089

In [8]:
# Training settings handed to `fit` as `t_params`.
t_params = {
    'data_name': DATA_NAME,
    'imgs_path': IMGS_PATH,
    'df_path': DATA_JSON_PATH,
    'vocab': vocab,
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
}

# Model settings handed to `fit` as `m_params`.
m_params = {
    'dropout': dropout,
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
}

# Hyper-parameters recorded in TensorBoard through `add_hparams`.
# The model settings are reused from `m_params` rather than retyped, so the
# logged values cannot drift from what the model was actually built with.
logger_dic = {
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    # NOTE(review): hard-coded here; confirm it matches the maximum caption
    # length actually enforced by the dataset code.
    'max_seq_length': 100,
    'vocab_size': vocab_len,
    # Key was previously misspelled 'enocder', which showed up as a
    # misnamed column in the TensorBoard HParams view.
    'encoder': 'resnet101',
    **m_params,
}

In [21]:
# TensorBoard writer for this run: events are written under
# <log_dir>/<experiment name>.
log_dir = 'experiments'   # root directory for all runs
name = DATA_NAME          # experiment name doubles as the run sub-directory

logger = SummaryWriter(log_dir=osp.join(log_dir, name))

In [10]:
# Display the training settings as a sanity check before fitting.
t_params

{'data_name': 'testing_experiemnts_process',
 'imgs_path': 'flickr/Images/',
 'df_path': 'data.json',
 'vocab': <dataset.Vocabulary at 0x7f3612dfbe90>,
 'epochs': 2,
 'batch_size': 64,
 'workers': 4,
 'decoder_lr': 0.0002,
 'encoder_lr': 0.0003,
 'fine_tune_encoder': False}

In [14]:
# Run training and per-epoch validation with the settings defined above.
# `logger` must be passed by keyword: a positional argument after keyword
# arguments is a SyntaxError in Python.
# NOTE(review): assumes the parameter in train.fit is named `logger` — confirm
# against its signature.
fit(t_params=t_params, m_params=m_params, logger=logger)

Loading Data
Dataset split: train
Unique images: 6000
Total size: 30000
Dataset split: val
Unique images: 1000
Total size: 5000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [0][0/469]	Batch Time 3.944 (3.944)	Data Load Time 1.068 (1.068)	Loss 9.4606 (9.4606)	Top-5 Accuracy 0.000 (0.000)
Epoch: [0][100/469]	Batch Time 1.359 (1.377)	Data Load Time 0.001 (0.011)	Loss 5.2761 (5.7068)	Top-5 Accuracy 49.596 (42.486)
Epoch: [0][200/469]	Batch Time 1.419 (1.373)	Data Load Time 0.000 (0.006)	Loss 4.7461 (5.3222)	Top-5 Accuracy 55.707 (47.879)
Epoch: [0][300/469]	Batch Time 1.375 (1.375)	Data Load Time 0.000 (0.004)	Loss 4.6615 (5.1015)	Top-5 Accuracy 56.899 (50.902)
Epoch: [0][400/469]	Batch Time 1.351 (1.376)	Data Load Time 0.001 (0.003)	Loss 4.3963 (4.9499)	Top-5 Accuracy 60.680 (52.956)
Epoch train time 646.779 (epoch_time.avg:.3

  0%|          | 0/5000 [00:00<?, ?it/s]

1: 63.47428594160521
2: 39.58047964932961
3: 22.91363011374662
4: 13.473099968356847
-------------------------
----- METEOR Score -----


100%|██████████| 5000/5000 [00:11<00:00, 424.37it/s]


m: 0.30135740038171277

 * LOSS - 5.119, TOP-5 ACCURACY - 56.367, BLEU-4 - 13.473099968356847

Epoch validation time 83.919 (epoch_time.avg:.3f)
__________________________________________________
-------------------- Training --------------------
Epoch: [1][0/469]	Batch Time 7.735 (7.735)	Data Load Time 6.334 (6.334)	Loss 4.1532 (4.1532)	Top-5 Accuracy 64.238 (64.238)
Epoch: [1][100/469]	Batch Time 1.390 (1.441)	Data Load Time 0.000 (0.063)	Loss 4.2258 (4.2656)	Top-5 Accuracy 62.549 (61.710)
Epoch: [1][200/469]	Batch Time 1.390 (1.419)	Data Load Time 0.000 (0.032)	Loss 4.0724 (4.2202)	Top-5 Accuracy 62.963 (62.309)
Epoch: [1][300/469]	Batch Time 1.423 (1.410)	Data Load Time 0.000 (0.022)	Loss 4.0619 (4.1855)	Top-5 Accuracy 63.787 (62.759)
Epoch: [1][400/469]	Batch Time 1.408 (1.405)	Data Load Time 0.000 (0.016)	Loss 4.1961 (4.1520)	Top-5 Accuracy 63.373 (63.268)
Epoch train time 657.862 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/79]	Batch T

  1%|          | 45/5000 [00:00<00:11, 449.93it/s]

1: 64.3503053678497
2: 40.45690149367875
3: 23.787402299442828
4: 13.934173482196025
-------------------------
----- METEOR Score -----


100%|██████████| 5000/5000 [00:12<00:00, 412.57it/s]


m: 0.32407175136920063

 * LOSS - 5.114, TOP-5 ACCURACY - 58.813, BLEU-4 - 13.934173482196025

Epoch validation time 84.101 (epoch_time.avg:.3f)


In [10]:
# Restore the best checkpoint written for this experiment and put both
# networks on the target device in inference mode.
CHECKPOINT_PATH = 'BEST_checkpoint_testing_experiemnts_process.pth.tar'
checkpoint = load_checkpoint(CHECKPOINT_PATH)

# The checkpoint stores the encoder and decoder objects under these keys.
decoder = checkpoint['decoder'].to(device)
encoder = checkpoint['encoder'].to(device)

# Switch both networks to evaluation mode; the trailing ';' suppresses the
# module repr in the cell output.
decoder.eval()
encoder.eval();

Loaded Checkpoint!!
Last Epoch: 1
Best Bleu-4: 13.934173482196025


In [12]:
from eval import test_score

# Score the test split with beam size 1. The call returns BLEU-1..4 and the
# METEOR score, in that order.
b1, b2, b3, b4, m = test_score(
    1,               # beam size
    encoder,
    decoder,
    IMGS_PATH,
    DATA_JSON_PATH,
    vocab,
)

EVALUATING AT BEAM SIZE 1:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 1: 100%|██████████| 5000/5000 [03:14<00:00, 25.65it/s]


----- Bleu-n Scores -----


  0%|          | 0/4995 [00:00<?, ?it/s]

1: 56.115238267802134
2: 38.15185584589656
3: 24.771081427232787
4: 16.003448564877804
-------------------------
----- METEOR Score -----


100%|██████████| 4995/4995 [00:16<00:00, 311.25it/s]

m: 0.3778662442990078





In [15]:
# Accumulator for the test-set scores gathered across beam sizes.
test_dict = {}

In [16]:
# Score the test split at several beam sizes and collect the results.
for beam_size in (1, 3, 5):
    b1, b2, b3, b4, m = test_score(beam_size, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)

    # BLEU-1..3 are recorded only for the beam-size-3 run.
    if beam_size == 3:
        test_dict.update({'b1': b1, 'b2': b2, 'b3': b3})

    # BLEU-4 is recorded for every beam size, keyed by that size.
    test_dict[f'b4-b{beam_size}'] = b4

EVALUATING AT BEAM SIZE 1:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 1: 100%|██████████| 5000/5000 [03:31<00:00, 23.60it/s]


----- Bleu-n Scores -----
1: 56.115238267802134
2: 38.15185584589656
3: 24.771081427232787
4: 16.003448564877804
-------------------------
----- METEOR Score -----


100%|██████████| 4995/4995 [00:14<00:00, 352.08it/s]


m: 0.3778662442990085


EVALUATING AT BEAM SIZE 3:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 5000/5000 [04:25<00:00, 18.81it/s]


----- Bleu-n Scores -----


  0%|          | 0/5000 [00:00<?, ?it/s]

1: 59.96753576138785
2: 41.09254161374708
3: 27.484138401378967
4: 18.16994793899022
-------------------------
----- METEOR Score -----


100%|██████████| 5000/5000 [00:13<00:00, 376.71it/s]


m: 0.3853924658437256


EVALUATING AT BEAM SIZE 5:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 5000/5000 [05:56<00:00, 14.04it/s]


----- Bleu-n Scores -----


  0%|          | 0/5000 [00:00<?, ?it/s]

1: 60.933232891919374
2: 41.813695672017104
3: 27.99716154984623
4: 18.335074227709068
-------------------------
----- METEOR Score -----


100%|██████████| 5000/5000 [00:13<00:00, 379.49it/s]

m: 0.379726242907985





In [17]:
# Display the collected test scores.
test_dict

{'b4-b1': 16.003448564877804,
 'b1': 59.96753576138785,
 'b2': 41.09254161374708,
 'b3': 27.484138401378967,
 'b4-b3': 18.16994793899022,
 'b4-b5': 18.335074227709068}

In [24]:
# final results -> different from training and validation scalars
results_dic =  {
    # train & valid
    'total_epochs': 2,
    'top5acc/valid/1': 58.8,
    'b-1/test': test_dict['b1'],
    'b-2/test': test_dict['b2'],
    'b-3/test': test_dict['b3'],
    'b-4/b3': test_dict['b4-b3'],
    'b-4/b1': test_dict['b4-b1'],
    'b-4/b5': test_dict['b4-b5']
}

In [26]:
# Record the hyper-parameters together with the final metrics so the run can
# be compared in the TensorBoard HParams view.
logger.add_hparams(logger_dic, results_dic, run_name='finetune')
# Write any events still buffered by the main writer to disk, so nothing is
# lost if the kernel is shut down right after this cell.
logger.flush()