In [1]:
# Notebook setup: (re)load the autoreload extension and use mode 2, which
# re-imports all modules before each cell executes, so edits to the project's
# .py files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import sys

# Directory (relative to this notebook) that is added to the module search
# path so the project imports in the following cells (models, dataset, utils,
# train) can be resolved -- presumably those .py files live there.
py_files_path = '../'

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if py_files_path not in sys.path:
    sys.path.append(py_files_path)

In [3]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from utils import adjust_learning_rate
from nltk.translate.bleu_score import corpus_bleu

In [26]:
# ---------------------------------------------------------------------------
# Runtime / backend configuration
# ---------------------------------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Only worth enabling when the model's inputs have a fixed size; otherwise
# it adds a lot of computational overhead.
cudnn.benchmark = True

# ---------------------------------------------------------------------------
# Model hyper-parameters (forwarded to fit() through m_params)
# ---------------------------------------------------------------------------
encoder_dim = 512
emb_dim = 256        # word-embedding size
attention_dim = 256  # size of the attention linear layers
decoder_dim = 256    # size of the decoder RNN
dropout = 0.5

# ---------------------------------------------------------------------------
# Training hyper-parameters (forwarded to fit() through t_params)
# ---------------------------------------------------------------------------
epochs = 1           # epochs to train for (if early stopping is not triggered)
batch_size = 32
workers = 4
decoder_lr = 4e-4    # decoder learning rate
encoder_lr = 1e-4    # encoder learning rate, relevant when fine-tuning
fine_tune_encoder = False  # whether the encoder is fine-tuned
# Path to a checkpoint, or None for no checkpoint.
# NOTE(review): not included in t_params in the visible cells -- confirm intent.
checkpoint = None

In [27]:
# load vocab
# build_vocab reaches this notebook through one of the star imports above
# (dataset / utils). It is pointed at the same JSON file that is later handed
# to fit() as 'df_path'.
# NOTE(review): the exact type/structure of the returned vocab is not visible
# here -- only that it supports len().
vocab = build_vocab('../data.json')

100%|██████████| 30000/30000 [00:00<00:00, 358207.89it/s]


In [28]:
# Quick sanity check: display the number of entries in the vocabulary.
len(vocab)

4451

In [29]:
from train import fit

In [32]:
# Model hyper-parameters, taken from the configuration cell and handed to
# fit() as m_params.
m_params = {
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
    'dropout': dropout
}

# Data locations, vocabulary and training settings handed to fit() as t_params.
t_params = {
    # data / naming
    'data_name': 'flickr8k_5_cap_per_img_2_min_word_freq',
    'imgs_path': '../flickr/Images/',
    'df_path': '../data.json',
    'vocab': vocab,
    # training schedule and data loading
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    # optimisation
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder
}

# Launch training; progress is printed to the cell output.
fit(m_params=m_params, t_params=t_params)

Loading Data
Dataset split: train
Unique images: 6000
Total size: 30000
Dataset split: val
Unique images: 1000
Total size: 5000
__________________________________________________
-------------------- Fitting --------------------
Epoch: [0][0/938]	Batch Time 0.792 (0.792)	Data Load Time 0.593 (0.593)	Loss 9.3271 (9.3271)	Top-5 Accuracy 0.687 (0.687)
Epoch: [0][100/938]	Batch Time 0.174 (0.182)	Data Load Time 0.000 (0.006)	Loss 6.1308 (6.7088)	Top-5 Accuracy 34.824 (29.400)
Epoch: [0][200/938]	Batch Time 0.157 (0.177)	Data Load Time 0.000 (0.003)	Loss 5.6063 (6.3051)	Top-5 Accuracy 41.554 (32.847)
Epoch: [0][300/938]	Batch Time 0.182 (0.176)	Data Load Time 0.000 (0.002)	Loss 5.6357 (6.0563)	Top-5 Accuracy 43.003 (36.081)
Epoch: [0][400/938]	Batch Time 0.172 (0.174)	Data Load Time 0.000 (0.002)	Loss 5.3434 (5.8833)	Top-5 Accuracy 46.724 (38.698)
Epoch: [0][500/938]	Batch Time 0.168 (0.174)	Data Load Time 0.000 (0.001)	Loss 4.9073 (5.7391)	Top-5 Accuracy 52.786 (40.882)
Epoch: [0][600/938]