In [1]:
# Notebook setup: (re)load IPython's autoreload extension and put it in mode 2
# so edited project modules are re-imported before each cell executes, then
# ask matplotlib to render figures inline in the notebook output.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys

# Directory added to the import search path so the project modules imported
# by the following cell can be found (presumably models.py, dataset.py,
# utils.py and train.py live one level above this notebook -- confirm).
py_files_path = '../'

# Append only when missing: re-running this cell would otherwise add one more
# duplicate entry to sys.path each time.
if py_files_path not in sys.path:
    sys.path.append(py_files_path)

In [3]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from utils import adjust_learning_rate
from train import fit
from nltk.translate.bleu_score import corpus_bleu

In [4]:
# --- Model hyper-parameters ---
encoder_dim = 512
# Word embeddings, the attention linear layers and the decoder RNN share one width.
emb_dim = attention_dim = decoder_dim = 256
dropout = 0.5

# Device used for the model and PyTorch tensors: GPU when available, else CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Enable only when the model's inputs have a fixed size; otherwise it costs a lot of overhead.
cudnn.benchmark = True

# --- Training hyper-parameters ---
epochs = 10  # maximum number of training epochs (early stopping may end the run sooner)
batch_size, workers = 64, 4
encoder_lr, decoder_lr = 1e-4, 4e-4  # the encoder rate only matters when fine-tuning
fine_tune_encoder = False  # whether the encoder is fine-tuned
checkpoint = None  # path to a checkpoint, or None when there is none

In [5]:
# Build the vocabulary from '../data.json'. build_vocab is not defined in this
# notebook, so it presumably comes from one of the wildcard imports
# (dataset / utils) -- confirm. The same `vocab` object is later passed to
# fit() through t_params['vocab'].
vocab = build_vocab('../data.json')

100%|██████████| 30000/30000 [00:00<00:00, 355924.06it/s]


In [6]:
# Display the number of entries in the vocabulary as the cell output.
len(vocab)

4451

In [9]:
# Override the epoch count from the configuration cell for this run.
# This has to happen BEFORE t_params is built: the dict stores the value that
# `epochs` holds at construction time, so reassigning the variable afterwards
# (as this cell previously did) never reaches fit().
epochs = 100

# Training and data settings handed to fit().
# NOTE(review): `checkpoint` from the configuration cell is not forwarded
# here -- confirm whether fit() is expected to receive it.
t_params = {
    'data_name': 'flickr8k_5_cap_per_img_2_min_word_freq',
    'imgs_path': '../flickr/Images/',
    'df_path': '../data.json',
    'vocab': vocab,
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder
}

# Model hyper-parameters handed to fit().
m_params = {
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
    'dropout': dropout
}

# Start training; fit() is imported from the project's train module.
fit(t_params=t_params, m_params=m_params)

Loading Data
Dataset split: train
Unique images: 6000
Total size: 30000
Dataset split: val
Unique images: 1000
Total size: 5000
__________________________________________________
-------------------- Fitting --------------------
Epoch: [0][0/469]	Batch Time 1.247 (1.247)	Data Load Time 0.908 (0.908)	Loss 9.3561 (9.3561)	Top-5 Accuracy 0.000 (0.000)
Epoch: [0][100/469]	Batch Time 0.298 (0.318)	Data Load Time 0.000 (0.009)	Loss 6.0188 (6.6589)	Top-5 Accuracy 35.126 (29.721)
Epoch: [0][200/469]	Batch Time 0.265 (0.314)	Data Load Time 0.000 (0.005)	Loss 5.3723 (6.2308)	Top-5 Accuracy 44.607 (33.889)
Epoch: [0][300/469]	Batch Time 0.311 (0.313)	Data Load Time 0.000 (0.003)	Loss 5.4370 (5.9596)	Top-5 Accuracy 45.771 (37.649)
Epoch: [0][400/469]	Batch Time 0.299 (0.313)	Data Load Time 0.000 (0.002)	Loss 5.1125 (5.7697)	Top-5 Accuracy 50.083 (40.444)
Validation: [0/79]	Batch Time 1.375 (1.375)	Loss 5.4618 (5.4618)	Top-5 Accuracy 48.590 (48.590)	

 * LOSS - 5.756, TOP-5 ACCURACY - 46.976, BLEU-