In [6]:
# !rm -r Image-Captioning/

In [7]:
# get the code for kaggle
!git clone https://github.com/moaaztaha/Image-Captioning
py_files_path = '/kaggle/working/Image-Captioning/'
import sys
sys.path.append(py_files_path)

Cloning into 'Image-Captioning'...
remote: Enumerating objects: 526, done.
remote: Counting objects: 100% (526/526), done.
remote: Compressing objects: 100% (233/233), done.
remote: Total 526 (delta 320), reused 491 (delta 285), pack-reused 0
Receiving objects: 100% (526/526), 36.81 MiB | 16.02 MiB/s, done.
Resolving deltas: 100% (320/320), done.


In [8]:
# Load (or reload) the autoreload extension and set mode 2, which re-imports
# edited modules before each cell executes; then render matplotlib figures
# inline in the notebook output.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [9]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [10]:
# ---- Model hyper-parameters ----
encoder_dim = 2048  # encoder feature size (resnet101)
emb_dim = attention_dim = decoder_dim = 512  # word-embedding, attention-layer and decoder-RNN dimensions
dropout = 0.5

# Device for the model and PyTorch tensors: GPU when PyTorch can see one, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Only worth enabling when the model's inputs have a fixed size; otherwise it
# costs a lot of computational overhead.
cudnn.benchmark = True

# ---- Training hyper-parameters ----
epochs = 30  # number of epochs to train for (if early stopping is not triggered)
batch_size, workers = 256, 2
encoder_lr, decoder_lr = 1e-4, 4e-4  # encoder rate only matters when fine-tuning the encoder
fine_tune_encoder = pretrained_embeddings = fine_tune_embeddings = False
checkpoint = None  # path to checkpoint, None if none

In [11]:
# Kaggle locations: the data JSON sits inside the cloned repo, the images in the input dataset.
IMGS_PATH = '../input/flickr30k/images/flickr30k_images/'
DATA_JSON_PATH = '/kaggle/working/Image-Captioning/data30.json'
# Settings for a local run instead:
# DATA_JSON_PATH = 'data.json'
# IMGS_PATH = 'flickr/Images/'

# Label for this dataset/vocabulary configuration.
DATA_NAME = 'flickr30k_10k'

In [12]:
# load vocab
# Alternative kept for reference: build the vocabulary from the data JSON instead.
# vocab = build_vocab(DATA_JSON_PATH)
# Presumably reads the word list shipped in the cloned repo and wraps it in the
# project's vocabulary object (helpers come from the star imports) — confirm.
top10k_words = get_10k_vocab("./Image-Captioning/10k_words.txt")
vocab = top10k_vocab(top10k_words)
vocab_len = len(vocab)
# Bare expression so the notebook displays the vocabulary size.
vocab_len

100%|██████████| 1/1 [00:00<00:00, 111.14it/s]


10004

In [13]:
# Training settings bundle; handed to fit() as `t_params`.
t_params = dict(
    data_name=DATA_NAME,
    imgs_path=IMGS_PATH,
    df_path=DATA_JSON_PATH,
    vocab=vocab,
    epochs=epochs,
    batch_size=batch_size,
    workers=workers,
    decoder_lr=decoder_lr,
    encoder_lr=encoder_lr,
    fine_tune_encoder=fine_tune_encoder,
    pretrained_embeddings=pretrained_embeddings,
)

# Model settings bundle; handed to fit() as `m_params`.
m_params = dict(
    attention_dim=attention_dim,
    embed_dim=emb_dim,
    decoder_dim=decoder_dim,
    encoder_dim=encoder_dim,
    dropout=dropout,
)

# Flat summary of this run's hyper-parameters.
# NOTE(review): nothing in the visible notebook reads logger_dic — presumably
# intended for experiment logging; confirm any consumer expects these keys.
logger_dic = {
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    'max_seq_length': 100,
    'vocab_size': vocab_len,
    'encoder': 'resnet101',  # key was previously misspelled as 'enocder'
    'dropout': dropout,
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
}

In [14]:
# Display the training settings as a sanity check before fitting.
t_params

{'data_name': 'flickr30k_10k',
 'imgs_path': '../input/flickr30k/images/flickr30k_images/',
 'df_path': '/kaggle/working/Image-Captioning/data30.json',
 'vocab': <dataset.Vocabulary at 0x7f8f91fc0750>,
 'epochs': 30,
 'batch_size': 256,
 'workers': 2,
 'decoder_lr': 0.0004,
 'encoder_lr': 0.0001,
 'fine_tune_encoder': False,
 'pretrained_embeddings': False}

In [15]:
# experiment name
name = DATA_NAME + "_10k_words"
# path
log_dir = 'experiments'

logger = SummaryWriter(log_dir=osp.join(log_dir, name))

In [16]:
# Release unoccupied cached GPU memory held by PyTorch before the training cell runs.
torch.cuda.empty_cache()

In [None]:
# First training run: no checkpoint argument is passed, and metrics go to the
# TensorBoard writer created above.
fit(t_params=t_params, m_params=m_params, logger=logger)

Loading Data
Dataset split: train
Unique images: 29000
Total size: 145000
Dataset split: val
Unique images: 1014
Total size: 5070
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [0][0/567]	Batch Time 9.796 (9.796)	Data Load Time 5.565 (5.565)	Loss 10.0955 (10.0955)	Top-5 Accuracy 0.057 (0.057)
Epoch: [0][100/567]	Batch Time 4.151 (2.998)	Data Load Time 2.278 (1.158)	Loss 5.3575 (5.9454)	Top-5 Accuracy 47.827 (40.556)
Epoch: [0][200/567]	Batch Time 3.244 (2.879)	Data Load Time 1.512 (1.068)	Loss 4.9234 (5.5583)	Top-5 Accuracy 54.001 (45.571)
Epoch: [0][300/567]	Batch Time 2.961 (2.806)	Data Load Time 1.105 (0.978)	Loss 4.7621 (5.3522)	Top-5 Accuracy 56.495 (48.234)
Epoch: [0][400/567]	Batch Time 2.933 (2.761)	Data Load Time 1.032 (0.932)	Loss 4.8033 (5.2131)	Top-5 Accuracy 55.053 (49.994)
Epoch: [0][500/567]	Batch Time 3.232 (2

In [None]:
# save the model to gdrive
from IPython.display import FileLink
FileLink("./BEST_checkpoint_flickr30k_10k.pth.tar")

<a href='./BEST_checkpoint_flickr30k_10k.pth.tar'>download</a>

In [None]:
# Second-stage settings: resume from the best checkpoint and fine-tune the encoder.
# Model parameters
encoder_dim = 2048 # resnet101
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# training parameters
epochs = 15  # number of epochs to train for (if early stopping is not triggered)
batch_size = 32
workers = 4
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
fine_tune_encoder = True  # fine-tune encoder?
# Resume from the checkpoint this notebook's own training run exposes for
# download. The previous value named a checkpoint from a different experiment
# ("5_cap_per_img_2_min_word_freq") that no cell in this notebook creates.
# NOTE(review): confirm this is the intended starting point for fine-tuning.
checkpoint = './BEST_checkpoint_flickr30k_10k.pth.tar'  # path to checkpoint, None if none

In [None]:
# Rebuild the settings bundles so they pick up the fine-tuning values set in
# the preceding cell.
t_params = {
    'data_name': DATA_NAME,
    'imgs_path': IMGS_PATH,
    'df_path': DATA_JSON_PATH,
    'vocab': vocab,
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    # Kept in step with the first-stage t_params, which passes this key to
    # fit(); the value is the one set in the first hyper-parameter cell.
    'pretrained_embeddings': pretrained_embeddings,
}

m_params = {
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
    'dropout': dropout
}

# Bare expression so the notebook displays the settings in use.
t_params

In [None]:
# Fine-tuning run resumed from `checkpoint`. The TensorBoard writer is passed
# just as in the first fit() call, so this stage's metrics are logged too.
# NOTE(review): this writes into the same TensorBoard run directory as the
# first stage — confirm that is desired.
fit(t_params=t_params, checkpoint=checkpoint, m_params=m_params, logger=logger)