In [46]:
# !rm -r Image-Captioning/

In [47]:
# # get the code for kaggle
!git clone https://github.com/moaaztaha/Image-Captioning
py_files_path = 'Image-Captioning/'
import sys
sys.path.append(py_files_path)

Cloning into 'Image-Captioning'...
remote: Enumerating objects: 506, done.[K
remote: Counting objects: 100% (506/506), done.[K
remote: Compressing objects: 100% (217/217), done.[K
remote: Total 506 (delta 312), reused 475 (delta 281), pack-reused 0[K
Receiving objects: 100% (506/506), 36.49 MiB | 13.91 MiB/s, done.
Resolving deltas: 100% (312/312), done.


In [48]:
# Reload imported modules automatically before each cell runs (autoreload mode 2).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [49]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [4]:
# ---- Model hyper-parameters ----
encoder_dim = 2048    # resnet101
emb_dim = 300         # size of each word embedding
attention_dim = 512   # size of the attention linear layers
decoder_dim = 512     # size of the decoder RNN
dropout = 0.5

# Device for the model and PyTorch tensors: GPU when available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Enable only when model inputs have a fixed size; otherwise it adds a lot of overhead.
cudnn.benchmark = True

# ---- Training hyper-parameters ----
epochs = 30           # maximum number of epochs (fewer if early stopping triggers)
batch_size = 256
workers = 2
encoder_lr, decoder_lr = 1e-4, 4e-4   # the encoder rate applies only when fine-tuning
fine_tune_encoder = False
pretrained_embeddings = True
fine_tune_embeddings = True
checkpoint = None     # path to a checkpoint, or None to start without one

In [5]:
# Name of this dataset/run; reused below for t_params['data_name'] and the experiment name.
DATA_NAME = 'flickr8k_tb'

# Active locations: Kaggle.
IMGS_PATH = '../input/flickr8kimagescaptions/flickr8k/images/'
DATA_JSON_PATH = '/kaggle/working/Image-Captioning/data.json'

# Alternative locations for a local run:
# DATA_JSON_PATH = 'data.json'
# IMGS_PATH = 'flickr/Images/'

# Alternative locations for Colab:
# DATA_JSON_PATH = 'Image-Captioning/data.json'
# IMGS_PATH = 'flickr8k/images/'

In [6]:
# Build the vocabulary from the captions JSON, then display how many entries it has.
vocab = build_vocab(DATA_JSON_PATH)
vocab_len = len(vocab)
vocab_len

100%|██████████| 40000/40000 [00:00<00:00, 216834.42it/s]


5089

### Pre-trained Embeddings

In [7]:
# Load the GloVe vectors: each row is a token followed by its vector components.
# quoting=3 (csv.QUOTE_NONE) keeps quote characters that occur as tokens.
# keep_default_na=False stops pandas from converting tokens such as "null", "nan"
# or "n/a" into NaN index labels, which would make those words impossible to look up.
glove = pd.read_csv(
    '../input/glove6b/glove.6B.300d.txt',
    sep=' ',
    quoting=3,
    header=None,
    index_col=0,
    keep_default_na=False,
)
# Map token -> vector row by row, without transposing the whole frame first.
glove_embedding = dict(zip(glove.index, glove.to_numpy()))

In [8]:
def create_embedding_matrix(vocab, embedding_dict, dimension, rng=None):
    """Build the initial embedding table for ``vocab``.

    Row ``i`` holds the pre-trained vector of the word that ``vocab.stoi`` maps
    to index ``i``. Words missing from ``embedding_dict`` get a small random
    vector drawn uniformly from [-0.1, 0.1).

    Args:
        vocab: object exposing ``stoi`` (mapping word -> row index) and ``len()``.
        embedding_dict: mapping word -> vector of length ``dimension``.
        dimension: width of every embedding vector.
        rng: optional ``numpy.random.Generator`` used for the random rows so the
            initialisation can be reproduced. When ``None`` (the default) the
            global ``np.random`` state is used, exactly as before.

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocab), dimension)``.
    """
    sampler = np.random if rng is None else rng
    embedding_matrix = np.zeros((len(vocab), dimension))

    for word, index in vocab.stoi.items():
        if word in embedding_dict:
            embedding_matrix[index] = embedding_dict[word]
        else:
            # No pre-trained vector for this word: start from small random values.
            embedding_matrix[index] = sampler.uniform(-.1, .1, size=dimension)
    return embedding_matrix

In [9]:
# Take the width from the configured emb_dim instead of repeating the literal 300,
# so the matrix stays in step with the 'embed_dim' handed to the model.
embedding_matrix = create_embedding_matrix(vocab, glove_embedding, emb_dim)

In [10]:
# Sanity check: the shape should be (vocabulary size, embedding dimension).
embedding_matrix.shape

(5089, 300)

In [11]:
# Training configuration handed to fit(): data locations, vocabulary and optimisation settings.
t_params = dict(
    data_name=DATA_NAME,
    imgs_path=IMGS_PATH,
    df_path=DATA_JSON_PATH,
    vocab=vocab,
    epochs=epochs,
    batch_size=batch_size,
    workers=workers,
    decoder_lr=decoder_lr,
    encoder_lr=encoder_lr,
    fine_tune_encoder=fine_tune_encoder,
    pretrained_embeddings=pretrained_embeddings,
    fine_tune_embeddings=fine_tune_embeddings,
)

# Model configuration handed to fit(): layer sizes, dropout and the initial embedding table.
m_params = dict(
    attention_dim=attention_dim,
    embed_dim=emb_dim,
    decoder_dim=decoder_dim,
    encoder_dim=encoder_dim,
    dropout=dropout,
    embeddings_matrix=embedding_matrix,
)

# Hyper-parameters recorded with the run through logger.add_hparams().
logger_dic = {
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    'max_seq_length': 100,
    'vocab_size': vocab_len,
    # Key was misspelled 'enocder', which logged the hparam under the wrong name.
    'encoder': 'resnet101',
    'dropout': dropout,
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
}

# Display the training configuration for a visual check.
t_params

{'data_name': 'flickr8k_tb',
 'imgs_path': '../input/flickr8kimagescaptions/flickr8k/images/',
 'df_path': '/kaggle/working/Image-Captioning/data.json',
 'vocab': <dataset.Vocabulary at 0x7f0388ab1990>,
 'epochs': 20,
 'batch_size': 256,
 'workers': 2,
 'decoder_lr': 0.0004,
 'encoder_lr': 0.0001,
 'fine_tune_encoder': False,
 'pretrained_embeddings': True,
 'fine_tune_embeddings': True}

In [12]:
# TensorBoard writer for this run; its log directory is experiments/<experiment name>.
log_dir = 'experiments'
name = f"{DATA_NAME}_pretrained_glove"
logger = SummaryWriter(log_dir=osp.join(log_dir, name))

In [14]:
# First training run: no checkpoint is passed; `logger` is the TensorBoard writer created above.
fit(t_params=t_params, m_params=m_params, logger=logger)

Loading Data
Dataset split: train
Unique images: 6000
Total size: 30000
Dataset split: val
Unique images: 1000
Total size: 5000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [0][0/118]	Batch Time 6.601 (6.601)	Data Load Time 5.134 (5.134)	Loss 9.4666 (9.4666)	Top-5 Accuracy 0.000 (0.000)
Epoch: [0][100/118]	Batch Time 2.796 (2.295)	Data Load Time 1.431 (0.964)	Loss 4.7633 (5.4140)	Top-5 Accuracy 54.613 (46.523)
Epoch train time 266.007 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/20]	Batch Time 7.088 (7.088)	Loss 5.5493 (5.5493)	Top-5 Accuracy 50.976 (50.976)	
----- Bleu-n Scores -----
1: 58.915703682508834
2: 32.81225167382208
3: 15.582803588027405
4: 7.495922958508829
-------------------------
----- METEOR Score -----

 * LOSS - 5.322, TOP-5 ACCURACY - 52.010, BLEU-4 - 7.49592295

In [15]:
# List the working directory to see which files the training run produced.
!ls

BEST_checkpoint_flickr8k_tb.pth.tar  checkpoint_flickr8k_tb.pth.tar
Image-Captioning		     experiments
__notebook_source__.ipynb


In [16]:
!zip -r BEST_checkpoint_flickr8k_tb.pth.tar experiments

  adding: experiments/ (stored 0%)
  adding: experiments/flickr8k_tb_pretrained_glove/ (stored 0%)
  adding: experiments/flickr8k_tb_pretrained_glove/events.out.tfevents.1624444330.49c0fa9f8e04.42.0 (deflated 67%)


In [20]:
# Long listing, to check file sizes and modification times.
!ls -l

total 559392
-rw-r--r-- 1 root root 286433971 Jun 23 11:56 BEST_checkpoint_flickr8k_tb.pth.tar
drwxr-xr-x 8 root root      4096 Jun 23 10:31 Image-Captioning
---------- 1 root root       263 Jun 23 10:30 __notebook_source__.ipynb
-rw-r--r-- 1 root root 286367733 Jun 23 11:50 checkpoint_flickr8k_tb.pth.tar
drwxr-xr-x 3 root root      4096 Jun 23 10:32 experiments


In [22]:
# Load the most recent checkpoint to inspect it; load_checkpoint prints its epoch and best BLEU-4.
m = load_checkpoint('checkpoint_flickr8k_tb.pth.tar')

Loaded Checkpoint!!
Last Epoch: 14
Best Bleu-4: 15.11688192686112


In [23]:
# Resume training from the most recent checkpoint, with the same parameters and logger.
fit(t_params=t_params, checkpoint="checkpoint_flickr8k_tb.pth.tar", m_params=m_params, logger=logger)

Loaded Checkpoint!!
Starting Epoch: 15
Loading Data
Dataset split: train
Unique images: 6000
Total size: 30000
Dataset split: val
Unique images: 1000
Total size: 5000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [15][0/118]	Batch Time 5.065 (5.065)	Data Load Time 3.622 (3.622)	Loss 3.8590 (3.8590)	Top-5 Accuracy 66.766 (66.766)
Epoch: [15][100/118]	Batch Time 3.545 (2.136)	Data Load Time 2.067 (0.830)	Loss 3.9267 (3.9038)	Top-5 Accuracy 65.809 (66.085)
Epoch train time 248.314 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/20]	Batch Time 5.339 (5.339)	Loss 4.9193 (4.9193)	Top-5 Accuracy 60.426 (60.426)	
----- Bleu-n Scores -----
1: 66.06131066911954
2: 42.05707192834203
3: 25.132566884557594
4: 15.150452886353694
-------------------------
----- METEOR Score -----

 * LOSS - 4.982, T

In [27]:
# Load the best checkpoint to inspect it before the fine-tuning stage.
m = load_checkpoint("BEST_checkpoint_flickr8k_tb.pth.tar")

Loaded Checkpoint!!
Last Epoch: 17
Best Bleu-4: 15.212723516412229


In [30]:
# Second stage: continue from the best checkpoint and fine-tune the encoder as well.
batch_size = 64  # batch size for the fine-tuning stage
fine_tune_encoder = True
checkpoint = 'BEST_checkpoint_flickr8k_tb.pth.tar'
epochs = 30

# Derive the run name from DATA_NAME rather than from the current dict value, so that
# re-running this cell cannot append "_finetune" more than once.
t_params['data_name'] = DATA_NAME + "_finetune"
t_params['fine_tune_encoder'] = fine_tune_encoder
t_params['epochs'] = epochs
# Propagate the new batch size: assigning the variable alone leaves t_params at the
# value it was built with.
t_params['batch_size'] = batch_size
t_params

{'data_name': 'flickr8k_tb_finetune',
 'imgs_path': '../input/flickr8kimagescaptions/flickr8k/images/',
 'df_path': '/kaggle/working/Image-Captioning/data.json',
 'vocab': <dataset.Vocabulary at 0x7f0388ab1990>,
 'epochs': 30,
 'batch_size': 64,
 'workers': 2,
 'decoder_lr': 0.0004,
 'encoder_lr': 0.0001,
 'fine_tune_encoder': True,
 'pretrained_embeddings': True,
 'fine_tune_embeddings': True}

In [31]:
# Fine-tuning run: start from the best checkpoint with the updated t_params.
fit(t_params=t_params, checkpoint="BEST_checkpoint_flickr8k_tb.pth.tar", m_params=m_params, logger=logger)

Loaded Checkpoint!!
Starting Epoch: 18
Loading Data
Dataset split: train
Unique images: 6000
Total size: 30000
Dataset split: val
Unique images: 1000
Total size: 5000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [18][0/469]	Batch Time 4.009 (4.009)	Data Load Time 1.145 (1.145)	Loss 3.8489 (3.8489)	Top-5 Accuracy 67.468 (67.468)
Epoch: [18][100/469]	Batch Time 0.723 (0.769)	Data Load Time 0.000 (0.012)	Loss 3.9696 (3.9997)	Top-5 Accuracy 65.306 (64.578)
Epoch: [18][200/469]	Batch Time 0.734 (0.751)	Data Load Time 0.000 (0.006)	Loss 4.1231 (3.9676)	Top-5 Accuracy 64.549 (65.143)
Epoch: [18][300/469]	Batch Time 0.774 (0.745)	Data Load Time 0.001 (0.004)	Loss 4.1054 (3.9527)	Top-5 Accuracy 62.667 (65.419)
Epoch: [18][400/469]	Batch Time 0.711 (0.742)	Data Load Time 0.000 (0.003)	Loss 3.7707 (3.9388)	Top-5 Accuracy 70.112 (65.64

In [33]:
# Inspect the best checkpoint of the first stage (prints its epoch and best BLEU-4).
m = load_checkpoint("BEST_checkpoint_flickr8k_tb.pth.tar")

Loaded Checkpoint!!
Last Epoch: 17
Best Bleu-4: 15.212723516412229


In [34]:
# Inspect the best checkpoint of the fine-tuning stage, for comparison with the first stage.
m = load_checkpoint("BEST_checkpoint_flickr8k_tb_finetune.pth.tar")

Loaded Checkpoint!!
Last Epoch: 24
Best Bleu-4: 15.533793671583021


In [35]:
# List the working directory to confirm the fine-tune checkpoints were written.
!ls

BEST_checkpoint_flickr8k_tb.pth.tar
BEST_checkpoint_flickr8k_tb_finetune.pth.tar
Image-Captioning
__notebook_source__.ipynb
checkpoint_flickr8k_tb.pth.tar
checkpoint_flickr8k_tb_finetune.pth.tar
experiments


In [36]:
# Bundle the best fine-tuned checkpoint together with the TensorBoard logs.
!zip -r pre_trained.zip BEST_checkpoint_flickr8k_tb_finetune.pth.tar experiments

  adding: BEST_checkpoint_flickr8k_tb_finetune.pth.tar (deflated 9%)
  adding: experiments/ (stored 0%)
  adding: experiments/flickr8k_tb_pretrained_glove/ (stored 0%)
  adding: experiments/flickr8k_tb_pretrained_glove/events.out.tfevents.1624444330.49c0fa9f8e04.42.0 (deflated 69%)


In [40]:
# Archive only the TensorBoard logs.
# NOTE(review): the archive name is misspelled ("experiemnts"); the FileLink cell that
# follows uses the same spelling, so rename both together or not at all.
!zip -r experiemnts_pretrained.zip experiments

  adding: experiments/ (stored 0%)
  adding: experiments/flickr8k_tb_pretrained_glove/ (stored 0%)
  adding: experiments/flickr8k_tb_pretrained_glove/events.out.tfevents.1624444330.49c0fa9f8e04.42.0 (deflated 69%)


In [41]:
# Show a link to the logs archive in the notebook output.
from IPython.display import FileLink
FileLink('experiemnts_pretrained.zip')

In [42]:
# Restore the fine-tuned decoder and encoder, move them to the configured device
# and switch both to evaluation mode.
checkpoint = load_checkpoint("BEST_checkpoint_flickr8k_tb_finetune.pth.tar")
decoder = checkpoint['decoder'].to(device)
encoder = checkpoint['encoder'].to(device)
for module in (decoder, encoder):
    module.eval()

Loaded Checkpoint!!
Last Epoch: 24
Best Bleu-4: 15.533793671583021


In [50]:
# Score the restored model once with a first argument of 1, which the progress bar
# below reports as the beam size.
from eval import test_score

# The four return values are presumably BLEU-1..BLEU-4, matching the printed scores — TODO confirm.
b1, b2, b3, b4 = test_score(1, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)

EVALUATING AT BEAM SIZE 1:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 1: 100%|██████████| 5000/5000 [04:07<00:00, 20.21it/s]


----- Bleu-n Scores -----
1: 60.712287898564234
2: 43.53202026800945
3: 29.852698297065306
4: 20.294859174878876
-------------------------


In [53]:
# Evaluate at beam sizes 1, 3 and 5. The fourth score is stored for every beam size;
# the first three are stored only for beam size 3.
test_dict = {}
for beam_size in (1, 3, 5):
    b1, b2, b3, b4 = test_score(beam_size, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)
    if beam_size == 3:
        test_dict.update(b1=b1, b2=b2, b3=b3)
    test_dict[f'b4-b{beam_size}'] = b4

EVALUATING AT BEAM SIZE 1:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 1: 100%|██████████| 5000/5000 [04:09<00:00, 20.01it/s]


----- Bleu-n Scores -----
1: 60.712287898564234
2: 43.53202026800945
3: 29.852698297065306
4: 20.294859174878876
-------------------------


EVALUATING AT BEAM SIZE 3:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 5000/5000 [04:31<00:00, 18.39it/s]


----- Bleu-n Scores -----
1: 63.953949980150846
2: 46.48314296326771
3: 32.935330253964565
4: 22.739269233648933
-------------------------


EVALUATING AT BEAM SIZE 5:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 5000/5000 [04:46<00:00, 17.46it/s]


----- Bleu-n Scores -----
1: 65.16982704730152
2: 47.3396422938947
3: 33.53530794815447
4: 23.23361748348483
-------------------------


In [54]:
# Display the collected test scores.
test_dict

{'b4-b1': 20.294859174878876,
 'b1': 63.953949980150846,
 'b2': 46.48314296326771,
 'b3': 32.935330253964565,
 'b4-b3': 22.739269233648933,
 'b4-b5': 23.23361748348483}

In [55]:
# final results -> different from training and validation scalars
# Metrics stored next to the hyper-parameters; the test scores are read from test_dict.
results_dic =  {
    # train & valid
    # NOTE(review): the next two values are hard-coded literals, not read from the run;
    # the checkpoint loaded above reports "Last Epoch: 24" — confirm they are current.
    'total_epochs': 2,
    'top5acc/valid/1': 58.8,
    'b-1/test': test_dict['b1'],
    'b-2/test': test_dict['b2'],
    'b-3/test': test_dict['b3'],
    'b-4/b3': test_dict['b4-b3'],
    'b-4/b1': test_dict['b4-b1'],
    'b-4/b5': test_dict['b4-b5']
}

In [56]:
# Record the hyper-parameters and final metrics in TensorBoard under the run name 'finetune'.
logger.add_hparams(logger_dic, results_dic, run_name='finetune')

In [57]:
# List the working directory before packaging the final logs.
!ls

BEST_checkpoint_flickr8k_tb.pth.tar
BEST_checkpoint_flickr8k_tb_finetune.pth.tar
Image-Captioning
__notebook_source__.ipynb
checkpoint_flickr8k_tb.pth.tar
checkpoint_flickr8k_tb_finetune.pth.tar
experiemnts_pretrained.zip
experiments
pre_trained.zip


In [58]:
# Archive the experiments directory again, now that the hparams run has been added to it.
!zip -r pre_trained_exp.zip experiments

  adding: experiments/ (stored 0%)
  adding: experiments/flickr8k_tb_pretrained_glove/ (stored 0%)
  adding: experiments/flickr8k_tb_pretrained_glove/finetune/ (stored 0%)
  adding: experiments/flickr8k_tb_pretrained_glove/finetune/events.out.tfevents.1624461312.49c0fa9f8e04.42.1 (deflated 45%)
  adding: experiments/flickr8k_tb_pretrained_glove/events.out.tfevents.1624444330.49c0fa9f8e04.42.0 (deflated 69%)


In [59]:
# Show a link to the final logs archive in the notebook output.
from IPython.display import FileLink
FileLink('pre_trained_exp.zip')