In [74]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Thu Jun 24 13:15:33 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    32W /  70W |   4772MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# !mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [5]:
!pip install kaggle -q
!kaggle datasets download -d aladdinpersson/flickr8kimagescaptions
!unzip -q flickr8kimagescaptions.zip

Downloading flickr8kimagescaptions.zip to /content
 98% 1.01G/1.04G [00:06<00:00, 154MB/s]
100% 1.04G/1.04G [00:06<00:00, 163MB/s]


In [32]:
!rm -r Image-Captioning/

In [1]:
# Make the project's Python modules (kept in the Image-Captioning repository)
# importable from this notebook.
import os
import subprocess
import sys

py_files_path = 'Image-Captioning/'

# Fetch the repository when no local copy is present, so the cell also works on a
# fresh runtime instead of relying on a checkout left over from an earlier session.
if not os.path.isdir(py_files_path):
    subprocess.run(
        ['git', 'clone', 'https://github.com/moaaztaha/Image-Captioning'],
        check=True,  # fail loudly here rather than at the later imports
    )

# Re-running the cell must not stack duplicate entries on sys.path.
if py_files_path not in sys.path:
    sys.path.append(py_files_path)

In [2]:
# Load (or reload) the autoreload extension and use mode 2, which re-imports
# modules before executing each cell so edits to the project's .py files are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [3]:
# !pip3 install nltk==3.6.2
# import nltk
# nltk.download("wordnet")

In [4]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [5]:
# ---------------------------------------------------------------------------
# Model hyper-parameters
# ---------------------------------------------------------------------------
dropout = 0.5
attention_dim = 512  # size of the attention linear layers
decoder_dim = 512    # size of the decoder RNN
emb_dim = 512        # size of the word embeddings
encoder_dim = 2048   # resnet101

# Device for the model and PyTorch tensors: GPU when available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Only worth enabling when the model inputs have a fixed size; otherwise it
# causes a lot of computational overhead.
cudnn.benchmark = True

# ---------------------------------------------------------------------------
# Training hyper-parameters
# ---------------------------------------------------------------------------
checkpoint = None  # path to a checkpoint to resume from, or None
pretrained_embeddings = False
fine_tune_embeddings = False
fine_tune_encoder = False  # whether the encoder is fine-tuned
decoder_lr = 4e-4  # decoder learning rate
encoder_lr = 1e-4  # encoder learning rate (used if fine-tuning)
workers = 2
batch_size = 256
epochs = 30  # number of epochs to train for (if early stopping is not triggered)

In [6]:
DATA_NAME = 'flickr8k'

# (data JSON path, images directory) for each environment the notebook has
# path settings for.
_DATA_LOCATIONS = {
    'local': ('data.json', 'flickr/Images/'),
    'kaggle': (
        '/kaggle/working/Image-Captioning/data.json',
        '../input/flickr8kimagescaptions/flickr8k/images/',
    ),
    'colab': ('Image-Captioning/data.json', 'flickr8k/images/'),
}

# Colab is the active environment; change the key to switch path sets.
DATA_JSON_PATH, IMGS_PATH = _DATA_LOCATIONS['colab']

In [7]:
# load vocab
vocab = build_vocab(DATA_JSON_PATH)
# top10k_words = get_10k_vocab("/content/Image-Captioning/10k_words.txt")
# vocab = top10k_vocab(top10k_words)
vocab_len = len(vocab)
vocab_len

100%|██████████| 40000/40000 [00:00<00:00, 278697.31it/s]


5089

In [8]:
# Training configuration handed to fit(): dataset identity and location, the
# vocabulary, and the optimisation settings defined earlier in the notebook.
t_params = dict(
    data_name=DATA_NAME,
    imgs_path=IMGS_PATH,
    df_path=DATA_JSON_PATH,
    vocab=vocab,
    epochs=epochs,
    batch_size=batch_size,
    workers=workers,
    decoder_lr=decoder_lr,
    encoder_lr=encoder_lr,
    fine_tune_encoder=fine_tune_encoder,
    pretrained_embeddings=pretrained_embeddings,
)

# Model configuration handed to fit(): layer dimensions and dropout.
m_params = dict(
    attention_dim=attention_dim,
    embed_dim=emb_dim,
    decoder_dim=decoder_dim,
    encoder_dim=encoder_dim,
    dropout=dropout,
)

# Flat summary of the run's hyper-parameters, kept for experiment logging.
# Fix: the encoder entry was keyed 'enocder' (typo); it is now 'encoder'.
logger_dic = {
    # optimisation settings
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    # data / vocabulary
    'max_seq_length': 100,
    'vocab_size': vocab_len,
    # model architecture
    'encoder': 'resnet101',
    'dropout': dropout,
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
}

In [9]:
# Display the assembled training configuration for a quick visual check.
t_params

{'batch_size': 256,
 'data_name': 'flickr8k',
 'decoder_lr': 0.0004,
 'df_path': 'Image-Captioning/data.json',
 'encoder_lr': 0.0001,
 'epochs': 30,
 'fine_tune_encoder': False,
 'imgs_path': 'flickr8k/images/',
 'pretrained_embeddings': False,
 'vocab': <dataset.Vocabulary at 0x7f851678a590>,
 'workers': 2}

In [10]:
# experiment name
name = DATA_NAME + ""
# path
log_dir = '/content/drive/MyDrive/ImageCaptioning/flickr8/experiments_vanilla'

logger = SummaryWriter(log_dir=osp.join(log_dir, name))

In [82]:
# with scheduler
fit(t_params=t_params, m_params=m_params, logger=logger)

Loading Data
Dataset split: train
Unique images: 6000
Total size: 30000
Dataset split: val
Unique images: 1000
Total size: 5000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [0][0/118]	Batch Time 6.178 (6.178)	Data Load Time 3.764 (3.764)	Loss 9.4634 (9.4634)	Top-5 Accuracy 0.066 (0.066)
Epoch: [0][100/118]	Batch Time 2.468 (2.495)	Data Load Time 0.008 (0.041)	Loss 4.8366 (5.5073)	Top-5 Accuracy 53.386 (44.615)
Epoch train time 291.381 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/20]	Batch Time 5.966 (5.966)	Loss 5.5111 (5.5111)	Top-5 Accuracy 51.320 (51.320)	
----- Bleu-n Scores -----
1: 59.34414293382591
2: 34.05497832528091
3: 17.225916095691
4: 8.844724455255006
-------------------------

 * LOSS - 5.426, TOP-5 ACCURACY - 50.979, BLEU-4 - 8.844724455255006

Epoch validation tim

In [16]:
# Release cached, currently unused GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [104]:
# Copy the best baseline checkpoint into the Drive folder so it outlives the runtime.
!cp /content/BEST_checkpoint_flickr8k.pth.tar /content/drive/MyDrive/ImageCaptioning/flickr8

In [11]:
# Fine-tuning run: resume from the best baseline checkpoint with the encoder
# being fine-tuned, a smaller batch size and a 10x lower decoder learning rate.
batch_size = 64
fine_tune_encoder = True
checkpoint = '/content/BEST_checkpoint_flickr8k.pth.tar'
# epochs = 30

# Every overridden entry is derived from the base constants (DATA_NAME,
# decoder_lr) rather than from the current contents of t_params. Re-running
# the cell is therefore idempotent: the name never becomes
# "flickr8k_finetune_finetune" and the decoder LR is never divided by 10 twice.
t_params['batch_size'] = batch_size
t_params['data_name'] = DATA_NAME + "_finetune"
t_params['fine_tune_encoder'] = fine_tune_encoder
t_params['decoder_lr'] = decoder_lr / 10
# t_params['epochs'] = epochs
t_params

{'batch_size': 64,
 'data_name': 'flickr8k_finetune',
 'decoder_lr': 4e-05,
 'df_path': 'Image-Captioning/data.json',
 'encoder_lr': 0.0001,
 'epochs': 30,
 'fine_tune_encoder': True,
 'imgs_path': 'flickr8k/images/',
 'pretrained_embeddings': False,
 'vocab': <dataset.Vocabulary at 0x7f851678a590>,
 'workers': 2}

In [12]:
# Resume training from the checkpoint path set above, using the fine-tuning
# configuration now stored in t_params.
fit(t_params, checkpoint=checkpoint, m_params=m_params, logger=logger)

Loaded Checkpoint!!
Starting Epoch: 8
Loading Data
Dataset split: train
Unique images: 6000
Total size: 30000
Dataset split: val
Unique images: 1000
Total size: 5000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch: [8][0/469]	Batch Time 4.394 (4.394)	Data Load Time 1.075 (1.075)	Loss 3.9341 (3.9341)	Top-5 Accuracy 65.668 (65.668)
Epoch: [8][100/469]	Batch Time 1.355 (1.366)	Data Load Time 0.001 (0.012)	Loss 4.0626 (4.0607)	Top-5 Accuracy 63.625 (63.763)
Epoch: [8][200/469]	Batch Time 1.341 (1.348)	Data Load Time 0.001 (0.007)	Loss 4.0942 (4.0225)	Top-5 Accuracy 64.295 (64.396)
Epoch: [8][300/469]	Batch Time 1.316 (1.341)	Data Load Time 0.002 (0.005)	Loss 3.9342 (3.9926)	Top-5 Accuracy 64.673 (64.900)
Epoch: [8][400/469]	Batch Time 1.315 (1.337)	Data Load Time 0.003 (0.004)	Loss 3.8777 (3.9742)	Top-5 Accuracy 65.847 (65.196)
Epoch train time 627.633 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/79]	Batch Time 1.560 (1.560)	Loss 5.0039 (5.0039)	Top-5 Accuracy 59.468 (59.468)	
----- Bleu-n Scores -----
1: 66.23301406134856
2: 42.47408427762649
3: 25.16285785882184
4: 14.916057098634305
-------------------------

 * LOSS - 5.018, TOP-5 ACCURACY - 59.7

In [13]:
# Copy the best fine-tuned checkpoint into the Drive folder so it outlives the runtime.
!cp BEST_checkpoint_flickr8k_finetune.pth.tar /content/drive/MyDrive/ImageCaptioning/flickr8 

In [14]:
# Restore the fine-tuned encoder/decoder pair from the best checkpoint, move
# both to the active device and switch them to evaluation mode.
checkpoint = load_checkpoint("BEST_checkpoint_flickr8k_finetune.pth.tar")

decoder = checkpoint['decoder'].to(device)
encoder = checkpoint['encoder'].to(device)

decoder.eval()
encoder.eval();  # trailing ';' suppresses the module repr in the cell output

Loaded Checkpoint!!
Last Epoch: 12
Best Bleu-4: 15.97917426288958


In [15]:
from eval import test_score

# Test-split scores collected across beam sizes:
#   'b1', 'b2', 'b3' -> first three scores returned at beam size 3
#   'b4-b<k>'        -> fourth score returned at beam size k
test_dict = {}

for beam_size in (1, 3, 5):
    b1, b2, b3, b4 = test_score(
        beam_size, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab
    )

    # The lower-order scores are only kept for the middle beam size.
    if beam_size == 3:
        test_dict.update(b1=b1, b2=b2, b3=b3)

    test_dict[f'b4-b{beam_size}'] = b4

  cpuset_checked))
EVALUATING AT BEAM SIZE 1:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
EVALUATING AT BEAM SIZE 1: 100%|██████████| 5000/5000 [02:50<00:00, 29.26it/s]


----- Bleu-n Scores -----
1: 60.22812266267764
2: 43.382614808552674
3: 29.616014800242358
4: 19.937961468711514
-------------------------


  cpuset_checked))
EVALUATING AT BEAM SIZE 3:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 5000/5000 [03:13<00:00, 25.87it/s]


----- Bleu-n Scores -----
1: 64.08114558472555
2: 46.50725094600124
3: 32.63582641164054
4: 22.48417748427286
-------------------------


  cpuset_checked))
EVALUATING AT BEAM SIZE 5:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 5000/5000 [03:40<00:00, 22.65it/s]


----- Bleu-n Scores -----
1: 65.32056619483764
2: 47.5853480841862
3: 33.54273081013491
4: 23.093440505233385
-------------------------
