In [2]:
# Show the GPU model, driver/CUDA version and current memory usage for this session.
!nvidia-smi

Sat Jun 26 10:07:05 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# # get the code for kaggle
!git clone https://github.com/moaaztaha/Image-Captioning
py_files_path = 'Image-Captioning/'
import sys
sys.path.append(py_files_path)

Cloning into 'Image-Captioning'...
remote: Enumerating objects: 550, done.[K
remote: Counting objects: 100% (550/550), done.[K
remote: Compressing objects: 100% (250/250), done.[K
remote: Total 550 (delta 334), reused 508 (delta 292), pack-reused 0[K
Receiving objects: 100% (550/550), 38.27 MiB | 20.04 MiB/s, done.
Resolving deltas: 100% (334/334), done.


In [4]:
# (Re)load the autoreload extension; mode 2 reloads all imported modules before
# each cell executes, so edits to imported .py files take effect without a
# kernel restart.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [5]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [6]:
# ---------------------------------------------------------------------------
# Model hyper-parameters
# ---------------------------------------------------------------------------
encoder_dim = 2048                               # encoder feature size (resnet101)
emb_dim = attention_dim = decoder_dim = 512      # word embeddings / attention linear layers / decoder RNN
dropout = 0.5

# Device used for the model and PyTorch tensors: GPU when available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Enable only if inputs to the model are fixed size; otherwise it adds a lot of
# computational overhead.
cudnn.benchmark = True

# ---------------------------------------------------------------------------
# Training hyper-parameters
# ---------------------------------------------------------------------------
epochs = 30                  # number of epochs to train for (if early stopping is not triggered)
batch_size = 256
workers = 2
encoder_lr = 1e-4            # learning rate for the encoder, if fine-tuning
decoder_lr = 4e-4            # learning rate for the decoder
# Fine-tune the encoder? / use pretrained embeddings? / fine-tune embeddings?
fine_tune_encoder = pretrained_embeddings = fine_tune_embeddings = False
checkpoint = None            # path to a checkpoint, None if none

In [7]:
# Dataset tag; reused below for t_params['data_name'] and the experiment name.
DATA_NAME = 'flickr8k'

# Caption JSON and image folder. The Kaggle layout is the active one; the
# settings for the other environments are kept here for reference:
#   local : DATA_JSON_PATH = 'data.json'
#           IMGS_PATH      = 'flickr/Images/'
#   colab : DATA_JSON_PATH = 'Image-Captioning/data.json'
#           IMGS_PATH      = 'flickr8k/images/'
DATA_JSON_PATH, IMGS_PATH = (
    '/kaggle/working/Image-Captioning/data.json',
    '../input/flickr8kimagescaptions/flickr8k/images/',
)

In [8]:
# Load the caption table from the JSON file and preview its first rows.
# NOTE(review): `pd` is never imported explicitly in this notebook; presumably it
# arrives through one of the wildcard imports. Confirm, or import pandas directly.
df = pd.read_json(path_or_buf=DATA_JSON_PATH)
df.head(n=5)

Unnamed: 0,file_name,split,caption,tok_len,tokens
0,2513260012_03d33305cf.jpg,train,A black dog is running after a white dog in th...,12,"[a, black, dog, is, running, after, a, white, ..."
1,2513260012_03d33305cf.jpg,train,Black dog chasing brown dog through snow,7,"[black, dog, chasing, brown, dog, through, snow]"
2,2513260012_03d33305cf.jpg,train,Two dogs chase each other across the snowy gro...,9,"[two, dogs, chase, each, other, across, the, s..."
3,2513260012_03d33305cf.jpg,train,Two dogs play together in the snow .,7,"[two, dogs, play, together, in, the, snow]"
4,2513260012_03d33305cf.jpg,train,Two dogs running through a low lying body of w...,10,"[two, dogs, running, through, a, low, lying, b..."


In [9]:
# Largest value in the `tok_len` column (presumably tokens per caption; confirm).
df['tok_len'].max()

37

In [10]:
# load vocab
# Builds the vocabulary from the caption JSON with max_seq=40 (the longest
# tok_len shown above is 37, so presumably no caption is cut off; confirm how
# build_vocab applies max_seq).
vocab = build_vocab(DATA_JSON_PATH, max_seq=40)
# Alternative kept for reference: build the vocabulary from a fixed 10k-word list.
# top10k_words = get_10k_vocab("/content/Image-Captioning/10k_words.txt")
# vocab = top10k_vocab(top10k_words)
vocab_len = len(vocab)
# Display the vocabulary size.
vocab_len

100%|██████████| 40000/40000 [00:00<00:00, 229257.07it/s]


5089

In [11]:
# Training arguments, passed to fit() as `t_params`.
t_params = {
    'data_name': DATA_NAME,
    'imgs_path': IMGS_PATH,
    'df_path': DATA_JSON_PATH,
    'vocab': vocab,
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
}

# Model dimensions, passed to fit() as `m_params`.
m_params = {
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
    'dropout': dropout
}

# Hyper-parameters recorded in TensorBoard via logger.add_hparams().
logger_dic = {
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    # Was 100, which disagreed with the max_seq=40 passed to build_vocab above.
    # NOTE(review): confirm 40 is the sequence limit actually used in training.
    'max_seq_length': 40,
    'vocab_size': vocab_len,
    # Key was misspelled 'enocder'; older runs may have logged under that name.
    'encoder': 'resnet101',
    # Reuse the model dimensions instead of repeating them by hand, so the
    # logged values cannot drift from what the model is built with.
    **m_params,
}

In [12]:
# Display the training arguments as a sanity check before launching the run.
t_params

{'data_name': 'flickr8k',
 'imgs_path': '../input/flickr8kimagescaptions/flickr8k/images/',
 'df_path': '/kaggle/working/Image-Captioning/data.json',
 'vocab': <dataset.Vocabulary at 0x7f753ff65810>,
 'epochs': 30,
 'batch_size': 256,
 'workers': 2,
 'decoder_lr': 0.0004,
 'encoder_lr': 0.0001,
 'fine_tune_encoder': False,
 'pretrained_embeddings': False}

In [13]:
# Experiment name: dataset tag immediately followed by the experiment suffix.
# NOTE(review): this gives 'flickr8ksmall_max_seq' (no separator) and differs
# from the run_name used later in add_hparams; confirm the naming is intended.
name = f"{DATA_NAME}small_max_seq"

# Parent folder for this family of experiments.
# NOTE(review): this looks like a Colab/Google Drive path while the data paths
# above are set for Kaggle; confirm the event files end up somewhere persistent.
log_dir = '/content/drive/MyDrive/ImageCaptioning/flickr8/experiments_vanilla'

# TensorBoard writer for this experiment; event files go to <log_dir>/<name>.
logger = SummaryWriter(osp.join(log_dir, name))

In [14]:
# First training run with the arguments assembled above (original note: "with scheduler").
fit(
    logger=logger,
    m_params=m_params,
    t_params=t_params,
)

Downloading: "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth" to /root/.cache/torch/hub/checkpoints/resnet101-5d3b4d8f.pth


  0%|          | 0.00/170M [00:00<?, ?B/s]

Loading Data
Dataset split: train
Unique images: 6000
Total size: 30000
Dataset split: val
Unique images: 1000
Total size: 5000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [0][0/118]	Batch Time 9.531 (9.531)	Data Load Time 5.227 (5.227)	Loss 9.4770 (9.4770)	Top-5 Accuracy 0.134 (0.134)
Epoch: [0][100/118]	Batch Time 2.872 (2.281)	Data Load Time 1.421 (0.885)	Loss 4.8643 (5.5175)	Top-5 Accuracy 53.353 (44.620)
Epoch train time 262.880 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/20]	Batch Time 7.065 (7.065)	Loss 5.5638 (5.5638)	Top-5 Accuracy 49.249 (49.249)	
----- Bleu-n Scores -----
1: 56.85350047148461
2: 30.66176031276061
3: 13.782722962087693
4: 5.9202819152917705
-------------------------

 * LOSS - 5.445, TOP-5 ACCURACY - 51.106, BLEU-4 - 5.9202819152917705

Epoch validatio

In [15]:
# Release cached GPU memory held by PyTorch's allocator before the next run
# (memory of tensors that are still referenced is not freed by this call).
torch.cuda.empty_cache()

In [16]:
# Settings for the second run: resume from the best checkpoint of the first
# run and fine-tune the encoder with a smaller batch and a lower decoder LR.
batch_size = 64
fine_tune_encoder = True
checkpoint = 'BEST_checkpoint_flickr8k.pth.tar'
# epochs = 30  # optional override; together with t_params['epochs'] below

# The overrides are derived from the base constants (DATA_NAME, decoder_lr)
# rather than from the current contents of t_params, so re-running this cell
# does not append another "_finetune" suffix or divide the learning rate again.
t_params['batch_size'] = batch_size
t_params['data_name'] = DATA_NAME + "_finetune"
t_params['fine_tune_encoder'] = fine_tune_encoder
t_params['decoder_lr'] = decoder_lr / 10
# t_params['epochs'] = epochs
t_params

{'data_name': 'flickr8k_finetune',
 'imgs_path': '../input/flickr8kimagescaptions/flickr8k/images/',
 'df_path': '/kaggle/working/Image-Captioning/data.json',
 'vocab': <dataset.Vocabulary at 0x7f753ff65810>,
 'epochs': 30,
 'batch_size': 64,
 'workers': 2,
 'decoder_lr': 4e-05,
 'encoder_lr': 0.0001,
 'fine_tune_encoder': True,
 'pretrained_embeddings': False}

In [17]:
# Second run: resume from the saved checkpoint with the fine-tuning arguments now in t_params.
fit(t_params, checkpoint=checkpoint, m_params=m_params, logger=logger)

Loaded Checkpoint!!
Starting Epoch: 7
Loading Data
Dataset split: train
Unique images: 6000
Total size: 30000
Dataset split: val
Unique images: 1000
Total size: 5000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [7][0/469]	Batch Time 3.304 (3.304)	Data Load Time 0.980 (0.980)	Loss 4.0045 (4.0045)	Top-5 Accuracy 66.304 (66.304)
Epoch: [7][100/469]	Batch Time 0.734 (0.763)	Data Load Time 0.000 (0.010)	Loss 4.0089 (4.0392)	Top-5 Accuracy 64.371 (64.151)
Epoch: [7][200/469]	Batch Time 0.724 (0.746)	Data Load Time 0.000 (0.005)	Loss 4.0815 (3.9990)	Top-5 Accuracy 64.932 (64.777)
Epoch: [7][300/469]	Batch Time 0.712 (0.743)	Data Load Time 0.000 (0.004)	Loss 4.1129 (3.9804)	Top-5 Accuracy 61.968 (65.052)
Epoch: [7][400/469]	Batch Time 0.709 (0.741)	Data Load Time 0.000 (0.003)	Loss 3.9733 (3.9678)	Top-5 Accuracy 65.410 (65.290)
Epo

In [18]:
# Restore the best checkpoint of the fine-tuning run and put both networks in
# evaluation mode on the target device.
# NOTE(review): this rebinds `checkpoint` from a path string to the loaded
# object, so re-running the fit(...) cell afterwards would pass that object as
# the checkpoint path. Consider a separate name.
checkpoint = load_checkpoint("BEST_checkpoint_flickr8k_finetune.pth.tar")

# Assumes the stored encoder/decoder are nn.Module instances, for which
# .to(device) returns the module itself. TODO confirm.
decoder = checkpoint['decoder'].to(device)
encoder = checkpoint['encoder'].to(device)

decoder.eval()
encoder.eval();

Loaded Checkpoint!!
Last Epoch: 12
Best Bleu-4: 15.958687149759504


In [19]:
from eval import test_score

# Score the test split at beam sizes 1, 3 and 5.
# BLEU-1..3 are kept only from the beam-size-3 run; BLEU-4 is recorded for
# every beam size under the key 'b4-b<beam size>'.
test_dict = {}

for beam_size in (1, 3, 5):
    bleu1, bleu2, bleu3, bleu4 = test_score(
        beam_size, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab
    )
    if beam_size == 3:
        test_dict.update(b1=bleu1, b2=bleu2, b3=bleu3)
    test_dict[f'b4-b{beam_size}'] = bleu4

EVALUATING AT BEAM SIZE 1:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 1: 100%|██████████| 5000/5000 [03:51<00:00, 21.56it/s]


----- Bleu-n Scores -----
1: 60.587742955454296
2: 43.942940128884835
3: 30.54183515877139
4: 21.042882148918814
-------------------------


EVALUATING AT BEAM SIZE 3:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 5000/5000 [04:29<00:00, 18.55it/s]


----- Bleu-n Scores -----
1: 63.9591916813812
2: 46.76456351470242
3: 33.10728567829948
4: 23.041341382869543
-------------------------


EVALUATING AT BEAM SIZE 5:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 5000/5000 [04:45<00:00, 17.53it/s]


----- Bleu-n Scores -----
1: 65.0749640583282
2: 47.37754712439627
3: 33.51688921329188
4: 23.1951649964219
-------------------------


In [20]:

# Final metrics for the HParams summary; these are separate from the training
# and validation scalars. Every BLEU value here comes from test_dict.
# NOTE(review): total_epochs is hard-coded; confirm it matches the number of
# epochs actually trained.
results_dic = {'total_epochs': 16}

# BLEU-1..3 as stored in test_dict under 'b1'..'b3'.
results_dic.update({f'b-{n}/test': test_dict[f'b{n}'] for n in (1, 2, 3)})

# BLEU-4 per beam size (same key order as before: beam 3, 1, 5).
results_dic.update({f'b-4/b{beam}': test_dict[f'b4-b{beam}'] for beam in (3, 1, 5)})

In [21]:
# Record the hyper-parameters together with the final metrics in TensorBoard's HParams view.
# NOTE(review): logger_dic was built before the fine-tuning cell changed
# t_params, so it still holds the first-run values of fine_tune_encoder and
# decoder_lr. Confirm which configuration should be logged.
logger.add_hparams(logger_dic, results_dic, run_name='flickr_small_max_seq')

In [22]:
# List the working directory to confirm which checkpoint files were written.
!ls

BEST_checkpoint_flickr8k.pth.tar	   __notebook_source__.ipynb
BEST_checkpoint_flickr8k_finetune.pth.tar  checkpoint_flickr8k.pth.tar
Image-Captioning			   checkpoint_flickr8k_finetune.pth.tar
