In [1]:
# Show the GPU, driver and CUDA version available to this session.
!nvidia-smi

Sat Jul 17 13:05:53 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# get the code form github
!git clone https://github.com/moaaztaha/Image-Captioning
py_files_path = 'Image-Captioning/'
import sys
sys.path.append(py_files_path)

Cloning into 'Image-Captioning'...
remote: Enumerating objects: 679, done.[K
remote: Counting objects: 100% (679/679), done.[K
remote: Compressing objects: 100% (336/336), done.[K
remote: Total 679 (delta 412), reused 597 (delta 330), pack-reused 0[K
Receiving objects: 100% (679/679), 43.13 MiB | 16.39 MiB/s, done.
Resolving deltas: 100% (412/412), done.


In [3]:
# Enable the autoreload extension in mode 2 so edited project modules are
# re-imported automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [4]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [5]:
# Model parameters (collected into m_params further down)
encoder_dim = 2048 # encoder feature size; the original note ties this to resnet101
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5  # dropout value handed to the model through m_params
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available, otherwise the CPU
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# Training parameters (collected into t_params further down)
epochs = 30  # number of epochs to train for (if early stopping is not triggered)
batch_size = 256  # lowered to 64 later for the fine-tuning stage
workers = 2  # forwarded to fit() as t_params['workers']
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
fine_tune_encoder = False  # fine-tune encoder? (switched on later for the second training stage)
pretrained_embeddings = False
fine_tune_embeddings = False  # NOTE(review): not referenced by any other visible cell — confirm it is still needed
checkpoint = None  # path to checkpoint, None if none

In [6]:
# Dataset identifier; it is stored in t_params['data_name'] and also shows up
# in the checkpoint file names listed later in the notebook.
DATA_NAME = 'flickr8k_ar_arabert'

# Locations of the caption JSON and the image directory. Only the Kaggle
# paths are active; the commented-out pairs are alternatives for other setups.
# local
# DATA_JSON_PATH = 'ar_data.json'
# IMGS_PATH = 'flickr/Images/'
# kaggle paths
DATA_JSON_PATH = '/kaggle/working/Image-Captioning/ar_data.json'
IMGS_PATH = '../input/flickr8kimagescaptions/flickr8k/images/'
# colab
# DATA_JSON_PATH = 'Image-Captioning/ar_data.json'
# IMGS_PATH = 'flickr8k/images/'

In [7]:
# Build the caption vocabulary from the data JSON; max_seq is forwarded to
# build_vocab and is also logged later as 'max_seq_length'.
max_seq = 65
vocab = build_vocab(DATA_JSON_PATH, max_seq=max_seq)
vocab_len = len(vocab)
vocab_len  # last expression -> shown as the cell output

100%|██████████| 24000/24000 [00:00<00:00, 145193.80it/s]


3309

In [8]:
# Peek at the first ten entries of the vocabulary's index -> token mapping.
list(vocab.itos.keys())[:10], list(vocab.itos.values())[:10]

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 ['<pad>', '<sos>', '<eos>', '<unk>', '+', 'ة', 'طفل', 'صغير', 'تتسلق', 'إلى'])

In [9]:
# Training settings handed to fit().
t_params = {
    'data_name': DATA_NAME,
    'imgs_path': IMGS_PATH,
    'df_path': DATA_JSON_PATH,
    'vocab': vocab,
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
}

# Model dimensions and dropout handed to fit().
m_params = {
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
    'dropout': dropout
}

# Hyperparameters recorded next to the final metrics by logger.add_hparams().
logger_dic = {
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    'max_seq_length': max_seq,
    'vocab_size': vocab_len,
    # Key was misspelled 'enocder', which would have been logged under that
    # name in the hparams table.
    'encoder': 'resnet101',
    'dropout': dropout,
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
}

In [10]:
# Display the training configuration that will be passed to fit().
t_params

{'data_name': 'flickr8k_ar_arabert',
 'imgs_path': '../input/flickr8kimagescaptions/flickr8k/images/',
 'df_path': '/kaggle/working/Image-Captioning/ar_data.json',
 'vocab': <dataset.Vocabulary at 0x7f3160623990>,
 'epochs': 30,
 'batch_size': 256,
 'workers': 2,
 'decoder_lr': 0.0004,
 'encoder_lr': 0.0001,
 'fine_tune_encoder': False,
 'pretrained_embeddings': False}

In [11]:
# TensorBoard writer for this experiment; events are written to <log_dir>/<name>.
log_dir = 'experiments'  # root directory for all runs
name = f"{DATA_NAME}"  # experiment name (currently just the dataset name)

logger = SummaryWriter(log_dir=osp.join(log_dir, name))

In [12]:
# First training stage: run fit() with the settings above (t_params has
# fine_tune_encoder=False here) and log to the TensorBoard writer.
fit(t_params=t_params, m_params=m_params, logger=logger)

Downloading: "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth" to /root/.cache/torch/hub/checkpoints/resnet101-5d3b4d8f.pth


  0%|          | 0.00/170M [00:00<?, ?B/s]

Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [0][0/71]	Batch Time 10.593 (10.593)	Data Load Time 5.750 (5.750)	Loss 9.0160 (9.0160)	Top-5 Accuracy 0.227 (0.227)
Epoch train time 201.116 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/12]	Batch Time 7.353 (7.353)	Loss 5.0295 (5.0295)	Top-5 Accuracy 56.101 (56.101)	
----- Bleu-n Scores -----
1: 67.60305065467168
2: 47.40331747437025
3: 27.147164303095288
4: 16.173766776214396
-------------------------

 * LOSS - 5.143, TOP-5 ACCURACY - 54.776, BLEU-4 - 16.173766776214396

Epoch validation time 42.291 (epoch_time.avg:.3f)
__________________________________________________
-------------------- Training ---------

In [13]:
# List the working directory to confirm the checkpoint files were written.
!ls

BEST_checkpoint_flickr8k_ar_arabert.pth.tar
Image-Captioning
__notebook_source__.ipynb
checkpoint_flickr8k_ar_arabert.pth.tar
experiments


In [14]:
# Load the best first-stage checkpoint; load_checkpoint prints its summary
# (last epoch and best BLEU-4).
# NOTE(review): `m` is not used by any later visible cell — confirm it is needed.
m = load_checkpoint("BEST_checkpoint_flickr8k_ar_arabert.pth.tar")

Loaded Checkpoint!!
Last Epoch: 15
Best Bleu-4: 24.829548987868076


In [15]:
# Second training stage: fine-tune the encoder, resuming from the best
# first-stage checkpoint with a smaller batch size and decoder learning rate.
batch_size = 64
fine_tune_encoder = True
checkpoint = 'BEST_checkpoint_flickr8k_ar_arabert.pth.tar'
# epochs = 30

# Every changed setting is derived from the base constants (DATA_NAME,
# decoder_lr), not from the current t_params values. Re-running this cell
# therefore gives the same result instead of appending "_finetune" again or
# dividing the learning rate by 10 a second time.
t_params['batch_size'] = batch_size
t_params['data_name'] = DATA_NAME + "_finetune"
t_params['fine_tune_encoder'] = fine_tune_encoder
t_params['decoder_lr'] = decoder_lr / 10
# t_params['epochs'] = epochs
t_params

{'data_name': 'flickr8k_ar_arabert_finetune',
 'imgs_path': '../input/flickr8kimagescaptions/flickr8k/images/',
 'df_path': '/kaggle/working/Image-Captioning/ar_data.json',
 'vocab': <dataset.Vocabulary at 0x7f3160623990>,
 'epochs': 30,
 'batch_size': 64,
 'workers': 2,
 'decoder_lr': 4e-05,
 'encoder_lr': 0.0001,
 'fine_tune_encoder': True,
 'pretrained_embeddings': False}

In [16]:
# Resume from the saved checkpoint path and keep training with the updated
# fine-tuning settings in t_params.
fit(
    t_params=t_params,
    m_params=m_params,
    checkpoint=checkpoint,
    logger=logger,
)

Loaded Checkpoint!!
Starting Epoch: 16
Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [16][0/282]	Batch Time 3.986 (3.986)	Data Load Time 1.176 (1.176)	Loss 3.1119 (3.1119)	Top-5 Accuracy 75.024 (75.024)
Epoch: [16][100/282]	Batch Time 0.864 (0.917)	Data Load Time 0.001 (0.012)	Loss 3.2319 (3.3089)	Top-5 Accuracy 72.364 (71.655)
Epoch: [16][200/282]	Batch Time 0.855 (0.902)	Data Load Time 0.001 (0.006)	Loss 3.1731 (3.2915)	Top-5 Accuracy 73.360 (71.972)
Epoch train time 253.141 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/47]	Batch Time 1.757 (1.757)	Loss 4.7189 (4.7189)	Top-5 Accuracy 63.591 (63.591)	
----- Bleu-n Scores -----
1: 71.98438051630166
2: 53.47

In [17]:
# List the working directory to confirm the "_finetune" checkpoints were written.
!ls

BEST_checkpoint_flickr8k_ar_arabert.pth.tar
BEST_checkpoint_flickr8k_ar_arabert_finetune.pth.tar
Image-Captioning
__notebook_source__.ipynb
checkpoint_flickr8k_ar_arabert.pth.tar
checkpoint_flickr8k_ar_arabert_finetune.pth.tar
experiments


In [19]:
# Load the best fine-tuned checkpoint and put both networks on `device` in
# eval mode for testing.
# The loaded object gets its own name: `checkpoint` holds the checkpoint *path*
# given to the fine-tuning fit() call, and rebinding it to the loaded object
# would make a re-run of that cell receive the wrong kind of value.
best_checkpoint = load_checkpoint("BEST_checkpoint_flickr8k_ar_arabert_finetune.pth.tar")
decoder = best_checkpoint['decoder'].to(device)
decoder.eval()
encoder = best_checkpoint['encoder'].to(device)
encoder.eval();  # trailing ';' keeps the module repr out of the cell output

Loaded Checkpoint!!
Last Epoch: 17
Best Bleu-4: 25.49970489970383


In [20]:
from eval import test_score

# Score the test split at beam sizes 1, 3 and 5. BLEU-4 is stored for every
# beam size; BLEU-1..3 are stored only for beam size 3.
test_dict = {}

for beam_size in (1, 3, 5):
    bleu1, bleu2, bleu3, bleu4 = test_score(
        beam_size, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab
    )
    if beam_size == 3:
        test_dict.update(b1=bleu1, b2=bleu2, b3=bleu3)

    test_dict[f'b4-b{beam_size}'] = bleu4

EVALUATING AT BEAM SIZE 1:   0%|          | 0/3000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 1: 100%|██████████| 3000/3000 [04:06<00:00, 12.17it/s]


----- Bleu-n Scores -----
1: 59.263331500824634
2: 45.44429489890518
3: 33.15118752258775
4: 24.110450138582106
-------------------------


EVALUATING AT BEAM SIZE 3:   0%|          | 0/3000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 3000/3000 [04:48<00:00, 10.42it/s]


----- Bleu-n Scores -----
1: 60.96960141888567
2: 48.01618913794198
3: 36.05148885101262
4: 27.008806858539085
-------------------------


EVALUATING AT BEAM SIZE 5:   0%|          | 0/3000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 3000/3000 [05:16<00:00,  9.47it/s]


----- Bleu-n Scores -----
1: 60.6381407289413
2: 48.33520707341749
3: 36.96943682295309
4: 28.276851460713115
-------------------------


In [21]:
# List the working directory again before exporting the checkpoint.
!ls

BEST_checkpoint_flickr8k_ar_arabert.pth.tar
BEST_checkpoint_flickr8k_ar_arabert_finetune.pth.tar
Image-Captioning
__notebook_source__.ipynb
checkpoint_flickr8k_ar_arabert.pth.tar
checkpoint_flickr8k_ar_arabert_finetune.pth.tar
experiments


In [22]:
# Show a link to the fine-tuned checkpoint file in the cell output.
# Note: this only displays a link; nothing in this cell copies the file to Google Drive.
from IPython.display import FileLink
FileLink("BEST_checkpoint_flickr8k_ar_arabert_finetune.pth.tar")

In [25]:
# Display the collected test BLEU scores.
test_dict

{'b4-b1': 24.110450138582106,
 'b1': 60.96960141888567,
 'b2': 48.01618913794198,
 'b3': 36.05148885101262,
 'b4-b3': 27.008806858539085,
 'b4-b5': 28.276851460713115}

In [26]:
# final results -> different from training and validation scalars
# Metrics logged next to the hyperparameters by logger.add_hparams().
results_dic =  {
    # train & valid
    # NOTE(review): 'total_epochs' is a hand-entered constant, not read from the
    # run — confirm what 5.653 measures and where it comes from.
    'total_epochs': 5.653,
    # test-set BLEU-1..3 (test_dict stores these for beam size 3 only)
    'b-1/test': test_dict['b1'],
    'b-2/test': test_dict['b2'],
    'b-3/test': test_dict['b3'],
    # test-set BLEU-4 at beam sizes 3, 1 and 5
    'b-4/b3': test_dict['b4-b3'],
    'b-4/b1': test_dict['b4-b1'],
    'b-4/b5': test_dict['b4-b5']
}

In [27]:
# Record the hyperparameters together with the final test metrics as one hparams run.
# NOTE(review): run_name says "30maxlen" while max_seq is set to 65 earlier in the
# notebook — confirm the intended run name.
# NOTE(review): logger_dic is built before the fine-tuning cell changes decoder_lr and
# fine_tune_encoder in t_params, so the values logged here are the first-stage
# settings while the metrics come from the fine-tuned checkpoint — confirm intent.
logger.add_hparams(logger_dic, results_dic, run_name='Arabic_30maxlen')