In [1]:
!nvidia-smi

Fri Jul  2 08:08:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# get the code form github
!git clone https://github.com/moaaztaha/Image-Captioning
py_files_path = 'Image-Captioning/'
import sys
sys.path.append(py_files_path)

Cloning into 'Image-Captioning'...
remote: Enumerating objects: 617, done.[K
remote: Counting objects: 100% (617/617), done.[K
remote: Compressing objects: 100% (298/298), done.[K
remote: Total 617 (delta 370), reused 555 (delta 308), pack-reused 0[K
Receiving objects: 100% (617/617), 38.54 MiB | 19.27 MiB/s, done.
Resolving deltas: 100% (370/370), done.


In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [20]:
# Model parameters
encoder_dim = 2048 # resnet101
emb_dim = 300  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# training parameters
epochs = 30  # number of epochs to train for (if early stopping is not triggered)
batch_size = 256
workers = 2
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
fine_tune_encoder = False  # fine-tune encoder?
pretrained_embeddings = True
fine_tune_embeddings = True
checkpoint = None  # path to checkpoint, None if none


In [21]:
DATA_NAME = 'flickr8k_ar_pretrained'

# local
# DATA_JSON_PATH = 'ar_data.json'
# IMGS_PATH = 'flickr/Images/'
# kaggle paths
# DATA_JSON_PATH = '/kaggle/working/Image-Captioning/data.json'
IMGS_PATH = '../input/flickr8kimagescaptions/flickr8k/images/'
#colab
DATA_JSON_PATH = 'Image-Captioning/ar_data.json'
# IMGS_PATH = 'flickr8k/images/'

In [22]:
max_seq = 30
vocab = build_vocab(DATA_JSON_PATH, max_seq=max_seq)
vocab_len = len(vocab); vocab_len

100%|██████████| 24000/24000 [00:00<00:00, 275327.45it/s]


5788

In [8]:
list(vocab.itos.keys())[:10], list(vocab.itos.values())[:10]

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 ['<pad>',
  '<sos>',
  '<eos>',
  '<unk>',
  'طفلة',
  'صغيرة',
  'تتسلق',
  'إلى',
  'كلب',
  'أسود'])

### Pre-trained Arabic Embeddings

In [9]:
# downloading arabic cbow pretrained word embedings
! wget https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_cbow_300_wiki.zip
! unzip -q full_grams_cbow_300_wiki.zip

--2021-07-02 08:09:07--  https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_cbow_300_wiki.zip
Resolving bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)... 108.61.0.122, 2001:19f0:0:22::100
Connecting to bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)|108.61.0.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1491895880 (1.4G) [application/zip]
Saving to: ‘full_grams_cbow_300_wiki.zip’


2021-07-02 08:09:41 (42.2 MB/s) - ‘full_grams_cbow_300_wiki.zip’ saved [1491895880/1491895880]



In [10]:
import gensim

model = gensim.models.Word2Vec.load("./full_grams_cbow_300_wiki.mdl")

In [11]:
model.wv.save_word2vec_format("aravec.txt")

In [12]:
import numpy as np

In [13]:
def get_weights(embedding_path):
    embeddings_index = {}
    with open(embedding_path) as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs
    print("Found %s word vectors." % len(dict(embeddings_index)))
    
    num_tokens = len(vocab)
    embedding_dim = 300
    hits = 0
    misses = 0
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, index in tqdm(vocab.stoi.items()):
        if word in embeddings_index:
            embedding_matrix[index] = embeddings_index[word]
            hits+=1
        else:
            misses+=1
            embedding_matrix[index] = np.random.uniform(-.1, .1, size=embedding_dim)
    print("Hist:", hits, " | Misses:", misses)
    return embedding_matrix

In [14]:
embedding_matrix = get_weights("./aravec.txt")

100%|██████████| 5788/5788 [00:00<00:00, 141684.41it/s]

Found 662110 word vectors.
Hist: 3746  | Misses: 2042





In [15]:
embedding_matrix.shape

(5788, 300)

In [16]:
len(vocab.itos)

5788

In [23]:
t_params = {
    'data_name': DATA_NAME,
    'imgs_path': IMGS_PATH,
    'df_path': DATA_JSON_PATH,
    'vocab': vocab,
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    'fine_tune_embeddings': fine_tune_embeddings,
}

m_params = {
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
    'dropout': dropout,
    'embeddings_matrix': embedding_matrix
}

logger_dic = {
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    'max_seq_length': max_seq,
    'vocab_size': vocab_len,
    'enocder': 'resnet101',
    'dropout': dropout,
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim 
    
}


t_params

{'data_name': 'flickr8k_ar_pretrained',
 'imgs_path': '../input/flickr8kimagescaptions/flickr8k/images/',
 'df_path': 'Image-Captioning/ar_data.json',
 'vocab': <dataset.Vocabulary at 0x7fde74a16250>,
 'epochs': 30,
 'batch_size': 256,
 'workers': 2,
 'decoder_lr': 0.0004,
 'encoder_lr': 0.0001,
 'fine_tune_encoder': False,
 'pretrained_embeddings': True,
 'fine_tune_embeddings': True}

In [24]:
# experiment name
name = DATA_NAME + "pretrained"
# path
log_dir = 'experiments'

logger = SummaryWriter(log_dir=osp.join(log_dir, name))

In [25]:
fit(t_params=t_params, m_params=m_params, logger=logger)

Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [0][0/71]	Batch Time 5.810 (5.810)	Data Load Time 4.532 (4.532)	Loss 9.6299 (9.6299)	Top-5 Accuracy 0.044 (0.044)
Epoch train time 156.404 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/12]	Batch Time 6.694 (6.694)	Loss 6.2604 (6.2604)	Top-5 Accuracy 38.759 (38.759)	
----- Bleu-n Scores -----
1: 33.507602628872206
2: 14.928633565033495
3: 5.965244780730277
4: 2.3890386455816652
-------------------------

 * LOSS - 6.265, TOP-5 ACCURACY - 38.979, BLEU-4 - 2.3890386455816652

Epoch validation time 33.433 (epoch_time.avg:.3f)
__________________________________________________
-------------------- Training ----------

In [26]:
!ls

BEST_checkpoint_flickr8k_ar_pretrained.pth.tar
Image-Captioning
__notebook_source__.ipynb
aravec.txt
checkpoint_flickr8k_ar_pretrained.pth.tar
experiments
full_grams_cbow_300_wiki.mdl
full_grams_cbow_300_wiki.mdl.trainables.syn1neg.npy
full_grams_cbow_300_wiki.mdl.wv.vectors.npy
full_grams_cbow_300_wiki.zip


In [28]:
m = load_checkpoint("BEST_checkpoint_flickr8k_ar_pretrained.pth.tar")

Loaded Checkpoint!!
Last Epoch: 5
Best Bleu-4: 5.68707610237029


In [29]:
batch_size = 64
fine_tune_encoder = True
checkpoint = 'BEST_checkpoint_flickr8k_ar_pretrained.pth.tar'
# epochs = 30

t_params['batch_size'] = batch_size
t_params['data_name'] = t_params['data_name'] + "_finetune" 
t_params['fine_tune_encoder'] = True
t_params['decoder_lr'] = t_params['decoder_lr'] / 10
# t_params['epochs'] = epochs
t_params

{'data_name': 'flickr8k_ar_pretrained_finetune',
 'imgs_path': '../input/flickr8kimagescaptions/flickr8k/images/',
 'df_path': 'Image-Captioning/ar_data.json',
 'vocab': <dataset.Vocabulary at 0x7fde74a16250>,
 'epochs': 30,
 'batch_size': 64,
 'workers': 2,
 'decoder_lr': 4e-05,
 'encoder_lr': 0.0001,
 'fine_tune_encoder': True,
 'pretrained_embeddings': True,
 'fine_tune_embeddings': True}

In [30]:
fit(t_params, checkpoint=checkpoint, m_params=m_params, logger=logger)

Loaded Checkpoint!!
Starting Epoch: 6
Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [6][0/282]	Batch Time 3.223 (3.223)	Data Load Time 1.000 (1.000)	Loss 4.9358 (4.9358)	Top-5 Accuracy 52.335 (52.335)
Epoch: [6][100/282]	Batch Time 0.696 (0.692)	Data Load Time 0.000 (0.010)	Loss 4.8724 (4.8522)	Top-5 Accuracy 53.616 (52.897)
Epoch: [6][200/282]	Batch Time 0.658 (0.677)	Data Load Time 0.005 (0.006)	Loss 4.9288 (4.8126)	Top-5 Accuracy 50.091 (53.521)
Epoch train time 189.811 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/47]	Batch Time 1.602 (1.602)	Loss 5.7948 (5.7948)	Top-5 Accuracy 46.699 (46.699)	
----- Bleu-n Scores -----
1: 43.34978929433547
2: 23.210977

In [31]:
!ls

BEST_checkpoint_flickr8k_ar_pretrained.pth.tar
BEST_checkpoint_flickr8k_ar_pretrained_finetune.pth.tar
Image-Captioning
__notebook_source__.ipynb
aravec.txt
checkpoint_flickr8k_ar_pretrained.pth.tar
checkpoint_flickr8k_ar_pretrained_finetune.pth.tar
experiments
full_grams_cbow_300_wiki.mdl
full_grams_cbow_300_wiki.mdl.trainables.syn1neg.npy
full_grams_cbow_300_wiki.mdl.wv.vectors.npy
full_grams_cbow_300_wiki.zip


In [35]:
from IPython.display import FileLink
FileLink("BEST_checkpoint_flickr8k_ar_pretrained_finetune.pth.tar")

In [37]:
checkpoint = load_checkpoint("BEST_checkpoint_flickr8k_ar_pretrained_finetune.pth.tar")
decoder = checkpoint['decoder']
decoder = decoder.to(device)
decoder.eval()
encoder = checkpoint['encoder']
encoder = encoder.to(device)
encoder.eval();

Loaded Checkpoint!!
Last Epoch: 7
Best Bleu-4: 6.671753110127286


In [38]:
from eval import test_score

test_dict = {}

for i in [1, 3, 5]:
    
    b1, b2, b3, b4 = test_score(i, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)
    if i == 3:
        test_dict['b1'] = b1
        test_dict['b2'] = b2
        test_dict['b3'] = b3
    
    test_dict[f'b4-b{i}'] = b4

EVALUATING AT BEAM SIZE 1:   0%|          | 0/3000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 1: 100%|██████████| 3000/3000 [02:01<00:00, 24.72it/s]


----- Bleu-n Scores -----
1: 38.98180535630877
2: 25.100887490307773
3: 15.08706464380532
4: 9.0263118611082
-------------------------


EVALUATING AT BEAM SIZE 3:   0%|          | 0/3000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 3000/3000 [02:07<00:00, 23.62it/s]


----- Bleu-n Scores -----


EVALUATING AT BEAM SIZE 5:   0%|          | 0/3000 [00:00<?, ?it/s]

1: 39.78043797747967
2: 26.43180951180445
3: 16.23051908803009
4: 9.823807664028124
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 3000/3000 [02:12<00:00, 22.70it/s]


----- Bleu-n Scores -----
1: 38.02362652557281
2: 25.166121201171286
3: 15.394174456280037
4: 9.227210386713303
-------------------------


In [39]:
test_dict

{'b4-b1': 9.0263118611082,
 'b1': 39.78043797747967,
 'b2': 26.43180951180445,
 'b3': 16.23051908803009,
 'b4-b3': 9.823807664028124,
 'b4-b5': 9.227210386713303}

In [40]:
# final results -> different from training and validation scalars
results_dic =  {
    # train & valid
    'total_epochs': 5.653,
    'b-1/test': test_dict['b1'],
    'b-2/test': test_dict['b2'],
    'b-3/test': test_dict['b3'],
    'b-4/b3': test_dict['b4-b3'],
    'b-4/b1': test_dict['b4-b1'],
    'b-4/b5': test_dict['b4-b5']
}

In [41]:
logger.add_hparams(logger_dic, results_dic, run_name='pretrianed')

In [42]:
!zip -r exps.zip experiments
FileLink("exps.zip")

updating: experiments/ (stored 0%)
updating: experiments/flickr8k_ar_pretrainedpretrained/ (stored 0%)
updating: experiments/flickr8k_ar_pretrainedpretrained/events.out.tfevents.1625213740.209d6de66ef0.43.1 (deflated 68%)
updating: experiments/flickr8k_ar_pretrainedpretrained/events.out.tfevents.1625213650.209d6de66ef0.43.0 (deflated 5%)
  adding: experiments/flickr8k_ar_pretrainedpretrained/pretrianed/ (stored 0%)
  adding: experiments/flickr8k_ar_pretrainedpretrained/pretrianed/events.out.tfevents.1625219958.209d6de66ef0.43.2 (deflated 45%)
