In [8]:
!nvidia-smi

Sat Jul 17 13:20:34 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# setting up kaggle json
!mkdir /root/.kaggle
!cp /content/drive/MyDrive/kaggle.json /root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

# downloading dataset from kaggle
!pip install kaggle -q
!kaggle datasets download -d aladdinpersson/flickr8kimagescaptions
!unzip -q flickr8kimagescaptions.zip

Downloading flickr8kimagescaptions.zip to /content
 99% 1.02G/1.04G [00:08<00:00, 155MB/s]
100% 1.04G/1.04G [00:08<00:00, 125MB/s]


In [4]:
# get the code form github
!git clone https://github.com/moaaztaha/Image-Captioning
py_files_path = 'Image-Captioning/'
import sys
sys.path.append(py_files_path)

Cloning into 'Image-Captioning'...
remote: Enumerating objects: 679, done.[K
remote: Counting objects: 100% (679/679), done.[K
remote: Compressing objects: 100% (336/336), done.[K
remote: Total 679 (delta 412), reused 597 (delta 330), pack-reused 0[K
Receiving objects: 100% (679/679), 43.13 MiB | 24.04 MiB/s, done.
Resolving deltas: 100% (412/412), done.


In [9]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [10]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [11]:
# Model parameters
encoder_dim = 2048 # resnet101
emb_dim = 300  # dimension of word embeddings
attention_dim = 300  # dimension of attention linear layers
decoder_dim = 300  # dimension of decoder RNN
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# training parameters
epochs = 30  # number of epochs to train for (if early stopping is not triggered)
batch_size = 256
workers = 2
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
fine_tune_encoder = False  # fine-tune encoder?
pretrained_embeddings = True
fine_tune_embeddings = True
checkpoint = None  # path to checkpoint, None if none


In [13]:
DATA_NAME = 'flickr8k_ar_arabert_pretrained'

# local
# DATA_JSON_PATH = 'ar_data.json'
# IMGS_PATH = 'flickr/Images/'
# kaggle paths
# DATA_JSON_PATH = '/kaggle/working/Image-Captioning/data.json'
# IMGS_PATH = '../input/flickr8kimagescaptions/flickr8k/images/'
#colab
DATA_JSON_PATH = 'Image-Captioning/ar_data.json'
IMGS_PATH = 'flickr8k/images/'

In [14]:
max_seq = 65
vocab = build_vocab(DATA_JSON_PATH, max_seq=max_seq)
vocab_len = len(vocab); vocab_len

100%|██████████| 24000/24000 [00:00<00:00, 227449.12it/s]


3309

In [15]:
list(vocab.itos.keys())[:10], list(vocab.itos.values())[:10]

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 ['<pad>', '<sos>', '<eos>', '<unk>', '+', 'ة', 'طفل', 'صغير', 'تتسلق', 'إلى'])

### Pre-trained Arabic Embeddings

In [16]:
# downloading arabic cbow pretrained word embedings
! wget https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_cbow_300_wiki.zip
! unzip -q full_grams_cbow_300_wiki.zip

--2021-07-17 13:21:33--  https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_cbow_300_wiki.zip
Resolving bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)... 108.61.0.122, 2001:19f0:0:22::100
Connecting to bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)|108.61.0.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1491895880 (1.4G) [application/zip]
Saving to: ‘full_grams_cbow_300_wiki.zip’


2021-07-17 13:22:00 (54.4 MB/s) - ‘full_grams_cbow_300_wiki.zip’ saved [1491895880/1491895880]



In [17]:
import gensim
model = gensim.models.Word2Vec.load("./full_grams_cbow_300_wiki.mdl")
model.wv.save_word2vec_format("aravec.txt")

In [18]:
import numpy as np
def get_weights(embedding_path):
    embeddings_index = {}
    with open(embedding_path) as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs
    print("Found %s word vectors." % len(dict(embeddings_index)))
    
    num_tokens = len(vocab)
    embedding_dim = 300
    hits = 0
    misses = 0
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, index in tqdm(vocab.stoi.items()):
        if word in embeddings_index:
            embedding_matrix[index] = embeddings_index[word]
            hits+=1
        else:
            misses+=1
            embedding_matrix[index] = np.random.uniform(-.1, .1, size=embedding_dim)
    print("Hist:", hits, " | Misses:", misses)
    return embedding_matrix

In [19]:
embedding_matrix = get_weights("./aravec.txt")

100%|██████████| 3309/3309 [00:00<00:00, 191618.83it/s]

Found 662110 word vectors.
Hist: 2723  | Misses: 586





In [20]:
embedding_matrix.shape

(3309, 300)

In [21]:
len(vocab.itos)

3309

In [22]:
t_params = {
    'data_name': DATA_NAME,
    'imgs_path': IMGS_PATH,
    'df_path': DATA_JSON_PATH,
    'vocab': vocab,
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    'fine_tune_embeddings': fine_tune_embeddings,
}

m_params = {
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
    'dropout': dropout,
    'embeddings_matrix': embedding_matrix
}

logger_dic = {
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    'max_seq_length': max_seq,
    'vocab_size': vocab_len,
    'enocder': 'resnet101',
    'dropout': dropout,
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim 
    
}


t_params

{'batch_size': 256,
 'data_name': 'flickr8k_ar_arabert_pretrained',
 'decoder_lr': 0.0004,
 'df_path': 'Image-Captioning/ar_data.json',
 'encoder_lr': 0.0001,
 'epochs': 30,
 'fine_tune_embeddings': True,
 'fine_tune_encoder': False,
 'imgs_path': 'flickr8k/images/',
 'pretrained_embeddings': True,
 'vocab': <dataset.Vocabulary at 0x7f1e787bea10>,
 'workers': 2}

In [23]:
# experiment name
name = DATA_NAME + "pretrained"
# path
log_dir = 'experiments'

logger = SummaryWriter(log_dir=osp.join(log_dir, name))

In [24]:
fit(t_params=t_params, m_params=m_params, logger=logger)

Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /root/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth


HBox(children=(FloatProgress(value=0.0, max=178793939.0), HTML(value='')))


Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch: [0][0/71]	Batch Time 10.000 (10.000)	Data Load Time 4.665 (4.665)	Loss 8.9380 (8.9380)	Top-5 Accuracy 0.122 (0.122)
Epoch train time 191.608 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/12]	Batch Time 6.636 (6.636)	Loss 4.7598 (4.7598)	Top-5 Accuracy 58.225 (58.225)	
----- Bleu-n Scores -----
1: 63.2881016448392
2: 43.23017418363778
3: 25.534674601162727
4: 15.0724907619684
-------------------------

 * LOSS - 4.902, TOP-5 ACCURACY - 57.098, BLEU-4 - 15.0724907619684

Epoch validation time 38.831 (epoch_time.avg:.3f)
__________________________________________________
-------------------- Training --------------------
Epoch: [1][0/71]	Batch Time 6.587 (6.587)	Data Load Time 3.837 (3.837)	Loss 3.9533 (3.9533)	Top-5 Accuracy 63.711 (63.711)
Epoch train time 184.460 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/12]	Batch Time 6.258 (6.258)	Loss 5.1243 (5.1243)	Top-5 Accuracy 57.104 (57.104)	
-----

In [25]:
!ls

aravec.txt
BEST_checkpoint_flickr8k_ar_arabert_pretrained.pth.tar
checkpoint_flickr8k_ar_arabert_pretrained.pth.tar
drive
experiments
flickr8k
full_grams_cbow_300_wiki.mdl
full_grams_cbow_300_wiki.mdl.trainables.syn1neg.npy
full_grams_cbow_300_wiki.mdl.wv.vectors.npy
full_grams_cbow_300_wiki.zip
Image-Captioning
sample_data


In [26]:
m = load_checkpoint("BEST_checkpoint_flickr8k_ar_arabert_pretrained.pth.tar")

Loaded Checkpoint!!
Last Epoch: 6
Best Bleu-4: 23.64870583972747


In [27]:
batch_size = 64
fine_tune_encoder = True
checkpoint = 'BEST_checkpoint_flickr8k_ar_arabert_pretrained.pth.tar'
# epochs = 30

t_params['batch_size'] = batch_size
t_params['data_name'] = t_params['data_name'] + "_finetune" 
t_params['fine_tune_encoder'] = True
t_params['decoder_lr'] = t_params['decoder_lr'] / 10
# t_params['epochs'] = epochs
t_params

{'batch_size': 64,
 'data_name': 'flickr8k_ar_arabert_pretrained_finetune',
 'decoder_lr': 4e-05,
 'df_path': 'Image-Captioning/ar_data.json',
 'encoder_lr': 0.0001,
 'epochs': 30,
 'fine_tune_embeddings': True,
 'fine_tune_encoder': True,
 'imgs_path': 'flickr8k/images/',
 'pretrained_embeddings': True,
 'vocab': <dataset.Vocabulary at 0x7f1e787bea10>,
 'workers': 2}

In [28]:
fit(t_params, checkpoint=checkpoint, m_params=m_params, logger=logger)

Loaded Checkpoint!!
Starting Epoch: 7
Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [7][0/282]	Batch Time 4.982 (4.982)	Data Load Time 1.191 (1.191)	Loss 3.3514 (3.3514)	Top-5 Accuracy 72.744 (72.744)
Epoch: [7][100/282]	Batch Time 1.390 (1.427)	Data Load Time 0.001 (0.013)	Loss 3.3280 (3.3822)	Top-5 Accuracy 72.120 (71.114)
Epoch: [7][200/282]	Batch Time 1.370 (1.407)	Data Load Time 0.003 (0.007)	Loss 3.3692 (3.3701)	Top-5 Accuracy 72.269 (71.366)
Epoch train time 394.684 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/47]	Batch Time 1.635 (1.635)	Loss 4.3954 (4.3954)	Top-5 Accuracy 65.733 (65.733)	
----- Bleu-n Scores -----
1: 70.14225421073516
2: 51.780019

### Test Scores

In [30]:
checkpoint = load_checkpoint("BEST_checkpoint_flickr8k_ar_arabert_pretrained_finetune.pth.tar")
decoder = checkpoint['decoder']
decoder = decoder.to(device)
decoder.eval()
encoder = checkpoint['encoder']
encoder = encoder.to(device)
encoder.eval();

Loaded Checkpoint!!
Last Epoch: 9
Best Bleu-4: 24.949378413361714


In [31]:
from eval import test_score

test_dict = {}

for i in [1, 3, 5]:
    
    b1, b2, b3, b4 = test_score(i, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)
    if i == 3:
        test_dict['b1'] = b1
        test_dict['b2'] = b2
        test_dict['b3'] = b3
    
    test_dict[f'b4-b{i}'] = b4

  cpuset_checked))
EVALUATING AT BEAM SIZE 1:   0%|          | 0/3000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 3000


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
EVALUATING AT BEAM SIZE 1: 100%|██████████| 3000/3000 [02:11<00:00, 22.80it/s]


----- Bleu-n Scores -----


  cpuset_checked))
EVALUATING AT BEAM SIZE 3:   0%|          | 0/3000 [00:00<?, ?it/s]

1: 59.27134312126155
2: 45.52397958654338
3: 33.58850576504064
4: 24.918812277227662
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 3000/3000 [02:27<00:00, 20.32it/s]


----- Bleu-n Scores -----
1: 60.32593136195551
2: 47.5536072957737
3: 36.147037636633875
4: 27.524282207029003
-------------------------


  cpuset_checked))
EVALUATING AT BEAM SIZE 5:   0%|          | 0/3000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 3000/3000 [02:42<00:00, 18.51it/s]


----- Bleu-n Scores -----
1: 58.678484239386094
2: 46.85683508774053
3: 36.14555791431082
4: 27.864202291806382
-------------------------


In [32]:
test_dict

{'b1': 60.32593136195551,
 'b2': 47.5536072957737,
 'b3': 36.147037636633875,
 'b4-b1': 24.918812277227662,
 'b4-b3': 27.524282207029003,
 'b4-b5': 27.864202291806382}

In [33]:
# final results -> different from training and validation scalars
results_dic =  {
    # train & valid
    'total_epochs': 5.653,
    'b-1/test': test_dict['b1'],
    'b-2/test': test_dict['b2'],
    'b-3/test': test_dict['b3'],
    'b-4/b3': test_dict['b4-b3'],
    'b-4/b1': test_dict['b4-b1'],
    'b-4/b5': test_dict['b4-b5']
}

In [34]:
logger.add_hparams(logger_dic, results_dic, run_name='pretrianed')

### Test Example

In [50]:
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [54]:
def caption_image(img_path, beam_size=3):
    
    # transforms
    tt = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225]),
    ])


    # english
    # vocab = build_vocab('data.json')
    # checkpoint = load_checkpoint('E:\GP\Image-Captioning\models\BEST_checkpoint_flickr8k_finetune.pth.tar', cpu=True)

    # arabic
    vocab = build_vocab('/content/Image-Captioning/ar_data.json')
    checkpoint = load_checkpoint('BEST_checkpoint_flickr8k_ar_arabert_pretrained_finetune.pth.tar', cpu=True)


    addit_tokens = [vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']]
    device = torch.device( 'cpu')


    encoder = checkpoint['encoder'].to(device)
    decoder = checkpoint['decoder'].to(device)

    #def cap_image(encoder, decoder, image_path, vocab):
    vocab_size = len(vocab)


    img = Image.open(img_path).convert("RGB")
    img = tt(img).unsqueeze(0) # transform and batch
    image = img.to(device)

    #encoder
    encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)

    # Flatten encoding
    encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)



    k = beam_size 
    # We'll treat the problem as having a batch size of k
    encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

    # Tensor to store top k previous words at each step; now they're just <start>
    k_prev_words = torch.LongTensor([[vocab.stoi['<sos>']]] * k).to(device)  # (k, 1)

    # Tensor to store top k sequences; now they're just <start>
    seqs = k_prev_words  # (k, 1)

    # Tensor to store top k sequences' scores; now they're just 0
    top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

    # Tensor to store top k sequences' alphas; now they're just 1s
    seqs_alpha = torch.ones(k, 1, enc_image_size, enc_image_size).to(device)  # (k, 1, enc_image_size, enc_image_size)

    # Lists to store completed sequences, their alphas and scores
    complete_seqs = list()
    complete_seqs_alpha = list()
    complete_seqs_scores = list()

    # Start decoding
    step = 1
    h, c = decoder.init_hidden_state(encoder_out)

    # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
    while True:

        embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

        awe, alpha = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

        alpha = alpha.view(-1, enc_image_size, enc_image_size)  # (s, enc_image_size, enc_image_size)

        gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
        awe = gate * awe

        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

        scores = decoder.fc(h)  # (s, vocab_size)
        scores = F.log_softmax(scores, dim=1)

        # Add
        scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

        # For the first step, all k points will have the same scores (since same k previous words, h, c)
        if step == 1:
            top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
        else:
            # Unroll and find top scores, and their unrolled indices
            top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

        # Convert unrolled indices to actual indices of scores
        prev_word_inds = top_k_words // vocab_size  # (s)
        next_word_inds = top_k_words % vocab_size  # (s)
        
        # Add new words to sequences, alphas
        seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)
        seqs_alpha = torch.cat([seqs_alpha[prev_word_inds], alpha[prev_word_inds].unsqueeze(1)],
                               dim=1)  # (s, step+1, enc_image_size, enc_image_size)
#         print(seqs[prev_word_inds], prev_word_inds)
#         if step == 5:
#             return seqs
        # Which sequences are incomplete (didn't reach <end>)?
        incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                           next_word != vocab.stoi['<eos>']]
        complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

        # Set aside complete sequences
        if len(complete_inds) > 0:
            complete_seqs.extend(seqs[complete_inds].tolist())
            complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])
        k -= len(complete_inds)  # reduce beam length accordingly

        # Proceed with incomplete sequences
        if k == 0:
            break
        seqs = seqs[incomplete_inds]
        seqs_alpha = seqs_alpha[incomplete_inds]
        h = h[prev_word_inds[incomplete_inds]]
        c = c[prev_word_inds[incomplete_inds]]
        encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
        top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
        k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

        # Break if things have been going on too long
        if step > 50:
            break
        step += 1

    i = complete_seqs_scores.index(max(complete_seqs_scores))
    seq = complete_seqs[i]
    alphas = complete_seqs_alpha[i]

    print(complete_seqs_scores)
    # print(seq)
    all_caps = [" ".join([vocab.itos[i] for i in sent if i not in addit_tokens]) for sent in complete_seqs]
    all_b_caps = ""
    z = 1
    for cap in all_caps:
        all_b_caps += str(z) + ". " + cap + " || <br> "
        z += 1
    # all_b_caps = [" || ".join(all_caps)][0]


    # return seq, alphas, complete_seqs, i
    # return [" ".join([vocab.itos[i] for i in seq if i not in addit_tokens])][0]
    # return all_b_caps
    return alphas, seq, all_caps

In [55]:
alphas, seq, all_caps = caption_image("/content/Image-Captioning/test_examples/dog.jpg")

100%|██████████| 24000/24000 [00:00<00:00, 189372.99it/s]


Loaded Checkpoint!!
Last Epoch: 9
Best Bleu-4: 24.949378413361714
[tensor(-5.3566, grad_fn=<UnbindBackward>), tensor(-40.7110, grad_fn=<UnbindBackward>)]


In [56]:
seq

[1, 12, 65, 41, 18, 10, 4, 47, 2]

In [58]:
[vocab.itos[i] for i in seq[1:-1]]

['كلب', 'بني', 'يركض', 'على', 'ال', '+', 'شاطئ']

In [61]:
# # libraries for arabert
# !pip install farasapy
# !pip install pyarabic
# !pip install fuzzysearch

In [62]:
!git clone https://github.com/aub-mind/arabert

Cloning into 'arabert'...
remote: Enumerating objects: 530, done.[K
remote: Counting objects: 100% (316/316), done.[K
remote: Compressing objects: 100% (228/228), done.[K
remote: Total 530 (delta 167), reused 226 (delta 82), pack-reused 214[K
Receiving objects: 100% (530/530), 4.86 MiB | 23.70 MiB/s, done.
Resolving deltas: 100% (290/290), done.


In [65]:
from arabert.preprocess import ArabertPreprocessor

model_name = "aubmindlab/bert-base-arabertv2"
arabert_prep = ArabertPreprocessor(model_name=model_name)
arabert_prep.unpreprocess(" ".join([vocab.itos[i] for i in seq[1:-1]]))



 99%|█████████▉| 239M/241M [00:19<00:00, 12.1MiB/s]



'كلب بني يركض على الشاطئ'

### Test without replication

In [76]:
class CaptionDataset(Dataset):
    """ 
    Caption Dataset Class
    """

    def __init__(self, imgs_dir, captions_file, vocab, transforms=None, split='train'):
        """
        :param imgs_dir: folder where images are stored
        :param captions_file: the df file with all caption information
        :param vocab: vocabuary object
        :param transforms: image transforms pipeline
        :param split: data split
        """

        # split has to be one of {'train', 'val', 'test'}
        assert split in {'train', 'val', 'test'}

        self.imgs_dir = imgs_dir
        self.df = pd.read_json(captions_file)
        self.df = self.df[self.df['split'] == split]
        self.vocab = vocab
        self.transforms = transforms
        self.split = split

        self.dataset_size = self.df.shape[0]
        # printing some info
        print(f"Dataset split: {split}")
        print(f"Unique images: {self.df.file_name.nunique()}")
        print(f"Total size: {self.dataset_size}")

    def __len__(self):
        return self.dataset_size

    def __getitem__(self, index):

        # loading the image
        img_id = self.df['file_name'].values[index]
        img = Image.open(self.imgs_dir+img_id).convert("RGB")

        if self.transforms is not None:
            img = self.transforms(img)
        else:
            img = transfroms.ToTensor()(img)

        # loading current caption
        cap_len = self.df['tok_len'].values[index] + 2 # <sos> and <eos>
        tokens = self.df['tokens'].values[index]
        caption = torch.LongTensor(self.vocab.numericalize(tokens, cap_len))

        if self.split is 'train':
            return img, caption, cap_len
        else:
            # for val and test return all captions for calculate the bleu scores
            captions_tokens = self.df[self.df['file_name'] == img_id].tokens.values
            captions_lens = self.df[self.df['file_name'] == img_id].tok_len.values
            all_tokens = []
            for token, cap_len in zip(captions_tokens, captions_lens):
                all_tokens.append(self.vocab.numericalize(token, cap_len)[1:]) # remove <sos>

            return img, caption, cap_len, torch.tensor(all_tokens), img_id

In [77]:
bs = 1

loader = DataLoader(
            dataset=CaptionDataset(IMGS_PATH, DATA_JSON_PATH,
                                    transforms=transform, vocab=vocab, split='test'),
            batch_size=bs,
            num_workers=7,
            shuffle=True,
            pin_memory=True
        )

Dataset split: test
Unique images: 1000
Total size: 3000


  cpuset_checked))


In [78]:
# Test without replication 
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from utils import load_checkpoint
from dataset import build_vocab, get_loaders
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu
from utils import print_scores

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True


transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


def evaluate(beam_size):
    vocab_size = len(vocab)
    references = list()
    hypotheses = list()
    img_ids = list()
    
    # For each image
    for i, (image, caps, caplens, allcaps, img_id) in enumerate(
        tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size), position=0, leave=True)):
        
        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[vocab.stoi['<sos>']]] * k).to(device)  # (k, 1)
        
        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0)  # (s)
          
            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)
            
#             print(top_k_scores, top_k_words)
            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                               next_word != vocab.stoi['<eos>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1
        
        if len(complete_seqs_scores) == 0:
            continue
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(lambda c: [w for w in c if w not in {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}],
                img_caps))  # remove <start> and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append([w for w in seq if w not in {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}])
        
        img_ids.append(img_id[0])
        assert len(references) == len(hypotheses) == len(img_ids)
    # Calculate BLEU-4 scores
#     bleu4 = corpus_bleu(references, hypotheses)
    return references, hypotheses, img_ids
    # print_scores(references, hypotheses, nltk=True)

In [79]:
references, hypothesis, img_ids = evaluate(1)

  cpuset_checked))
EVALUATING AT BEAM SIZE 1: 100%|██████████| 3000/3000 [02:12<00:00, 22.70it/s]


In [82]:
print_scores(references, hypothesis)

----- Bleu-n Scores -----
1: 59.27134312126155
2: 45.52397958654338
3: 33.58850576504064
4: 24.918812277227662
-------------------------


(59.27134312126155, 45.52397958654338, 33.58850576504064, 24.918812277227662)

In [81]:
df = pd.DataFrame.from_dict({"file_name":img_ids, "references":references, "hypothesis": hypothesis})
df.head()

Unnamed: 0,file_name,references,hypothesis
0,354642192_3b7666a2dd.jpg,"[[78, 1315, 16, 10, 4, 66, 21, 4, 10, 4, 44, 4...","[10, 4, 6, 4, 5, 16, 10, 4, 66]"
1,1237985362_dbafc59280.jpg,"[[283, 4, 5, 399, 20, 4, 5, 7, 4, 5, 400, 401,...","[20, 4, 5, 7, 4, 5, 16, 10, 4, 71, 10, 4, 166,..."
2,2641770481_c98465ff35.jpg,"[[28, 33, 98, 4, 5, 46, 106, 21, 4, 10, 4, 44,...","[28, 33, 122, 4, 61, 102, 15, 4, 14, 4, 5, 10,..."
3,3485425825_c2f3446e73.jpg,"[[28, 15, 4, 1903, 353, 190, 4, 5, 21, 4, 29, ...","[112, 4, 5, 16, 10, 4, 190, 4, 5, 10, 4, 278, ..."
4,1554713437_61b64527dd.jpg,"[[12, 12, 53, 356, 18, 992, 4, 5, 16, 10, 4, 3...","[12, 65, 41, 18, 10, 4, 42]"


In [83]:
df.sort_values("file_name").head()

Unnamed: 0,file_name,references,hypothesis
2392,1056338697_4f7d7ce270.jpg,"[[112, 4, 5, 151, 16, 122, 109, 664, 689, 4, 5...","[20, 4, 5, 7, 4, 5, 16, 10, 4, 71]"
1289,1056338697_4f7d7ce270.jpg,"[[112, 4, 5, 151, 16, 122, 109, 664, 689, 4, 5...","[20, 4, 5, 7, 4, 5, 16, 10, 4, 71]"
439,1056338697_4f7d7ce270.jpg,"[[112, 4, 5, 151, 16, 122, 109, 664, 689, 4, 5...","[20, 4, 5, 7, 4, 5, 16, 10, 4, 71]"
43,106490881_5a2dd9b7bd.jpg,"[[49, 16, 236, 10, 4, 237, 4, 5, 10, 4, 93, 18...","[49, 7, 33, 122, 4, 61, 109, 15, 4, 14, 4, 61]"
2160,106490881_5a2dd9b7bd.jpg,"[[49, 16, 236, 10, 4, 237, 4, 5, 10, 4, 93, 18...","[49, 7, 33, 122, 4, 61, 109, 15, 4, 14, 4, 61]"


In [84]:
df.to_json("arabic_bert_results.json")

In [85]:
refes = []
hypos = []
for fname in tqdm(df.file_name.unique()):
  refes.append(df[df.file_name==fname].references.to_list()[0])
  hypos.append(df[df.file_name==fname].hypothesis.to_list()[0])


  0%|          | 0/841 [00:00<?, ?it/s][A
  8%|▊         | 70/841 [00:00<00:01, 695.43it/s][A
 16%|█▋        | 137/841 [00:00<00:01, 686.52it/s][A
 25%|██▍       | 209/841 [00:00<00:00, 694.42it/s][A
 33%|███▎      | 278/841 [00:00<00:00, 692.15it/s][A
 41%|████▏     | 349/841 [00:00<00:00, 695.80it/s][A
 50%|█████     | 421/841 [00:00<00:00, 700.29it/s][A
 59%|█████▉    | 495/841 [00:00<00:00, 711.12it/s][A
 67%|██████▋   | 563/841 [00:00<00:00, 699.29it/s][A
 75%|███████▌  | 634/841 [00:00<00:00, 701.53it/s][A
 83%|████████▎ | 702/841 [00:01<00:00, 674.74it/s][A
 91%|█████████▏| 768/841 [00:01<00:00, 657.36it/s][A
100%|██████████| 841/841 [00:01<00:00, 678.43it/s]


In [87]:
len(refes), len(hypos)

(841, 841)

In [88]:
print_scores(refes, hypos)

----- Bleu-n Scores -----
1: 59.27134312126155
2: 45.52397958654338
3: 33.58850576504064
4: 24.918812277227662
-------------------------


(59.27134312126155, 45.52397958654338, 33.58850576504064, 24.918812277227662)