In [1]:
!nvidia-smi

Sat Jul  3 19:48:17 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [4]:
!pip install kaggle -q
!kaggle datasets download -d aladdinpersson/flickr8kimagescaptions
!unzip -q flickr8kimagescaptions.zip

Downloading flickr8kimagescaptions.zip to /content
 99% 1.03G/1.04G [00:04<00:00, 235MB/s]
100% 1.04G/1.04G [00:04<00:00, 241MB/s]


In [5]:
# get the code form github
!git clone https://github.com/moaaztaha/Image-Captioning
py_files_path = 'Image-Captioning/'
import sys
sys.path.append(py_files_path)

Cloning into 'Image-Captioning'...
remote: Enumerating objects: 630, done.[K
remote: Counting objects: 100% (630/630), done.[K
remote: Compressing objects: 100% (308/308), done.[K
remote: Total 630 (delta 380), reused 561 (delta 311), pack-reused 0[K
Receiving objects: 100% (630/630), 39.18 MiB | 21.14 MiB/s, done.
Resolving deltas: 100% (380/380), done.


In [6]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [7]:
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from utils import load_checkpoint
from dataset import build_vocab, get_loaders
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu
from utils import print_scores
import pandas as pd

In [8]:
# DATA_NAME = 'flickr8k_ar'

# local
# DATA_JSON_PATH = 'ar_data.json'
# IMGS_PATH = 'flickr/Images/'
# kaggle paths
# DATA_JSON_PATH = '/kaggle/working/Image-Captioning/data.json'
# IMGS_PATH = '../input/flickr8kimagescaptions/flickr8k/images/'
#colab
DATA_JSON_PATH = 'Image-Captioning/data.json'
IMGS_PATH = 'flickr8k/images/'

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True

In [10]:
CHECKPOINT_PATH = '/content/drive/MyDrive/ImageCaptioning/flickr8/BEST_checkpoint_flickr8k_finetune.pth.tar'

In [11]:
# Load model
checkpoint = load_checkpoint(CHECKPOINT_PATH)
decoder = checkpoint['decoder']
decoder = decoder.to(device)
decoder.eval()
encoder = checkpoint['encoder']
encoder = encoder.to(device)
encoder.eval();

Loaded Checkpoint!!
Last Epoch: 12
Best Bleu-4: 15.97917426288958


In [12]:
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [13]:
vocab = build_vocab(DATA_JSON_PATH)
len(vocab)

100%|██████████| 40000/40000 [00:00<00:00, 310651.22it/s]


5089

In [14]:
bs = 1
beam_size=3
loader = get_loaders(bs, IMGS_PATH, DATA_JSON_PATH, transform, vocab, test=True, n_workers=8)

Dataset split: test
Unique images: 1000
Total size: 5000


  cpuset_checked))


In [15]:
import torch
import torch.nn.functional as F
import numpy as np
import json
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# import skimage.transform
# import argparse
from PIL import Image
def caption_image_beam_search(encoder, decoder, image_path, word_map, beam_size=3):

    k = beam_size
    vocab_size = len(word_map)

    # Read image and process
    img = Image.open(image_path).convert("RGB")
    transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
    image = transform(img).to(device)  # (3, 256, 256)

    # Encode
    image = image.unsqueeze(0)  # (1, 3, 256, 256)
    encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)

    # Flatten encoding
    encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)

    # We'll treat the problem as having a batch size of k
    encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

    # Tensor to store top k previous words at each step; now they're just <start>
    k_prev_words = torch.LongTensor([[word_map.stoi['<sos>']]] * k).to(device)  # (k, 1)

    # Tensor to store top k sequences; now they're just <start>
    seqs = k_prev_words  # (k, 1)

    # Tensor to store top k sequences' scores; now they're just 0
    top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

    # Tensor to store top k sequences' alphas; now they're just 1s
    seqs_alpha = torch.ones(k, 1, enc_image_size, enc_image_size).to(device)  # (k, 1, enc_image_size, enc_image_size)

    # Lists to store completed sequences, their alphas and scores
    complete_seqs = list()
    complete_seqs_alpha = list()
    complete_seqs_scores = list()

    # Start decoding
    step = 1
    h, c = decoder.init_hidden_state(encoder_out)

    # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
    while True:

        embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

        awe, alpha = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

        alpha = alpha.view(-1, enc_image_size, enc_image_size)  # (s, enc_image_size, enc_image_size)

        gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
        awe = gate * awe

        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

        scores = decoder.fc(h)  # (s, vocab_size)
        scores = F.log_softmax(scores, dim=1)

        # Add
        scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

        # For the first step, all k points will have the same scores (since same k previous words, h, c)
        if step == 1:
            top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
        else:
            # Unroll and find top scores, and their unrolled indices
            top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

        # Convert unrolled indices to actual indices of scores
        prev_word_inds = top_k_words // vocab_size  # (s)
        next_word_inds = top_k_words % vocab_size  # (s)
        
        # Add new words to sequences, alphas
        seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)
        seqs_alpha = torch.cat([seqs_alpha[prev_word_inds], alpha[prev_word_inds].unsqueeze(1)],
                               dim=1)  # (s, step+1, enc_image_size, enc_image_size)
#         print(seqs[prev_word_inds], prev_word_inds)
#         if step == 5:
#             return seqs
        # Which sequences are incomplete (didn't reach <end>)?
        incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                           next_word != word_map.stoi['<eos>']]
        complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

        # Set aside complete sequences
        if len(complete_inds) > 0:
            complete_seqs.extend(seqs[complete_inds].tolist())
            complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])
        k -= len(complete_inds)  # reduce beam length accordingly

        # Proceed with incomplete sequences
        if k == 0:
            break
        seqs = seqs[incomplete_inds]
        seqs_alpha = seqs_alpha[incomplete_inds]
        h = h[prev_word_inds[incomplete_inds]]
        c = c[prev_word_inds[incomplete_inds]]
        encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
        top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
        k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

        # Break if things have been going on too long
        if step > 50:
            break
        step += 1

    i = complete_seqs_scores.index(max(complete_seqs_scores))
    seq = complete_seqs[i]
    alphas = complete_seqs_alpha[i]

    return seq, alphas, complete_seqs

In [None]:
seq, _, comp_seqs = caption_image_beam_search(encoder, decoder, 'flickr/Images/3514019869_7de4ece2a5.jpg', vocab, beam_size=2)

FileNotFoundError: ignored

In [None]:
[sent for sent in comp_seqs][0]

NameError: ignored

In [None]:
[" ".join([vocab.itos[i] for i in sent]) for sent in comp_seqs]

['<sos> a dog runs through an obstacle <eos>',
 '<sos> a dog jumps over a hurdle <eos>']

In [None]:
[vocab.itos[i] for i in seq]

['<sos>', 'a', 'dog', 'runs', 'through', 'an', 'obstacle', '<eos>']

In [75]:
def evaluate(beam_size):

    references = list()
    hypotheses = list()

    # For each image
    for i, (image, caps, caplens, allcaps) in enumerate(
        tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size), position=0, leave=True)):
        
        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[vocab.stoi['<sos>']]] * k).to(device)  # (k, 1)
        
        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0)  # (s)
          
            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)
            
#             print(top_k_scores, top_k_words)
            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                               next_word != vocab.stoi['<eos>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1
        
        if len(complete_seqs_scores) == 0:
            continue
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(lambda c: [w for w in c if w not in {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}],
                img_caps))  # remove <start> and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append([w for w in seq if w not in {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}])

        assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
#     bleu4 = corpus_bleu(references, hypotheses)
    return references, hypotheses
    print_scores(references, hypotheses, nltk=True)


In [76]:
vocab_size = len(vocab)

In [77]:
vocab_size

5089

In [None]:
references, hypotheses = evaluate(3)

  cpuset_checked))
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
EVALUATING AT BEAM SIZE 3: 100%|██████████| 5000/5000 [03:15<00:00, 25.61it/s]


In [None]:
len(references), len(hypotheses)

(4995, 4995)

In [None]:
print_scores(references, hypotheses)

----- Bleu-n Scores -----
1: 64.08114558472555
2: 46.50725094600124
3: 32.63582641164054
4: 22.48417748427286
-------------------------


(64.08114558472555, 46.50725094600124, 32.63582641164054, 22.48417748427286)

In [None]:
import pandas as pd

df = pd.DataFrame.from_dict({"references": references, "hypotheses": hypotheses})
df.head()

Unnamed: 0,references,hypotheses
0,"[[4, 63, 11, 2453, 841, 17, 90, 106, 8, 491, 1...","[4, 64, 62, 11, 4, 49, 47, 17, 90, 34, 4, 373]"
1,"[[4, 51, 11, 93, 79, 21, 130, 529], [4, 51, 11...","[4, 51, 11, 4, 93, 47, 17, 206, 11, 4, 76]"
2,"[[4, 51, 11, 4, 1971, 83, 930, 2791, 1595], [4...","[4, 51, 11, 4, 83, 78, 17, 59, 34, 4, 373]"
3,"[[4, 6, 22, 36, 5, 17, 206, 11, 8, 7, 71, 4, 3...","[4, 6, 22, 36, 5, 17, 12, 13, 8, 7]"
4,"[[4, 51, 398, 4, 826, 859, 259, 4, 201], [8, 5...","[4, 75, 314, 50, 4, 194, 255]"


In [None]:
df.to_json("evaluation_df.json")

In [None]:
df = pd.read_json("evaluation_df.json")
df.head()

Unnamed: 0,references,hypotheses
0,"[[4, 63, 11, 2453, 841, 17, 90, 106, 8, 491, 1...","[4, 64, 62, 11, 4, 49, 47, 17, 90, 34, 4, 373]"
1,"[[4, 51, 11, 93, 79, 21, 130, 529], [4, 51, 11...","[4, 51, 11, 4, 93, 47, 17, 206, 11, 4, 76]"
2,"[[4, 51, 11, 4, 1971, 83, 930, 2791, 1595], [4...","[4, 51, 11, 4, 83, 78, 17, 59, 34, 4, 373]"
3,"[[4, 6, 22, 36, 5, 17, 206, 11, 8, 7, 71, 4, 3...","[4, 6, 22, 36, 5, 17, 12, 13, 8, 7]"
4,"[[4, 51, 398, 4, 826, 859, 259, 4, 201], [8, 5...","[4, 75, 314, 50, 4, 194, 255]"


In [None]:
hs = [" ".join(word for word in sent) for sent in vocab.indextostring(hypotheses)]
rs = []
for r in references:
    rs.append([" ".join(word for word in sent) for sent in vocab.indextostring(r)])

In [None]:
hs[1]

'a black and white dog is running through the grass'

In [None]:
rs[2]

['a boy pushes a wagon full of pumpkins',
 'a boy pushes a wagon with two pumpkins',
 'a boy smiling leaning over a wagon filled with two large pumpkins',
 'a child squats behind a wagon with two pumpkins in it',
 'boy pushing wagon with two pumpkins in it']

In [None]:
from statistics import mean

total_meteor = 0

for r, h in tqdm(zip(rs, hs), total=len(rs)):
    total_meteor += meteor_score(r, h)

100%|██████████| 5000/5000 [00:10<00:00, 460.53it/s]


In [None]:
total_meteor/len(rs)

0.428133409401112

In [None]:
### turn outputs into strings -> bleu_score

In [None]:
print_scores(rs, hs)

----- Bleu-n Scores -----
1: 64.62017684887459
2: 47.13660963689657
3: 33.759143347900405
4: 23.833249590210322
-------------------------
----- METEOR Score -----


AttributeError: 'NoneType' object has no attribute 'itos'

In [None]:
from nltk.translate.meteor_score import meteor_score

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/kelwa/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
hs = [" ".join([vocab.itos[i] for i in sent[0]]) for sent in hypotheses]
rs = []
for r in references:
    rs.append([" ".join([vocab.itos[i] for i in sent]) for sent in r])

In [None]:
vocab_size = len(vocab)

In [None]:
evaluate(2)

EVALUATING AT BEAM SIZE 2: 100%|██████████| 5000/5000 [03:27<00:00, 24.14it/s]


----- Bleu-n Scores -----
1: 63.54625550660793
2: 45.03356444855022
3: 31.69343570783961
4: 22.30082901251822
-------------------------


In [None]:
evaluate(3)

EVALUATING AT BEAM SIZE 3: 100%|██████████| 5000/5000 [03:57<00:00, 21.07it/s]


----- Bleu-n Scores -----
1: 64.31220201306728
2: 45.5251368754951
3: 32.38723129370634
4: 23.07429032664538
-------------------------


In [None]:
evaluate(1)

EVALUATING AT BEAM SIZE 1: 100%|██████████| 5000/5000 [02:59<00:00, 27.82it/s]


----- Bleu-n Scores -----
1: 61.123197163806296
2: 42.936593246185
3: 29.775037258795304
4: 20.646167109205283
-------------------------


In [None]:
for i in range(1, 6):
    print('*'*15, f"Beam size of {i}", '*'*15)
    evaluate(i)

EVALUATING AT BEAM SIZE 1:   0%|          | 0/5000 [00:00<?, ?it/s]

*************** Beam size of 1 ***************


EVALUATING AT BEAM SIZE 1: 100%|██████████| 5000/5000 [02:59<00:00, 27.92it/s]


----- Bleu-n Scores -----
1: 61.123197163806296
2: 42.936593246185
3: 29.775037258795304


EVALUATING AT BEAM SIZE 2:   0%|          | 0/5000 [00:00<?, ?it/s]

4: 20.646167109205283
-------------------------
*************** Beam size of 2 ***************


EVALUATING AT BEAM SIZE 2: 100%|██████████| 5000/5000 [03:58<00:00, 20.99it/s]


----- Bleu-n Scores -----
1: 63.54625550660793
2: 45.03356444855022
3: 31.69343570783961


EVALUATING AT BEAM SIZE 3:   0%|          | 0/5000 [00:00<?, ?it/s]

4: 22.30082901251822
-------------------------
*************** Beam size of 3 ***************


EVALUATING AT BEAM SIZE 3: 100%|██████████| 5000/5000 [04:43<00:00, 17.62it/s]


----- Bleu-n Scores -----
1: 64.31220201306728
2: 45.5251368754951
3: 32.38723129370634


EVALUATING AT BEAM SIZE 4:   0%|          | 0/5000 [00:00<?, ?it/s]

4: 23.07429032664538
-------------------------
*************** Beam size of 4 ***************


EVALUATING AT BEAM SIZE 4: 100%|██████████| 5000/5000 [04:58<00:00, 16.76it/s]


----- Bleu-n Scores -----
1: 64.40847503864691
2: 45.8716912526639
3: 32.61201588518963


EVALUATING AT BEAM SIZE 5:   0%|          | 0/5000 [00:00<?, ?it/s]

4: 23.235058922423306
-------------------------
*************** Beam size of 5 ***************


EVALUATING AT BEAM SIZE 5: 100%|██████████| 5000/5000 [05:38<00:00, 14.77it/s]


----- Bleu-n Scores -----
1: 64.8227213662521
2: 46.212067416932584
3: 32.929193446645684
4: 23.41989863202648
-------------------------


### End-to-End Arabic VS Translated English

In [35]:
from torch.utils.data import Dataset, DataLoader

In [38]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
import torchvision.transforms as transfroms

from PIL import Image
import pandas as pd
from tqdm import tqdm
import pickle

class CaptionDataset(Dataset):
    """ 
    Caption Dataset Class
    """

    def __init__(self, imgs_dir, captions_file, vocab, transforms=None, split='train'):
        """
        :param imgs_dir: folder where images are stored
        :param captions_file: the df file with all caption information
        :param vocab: vocabuary object
        :param transforms: image transforms pipeline
        :param split: data split
        """

        # split has to be one of {'train', 'val', 'test'}
        assert split in {'train', 'val', 'test'}

        self.imgs_dir = imgs_dir
        self.df = pd.read_json(captions_file)
        self.df = self.df[self.df['split'] == split]
        self.vocab = vocab
        self.transforms = transforms
        self.split = split

        self.dataset_size = self.df.shape[0]
        # printing some info
        print(f"Dataset split: {split}")
        print(f"Unique images: {self.df.file_name.nunique()}")
        print(f"Total size: {self.dataset_size}")

    def __len__(self):
        return self.dataset_size

    def __getitem__(self, index):

        # loading the image
        img_id = self.df['file_name'].values[index]
        img = Image.open(self.imgs_dir+img_id).convert("RGB")

        if self.transforms is not None:
            img = self.transforms(img)
        else:
            img = transfroms.ToTensor()(img)

        # loading current caption
        cap_len = self.df['tok_len'].values[index] + 2 # <sos> and <eos>
        tokens = self.df['tokens'].values[index]
        caption = torch.LongTensor(self.vocab.numericalize(tokens, cap_len))

        if self.split is 'train':
            return img, caption, cap_len
        else:
            # for val and test return all captions for calculate the bleu scores
            captions_tokens = self.df[self.df['file_name'] == img_id].tokens.values
            captions_lens = self.df[self.df['file_name'] == img_id].tok_len.values
            all_tokens = []
            for token, cap_len in zip(captions_tokens, captions_lens):
                all_tokens.append(self.vocab.numericalize(token, cap_len)[1:]) # remove <sos>

            return img, caption, cap_len, torch.tensor(all_tokens), img_id

In [42]:
# getting the English captions 
def evaluate(beam_size):

    references = list()
    hypotheses = list()
    img_ids = list()

    # For each image
    for i, (image, caps, caplens, allcaps, img_id) in enumerate(
        tqdm(test_loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size), position=0, leave=True)):
        
        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[vocab.stoi['<sos>']]] * k).to(device)  # (k, 1)
        
        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0)  # (s)
          
            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)
            
#             print(top_k_scores, top_k_words)
            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                               next_word != vocab.stoi['<eos>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1
        
        if len(complete_seqs_scores) == 0:
            continue
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(lambda c: [w for w in c if w not in {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}],
                img_caps))  # remove <start> and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append([w for w in seq if w not in {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}])

        img_ids.append(img_id[0])

        # print(img_ids)
        # break
        assert len(references) == len(hypotheses) == len(img_ids)

    # Calculate BLEU-4 scores
#     bleu4 = corpus_bleu(references, hypotheses)
    return references, hypotheses, img_ids
    print_scores(references, hypotheses, nltk=True)

In [43]:
# getting the test data loader with shuffle=False
bs = 1
vocab_size = len(vocab)


test_loader = DataLoader(
    dataset=CaptionDataset(IMGS_PATH, DATA_JSON_PATH,
                            transforms=transform, vocab=vocab, split='test'),
    batch_size=bs,
    num_workers=2,
    shuffle=True,
    pin_memory=True
)

Dataset split: test
Unique images: 1000
Total size: 5000


In [44]:
refs, hypos, img_ids = evaluate(3)

EVALUATING AT BEAM SIZE 3: 100%|██████████| 5000/5000 [03:12<00:00, 26.00it/s]


In [46]:
len(refs), len(hypos), len(img_ids)

(4995, 4995, 4995)

In [47]:
df = pd.DataFrame.from_dict({"file_name": img_ids, "references": refs, "hypothesis": hypos})
df.head()

Unnamed: 0,file_name,references,hypothesis
0,2431470169_0eeba7d602.jpg,"[[4, 51, 104, 71, 99, 184, 821, 602, 1328, 468...","[4, 64, 62, 104, 34, 4, 264]"
1,2869491449_1041485a6b.jpg,"[[9, 4839, 774, 277, 20, 168, 34, 4, 384], [9,...","[9, 10, 167, 12, 13, 8, 243]"
2,801607443_f15956d1ce.jpg,"[[129, 317, 17, 104, 34, 8, 161, 215, 8, 144, ...","[4, 51, 17, 256, 4, 161]"
3,3716244806_97d5a1fb61.jpg,"[[4, 113, 104, 34, 320, 121, 55, 8, 69], [8, 2...","[4, 164, 19, 112, 104, 34, 4, 161]"
4,2358898017_24496b80e8.jpg,"[[4, 25, 22, 36, 5, 1385, 50, 11, 4, 290, 109]...","[4, 25, 5, 17, 12, 13, 8, 243]"


In [48]:
df.to_json("evaluation_df.json")

### Arabic 

In [63]:
# getting the test data loader with shuffle=False
bs = 1
vocab_size = len(vocab)
DATA_JSON_PATH = 'Image-Captioning/ar_data.json'


test_loader = DataLoader(
    dataset=CaptionDataset(IMGS_PATH, DATA_JSON_PATH,
                            transforms=transform, vocab=vocab, split='test'),
    batch_size=bs,
    num_workers=2,
    shuffle=True,
    pin_memory=True
)

Dataset split: test
Unique images: 1000
Total size: 3000


In [67]:
refs, hypos, img_ids = evaluate(3)

EVALUATING AT BEAM SIZE 3:   2%|▏         | 61/3000 [00:02<01:46, 27.66it/s]

KeyboardInterrupt: ignored

In [56]:
len(refs)

2997

In [57]:
len(hypos)

2997

In [45]:
df = pd.DataFrame.from_dict({"references": refs, "hypothesis": hypos})
df.to_json("evaluation_df.json")

In [50]:
refs[2]

[[3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3, 3], [3, 3, 3, 3, 3]]

In [None]:
addit_tokens = [vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']]

In [None]:
hypothesis_text = [" ".join([vocab.itos[i] for i in sent if i not in addit_tokens]) for sent in hypos]

In [None]:
hypothesis_text[0]

'a young boy in a blue shirt is running on the street'

In [None]:
import pandas as pd

In [59]:
# add hypothesis and references to a df
cap_df = pd.DataFrame.from_dict({"file_name": img_ids,"refs": refs, "hypos": hypos})

In [60]:
cap_df

Unnamed: 0,file_name,refs,hypos
0,1056338697_4f7d7ce270.jpg,"[[3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3, 3, 3, 3, ...","[4, 64, 62, 11, 4, 49, 47, 17, 12, 34, 8, 46]"
1,1056338697_4f7d7ce270.jpg,"[[3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3, 3, 3, 3, ...","[4, 64, 62, 11, 4, 49, 47, 17, 12, 34, 8, 46]"
2,1056338697_4f7d7ce270.jpg,"[[3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3, 3, 3, 3, ...","[4, 64, 62, 11, 4, 49, 47, 17, 12, 34, 8, 46]"
3,106490881_5a2dd9b7bd.jpg,"[[3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3, 3], [3, 3...","[4, 64, 62, 40, 4, 49, 47, 17, 59, 34, 8, 32]"
4,106490881_5a2dd9b7bd.jpg,"[[3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3, 3], [3, 3...","[4, 64, 62, 40, 4, 49, 47, 17, 59, 34, 8, 32]"
...,...,...,...
2992,979383193_0a542a059d.jpg,"[[3, 3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3], [3, 3...","[9, 136, 167, 104, 34, 4, 264]"
2993,979383193_0a542a059d.jpg,"[[3, 3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3], [3, 3...","[9, 136, 167, 104, 34, 4, 264]"
2994,997722733_0cb5439472.jpg,"[[3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3, 3], [3, 3...","[4, 51, 17, 256, 4, 161]"
2995,997722733_0cb5439472.jpg,"[[3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3, 3], [3, 3...","[4, 51, 17, 256, 4, 161]"


In [None]:
cap_df.file_name.nunique()

999

In [None]:
cap_df.to_csv("cap_eng_output.csv", index=False)

In [None]:
cap_df = pd.read_csv("cap_eng_output.csv")
cap_df.head()

Unnamed: 0,file_name,En hyps
0,1056338697_4f7d7ce270.jpg,a young boy in a blue shirt is running on the ...
1,1056338697_4f7d7ce270.jpg,a young boy in a blue shirt is running on the ...
2,1056338697_4f7d7ce270.jpg,a young boy in a blue shirt is running on the ...
3,106490881_5a2dd9b7bd.jpg,a young boy wearing a blue shirt is standing o...
4,106490881_5a2dd9b7bd.jpg,a young boy wearing a blue shirt is standing o...


In [None]:
cap_df = pd.read_excel("/content/cap_to_ar.xlsx")
cap_df.head()

Unnamed: 0,file_name,en_hyps,ar_trans
0,1056338697_4f7d7ce270.jpg,a young boy in a blue shirt is running on the ...,صبي صغير يرتدي قميصا أزرق يعمل على الشارع
1,1056338697_4f7d7ce270.jpg,a young boy in a blue shirt is running on the ...,صبي صغير يرتدي قميصا أزرق يعمل على الشارع
2,1056338697_4f7d7ce270.jpg,a young boy in a blue shirt is running on the ...,صبي صغير يرتدي قميصا أزرق يعمل على الشارع
3,106490881_5a2dd9b7bd.jpg,a young boy wearing a blue shirt is standing o...,صبي صغير يرتدي قميصا أزرق يقف على الشاطئ
4,106490881_5a2dd9b7bd.jpg,a young boy wearing a blue shirt is standing o...,صبي صغير يرتدي قميصا أزرق يقف على الشاطئ


In [None]:
df_org = pd.read_json("/content/Image-Captioning/ar_data.json")
df_test = df_org[df_org.split=='test']
df_test.head(5)

Unnamed: 0,file_name,caption,split,tok_len,tokens
126,1056338697_4f7d7ce270.jpg,امرأة شقراء في قميص أزرق تنتظر رحلة,test,7,"[امرأة, شقراء, في, قميص, أزرق, تنتظر, رحلة]"
127,1056338697_4f7d7ce270.jpg,امرأة شقراء في الشارع تشير الى سيارة أجرة,test,8,"[امرأة, شقراء, في, الشارع, تشير, الى, سيارة, أ..."
128,1056338697_4f7d7ce270.jpg,المرأة في الثوب الأزرق تمد بذراعها لحركة المرو...,test,9,"[المرأة, في, الثوب, الأزرق, تمد, بذراعها, لحرك..."
144,106490881_5a2dd9b7bd.jpg,صبي في ملابس السباحة الزرقاء على الشاطئ,test,7,"[صبي, في, ملابس, السباحة, الزرقاء, على, الشاطئ]"
145,106490881_5a2dd9b7bd.jpg,صبي يبتسم للكاميرا على الشاطئ,test,5,"[صبي, يبتسم, للكاميرا, على, الشاطئ]"


In [None]:
from nltk.translate.bleu_score import corpus_bleu

In [None]:
translated_caps = []
original_caps = []
for idx, i in enumerate(cap_df.file_name.to_list()):
  translated_caps.append(cap_df.iloc[idx].ar_trans.split())
  caps = []
  for k in df_test[df_test.file_name==i].caption.to_list():
    caps.append(k.split())
  original_caps.append(caps)

In [None]:
len(translated_caps), len(original_caps)

(2997, 2997)

In [None]:
translated_caps[1]

['صبي', 'صغير', 'يرتدي', 'قميصا', 'أزرق', 'يعمل', 'على', 'الشارع']

In [None]:
original_caps[1]

[['امرأة', 'شقراء', 'في', 'قميص', 'أزرق', 'تنتظر', 'رحلة'],
 ['امرأة', 'شقراء', 'في', 'الشارع', 'تشير', 'الى', 'سيارة', 'أجرة'],
 ['المرأة',
  'في',
  'الثوب',
  'الأزرق',
  'تمد',
  'بذراعها',
  'لحركة',
  'المرور',
  'القادمة']]

In [None]:
translated_caps[100]

['مجموعة', 'من', 'الناس', 'يقفون', 'أمام', 'مبنى']

In [None]:
original_caps[100]

[['رجل', 'وفتاة', 'يجلسان', 'على', 'الأرض', 'ويأكلان'],
 ['رجل', 'وفتاة', 'صغيرة', 'يجلسان', 'على', 'رصيف', 'قرب', 'حقيبة', 'زرقاء'],
 ['رجل', 'وفتاة', 'يأكلان', 'وجبة', 'في', 'أحد', 'شوارع', 'المدينة']]

In [None]:
print_scores(original_caps, translated_caps)

----- Bleu-n Scores -----
1: 34.233436623874866
2: 20.012890946720674
3: 11.004236540712997
4: 5.5992246799395
-------------------------


(34.233436623874866, 20.012890946720674, 11.004236540712997, 5.5992246799395)

In [None]:
original_caps[0]

['امرأة شقراء في قميص أزرق تنتظر رحلة',
 'امرأة شقراء في الشارع تشير الى سيارة أجرة',
 'المرأة في الثوب الأزرق تمد بذراعها لحركة المرور القادمة']

In [None]:
cap_df.ar_trans.nunique()

797

In [None]:
!wget https://kkb-production.jupyter-proxy.kaggle.net/k/67249843/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2IiwidHlwIjoiSldUIn0..QxozAcCRQtqJg9saA6D1_A.Dyw9Y41owBppV8NPupo93f_ChdCabBRRSRa4u1mX2l0eqah7jd0e63V1AO2Nj0viz2ftYrW-kBJDRfKnxjg201a08nxF-P1seOVy_bJoSZm9ZKZC37-H9OU_I63F5UZx1zbvLdDcIrjF0BCOBPFAaYtF39TofvrZuYHprJbHQ4HPbEGRL73PZQhvIaveoi1Vs_Vfwmy_bcg__5jUopuUCw.azKye_nBu246g19zT-1Tsw/proxy/files/exps.zip

--2021-07-02 09:50:28--  https://kkb-production.jupyter-proxy.kaggle.net/k/67249843/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2IiwidHlwIjoiSldUIn0..QxozAcCRQtqJg9saA6D1_A.Dyw9Y41owBppV8NPupo93f_ChdCabBRRSRa4u1mX2l0eqah7jd0e63V1AO2Nj0viz2ftYrW-kBJDRfKnxjg201a08nxF-P1seOVy_bJoSZm9ZKZC37-H9OU_I63F5UZx1zbvLdDcIrjF0BCOBPFAaYtF39TofvrZuYHprJbHQ4HPbEGRL73PZQhvIaveoi1Vs_Vfwmy_bcg__5jUopuUCw.azKye_nBu246g19zT-1Tsw/proxy/files/exps.zip
Resolving kkb-production.jupyter-proxy.kaggle.net (kkb-production.jupyter-proxy.kaggle.net)... 35.244.180.134
Connecting to kkb-production.jupyter-proxy.kaggle.net (kkb-production.jupyter-proxy.kaggle.net)|35.244.180.134|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 107999 (105K) [application/zip]
Saving to: ‘exps.zip’


2021-07-02 09:50:28 (258 KB/s) - ‘exps.zip’ saved [107999/107999]

