In [1]:
!nvidia-smi

Tue Jul  6 23:49:43 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.31       Driver Version: 465.31       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   86C    P2    54W /  N/A |   3658MiB /  6078MiB |     86%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from utils import load_checkpoint
from dataset import build_vocab, get_loaders, get_10k_vocab, top10k_vocab
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu
from utils import print_scores
import pandas as pd

In [4]:
# DATA_NAME = 'flickr8k_ar'

# local
DATA_JSON_PATH = 'data.json'
IMGS_PATH = 'flickr/Images/'
# kaggle paths
# DATA_JSON_PATH = '/kaggle/working/Image-Captioning/data.json'
# IMGS_PATH = '../input/flickr8kimagescaptions/flickr8k/images/'
#colab
# DATA_JSON_PATH = 'Image-Captioning/data.json'
# IMGS_PATH = 'flickr8k/images/'

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True

In [8]:
CHECKPOINT_PATH = 'models//run/media/kelwa/DEV/GP/Image-Captioning/models/BEST_checkpoint_flickr8k_5_cap_per_img_2_min_word_freq_resnet101_fullvocab_fix_ds_rmsprop_finetune.pth.tar'

In [9]:
# Load model
checkpoint = load_checkpoint(CHECKPOINT_PATH)
decoder = checkpoint['decoder']
decoder = decoder.to(device)
decoder.eval()
encoder = checkpoint['encoder']
encoder = encoder.to(device)
encoder.eval();

FileNotFoundError: [Errno 2] No such file or directory: 'models//run/media/kelwa/DEV/GP/Image-Captioning/models/BEST_checkpoint_flickr8k_5_cap_per_img_2_min_word_freq_resnet101_fullvocab_fix_ds_rmsprop_finetune.pth.tar'

In [21]:
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [111]:
vocab = build_vocab(DATA_JSON_PATH)
# top10k_words = get_10k_vocab("10k_words.txt")
# vocab = top10k_vocab(top10k_words)
vocab_len = len(vocab)
vocab_len

  0%|          | 0/1000 [11:12<?, ?it/s]
100%|██████████| 155070/155070 [00:00<00:00, 319657.23it/s]


12096

In [112]:
from eval import test_score

In [28]:
for i in range(1, 6, 2):
    test_score(i, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)

EVALUATING AT BEAM SIZE 1:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 1: 100%|██████████| 5000/5000 [03:03<00:00, 27.19it/s]


----- Bleu-n Scores -----
1: 60.22017745645745
2: 42.03301963954654
3: 28.865543593536668
4: 19.822668164309842
-------------------------


EVALUATING AT BEAM SIZE 3:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 5000/5000 [03:55<00:00, 21.23it/s]


----- Bleu-n Scores -----
1: 64.22550530227498
2: 45.20326452084617
3: 31.54755348941619
4: 21.90451521810326
-------------------------


EVALUATING AT BEAM SIZE 5:   0%|          | 0/5000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 5000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 5000/5000 [05:07<00:00, 16.25it/s]


----- Bleu-n Scores -----
1: 65.29163468917882
2: 45.991178094289644
3: 32.14624016285145
4: 22.262874291057862
-------------------------


In [113]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
import torchvision.transforms as transfroms

from PIL import Image
import pandas as pd
from tqdm import tqdm
import pickle

class CaptionDataset(Dataset):
    """ 
    Caption Dataset Class
    """

    def __init__(self, imgs_dir, captions_file, vocab, transforms=None, split='train'):
        """
        :param imgs_dir: folder where images are stored
        :param captions_file: the df file with all caption information
        :param vocab: vocabuary object
        :param transforms: image transforms pipeline
        :param split: data split
        """

        # split has to be one of {'train', 'val', 'test'}
        assert split in {'train', 'val', 'test'}

        self.imgs_dir = imgs_dir
        self.df = pd.read_json(captions_file)
        self.df = self.df[self.df['split'] == split]
        self.vocab = vocab
        self.transforms = transforms
        self.split = split

        self.dataset_size = self.df.shape[0]
        # printing some info
        print(f"Dataset split: {split}")
        print(f"Unique images: {self.df.file_name.nunique()}")
        print(f"Total size: {self.dataset_size}")

    def __len__(self):
        return self.dataset_size

    def __getitem__(self, index):

        # loading the image
        img_id = self.df['file_name'].values[index]
        img = Image.open(self.imgs_dir+img_id).convert("RGB")

        if self.transforms is not None:
            img = self.transforms(img)
        else:
            img = transfroms.ToTensor()(img)

        # loading current caption
        cap_len = self.df['tok_len'].values[index] + 2 # <sos> and <eos>
        tokens = self.df['tokens'].values[index]
        caption = torch.LongTensor(self.vocab.numericalize(tokens, cap_len))

        if self.split is 'train':
            return img, caption, cap_len
        else:
            # for val and test return all captions for calculate the bleu scores
            captions_tokens = self.df[self.df['file_name'] == img_id].tokens.values
            captions_lens = self.df[self.df['file_name'] == img_id].tok_len.values
            all_tokens = []
            for token, cap_len in zip(captions_tokens, captions_lens):
                all_tokens.append(self.vocab.numericalize(token, cap_len)[1:]) # remove <sos>

            return img, caption, cap_len, torch.tensor(all_tokens), img_id

In [114]:
bs = 1

loader = DataLoader(
            dataset=CaptionDataset(IMGS_PATH, DATA_JSON_PATH,
                                    transforms=transform, vocab=vocab, split='test'),
            batch_size=bs,
            num_workers=7,
            shuffle=True,
            pin_memory=True
        )

Dataset split: test
Unique images: 1000
Total size: 5000


In [115]:
def evaluate(beam_size):

    references = list()
    hypotheses = list()
    img_ids = list()
    
    # For each image
    for i, (image, caps, caplens, allcaps, img_id) in enumerate(
        tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size), position=0, leave=True)):
        
        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[vocab.stoi['<sos>']]] * k).to(device)  # (k, 1)
        
        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0)  # (s)
          
            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)
            
#             print(top_k_scores, top_k_words)
            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                               next_word != vocab.stoi['<eos>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1
        
        if len(complete_seqs_scores) == 0:
            continue
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(lambda c: [w for w in c if w not in {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}],
                img_caps))  # remove <start> and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append([w for w in seq if w not in {vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']}])
        
        img_ids.append(img_id[0])
        assert len(references) == len(hypotheses) == len(img_ids)
    # Calculate BLEU-4 scores
#     bleu4 = corpus_bleu(references, hypotheses)
    return references, hypotheses, img_ids
    print_scores(references, hypotheses, nltk=True)


In [None]:
vocab_size = vocab_len
references, hypotheses, img_ids = evaluate(5)

EVALUATING AT BEAM SIZE 5:  40%|███▉      | 1998/5000 [02:15<03:10, 15.73it/s]

In [81]:
df = pd.DataFrame.from_dict({"file_name":img_ids, "references":references, "hypothesis": hypotheses})

In [82]:
df.head()

Unnamed: 0,file_name,references,hypothesis
0,3028145992.jpg,"[[4, 20, 5, 4, 22, 56, 10, 153, 1982, 13, 4, 3...","[4, 34, 8, 18, 52, 30, 6, 24]"
1,3643021980.jpg,"[[4, 31, 54, 5, 4, 180, 174, 650, 631, 71, 6, ...","[4, 31, 54, 11, 338, 7, 4, 394]"
2,226481576.jpg,"[[4, 377, 554, 1036, 40, 15, 1246, 153, 10, 46...","[4, 12, 5, 4, 43, 56, 10, 463, 11, 25, 4, 183]"
3,1153704539.jpg,"[[4, 31, 54, 40, 4, 43, 56, 1908, 5, 4, 637, 4...","[4, 12, 5, 4, 43, 56, 10, 22, 364, 11, 47, 4, ..."
4,4808256003.jpg,"[[14, 114, 65, 19, 15, 342, 2100, 32, 197, 366...","[4, 12, 5, 4, 36, 56, 11, 21, 5, 38, 8, 4, 352]"


In [83]:
df.to_json("predicted_captions_30.json")

In [90]:
references = []
hypothesis = []
for fname in tqdm(df.file_name.unique()):
  references.append(df[df.file_name==fname].references.to_list()[0])
  hypothesis.append(df[df.file_name==fname].hypothesis.to_list()[0])


  0%|          | 0/1000 [00:00<?, ?it/s][A
  5%|▍         | 47/1000 [00:00<00:02, 466.23it/s][A
 10%|█         | 103/1000 [00:00<00:01, 520.01it/s][A
 16%|█▋        | 163/1000 [00:00<00:01, 556.20it/s][A
 23%|██▎       | 230/1000 [00:00<00:01, 598.90it/s][A
 29%|██▉       | 290/1000 [00:00<00:01, 585.91it/s][A
 36%|███▌      | 357/1000 [00:00<00:01, 612.12it/s][A
 42%|████▏     | 419/1000 [00:00<00:00, 606.19it/s][A
 49%|████▉     | 488/1000 [00:00<00:00, 630.35it/s][A
 55%|█████▌    | 552/1000 [00:00<00:00, 619.04it/s][A
 61%|██████▏   | 614/1000 [00:01<00:00, 589.35it/s][A
 67%|██████▋   | 674/1000 [00:01<00:00, 539.84it/s][A
 73%|███████▎  | 729/1000 [00:01<00:00, 538.93it/s][A
 79%|███████▉  | 790/1000 [00:01<00:00, 557.38it/s][A
 85%|████████▍ | 849/1000 [00:01<00:00, 565.39it/s][A
 91%|█████████ | 906/1000 [00:01<00:00, 558.83it/s][A
100%|██████████| 1000/1000 [00:01<00:00, 572.05it/s][A


In [92]:
preds_tokens = [" ".join([vocab.itos[i] for i in seq]) for seq in tqdm(hypothesis)]

refes_tokens = []
for ref in tqdm(references):
  refes_tokens.append([" ".join([vocab.itos[i] for i in seq]) for seq in ref]) 



100%|██████████| 1000/1000 [00:00<00:00, 297658.36it/s]


  0%|          | 0/1000 [00:41<?, ?it/s][A[A
100%|██████████| 1000/1000 [00:00<00:00, 42332.93it/s]


In [95]:
hypo = {idx: [tokens] for idx, tokens in enumerate(preds_tokens)}
refs = {idx: tokens for idx, tokens in enumerate(refes_tokens)}

In [96]:
len(refs), len(hypo)

(1000, 1000)

### Metrics 

In [97]:
!pip install "git+https://github.com/salaniz/pycocoevalcap.git"

Collecting git+https://github.com/salaniz/pycocoevalcap.git
  Cloning https://github.com/salaniz/pycocoevalcap.git to /tmp/pip-req-build-4ckm44vg
  Running command git clone -q https://github.com/salaniz/pycocoevalcap.git /tmp/pip-req-build-4ckm44vg
Collecting pycocotools>=2.0.2
  Downloading pycocotools-2.0.2.tar.gz (23 kB)
Collecting cython>=0.27.3
  Using cached Cython-0.29.23-cp37-cp37m-manylinux1_x86_64.whl (2.0 MB)
Building wheels for collected packages: pycocoevalcap, pycocotools
  Building wheel for pycocoevalcap (setup.py) ... [?25ldone
[?25h  Created wheel for pycocoevalcap: filename=pycocoevalcap-1.2-py3-none-any.whl size=104312215 sha256=1d7914621181b950036c9496e47d66a66a01a9cdddb07c161864c07483ebc1f0
  Stored in directory: /tmp/pip-ephem-wheel-cache-417vz9_y/wheels/6f/c9/51/e266f0496048c16686e133d8e33644d692931a356bfb372aae
  Building wheel for pycocotools (setup.py) ... [?25ldone
[?25h  Created wheel for pycocotools: filename=pycocotools-2.0.2-cp37-cp37m-linux_x86_64.

In [98]:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.spice.spice import Spice

#### BLEU 

In [99]:
score, scores = Bleu(4).compute_score(refs, hypo)
score

{'testlen': 10424, 'reflen': 10402, 'guess': [10424, 9424, 8424, 7424], 'correct': [6806, 3053, 1323, 549]}
ratio: 1.0021149778887712


[0.6529163468917255,
 0.45991178094284996,
 0.32146240162848005,
 0.22262874291055326]

#### Meteor 

In [100]:
score, scores = Meteor().compute_score(refs, hypo)
score

0.18458572048818053

#### Rouge 

In [101]:
score, scores = Rouge().compute_score(refs, hypo)
score

0.44392078717000505

#### Cider 

In [102]:
score, scores = Cider().compute_score(refs, hypo)
score

0.42666098298752114

#### Spice 

In [106]:
# score, scores = Spice().compute_score(refs, hypo)
# score

CalledProcessError: Command '['java', '-jar', '-Xmx8G', 'spice-1.0.jar', '/home/kelwa/anaconda3/envs/kaggle_torch/lib/python3.7/site-packages/pycocoevalcap/spice/tmp/tmp2zxdf5ks', '-cache', '/home/kelwa/anaconda3/envs/kaggle_torch/lib/python3.7/site-packages/pycocoevalcap/spice/cache', '-out', '/home/kelwa/anaconda3/envs/kaggle_torch/lib/python3.7/site-packages/pycocoevalcap/spice/tmp/tmpf4c0j_qd', '-subset', '-silent']' returned non-zero exit status 1.