In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [3]:
# Model parameters
# NOTE(review): the cells visible in this notebook only evaluate saved checkpoints and do not
# read these model/training hyper-parameters by name; presumably they mirror the training
# script's configuration — confirm before relying on or removing them.
encoder_dim = 2048 # resnet101
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5  # dropout probability — presumably used by the decoder; TODO confirm against models.py
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# training parameters
epochs = 30  # number of epochs to train for (if early stopping is not triggered)
batch_size = 256  # presumably the training mini-batch size; the evaluation loaders built later use batch_size=1
workers = 2  # presumably the number of DataLoader workers for training — TODO confirm
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
fine_tune_encoder = False  # fine-tune encoder?
pretrained_embeddings = False  # presumably toggles loading pretrained word embeddings — TODO confirm
fine_tune_embeddings = False  # presumably toggles updating the embedding layer during training — TODO confirm
checkpoint = None  # path to checkpoint, None if none; NOTE: later cells rebind `checkpoint` to the result of load_checkpoint(...)

# Old Preprocessing to Arabert  

In [4]:
DATA_JSON_PATH = 'old_ar_data.json'
IMGS_PATH = 'flickr/Images/'
DATA_NAME = 'TESTING'

In [5]:
# Build the vocabulary from the caption JSON; max_seq is forwarded to build_vocab.
max_seq = 30
vocab = build_vocab(DATA_JSON_PATH, max_seq=max_seq)

# Keep the size around and show it as the cell's output.
vocab_len = len(vocab)
vocab_len

100%|██████████| 24000/24000 [00:00<00:00, 312155.55it/s]


5788

In [6]:
list(vocab.itos.keys())[:10], list(vocab.itos.values())[:10]

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 ['<pad>',
  '<sos>',
  '<eos>',
  '<unk>',
  'طفلة',
  'صغيرة',
  'تتسلق',
  'إلى',
  'كلب',
  'أسود'])

In [7]:
m = load_checkpoint("models/BEST_checkpoint_flickr8k_ar_finetune.pth.tar")

Loaded Checkpoint!!
Last Epoch: 19
Best Bleu-4: 6.862300456763069


In [8]:
encoder = m['encoder'].eval()
decoder = m['decoder'].eval()

In [9]:
from eval import test_score

# Evaluate the loaded model at beam sizes 1 through 5; test_score prints the
# BLEU-1..4 table for each run, the returned scores are kept from the last run only.
for beam_size in range(1, 6):
    b1, b2, b3, b4 = test_score(beam_size, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)

Dataset split: test
Unique images: 1000
Total size: 3000


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
EVALUATING AT BEAM SIZE 1: 100%|██████████| 1000/1000 [00:26<00:00, 37.96it/s]


----- Bleu-n Scores -----
1: 39.955
2: 25.779
3: 15.108
4: 8.647
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 2: 100%|██████████| 1000/1000 [00:28<00:00, 35.00it/s]


----- Bleu-n Scores -----
1: 40.689
2: 26.983
3: 16.460
4: 9.632
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 1000/1000 [00:31<00:00, 31.47it/s]


----- Bleu-n Scores -----
1: 40.374
2: 26.616
3: 16.097
4: 9.385
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 4: 100%|██████████| 1000/1000 [00:34<00:00, 29.30it/s]


----- Bleu-n Scores -----
1: 40.489
2: 26.861
3: 16.300
4: 9.621
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 1000/1000 [00:37<00:00, 26.78it/s]


----- Bleu-n Scores -----
1: 39.848
2: 26.475
3: 15.930
4: 9.330
-------------------------


### Saving the results of the best beam size 

In [10]:
references, hypotheses = test_score(2, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab, return_results=True)

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 2: 100%|██████████| 1000/1000 [00:28<00:00, 35.15it/s]


----- Bleu-n Scores -----
1: 40.689
2: 26.983
3: 16.460
4: 9.632
-------------------------


In [11]:
# Map vocabulary ids back to token strings. `references` is nested one level deeper
# than `hypotheses`: each entry holds several reference sequences.
references_old_tokens = [
    [[vocab.itos[token_id] for token_id in caption] for caption in captions]
    for captions in references
]
hypotheses_old_tokens = [
    [vocab.itos[token_id] for token_id in prediction]
    for prediction in hypotheses
]

In [11]:
from arabert.preprocess import ArabertPreprocessor
import pyarabic.araby as araby


model_name = "aubmindlab/bert-base-arabertv2"
arabert_prep = ArabertPreprocessor(model_name=model_name)



In [13]:
def _to_arabert_tokens(tokens):
    """Join tokens into a sentence, run the AraBERT preprocessor on it, and re-tokenise with pyarabic."""
    sentence = " ".join(tokens)
    return araby.tokenize(arabert_prep.preprocess(sentence))


# Re-express the old model's hypotheses and references in AraBERT tokens.
hypotheses_ara_tokens = [_to_arabert_tokens(prediction) for prediction in hypotheses_old_tokens]
references_ara_tokens = [
    [_to_arabert_tokens(caption) for caption in captions]
    for captions in references_old_tokens
]

In [14]:
print_scores(references_ara_tokens, hypotheses_ara_tokens)

----- Bleu-n Scores -----
1: 54.853
2: 43.938
3: 34.767
4: 27.802
-------------------------


(54.853, 43.938, 34.767, 27.802)

# Arabert to old Preprocessing

In [15]:
DATA_JSON_PATH = 'data/ar_data.json'
IMGS_PATH = 'flickr/Images/'
DATA_NAME = 'TESTING'

In [16]:
# Build the vocabulary from the AraBERT-preprocessed caption JSON; max_seq is
# forwarded to build_vocab.
max_seq = 65
vocab = build_vocab(DATA_JSON_PATH, max_seq=max_seq)

# Keep the size around and show it as the cell's output.
vocab_len = len(vocab)
vocab_len

100%|██████████| 24000/24000 [00:00<00:00, 255904.98it/s]


3309

In [17]:
list(vocab.itos.keys())[:10], list(vocab.itos.values())[:10]

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 ['<pad>', '<sos>', '<eos>', '<unk>', '+', 'ة', 'طفل', 'صغير', 'تتسلق', 'إلى'])

In [18]:
# Restore the saved checkpoint and switch both networks to eval mode.
m = load_checkpoint("ar_models/BEST_checkpoint_flickr8k_ar_arabert_pretrained_finetune.pth.tar")
encoder, decoder = m['encoder'].eval(), m['decoder'].eval()

Loaded Checkpoint!!
Last Epoch: 9
Best Bleu-4: 24.949378413361714


In [19]:
from eval import test_score

# Evaluate the loaded model at beam sizes 1 through 5; test_score prints the
# BLEU-1..4 table for each run, the returned scores are kept from the last run only.
for beam_size in range(1, 6):
    b1, b2, b3, b4 = test_score(beam_size, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 1: 100%|██████████| 1000/1000 [00:40<00:00, 24.86it/s]


----- Bleu-n Scores -----
1: 59.271
2: 45.524
3: 33.589
4: 24.919
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 2: 100%|██████████| 1000/1000 [00:44<00:00, 22.54it/s]


----- Bleu-n Scores -----
1: 59.686
2: 46.882
3: 35.446
4: 26.922
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 1000/1000 [00:52<00:00, 19.14it/s]


----- Bleu-n Scores -----
1: 60.326
2: 47.554
3: 36.147
4: 27.524
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 4: 100%|██████████| 1000/1000 [01:00<00:00, 16.49it/s]


----- Bleu-n Scores -----
1: 59.512
2: 47.318
3: 36.330
4: 27.893
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 1000/1000 [01:03<00:00, 15.68it/s]


----- Bleu-n Scores -----
1: 58.678
2: 46.857
3: 36.146
4: 27.864
-------------------------


In [20]:
references, hypotheses = test_score(4, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab, return_results=True)

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 4: 100%|██████████| 1000/1000 [01:00<00:00, 16.41it/s]


----- Bleu-n Scores -----
1: 59.512
2: 47.318
3: 36.330
4: 27.893
-------------------------


In [21]:
# Map vocabulary ids back to token strings. `references` is nested one level deeper
# than `hypotheses`: each entry holds several reference sequences.
references_ara_tokens = [
    [[vocab.itos[token_id] for token_id in caption] for caption in captions]
    for captions in references
]
hypotheses_ara_tokens = [
    [vocab.itos[token_id] for token_id in prediction]
    for prediction in hypotheses
]

In [22]:
# Join each token list into one string and let the AraBERT preprocessor undo its
# own segmentation, giving plain sentences for references and hypotheses.
references_sent_tokens = [
    [arabert_prep.unpreprocess(' '.join(caption)) for caption in captions]
    for captions in references_ara_tokens
]
hypotheses_sent_tokens = [
    arabert_prep.unpreprocess(' '.join(prediction))
    for prediction in hypotheses_ara_tokens
]
assert len(references_sent_tokens) == len(hypotheses_sent_tokens)

In [23]:
# Tokenise the de-segmented sentences with pyarabic so they can be scored in the
# old token space.
references_old_tokens = [
    [araby.tokenize(sentence) for sentence in sentences]
    for sentences in references_sent_tokens
]
hypotheses_old_tokens = [araby.tokenize(sentence) for sentence in hypotheses_sent_tokens]
assert len(references_old_tokens) == len(hypotheses_old_tokens)

In [24]:
print_scores(references_old_tokens, hypotheses_old_tokens)

----- Bleu-n Scores -----
1: 32.268
2: 20.747
3: 11.739
4: 6.022
-------------------------


(32.268, 20.747, 11.739, 6.022)

# Saving Predictions info
- Ground truth captions [separate df]
- Araby captions and Arabert captions with their number of tokens (print the number of words later)

### Getting the ground truth captions  

In [9]:
import pandas as pd

# Load every caption record, keep only the rows of the test split, and drop the
# columns that are not needed for side-by-side comparison.
df = pd.read_json('old_ar_data.json')
is_test = df['split'] == 'test'
ground_truth = df.loc[is_test].drop(columns=['split', 'tokens'])
ground_truth.head()

Unnamed: 0,file_name,caption,tok_len
126,1056338697_4f7d7ce270.jpg,امرأة شقراء في قميص أزرق تنتظر رحلة,7
127,1056338697_4f7d7ce270.jpg,امرأة شقراء في الشارع تشير الى سيارة أجرة,8
128,1056338697_4f7d7ce270.jpg,المرأة في الثوب الأزرق تمد بذراعها لحركة المرو...,9
144,106490881_5a2dd9b7bd.jpg,صبي في ملابس السباحة الزرقاء على الشاطئ,7
145,106490881_5a2dd9b7bd.jpg,صبي يبتسم للكاميرا على الشاطئ,5


## Old Predictions 

In [3]:
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from caption import CaptionDataset, caption_image
from utils import load_checkpoint, print_scores
from dataset import build_vocab
import pandas as pd
from tqdm import tqdm

In [27]:
model_path = 'models/BEST_checkpoint_flickr8k_ar_finetune.pth.tar'
IMGS_PATH = 'flickr/Images/'
DATA_JSON_PATH = 'old_ar_data.json'

In [28]:
# Vocabulary for the captions in DATA_JSON_PATH.
vocab = build_vocab(DATA_JSON_PATH)
print('len:', len(vocab))

# Resize, centre-crop to 224 and normalise with the mean/std that torchvision
# documents for its ImageNet-pretrained models.
preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)

# Test split, one sample per batch.
bs = 1
test_dataset = CaptionDataset(IMGS_PATH, DATA_JSON_PATH,
                              transforms=transform, vocab=vocab, split='test')
loader = DataLoader(dataset=test_dataset, batch_size=bs, num_workers=7, pin_memory=True)

print('-'*20)
# Restore the saved encoder/decoder pair and switch both to eval mode.
checkpoint = load_checkpoint(model_path)
encoder = checkpoint['encoder'].eval()
decoder = checkpoint['decoder'].eval()

100%|██████████| 24000/24000 [00:00<00:00, 446297.51it/s]


len: 5788
Dataset split: test
Unique images: 1000
Total size: 3000
--------------------
Loaded Checkpoint!!
Last Epoch: 19
Best Bleu-4: 6.862300456763069


In [29]:
references, hypothesis, img_ids = caption_image(loader, vocab, encoder,
                                               decoder, 2)

EVALUATING AT BEAM SIZE 2: 100%|██████████| 1000/1000 [00:30<00:00, 32.81it/s]

Number of NO COMPLETED:  0





In [30]:
# Turn every predicted id sequence back into its token strings.
hypotheses_tokens = [
    [vocab.itos[token_id] for token_id in prediction]
    for prediction in hypothesis
]

In [31]:
# One row per captioned image: its file name and the old model's token list.
old_columns = {"file_name": img_ids, "old_hypotheses": hypotheses_tokens}
df = pd.DataFrame(old_columns)
df.head()

Unnamed: 0,file_name,old_hypotheses
0,1056338697_4f7d7ce270.jpg,"[رجل, في, قميص, أزرق, يقفز, في, الهواء]"
1,106490881_5a2dd9b7bd.jpg,"[صبي, صغير, في, قميص, أزرق, يركض, على, الشاطئ]"
2,1082379191_ec1e53f996.jpg,"[رجل, يجلس, على, لوح, التزلج, في, الهواء]"
3,1084040636_97d9633581.jpg,"[كلب, أبيض, يركض, على, العشب]"
4,1096395242_fc69f0ae5a.jpg,"[صبي, صغير, يلعب, في, الشارع]"


In [32]:
# Number of tokens in each old-model hypothesis.
df['old_tok_len'] = df['old_hypotheses'].map(len).to_numpy()
df.head()

Unnamed: 0,file_name,old_hypotheses,old_tok_len
0,1056338697_4f7d7ce270.jpg,"[رجل, في, قميص, أزرق, يقفز, في, الهواء]",7
1,106490881_5a2dd9b7bd.jpg,"[صبي, صغير, في, قميص, أزرق, يركض, على, الشاطئ]",8
2,1082379191_ec1e53f996.jpg,"[رجل, يجلس, على, لوح, التزلج, في, الهواء]",7
3,1084040636_97d9633581.jpg,"[كلب, أبيض, يركض, على, العشب]",5
4,1096395242_fc69f0ae5a.jpg,"[صبي, صغير, يلعب, في, الشارع]",5


In [33]:
df['file_name'].nunique()

1000

## Arabert Predictions  

In [34]:
model_path = 'ar_models/BEST_checkpoint_flickr8k_ar_arabert_pretrained_finetune.pth.tar'
IMGS_PATH = 'flickr/Images/'
DATA_JSON_PATH = 'data/ar_data.json'

In [35]:
# Vocabulary for the captions in DATA_JSON_PATH.
vocab = build_vocab(DATA_JSON_PATH)
print('len:', len(vocab))

# Resize, centre-crop to 224 and normalise with the mean/std that torchvision
# documents for its ImageNet-pretrained models.
preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)

# Test split, one sample per batch.
bs = 1
test_dataset = CaptionDataset(IMGS_PATH, DATA_JSON_PATH,
                              transforms=transform, vocab=vocab, split='test')
loader = DataLoader(dataset=test_dataset, batch_size=bs, num_workers=7, pin_memory=True)

print('-'*20)
# Restore the saved encoder/decoder pair and switch both to eval mode.
checkpoint = load_checkpoint(model_path)
encoder = checkpoint['encoder'].eval()
decoder = checkpoint['decoder'].eval()

100%|██████████| 24000/24000 [00:00<00:00, 232893.97it/s]


len: 3309
Dataset split: test
Unique images: 1000
Total size: 3000
--------------------
Loaded Checkpoint!!
Last Epoch: 9
Best Bleu-4: 24.949378413361714


In [36]:
# Caption the test loader with beam size 4, then turn every predicted id sequence
# back into its token strings.
references, hypothesis, img_ids = caption_image(loader, vocab, encoder, decoder, 4)
hypotheses_tokens = [
    [vocab.itos[token_id] for token_id in prediction]
    for prediction in hypothesis
]

EVALUATING AT BEAM SIZE 4: 100%|██████████| 1000/1000 [01:00<00:00, 16.44it/s]

Number of NO COMPLETED:  25





In [37]:
# One row per captioned image: its file name and the AraBERT model's token list.
arabert_columns = {"file_name": img_ids, "hypotheses": hypotheses_tokens}
rest_df = pd.DataFrame(arabert_columns)
rest_df.head()

Unnamed: 0,file_name,hypotheses
0,1056338697_4f7d7ce270.jpg,"[فتا, +, ة, صغير, +, ة, في, ال, +, هواء]"
1,106490881_5a2dd9b7bd.jpg,"[صبي, صغير, يقفز, في, ال, +, ماء]"
2,1082379191_ec1e53f996.jpg,"[رجل, و, +, امرأ, +, ة, يقف, +, ان, في, ال, +,..."
3,1084040636_97d9633581.jpg,"[ال, +, كلب, ال, +, كلب, ال, +, بني, و, +, ال,..."
4,1096395242_fc69f0ae5a.jpg,"[فتا, +, ة, صغير, +, ة, صغير, +, ة, صغير, +, ة..."


In [38]:
# Number of tokens (segments and '+' markers included) in each AraBERT hypothesis.
rest_df['tok_len'] = rest_df['hypotheses'].map(len).to_numpy()
rest_df.head()

Unnamed: 0,file_name,hypotheses,tok_len
0,1056338697_4f7d7ce270.jpg,"[فتا, +, ة, صغير, +, ة, في, ال, +, هواء]",10
1,106490881_5a2dd9b7bd.jpg,"[صبي, صغير, يقفز, في, ال, +, ماء]",7
2,1082379191_ec1e53f996.jpg,"[رجل, و, +, امرأ, +, ة, يقف, +, ان, في, ال, +,...",13
3,1084040636_97d9633581.jpg,"[ال, +, كلب, ال, +, كلب, ال, +, بني, و, +, ال,...",19
4,1096395242_fc69f0ae5a.jpg,"[فتا, +, ة, صغير, +, ة, صغير, +, ة, صغير, +, ة...",19


In [39]:
rest_df['file_name'].nunique()

975

In [40]:
# Inner join on file_name: only images that have BOTH an old-pipeline and an AraBERT
# hypothesis survive. rest_df has fewer distinct file names than df (975 vs 1000 in the
# recorded run), so report what is being dropped instead of losing it silently.
missing_from_arabert = sorted(set(df['file_name']) - set(rest_df['file_name']))
if missing_from_arabert:
    print(f"Dropping {len(missing_from_arabert)} image(s) with no AraBERT hypothesis, "
          f"e.g. {missing_from_arabert[:5]}")
full_df = df.merge(rest_df, on='file_name', how='inner')

In [41]:
full_df.sample(20)

Unnamed: 0,file_name,old_hypotheses,old_tok_len,hypotheses,tok_len
829,3692593096_fbaea67476.jpg,"[شخص, يقف, على, قمة, الجبل]",5,"[راكب, ال, +, دراج, +, ة, في, ال, +, هواء]",10
521,3135504530_0f4130d8f8.jpg,"[امرأة, في, قميص, أبيض]",4,"[امرأ, +, ة, صغير, +, ة, في, ال, +, شارع]",10
29,1311388430_4ab0cd1a1f.jpg,"[شخص, يقفز, في, الهواء]",4,"[فتا, +, ة, في, ال, +, هواء]",7
459,3006093003_c211737232.jpg,"[رجل, يرتدي, سترة, سوداء, وقبعة, <unk>]",6,"[رجل, و, +, امرأ, +, ة, يقف, +, ان, في, ال, +,...",13
35,1339596997_8ac29c1841.jpg,"[امرأة, ترتدي, ملابس, سوداء, في, الشارع]",6,"[مجموع, +, ة, من, ال, +, ناس, في, ال, +, شارع]",11
737,3506560025_8d0f4f9ac4.jpg,"[امرأة, في, قميص, أزرق]",4,"[فتا, +, ة, صغير, +, ة, ترتدي, قميص, +, ا, أزر...",16
671,3393926562_66cc01b001.jpg,"[رجل, يرتدي, سترة, سوداء, يقف, على, الرصيف]",7,"[امرأ, +, ة, صغير, +, ة, في, ال, +, شارع]",10
784,3585598356_8ce815bbb9.jpg,"[امرأة, في, ثوب, السباحة]",4,"[مجموع, +, ة, من, ال, +, ناس, في, ال, +, ماء]",11
613,326456451_effadbbe49.jpg,"[كلب, أسود, يقفز, في, الهواء]",5,"[كلب, بني, يقفز, في, ال, +, هواء]",7
631,3317073508_7e13565c1b.jpg,"[مجموعة, من, لاعبي, كرة, القدم]",5,"[مجموع, +, ة, من, لاعب, +, ي, كر, +, ة, ال, +,...",13


In [42]:
# Space-join each old-model token list into a caption string.
full_df['old_captions'] = full_df['old_hypotheses'].map(" ".join).to_numpy()

In [43]:
# Space-join the AraBERT segments and let the preprocessor undo its segmentation,
# giving a plain caption string per row.
full_df['captions'] = full_df['hypotheses'].map(
    lambda segments: arabert_prep.unpreprocess(' '.join(segments))
).to_numpy()

In [44]:
full_df.head()

Unnamed: 0,file_name,old_hypotheses,old_tok_len,hypotheses,tok_len,old_captions,captions
0,1056338697_4f7d7ce270.jpg,"[رجل, في, قميص, أزرق, يقفز, في, الهواء]",7,"[فتا, +, ة, صغير, +, ة, في, ال, +, هواء]",10,رجل في قميص أزرق يقفز في الهواء,فتاة صغيرة في الهواء
1,106490881_5a2dd9b7bd.jpg,"[صبي, صغير, في, قميص, أزرق, يركض, على, الشاطئ]",8,"[صبي, صغير, يقفز, في, ال, +, ماء]",7,صبي صغير في قميص أزرق يركض على الشاطئ,صبي صغير يقفز في الماء
2,1082379191_ec1e53f996.jpg,"[رجل, يجلس, على, لوح, التزلج, في, الهواء]",7,"[رجل, و, +, امرأ, +, ة, يقف, +, ان, في, ال, +,...",13,رجل يجلس على لوح التزلج في الهواء,رجل وامرأة يقفان في الماء
3,1084040636_97d9633581.jpg,"[كلب, أبيض, يركض, على, العشب]",5,"[ال, +, كلب, ال, +, كلب, ال, +, بني, و, +, ال,...",19,كلب أبيض يركض على العشب,الكلب الكلب البني والأبيض والأبيض
4,1096395242_fc69f0ae5a.jpg,"[صبي, صغير, يلعب, في, الشارع]",5,"[فتا, +, ة, صغير, +, ة, صغير, +, ة, صغير, +, ة...",19,صبي صغير يلعب في الشارع,فتاة صغيرة صغيرة صغيرة صغيرة في الشارع


# Saving individual BLEU scores 

In [14]:
def calculate_scores(row):
    """Score one image's captions under every tokenisation/model pairing.

    Four experiments are evaluated with ``print_scores`` (printing disabled):

    * old processing, old results - pyarabic-tokenised references vs. ``row['old_hypotheses']``
    * new processing, new results - AraBERT-preprocessed references vs. ``row['hypotheses']``
    * new processing, old results - AraBERT-preprocessed references vs. ``row['old_captions']``
      pushed through the AraBERT preprocessor
    * old processing, new results - pyarabic-tokenised references vs. ``row['captions']``
      tokenised with pyarabic

    Parameters
    ----------
    row : mapping-like (e.g. one row of ``full_df``)
        Must provide ``file_name``, ``old_hypotheses``, ``hypotheses``,
        ``old_captions`` and ``captions``.

    Returns
    -------
    pandas.DataFrame
        One row per experiment with columns ``Exp``, ``b1``..``b4``, placed side by
        side (by position) with this image's rows of the module-level
        ``ground_truth`` frame; cells left empty by the length mismatch are
        filled with ``'_'``.
    """
    # Ground-truth rows for this image: looked up once and reused both for the
    # reference captions and for the display columns of the result.
    gt_df = ground_truth[ground_truth.file_name == row['file_name']]
    gt = gt_df.caption.values

    # Reference captions under each tokenisation scheme.
    old_tokens_truth = [araby.tokenize(i) for i in gt]
    tokens_truth = [araby.tokenize(arabert_prep.preprocess(i)) for i in gt]

    # Cross-scheme hypotheses for the two "reverse" experiments.
    old_hypo = araby.tokenize(arabert_prep.preprocess(row['old_captions']))
    hypo = araby.tokenize(row['captions'])

    # (label, references, hypothesis) for each experiment, in display order.
    experiments = [
        ("old processing, old results", old_tokens_truth, row['old_hypotheses']),
        ("new processing, new results", tokens_truth, row['hypotheses']),
        ("new processing, old results", tokens_truth, old_hypo),
        ("old processing, new results", old_tokens_truth, hypo),
    ]

    records = []
    for label, refs, hyp in experiments:
        # print_scores expects corpus-level inputs, hence the single-element lists.
        b1, b2, b3, b4 = print_scores([refs], [hyp], prnt=False)
        records.append({'Exp': label, 'b1': b1, 'b2': b2, 'b3': b3, 'b4': b4})

    out_df = pd.DataFrame(records, columns=['Exp', 'b1', 'b2', 'b3', 'b4'])
    return pd.concat([d.reset_index(drop=True) for d in [out_df, gt_df]], axis=1).fillna('_')

In [46]:
calculate_scores(full_df.iloc[0])

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,Exp,b1,b2,b3,b4,caption,tok_len
0,"old processing, old results",42.857,37.796,30.571,0.0,امرأة شقراء في قميص أزرق تنتظر رحلة,7
1,"new processing, new results",63.339,56.427,47.627,40.881,امرأة شقراء في الشارع تشير الى سيارة أجرة,8
2,"new processing, old results",44.485,42.203,34.411,0.0,المرأة في الثوب الأزرق تمد بذراعها لحركة المرو...,9
3,"old processing, new results",11.809,0.0,0.0,0.0,_,_


In [47]:
full_df.to_json('results_comparison.json')

In [5]:
full_df = pd.read_json('results_comparison.json')

# Comparing Results 

In [6]:
# Modules
import ipywidgets as widgets
from ipywidgets import interact_manual
import matplotlib.pyplot as plt

import os, shutil

# needed to use matplotlib with jupyter widgets
%matplotlib notebook

from IPython.display import display, clear_output

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
# Create the widgets: two navigation buttons and an Output area that the
# callbacks fill with the current sample's tables.
left = widgets.Button(description='Left')
right = widgets.Button(description='Right')
out = widgets.Output()

# Image directory and the saved comparison table (one row per image)
DATA_PATH = 'flickr/Images/'
full_df = pd.read_json('results_comparison.json')

# Setting the first and last indexes
i=0 # starting index
i_max = full_df.shape[0] # number of rows, i.e. one past the last valid index

# Plot the first picture
# full_df.iloc[i, 0] reads the first column, which is expected to be `file_name`.
img = plt.imread(f'{DATA_PATH}/{full_df.iloc[i, 0]}')
fig, ax = plt.subplots(1, figsize=(7, 7))
# NOTE: `ax` is rebound here from the Axes to the image artist returned by
# plt.imshow; the navigation callbacks rely on that and call ax.set_data(...).
ax = plt.imshow(img)
plt.suptitle("Current sample")
plt.show()
# Render the first sample's row (without the file name) and its score table
# inside the Output widget.
with out:
    clear_output()
    display(full_df.iloc[i:i+1].drop('file_name', axis=1))
    display(calculate_scores(full_df.iloc[i]))

# plots the next image
def plot_next(*args):
    """Advance to the next sample and refresh the image and the score tables.

    Used as a button ``on_click`` callback, so it accepts (and ignores) whatever
    positional arguments the widget passes. Works on the module-level state
    ``i`` (current row), ``i_max`` (row count), ``full_df``, ``ax`` (the image
    artist) and ``out`` (the Output widget).

    When the last sample is already shown, prints "Done" and leaves ``i``
    untouched, so the index never leaves ``[0, i_max)`` and a later "previous"
    click still lands on a valid row.
    """
    global i
    # stop: check the bound BEFORE moving so repeated clicks cannot push `i` past the end
    if i + 1 >= i_max:
        print("Done")
        return
    i += 1

    # load the image of the new current row (column 0 is expected to be the file name)
    file_name = full_df.iloc[i, 0]
    img = plt.imread(f'{DATA_PATH}/{file_name}')
    # NOTE(review): set_data presumably keeps the extent of the first image, so photos
    # with a different size/orientation may be drawn stretched — confirm, and update
    # the extent as well if that matters.
    ax.set_data(img)
    # title first, so the redraw requested by show() already includes it
    plt.title(f"image: {file_name}")
    plt.show()
    with out:
        clear_output()
        display(full_df.iloc[i:i+1])
        display(calculate_scores(full_df.iloc[i]))

def plot_previous(*args):
    """Step back to the previous sample and refresh the image and the score tables.

    Used as a button ``on_click`` callback, so it accepts (and ignores) whatever
    positional arguments the widget passes. Works on the module-level state
    ``i`` (current row), ``full_df``, ``ax`` (the image artist) and ``out``
    (the Output widget).

    When the first sample is already shown, prints "Done" and leaves ``i``
    untouched, so the index never goes negative (a negative ``i`` would make
    ``iloc`` silently show rows counted from the end of the frame).
    """
    global i
    # stop: check the bound BEFORE moving so repeated clicks cannot push `i` below 0
    if i - 1 < 0:
        print("Done")
        return
    i -= 1

    # load the image of the new current row (column 0 is expected to be the file name)
    file_name = full_df.iloc[i, 0]
    img = plt.imread(f'{DATA_PATH}/{file_name}')
    # NOTE(review): set_data presumably keeps the extent of the first image, so photos
    # with a different size/orientation may be drawn stretched — confirm, and update
    # the extent as well if that matters.
    ax.set_data(img)
    # title first, so the redraw requested by show() already includes it
    plt.title(f"image: {file_name}")
    plt.show()

    with out:
        clear_output()
        display(full_df.iloc[i:i+1])
        display(calculate_scores(full_df.iloc[i]))
    
    
# Hook each navigation button up to its callback.
left.on_click(plot_previous)
right.on_click(plot_next)

# Buttons stacked in one column, the output panel in the column beside them.
# The HBox is the cell's last expression so that Jupyter renders it.
vb1 = widgets.VBox([right, left])
vb2 = widgets.VBox([out])
widgets.HBox([vb1, vb2])

<IPython.core.display.Javascript object>

HBox(children=(VBox(children=(Button(description='Right', style=ButtonStyle()), Button(description='Left', sty…