In [6]:
import pandas as pd

In [7]:
# Load the caption records from the old-preprocessing JSON file and preview the first five rows.
df = pd.read_json(path_or_buf='old_ar_data.json')
df.head(n=5)

Unnamed: 0,file_name,caption,split,tok_len,tokens
0,1000268201_693b08cb0e.jpg,طفلة صغيرة تتسلق إلى مسرح خشبي,train,6,"[طفلة, صغيرة, تتسلق, إلى, مسرح, خشبي]"
1,1000268201_693b08cb0e.jpg,طفلة صغيرة تتسلق الدرج إلى منزلها,train,6,"[طفلة, صغيرة, تتسلق, الدرج, إلى, منزلها]"
2,1000268201_693b08cb0e.jpg,فتاة صغيرة في ثوب وردي تذهب إلى المقصورة الخشبية,train,9,"[فتاة, صغيرة, في, ثوب, وردي, تذهب, إلى, المقصو..."
3,1001773457_577c3a7d70.jpg,كلب أسود وكلب ثلاثي الألوان يلعبان مع بعضهما ا...,train,11,"[كلب, أسود, وكلب, ثلاثي, الألوان, يلعبان, مع, ..."
4,1001773457_577c3a7d70.jpg,كلب أسود وكلب أبيض ببقع بنية يحدقان في بعضهما ...,train,12,"[كلب, أسود, وكلب, أبيض, ببقع, بنية, يحدقان, في..."


In [1]:
# (Re)load the autoreload extension and enable mode 2 so edits to imported modules
# are picked up without restarting the kernel; render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [3]:
# Model parameters
# NOTE(review): the cells visible in this notebook only load and evaluate saved
# checkpoints; confirm whether these values are still consumed anywhere.
encoder_dim = 2048 # resnet101
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5  # presumably the dropout probability used by the decoder -- TODO confirm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# training parameters
epochs = 30  # number of epochs to train for (if early stopping is not triggered)
batch_size = 256  # presumably samples per training batch -- TODO confirm
workers = 2  # presumably the number of data-loading worker processes -- TODO confirm
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
fine_tune_encoder = False  # fine-tune encoder?
pretrained_embeddings = False  # presumably: start from pretrained word embeddings? -- TODO confirm
fine_tune_embeddings = False  # presumably: keep updating the embeddings during training? -- TODO confirm
checkpoint = None  # path to checkpoint, None if none

# Old preprocessing to AraBERT

In [4]:
# Inputs for the first experiment (model trained with the old preprocessing).
DATA_JSON_PATH = 'old_ar_data.json'  # caption file; passed to build_vocab and test_score below
IMGS_PATH = 'flickr/Images/'  # image directory; passed to test_score below
DATA_NAME = 'TESTING'  # NOTE(review): not referenced by any cell visible here -- confirm it is still needed

In [9]:
# Build the vocabulary from the caption file and display its size.
# NOTE(review): max_seq is forwarded to build_vocab unchanged; presumably it caps the
# caption length that is considered -- confirm in the helper's definition.
max_seq = 30
vocab = build_vocab(DATA_JSON_PATH, max_seq=max_seq)
vocab_len = len(vocab)
vocab_len

100%|██████████| 24000/24000 [00:00<00:00, 498812.21it/s]


5788

In [10]:
# First ten indices and first ten token strings of the index-to-string table, as two parallel lists.
[*vocab.itos.keys()][:10], [*vocab.itos.values()][:10]

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 ['<pad>',
  '<sos>',
  '<eos>',
  '<unk>',
  'طفلة',
  'صغيرة',
  'تتسلق',
  'إلى',
  'كلب',
  'أسود'])

In [11]:
# Load the saved checkpoint (presumably the model trained on the old-preprocessing
# captions -- confirm). The result is indexed by 'encoder' and 'decoder' in the next cell.
m = load_checkpoint("models/BEST_checkpoint_flickr8k_ar_finetune.pth.tar")

Loaded Checkpoint!!
Last Epoch: 19
Best Bleu-4: 6.862300456763069


In [12]:
# Pull both sub-models out of the checkpoint and call .eval() on each, in the same
# order as before (assumes torch nn.Modules, whose eval() returns the module itself -- TODO confirm).
encoder, decoder = (m[part].eval() for part in ('encoder', 'decoder'))

In [14]:
from eval import test_score

# Sweep beam sizes 1-5 on the test split and keep every result. Without this record each
# iteration overwrites b1..b4, so the best beam size can only be read off the printed
# logs and typed in by hand for the follow-up evaluation.
beam_scores = {}
for i in range(1, 6):
    b1, b2, b3, b4 = test_score(i, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)
    beam_scores[i] = (b1, b2, b3, b4)

# Rank by the fourth returned score (BLEU-4 according to the printed table -- confirm against
# test_score). Ties go to the smaller beam size because max() keeps the first maximum it sees.
best_beam_size = max(beam_scores, key=lambda size: beam_scores[size][3])
best_beam_size

Dataset split: test
Unique images: 1000
Total size: 3000


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
EVALUATING AT BEAM SIZE 1: 100%|██████████| 3000/3000 [01:15<00:00, 39.80it/s]


----- Bleu-n Scores -----
1: 39.95516959427624
2: 25.77931292578018
3: 15.10781606368901
4: 8.64667118061901
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 2: 100%|██████████| 3000/3000 [01:22<00:00, 36.19it/s]


----- Bleu-n Scores -----
1: 40.68928313941436
2: 26.982549212073188
3: 16.459546825901818
4: 9.631675158047099
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 3000/3000 [01:31<00:00, 32.89it/s]


----- Bleu-n Scores -----
1: 40.373911854332825
2: 26.615634981477243
3: 16.09711443584611
4: 9.384744789396482
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 4: 100%|██████████| 3000/3000 [01:41<00:00, 29.62it/s]


----- Bleu-n Scores -----
1: 40.48892525456145
2: 26.861057340847083
3: 16.299550030840223
4: 9.621167436145724
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 3000/3000 [01:50<00:00, 27.20it/s]


----- Bleu-n Scores -----
1: 39.84779810028518
2: 26.47471808239697
3: 15.929967050767372
4: 9.330062157193685
-------------------------


### Saving the results of the best beam size 

In [16]:
# Re-run the evaluation at beam size 2 -- the size with the highest BLEU-4 in the sweep
# output above -- this time with return_results=True, unpacking the result as
# (references, hypotheses) for re-tokenization in the following cells.
references, hypotheses = test_score(2, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab, return_results=True)

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 2: 100%|██████████| 3000/3000 [01:31<00:00, 32.72it/s]


----- Bleu-n Scores -----
1: 40.68928313941436
2: 26.982549212073188
3: 16.459546825901818
4: 9.631675158047099
-------------------------


In [47]:
# Map every vocabulary index back to its token string via vocab.itos.
# `references` holds a list of reference captions per entry; `hypotheses` holds one caption per entry.
references_old_tokens = [
    [[vocab.itos[idx] for idx in caption] for caption in captions]
    for captions in references
]
hypotheses_old_tokens = [
    [vocab.itos[idx] for idx in caption]
    for caption in hypotheses
]

In [16]:
from arabert.preprocess import ArabertPreprocessor
import pyarabic.araby as araby


# AraBERT v2 preprocessor. Later cells call its preprocess() / unpreprocess() methods
# to convert captions between the two tokenization schemes being compared.
model_name = "aubmindlab/bert-base-arabertv2"
arabert_prep = ArabertPreprocessor(model_name=model_name)



In [52]:
def _to_arabert_tokens(tokens):
    """Join tokens with spaces, run the AraBERT preprocessor, then re-tokenize with araby."""
    return araby.tokenize(arabert_prep.preprocess(" ".join(tokens)))


# One hypothesis caption per entry; a list of reference captions per entry.
hypotheses_ara_tokens = [_to_arabert_tokens(hypo) for hypo in hypotheses_old_tokens]
references_ara_tokens = [[_to_arabert_tokens(ref) for ref in refs] for refs in references_old_tokens]

In [53]:
# BLEU-1..4 of the old model's output after references and hypotheses were both
# re-tokenized through the AraBERT preprocessor.
# NOTE(review): these scores are computed over AraBERT-preprocessed tokens, so they are
# likely not directly comparable with the scores printed by the beam sweep -- confirm.
print_scores(references_ara_tokens, hypotheses_ara_tokens)

----- Bleu-n Scores -----
1: 54.85289890804387
2: 43.93811436252674
3: 34.76736607534273
4: 27.80241332075155
-------------------------


(54.85289890804387, 43.93811436252674, 34.76736607534273, 27.80241332075155)

# AraBERT to old preprocessing

In [4]:
# Inputs for the second experiment (presumably the AraBERT-preprocessed captions -- confirm).
# NOTE: this rebinds the names used by the first experiment, so the earlier path cell
# must be re-run before re-evaluating the old-preprocessing model.
DATA_JSON_PATH = 'data/ar_data.json'  # caption file; passed to build_vocab and test_score below
IMGS_PATH = 'flickr/Images/'  # image directory; passed to test_score below
DATA_NAME = 'TESTING'  # NOTE(review): not referenced by any cell visible here -- confirm it is still needed

In [5]:
# Rebuild the vocabulary from the second caption file and display its size.
# NOTE(review): max_seq is forwarded to build_vocab unchanged; presumably it caps the
# caption length that is considered -- confirm in the helper's definition.
max_seq = 65
vocab = build_vocab(DATA_JSON_PATH, max_seq=max_seq)
vocab_len = len(vocab)
vocab_len

100%|██████████| 24000/24000 [00:00<00:00, 310986.73it/s]


3309

In [6]:
# First ten indices and first ten token strings of the index-to-string table, as two parallel lists.
[*vocab.itos.keys()][:10], [*vocab.itos.values()][:10]

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 ['<pad>', '<sos>', '<eos>', '<unk>', '+', 'ة', 'طفل', 'صغير', 'تتسلق', 'إلى'])

In [7]:
# Load the second checkpoint, then pull out both sub-models and call .eval() on each
# (assumes torch nn.Modules, whose eval() returns the module itself -- TODO confirm).
m = load_checkpoint("ar_models/BEST_checkpoint_flickr8k_ar_arabert_pretrained_finetune.pth.tar")
encoder, decoder = (m[part].eval() for part in ('encoder', 'decoder'))

Loaded Checkpoint!!
Last Epoch: 9
Best Bleu-4: 24.949378413361714


In [9]:
from eval import test_score
# Sweep beam sizes 1-5 on the test split and keep every result. Without this record each
# iteration overwrites b1..b4, so the best beam size can only be read off the printed
# logs and typed in by hand for the follow-up evaluation.
beam_scores = {}
for i in range(1, 6):
    b1, b2, b3, b4 = test_score(i, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)
    beam_scores[i] = (b1, b2, b3, b4)

# Rank by the fourth returned score (BLEU-4 according to the printed table -- confirm against
# test_score). Ties go to the smaller beam size because max() keeps the first maximum it sees.
best_beam_size = max(beam_scores, key=lambda size: beam_scores[size][3])
best_beam_size

Dataset split: test
Unique images: 1000
Total size: 3000


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
EVALUATING AT BEAM SIZE 1: 100%|██████████| 3000/3000 [01:51<00:00, 26.82it/s]


----- Bleu-n Scores -----
1: 59.27134312126155
2: 45.52397958654338
3: 33.58850576504064
4: 24.918812277227662
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 2: 100%|██████████| 3000/3000 [02:12<00:00, 22.69it/s]


----- Bleu-n Scores -----
1: 59.68623977817169
2: 46.88248014003475
3: 35.44609658225986
4: 26.921966136090035
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 3000/3000 [02:42<00:00, 18.43it/s]


----- Bleu-n Scores -----
1: 60.32593136195551
2: 47.5536072957737
3: 36.147037636633875
4: 27.524282207029003
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 4: 100%|██████████| 3000/3000 [03:08<00:00, 15.94it/s]


----- Bleu-n Scores -----
1: 59.51223743176995
2: 47.318183310877345
3: 36.32951749139962
4: 27.893442570349023
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 3000/3000 [03:11<00:00, 15.64it/s]


----- Bleu-n Scores -----
1: 58.678484239386094
2: 46.85683508774053
3: 36.14555791431082
4: 27.864202291806382
-------------------------


In [10]:
# Re-run the evaluation at beam size 4 -- the size with the highest BLEU-4 in the sweep
# output above -- this time with return_results=True, unpacking the result as
# (references, hypotheses) for conversion back to the old tokenization below.
references, hypotheses = test_score(4, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab, return_results=True)

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 4: 100%|██████████| 3000/3000 [03:12<00:00, 15.55it/s]


----- Bleu-n Scores -----
1: 59.51223743176995
2: 47.318183310877345
3: 36.32951749139962
4: 27.893442570349023
-------------------------


In [32]:
# Map every vocabulary index back to its token string via vocab.itos.
# `references` holds a list of reference captions per entry; `hypotheses` holds one caption per entry.
references_ara_tokens = [
    [[vocab.itos[idx] for idx in caption] for caption in captions]
    for captions in references
]
hypotheses_ara_tokens = [
    [vocab.itos[idx] for idx in caption]
    for caption in hypotheses
]

In [33]:
def _to_sentence(tokens):
    """Join tokens with spaces and pass the resulting string through arabert_prep.unpreprocess."""
    return arabert_prep.unpreprocess(' '.join(tokens))


# Despite the *_tokens suffix, each caption here is whatever unpreprocess returns
# (presumably one plain string per caption -- it is fed to araby.tokenize next).
references_sent_tokens = [[_to_sentence(ref) for ref in refs] for refs in references_ara_tokens]
hypotheses_sent_tokens = [_to_sentence(hypo) for hypo in hypotheses_ara_tokens]
# Both lists must stay aligned: one entry per evaluated item.
assert len(references_sent_tokens) == len(hypotheses_sent_tokens)

In [34]:
# Tokenize each restored caption with araby so it can be scored at the old token granularity.
references_old_tokens = [list(map(araby.tokenize, refs)) for refs in references_sent_tokens]
hypotheses_old_tokens = list(map(araby.tokenize, hypotheses_sent_tokens))
# Both lists must stay aligned: one entry per evaluated item.
assert len(references_old_tokens) == len(hypotheses_old_tokens)

In [37]:
# BLEU-1..4 of the second model's output after references and hypotheses were both
# passed through unpreprocess() and re-tokenized with araby.
# NOTE(review): the references scored here are round-tripped through AraBERT
# preprocessing, not read from the old caption file -- confirm this is the intended comparison.
print_scores(references_old_tokens, hypotheses_old_tokens)

----- Bleu-n Scores -----
1: 32.26797327283087
2: 20.74686106020747
3: 11.73920224562103
4: 6.022065664696533
-------------------------


(32.26797327283087, 20.74686106020747, 11.73920224562103, 6.022065664696533)