<a href="https://colab.research.google.com/github/moaaztaha/Image-Captioning/blob/main/skipgram_Ar_pretrained_with_old_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Sun May 29 14:54:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Mount Google Drive at /content/drive. Later cells read kaggle.json from it and
# copy the best fine-tuned checkpoint back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# setting up kaggle json
!mkdir /root/.kaggle
!cp /content/drive/MyDrive/kaggle.json /root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

# downloading dataset from kaggle
!pip install kaggle -q
!kaggle datasets download -d aladdinpersson/flickr8kimagescaptions
!unzip -q flickr8kimagescaptions.zip

Downloading flickr8kimagescaptions.zip to /content
 98% 1.02G/1.04G [00:06<00:00, 139MB/s]
100% 1.04G/1.04G [00:06<00:00, 166MB/s]


In [4]:
# get the code form github
!git clone https://github.com/moaaztaha/Image-Captioning
py_files_path = 'Image-Captioning/'
import sys
sys.path.append(py_files_path)

Cloning into 'Image-Captioning'...
remote: Enumerating objects: 785, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 785 (delta 1), reused 5 (delta 0), pack-reused 778[K
Receiving objects: 100% (785/785), 62.42 MiB | 18.95 MiB/s, done.
Resolving deltas: 100% (472/472), done.
Checking out files: 100% (143/143), done.


In [5]:
# Reload imported modules automatically before each cell runs (mode 2 = all modules),
# so edits to the cloned repo's .py files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

In [6]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [7]:
# --- Model hyper-parameters -------------------------------------------------
encoder_dim = 2048    # feature size produced by the resnet101 encoder
emb_dim = 300         # word-embedding size
attention_dim = 300   # width of the attention linear layers
decoder_dim = 300     # hidden size of the decoder RNN
dropout = 0.5

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Only worth enabling when the model inputs have a fixed size; with varying
# input sizes it adds a lot of computational overhead.
cudnn.benchmark = True

# --- Training hyper-parameters ----------------------------------------------
epochs = 30                   # upper bound on epochs (early stopping may end sooner)
batch_size = 256
workers = 2
encoder_lr = 1e-4             # encoder learning rate, used when fine-tuning it
decoder_lr = 4e-4             # decoder learning rate
fine_tune_encoder = False     # whether the encoder is fine-tuned
pretrained_embeddings = True
fine_tune_embeddings = True
checkpoint = None             # checkpoint path, or None to start fresh

In [8]:
# Experiment identifier; it is passed to fit() as 'data_name' and names the log directory.
DATA_NAME = 'flickr8k_skipgram_pretrained'

# Caption JSON and image directory for the Colab runtime.
# Alternatives used in other environments:
#   local : DATA_JSON_PATH = 'ar_data.json'
#           IMGS_PATH = 'flickr/Images/'
#   kaggle: DATA_JSON_PATH = '/kaggle/working/Image-Captioning/data.json'
#           IMGS_PATH = '../input/flickr8kimagescaptions/flickr8k/images/'
IMGS_PATH = 'flickr8k/images/'
DATA_JSON_PATH = '/content/Image-Captioning/old_ar_data.json'

In [9]:
# Build the caption vocabulary from the captions JSON; max_seq is forwarded to
# build_vocab unchanged.
max_seq = 65
vocab = build_vocab(DATA_JSON_PATH, max_seq=max_seq)

# Vocabulary size (also the number of rows of the embedding matrix built below).
vocab_len = len(vocab)
vocab_len

100%|██████████| 24000/24000 [00:00<00:00, 158977.77it/s]


5788

In [10]:
# Peek at the first ten (index, token) entries of the vocabulary's index -> token map.
list(vocab.itos.keys())[:10], list(vocab.itos.values())[:10]

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 ['<pad>',
  '<sos>',
  '<eos>',
  '<unk>',
  'طفلة',
  'صغيرة',
  'تتسلق',
  'إلى',
  'كلب',
  'أسود'])

### Pre-trained Arabic Embeddings

In [11]:
# downloading arabic skipgrams pretrained word embedings
! wget https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_sg_300_wiki.zip
! unzip -q full_grams_sg_300_wiki.zip

--2022-05-29 14:55:52--  https://bakrianoo.ewr1.vultrobjects.com/aravec/full_grams_sg_300_wiki.zip
Resolving bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)... 108.61.0.122, 2001:19f0:0:22::100
Connecting to bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)|108.61.0.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1488871452 (1.4G) [application/zip]
Saving to: ‘full_grams_sg_300_wiki.zip’


2022-05-29 14:56:06 (98.6 MB/s) - ‘full_grams_sg_300_wiki.zip’ saved [1488871452/1488871452]



In [12]:
import gensim
# Load the AraVec Word2Vec model (presumably extracted from the zip downloaded above)
# and export its word vectors to a plain-text word2vec file, which get_weights()
# reads line by line.
# NOTE(review): the text format written by gensim starts with a "<count> <dim>"
# header line -- confirm the reader accounts for it.
model = gensim.models.Word2Vec.load("./full_grams_sg_300_wiki.mdl")
model.wv.save_word2vec_format("aravec.txt")

In [13]:
import numpy as np
def get_weights(embedding_path, vocabulary=None, embedding_dim=300):
    """Build an embedding matrix for a vocabulary from a word2vec text file.

    Each data line of the file is ``<word> <v1> <v2> ... <vN>``. Lines whose vector
    does not have exactly ``embedding_dim`` values are ignored; this covers the
    ``<count> <dim>`` header that the word2vec text format starts with, as well
    as blank or truncated lines.

    Args:
        embedding_path: Path to the UTF-8 word2vec text file.
        vocabulary: Object exposing ``stoi`` (token -> row index mapping) and
            ``len()``. Defaults to the notebook-level ``vocab``.
        embedding_dim: Expected vector size (300 for the AraVec model used here).

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocabulary), embedding_dim)``. Rows of
        tokens found in the file hold the pretrained vector; rows of missing
        tokens are drawn uniformly from [-0.1, 0.1) (unseeded, so they differ
        between runs).
    """
    if vocabulary is None:
        vocabulary = vocab  # notebook-level vocabulary built in an earlier cell

    embeddings_index = {}
    # Explicit UTF-8: the tokens are Arabic and the platform default encoding is
    # not guaranteed to be UTF-8.
    with open(embedding_path, encoding="utf-8") as f:
        for line in f:
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                continue  # blank line or a token with no vector
            word, coefs = parts
            coefs = np.fromstring(coefs, "f", sep=" ")
            if coefs.size != embedding_dim:
                # Header line ("<count> <dim>") or malformed row. Keeping it would
                # let a 1-element vector broadcast across a whole matrix row.
                continue
            embeddings_index[word] = coefs
    print("Found %s word vectors." % len(embeddings_index))

    num_tokens = len(vocabulary)
    hits = 0
    misses = 0
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, index in tqdm(vocabulary.stoi.items()):
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[index] = vector
            hits += 1
        else:
            # Out-of-vocabulary token: small random initialisation.
            embedding_matrix[index] = np.random.uniform(-.1, .1, size=embedding_dim)
            misses += 1
    print("Hits:", hits, " | Misses:", misses)
    return embedding_matrix

In [14]:
# Build the vocabulary-aligned embedding matrix from the exported AraVec text file.
embedding_matrix = get_weights("./aravec.txt")

Found 662110 word vectors.


100%|██████████| 5788/5788 [00:00<00:00, 189496.85it/s]

Hist: 3746  | Misses: 2042





In [15]:
# Sanity check: expect (vocabulary size, embedding dimension).
embedding_matrix.shape

(5788, 300)

In [16]:
# Sanity check: should equal the number of rows of embedding_matrix.
len(vocab.itos)

5788

In [17]:
# Training settings handed to fit() as `t_params`.
t_params = dict(
    data_name=DATA_NAME,
    imgs_path=IMGS_PATH,
    df_path=DATA_JSON_PATH,
    vocab=vocab,
    epochs=epochs,
    batch_size=batch_size,
    workers=workers,
    decoder_lr=decoder_lr,
    encoder_lr=encoder_lr,
    fine_tune_encoder=fine_tune_encoder,
    pretrained_embeddings=pretrained_embeddings,
    fine_tune_embeddings=fine_tune_embeddings,
)

# Model settings handed to fit() as `m_params`, including the pretrained embedding matrix.
m_params = dict(
    attention_dim=attention_dim,
    embed_dim=emb_dim,
    decoder_dim=decoder_dim,
    encoder_dim=encoder_dim,
    dropout=dropout,
    embeddings_matrix=embedding_matrix,
)

# Flat summary of the run's hyper-parameters.
# NOTE(review): the key 'enocder' looks like a typo for 'encoder'; it is kept
# unchanged here because the consumer of this dict is not visible in this notebook.
logger_dic = dict(
    decoder_lr=decoder_lr,
    encoder_lr=encoder_lr,
    fine_tune_encoder=fine_tune_encoder,
    pretrained_embeddings=pretrained_embeddings,
    max_seq_length=max_seq,
    vocab_size=vocab_len,
    enocder='resnet101',
    dropout=dropout,
    attention_dim=attention_dim,
    embed_dim=emb_dim,
    decoder_dim=decoder_dim,
    encoder_dim=encoder_dim,
)

t_params

{'batch_size': 256,
 'data_name': 'flickr8k_skipgram_pretrained',
 'decoder_lr': 0.0004,
 'df_path': '/content/Image-Captioning/old_ar_data.json',
 'encoder_lr': 0.0001,
 'epochs': 30,
 'fine_tune_embeddings': True,
 'fine_tune_encoder': False,
 'imgs_path': 'flickr8k/images/',
 'pretrained_embeddings': True,
 'vocab': <dataset.Vocabulary at 0x7f545b275990>,
 'workers': 2}

In [18]:
# TensorBoard writer for this run; events are written under <log_dir>/<name>.
log_dir = 'experiments'   # root directory for all runs
name = DATA_NAME          # run name = experiment identifier

logger = SummaryWriter(log_dir=osp.join(log_dir, name))

In [21]:
# Stage 1: train with the encoder frozen (t_params['fine_tune_encoder'] is False).
# Per-epoch loss, top-5 accuracy and BLEU scores are printed below.
fit(t_params=t_params, m_params=m_params, logger=logger)

Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [0][0/71]	Batch Time 10.695 (10.695)	Data Load Time 4.792 (4.792)	Loss 9.5726 (9.5726)	Top-5 Accuracy 0.000 (0.000)
Epoch train time 168.402 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/12]	Batch Time 6.011 (6.011)	Loss 6.5247 (6.5247)	Top-5 Accuracy 35.127 (35.127)	
----- Bleu-n Scores -----
1: 34.641
2: 14.431
3: 5.855
4: 2.909
-------------------------

 * LOSS - 6.603, TOP-5 ACCURACY - 33.855, BLEU-4 - 2.909

Epoch validation time 34.883 (epoch_time.avg:.3f)
__________________________________________________
-------------------- Training --------------------
Epoch: [1][0/71]	Batch Time 5.820 (5.820)	Data Lo

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f53a223c560>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f53a223c560>
    self._shutdown_workers()
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/pytho

Epoch train time 162.067 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/12]	Batch Time 5.665 (5.665)	Loss 5.8351 (5.8351)	Top-5 Accuracy 45.133 (45.133)	
----- Bleu-n Scores -----
1: 42.146
2: 21.693
3: 10.283
4: 4.817
-------------------------

 * LOSS - 5.877, TOP-5 ACCURACY - 44.996, BLEU-4 - 4.817

Epoch validation time 31.444 (epoch_time.avg:.3f)
__________________________________________________
-------------------- Training --------------------
Epoch: [14][0/71]	Batch Time 5.709 (5.709)	Data Load Time 3.626 (3.626)	Loss 5.2041 (5.2041)	Top-5 Accuracy 49.738 (49.738)
Epoch train time 157.119 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/12]	Batch Time 5.862 (5.862)	Loss 5.9848 (5.9848)	Top-5 Accuracy 43.844 (43.844)	
----- Bleu-n Scores -----
1: 42.135
2: 21.681
3: 10.223
4: 4.796
-------------------------

 * LOSS - 5.877, TOP-5 ACCURACY - 45.032, BLEU-4 - 4.796

Epoch validation time 31.206 (ep

In [None]:
# List the working directory (presumably to confirm which checkpoint files were written).
!ls

In [22]:
# Load the best stage-1 checkpoint; the call prints its last epoch and best BLEU-4.
# NOTE(review): `m` is not used by any later cell visible in this notebook.
m = load_checkpoint("BEST_checkpoint_flickr8k_skipgram_pretrained.pth.tar")

Loaded Checkpoint!!
Last Epoch: 13
Best Bleu-4: 4.817


In [23]:
# Stage 2 settings: also fine-tune the encoder, resuming from the best stage-1 checkpoint.
batch_size = 64
fine_tune_encoder = True
checkpoint = 'BEST_checkpoint_flickr8k_skipgram_pretrained.pth.tar'

# Derive each value from the stage-1 constants (DATA_NAME, decoder_lr) instead of
# from t_params itself, so re-running this cell neither appends "_finetune" a second
# time nor divides the learning rate by 10 again. 'epochs' keeps its stage-1 value.
t_params['batch_size'] = batch_size
t_params['data_name'] = DATA_NAME + "_finetune"
t_params['fine_tune_encoder'] = fine_tune_encoder
t_params['decoder_lr'] = decoder_lr / 10
t_params

{'batch_size': 64,
 'data_name': 'flickr8k_skipgram_pretrained_finetune',
 'decoder_lr': 4e-05,
 'df_path': '/content/Image-Captioning/old_ar_data.json',
 'encoder_lr': 0.0001,
 'epochs': 30,
 'fine_tune_embeddings': True,
 'fine_tune_encoder': True,
 'imgs_path': 'flickr8k/images/',
 'pretrained_embeddings': True,
 'vocab': <dataset.Vocabulary at 0x7f545b275990>,
 'workers': 2}

In [24]:
# Stage 2: resume from the stage-1 checkpoint with encoder fine-tuning enabled,
# the smaller batch size and the reduced decoder learning rate set above.
fit(t_params, checkpoint=checkpoint, m_params=m_params, logger=logger)

Loaded Checkpoint!!
Starting Epoch: 14
Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [14][0/282]	Batch Time 4.885 (4.885)	Data Load Time 1.055 (1.055)	Loss 5.2484 (5.2484)	Top-5 Accuracy 46.892 (46.892)
Epoch: [14][100/282]	Batch Time 1.166 (1.200)	Data Load Time 0.001 (0.011)	Loss 5.4407 (5.2406)	Top-5 Accuracy 47.963 (48.885)
Epoch: [14][200/282]	Batch Time 1.144 (1.177)	Data Load Time 0.001 (0.006)	Loss 5.2289 (5.2074)	Top-5 Accuracy 49.815 (49.409)
Epoch train time 330.390 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/47]	Batch Time 1.560 (1.560)	Loss 5.7792 (5.7792)	Top-5 Accuracy 44.110 (44.110)	
----- Bleu-n Scores -----
1: 42.337
2: 22.268
3: 11.034

In [None]:
# List the working directory (presumably to confirm the fine-tuned checkpoint exists).
!ls

### Test Scores

In [25]:
!cp BEST_checkpoint_flickr8k_skipgram_pretrained_finetune.pth.tar /content/drive/MyDrive/ImageCaptioning/skipgram/

In [26]:
# Restore the best fine-tuned encoder/decoder pair, move both to the active device
# and switch them to evaluation mode for scoring.
checkpoint = load_checkpoint("BEST_checkpoint_flickr8k_skipgram_pretrained_finetune.pth.tar")
encoder = checkpoint['encoder'].to(device).eval()
decoder = checkpoint['decoder'].to(device).eval()

Loaded Checkpoint!!
Last Epoch: 14
Best Bleu-4: 5.48


In [27]:
from eval import test_score

# Score the test split at beam sizes 1, 3 and 5.
# BLEU-1..3 are recorded for beam size 3 only; BLEU-4 is recorded for every beam
# size under the key 'b4-b<beam size>'.
test_dict = {}

for i in (1, 3, 5):
    b1, b2, b3, b4 = test_score(i, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)
    if i == 3:
        test_dict.update(b1=b1, b2=b2, b3=b3)
    test_dict[f'b4-b{i}'] = b4

  cpuset_checked))


Dataset split: test
Unique images: 1000
Total size: 3000


  prev_word_inds = top_k_words // vocab_size  # (s)
EVALUATING AT BEAM SIZE 1: 100%|██████████| 1000/1000 [00:43<00:00, 23.20it/s]


----- Bleu-n Scores -----
1: 39.012
2: 24.454
3: 13.011
4: 7.276
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 1000/1000 [00:43<00:00, 22.96it/s]


----- Bleu-n Scores -----
1: 40.103
2: 25.582
3: 14.280
4: 7.898
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 1000/1000 [00:45<00:00, 22.03it/s]


----- Bleu-n Scores -----
1: 39.108
2: 25.131
3: 13.962
4: 8.129
-------------------------
