In [1]:
!nvidia-smi

Thu Jul  1 15:29:37 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [4]:
!pip install kaggle -q
!kaggle datasets download -d aladdinpersson/flickr8kimagescaptions
!unzip -q flickr8kimagescaptions.zip

Downloading flickr8kimagescaptions.zip to /content
100% 1.04G/1.04G [00:26<00:00, 60.7MB/s]
100% 1.04G/1.04G [00:26<00:00, 41.4MB/s]


In [5]:
# get the code form github
!git clone https://github.com/moaaztaha/Image-Captioning
py_files_path = 'Image-Captioning/'
import sys
sys.path.append(py_files_path)

Cloning into 'Image-Captioning'...
remote: Enumerating objects: 611, done.[K
remote: Counting objects: 100% (611/611), done.[K
remote: Compressing objects: 100% (293/293), done.[K
remote: Total 611 (delta 367), reused 551 (delta 307), pack-reused 0[K
Receiving objects: 100% (611/611), 38.47 MiB | 18.04 MiB/s, done.
Resolving deltas: 100% (367/367), done.


In [6]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [7]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [8]:
# Model parameters
encoder_dim = 2048 # resnet101
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# training parameters
epochs = 30  # number of epochs to train for (if early stopping is not triggered)
batch_size = 256
workers = 2
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
fine_tune_encoder = False  # fine-tune encoder?
pretrained_embeddings = False
fine_tune_embeddings = False
checkpoint = None  # path to checkpoint, None if none


In [9]:
DATA_NAME = 'flickr8k_ar_aug'

# local
# DATA_JSON_PATH = 'ar_data.json'
# IMGS_PATH = 'flickr/Images/'
# kaggle paths
# DATA_JSON_PATH = '/kaggle/working/Image-Captioning/data.json'
# IMGS_PATH = '../input/flickr8kimagescaptions/flickr8k/images/'
#colab
DATA_JSON_PATH = 'Image-Captioning/ar_data.json'
IMGS_PATH = 'flickr8k/images/'

In [10]:
max_seq = 30
vocab = build_vocab(DATA_JSON_PATH, max_seq=max_seq)
vocab_len = len(vocab); vocab_len

100%|██████████| 24000/24000 [00:00<00:00, 283147.02it/s]


5788

In [11]:
list(vocab.itos.keys())[:10], list(vocab.itos.values())[:10]

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 ['<pad>',
  '<sos>',
  '<eos>',
  '<unk>',
  'طفلة',
  'صغيرة',
  'تتسلق',
  'إلى',
  'كلب',
  'أسود'])

In [12]:
t_params = {
    'data_name': DATA_NAME,
    'imgs_path': IMGS_PATH,
    'df_path': DATA_JSON_PATH,
    'vocab': vocab,
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
}

m_params = {
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
    'dropout': dropout
}

logger_dic = {
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    'max_seq_length': max_seq,
    'vocab_size': vocab_len,
    'enocder': 'resnet101',
    'dropout': dropout,
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim 
    
}

In [13]:
t_params

{'batch_size': 256,
 'data_name': 'flickr8k_ar',
 'decoder_lr': 0.0004,
 'df_path': 'Image-Captioning/ar_data.json',
 'encoder_lr': 0.0001,
 'epochs': 30,
 'fine_tune_encoder': False,
 'imgs_path': 'flickr8k/images/',
 'pretrained_embeddings': False,
 'vocab': <dataset.Vocabulary at 0x7f730290b510>,
 'workers': 2}

In [14]:
# experiment name
name = DATA_NAME + "_data_augmentation"
# path
log_dir = '/content/drive/MyDrive/ImageCaptioning/flickr8_ar/experiments'

logger = SummaryWriter(log_dir=osp.join(log_dir, name))

In [17]:
def get_loaders(bs, images_path, df_path, transform, vocab, test=False, n_workers=0):
    #pad_idx = vocab.stoi['<pad>']

    if test:
        test_loader = DataLoader(
            dataset=CaptionDataset(images_path, df_path,
                                    transforms=transform, vocab=vocab, split='test'),
            batch_size=bs,
            num_workers=n_workers,
            shuffle=True,
            pin_memory=True
        )
        return test_loader


    train_transforms = transforms.Compose([
      transforms.Resize(256),
      transforms.CenterCrop(224),
      transforms.RandomHorizontalFlip(),
      transforms.ColorJitter(.4,.4,.4),
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
      ])


    train_loader = DataLoader(
        dataset=CaptionDataset(images_path, df_path,
                              transforms=train_transforms, vocab=vocab, split='train'),
        batch_size=bs,
        num_workers=n_workers,
        shuffle=True,
        pin_memory=True,
    )
    valid_loader = DataLoader(
        dataset=CaptionDataset(images_path, df_path,
                              transforms=transform, vocab=vocab, split='val'),
        batch_size=bs,
        num_workers=n_workers,
        shuffle=True,
        pin_memory=True,
    )

    return train_loader, valid_loader

In [18]:
def fit(t_params, checkpoint=None, m_params=None, logger=None):

    # info
    data_name = t_params['data_name']
    imgs_path = t_params['imgs_path']
    df_path = t_params['df_path']
    vocab = t_params['vocab']

    start_epoch = 0
    epochs_since_improvement = 0
    best_bleu4 = 0
    epochs = t_params['epochs']
    batch_size = t_params['batch_size']
    workers = t_params['workers']
    encoder_lr = t_params['encoder_lr']
    decoder_lr = t_params['decoder_lr']
    fine_tune_encoder = t_params['fine_tune_encoder']

    # pretrained word embeddings
    pretrained_embeddings = t_params['pretrained_embeddings']
    if pretrained_embeddings:
        fine_tune_embeddings = t_params['fine_tune_embeddings']
        embeddings_matrix = m_params['embeddings_matrix']



    # init / load checkpoint
    if checkpoint is None:

        # getting hyperparameters
        attention_dim = m_params['attention_dim']
        embed_dim = m_params['embed_dim']
        decoder_dim = m_params['decoder_dim']
        encoder_dim = m_params['encoder_dim']
        dropout = m_params['dropout']

        decoder = DecoderWithAttention(attention_dim=attention_dim,
                                      embed_dim=embed_dim,
                                      decoder_dim=decoder_dim,
                                      encoder_dim=encoder_dim,
                                      vocab_size=len(vocab),
                                      dropout=dropout)
        
        if pretrained_embeddings:
            decoder.load_pretrained_embeddings(torch.tensor(embeddings_matrix, dtype=torch.float32))
            decoder.fine_tune_embeddings(fine_tune=fine_tune_embeddings)
        
        decoder_optimizer = torch.optim.RMSprop(params=filter(lambda p:p.requires_grad, decoder.parameters()),
                                            lr=decoder_lr)
        


        encoder=Encoder()
        encoder.fine_tune(fine_tune_encoder)
        encoder_optimizer = torch.optim.RMSprop(params=filter(lambda p:p.requires_grad, encoder.parameters()),
                                            lr=encoder_lr) if fine_tune_encoder else None

        
    # load checkpoint
    else:
        checkpoint = torch.load(checkpoint)
        print('Loaded Checkpoint!!')
        start_epoch = checkpoint['epoch'] + 1
        print(f"Starting Epoch: {start_epoch}")
        epochs_since_improvement = checkpoint['epochs_since_imrovment']
        best_bleu4 = checkpoint['bleu-4']
        decoder = checkpoint['decoder']
        decoder_optimizer = checkpoint['deocder_optimizer']
        encoder = checkpoint['encoder']
        encoder_optimizer = checkpoint['encoder_optimizer']
        if fine_tune_encoder is True and encoder_optimizer is None:
            encoder.fine_tune(fine_tune_encoder)
            encoder_optimizer = torch.optim.RMSprop(params=filter(lambda p:p.requires_grad, encoder.parameters()),
                                                lr=encoder_lr)
    
    # Schedulers
    decoder_scheduler = ReduceLROnPlateau(decoder_optimizer, patience=2, verbose=True)
    if fine_tune_encoder:
        encoder_scheduler = ReduceLROnPlateau(encoder_optimizer, patience=2, verbose=True)


    # move to gpu, if available
    decoder = decoder.to(device)
    encoder = encoder.to(device)
    
    # loss function
    criterion = nn.CrossEntropyLoss().to(device)
    
    # dataloaders
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    print('Loading Data')
    train_loader, val_loader = get_loaders(batch_size, imgs_path, df_path, transform, vocab, False ,workers)
    print('_'*50)

    print('-'*20, 'Fitting', '-'*20)
    for epoch in range(start_epoch, epochs):

        # if epochs_since_improvement > 0 and epochs_since_improvement % 2 == 0:
        #     adjust_learning_rate(decoder_optimizer, 0.8)
        #     if fine_tune_encoder:
        #         adjust_learning_rate(encoder_optimizer, 0.8)
        
        print('_'*50)
        print('-'*20, 'Training', '-'*20)
        # one epoch of training
        epoch_time = AverageMeter()
        start_time = time.time()
        train(train_loader=train_loader,
            encoder=encoder,
            decoder=decoder,
            criterion=criterion,
            encoder_optimizer=encoder_optimizer,
            decoder_optimizer=decoder_optimizer,
            epoch=epoch,
            logger=logger)
        epoch_time.update(time.time() - start_time)
        print(f"Epoch train time {epoch_time.val:.3f} (epoch_time.avg:.3f)")

        # one epoch of validation
        epoch_time = AverageMeter()
        start_time = time.time()
        print('-'*20, 'Validation', '-'*20)
        b1, b2, b3, recent_bleu4 = validate(val_loader=val_loader,
            encoder=encoder,
            decoder=decoder,
            criterion=criterion,
            vocab=vocab,
            epoch=epoch,
            logger=logger)
        epoch_time.update(time.time() - start_time)
        # tensorboard 
        logger.add_scalar(f'b-1/valid', b1, epoch)
        logger.add_scalar(f'b-2/valid', b2, epoch)
        logger.add_scalar(f'b-3/valid', b3, epoch)
        logger.add_scalar(f'b-4/valid', recent_bleu4, epoch)
        # logger.add_scalar(f'Meteor/valid', m, epoch)
        print(f"Epoch validation time {epoch_time.val:.3f} (epoch_time.avg:.3f)")

        
        # check for improvement
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            epochs_since_improvement += 1
            print(f'\nEpochs since last improvement: {epochs_since_improvement,}')
        else:
            # reset
            epochs_since_improvement = 0
        

        # stop training if no improvement for 5 epochs
        if epochs_since_improvement == 5:
            print('No improvement for 5 consecutive epochs, terminating...')
            break
        
        # learning rate schedular
        decoder_scheduler.step(recent_bleu4)
        if fine_tune_encoder:
            encoder_scheduler.step(recent_bleu4)

        save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer,
            decoder_optimizer, recent_bleu4, is_best)

In [19]:
fit(t_params=t_params, m_params=m_params, logger=logger)

Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [0][0/71]	Batch Time 9.397 (9.397)	Data Load Time 4.751 (4.751)	Loss 9.5748 (9.5748)	Top-5 Accuracy 0.046 (0.046)
Epoch train time 318.961 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/12]	Batch Time 8.024 (8.024)	Loss 6.6170 (6.6170)	Top-5 Accuracy 34.222 (34.222)	
----- Bleu-n Scores -----
1: 35.848565748702256
2: 15.35303305671232
3: 5.461182625395057
4: 2.1937256568031542
-------------------------

 * LOSS - 6.443, TOP-5 ACCURACY - 35.963, BLEU-4 - 2.1937256568031542

Epoch validation time 47.199 (epoch_time.avg:.3f)
__________________________________________________
-------------------- Training -----------

KeyboardInterrupt: ignored

In [None]:
m = load_checkpoint("/content/BEST_checkpoint_flickr8k_ar.pth.tar")

Loaded Checkpoint!!
Last Epoch: 8
Best Bleu-4: 5.715176451490768


In [None]:
batch_size = 64
fine_tune_encoder = True
checkpoint = '/content/BEST_checkpoint_flickr8k_ar.pth.tar'
# epochs = 30

t_params['batch_size'] = batch_size
t_params['data_name'] = t_params['data_name'] + "_finetune" 
t_params['fine_tune_encoder'] = True
t_params['decoder_lr'] = t_params['decoder_lr'] / 10
# t_params['epochs'] = epochs
t_params

{'batch_size': 64,
 'data_name': 'flickr8k_ar_finetune',
 'decoder_lr': 4e-05,
 'df_path': 'Image-Captioning/ar_data.json',
 'encoder_lr': 0.0001,
 'epochs': 30,
 'fine_tune_encoder': True,
 'imgs_path': 'flickr8k/images/',
 'pretrained_embeddings': False,
 'vocab': <dataset.Vocabulary at 0x7f7e18aaefd0>,
 'workers': 2}

In [None]:
fit(t_params, checkpoint=checkpoint, m_params=m_params, logger=logger)

Loaded Checkpoint!!
Starting Epoch: 9
Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------
Epoch: [9][0/282]	Batch Time 4.199 (4.199)	Data Load Time 0.960 (0.960)	Loss 4.9865 (4.9865)	Top-5 Accuracy 52.319 (52.319)
Epoch: [9][100/282]	Batch Time 1.235 (1.233)	Data Load Time 0.001 (0.010)	Loss 5.1665 (4.9120)	Top-5 Accuracy 49.912 (52.423)
Epoch: [9][200/282]	Batch Time 1.229 (1.217)	Data Load Time 0.001 (0.006)	Loss 4.8351 (4.8706)	Top-5 Accuracy 53.012 (53.206)
Epoch train time 341.644 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/47]	Batch Time 1.474 (1.474)	Loss 5.5957 (5.5957)	Top-5 Accuracy 45.269 (45.269)	
----- Bleu-n Scores -----
1: 43.30506294300831
2: 22.758277

In [None]:
!cp BEST_checkpoint_flickr8k_ar_finetune.pth.tar /content/drive/MyDrive/ImageCaptioning/flickr8_ar

In [None]:
checkpoint = load_checkpoint("BEST_checkpoint_flickr8k_ar_finetune.pth.tar")
decoder = checkpoint['decoder']
decoder = decoder.to(device)
decoder.eval()
encoder = checkpoint['encoder']
encoder = encoder.to(device)
encoder.eval();

Loaded Checkpoint!!
Last Epoch: 19
Best Bleu-4: 6.862300456763069


In [None]:
from eval import test_score

test_dict = {}

for i in [1, 3, 5]:
    
    b1, b2, b3, b4 = test_score(i, encoder, decoder, IMGS_PATH, DATA_JSON_PATH, vocab)
    if i == 3:
        test_dict['b1'] = b1
        test_dict['b2'] = b2
        test_dict['b3'] = b3
    
    test_dict[f'b4-b{i}'] = b4

  cpuset_checked))
EVALUATING AT BEAM SIZE 1:   0%|          | 0/3000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 3000


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
EVALUATING AT BEAM SIZE 1: 100%|██████████| 3000/3000 [01:26<00:00, 34.86it/s]


----- Bleu-n Scores -----
1: 39.95516959427624
2: 25.77931292578018
3: 15.10781606368901
4: 8.64667118061901
-------------------------


  cpuset_checked))
EVALUATING AT BEAM SIZE 3:   0%|          | 0/3000 [00:00<?, ?it/s]

Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 3: 100%|██████████| 3000/3000 [01:34<00:00, 31.81it/s]


----- Bleu-n Scores -----


  cpuset_checked))
EVALUATING AT BEAM SIZE 5:   0%|          | 0/3000 [00:00<?, ?it/s]

1: 40.373911854332825
2: 26.615634981477243
3: 16.09711443584611
4: 9.384744789396482
-------------------------
Dataset split: test
Unique images: 1000
Total size: 3000


EVALUATING AT BEAM SIZE 5: 100%|██████████| 3000/3000 [01:43<00:00, 28.92it/s]


----- Bleu-n Scores -----
1: 39.84779810028518
2: 26.47471808239697
3: 15.929967050767372
4: 9.330062157193685
-------------------------


In [None]:
test_dict

{'b1': 40.373911854332825,
 'b2': 26.615634981477243,
 'b3': 16.09711443584611,
 'b4-b1': 8.64667118061901,
 'b4-b3': 9.384744789396482,
 'b4-b5': 9.330062157193685}

In [None]:
# final results -> different from training and validation scalars
results_dic =  {
    # train & valid
    'total_epochs': 5.653,
    'b-1/test': test_dict['b1'],
    'b-2/test': test_dict['b2'],
    'b-3/test': test_dict['b3'],
    'b-4/b3': test_dict['b4-b3'],
    'b-4/b1': test_dict['b4-b1'],
    'b-4/b5': test_dict['b4-b5']
}

In [None]:
logger.add_hparams(logger_dic, results_dic, run_name='Arabic_30maxlen')