# Importing Libraries and Setting Seed for Reproducibility

In [1]:
import numpy as np
import pandas as pd
import math
import random
import os
import pickle
import time
import json
import torch
import torch.nn as nn
from torch.nn import functional as F

from dictionary import Vocabulary,EOS_token,PAD_token,SOS_token,UNK_token
from utils import Utils
from config import Config,Path
from model import ShowAttendTell
from data import DataHandler
from evaluate import Evaluator

from torchvision.models import resnet101

# Project helper object; later cells pass it to datahandler.getDataSets()
# and model.train_epoch().
utils = Utils()
# Fix the random seed (1) so that repeated runs are comparable.
# NOTE(review): which RNGs (random / numpy / torch / CUDA) set_seed covers is
# defined in utils.py — confirm it seeds every library used here.
utils.set_seed(1)

# Configuration of Hyperparameters

In [2]:
cfg = Config()

# Root directory of the COCO2014 dataset. The default is the location used when
# this notebook was recorded; set the COCO2014_PATH environment variable to
# point at a different copy without editing the notebook.
dataset_path = os.environ.get(
    'COCO2014_PATH',
    '/media/nasibullah/Ubuntu/DataSets/Vision_Language_Tasks/COCO2014/',
)
path = Path(dataset_path)

print('Device in Use: ',cfg.device)
# torch.cuda.get_device_properties() only accepts CUDA devices and raises on
# CPU, so query it only when the configured device really is a GPU.
if torch.device(cfg.device).type == 'cuda':
    print('Device Properties: ',torch.cuda.get_device_properties(cfg.device))

#Change hyperparameters here  
#Config.encoder_arch = 'resnet'
#Config.feat_size = 1024
#Config.batch_size = 42


# Load the training vocabulary and drop rare words.
voc = Vocabulary('COCO_TRAIN')
voc.load()
voc.trim(min_count=5) # remove words having frequency less than min_count
print('Vocabulary size :',voc.num_words)

Device in Use:  cuda:1
Device Properties:  _CudaDeviceProperties(name='GeForce RTX 2080 Ti', major=7, minor=5, total_memory=11019MB, multi_processor_count=68)
keep_words 8730 / 22905 = 0.3811
Vocabulary size : 8733


# Creation of Datasets and Dataloaders

In [3]:
# Build the train / validation / test datasets and wrap them in dataloaders.
datahandler = DataHandler(cfg,path,voc)
train_dset,val_dset,test_dset = datahandler.getDataSets(utils)
train_loader,val_loader,test_loader = datahandler.getDataLoaders(train_dset,val_dset,test_dset)


# Sanity check: pull a single training batch and display its tensor sizes.
# The built-in next() is used instead of the iterator's .next() method, which
# is a Python 2 idiom that recent PyTorch DataLoader iterators no longer offer.
dataiter = iter(train_loader)
features, targets, mask, max_length, ides = next(dataiter)
features.size(),targets.size(),mask.size(),ides.size()

loading annotations into memory...
Done (t=0.75s)
creating index...
index created!
loading annotations into memory...
Done (t=0.35s)
creating index...
index created!


(torch.Size([64, 3, 224, 224]),
 torch.Size([22, 64]),
 torch.Size([22, 64]),
 torch.Size([64]))

# Model Creation and Defining Evaluator

In [4]:
# Build the captioning model from the vocabulary and the configuration object.
model = ShowAttendTell(voc,cfg)

# Evaluator over the validation loader; it is given the validation annotation
# file path as a string. The training loop calls val_evaluator.evaluate(model, epoch)
# after every epoch.
val_evaluator = Evaluator(val_loader,path,cfg,str(path.val_annotation_file))

  "num_layers={}".format(dropout, num_layers))


# Training Loop

In [5]:
# Override the optimisation settings on the shared config, then hand the
# updated config to the model.
# NOTE(review): exactly what update_hyperparam() rebuilds (optimizers, schedulers, ...)
# is defined in model.py — confirm before changing these values mid-training.
cfg.encoder_lr = 1e-5
cfg.decoder_lr = 1e-3
cfg.teacher_forcing_ratio = 1.0
model.update_hyperparam(cfg)

# Train for epochs 1..4 (range end is exclusive). After each epoch the average
# training loss is printed and the model is scored on the validation set.
for epoch in range(1,5):
    # Switch to training mode at the start of every epoch.
    # NOTE(review): presumably needed because evaluate() leaves the model in eval mode — confirm.
    model.train()
    loss = model.train_epoch(train_loader,utils)
    print(' Epoch :',epoch,' Loss :',loss)
    # In the recorded output, scores holds Bleu_1..4, METEOR, ROUGE_L and CIDEr.
    scores = val_evaluator.evaluate(model,epoch)
    print(scores)

  loss = crossEntropy.masked_select(mask).mean()


Iteration: 400; Percent complete: 30.9%; Average loss: 4.0590
Iteration: 800; Percent complete: 61.9%; Average loss: 3.2842
Iteration: 1200; Percent complete: 92.8%; Average loss: 3.0743
 Epoch : 1  Loss : 3.4374030550019854
{'testlen': 400590, 'reflen': 394372, 'guess': [400590, 360086, 319582, 279078], 'correct': [210096, 78897, 26575, 9204]}
ratio: 1.0157668394307886
[(['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4'], [0.5244664120422364, 0.3389892817456947, 0.2122043580659995, 0.1332381138592167]), ('METEOR', 0.15638880303737998), ('ROUGE_L', 0.3990901394213532), ('CIDEr', 0.33469285780026675)]


  loss = crossEntropy.masked_select(mask).mean()


Iteration: 400; Percent complete: 30.9%; Average loss: 2.8984
Iteration: 800; Percent complete: 61.9%; Average loss: 2.8263
Iteration: 1200; Percent complete: 92.8%; Average loss: 2.7773
 Epoch : 2  Loss : 2.827663024164434
{'testlen': 446976, 'reflen': 428369, 'guess': [446976, 406472, 365968, 325464], 'correct': [247518, 99555, 34996, 12721]}
ratio: 1.0434368500054836
[(['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4'], [0.5537612757731947, 0.36827947247689796, 0.23495071626482705, 0.15005039870369985]), ('METEOR', 0.1796180297518337), ('ROUGE_L', 0.4239189021698589), ('CIDEr', 0.4249693392659399)]


  loss = crossEntropy.masked_select(mask).mean()


Iteration: 400; Percent complete: 30.9%; Average loss: 2.6974
Iteration: 800; Percent complete: 61.9%; Average loss: 2.6790
Iteration: 1200; Percent complete: 92.8%; Average loss: 2.6570
 Epoch : 3  Loss : 2.6745554067042576
{'testlen': 438544, 'reflen': 423240, 'guess': [438544, 398040, 357536, 317032], 'correct': [246742, 103870, 39031, 15412]}
ratio: 1.036159153199128
[(['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4'], [0.5626390966470854, 0.38317455401182127, 0.2521318676329153, 0.16707435015108096]), ('METEOR', 0.18641106016881506), ('ROUGE_L', 0.43461101003142216), ('CIDEr', 0.4553852913244559)]


  loss = crossEntropy.masked_select(mask).mean()


Iteration: 400; Percent complete: 30.9%; Average loss: 2.6045
Iteration: 800; Percent complete: 61.9%; Average loss: 2.5986
Iteration: 1200; Percent complete: 92.8%; Average loss: 2.5793
 Epoch : 4  Loss : 2.5939382484092586
{'testlen': 450252, 'reflen': 431173, 'guess': [450252, 409748, 369244, 328740], 'correct': [251906, 106925, 40086, 15735]}
ratio: 1.0442490601220369
[(['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4'], [0.5594778035411269, 0.38209612984568425, 0.25119337684459114, 0.1659623926379734]), ('METEOR', 0.1894629277517949), ('ROUGE_L', 0.4342418223126701), ('CIDEr', 0.4583116212599615)]


# Inference and Visualization

In [None]:
# Visualization: decode one validation batch greedily.
# next(dataiter) replaces dataiter.next(), a Python 2 idiom that recent
# PyTorch DataLoader iterators no longer provide.
dataiter = iter(val_loader)
features,_,_,_,ide = next(dataiter)

print(features.size())
# ctx is indexed per sample and split into words by the plotting cell below;
# aw is indexed there as aw[t, num, :] and reshaped to 14x14 attention maps.
# NOTE(review): the model's train/eval mode is not set in this cell — confirm
# that the last val_evaluator.evaluate() call left it in eval mode.
ct,ctx,aw = model.Greedy_Decoding(features.to(cfg.device))
ctx

In [None]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [14, 14]
import skimage.transform

# Index of the sample (within the decoded batch) to visualise.
num = 10
words = ctx[num].split(' ')
# Channels-first tensor -> channels-last array, as expected by imshow.
img = features[num].permute(1,2,0).numpy()
print(ctx[num])

# A single 4x5 grid is used for every panel: slot 1 shows the plain image and
# slots 2..17 show one attention overlay per generated word. The original mixed
# a 4x5 grid for the image with a 4x4 grid for the words, so the 16th word
# requested slot 17 of 16 and plt.subplot raised ValueError.
plt.subplot(4, 5, 1)
plt.imshow(img)
plt.axis('off')

# Show at most the first 16 words (slots 2..17 of the 20 available).
for t in range(min(len(words), 16)):
    plt.subplot(4, 5, t+2)
    plt.text(0, 1, '%s'%(words[t]) , color='black', backgroundcolor='white', fontsize=10)
    plt.imshow(img)
    # Attention weights for word t of sample num, viewed as a 14x14 map and
    # upsampled 16x (14 * 16 = 224) before being overlaid on the image.
    alp_curr = aw[t,num,:].reshape(14,14)
    alp_img = skimage.transform.pyramid_expand(alp_curr, upscale=16, sigma=20)
    plt.imshow(alp_img, alpha=0.70)
    plt.axis('off')
plt.show()

In [None]:
# #Result generation for test files
# result = []
# ide_list = []
# caption_list =[]
# model.eval()
# with torch.no_grad():
#     for data in tqdm(test_loader):
#         ides, features = data
#         cap,cap_txt = model.Greedy_Decoding(features.to(device))
#         ide_list += list(ides.cpu().numpy())
#         caption_list += cap_txt
# for a in zip(ide_list,caption_list):
#     result.append({'image_id':a[0].item(),'caption':a[1].strip()}) 
    
# predicted_test_file = os.path.join(prediction_file_path,'captions_test2014_SAT_results.json') 
# with open(predicted_test_file, 'w') as fp:
#     json.dump(result,fp)

In [None]:
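# Experiment log. Columns: epoch range : learning rate (lr), teacher-forcing (tf) ratio, Bleu4 score (epoch number at which it was reached)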
# 1-10 : 1e-3, 0.5       14
# 10-15 : 1e-3, 0.5      15
# 15-25 : 1e-4, 0.4      15.5
# 25-30 : 1e-3, 0.7      18
# 30-33 : 1e-4, 0.8      20.6 (32)
# 33 - 35 : 1e-4, 0.9    22.9 (35)  
# 36 - 38 : 1e-4, 1.0    24.1 (38)
# 39 - 41 : 1e-4, 1.0    24.16 (39)


# Second run (presumably the same columns: epoch range, lr, tf ratio, Bleu4)
# 1,6    1e-3     0.7     19
# 6,16   1e-4     1.0     25.5
# 16,21  1e-4     1.0     25.6
# 21,26  1e-4     1.0     25.8
# 26,31  1e-4     1.0     25.79
# 31,34  1e-3     1.0 