In [1]:
import numpy as np
import pandas as pd
import math
import random
import os
import pickle
import time
import json
import torch
import torch.nn as nn
from torch.nn import functional as F

from dictionary import Vocabulary,EOS_token,PAD_token,SOS_token,UNK_token
from utils import maskNLLLoss,normalizeString
from config import Config,Path
from model import ShowAttendTell
from data import DataHandler
from evaluate import Evaluator

from torchvision.models import resnet101

# Root directory of the COCO2014 dataset. The original absolute path stays the
# default; set the COCO2014_DIR environment variable to run on another machine
# without editing the notebook.
dataset_path = os.environ.get(
    'COCO2014_DIR',
    '/media/nasibullah/Ubuntu/DataSets/Vision_Language_Tasks/COCO2014/',
)
path = Path(dataset_path)

print('Device in Use: ',Config.device)
# get_device_properties() raises for non-CUDA devices, so only query it when the
# configured device is a CUDA device and CUDA is actually available.
if torch.cuda.is_available() and torch.device(Config.device).type == 'cuda':
    print('Device Properties: ',torch.cuda.get_device_properties(Config.device))

# changes for resnet
# These overrides are applied to the shared Config before the model is built.
Config.model_name = 'RESNET101_LSTM_'
Config.encoder_arch = 'resnet'
Config.feat_size = 1024
Config.batch_size = 42

Device in Use:  cuda
Device Properties:  _CudaDeviceProperties(name='GeForce RTX 2080 Ti', major=7, minor=5, total_memory=11018MB, multi_processor_count=68)


In [2]:
#json.load(open('results/VGG_LSTM_39.json'))

In [3]:
#Original Model Implementation Details
  # Encoder - VGG19 14×14×512 feature map of the fourth convolutional layer before max pooling. 196 × 512 
  # mini-batch - 64
  # stopping criterion - early stopping on BLEU score
  # model selection - BLEU on our validation set
  # vocabulary size - 10,000
    



#We observed a breakdown in correlation between the validation set log-likelihood and BLEU in the later stages of 
#training during our experiments



In [4]:
# Create the 'COCO_TRAIN' vocabulary object and call its load()
# (presumably restores a previously saved word index -- confirm in dictionary.py).
voc = Vocabulary('COCO_TRAIN')
voc.load()
#voc.trim(min_count=3) # remove words having frequency less than min_count
# Report the vocabulary size exposed by the Vocabulary object.
print('Vocabulary size :',voc.num_words)

Vocabulary size : 22905


In [5]:
# Build the train/val/test datasets and their loaders through the project's
# DataHandler, using the dataset root and the vocabulary loaded above.
datahandler = DataHandler(dataset_path,voc)
train_dset,val_dset,test_dset = datahandler.getDataSets()
train_loader,val_loader,test_loader = datahandler.getDataLoaders(train_dset,val_dset,test_dset)

In [6]:
# Sanity check: pull a single batch from the training loader and show the
# sizes of its tensors.
dataiter = iter(train_loader)
# Use the builtin next(): the iterator's Python-2-style .next() alias is not
# available on DataLoader iterators in recent PyTorch releases.
features, targets, mask, max_length,ides= next(dataiter)

features.size(),targets.size(),mask.size(),ides.size()

(torch.Size([42, 3, 224, 224]),
 torch.Size([18, 42]),
 torch.Size([18, 42]),
 torch.Size([42]))

In [7]:
# Instantiate the captioning model from the vocabulary and the Config
# (Config was adjusted for the ResNet encoder in the first cell).
model = ShowAttendTell(voc,Config)
# Uncomment to restore saved encoder/decoder weights.
# NOTE(review): these file names refer to a VGG run, while Config now selects
# the ResNet encoder -- confirm the checkpoint matches before loading.
#model.load('Save/VGG_LSTM_encoder_39.pt','Save/VGG_LSTM_decoder_39.pt')

# Evaluator bound to the validation loader; it receives the model name, the
# prediction output path and the validation annotation file from `path`.
val_evaluator = Evaluator(Config.model_name,path.prediction_file_path,path.val_annotation_file,val_loader)



In [None]:
# Epoch : lr, tf ratio  Bleu4(epoch no)
# 1-10 : 1e-3, 0.5       14
# 10-15 : 1e-3, 0.5      15
# 15-25 : 1e-4, 0.4      15.5
# 25-30 : 1e-3, 0.7      18
# 30-33 : 1e-4, 0.8      20.6 (32)
# 33 - 35 : 1e-4, 0.9    22.9 (35)  
# 36 - 38 : 1e-4, 1.0    24.1 (38)
# 39 - 41 : 1e-4, 1.0    24.16 (39)

In [None]:
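# Training log for the current run.
# Columns (presumably, by analogy with the log above): epoch range | lr | tf ratio | Bleu4
# 1,6    1e-3     0.7     19
# 6,16   1e-4     1.0     25.5
# 16,21  1e-4     1.0     25.6
# 21,26  1e-4     1.0     25.8
# 26,31  1e-4     1.0     25.79
# 31,34  1e-3     1.0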

In [None]:
# Hyperparameters for this training stage: separate learning rates for the
# encoder and decoder, and a teacher-forcing ratio of 1.0. They are written to
# the shared Config and then pushed into the model.
Config.encoder_lr = 1e-5
Config.decoder_lr = 1e-3
Config.teacher_forcing_ratio = 1.0
model.update_hyperparam(Config)

# Run epochs 31, 32 and 33: train for one pass over train_loader, print the
# returned loss, then score the model on the validation set.
for epoch in range(31,34):
    # Training mode is re-enabled at the start of every epoch
    # (presumably because evaluate() switches the model to eval mode -- confirm).
    model.train()
    loss = model.train_epoch(train_loader)
    print(' Epoch :',epoch,' Loss :',loss)
    # The epoch number is passed to the evaluator along with the model.
    scores = val_evaluator.evaluate(model,epoch)
    print(scores)

  loss = crossEntropy.masked_select(mask).mean()


Iteration: 400; Percent complete: 20.3%; Average loss: 2.2635
Iteration: 800; Percent complete: 40.6%; Average loss: 2.3259
Iteration: 1200; Percent complete: 60.9%; Average loss: 2.3279
Iteration: 1600; Percent complete: 81.2%; Average loss: 2.3349
 Epoch : 31  Loss : 2.3165232126150834
{'testlen': 396327, 'reflen': 391713, 'guess': [396327, 355823, 315319, 274815], 'correct': [264825, 127529, 53770, 22566]}
ratio: 1.0117790320974769
[(['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4'], [0.6681982302492622, 0.48937314047448244, 0.34436847291117556, 0.24064180477800723]), ('METEOR', 0.22158995574757132), ('ROUGE_L', 0.4921808333150701), ('CIDEr', 0.7383353658108943)]


  loss = crossEntropy.masked_select(mask).mean()


Iteration: 400; Percent complete: 20.3%; Average loss: 2.2941
Iteration: 800; Percent complete: 40.6%; Average loss: 2.3023
Iteration: 1200; Percent complete: 60.9%; Average loss: 2.3040
Iteration: 1600; Percent complete: 81.2%; Average loss: 2.3096
 Epoch : 32  Loss : 2.3055361043976084
{'testlen': 389391, 'reflen': 387313, 'guess': [389391, 348887, 308383, 267879], 'correct': [261924, 126138, 53577, 22226]}
ratio: 1.0053651697722488
[(['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4'], [0.6726503694230204, 0.49314572107715615, 0.34829413963614425, 0.24332699097255486]), ('METEOR', 0.22277604975396034), ('ROUGE_L', 0.49419720301865916), ('CIDEr', 0.7502807899326497)]


  loss = crossEntropy.masked_select(mask).mean()


Iteration: 400; Percent complete: 20.3%; Average loss: 2.2501
Iteration: 800; Percent complete: 40.6%; Average loss: 2.2641
Iteration: 1200; Percent complete: 60.9%; Average loss: 2.2688
Iteration: 1600; Percent complete: 81.2%; Average loss: 2.2729
 Epoch : 33  Loss : 2.265338014922364
{'testlen': 394132, 'reflen': 390497, 'guess': [394132, 353628, 313124, 272620], 'correct': [264064, 128072, 54300, 23089]}
ratio: 1.0093086502585142


In [None]:
# #val_evaluator = Evaluator(Config.model_name,path.prediction_file_path,path.val_annotation_file,val_loader)
# scores = val_evaluator.evaluate(model,5)
# print(scores)

In [None]:
# Shape check of the encoder output for the current `features` batch.
# This is inspection only, so run the forward pass without autograd tracking
# to avoid building (and holding GPU memory for) a computation graph.
with torch.no_grad():
    encoder_out = model.encoder(features.to(Config.device))
encoder_out.size()

In [None]:
#visualization
# Take one validation batch and greedily decode captions for it. The decoded
# texts (ctx) and attention weights (aw) are used by the plotting cell below.
dataiter = iter(val_loader)
# Builtin next() instead of the iterator's .next() alias, which recent
# PyTorch DataLoader iterators no longer provide.
features,_,_,_,_ = next(dataiter)

print(features.size())
# Decode in inference mode (eval + no_grad), consistent with the test-result
# generation cell; the training cell re-enables train mode on its own.
model.eval()
with torch.no_grad():
    ct,ctx,aw = model.Greedy_Decoding(features.to(Config.device))
ctx

In [None]:
import matplotlib.pyplot as plt
import skimage.transform

plt.rcParams['figure.figsize'] = [14, 14]

# A single grid is used for every panel. The original mixed a 4x5 grid for the
# image with a 4x4 grid for the words; with up to 16 words the last panel index
# (17) does not exist in a 4x4 grid and plt.subplot raises ValueError.
GRID_ROWS, GRID_COLS = 4, 5
# At most 16 words are drawn (panels 2..17 of the 20 available slots).
MAX_WORDS = 16

num = 10                                    # index of the sample within the batch
words = ctx[num].split(' ')
img = features[num].permute(1,2,0).numpy()  # CHW -> HWC for imshow
print(ctx[num])

# Panel 1: the input image on its own.
plt.subplot(GRID_ROWS, GRID_COLS, 1)
plt.imshow(img)
plt.axis('off')

# Panels 2..: the image overlaid with the attention map of each decoded word.
for t, word in enumerate(words[:MAX_WORDS]):
    plt.subplot(GRID_ROWS, GRID_COLS, t+2)
    plt.text(0, 1, '%s'%(word) , color='black', backgroundcolor='white', fontsize=10)
    plt.imshow(img)
    # Attention weights for step t of this sample, viewed as a 14x14 map.
    # NOTE(review): assumes aw is indexable as [step, sample, location] and is
    # array-like on the CPU -- confirm against Greedy_Decoding.
    alp_curr = aw[t,num,:].reshape(14,14)
    # Upsample 14x14 -> 224x224 (x16) with smoothing so it overlays the image.
    alp_img = skimage.transform.pyramid_expand(alp_curr, upscale=16, sigma=20)
    plt.imshow(alp_img, alpha=0.70)
    plt.axis('off')
plt.show()

In [None]:
#Result generation for test files
# Decode a caption for every test image and write the list of
# {'image_id', 'caption'} records to a JSON file in the prediction directory.

# tqdm was used here without ever being imported; import it locally and fall
# back to a plain pass-through iterator if the package is not installed.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        return iterable

ide_list = []
caption_list =[]
model.eval()
with torch.no_grad():
    for data in tqdm(test_loader):
        ides, features = data
        # Config.device replaces the undefined bare name `device`.
        decoded = model.Greedy_Decoding(features.to(Config.device))
        # Index instead of unpacking: the visualization cell unpacks three
        # values from Greedy_Decoding while this cell unpacked two; the caption
        # texts are the second element in both usages.
        cap_txt = decoded[1]
        ide_list += ides.cpu().tolist()   # plain Python ints, JSON-serializable
        caption_list += cap_txt

result = [
    {'image_id': image_id, 'caption': caption.strip()}
    for image_id, caption in zip(ide_list, caption_list)
]

# path.prediction_file_path replaces the undefined bare name
# `prediction_file_path` (same attribute the Evaluator is given).
predicted_test_file = os.path.join(path.prediction_file_path,'captions_test2014_SAT_results.json')
with open(predicted_test_file, 'w') as fp:
    json.dump(result,fp)