In [1]:
import torch
import torch.nn.functional as F
import numpy as np
import json
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import skimage.transform
import argparse
from scipy.misc import imread, imresize
from PIL import Image
import os
import random


import torchvision.datasets as dset
import torchvision.transforms as transforms
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px


import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torchvision.models as models
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

from skimage.transform import resize

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from efficientnet_pytorch import EfficientNet

import nltk
from collections import Counter
from collections import OrderedDict

import json
from pycocotools.coco import COCO
import pickle

import re
from PIL import Image 

import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
def build_vocab(annon_file,min_threshold,load=True):
    if (load):
        with open('./word2idx', 'rb') as f:
            word2idx=pickle.load(f)
        with open('./idx2word', 'rb') as f:
            idx2word=pickle.load(f)
                
    else:
        word2idx={}
        idx2word={}
        idx=0

        # Adding Start,End and Unkown Token
        word='<start>'
        word2idx[word]=idx
        idx2word[idx]=word
        idx+=1

        word='<end>'
        word2idx[word]=idx
        idx2word[idx]=word
        idx+=1

        word='<unknown>'
        word2idx[word]=idx
        idx2word[idx]=word
        idx+=1
        
        word='<pad>'
        word2idx[word]=idx
        idx2word[idx]=word
        idx+=1



        coco=COCO(annon_file)
        keys=coco.anns.keys()
        counter=Counter()
        for i,key in enumerate(keys):
            caption=coco.anns[key]['caption']
            tokens=nltk.tokenize.word_tokenize(caption.lower())
            counter.update(tokens)

        words = [word for word, count in counter.items() if count >= min_threshold]
        for word in words:
            if word not in word2idx:
                word2idx[word]=idx
                idx2word[idx]=word
                idx+=1
        
        with open('./word2idx', 'wb') as f:
            pickle.dump(word2idx, f)

        with open('./idx2word', 'wb') as f:
            pickle.dump(idx2word, f)

    return word2idx,idx2word


anon_file_train='/home/valkyrie/data/2014/annotations/captions_train2014.json'
anon_file_val='/home/valkyrie/data/2014/annotations/captions_val2014.json'
word2idx,idx2word=build_vocab(anon_file_train,5,load=True)

In [3]:
class Encoder(nn.Module):
    
    def __init__(self,base_model,fine_tune=False):
        super(Encoder, self).__init__()
        
        self.base_model = base_model

        if (fine_tune==False):
            for param in self.base_model.parameters():
                param.requires_grad = False
        
        self.adaptive_pool = nn.AdaptiveAvgPool2d((14,14))
        
    def forward(self,images):
        feat = self.base_model.extract_features(images)  
        out = self.adaptive_pool(feat)  
        out = out.permute(0, 2, 3, 1)
        return out
    

    
    
class Attention(nn.Module):
    """
    Attention Network.
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):

        super(Attention, self).__init__()
        self.encoder_att = nn.Linear(encoder_dim, attention_dim)  
        self.decoder_att = nn.Linear(decoder_dim, attention_dim)  
        self.full_att = nn.Linear(attention_dim, 1)  
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1) 

    def forward(self, encoder_out, decoder_hidden):
        
        att1 = self.encoder_att(encoder_out)  # (batch_size, num_pixels, attention_dim)
        att2 = self.decoder_att(decoder_hidden)  # (batch_size, attention_dim)
        att = self.full_att(self.relu(att1 + att2.unsqueeze(1))).squeeze(2)
        alpha = self.softmax(att)  
        attention_weighted_encoding = (encoder_out * alpha.unsqueeze(2)).sum(dim=1) 

        return attention_weighted_encoding, alpha


class DecoderWithAttention(nn.Module):
    """
    Decoder which uses Attention Network
    """
    def __init__(self, attention_dim, embed_dim, decoder_dim, vocab_size, encoder_dim=1536, dropout=0.5):
        
        super(DecoderWithAttention, self).__init__()
        self.encoder_dim = encoder_dim
        self.attention_dim = attention_dim
        self.embed_dim = embed_dim
        self.decoder_dim = decoder_dim
        self.vocab_size = vocab_size
        self.dropout = dropout
        self.attention = Attention(encoder_dim, decoder_dim, attention_dim)  # attention network

        
        self.embedding = nn.Embedding(vocab_size, embed_dim)  # embedding layer
        self.dropout = nn.Dropout(p=self.dropout)
        self.decode_step = nn.LSTMCell(embed_dim + encoder_dim, decoder_dim, bias=True)  # decoding LSTMCell
        self.init_h = nn.Linear(encoder_dim, decoder_dim)  
        self.init_c = nn.Linear(encoder_dim, decoder_dim)  
        self.f_beta = nn.Linear(decoder_dim, encoder_dim)  
        self.sigmoid = nn.Sigmoid()
        
        # Output Layer
        self.fc = nn.Linear(decoder_dim, vocab_size)  
        self.init_weights()  

    def init_weights(self):
        self.embedding.weight.data.uniform_(-0.1, 0.1)
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-0.1, 0.1)    

    def init_hidden_state(self, encoder_out):
        mean_encoder_out = encoder_out.mean(dim=1)
        h = self.init_h(mean_encoder_out) 
        c = self.init_c(mean_encoder_out)
        return h, c

    def forward(self, encoder_out, encoded_captions, caption_lengths):
        batch_size = encoder_out.size(0)
        encoder_dim = encoder_out.size(-1)
        vocab_size = self.vocab_size

        # Flatten image
        encoder_out = encoder_out.view(batch_size, -1, encoder_dim)  
        num_pixels = encoder_out.size(1)

        caption_lengths, sort_ind = caption_lengths.sort(dim=0, descending=True)
        encoder_out = encoder_out[sort_ind]
        encoded_captions = encoded_captions[sort_ind]

        # Embeddings
        embeddings = self.embedding(encoded_captions) 

        # Initial Cell and Hidden state
        h, c = self.init_hidden_state(encoder_out)  

        decode_lengths = (caption_lengths - 1).tolist()
        
        
        # Output Tensors
        predictions = torch.zeros(batch_size, max(decode_lengths), vocab_size).to(device)
        alphas = torch.zeros(batch_size, max(decode_lengths), num_pixels).to(device)

        """
        Main Decode Step. Output used from the previous timestep and the attention weighted encoding is used.
        """
        for size in range(max(decode_lengths)):
            batch_size_t = sum([l > size for l in decode_lengths])
            attention_weighted_encoding, alpha = self.attention(encoder_out[:batch_size_t],
                                                                h[:batch_size_t])
            # LSTM gate
            gate = self.sigmoid(self.f_beta(h[:batch_size_t]))  
            attention_weighted_encoding = gate * attention_weighted_encoding
            h, c = self.decode_step(
                torch.cat([embeddings[:batch_size_t, size, :], attention_weighted_encoding], dim=1),
                (h[:batch_size_t], c[:batch_size_t]))
            preds = self.fc(self.dropout(h)) 
            predictions[:batch_size_t, size, :] = preds
            alphas[:batch_size_t, size, :] = alpha

        return predictions, encoded_captions, decode_lengths, alphas, sort_ind

In [4]:
# # Model parameters
# emb_dim = 512  # dimension of word embeddings
# attention_dim = 512  # dimension of attention linear layers
# decoder_dim = 512  # dimension of decoder RNN
# dropout = 0.3


# Model parameters
emb_dim = 512  # dimension of word embeddings
attention_dim = 1024  # dimension of attention linear layers
decoder_dim = 1024  # dimension of decoder RNN
dropout = 0.3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors


In [5]:
decoder = DecoderWithAttention(attention_dim=attention_dim,
                                       embed_dim=emb_dim,
                                       decoder_dim=decoder_dim,
                                       vocab_size=len(word2idx),
                                       dropout=dropout)
model = EfficientNet.from_pretrained('efficientnet-b3')
encoder=Encoder(model)


Loaded pretrained weights for efficientnet-b3


In [6]:
def encode_cap_val(captions,word2idx,max_len):
    
    encoded_captions=[]
    for cap in captions:
        encoded=[]
        tokens=nltk.tokenize.word_tokenize(cap.lower())
        encoded.append(word2idx['<start>'])
        for token in tokens:
            if token in word2idx.keys():
                encoded.append(word2idx[token])
            else:
                encoded.append(word2idx['<unknown>'])
        encoded.append(word2idx['<end>'])
        encoded.extend([word2idx['<pad>']]*(max_len-len(tokens)))
        encoded_captions.append(encoded)
    return encoded_captions

def encode_captions(annon_file,word2idx,max_len):
    coco=COCO(annon_file)
    ordered = OrderedDict(sorted(coco.anns.items(), key=lambda i: i[1]['image_id']))
    keys=list(ordered.keys())
    encoded_captions=[]
    lengths=[]
    image_ids=[]
    
    index=0
    for key in keys:
        encoded=[]
        caption=ordered[key]['caption']
        image_id=ordered[key]['image_id']
        tokens=nltk.tokenize.word_tokenize(caption.lower())
        encoded.append(word2idx['<start>'])
        for token in tokens:
            if token in word2idx.keys():
                encoded.append(word2idx[token])
            else:
                encoded.append(word2idx['<unknown>'])
        lengths.append(len(tokens)+2)
        encoded.append(word2idx['<end>'])
        encoded.extend([word2idx['<pad>']]*(max_len-len(tokens)))
        encoded_captions.append(encoded)        
        image_ids.append(image_id)

    return encoded_captions,lengths,image_ids

def decode_caption(idx2word,cap_list):
    
    decoded_caption=[]
    for caption in cap_list:
        decoded=[]
        for token in caption:
            if token==3:
                break
            decoded.append(idx2word[token])
            
        decoded_caption.append(decoded)
    return decoded_caption

In [7]:
class CaptionDataset(torch.utils.data.Dataset):
    
    
    def __init__(self,data_list,encoded_captions,lengths,image_ids,val=False):
        
        self.data=data_list
        self.encoded_captions=encoded_captions
        self.cap_lens=lengths
        self.image_ids=image_ids
        self.val=val
        
    def __getitem__(self, index):
        caption=self.encoded_captions[index]
        image_id=self.image_ids[index]
        length=self.cap_lens[index]
        
        idx=self.data.ids.index(image_id)
        image,captions=self.data[idx]
        
        caption=torch.tensor(caption)
        length=torch.tensor(length)
        
        if(self.val):
#             print (captions)
            captions=captions[:4]
            all_caps=encode_cap_val(captions,word2idx,100)
            all_caps=torch.LongTensor(all_caps)
            return image,caption,length,all_caps 
        else:
            return image,caption,length
        
    
    def __len__(self):
        return len(self.encoded_captions)

In [8]:
train_transform=transforms.Compose([
                                transforms.Resize((255,255)),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                                                     std=[0.229, 0.224, 0.225])                               
])



val_list=dset.CocoCaptions(root = '/home/valkyrie/data/2014/val2014/',
                        annFile = '/home/valkyrie/data/2014/annotations/captions_val2014.json',
                        transform=train_transform)

# Val Captions
encoded_captions_val,lengths_val,image_ids_val=encode_captions(anon_file_val,word2idx,max_len=100)
val_set=CaptionDataset(val_list,encoded_captions_val,lengths_val,image_ids_val,val=True)
val_loader=torch.utils.data.DataLoader(val_set,batch_size=1)

loading annotations into memory...
Done (t=0.30s)
creating index...
index created!
loading annotations into memory...
Done (t=0.31s)
creating index...
index created!


In [24]:
# Res1
# decoder_weights='./checkpoints/decoder_epoch_1_best.pth'
# encoder_weights='./checkpoints/encoder_epoch_1_best.pth'

decoder_weights='./checkpoints/decoder_epoch_3_best_b3.pth'
encoder_weights='./checkpoints/encoder_epoch_3_best_b3.pth'



decoder.load_state_dict(torch.load(decoder_weights))
encoder.load_state_dict(torch.load(encoder_weights))


encoder = encoder.to(device)
decoder=decoder.to(device)

decoder.eval()
encoder.eval()

Encoder(
  (base_model): EfficientNet(
    (_conv_stem): Conv2dStaticSamePadding(
      3, 40, kernel_size=(3, 3), stride=(2, 2), bias=False
      (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
    )
    (_bn0): BatchNorm2d(40, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
    (_blocks): ModuleList(
      (0): MBConvBlock(
        (_depthwise_conv): Conv2dStaticSamePadding(
          40, 40, kernel_size=(3, 3), stride=[1, 1], groups=40, bias=False
          (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
        )
        (_bn1): BatchNorm2d(40, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
        (_se_reduce): Conv2dStaticSamePadding(
          40, 10, kernel_size=(1, 1), stride=(1, 1)
          (static_padding): Identity()
        )
        (_se_expand): Conv2dStaticSamePadding(
          10, 40, kernel_size=(1, 1), stride=(1, 1)
          (static_padding): Identity()
        )
        (

In [25]:
beam_size=5
references = list()
hypotheses = list()

word_map=word2idx
rev_word_map=idx2word
vocab_size = len(word_map)

# For each image
for i, (image, caps, caplens, allcaps) in enumerate(
        tqdm(val_loader, desc="Beam Size: " + str(beam_size))):

    k = beam_size

    # Move to GPU device, if available
    image = image.to(device)  

    # Encode
    encoder_out = encoder(image)  
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)

    # Flatten encoding
    encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)

#     print (num_pixels)
    encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

    k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

    seqs = k_prev_words  # (k, 1)

    top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

    complete_seqs = list()
    complete_seqs_scores = list()

    # Start decoding
    step = 1
    h, c = decoder.init_hidden_state(encoder_out)

    # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
    while True:

        embeddings = decoder.embedding(k_prev_words).squeeze(1)
        awe, _ = decoder.attention(encoder_out, h) 
        gate = decoder.sigmoid(decoder.f_beta(h))  
        awe = gate * awe

        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))

        scores = decoder.fc(h)
        scores = F.log_softmax(scores, dim=1)

        # Add
        scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

        if step == 1:
            top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
        else:
            # Unroll and find top scores, and their unrolled indices
            top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

        prev_word_inds = top_k_words / vocab_size  # (s)
        next_word_inds = top_k_words % vocab_size  # (s)

        # Add new words to sequences
        seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

        incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                           next_word != word_map['<end>']]
        complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

        if len(complete_inds) > 0:
            complete_seqs.extend(seqs[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])
        k -= len(complete_inds)  # reduce beam length accordingly

        # Proceed with incomplete sequences
        if k == 0:
            break
        seqs = seqs[incomplete_inds]
        h = h[prev_word_inds[incomplete_inds]]
        c = c[prev_word_inds[incomplete_inds]]
        encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
        top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
        k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

        if step > 50:
            break
        step += 1

    i = complete_seqs_scores.index(max(complete_seqs_scores))
    seq = complete_seqs[i]

    # References
    img_caps = allcaps[0].tolist()
    img_captions = list(
        map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],
            img_caps))  # remove <start> and pads
    references.append(img_captions)

    # Hypotheses
    hypotheses.append([w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}])

    assert len(references) == len(hypotheses)

# Calculate BLEU-4 scores
bleu4 = corpus_bleu(references, hypotheses)

EVALUATING AT BEAM SIZE 5:  26%|██▌       | 52612/202654 [1:10:15<3:20:22, 12.48it/s]


KeyboardInterrupt: 

In [17]:
result = {'bleu4': bleu4}
f = open( './bleu4.txt', 'w' )
f.write( 'bleu4 = ' + repr(result) + '\n' )
f.close()
print (bleu4)


0.2462556677667717


In [None]:
weights = (1.0/1.0, )
bleu1=corpus_bleu(references, hypotheses, weights)

In [None]:
weights=(1.0/2.0, 1.0/2.0,)
bleu2=corpus_bleu(references, hypotheses, weights)

In [None]:
bleu2

In [None]:
weights=(1.0/3.0, 1.0/3.0, 1.0/3.0,)
bleu3=corpus_bleu(references, hypotheses, weights)

In [None]:
bleu3

In [None]:
result = {'bleu4': bleu4,'bleu3': bleu3,'bleu2': bleu2,'bleu1': bleu1}
f = open( './results_withoutfine.txt', 'w' )
f.write( 'results' + repr(result) + '\n' )
f.close()
# print (bleu4)
