In [1]:
from comet_ml import Experiment

import torchvision.datasets as dset
import torchvision.transforms as transforms
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px


import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torchvision.models as models
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

from skimage.transform import resize

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from efficientnet_pytorch import EfficientNet


import nltk
from collections import Counter
from collections import OrderedDict

import json
from pycocotools.coco import COCO
import pickle

import re
from PIL import Image 

import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from nltk.translate.bleu_score import corpus_bleu


### Loading Dataset

In [2]:
def _build_image_transform():
    """Return the resize -> tensor -> normalize pipeline used for both splits."""
    steps = [
        transforms.Resize((255, 255)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]
    return transforms.Compose(steps)


# Training and validation currently share identical preprocessing (no augmentation);
# each split still gets its own Compose instance.
train_transform = _build_image_transform()
val_transform = _build_image_transform()

In [3]:
%%time
# Keyword arguments for each COCO 2014 caption split (image folder, annotation file, transform).
_coco_train_kwargs = dict(
    root='/home/valkyrie/data/2014/train2014/',
    annFile='/home/valkyrie/data/2014/annotations/captions_train2014.json',
    transform=train_transform,
)
_coco_val_kwargs = dict(
    root='/home/valkyrie/data/2014/val2014/',
    annFile='/home/valkyrie/data/2014/annotations/captions_val2014.json',
    transform=val_transform,
)

# Train split is indexed first, then validation (same order as before).
train_list = dset.CocoCaptions(**_coco_train_kwargs)
val_list = dset.CocoCaptions(**_coco_val_kwargs)


loading annotations into memory...
Done (t=0.62s)
creating index...
index created!
loading annotations into memory...
Done (t=0.45s)
creating index...
index created!
CPU times: user 1.3 s, sys: 227 ms, total: 1.53 s
Wall time: 1.52 s


### Building Vocab

In [4]:
def build_vocab(annon_file,min_threshold,load=True):
    """
    Return the (word2idx, idx2word) vocabulary dictionaries.

    annon_file: path to a COCO caption annotation file (only read when load is falsy).
    min_threshold: minimum number of occurrences a token needs to enter the
        vocabulary (only used when load is falsy).
    load: when truthy, both dictionaries are unpickled from './word2idx' and
        './idx2word' and the other two arguments are ignored; otherwise the
        vocabulary is rebuilt from annon_file and pickled to those two files.
    """
    if load:
        # NOTE: pickle.load can execute arbitrary code; only load caches written by this notebook.
        with open('./word2idx', 'rb') as cache:
            word2idx = pickle.load(cache)
        with open('./idx2word', 'rb') as cache:
            idx2word = pickle.load(cache)
        return word2idx, idx2word

    # Count every lower-cased token over all captions in the annotation file.
    frequencies = Counter()
    for annotation in COCO(annon_file).anns.values():
        frequencies.update(nltk.tokenize.word_tokenize(annotation['caption'].lower()))

    word2idx = {}
    idx2word = {}

    def _register(token):
        # The next free index always equals the current vocabulary size.
        position = len(word2idx)
        word2idx[token] = position
        idx2word[position] = token

    # Start, end, unknown and pad tokens take indices 0-3, in this order.
    for special in ('<start>', '<end>', '<unknown>', '<pad>'):
        _register(special)

    # Remaining words keep the Counter's first-seen order.
    for token, occurrences in frequencies.items():
        if occurrences >= min_threshold and token not in word2idx:
            _register(token)

    with open('./word2idx', 'wb') as cache:
        pickle.dump(word2idx, cache)

    with open('./idx2word', 'wb') as cache:
        pickle.dump(idx2word, cache)

    return word2idx,idx2word




In [5]:
# coco=COCO(anon_file_train)
# ordered = OrderedDict(sorted(coco.anns.items(), key=lambda i: i[1]['image_id']))
# keys=list(ordered.keys())

In [6]:
# keys[0:10]

In [7]:
# ordered[keys[0]]

In [8]:
# ordered[keys[2]]['caption']

In [9]:
# image,captions=train_list[0]
# caps=encode_cap_val(captions,word2idx,100)

In [10]:
# decode_caption(idx2word,caps)

In [11]:
def encode_cap_val(captions,word2idx,max_len):
    """
    Encode raw caption strings as equal-length lists of vocabulary indices.

    Each caption is lower-cased, tokenized with nltk, wrapped in '<start>' /
    '<end>' and right-padded with '<pad>'. Every returned row has exactly
    max_len + 2 entries.

    captions: iterable of caption strings.
    word2idx: token -> index mapping; must contain '<start>', '<end>',
        '<unknown>' and '<pad>'.
    max_len: maximum number of word tokens kept per caption.

    Returns a list with one index list per caption.
    """
    start_idx = word2idx['<start>']
    end_idx = word2idx['<end>']
    unknown_idx = word2idx['<unknown>']
    pad_idx = word2idx['<pad>']

    encoded_captions=[]
    for cap in captions:
        # Truncate: without this a caption longer than max_len produced a longer
        # row than the others, and the rows could not be stacked into one tensor.
        tokens=nltk.tokenize.word_tokenize(cap.lower())[:max_len]
        encoded=[start_idx]
        # Out-of-vocabulary tokens map to '<unknown>'.
        encoded.extend(word2idx.get(token, unknown_idx) for token in tokens)
        encoded.append(end_idx)
        encoded.extend([pad_idx]*(max_len-len(tokens)))
        encoded_captions.append(encoded)
    return encoded_captions

In [12]:
def encode_captions(annon_file,word2idx,max_len):
    """
    Encode every caption of a COCO annotation file, ordered by image id.

    Each caption is lower-cased, tokenized with nltk, wrapped in '<start>' /
    '<end>' and right-padded with '<pad>' so that every row has exactly
    max_len + 2 entries.

    annon_file: path to the COCO caption annotation file.
    word2idx: token -> index mapping; must contain '<start>', '<end>',
        '<unknown>' and '<pad>'.
    max_len: maximum number of word tokens kept per caption.

    Returns (encoded_captions, lengths, image_ids), three parallel lists:
    the index rows, the unpadded length of each row (kept tokens + 2), and
    the image id each caption belongs to.
    """
    coco=COCO(annon_file)
    # sorted() is stable, so captions of one image keep their file order.
    annotations = sorted(coco.anns.values(), key=lambda ann: ann['image_id'])

    start_idx = word2idx['<start>']
    end_idx = word2idx['<end>']
    unknown_idx = word2idx['<unknown>']
    pad_idx = word2idx['<pad>']

    encoded_captions=[]
    lengths=[]
    image_ids=[]

    for annotation in annotations:
        # Truncate: without this a caption longer than max_len produced a row
        # longer than max_len + 2 and broke batching into a single tensor.
        tokens=nltk.tokenize.word_tokenize(annotation['caption'].lower())[:max_len]
        encoded=[start_idx]
        # Out-of-vocabulary tokens map to '<unknown>'.
        encoded.extend(word2idx.get(token, unknown_idx) for token in tokens)
        encoded.append(end_idx)
        # Length counts '<start>' and '<end>' but not the padding.
        lengths.append(len(tokens)+2)
        encoded.extend([pad_idx]*(max_len-len(tokens)))
        encoded_captions.append(encoded)
        image_ids.append(annotation['image_id'])

    return encoded_captions,lengths,image_ids

def decode_caption(idx2word,cap_list,pad_idx=3):
    """
    Convert encoded captions back to lists of words.

    idx2word: index -> token mapping.
    cap_list: iterable of encoded captions (each an iterable of indices).
    pad_idx: index of the padding token; decoding of a caption stops at its
        first occurrence. Defaults to 3, the index build_vocab assigns to
        '<pad>' when it builds a vocabulary from scratch.

    Returns one list of tokens per caption ('<start>' and '<end>' are kept).
    """
    decoded_caption=[]
    for caption in cap_list:
        decoded=[]
        for token in caption:
            if token==pad_idx:
                # Everything from the first pad onwards is padding.
                break
            decoded.append(idx2word[token])

        decoded_caption.append(decoded)
    return decoded_caption

In [13]:
%%time
anon_file_train = '/home/valkyrie/data/2014/annotations/captions_train2014.json'
anon_file_val = '/home/valkyrie/data/2014/annotations/captions_val2014.json'

# With load=True the vocabulary comes from the pickled cache; the annotation
# path and the threshold (5) are only used when the cache is rebuilt.
word2idx, idx2word = build_vocab(anon_file_train, 5, load=True)

# Encode the train captions first, then the validation captions, padding to max_len=100.
_encoded_by_split = {
    split: encode_captions(path, word2idx, max_len=100)
    for split, path in (('train', anon_file_train), ('val', anon_file_val))
}

# Train Captions
encoded_captions_train, lengths_train, image_ids_train = _encoded_by_split['train']

# Val Captions
encoded_captions_val, lengths_val, image_ids_val = _encoded_by_split['val']

loading annotations into memory...
Done (t=0.62s)
creating index...
index created!
loading annotations into memory...
Done (t=0.27s)
creating index...
index created!
CPU times: user 1min 20s, sys: 675 ms, total: 1min 21s
Wall time: 1min 21s


In [14]:
# import nltk
# nltk.download('punkt')


In [15]:
decode_caption(idx2word,[encoded_captions_train[2]])

[['<start>',
  'there',
  'are',
  'containers',
  'filled',
  'with',
  'different',
  'kinds',
  'of',
  'foods',
  '<end>']]

In [16]:
image_ids_train[0]

9

In [17]:
len(encoded_captions_train[1])

102

In [90]:
lengths_train[1]

12

In [20]:
image_ids_train[0]

9

In [14]:
len(encoded_captions_train),len(train_list),len(image_ids_train)

(414113, 82783, 414113)

In [18]:
class CaptionDataset(torch.utils.data.Dataset):
    """
    One sample per caption: the image it describes, the encoded caption and
    the caption's unpadded length.

    data_list: image dataset exposing an `ids` list and returning
        (image, captions) from integer indexing (e.g. the CocoCaptions objects
        built earlier in this notebook).
    encoded_captions, lengths, image_ids: parallel lists as returned by
        encode_captions.
    val: when truthy, __getitem__ additionally returns the first four raw
        captions of the image, encoded with the notebook-level `word2idx`.
    """

    def __init__(self,data_list,encoded_captions,lengths,image_ids,val=False):

        self.data=data_list
        self.encoded_captions=encoded_captions
        self.cap_lens=lengths
        self.image_ids=image_ids
        self.val=val

        # image id -> position in data_list.ids, built once so that __getitem__
        # does not scan the whole id list for every sample. setdefault keeps the
        # first position of an id, matching what list.index returned.
        self._position_of_image={}
        for position,image_id in enumerate(self.data.ids):
            self._position_of_image.setdefault(image_id,position)

    def __getitem__(self, index):
        """
        Return (image, caption, length), plus all_caps when self.val is truthy.

        Raises ValueError when the caption's image id is not in data_list.ids.
        """
        caption=self.encoded_captions[index]
        image_id=self.image_ids[index]
        length=self.cap_lens[index]

        try:
            idx=self._position_of_image[image_id]
        except KeyError:
            # Same exception type the previous list.index lookup raised.
            raise ValueError('{} is not in list'.format(image_id))
        image,captions=self.data[idx]

        caption=torch.tensor(caption)
        length=torch.tensor(length)

        if(self.val):
            # Keep exactly four reference captions so samples stack into one tensor.
            captions=captions[:4]
            # NOTE: relies on the notebook-level `word2idx` and a fixed max_len of 100.
            all_caps=encode_cap_val(captions,word2idx,100)
            all_caps=torch.LongTensor(all_caps)
            return image,caption,length,all_caps
        else:
            return image,caption,length


    def __len__(self):
        """Number of captions (not number of images)."""
        return len(self.encoded_captions)

In [19]:
# Stand-alone datasets/loaders for interactive inspection; train_all() builds
# its own loaders from the `batch_size` setting.
train_set = CaptionDataset(
    train_list,
    encoded_captions_train,
    lengths_train,
    image_ids_train,
    val=False,
)
train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=100)

val_set = CaptionDataset(
    val_list,
    encoded_captions_val,
    lengths_val,
    image_ids_val,
    val=True,
)
val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=100)

In [27]:
%%time
images,captions,lengths=next(iter(train_loader))

CPU times: user 5.67 s, sys: 362 ms, total: 6.03 s
Wall time: 1.03 s


In [22]:
%%time
images,captions,lengths,all_caps=next(iter(val_loader))

CPU times: user 11 s, sys: 113 ms, total: 11.1 s
Wall time: 2.86 s


In [82]:
images.shape

torch.Size([64, 3, 255, 255])

In [84]:
captions.shape

torch.Size([64, 102])

In [157]:
lengths.unsqueeze(1).shape

torch.Size([64, 1])

In [35]:
all_caps[1].shape

torch.Size([4, 102])

In [32]:
class Encoder(nn.Module):
    """
    Image encoder: a feature-extracting backbone followed by adaptive average
    pooling to a 14x14 grid.

    base_model: module exposing `extract_features(images)`.
    fine_tune: when equal to False every backbone parameter is frozen.
    """

    def __init__(self,base_model,fine_tune=False):
        super().__init__()

        self.base_model = base_model

        freeze_backbone = (fine_tune == False)
        if freeze_backbone:
            for weight in self.base_model.parameters():
                weight.requires_grad_(False)

        self.adaptive_pool = nn.AdaptiveAvgPool2d((14, 14))

    def forward(self,images):
        """Return pooled features with the channel axis moved last."""
        pooled = self.adaptive_pool(self.base_model.extract_features(images))
        # (batch, channels, 14, 14) -> (batch, 14, 14, channels)
        return pooled.permute(0, 2, 3, 1)
    

    
    
class Attention(nn.Module):
    """
    Additive attention over encoder locations, conditioned on the decoder state.
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        super().__init__()
        # Projections of the image features and of the decoder state into a shared space.
        self.encoder_att = nn.Linear(encoder_dim, attention_dim)
        self.decoder_att = nn.Linear(decoder_dim, attention_dim)
        # Collapses the shared space to one score per location.
        self.full_att = nn.Linear(attention_dim, 1)
        self.relu = nn.ReLU()
        # Normalises the scores across locations.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, encoder_out, decoder_hidden):
        """
        encoder_out: (batch_size, num_pixels, encoder_dim)
        decoder_hidden: (batch_size, decoder_dim)

        Returns (attention_weighted_encoding, alpha): the weighted sum of
        encoder_out over locations, (batch_size, encoder_dim), and the
        weights themselves, (batch_size, num_pixels).
        """
        projected_pixels = self.encoder_att(encoder_out)  # (batch_size, num_pixels, attention_dim)
        projected_state = self.decoder_att(decoder_hidden).unsqueeze(1)  # (batch_size, 1, attention_dim)
        scores = self.full_att(self.relu(projected_pixels + projected_state)).squeeze(2)
        alpha = self.softmax(scores)
        context = (encoder_out * alpha.unsqueeze(2)).sum(dim=1)
        return context, alpha


class DecoderWithAttention(nn.Module):
    """
    LSTM caption decoder that attends over the encoder's feature map at every
    time step.

    attention_dim: size of the attention network's hidden space.
    embed_dim: size of the word embeddings.
    decoder_dim: size of the LSTM hidden/cell state.
    vocab_size: number of tokens in the vocabulary.
    encoder_dim: channel count of the encoder features (default 1536).
    dropout: dropout probability applied to the hidden state before the output layer.
    """
    def __init__(self, attention_dim, embed_dim, decoder_dim, vocab_size, encoder_dim=1536, dropout=0.5):

        super(DecoderWithAttention, self).__init__()
        self.encoder_dim = encoder_dim
        self.attention_dim = attention_dim
        self.embed_dim = embed_dim
        self.decoder_dim = decoder_dim
        self.vocab_size = vocab_size
        self.attention = Attention(encoder_dim, decoder_dim, attention_dim)  # attention network

        self.embedding = nn.Embedding(vocab_size, embed_dim)  # embedding layer
        self.dropout = nn.Dropout(p=dropout)
        self.decode_step = nn.LSTMCell(embed_dim + encoder_dim, decoder_dim, bias=True)  # decoding LSTMCell
        # Initial hidden / cell state are linear functions of the mean image feature.
        self.init_h = nn.Linear(encoder_dim, decoder_dim)
        self.init_c = nn.Linear(encoder_dim, decoder_dim)
        # Produces the sigmoid gate applied to the attention-weighted encoding.
        self.f_beta = nn.Linear(decoder_dim, encoder_dim)
        self.sigmoid = nn.Sigmoid()

        # Output Layer
        self.fc = nn.Linear(decoder_dim, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Uniform(-0.1, 0.1) init for the embedding and output weights; zero output bias."""
        self.embedding.weight.data.uniform_(-0.1, 0.1)
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-0.1, 0.1)

    def init_hidden_state(self, encoder_out):
        """Return the initial (h, c) computed from encoder_out averaged over dim 1."""
        mean_encoder_out = encoder_out.mean(dim=1)
        h = self.init_h(mean_encoder_out)
        c = self.init_c(mean_encoder_out)
        return h, c

    def forward(self, encoder_out, encoded_captions, caption_lengths):
        """
        Teacher-forced decoding of a batch.

        encoder_out: encoder features with the channel axis last; all axes
            between the first and the last are flattened into one.
        encoded_captions: (batch_size, padded_length) token indices.
        caption_lengths: (batch_size,) unpadded caption lengths.

        The batch is sorted by decreasing caption length. Returns
        (predictions, encoded_captions, decode_lengths, alphas, sort_ind) where
        predictions is (batch_size, max(decode_lengths), vocab_size), alphas is
        (batch_size, max(decode_lengths), num_pixels), encoded_captions is the
        sorted caption tensor, decode_lengths is a list of lengths minus one,
        and sort_ind is the permutation that was applied. Steps beyond a
        caption's decode length are left at zero.
        """
        batch_size = encoder_out.size(0)
        encoder_dim = encoder_out.size(-1)
        vocab_size = self.vocab_size

        # Flatten image
        encoder_out = encoder_out.view(batch_size, -1, encoder_dim)
        num_pixels = encoder_out.size(1)

        # Longest captions first, so the still-active samples at each step are a prefix.
        caption_lengths, sort_ind = caption_lengths.sort(dim=0, descending=True)
        encoder_out = encoder_out[sort_ind]
        encoded_captions = encoded_captions[sort_ind]

        # Embeddings
        embeddings = self.embedding(encoded_captions)

        # Initial Cell and Hidden state
        h, c = self.init_hidden_state(encoder_out)

        # Nothing is predicted from the last token, hence length - 1 decoding steps.
        decode_lengths = (caption_lengths - 1).tolist()
        max_steps = max(decode_lengths)

        # Output Tensors: allocated on the input's device rather than on a
        # notebook-level `device` variable, so the module works on its own.
        out_device = encoder_out.device
        predictions = torch.zeros(batch_size, max_steps, vocab_size, device=out_device)
        alphas = torch.zeros(batch_size, max_steps, num_pixels, device=out_device)

        # Main decode step: the ground-truth token of the current step and the
        # gated attention-weighted encoding feed the LSTM cell.
        for size in range(max_steps):
            # Number of captions that still have a token to predict at this step.
            batch_size_t = sum(l > size for l in decode_lengths)
            attention_weighted_encoding, alpha = self.attention(encoder_out[:batch_size_t],
                                                                h[:batch_size_t])
            # Sigmoid gate computed from the previous hidden state
            gate = self.sigmoid(self.f_beta(h[:batch_size_t]))
            attention_weighted_encoding = gate * attention_weighted_encoding
            h, c = self.decode_step(
                torch.cat([embeddings[:batch_size_t, size, :], attention_weighted_encoding], dim=1),
                (h[:batch_size_t], c[:batch_size_t]))
            preds = self.fc(self.dropout(h))
            predictions[:batch_size_t, size, :] = preds
            alphas[:batch_size_t, size, :] = alpha

        return predictions, encoded_captions, decode_lengths, alphas, sort_ind

In [33]:
class Meter(object):
    '''
    Running statistics for one metric: the latest value, the weighted sum,
    the total weight and the weighted average.
    '''
    def __init__(self):
        self.reset()

    def reset(self):
        '''Zero all four statistics.'''
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        '''Record `val` as the latest value, counted with weight `n`.'''
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
        
def accuracy(scores, targets, k):
    """
    Top-k accuracy in percent.

    scores: (n, num_classes) class scores.
    targets: (n,) true class indices.
    k: a sample counts as correct when its target is among its k highest scores.
    """
    _, top_indices = scores.topk(k, dim=1, largest=True, sorted=True)
    hits = top_indices.eq(targets.view(-1, 1).expand_as(top_indices))
    hit_count = hits.view(-1).float().sum().item()
    return hit_count * (100.0 / targets.size(0))


In [34]:
def train_one(train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch,experiment):
    """
    Run one training epoch.

    train_loader: yields (imgs, caps, caplens) batches.
    encoder, decoder: the two networks; both are switched to train mode.
    criterion: loss applied to the packed scores and targets.
    encoder_optimizer: optimizer for the encoder, or None to leave the encoder untouched.
    decoder_optimizer: optimizer for the decoder.
    epoch: epoch number, used only for logging and printing.
    experiment: object with a log_metric(name, value) method (a Comet Experiment in train_all).

    Reads the notebook-level names `device`, `alpha_c` and `print_freq`.
    Returns nothing; the models are updated in place.
    """

    decoder.train() 
    # NOTE(review): the encoder is put in train mode even when encoder_optimizer
    # is None -- confirm this is intended for a frozen backbone.
    encoder.train()

    # Network pass time
    batch_time = Meter() 
    # data loading time
    data_time = Meter()  
    # loss 
    losses = Meter()  
    #Top5
    top5accs = Meter()

    start = time.time()

    # Batches
    for i, (imgs, caps, caplens) in enumerate(train_loader):
        # Time spent waiting for this batch since the previous iteration ended.
        data_time.update(time.time() - start)

        
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)

        # Forward prop.
        imgs = encoder(imgs)
        scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)
        # Targets are the captions without their first token ('<start>').
        targets = caps_sorted[:, 1:]
#         print (targerts.shape)
        
        # Removing PADS
#         print ("LEN: ",pack_padded_sequence(scores, decode_lengths, batch_first=True).data.shape)
            
        # Packing keeps only the first decode_lengths[j] steps of row j, flattened across the batch.
        scores= pack_padded_sequence(scores, decode_lengths, batch_first=True).data
        targets= pack_padded_sequence(targets, decode_lengths, batch_first=True).data

        
        # Backprop
        loss = criterion(scores, targets)
        # alpha_c(regularization): penalises attention weights whose sum over
        # time steps (dim 1 of alphas) deviates from 1.
        loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()
        decoder_optimizer.zero_grad()
        if encoder_optimizer is not None:
            encoder_optimizer.zero_grad()
        loss.backward()
        # Update weights
        decoder_optimizer.step()
        if encoder_optimizer is not None:
            encoder_optimizer.step()

            
        # Updating Metrics (each weighted by the number of decoded tokens in the batch)
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(top5, sum(decode_lengths))
        # `start` was last reset before the batch was fetched, so this includes data loading time.
        batch_time.update(time.time() - start)

        start = time.time()
        
        # Logging Metrics on Comet (running averages, once per batch)
        experiment.log_metric("train_epoch", epoch)
        experiment.log_metric("train_loss", losses.avg)
        experiment.log_metric("train_top5acc", top5accs.avg)
        experiment.log_metric("batch_time", batch_time.avg)
        
        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, len(train_loader),
                                                                          loss=losses,
                                                                          top5=top5accs))


def validate_one(val_loader, encoder, decoder, criterion,experiment):
    """
    Run one pass over the validation set and return the corpus BLEU-4 score.

    val_loader: yields (imgs, caps, caplens, allcaps) batches, where allcaps
        holds the encoded reference captions of each image.
    encoder: image encoder, or None when val_loader already yields encoded features.
    decoder: caption decoder (run teacher-forced, in eval mode).
    criterion: loss applied to the packed scores and targets.
    experiment: object with a log_metric(name, value) method.

    Reads the notebook-level names `device`, `alpha_c`, `print_freq` and
    `word2idx`. Logs loss, top-5 accuracy, BLEU-4 and BLEU-1 to `experiment`
    and prints a summary.
    """
    start = time.time()

    decoder.eval()
    if encoder is not None:
        encoder.eval()

    batch_time = Meter()
    losses = Meter()
    top5accs = Meter()

    # True captions: one list of reference token lists per sample
    references = list()
    # Predicted captions: one token list per sample
    hypotheses = list()

    # Tokens removed from the references before scoring; built once instead of per word.
    ignored_tokens = {word2idx['<start>'], word2idx['<pad>']}

    with torch.no_grad():
        # Batches
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):

            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            if encoder is not None:
                imgs = encoder(imgs)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)

            # Targets are the captions without their first token ('<start>').
            targets = caps_sorted[:, 1:]

            # Keep the padded scores for the per-sample argmax below; packing
            # returns new tensors, so no copy is needed.
            padded_scores = scores

            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
            targets= pack_padded_sequence(targets, decode_lengths, batch_first=True).data

            loss = criterion(scores, targets)

            # Same attention regularisation term as in training.
            loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader),
                                                                                loss=losses, top5=top5accs))

            # References, reordered like the decoder reordered the batch.
            # allcaps is never moved to `device`, while sort_ind lives wherever
            # the decoder ran; bring the index to the CPU so the lookup also
            # works when decoding happens on a GPU.
            allcaps = allcaps[sort_ind.cpu()]
            for img_caps in allcaps.tolist():
                references.append(
                    [[w for w in c if w not in ignored_tokens] for c in img_caps])

            # Predictions: most likely token per step, cut to each caption's decode length
            _, preds = torch.max(padded_scores, dim=2)
            preds = preds.tolist()
            hypotheses.extend(p[:decode_lengths[j]] for j, p in enumerate(preds))

            # Checking if references are equal to hypotheses
            assert len(references) == len(hypotheses)

        # BLEU-4 scores
        bleu4 = corpus_bleu(references, hypotheses)

        # Bleu1
        weights = (1.0/1.0, )
        bleu1=corpus_bleu(references, hypotheses, weights)


        # Logging on Comet
        experiment.log_metric("val_loss",losses.avg)
        experiment.log_metric("val_top5acc",top5accs.avg)
        experiment.log_metric("val_blue4",bleu4)
        experiment.log_metric("val_blue1",bleu1)

        print(
            '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}, BLEU-1 - {bleu1}\n'.format(
                loss=losses,
                top5=top5accs,
                bleu=bleu4,bleu1=bleu1))

    return bleu4


In [35]:
def train_all(load=False):
    """
    Training and validation.

    Builds the decoder and an EfficientNet-B3 based encoder, optionally loads
    pre-trained weights, then alternates train_one / validate_one for epochs
    start_epoch .. epochs-1. Both state dicts are saved whenever the validation
    BLEU-4 improves.

    load: when truthy, decoder and encoder weights are loaded from the
        './checkpoints/*_epoch_3_best_b3.pth' files before training.

    Reads the hyper-parameters defined at notebook level (embeding_dimension,
    attention_dimension, decoder_dimension, dropout, batch_size, encoder_lr,
    decoder_lr, alpha_c, fine_tune_encoder, start_epoch, epochs, device) and
    updates the globals best_bleu4 and epochs_since_improvement.

    The Comet API key is read from the COMET_API_KEY environment variable.

    Returns (encoder, decoder).
    """
    import os  # local import: `os` is not imported at the top of the notebook

    # Only names that are rebound below need a `global` declaration.
    global best_bleu4, epochs_since_improvement

    # CometML
    # The API key must not live in the notebook; when the variable is unset
    # None is passed and Comet falls back to its own configuration lookup.
    experiment = Experiment(api_key=os.environ.get("COMET_API_KEY"),
                            project_name="captioning", workspace="spideysloth")

    tag='Dummy'
    experiment.add_tag(tag)
    experiment.log_parameter("Embedding Dimension",embeding_dimension)
    experiment.log_parameter("Attention Dimension",attention_dimension)
    experiment.log_parameter("Decoder Dimension",decoder_dimension)
    experiment.log_parameter("Dropout",dropout)
    experiment.log_parameter("BatchSize",batch_size)
    experiment.log_parameter("Encoder Lr",encoder_lr)
    experiment.log_parameter("Decoder Lr",decoder_lr)
    experiment.log_parameter("Alpha_c",alpha_c)
    experiment.log_parameter("EffNet","EffNet-B3")
    experiment.log_parameter("FineTune Encoder",fine_tune_encoder)

    decoder = DecoderWithAttention(attention_dim=attention_dimension,
                                       embed_dim=embeding_dimension,
                                       decoder_dim=decoder_dimension,
                                       vocab_size=len(word2idx),
                                       dropout=dropout)

    decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                             lr=decoder_lr)

    model = EfficientNet.from_pretrained('efficientnet-b3')
    encoder=Encoder(model,fine_tune=fine_tune_encoder)

    # Load Pre-trained
    if(load):
        decoder_weights='./checkpoints/decoder_epoch_3_best_b3.pth'
        encoder_weights='./checkpoints/encoder_epoch_3_best_b3.pth'

        # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
        decoder.load_state_dict(torch.load(decoder_weights, map_location=device))
        encoder.load_state_dict(torch.load(encoder_weights, map_location=device))

    # The encoder gets an optimizer exactly when it is being fine-tuned.
    # Previously this depended on `load`: fine-tuning from scratch never updated
    # the encoder, and load=True built an optimizer over frozen parameters.
    if fine_tune_encoder:
        encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                             lr=encoder_lr)
    else:
        encoder_optimizer = None

    decoder = decoder.to(device)
    encoder = encoder.to(device)

    criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    # NOTE(review): the training loader does not shuffle and the captions are
    # ordered by image id, so consecutive samples describe the same image --
    # consider shuffle=True.
    train_set=CaptionDataset(train_list,encoded_captions_train,lengths_train,image_ids_train,val=False)
    train_loader=torch.utils.data.DataLoader(train_set,batch_size=batch_size)

    val_set=CaptionDataset(val_list,encoded_captions_val,lengths_val,image_ids_val,val=True)
    val_loader=torch.utils.data.DataLoader(val_set,batch_size=batch_size)

    # Make sure the first improvement can actually be written to disk.
    os.makedirs('./checkpoints', exist_ok=True)

    # Epochs
    for epoch in range(start_epoch, epochs):

        # One epoch's training
        train_one(train_loader=train_loader,
              encoder=encoder,
              decoder=decoder,
              criterion=criterion,
              encoder_optimizer=encoder_optimizer,
              decoder_optimizer=decoder_optimizer,
              epoch=epoch,experiment=experiment)

        # One epoch's validation
        recent_bleu4 = validate_one(val_loader=val_loader,
                                encoder=encoder,
                                decoder=decoder,
                                criterion=criterion,experiment=experiment)

        # Check if there was an improvement
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            # Counted and reported only; nothing stops training early.
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
        else:
            print ("***********SAVING****************")
            path='./checkpoints/decoder_epoch_{}_best_b3_fineTune.pth'.format(epoch)
            torch.save(decoder.state_dict(), path)

            path='./checkpoints/encoder_epoch_{}_best_b3_fineTune.pth'.format(epoch)
            torch.save(encoder.state_dict(), path)

            epochs_since_improvement = 0
    return encoder,decoder


In [36]:
# ---- Model ----
embeding_dimension, attention_dimension, decoder_dimension = 512, 1024, 1024  # word embeddings, attention layers, decoder state
dropout = 0.3
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ---- Training schedule ----
start_epoch, epochs = 0, 10
batch_size = 30
print_freq = 50  # print a status line every `print_freq` batches

# ---- Optimisation ----
encoder_lr = 5e-5
decoder_lr = 0.0004
alpha_c = 1.0  # weight of the attention regularisation term in the loss

# ---- Run state, updated by train_all ----
best_bleu4 = 0.0
epochs_since_improvement = 0

# ---- Switches ----
fine_tune_encoder = False  # fine-tune encoder
train_decoder = False

In [37]:
encoder,decoder=train_all(load=False)

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/spideysloth/captioning/448636fcbaee4d1490845e2f6677bc2b
COMET INFO:   Parameters:
COMET INFO:     Alpha_c             : 1.0
COMET INFO:     Attention Dimension : 1024
COMET INFO:     BatchSize           : 30
COMET INFO:     Decoder Dimension   : 1024
COMET INFO:     Decoder Lr          : 0.0004
COMET INFO:     Dropout             : 0.3
COMET INFO:     EffNet              : EffNet-B3
COMET INFO:     Embedding Dimension : 512
COMET INFO:     Encoder Lr          : 5e-05
COMET INFO:     FineTune Encoder    : 1
COMET INFO:   Uploads:
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     os packages         : 1
COMET INFO: ---------------------------
COMET INFO: Experiment

Loaded pretrained weights for efficientnet-b3
Epoch: [0][0/13804]	Loss 9.9843 (9.9843)	Top-5 Accuracy 0.000 (0.000)
Epoch: [0][50/13804]	Loss 5.7012 (6.8729)	Top-5 Accuracy 41.287 (33.885)


KeyboardInterrupt: 

In [25]:
# NOTE(review): in file order this rebinds `encoder` (the value returned by
# train_all above) to an EfficientNet-B1 based encoder, whereas train_all builds
# its encoder from 'efficientnet-b3'. Confirm which backbone downstream cells expect.
model = EfficientNet.from_pretrained('efficientnet-b1')
encoder=Encoder(model)

Loaded pretrained weights for efficientnet-b1


In [68]:
len(stuff[2])

64

In [22]:
path='./checkpoints/decoder_epoch_{}_bleu_{}.pth'.format(10,0.4)
path            

'./checkpoints/decoder_epoch_10_bleu_0.4.pth'

In [23]:
!pwd

/home/valkyrie/notebooks/caption


In [61]:
# imgs=img.unsqueeze(dim=0)
# NOTE(review): `stuff` is not assigned in any cell visible in this notebook, so
# this cell fails after Restart & Run All. Presumably it was a batch taken from
# one of the loaders -- TODO restore the cell that defines it.
output=encoder.forward(stuff[0])

In [29]:
output.shape

torch.Size([64, 14, 14, 1280])

### Helpers

In [62]:
class UnNormalize(object):
    """
    Undo a per-channel normalisation, i.e. compute t * std + mean for every
    channel (the inverse of t.sub_(mean).div_(std)).
    """
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Normalized tensor image of size (C, H, W).
        Returns:
            Tensor: A new tensor with the normalisation undone; the input is
            left untouched, so calling this repeatedly on the same image
            (e.g. when a cell is re-run) always gives the same result.
        """
        restored = tensor.clone()
        # Iterating over the first axis yields views, so the in-place ops
        # below write into `restored`.
        for channel, m, s in zip(restored, self.mean, self.std):
            channel.mul_(s).add_(m)
            # The normalize code -> t.sub_(m).div_(s)
        return restored
    
# unorm = UnNormalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
# plt.imshow(unorm(img).permute(1, 2, 0))