In [1]:
# Re-import edited project modules automatically before each cell runs,
# so changes to the local .py files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

In [2]:
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [3]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from nltk.translate.bleu_score import corpus_bleu

In [4]:
# ---- Runtime -------------------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
# cuDNN autotuning pays off only for fixed-size model inputs; with varying
# input sizes it adds a lot of computational overhead.
cudnn.benchmark = True

# ---- Model ---------------------------------------------------------------
encoder_dim = 2048    # encoder feature size (resnet101)
emb_dim = 1000        # word-embedding size
attention_dim = 1000  # size of the attention linear layers
decoder_dim = 1000    # size of the decoder RNN
dropout = 0.3

# ---- Training ------------------------------------------------------------
epochs = 1                 # epochs to train for (if early stopping is not triggered)
batch_size = 256
workers = 4
decoder_lr = 2e-4          # decoder learning rate
encoder_lr = 3e-4          # encoder learning rate, relevant when fine-tuning
fine_tune_encoder = False  # whether the encoder is fine-tuned
checkpoint = None          # checkpoint path, or None for no checkpoint

In [5]:
# Experiment identifier; also reused below as the TensorBoard run name.
DATA_NAME = 'testing_experiemnts_process'

# Dataset locations, relative to the working directory.
# The same relative layout is used both locally and on Kaggle.
DATA_JSON_PATH, IMGS_PATH = 'data.json', 'flickr/Images/'

In [6]:
# Build the vocabulary object from the dataset JSON file.
# build_vocab comes from one of the project star-imports above
# (presumably `dataset`, since the result displays as dataset.Vocabulary).
vocab = build_vocab(DATA_JSON_PATH)

100%|██████████| 40000/40000 [00:00<00:00, 387424.31it/s]


In [7]:
# Vocabulary size; logged later as the 'vocab_size' hyper-parameter.
vocab_len = len(vocab)
vocab_len

5089

In [8]:
# Training configuration collected into one dict (data locations, vocabulary
# and optimisation settings taken from the cells above).
t_params = {
    'data_name': DATA_NAME,
    'imgs_path': IMGS_PATH,
    'df_path': DATA_JSON_PATH,
    'vocab': vocab,
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder
}

# Model hyper-parameters collected into one dict.
m_params = {
    'dropout': dropout,
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim
}

# Hyper-parameters written to TensorBoard through logger.add_hparams.
# Only scalar/string values are stored here (no vocabulary object).
logger_dic = {
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'max_seq_length': 100,
    'vocab_size': vocab_len,
    # Key was previously misspelled as 'enocder', which would show up under
    # that name in the TensorBoard hparams view.
    'encoder': 'resnet101',
    'dropout': dropout,
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim
}

In [9]:
# Root directory that holds all TensorBoard runs.
log_dir = 'experiments'
# Run name: each experiment gets its own sub-directory under log_dir.
name = DATA_NAME

# Writer used by the training/validation cells below to record scalars and hparams.
logger = SummaryWriter(log_dir=osp.join(log_dir, name))

In [10]:
# Display the assembled training parameters as a quick sanity check.
t_params

{'data_name': 'testing_experiemnts_process',
 'imgs_path': 'flickr/Images/',
 'df_path': 'data.json',
 'vocab': <dataset.Vocabulary at 0x7f0960957210>,
 'epochs': 1,
 'batch_size': 256,
 'workers': 4,
 'decoder_lr': 0.0002,
 'encoder_lr': 0.0003,
 'fine_tune_encoder': False}

In [11]:
def train(train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch):
    """Run one training epoch and log per-batch metrics to TensorBoard.

    Besides its arguments, this function reads several notebook-level names:
    ``device``, ``logger``, ``alpha_c``, ``grad_clip``, ``print_freq``,
    ``AverageMeter``, ``accuracy`` and ``clip_gradient`` (the last five are
    not defined in this notebook's visible cells; presumably they come from
    the project star-imports).

    Args:
        train_loader: iterable of ``(imgs, caps, caplens)`` batches; must
            support ``len()``.
        encoder: module applied to the image batch.
        decoder: called as ``decoder(encoded_imgs, caps, caplens)`` and
            expected to return ``(scores, caps_sorted, decode_lengths,
            alphas, sort_ind)``.
        criterion: loss callable applied to the packed scores and targets.
        encoder_optimizer: optimizer for the encoder, or ``None`` when the
            encoder is not being updated.
        decoder_optimizer: optimizer for the decoder.
        epoch: epoch index; shown in the status line and used to offset the
            TensorBoard step so successive epochs do not overwrite each other.
    """
    encoder.train()
    decoder.train()

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy

    num_batches = len(train_loader)
    start = time.time()

    # batches
    for i, (imgs, caps, caplens) in enumerate(train_loader):
        data_time.update(time.time() - start)

        # move to gpu, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.unsqueeze(1).to(device)

        # forward prop
        imgs = encoder(imgs)
        scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)

        # targets are the words after the first token (<sos>)
        targets = caps_sorted[:, 1:]

        # remove timesteps that we didn't decode at or are pads
        scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
        targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data

        # calculate the loss
        loss = criterion(scores, targets)

        # doubly stochastic attention regularization
        loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

        # back prop
        decoder_optimizer.zero_grad()
        if encoder_optimizer is not None:
            encoder_optimizer.zero_grad()
        loss.backward()

        # clip gradients
        if grad_clip is not None:
            clip_gradient(decoder_optimizer, grad_clip)
            if encoder_optimizer is not None:
                clip_gradient(encoder_optimizer, grad_clip)

        # update weights
        decoder_optimizer.step()
        if encoder_optimizer is not None:
            encoder_optimizer.step()

        # keep track of metrics, weighted by the number of decoded tokens
        num_tokens = sum(decode_lengths)
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), num_tokens)
        top5accs.update(top5, num_tokens)
        batch_time.update(time.time() - start)

        start = time.time()

        # print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, num_batches,
                                                                          batch_time=batch_time,
                                                                          data_time=data_time, loss=losses,
                                                                          top5=top5accs))

        # tensorboard: use a step that keeps increasing across epochs.
        # Using the bare batch index would make every epoch write onto the
        # same steps 0..num_batches-1.
        global_step = epoch * num_batches + i
        logger.add_scalar('Loss/train', losses.val, global_step)
        logger.add_scalar('top5acc/train', top5accs.val, global_step)
        
        

In [None]:
def validate(val_loader, encoder, decoder, criterion, vocab, epoch=None):
    """Evaluate on the validation set, print/log the metrics and return BLEU-4.

    Besides its arguments, this function reads several notebook-level names:
    ``device``, ``logger``, ``alpha_c``, ``print_freq``, ``AverageMeter``,
    ``accuracy`` and ``print_scores`` (presumably provided by the project
    star-imports; they are not defined in the visible cells).

    Args:
        val_loader: iterable of ``(imgs, caps, caplens, allcaps)`` batches;
            must support ``len()``.
        encoder: module applied to the image batch, or ``None`` to feed the
            loader's ``imgs`` to the decoder unchanged.
        decoder: called as ``decoder(imgs, caps, caplens)`` and expected to
            return ``(scores, caps_sorted, decode_lengths, alphas, sort_ind)``.
        criterion: loss callable applied to the packed scores and targets.
        vocab: object exposing ``indextostring``, used to convert index
            sequences for both references and predictions.
        epoch: optional TensorBoard step for this validation pass. When
            omitted, the index of the last batch is used, as before.

    Returns:
        The fourth value returned by ``print_scores`` (reported as BLEU-4).
    """
    decoder.eval()
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list()  # true captions for calculating the bleu scores
    hypotheses = list()  # hypotheses (predictions)

    last_batch_idx = 0  # fallback TensorBoard step when `epoch` is not given

    with torch.no_grad():
        # batches
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):
            last_batch_idx = i

            # move to device, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.unsqueeze(1).to(device)

            # forward prop
            if encoder is not None:
                imgs = encoder(imgs)

            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)

            # targets are the words after the first token
            targets = caps_sorted[:, 1:]

            # keep the unpacked scores: they are needed per-sample for the hypotheses
            scores_copy = scores.clone()
            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data

            # calculate loss
            loss = criterion(scores, targets)

            # doubly stochastic attention regularization
            loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

            # keep track of metrics, weighted by the number of decoded tokens
            num_tokens = sum(decode_lengths)
            losses.update(loss.item(), num_tokens)
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, num_tokens)
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader), batch_time=batch_time,
                                                                                loss=losses, top5=top5accs))

            # references: reorder with sort_ind so they line up with the decoder's output order
            allcaps = allcaps[sort_ind]
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                references.append(vocab.indextostring(img_caps))

            # hypotheses: arg-max token per step, truncated to the decoded length (removes pads)
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = [pred[:decode_lengths[j]] for j, pred in enumerate(preds)]

            hypotheses.extend(vocab.indextostring(temp_preds))

            assert len(references) == len(hypotheses)

        # print scores
        b1, b2, b3, b4 = print_scores(references, hypotheses)

        # `bleu4` was never assigned (its computation is disabled), so the
        # summary must report the b4 value returned by print_scores.
        print(
            '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'.format(
                loss=losses,
                top5=top5accs,
                bleu=b4))

        # tensorboard: log the epoch-level averages (what the summary line
        # reports), not just the last batch's values.
        step = epoch if epoch is not None else last_batch_idx
        logger.add_scalar('Loss/valid', losses.avg, step)
        logger.add_scalar('top5acc/valid', top5accs.avg, step)
        logger.add_scalar('b-1/valid', b1, step)
        logger.add_scalar('b-2/valid', b2, step)
        logger.add_scalar('b-3/valid', b3, step)
        logger.add_scalar('b-4/valid', b4, step)

    return b4

In [46]:
# Summary metrics attached to the hyper-parameters in TensorBoard's hparams
# view; these are separate from the per-step training/validation scalars.
# NOTE(review): the values below are literals entered by hand, not read from
# a run in this notebook (e.g. total_epochs is 12 while `epochs` above is 1)
# — confirm they match the experiment being logged.
results_dic = {
    # training / validation
    'total_epochs': 12,
    'Top_5_acc': 75.361,
    'V-bleu-1': 73.986,
    'V-bleu-2': 51.126,
    'V-bleu-3': 55.67,
    'V-bleu-4': 56.61,
    'V-Meteor': 18,
    # test-set scores (T-bleu-1 .. T-bleu-4) are not recorded yet
}

logger.add_hparams(hparam_dict=logger_dic, metric_dict=results_dic)