In [47]:
import pickle

# set the subset number to use
subset = 1

# Index 880 is removed from every batch since it was a problem data point that the
# visdial dataloader ignores for some reason.
# Check tools/functionalties.ipynb for more details
PROBLEM_INDEX = 880


def load_gap_batch(path, drop_index=PROBLEM_INDEX):
    """Load a pickled GAP batch and remove the problematic entry from it.

    Args:
        path: Location of the pickle file to read.
        drop_index: Argument handed to ``pop`` on the unpickled object
            (defaults to ``PROBLEM_INDEX``).

    Returns:
        The unpickled batch after ``drop_index`` has been popped.

    Raises:
        OSError: If the file cannot be opened.
        IndexError / KeyError: If ``drop_index`` is not present in the batch.

    Note:
        ``pickle.load`` can execute arbitrary code; only use it on trusted files.
    """
    with open(path, 'rb') as f:
        batch = pickle.load(f)
    # the file is already closed here; popping only touches the in-memory object
    batch.pop(drop_index)
    return batch


# MASTER NODE IMPLEMENTATION

# Load the GAP output of the History Graphs Batch
history_batch_GAP = load_gap_batch(
    f'../../embeddings/history/{subset}/history_batch_GAP.pkl')

# Load the GAP output of the Question Graphs Batch
question_GAP_batch = load_gap_batch(
    f'../../embeddings/fusion/question_GAP_batch_master_node{subset}.pkl')

# Load the GAP output of the Image Graphs Batch
image_GAP_batch = load_gap_batch(
    f'../../embeddings/fusion/image_GAP_batch_master_node{subset}.pkl')

In [48]:
# adjust the inputs to match the attn_encoder
import torch

def adjust_inputs(inputs):
    """Refactor the GAP outputs to match the attn_encoder input format.

    Takes in a list of lists of tensors and returns a tensor of tensors.

    Args:
        inputs: Nested list (list of lists) of tensors that can be stacked together.

    Returns:
        A ``torch.FloatTensor`` built by flattening the nested list, giving every
        tensor a new leading axis of size 1 and stacking the results.
    """
    # flatten the nesting and add the singleton axis in one pass
    expanded = [tensor.unsqueeze(0) for group in inputs for tensor in group]
    stacked = torch.stack(expanded)
    # convert torch tensor to FloatTensor
    return stacked.type(torch.FloatTensor)

In [49]:
from torch.utils.data import Dataset, DataLoader

def _round_loader(gap_batch):
    """Wrap one modality's GAP batch in an unshuffled DataLoader.

    The GAP batch is first converted with ``adjust_inputs``.
    """
    # note that the batch size is 8 but shaping it to 80 since Dataloader includes the NH dimension
    return DataLoader(adjust_inputs(gap_batch), batch_size=8*10, shuffle=False)


# hi: torch.FloatTensor
# The representation of history utility
# Shape [batch_size x NH, T, hidden_size]
hi = _round_loader(history_batch_GAP)

# qe: torch.FloatTensor
# The representation of question utility
# Shape [batch_size x NH, N, hidden_size]
qe = _round_loader(question_GAP_batch)

# im: torch.FloatTensor
# The representation of image utility
# Shape [batch_size x NH, K, hidden_size]
im = _round_loader(image_GAP_batch)

# sanity check: show the shapes of the first batch only
for h, q, i in zip(hi, qe, im):
    print('History Input for Attn_Encoder: ', h.shape)
    print('Question Input for Attn_Encoder: ', q.shape)
    print('Image Input for Attn_Encoder: ', i.shape)
    break

History Input for Attn_Encoder:  torch.Size([80, 1, 512])
Question Input for Attn_Encoder:  torch.Size([80, 1, 512])
Image Input for Attn_Encoder:  torch.Size([80, 1, 512])


In [50]:
import torch.nn as nn

def _all_ones_mask(gap_batch):
    """Return an all-ones mask with ``len(gap_batch) * 10`` rows and one column."""
    # 1232 different dialogs x 10 number of history rounds
    # NOTE(review): the shape notes below say LongTensor, but torch.ones is called
    # without a dtype here — confirm which dtype the encoder expects.
    return torch.ones(len(gap_batch)*10, 1)


# mask_hi: torch.LongTensor
# Shape [batch_size x NH, T]
history_masks = _all_ones_mask(history_batch_GAP)
mask_hi = DataLoader(history_masks, batch_size=8*10, shuffle=False)

# mask_qe: torch.LongTensor
# Shape [batch_size x NH, N]
question_masks = _all_ones_mask(question_GAP_batch)
mask_qe = DataLoader(question_masks, batch_size=8*10, shuffle=False)

# mask_im: torch.LongTensor
# Shape [batch_size x NH, K]
image_masks = _all_ones_mask(image_GAP_batch)
mask_im = DataLoader(image_masks, batch_size=8*10, shuffle=False)

# sanity check: show the shapes of the first batch only
for m_h, m_q, m_i in zip(mask_hi, mask_qe, mask_im):
    print('History Mask for Attn_Encoder: ', m_h.shape)
    print('Question Mask for Attn_Encoder: ', m_q.shape)
    print('Image Mask for Attn_Encoder: ', m_i.shape)
    break

History Mask for Attn_Encoder:  torch.Size([80, 1])
Question Mask for Attn_Encoder:  torch.Size([80, 1])
Image Mask for Attn_Encoder:  torch.Size([80, 1])


In [51]:
# setting the config from new_options.py
# The dict has five sections: 'seed', 'dataset', 'model', 'solver' and 'callbacks'.
# Later cells read e.g. config['solver']['batch_size'], config['solver']['cpu_workers'],
# config['model']['txt_vocab_size'] and config['callbacks'][...] from it.
# NOTE(review): no 'config_name' key is defined here, although the training cell
# further down looks up 'config_name' in this dict — confirm where it should come from.
config = {'seed': 42,
        'dataset': {
                'v0.9': False,
                'overfit': False,
                'concat_hist': False,
                'max_seq_len': 20,
                'vocab_min_count': 5,
                'finetune': False,
                'is_add_boundaries': True,
                'is_return_options': True,
                'num_boxes': 'fixed',
                'glove_path': 'datasets/glove/embedding_Glove_840_300d.pkl',
                'train_feat_img_path': 'datasets/bottom-up-attention/trainval_resnet101_faster_rcnn_genome__num_boxes_100_100.h5',
                'val_feat_img_path': 'datasets/bottom-up-attention/val2018_resnet101_faster_rcnn_genome__num_boxes_100_100.h5',
                'test_feat_img_path': 'datasets/bottom-up-attention/test2018_resnet101_faster_rcnn_genome__num_boxes_100_100.h5',
                'train_json_dialog_path': 'datasets/annotations/visdial_1.0_train.json',
                'val_json_dialog_path': 'datasets/annotations/visdial_1.0_val.json',
                'test_json_dialog_path': 'datasets/annotations/visdial_1.0_test.json',
                'val_json_dense_dialog_path': 'datasets/annotations/visdial_1.0_val_dense_annotations.json',
                'train_json_word_count_path': 'datasets/annotations/visdial_1.0_word_counts_train.json'
                },
        'model': {
                'decoder_type': 'misc',
                'encoder_out': ['img', 'ques'],
                'hidden_size': 512,
                'dropout': 0.1,
                'test_mode': False,

                # image features
                'img_feat_size': 2048,
                'img_num_attns': None,
                'img_has_bboxes': False,
                'img_has_attributes': False,
                'img_has_classes': False,

                # text features
                'txt_vocab_size': 11322,
                'txt_tokenizer': 'nlp',
                'txt_bidirectional': True,
                'txt_embedding_size': 300,
                'txt_has_pos_embedding': False,
                'txt_has_layer_norm': False,
                'txt_has_decoder_layer_norm': False,

                # cross attention
                'ca_has_shared_attns': False,
                'ca_has_proj_linear': False,
                'ca_has_layer_norm': False,
                'ca_has_residual': False,
                'ca_num_attn_stacks': 1,
                'ca_num_attn_heads': 4,
                'ca_pad_size': 2,
                'ca_has_avg_attns': False,
                'ca_has_self_attns': False,
                },
        'solver': {
                # Adam optimizer
                'optimizer': 'adam',
                'adam_betas': [0.9, 0.997],
                'adam_eps': 1e-9,
                'weight_decay': 1e-5,
                'clip_norm': None,
                # dataloader
                'num_epochs': 100,
                'batch_size': 8,
                'cpu_workers': 8,
                'batch_size_multiplier': 1,
                # learning rate scheduler
                'scheduler_type': 'LinearLR',
                'init_lr': 5e-3,
                'min_lr': 1e-5,
                'num_samples': 1233,
                # warmup scheduler
                'warmup_factor': 0.2,
                'warmup_epochs': 1,
                # linear scheduler
                'linear_gama': 0.5,
                'milestone_steps': [3, 6, 8, 10, 11],
                'fp16': False,
                },
        'callbacks': {
                'resume': False,
                'validate': True,
                'path_pretrained_ckpt': None,
                'save_dir': 'checkpoints/',
                'log_dir': 'checkpoints/tensorboard/',
                }
        }

In [52]:
from visdial.encoders.attn_encoder import AttentionStack
# The Attention Stack include of 3 blocks (i.e. 9 MHAttentions) to compute the
# attention from all sources to one target (including itself).
# Attention from X -> Y and Y -> X can be wrapped into a single MultiHeadAttention,
# and self-attention X -> X can be wrapped into MultiHeadAttention(X, X).

# initialize the AttentionStack
attention_stack = AttentionStack(config)
batch_output = []

# walk the six loaders in lock-step: image, question, history features + their masks
for i, q, h, m_i, m_q, m_h in zip(im, qe, hi, mask_im, mask_qe, mask_hi):
    # the stack takes a single tuple argument
    batch_input = (i, q, h, m_i, m_q, m_h)
    # output : A tuples of the updated representations of inputs as the triples.
    batch_output.append(attention_stack(batch_input))

In [53]:
from visdial.data.dataset import VisDialDataset

# Build the 'train' split of the VisDial dataset from the notebook config
train_dataset = VisDialDataset(config, 'train')

[train subset] Tokenizing questions...
[train subset] Tokenizing answers...
[train subset] Tokenizing captions...


In [54]:
# Inspect which fields a single dataset item exposes
train_dataset[0].keys()

dict_keys(['img_ids', 'num_rounds', 'opts', 'opts_in', 'opts_out', 'opts_len', 'opts_in_len', 'opts_out_len', 'ans', 'ans_in', 'ans_out', 'ans_len', 'ans_in_len', 'ans_out_len', 'ans_ind', 'img_feat', 'ques_tokens', 'hist_tokens', 'ques_len', 'hist_len'])

In [55]:
# Look at the 'img_ids' field of the first training item
train_dataset[0]['img_ids']

tensor(218703)

In [62]:
from ast import literal_eval
# Scratch check: a list turned into a string with str() can be parsed back into
# a list with ast.literal_eval
a = [1, 2, 3, 4, 5]
b = str(a)
# convert string to list
c = literal_eval(b)
type(c)

list

In [56]:
from torch.utils.data import DataLoader

# Unshuffled loader over the training dataset; batch size and worker count come
# from the 'solver' section of the config
train_dataloader = DataLoader(train_dataset,
                              batch_size=config['solver']['batch_size'],
                              num_workers=config['solver']['cpu_workers'],
                              shuffle=False)

In [63]:
import pickle
from ast import literal_eval

# Map each dataset batch to its matching encoder inputs.
# The key is the string form of the batch's img_ids list; the value is the tuple
# (image, question, history, image mask, question mask, history mask).
# NOTE(review): zip stops at the shortest iterable, so any trailing batch that
# only some of the loaders produce is silently left out — confirm this is intended.
batch_input = {}
for batch, i, q, h, m_i, m_q, m_h in zip(train_dataloader, im, qe, hi, mask_im, mask_qe, mask_hi):
    # convert to tuple with batch['img_ids'] as the primary key
    batch_input[str(batch['img_ids'].tolist())] = (i, q, h, m_i, m_q, m_h)

# save them to disk
with open('batch_input.pkl', 'wb') as f:
    pickle.dump(batch_input, f)


In [42]:
import pickle

# Reload the encoder inputs cached in batch_input.pkl.
# The context manager closes the file handle as soon as loading finishes.
# pickle.load can execute arbitrary code — only load files you trust.
with open('batch_input.pkl', 'rb') as f:
    batch_input = pickle.load(f)
len(batch_input)

154

In [77]:
# Scratch check: reorder the values of `a` so they follow the order in which
# they appear in `b`
a = [1, 2, 3, 4, 5]
b = [3, 1, 2, 4, 5]
# sort the list
c = sorted(a, key=lambda x: b.index(x))
c

[3, 1, 2, 4, 5]

In [81]:
# Scratch check: plain ascending sort of `b`
sorted(b)

[1, 2, 3, 4, 5]

In [82]:
# Print every key of the cached batch_input dict, one per line
for i in batch_input.keys():
    print(i)


[218703, 27510, 275130, 166165, 214995, 462664, 114473, 175227]
[134001, 547115, 281733, 451345, 122678, 499733, 439352, 26365]
[208945, 188771, 240441, 516535, 246467, 338880, 100665, 271283]
[22135, 411700, 117192, 301376, 143800, 268718, 554152, 441605]
[120118, 282466, 163617, 482493, 58800, 160233, 70133, 500559]
[366569, 563371, 425758, 498702, 439926, 435267, 20279, 115158]
[175653, 451863, 304238, 147772, 315476, 513766, 206083, 356043]
[355902, 366329, 218155, 72860, 459200, 158602, 113787, 550736]
[522195, 468337, 504968, 343012, 117325, 316353, 524702, 131101]
[143323, 300737, 558915, 496732, 432008, 2843, 221562, 420287]
[190547, 228749, 213527, 440093, 82484, 86234, 397379, 433168]
[265574, 308520, 217707, 107331, 490810, 116358, 513513, 406508]
[121000, 70508, 446832, 224149, 490638, 96535, 317969, 491689]
[506220, 301317, 317911, 478982, 123946, 236740, 295563, 120388]
[51372, 144608, 452783, 483871, 497455, 515147, 386650, 110138]
[445893, 318562, 116466, 347131, 68765,

In [18]:
from visdial.decoders.decoder import Decoder

# Instantiate the Decoder from the notebook config
decoder = Decoder(config)

In [10]:
from visdial.decoders.disc_decoder import DiscriminativeDecoder

# Instantiate the DiscriminativeDecoder from the notebook config
disc_decoder = DiscriminativeDecoder(config)

In [25]:
# Smoke test: run the decoder on the first (dataset batch, AttentionStack output) pair only.
# NOTE(review): the comments below mention the DiscriminativeDecoder, but the code
# calls `decoder`, not `disc_decoder` — confirm which one is meant.
# initialize the DiscriminativeDecoder output
output = []

# pass the batch and the batch_output from the Attention Stack to the DiscriminativeDecoder
for batch, encoder_outputs in zip(train_dataloader, batch_output):
    output.append(decoder(batch, encoder_outputs, True))
    # NOTE(review): this is a second forward call, without the third argument
    # used above — confirm the difference is intentional.
    print(decoder(batch, encoder_outputs))
    # stop after the first batch
    break

{}


In [2]:
import os
import torch
import random
import numpy as np
from torch import nn
from tqdm import tqdm
from visdial.model import get_model
from torch.utils.data import DataLoader
from visdial.data.dataset import VisDialDataset
from visdial.metrics import SparseGTMetrics, NDCG
from visdial.utils.checkpointing import CheckpointManager, load_checkpoint_from_config
from visdial.utils import move_to_cuda
from visdial.common.utils import check_flag
# from new_options import get_training_config_and_args
from torch.utils.tensorboard import SummaryWriter
from visdial.optim import Adam, LRScheduler, get_weight_decay_params
from visdial.loss import DiscLoss

# config, args = get_training_config_and_args()

# Seed every random number generator used below from the config value
seed = config['seed']
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(seed)

# turn off cuDNN autotuning and request deterministic kernels
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# exported as a string because environment values must be str
os.environ['PYTHONHASHSEED'] = str(seed)

print(f"CUDA number: {torch.cuda.device_count()}")

CUDA number: 1


In [5]:
"""DATASET INIT"""
# torch.cuda.device_count() is 0 on CPU-only hosts, which would turn every batch
# size below into 0 and make DataLoader reject it; scale by at least one replica.
num_replicas = max(1, torch.cuda.device_count())

print("Loading val dataset...")
val_dataset = VisDialDataset(config, split='val')

if check_flag(config['dataset'], 'v0.9'):
    val_dataset.dense_ann_feat_reader = None

# validation uses half the training batch size per replica (never less than 1)
val_batch_size = max(1, int(config['solver']['batch_size'] / 2 * num_replicas))
val_dataloader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            num_workers=config['solver']['cpu_workers'])

print("Loading train dataset...")
if config['dataset']['overfit']:
    # overfit mode: train on the validation split itself
    train_dataset = val_dataset
    train_dataloader = val_dataloader
else:
    train_dataset = VisDialDataset(config, split='train')
    if check_flag(config['dataset'], 'v0.9'):
        train_dataset.dense_ann_feat_reader = None

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config['solver']['batch_size'] *
                                  num_replicas,
                                  num_workers=config['solver']['cpu_workers'],
                                  shuffle=True)

Loading val dataset...
[val2018] Tokenizing questions...
[val2018] Tokenizing answers...
[val2018] Tokenizing captions...
Loading train dataset...
[train subset] Tokenizing questions...
[train subset] Tokenizing answers...
[train subset] Tokenizing captions...


In [15]:
# Inspect which fields a single dataset item exposes
train_dataset[0].keys()

dict_keys(['img_ids', 'num_rounds', 'opts', 'opts_in', 'opts_out', 'opts_len', 'opts_in_len', 'opts_out_len', 'ans', 'ans_in', 'ans_out', 'ans_len', 'ans_in_len', 'ans_out_len', 'ans_ind', 'img_feat', 'ques_tokens', 'hist_tokens', 'ques_len', 'hist_len'])

In [40]:
"""MODEL INIT"""
print("Init model...")
# Build the model exactly once; the earlier version called get_model(config)
# twice and threw the first instance away.
model = get_model(config)
# set device to cpu if cuda is not available
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Cuda available, using gpu")
else:
    device = torch.device('cpu')
    print("Cuda not available, using cpu")
model = model.to(device)

Init model...
Cuda available, using gpu


In [8]:
"""LOSS FUNCTION"""
from visdial.loss import DiscLoss

# Losses: one for the discriminative scores, one for the generative scores
disc_criterion = DiscLoss(return_mean=True)
# target value 0 is excluded from the generative loss
gen_criterion = nn.CrossEntropyLoss(ignore_index=0)

# OPTIMIZER
# NOTE(review): weight_decay is passed both to get_weight_decay_params and to
# Adam — confirm it is not applied twice.
parameters = get_weight_decay_params(
    model, weight_decay=config['solver']['weight_decay'])

optimizer = Adam(parameters,
                 betas=config['solver']['adam_betas'],
                 eps=config['solver']['adam_eps'],
                 weight_decay=config['solver']['weight_decay'])

# torch.cuda.device_count() is 0 on CPU-only hosts; use at least one replica so
# the scheduler never receives a batch size of 0.
scheduler_batch_size = (config['solver']['batch_size'] *
                        max(1, torch.cuda.device_count()))

lr_scheduler = LRScheduler(optimizer,
                           batch_size=scheduler_batch_size,
                           num_samples=config['solver']['num_samples'],
                           num_epochs=config['solver']['num_epochs'],
                           min_lr=config['solver']['min_lr'],
                           init_lr=config['solver']['init_lr'],
                           warmup_factor=config['solver']['warmup_factor'],
                           warmup_epochs=config['solver']['warmup_epochs'],
                           scheduler_type=config['solver']['scheduler_type'],
                           milestone_steps=config['solver']['milestone_steps'],
                           linear_gama=config['solver']['linear_gama']
                           )

In [9]:
# =============================================================================
#   SETUP BEFORE TRAINING LOOP
# =============================================================================
# TensorBoard writer; the log directory comes from the 'callbacks' config section
summary_writer = SummaryWriter(log_dir=config['callbacks']['log_dir'])

# checkpoints are written under config['callbacks']['save_dir']
checkpoint_manager = CheckpointManager(
    model, optimizer, config['callbacks']['save_dir'], config=config)
# Separate metric accumulators: the un-prefixed pair is fed the averaged
# disc+gen scores during validation (and the training scores), the disc_/gen_
# pairs are fed the individual decoder scores.
sparse_metrics = SparseGTMetrics()
disc_metrics = SparseGTMetrics()
gen_metrics = SparseGTMetrics()
ndcg = NDCG()
disc_ndcg = NDCG()
gen_ndcg = NDCG()

print("Loading checkpoints...")
# returns the epoch to start from together with the model and optimizer
start_epoch, model, optimizer = load_checkpoint_from_config(
    model, optimizer, config)

# wrap the model for multi-GPU execution only when more than one GPU is visible
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
    print("Using", torch.cuda.device_count(), "GPUs and DataParallel")

Loading checkpoints...
Can't load weight from None


In [10]:
# =============================================================================
#   TRAINING LOOP
# =============================================================================
# Samples consumed per optimisation step. torch.cuda.device_count() is 0 on
# CPU-only hosts, which would make the division below raise ZeroDivisionError,
# so at least one replica is assumed.
effective_batch_size = (config['solver']['batch_size'] *
                        max(1, torch.cuda.device_count()))
# number of batches per epoch (used as the progress-bar total)
iterations = len(train_dataset) // effective_batch_size + 1
# kept as a float tensor because the epoch loss is divided by it
num_examples = torch.tensor(len(train_dataset), dtype=torch.float)
# resume the global step counter from the starting epoch
global_iteration_step = start_epoch * iterations

In [13]:
# Train for the configured number of epochs; after every epoch optionally run
# validation, log the metrics to TensorBoard and write a checkpoint.
#
# NOTE(review): the RuntimeError recorded below this cell
# ("'lengths' argument should be a 1D CPU int64 tensor") is raised inside the
# model's forward pass, presumably where sequence lengths are packed — the
# length tensor likely needs `.cpu()` there; it cannot be fixed from this cell.

# The notebook config may not define 'config_name'; fall back to a fixed tag
# instead of raising KeyError when building the TensorBoard tag names.
run_name = config.get('config_name', 'visdial')

LOG_TEMPLATE = ("epoch={:02d}, steps={:03d}K: batch_loss:{:.03f} "
                "disc_loss:{:.03f} gen_loss:{:.03f} lr={:.05f}")

for epoch in range(start_epoch, config['solver']['num_epochs']):
    print(f"Training for epoch {epoch}:")
    if check_flag(config['dataset'], 'v0.9') and epoch > 6:
        break

    # accumulated on the CPU; normalised by the dataset size after the epoch
    epoch_loss = torch.tensor(0.0)
    # defined up front so the post-epoch check works even if the loader is empty
    out = {}
    for batch in tqdm(train_dataloader, total=iterations, unit="batch"):
        batch = move_to_cuda(batch, device)

        # zero out gradients
        optimizer.zero_grad()

        # do forward
        out = model(batch)

        # compute loss (placeholders live on the same device as the model)
        gen_loss = torch.tensor(0.0, requires_grad=True, device=device)
        disc_loss = torch.tensor(0.0, requires_grad=True, device=device)
        batch_loss = torch.tensor(0.0, requires_grad=True, device=device)
        if out.get('opt_scores') is not None:
            scores = out['opt_scores'].view(-1, 100)
            target = batch['ans_ind'].view(-1)

            sparse_metrics.observe(out['opt_scores'], batch['ans_ind'])
            disc_loss = disc_criterion(scores, target)
            batch_loss = batch_loss + disc_loss

        if out.get('ans_out_scores') is not None:
            scores = out['ans_out_scores'].view(-1,
                                                config['model']['txt_vocab_size'])
            target = batch['ans_out'].view(-1)
            gen_loss = gen_criterion(scores, target)
            batch_loss = batch_loss + gen_loss

        # compute gradients
        batch_loss.backward()

        # update params
        lr = lr_scheduler.step(global_iteration_step)
        optimizer.step()

        # logging: every step in overfit mode, otherwise every 1000 steps
        if config['dataset']['overfit'] or global_iteration_step % 1000 == 0:
            print(LOG_TEMPLATE.format(
                epoch, int(global_iteration_step / 1000), batch_loss.item(),
                disc_loss.item(), gen_loss.item(), lr))

        summary_writer.add_scalar(run_name + "-train/batch_loss",
                                  batch_loss.item(), global_iteration_step)
        summary_writer.add_scalar("train/batch_lr", lr, global_iteration_step)

        global_iteration_step += 1
        torch.cuda.empty_cache()

        # weight the batch loss by the batch size; move it to the CPU first so
        # it can be added to the CPU accumulator
        epoch_loss += batch["ans"].size(0) * batch_loss.detach().cpu()

    if out.get('opt_scores') is not None:
        avg_metric_dict = {}
        avg_metric_dict.update(sparse_metrics.retrieve(reset=True))

        summary_writer.add_scalars(run_name + "-train/metrics",
                                   avg_metric_dict, global_iteration_step)

        for metric_name, metric_value in avg_metric_dict.items():
            print(f"{metric_name}: {metric_value}")

    epoch_loss /= num_examples
    summary_writer.add_scalar(run_name + "-train/epoch_loss",
                              epoch_loss.item(), global_iteration_step)

    # -------------------------------------------------------------------------
    #   ON EPOCH END  (checkpointing and validation)
    # -------------------------------------------------------------------------
    # Validate and report automatic metrics.

    if config['callbacks']['validate']:
        # Switch dropout, batchnorm etc to the correct mode.
        model.eval()

        print(f"\nValidation after epoch {epoch}:")

        for batch in val_dataloader:
            # keep the returned batch, exactly as the training loop does
            batch = move_to_cuda(batch, device)

            with torch.no_grad():
                out = model(batch)

                # discriminative scores
                if out.get('opt_scores') is not None:
                    scores = out['opt_scores']
                    disc_metrics.observe(scores, batch["ans_ind"])

                    if "gt_relevance" in batch:
                        # keep only the round that has dense annotations
                        scores = scores[
                            torch.arange(scores.size(0)),
                            batch["round_id"] - 1, :]

                        disc_ndcg.observe(scores, batch["gt_relevance"])

                # generative scores
                if out.get('opts_out_scores') is not None:
                    scores = out['opts_out_scores']
                    gen_metrics.observe(scores, batch["ans_ind"])

                    if "gt_relevance" in batch:
                        scores = scores[
                            torch.arange(scores.size(0)),
                            batch["round_id"] - 1, :]

                        gen_ndcg.observe(scores, batch["gt_relevance"])

                # average of both decoders when both are available
                if out.get('opt_scores') is not None and out.get('opts_out_scores') is not None:
                    scores = (out['opts_out_scores'] + out['opt_scores']) / 2

                    sparse_metrics.observe(scores, batch["ans_ind"])
                    if "gt_relevance" in batch:
                        scores = scores[
                            torch.arange(scores.size(0)),
                            batch["round_id"] - 1, :]

                        ndcg.observe(scores, batch["gt_relevance"])

        avg_metric_dict = {}
        avg_metric_dict.update(sparse_metrics.retrieve(reset=True, key='avg_'))
        avg_metric_dict.update(ndcg.retrieve(reset=True, key='avg_'))

        disc_metric_dict = {}
        disc_metric_dict.update(disc_metrics.retrieve(reset=True, key='disc_'))
        disc_metric_dict.update(disc_ndcg.retrieve(reset=True, key='disc_'))

        gen_metric_dict = {}
        gen_metric_dict.update(gen_metrics.retrieve(reset=True, key='gen_'))
        gen_metric_dict.update(gen_ndcg.retrieve(reset=True, key='gen_'))

        for metric_dict in [avg_metric_dict, disc_metric_dict, gen_metric_dict]:
            for metric_name, metric_value in metric_dict.items():
                print(f"{metric_name}: {metric_value}")
            summary_writer.add_scalars(run_name + "-val/metrics",
                                       metric_dict, global_iteration_step)

        model.train()
        torch.cuda.empty_cache()

        # Checkpoint
        if not config['dataset']['overfit']:
            if 'disc' in config['model']['decoder_type']:
                checkpoint_manager.step(
                    epoch=epoch, only_best=False, metrics=disc_metric_dict, key='disc_')

            elif 'gen' in config['model']['decoder_type']:
                checkpoint_manager.step(
                    epoch=epoch, only_best=False, metrics=gen_metric_dict, key='gen_')

            elif 'misc' in config['model']['decoder_type']:
                # the combined decoder is tracked by its discriminative metrics
                checkpoint_manager.step(
                    epoch=epoch, only_best=False, metrics=disc_metric_dict, key='disc_')


Training for epoch 0:
Training for epoch 0:


  0%|          | 0/155 [00:03<?, ?batch/s]


RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor