In [1]:
import json

# set the subset of the data to use
subset = 1
# load the data
data = json.load(open('../data/subsets/visdial_1.0_train_' +
                 str(subset) + 'percent_subset.json'))['data']
# load the dialogs
dialogs = data['dialogs']


In [2]:
import pickle

'''Load the GAP output of the History Graphs Batch'''
with open('../embeddings/history/' + str(subset) + '/history_batch_GAP.pkl', 'rb') as f:
    history_batch_GAP = pickle.load(f)
    
'''Load the question graphs'''
with open('../embeddings/questions/' + str(subset) + '/question_graphs.pkl', 'rb') as f:
    question_graphs = pickle.load(f)

'''Load the image graphs'''
with open('../embeddings/images/instance/' + str(subset) + '/image_graphs.pkl', 'rb') as f:
    image_graphs = pickle.load(f)

'''We now follow a bottom-up approach to build the context aware GoG Graphs. Approach:

    Data: <dialogs> contain n entries (depending on the subset) of <10 rounds of dialog> for each image. Each of these <dialog rounds> contain a <question>, an <answer>, <a list of 100 candidate questions>, and a <ground truth index from the candidate questions>
    
    Task Reminder: The task is to answer the questions in the <dialog round>.

    Feature Aggregation:
        1 .For each <dialog round>, we obtain 10 history graphs where the first history graph is simply a single node entry of the caption, the second graph being the first history graph + the first question and answer pair, and so on.

        For each dialog round, we first aggregate each of the history graphs for this round using GAT and GAP to obtain the embedding representations of the history graphs.
        
        2. For each <question> in the <dialog round>, we obtain 1 question graph. We use the corresponding History Graph Embedding and concatenate it to every node in the question graph.
        Note: The c_in changes to a torch.Size([300+512]) tensor for the GAT.
        
        Then we perform both GAT and GAp on the question graph to obtain the embedding representation of the question graph.

        3. To obtain a context-aware embedding representation of the image, for each question, we obtain the question graph embedding and add it as a node to the image graph. 
        Note: The c_in changes to a torch.Size([2048+512]) tensor for the GAT.

        Then we perform both GAT and GAP on the image graph to obtain the embedding representation of the image graph.

        4. Then we use the "Efficient Attention Mechanism for Visual Dialog that can Handle All the Interactions between Multiple Inputs" method to train a fusion model using an encoder-decoder (discriminative decoder) to predict the ground truth index from the candidate answers for each question.
        '''


In [3]:
'''Graph Attention Network (GAT) code from https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial7/GNN_overview.html'''

# Progress bar
from tqdm.notebook import tqdm

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data

# Torchvision
from torchvision import transforms
# PyTorch Lightning
import pytorch_lightning as pl


class GATLayer(nn.Module):

    def __init__(self, c_in, c_out, num_heads=1, concat_heads=True, alpha=0.2):
        """
        Inputs:
            c_in - Dimensionality of input features
            c_out - Dimensionality of output features
            num_heads - Number of heads, i.e. attention mechanisms to apply in parallel. The
                        output features are equally split up over the heads if concat_heads=True.
            concat_heads - If True, the output of the different heads is concatenated instead of averaged.
            alpha - Negative slope of the LeakyReLU activation.
        """
        super().__init__()
        self.num_heads = num_heads
        self.concat_heads = concat_heads
        if self.concat_heads:
            assert c_out % num_heads == 0, "Number of output features must be a multiple of the count of heads."
            c_out = c_out // num_heads

        # Sub-modules and parameters needed in the layer
        self.projection = nn.Linear(c_in, c_out * num_heads)
        self.a = nn.Parameter(torch.Tensor(
            num_heads, 2 * c_out))  # One per head
        self.leakyrelu = nn.LeakyReLU(alpha)

        # Initialization from the original implementation
        nn.init.xavier_uniform_(self.projection.weight.data, gain=1.414)
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

    def forward(self, node_feats, adj_matrix, print_attn_probs=False):
        """
        Inputs:
            node_feats - Input features of the node. Shape: [batch_size, c_in]
            adj_matrix - Adjacency matrix including self-connections. Shape: [batch_size, num_nodes, num_nodes]
            print_attn_probs - If True, the attention weights are printed during the forward pass (for debugging purposes)
        """
        batch_size, num_nodes = node_feats.size(0), node_feats.size(1)

        # Apply linear layer and sort nodes by head
        node_feats = self.projection(node_feats)
        node_feats = node_feats.view(batch_size, num_nodes, self.num_heads, -1)

        # We need to calculate the attention logits for every edge in the adjacency matrix
        # Doing this on all possible combinations of nodes is very expensive
        # => Create a tensor of [W*h_i||W*h_j] with i and j being the indices of all edges
        # Returns indices where the adjacency matrix is not 0 => edges
        edges = adj_matrix.nonzero(as_tuple=False)
        node_feats_flat = node_feats.view(
            batch_size * num_nodes, self.num_heads, -1)
        edge_indices_row = edges[:, 0] * num_nodes + edges[:, 1]
        edge_indices_col = edges[:, 0] * num_nodes + edges[:, 2]
        a_input = torch.cat([
            torch.index_select(input=node_feats_flat,
                               index=edge_indices_row, dim=0),
            torch.index_select(input=node_feats_flat,
                               index=edge_indices_col, dim=0)
        ], dim=-1)  # Index select returns a tensor with node_feats_flat being indexed at the desired positions along dim=0

        # Calculate attention MLP output (independent for each head)
        attn_logits = torch.einsum('bhc,hc->bh', a_input, self.a)
        attn_logits = self.leakyrelu(attn_logits)

        # Map list of attention values back into a matrix
        attn_matrix = attn_logits.new_zeros(
            adj_matrix.shape+(self.num_heads,)).fill_(-9e15)
        attn_matrix[adj_matrix[..., None].repeat(
            1, 1, 1, self.num_heads) == 1] = attn_logits.reshape(-1)

        # Weighted average of attention
        attn_probs = F.softmax(attn_matrix, dim=2)
        if print_attn_probs:
            print("Attention probs\n", attn_probs.permute(0, 3, 1, 2))
        node_feats = torch.einsum('bijh,bjhc->bihc', attn_probs, node_feats)

        # If heads should be concatenated, we can do this by reshaping. Otherwise, take mean
        if self.concat_heads:
            node_feats = node_feats.reshape(batch_size, num_nodes, -1)
        else:
            node_feats = node_feats.mean(dim=2)

        return node_feats

In [4]:
'''Initialize the GAT layer with:
- input dimension of 300+512 = 812 (the dimensionality of the node features)
- output dimension of 512 (the dimensionality of the output features)
- 4 attention heads
- attention heads are not concatenated
- alpha is set to 0.2
Note: The features here are set from the GoG Paper'''

gat_layer_questions = GATLayer(c_in=812, c_out=512, num_heads=4,
                     concat_heads=False, alpha=0.2)

In [5]:
import dgl
from dgl.nn import GlobalAttentionPooling

# initialize the gate layer
gate_nn = torch.nn.Linear(512, 1)
# initialize the GlobalAttentionPooling layer
gap = GlobalAttentionPooling(gate_nn)


In [6]:
'''Function to create an adjacency matrix from a graph where the input is a list of edges'''

def create_adj_matrix(edges, number_of_nodes):
    adj_matrix = torch.zeros(number_of_nodes, number_of_nodes)
    for edge in edges:
        adj_matrix[edge[0], edge[1]] = 1
        adj_matrix[edge[1], edge[0]] = 1
    # convert to list of lists
    adj_matrix = adj_matrix.tolist()
    # values should be integers
    adj_matrix = [[int(j) for j in i] for i in adj_matrix]
    return adj_matrix

In [7]:
'''Function that takes in a graph and a node embedding as input and concatenates the node embedding to every node in the graph and returns the updated graph'''
import torch
import numpy as np
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from tqdm import tqdm

def add_node_to_graph(graph, node_embedding):
    # concatenate the node embedding to every node in the graph
    graph.x = torch.cat((graph.x, node_embedding.repeat(graph.x.shape[0], 1)), 1)
    # create the adjacency matrix
    adj_matrix = create_adj_matrix(graph.edge_index.T.tolist(), graph.x.shape[0])
    # return the updated graph and the adjacency matrix
    return graph, adj_matrix

In [8]:
# Testing the add_node_to_graph function
temp = question_graphs[0].clone()
print('The original graph is: ', temp)
temp_update, temp_update_adj = add_node_to_graph(temp, torch.rand(512))
print('The updated graph is: ', temp_update)
print('The updated adjacency matrix shape is: ',
      np.array(temp_update_adj).shape)

The original graph is:  Data(x=[20, 300], edge_index=[2, 10], edge_attr=[6])
The updated graph is:  Data(x=[20, 812], edge_index=[2, 10], edge_attr=[6])
The updated adjacency matrix shape is:  (20, 20)


In [9]:
'''For each <dialog> entry, we obtain 10 questions <dialog rounds>'''
# store the list of history-aware-question graphs GAP embeddings
question_batch_GAP = []

for dialog in tqdm(dialogs):

    '''Load the history embeddings (GAP) which contains 10 torch tensors of shape 512'''
    history_embeddings_dialog = history_batch_GAP[dialogs.index(dialog)]
    
    '''Load the question indices which correspond to the question graphs'''
    question_ids_dialog = [round['question'] for round in dialog['dialog']]
    
    '''Load the question graphs for this dialog'''
    question_graphs_dialog = [question_graphs[i] for i in question_ids_dialog]
    
    # question GAP embeddings for this dialog
    question_GAP_dialog = []

    '''For each question graph, we concatenate this to each graph node'''
    for h_node, q_graph_dialog in zip(history_embeddings_dialog, question_graphs_dialog):
        # clone the question graph
        q_graph = q_graph_dialog.clone()
        # add the node to the graph
        q_graph, adj_matrix = add_node_to_graph(q_graph, h_node)
        
        '''We pass these graphs and the adjacency matrix through the GAT layer'''
        q_graph.x = gat_layer_questions(q_graph.x.unsqueeze(0), torch.tensor([adj_matrix]), print_attn_probs=False).squeeze(0)

        '''Now for each question graph, we need to obtain the GAP embedding'''
        # initialize an empty DGL graph with the same number of nodes as the question graph
        q_graph_GAP = dgl.graph(([], []), num_nodes=q_graph.num_nodes)
        # add the node features to the graph where the node feature size is 512
        q_graph_GAP_node_feats = q_graph.x
        # add the edges to the graph
        if q_graph.edge_index.shape[0] == 0:
            pass
        else:
            q_graph_GAP.add_edges(q_graph.edge_index[0], q_graph.edge_index[1])
        # apply the GlobalAttentionPooling layer to the graph
        q_graph_GAP_embedding = gap(q_graph_GAP, q_graph_GAP_node_feats).squeeze(0)
        # append the GAP embedding to the list of GAP embeddings
        question_GAP_dialog.append(q_graph_GAP_embedding)

    # append the list of GAP embeddings to the list of GAP embeddings for all dialogs
    question_batch_GAP.append(question_GAP_dialog)

'''Save the question GAP embeddings'''

with open('../embeddings/fusion/question_GAP_batch_gog' + str(subset) + '.pkl', 'wb') as f:
    pickle.dump(question_batch_GAP, f)

100%|██████████| 1233/1233 [00:30<00:00, 40.48it/s]


In [10]:
'''Initialize the GAT layer with:
- input dimension of 2048+512 (the dimensionality of the node features)
- output dimension of 512 (the dimensionality of the output features)
- 4 attention heads
- attention heads are not concatenated
- alpha is set to 0.2
Note: The features here are set from the GoG Paper'''

gat_layer_images = GATLayer(c_in=2560, c_out=512, num_heads=4,
                     concat_heads=False, alpha=0.2)

In [11]:
'''For each <dialog> entry, we obtain 10 questions <dialog rounds> for which we need to update the image graph with the history-aware-question GAP embeddings'''
# store the list of history-aware-question-aware image graphs GAP embeddings
image_batch_GAP = []

for dialog in tqdm(dialogs):

    '''Load the question embeddings (GAP) which contains 10 torch tensors of shape 512'''
    question_embeddings_dialog = question_batch_GAP[dialogs.index(dialog)]
    
    '''Load the image index which correspond to the image graph'''
    image_id_dialog = dialogs.index(dialog)
    # Note: In the subset, the image_id_dialog is the same as the dialog_id_dialog that was used to create the image graphs

    '''Load the image graphs for this dialog'''
    image_graphs_dialog = image_graphs[image_id_dialog]
    # since each image will have 10 different contexts, we need to clone the image graph 10 times
    image_graphs_dialog = [image_graphs_dialog.clone() for i in range(10)]

    # question GAP embeddings for this dialog
    image_GAP_dialog = []

    '''For each image graph, we concatenate this to each graph node'''
    for q_node, i_graph_dialog in zip(question_embeddings_dialog, image_graphs_dialog):
        # clone the image graph
        i_graph = i_graph_dialog.clone()
        # add the node to the graph
        i_graph, adj_matrix = add_node_to_graph(i_graph, q_node)

        '''We pass these graphs and the adjacency matrix through the GAT layer'''
        i_graph.x = gat_layer_images(i_graph.x.unsqueeze(0), torch.tensor(
            [adj_matrix]), print_attn_probs=False).squeeze(0)

        '''Now for each image graph, we need to obtain the GAP embedding'''
        # initialize an empty DGL graph with the same number of nodes as the question graph
        i_graph_GAP = dgl.graph(([], []), num_nodes=i_graph.num_nodes)
        # add the node features to the graph where the node feature size is 512
        i_graph_GAP_node_feats = i_graph.x
        # add the edges to the graph
        if i_graph.edge_index.shape[0] == 0:
            pass
        else:
            i_graph_GAP.add_edges(i_graph.edge_index[0], i_graph.edge_index[1])
        # apply the GlobalAttentionPooling layer to the graph
        i_graph_GAP_embedding = gap(
            i_graph_GAP, i_graph_GAP_node_feats).squeeze(0)
        # append the GAP embedding to the list of GAP embeddings
        image_GAP_dialog.append(i_graph_GAP_embedding)

    # append the list of GAP embeddings to the list of GAP embeddings for all dialogs
    image_batch_GAP.append(image_GAP_dialog)

'''Save the image GAP embeddings'''

with open('../embeddings/fusion/image_GAP_batch_gog' + str(subset) + '.pkl', 'wb') as f:
    pickle.dump(image_batch_GAP, f)

  adj_matrix = create_adj_matrix(graph.edge_index.T.tolist(), graph.x.shape[0])
100%|██████████| 1233/1233 [00:45<00:00, 27.18it/s]
