<a href="https://colab.research.google.com/github/mosesds/CS7650-project/blob/master/CS7650_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# IPython shell escape (only valid inside a notebook / IPython session):
# runs `nvidia-smi` and captures the lines it prints.
gpu_info = !nvidia-smi
# Collapse the captured lines into a single printable string.
gpu_info = '\n'.join(gpu_info)
# If the captured text contains 'failed', print instructions for enabling a
# GPU runtime instead of the report.
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

# Report whether PyTorch itself can see a CUDA device.
print(f'GPU available: {torch.cuda.is_available()}')

Fri Nov 11 22:05:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    41W / 350W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Preprocessing

In [3]:
# TODO: DATA PREPROCESSING

# Model

In [2]:
import torch.nn.functional as F
import random
import string
import re
import collections
random.seed(0)


def prepare_sequence(lst, idx_mapping):
    """
    Map each item of `lst` to its index using `idx_mapping`.

    Items missing from `idx_mapping` fall back to the entry stored under
    the key '<unk>' if present, otherwise to the entry stored under the
    key 0.

    Args:
        lst: iterable of hashable items (e.g. tokens or characters).
        idx_mapping: dict-like mapping from item to index.

    Returns:
        A `torch.long` tensor with one entry per item of `lst`. If the
        mapped values cannot be converted to a tensor, the plain Python
        list of mapped values is returned instead.

    Raises:
        AssertionError: if an item is unknown and `idx_mapping` has
            neither a '<unk>' nor a 0 key.

    Usage:
        >> prepare_sequence(['a', 'b', 'c'], {'a':0, 'b':1, 'c':2})
        tensor([0, 1, 2])
    """
    idxs = []
    for item in lst:
        if item in idx_mapping:
            idxs.append(idx_mapping[item])
        elif '<unk>' in idx_mapping:
            idxs.append(idx_mapping['<unk>'])
        elif 0 in idx_mapping:
            idxs.append(idx_mapping[0])
        else:
            # Raised explicitly (rather than via `assert`) so the check
            # survives `python -O`; AssertionError is kept so existing
            # callers see the same exception type. The f-string also
            # works for non-string items, unlike `"..." + item`.
            raise AssertionError(f"cannot map unknown token:{item}")
    try:
        return torch.tensor(idxs, dtype=torch.long)
    except Exception:
        # Best-effort fallback: the mapped values are not tensor-compatible
        # (e.g. strings), so hand back the raw list. `Exception` instead of
        # a bare `except` keeps KeyboardInterrupt/SystemExit propagating.
        return idxs


class LSTMEncoder(nn.Module):
    """
    Word + character level LSTM encoder for a context/query pair.

    Each of the two inputs (context and query) has its own set of layers:
      * a word embedding layer,
      * a character embedding layer,
      * a unidirectional character-level LSTM whose last output is used as
        the character-level representation of a word,
      * a (optionally bidirectional) word-level LSTM that consumes the
        concatenation of the word embedding and the character-level
        representation.

    Args:
        embedding_dim: size of the word embeddings.
        hidden_dim: hidden size of the word-level LSTMs.
        char_embedding_dim: size of the character embeddings.
        char_hidden_dim: hidden size of the character-level LSTMs.
        char_size: number of rows of the character embedding tables.
        vocab_size: number of rows of the word embedding tables.
        lstm_layers: number of stacked layers in every LSTM.
        bidirectional: whether the word-level LSTMs are bidirectional
            (the character LSTMs never are). When True the output size
            is hidden_dim * 2.
        dropout: accepted for interface compatibility; not applied by
            this module.
    """

    def __init__(self, embedding_dim, hidden_dim, char_embedding_dim,
                 char_hidden_dim, char_size, vocab_size, lstm_layers=1, 
                 bidirectional=False, dropout=0):
        super(LSTMEncoder, self).__init__()

        # Layer creation order is kept stable so that seeded parameter
        # initialisation is reproducible.
        self.context_embeddingLayer = nn.Embedding(vocab_size, embedding_dim)
        self.context_charEmbeddingLayer = nn.Embedding(char_size, char_embedding_dim)
        self.context_char_LSTM = nn.LSTM(char_embedding_dim, char_hidden_dim, lstm_layers)
        self.context_LSTMLayer = nn.LSTM(char_hidden_dim + embedding_dim, hidden_dim, lstm_layers, bidirectional=bidirectional)

        self.query_embeddingLayer = nn.Embedding(vocab_size, embedding_dim)
        self.query_charEmbeddingLayer = nn.Embedding(char_size, char_embedding_dim)
        self.query_char_LSTM = nn.LSTM(char_embedding_dim, char_hidden_dim, lstm_layers)
        self.query_LSTMLayer = nn.LSTM(char_hidden_dim + embedding_dim, hidden_dim, lstm_layers, bidirectional=bidirectional)

    @staticmethod
    def _encode(words, chars, word_embedding, char_embedding, char_lstm, word_lstm):
        """Encode one word sequence with the given embedding/LSTM layers."""
        num_words = len(words)
        # `chars` holds the same number of character indices for every word.
        chars_per_word = int(len(chars) / num_words)

        word_vectors = word_embedding(words).view(num_words, -1)
        char_vectors = char_embedding(chars).view(num_words, chars_per_word, -1)

        # Run the character LSTM once over all words: time-major layout
        # (chars_per_word, num_words, char_embedding_dim) with the words as
        # the batch dimension. Every word starts from a zero state, exactly
        # like stepping through each word's characters one at a time.
        char_outputs, _ = char_lstm(char_vectors.transpose(0, 1).contiguous())
        # Output at the last character of every word: (num_words, char_hidden_dim).
        last_char_states = char_outputs[-1].reshape(num_words, -1)

        combined = torch.cat((word_vectors, last_char_states), 1)
        # The word LSTM sees the sentence as a single sequence (batch of 1).
        hidden, _ = word_lstm(combined.view(num_words, 1, -1))
        return hidden.view(num_words, -1)

    def forward(self, context, context_chars, query, query_chars):
        """
        Encode the context and the query.

        Args:
            context: index-mapped context words; expected to be a 1-D
                LongTensor of length T.
            context_chars: index-mapped characters of the context words,
                flattened so that every word contributes the same number
                of characters (length is a multiple of T).
            query: index-mapped query words; expected to be a 1-D
                LongTensor of length J.
            query_chars: index-mapped characters of the query words, laid
                out like `context_chars`.

        Returns:
            (lstm_context_vectors, lstm_query_vectors):
              `lstm_context_vectors` : Txd or Tx(d*2) if bidirectional
              `lstm_query_vectors`   : Jxd or Jx(d*2) if bidirectional
            where d is the word-level LSTM hidden size.
        """
        lstm_context_vectors = self._encode(
            context, context_chars,
            self.context_embeddingLayer, self.context_charEmbeddingLayer,
            self.context_char_LSTM, self.context_LSTMLayer)

        # The query is encoded with its own LSTMs (previously the context
        # LSTMs were reused here, leaving the query LSTMs untrained).
        lstm_query_vectors = self._encode(
            query, query_chars,
            self.query_embeddingLayer, self.query_charEmbeddingLayer,
            self.query_char_LSTM, self.query_LSTMLayer)

        return lstm_context_vectors, lstm_query_vectors
        

class AttentionFlow(nn.Module):
    """
    Attention flow layer: builds query-aware context vectors from the
    encoded context and query.
    """

    def __init__(self, w_dim):
        """
        w_dim : is the same as 6d in the paper. Should be 6*hidden_dim if bidirectionality is True
        """
        super(AttentionFlow, self).__init__()
        # Linear layer used to compute the similarity matrix
        # (no bias according to the paper).
        self.wt = nn.Linear(w_dim, 1, bias=False)
        self.softmax_function = nn.Softmax(dim=0)

    def forward(self, context, query):
        """
        Compute the query-aware context representation G.

        Notation:
            T : number of tokens in context
            J : number of tokens in query
            D : feature size of `context` and `query` rows
                (hidden_dim, or hidden_dim*2 with a bidirectional encoder)

        Args:
            context: encoder output for the context, of size T x D.
            query: encoder output for the query, of size J x D.

        Returns:
            G of size T x (4*D): for every context token t the
            concatenation [h_t ; u~_t ; h_t * u~_t ; h_t * h~] where
            u~_t is the context2query vector and h~ the query2context
            vector.
        """
        num_context, num_query = len(context), len(query)

        # Similarity matrix S (T x J): S[t, j] = w^T [h_t ; u_j ; h_t * u_j].
        h = context.unsqueeze(1).expand(num_context, num_query, -1)
        u = query.unsqueeze(0).expand(num_context, num_query, -1)
        S = self.wt(torch.cat((h, u, h * u), dim=2)).squeeze(2)

        # context2query (T x D): for each context token, the attention
        # (softmax over the query tokens) weighted sum of query vectors.
        context2query = torch.matmul(torch.softmax(S, dim=1), query)

        # query2context: attention over context tokens derived from the
        # *values* of the row-wise maxima of S (not their indices), then
        # the weighted sum of context vectors, shared by all T positions.
        b = self.softmax_function(torch.max(S, dim=1)[0])
        h_tilda = torch.matmul(b, context)
        query2context = h_tilda.unsqueeze(0).expand(num_context, -1)

        G = torch.cat(
            (context,
             context2query,
             context * context2query,
             context * query2context),
            dim=1)

        return G


class ModelingLayer(nn.Module):
    """LSTM stack applied to the query-aware context vectors."""

    def __init__(self, input_dim, output_dim, num_layers=2, dropout=0.2, 
        bidirectional=True):
        super().__init__()
        # Single (multi-layer, optionally bidirectional) LSTM over G.
        self.lstmLayer = nn.LSTM(
            input_size=input_dim,
            hidden_size=output_dim,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional,
        )

    def forward(self, G):
        """
        Run the modeling LSTM over G.

        G : query aware context word embeddings, one row per context word.
        Returns a tensor with one row per row of G; the row width is the
        LSTM output size (output_dim*2 when bidirectional).
        """
        seq_len = len(G)
        # Treat the rows of G as one sequence with a batch size of 1.
        lstm_out, _ = self.lstmLayer(G.view(seq_len, 1, -1))
        return lstm_out.view(seq_len, -1)


class OutputLayer(nn.Module):
    """
    Predicts start/end position distributions over the context tokens.

    Components:
      * a bias-free linear layer scoring the start index,
      * an LSTM followed by a bias-free linear layer scoring the end index.
    """

    def __init__(self, fc_dim, LSTM_input_size, LSTM_output_size, num_layers=1, bidirectional=True):
        super().__init__()
        # `fc_dim` is given for the bidirectional case; halve it otherwise.
        if not bidirectional:
            fc_dim = int(fc_dim/2)
        self.start_linear = nn.Linear(fc_dim, 1, bias=False)
        self.end_lstmLayer = nn.LSTM(
            LSTM_input_size,
            LSTM_output_size,
            num_layers,
            bidirectional=bidirectional,
        )
        self.end_linear = nn.Linear(fc_dim, 1, bias=False)
        self.start_softmax = nn.Softmax(dim=0)
        self.end_softmax = nn.Softmax(dim=0)

    def forward(self, G, M):
        """
        G : query aware context word embeddings
        M : output of modeling layer
        returns :: `start` and `end` of size (T,), each a softmax over
                   the T context positions
        """
        num_tokens = len(M)

        # Start scores come straight from [G ; M].
        start_scores = self.start_linear(torch.cat((G, M), 1))

        # End scores come from [G ; M2] where M2 is M passed through an
        # extra LSTM (sequence of length T, batch size 1).
        lstm_out, _ = self.end_lstmLayer(M.view(num_tokens, 1, -1))
        M2 = lstm_out.view(num_tokens, -1)
        end_scores = self.end_linear(torch.cat((G, M2), 1))

        start = self.start_softmax(start_scores).view(-1)
        end = self.end_softmax(end_scores).view(-1)
        return start, end

 
class BiDAF(nn.Module):
    """
    Full model: wires together LSTMEncoder, AttentionFlow, ModelingLayer
    and OutputLayer.
    """

    def __init__(self, embedding_dim, hidden_dim, char_embedding_dim,
                 char_hidden_dim, char_size, vocab_size, bidirectional=True, phrase_LSTM_layers=1, modeling_LSTM_layers=2, dropout=0):
        super().__init__()
        # Every bidirectional LSTM doubles the width of its output.
        num_directions = 2 if bidirectional else 1
        encoded_dim = hidden_dim * num_directions

        self.LSTMEncoder = LSTMEncoder(
            embedding_dim, hidden_dim, char_embedding_dim,
            char_hidden_dim, char_size, vocab_size, phrase_LSTM_layers,
            bidirectional=bidirectional, dropout=dropout,
        )
        # Similarity input is [h ; u ; h*u] -> 3 * encoded_dim.
        self.AttentionFlow = AttentionFlow(encoded_dim * 3)
        # G concatenates four encoded_dim-wide pieces.
        self.ModelingLayer = ModelingLayer(
            encoded_dim * 4, hidden_dim, modeling_LSTM_layers, dropout,
            bidirectional,
        )
        self.Output = OutputLayer(
            hidden_dim * 10, encoded_dim, hidden_dim,
            bidirectional=bidirectional,
        )

    def forward(self, context, context_chars, query, query_chars):
        """
        Given index-mapped word sequences and their index-mapped character
        sequences for the context and the query, run the full pipeline.

        returns:: `start` and `end` of size (T,)
          where, T is the number of words/tokens in context
        """
        encoded_context, encoded_query = self.LSTMEncoder(
            context, context_chars, query, query_chars)
        query_aware = self.AttentionFlow(encoded_context, encoded_query)
        modeled = self.ModelingLayer(query_aware)
        return self.Output(query_aware, modeled)


##### From the official SQuAD evaluation script, version 2.0 #####

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace.

    The steps are applied in this order: lower-case, strip every character
    in `string.punctuation`, replace the whole words "a", "an" and "the"
    with a space, then collapse runs of whitespace and trim the ends.

    Args:
        s: the answer text. A value without a `.lower()` method is passed
            through the lower-casing step unchanged.

    Returns:
        The normalized string.
    """
    def remove_articles(text):
        # `\b...\b` matches whole words only, so "theatre" is untouched.
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        # Always yields a str, so the later steps never see another type.
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        try:
            return text.lower()
        except AttributeError:
            # Best effort: leave values without `.lower()` as they are.
            return text

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized answer.

    A falsy `s` (empty string, None, ...) yields an empty list.
    """
    return normalize_answer(s).split() if s else []

def compute_exact(a_gold, a_pred):
    """Return 1 if both answers are equal after normalization, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return int(gold_norm == pred_norm)

def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Both answers are tokenized with `get_tokens`. If either token list is
    empty the result is 1 when both are equal and 0 otherwise; if no token
    is shared the result is 0; otherwise the harmonic mean of token
    precision and recall is returned.
    """
    reference = get_tokens(a_gold)
    candidate = get_tokens(a_pred)

    if not reference or not candidate:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(reference == candidate)

    # Multiset intersection counts each shared token at most as often as
    # it occurs in both lists.
    shared = collections.Counter(reference) & collections.Counter(candidate)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(candidate)
    recall = 1.0 * overlap / len(reference)
    return (2 * precision * recall) / (precision + recall)


# Train and Eval

In [4]:
# TODO: Use above preprocessing, BiDAF model, and official squad eval method to train and eval model