In [1]:
# Library to read json
import json

# Numeric and data manipulation tools
import pandas as pd
import numpy as np
import random

# Deep learning framework
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Natural language tools
import nltk
from nltk.tokenize import TreebankWordTokenizer
import gensim
import gensim.downloader as gloader

# Other tools
from tqdm.notebook import tqdm
from collections import OrderedDict, Counter
from time import time
from itertools import zip_longest

# automatic mixed precision training:
from torch.cuda.amp import autocast 
from torch.cuda.amp import GradScaler

# Type hint
from typing import Optional, Callable, Tuple, Dict, List, Union

nltk.download('punkt')
nltk.download('stopwords')

# from sklearn.model_selection import train_test_split

# Use GPU acceleration if possible
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Using this device:', DEVICE)
   
# to avoid memory problems:
torch.backends.cudnn.enabled = False

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Using this device: cuda:0


In [2]:
# Define special tokens
PAD = '<PAD>'
UNK = '<UNK>'

In [3]:
# Lambda for computing the mean of a list
mean: Callable[[List[float]], float] = lambda l: sum(l) / len(l)

# Lambda for transforming a list of tuples into a tuple of lists
to_tuple_of_lists: Callable[[List[Tuple]], Tuple[List]] = lambda list_of_tuples: tuple(map(list, zip(*list_of_tuples)))

# Lambda for transforming a tuple of lists into a list of tuples
to_list_of_tuples: Callable[[Tuple[List]], List[Tuple]] = lambda tuple_of_lists: list(zip(*tuple_of_lists))

# Lambda for iterating with batches (if the length of the sequences does not match with the batch size, tuples of empty lists are appended)
batch_iteration: Callable[[List[Tuple]], zip] = lambda data, batch_size: zip_longest(*[iter(data)] * batch_size, fillvalue=([], [], []))

In [4]:
"""
json structure:

data []
|---title
|---paragraphs []
|   |---context
|   |---qas []
|   |   |---question
|   |   |---id
version

"""

filename = 'training_set.json'

with open(filename, 'r') as f:
    raw_data = f.readlines()[0]

parsed_data = json.loads(raw_data)['data']

context_list = []
context_index = -1
paragraph_index = -1

dataset = {'paragraph_index': [], 'context_index': [], 'question': [], 'id': []}

for i in range(len(parsed_data)):
    paragraph_index += 1
    for j in range(len(parsed_data[i]['paragraphs'])):
        context_list.append(parsed_data[i]['paragraphs'][j]['context'])
        context_index += 1

        for k in range(len(parsed_data[i]['paragraphs'][j]['qas'])):
            question = parsed_data[i]['paragraphs'][j]['qas'][k]['question']
            id = parsed_data[i]['paragraphs'][j]['qas'][k]['id']

            dataset['paragraph_index'].append(paragraph_index)
            dataset['context_index'].append(context_index)
            dataset['question'].append(question)
            dataset['id'].append(id)

df = pd.DataFrame.from_dict(dataset)
id_list = df['id'].tolist()

In [8]:
# Embeddings
emb_dim = 50
glove_model = gloader.load('glove-wiki-gigaword-' + str(emb_dim))
print('\nDownload completed.')


Download completed.


In [5]:
# Solo per Giacomo e Lorenzo
# Fixing "ValueError: substring not found"
# https://github.com/nltk/nltk/issues/1750
to_replace = {'"': ' ', '\'': ' ', '`': ' '}

def replace_all(text):
    for i, j in to_replace.items():
        text = text.replace(i, j)

    return text

context_list = [replace_all(context) for context in context_list]
df['question'] = df['question'].apply(lambda x: replace_all(x))

In [6]:
# DEBUG ONLY
if True:
    df = df[:100]
    id_list = df['id'].tolist()

In [23]:
def tokenize_corpus(df: pd.DataFrame, context_list: List[str]):
    twt = TreebankWordTokenizer()
    
    # Retrieve contexts
    contexts = df['context_index'].apply(lambda x: context_list[x])
    # Tokenize both contexts and queries
    ctx = contexts.apply(lambda x: twt.tokenize(x)).tolist()
    qry = df['question'].apply(lambda x: twt.tokenize(x)).tolist()
    
    # Get spans of tokens, to revert the tokenization
    spans_list = contexts.apply(lambda x: twt.span_tokenize(x)).tolist()
    
    return ctx, qry, spans_list

def build_vocabulary(corpus: List[List[str]],
                     old_word_listing: Optional[List[str]] = None) -> (Dict[int, str], Dict[int, str], List[str]):
    flat_tokens = [x for sub in corpus for x in sub]
    
    if old_word_listing is None:  # standard case
        word_listing = [PAD] + list(OrderedDict.fromkeys(flat_tokens))
    else:  # case in which we extend an already existing vocabulary
        word_listing = list(OrderedDict.fromkeys(old_word_listing + flat_tokens))
        
    idx_to_word = {i: w for i, w in enumerate(word_listing)}
    word_to_idx = {w: i for i, w in enumerate(word_listing)}

    return idx_to_word, word_to_idx, word_listing

# Tokenize corpus
context_tokenized, query_tokenized, spans_list = tokenize_corpus(df, context_list)
corpus = context_tokenized + query_tokenized

# Get word and char mappings
i2w, w2i, wl = build_vocabulary(corpus)

# OOV words
oov_words = [word for word in wl if word not in glove_model.vocab and word != PAD]

In [10]:
def build_char_embedding_matrix(corpus: List[str],
                                enc_dim: Optional[int] = 100):
    # Flatten to obtain single characters
    flat_chars = [c for sent in corpus for word in sent for c in word]
    
    # Sort characters by occurrences
    unique_chars = Counter(flat_chars)
    char_listing = sorted(unique_chars, key=unique_chars.get, reverse=True)
    # Select only the enc_dim most frequent ones
    if len(char_listing) > enc_dim - 1:
        char_listing = char_listing[:enc_dim - 1]
    char_listing = [PAD] + char_listing + [UNK]  # add PAD and UNK tokens
    
    idx_to_char = {i: c for i, c in enumerate(char_listing)}
    char_to_idx = {c: i for i, c in enumerate(char_listing)}
    
    # Create one-hot vectors, reserving the last one for UNK (0...0, 1)
    one_hot_chars = np.zeros((len(char_listing) - 1, enc_dim))
    np.fill_diagonal(one_hot_chars, 1)
    one_hot_chars = np.vstack([np.zeros((1, enc_dim)), one_hot_chars])  # stack zero vector on top for PAD
    
    return idx_to_char, char_to_idx, char_listing, one_hot_chars

# Build char embedding matrix based only on the training set, and use it for validation and test too:
# in fact, we can assume that characters appear uniformly in the three splits;
# for those rare case in which this does not happen, we assign the UNK vector
i2c, c2i, cl, char_emb_mtx = build_char_embedding_matrix(corpus)

In [None]:
# Load the model
model = torch.load("bidaf_test.pt")
model.eval()

In [12]:
from model.tensor_maker import TensorMaker

tensor_maker = TensorMaker(w2i, c2i, DEVICE)

In [32]:
# Retrieve original contexts
contexts = df['context_index'].apply(lambda x: context_list[x])

evaluation_data = []
for i in range(len(context_tokenized)):
    evaluation_data.append((contexts[i], context_tokenized[i], query_tokenized[i]))

In [45]:
def generate_evaluation_json(model,
                             evaluation_data: List[Tuple[str, List[str], List[str]]],
                             spans_list: List[List[Tuple[int, int]]],
                             id_list: List[str],
                             filename: str):
    predictions = {}

    with torch.no_grad():
        batch_size = 4

        # Create batch iterator
        batch_iter = batch_iteration(evaluation_data, batch_size)

        for i, batch in enumerate(batch_iter):
            # Extract samples
            batch_context, batch_context_tokenized, batch_query_tokenized = to_tuple_of_lists(batch)

            # Filter valid samples in batches (in case of incomplete ones)
            batch_context_tokenized: Tuple[List[str]] = tuple([c for c in batch_context_tokenized if len(c) > 0])
            batch_query_tokenized: Tuple[List[str]] = tuple([q for q in batch_query_tokenized if len(q) > 0])

            context_word_tensor, context_char_tensor, context_lengths = tensor_maker.get_tensor(batch_context_tokenized)
            query_word_tensor, query_char_tensor, query_lengths = tensor_maker.get_tensor(batch_query_tokenized)

            # Make prediction
            p_soft_start, p_soft_end = model(context_word_tensor, context_char_tensor,
                                             query_word_tensor, query_char_tensor)

            # Argmax
            p_start = torch.argmax(p_soft_start, dim=1)
            p_end = torch.argmax(p_soft_end, dim=1)

            for j in range(batch_size):
                start_word_idx = p_start[j].item()
                end_word_idx = p_end[j].item()

                span = spans_list[i * batch_size + j]
                start_char_idx = span[start_word_idx][0]
                end_char_idx = span[end_word_idx][1]

                answer = batch_context[j][start_char_idx:end_char_idx+1]

                # DEBUG: original context answer vs tokenized context answer
                # They should match (in terms of meaning/words)!
                print(answer)
                print(batch_context_tokenized[j][start_word_idx:end_word_idx])
                print()

                id = id_list[i * batch_size + j]
                predictions[id] = answer

    with open(filename, "w") as f:
        f.write(json.dumps(predictions))

In [46]:
generate_evaluation_json(model, evaluation_data, spans_list, id_list, "predictions.txt")

Main Building s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend  Venite Ad Me Omnes 
['Main', 'Building', 's', 'gold', 'dome', 'is', 'a', 'golden', 'statue', 'of', 'the', 'Virgin', 'Mary.', 'Immediately', 'in', 'front', 'of', 'the', 'Main', 'Building', 'and', 'facing', 'it', ',', 'is', 'a', 'copper', 'statue', 'of', 'Christ', 'with', 'arms', 'upraised', 'with', 'the', 'legend', 'Venite', 'Ad', 'Me']

Main Building s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend  Venite Ad Me Omnes 
['Main', 'Building', 's', 'gold', 'dome', 'is', 'a', 'golden', 'statue', 'of', 'the', 'Virgin', 'Mary.', 'Immediately', 'in', 'front', 'of', 'the', 'Main', 'Building', 'and', 'facing', 'it', ',', 'is', 'a', 'copper', 'statue', 'of', 'Christ', 'with', 'arms', 'upraise