In [1]:
# Standard library
import json
import os
import random
from collections import OrderedDict, Counter
from itertools import zip_longest
from time import time

# Type hint
from typing import Optional, Callable, Tuple, Dict, List, Union

# Numeric and data manipulation tools
import numpy as np
import pandas as pd

# Deep learning framework
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# automatic mixed precision training:
from torch.cuda.amp import autocast
from torch.cuda.amp import GradScaler

# Natural language tools
import gensim
import gensim.downloader as gloader
import nltk
from nltk.tokenize import TreebankWordTokenizer

# Other tools
from tqdm.notebook import tqdm

# Download the NLTK 'punkt' and 'stopwords' packages
# NOTE(review): the visible cells only use TreebankWordTokenizer; whether these
# two packages are needed elsewhere cannot be told from here - confirm
nltk.download('punkt')
nltk.download('stopwords')

# Use GPU acceleration if possible
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Using this device:', DEVICE)
   
# to avoid memory problems:
# NOTE(review): this disables cuDNN for the whole session, not just one model;
# presumably trades GPU speed for lower memory use - confirm it is still required
torch.backends.cudnn.enabled = False

Using this device: cuda:0


[nltk_data] Downloading package punkt to /home/nihil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/nihil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Define special tokens
# NOTE(review): presumably the padding and unknown-word markers expected by the
# vocabulary / embedding code; neither constant is referenced in the visible
# cells - confirm where they are consumed
PAD = '<PAD>'
UNK = '<UNK>'

In [3]:
def mean(l: List[float]) -> float:
    """Return the arithmetic mean of the numbers in ``l``.

    Raises ``ZeroDivisionError`` when ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

def to_tuple_of_lists(list_of_tuples: List[Tuple]) -> Tuple[List, ...]:
    """Transpose a list of tuples into a tuple of lists.

    ``[(a1, b1), (a2, b2)]`` becomes ``([a1, a2], [b1, b2])``; an empty input
    yields an empty tuple.
    """
    return tuple(list(column) for column in zip(*list_of_tuples))

def to_list_of_tuples(tuple_of_lists: Tuple[List, ...]) -> List[Tuple]:
    """Transpose a tuple of lists into a list of tuples.

    ``([a1, a2], [b1, b2])`` becomes ``[(a1, b1), (a2, b2)]``; the result is as
    long as the shortest input list.
    """
    return [row for row in zip(*tuple_of_lists)]

def batch_iteration(data: List[Tuple], batch_size: int) -> zip_longest:
    """Iterate over ``data`` in consecutive tuples of ``batch_size`` samples.

    Args:
        data: the samples to split into batches.
        batch_size: number of samples per batch; must be at least 1.

    Returns:
        An iterator of ``batch_size``-long tuples. When ``len(data)`` is not a
        multiple of ``batch_size`` the last tuple is padded with the filler
        sample ``([], [], [])``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (the iterator would
            otherwise be silently empty).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    # Repeating the SAME iterator batch_size times makes zip_longest pull
    # consecutive items into each output tuple.
    # NOTE: every padding slot is the same ([], [], []) object - do not mutate it.
    shared_iterator = iter(data)
    return zip_longest(*[shared_iterator] * batch_size, fillvalue=([], [], []))

In [4]:
"""
json structure:

data []
|---title
|---paragraphs []
|   |---context
|   |---qas []
|   |   |---question
|   |   |---id
version

"""

filename = 'training_set.json'

# Read the WHOLE file: the JSON document is not guaranteed to sit on a single
# line, and an explicit encoding keeps the result platform-independent.
with open(filename, 'r', encoding='utf-8') as f:
    raw_data = f.read()

parsed_data = json.loads(raw_data)['data']

# Every context string, in file order; 'context_index' values point into this list
context_list = []
# After the loops these hold the index of the last article / last context (-1 if none)
context_index = -1
paragraph_index = -1

# One entry per question, column-wise
dataset = {'paragraph_index': [], 'context_index': [], 'question': [], 'id': []}

# NOTE: 'paragraph_index' counts the top-level entries of data[] (one per title),
# while 'context_index' counts the individual paragraph contexts.
for paragraph_index, article in enumerate(parsed_data):
    for paragraph in article['paragraphs']:
        context_list.append(paragraph['context'])
        context_index = len(context_list) - 1

        for qa in paragraph['qas']:
            dataset['paragraph_index'].append(paragraph_index)
            dataset['context_index'].append(context_index)
            dataset['question'].append(qa['question'])
            dataset['id'].append(qa['id'])

df = pd.DataFrame.from_dict(dataset)
id_list = df['id'].tolist()

In [5]:
# Embeddings
# Dimension of the word vectors; it also selects which pre-trained model is requested
emb_dim = 50
# Load 'glove-wiki-gigaword-<emb_dim>' through gensim's downloader API
# NOTE(review): glove_model is not referenced by the other visible cells (the
# embedders are restored from pickle files) - confirm this load is still needed
glove_model = gloader.load('glove-wiki-gigaword-' + str(emb_dim))
print('\nDownload completed.')


Download completed.


In [19]:
def tokenize_corpus(df: pd.DataFrame,
                    context_list: List[str]) -> Tuple[List[List[str]], List[List[str]], List[List[Tuple[int, int]]]]:
    """Tokenize the context and the question of every row of ``df``.

    Args:
        df: frame with a 'context_index' column (positions into ``context_list``)
            and a 'question' column.
        context_list: the context strings referenced by 'context_index'.

    Returns:
        A tuple ``(ctx, qry, spans_list)`` of three lists aligned with the rows
        of ``df``: the context tokens, the question tokens, and the
        ``(start, end)`` character offsets of each context token as produced by
        ``TreebankWordTokenizer.span_tokenize``.
    """
    twt = TreebankWordTokenizer()

    context_indices = df['context_index'].tolist()

    # Several questions share one context: tokenize each distinct context once
    # instead of once per question.
    tokens_by_context: Dict[int, List[str]] = {}
    spans_by_context: Dict[int, List[Tuple[int, int]]] = {}
    for context_idx in set(context_indices):
        text = context_list[context_idx]
        tokens_by_context[context_idx] = twt.tokenize(text)
        # Get spans of tokens, to revert the tokenization
        spans_by_context[context_idx] = list(twt.span_tokenize(text))

    # Hand out a fresh list per row so rows never alias each other
    ctx = [list(tokens_by_context[context_idx]) for context_idx in context_indices]
    spans_list = [list(spans_by_context[context_idx]) for context_idx in context_indices]
    qry = [twt.tokenize(question) for question in df['question'].tolist()]

    return ctx, qry, spans_list

# Tokenize corpus
# The three results are lists aligned row-by-row with df
context_tokenized, query_tokenized, spans_list = tokenize_corpus(df, context_list)
# All tokenized contexts followed by all tokenized questions
# NOTE(review): corpus is not referenced by the other visible cells - confirm it is still needed
corpus = context_tokenized + query_tokenized

In [7]:
from model.bidaf import BiDAF
from model.char_embedder import CharEmbedder
from model.word_embedder import WordEmbedder
from model.tensor_maker import TensorMaker

import pickle

def _load_pickle(filename: str):
    """Unpickle and return the object stored in ``best_model/<filename>``."""
    # NOTE(security): pickle.load can execute arbitrary code - only use it on
    # files produced by this project.
    with open(os.path.join('best_model', filename), 'rb') as f:
        return pickle.load(f)


# Import from pickle files
char_embedder = _load_pickle('char_emb.pickle')
train_word_embedder = _load_pickle('train_word_embedder.pickle')
val_word_embedder = _load_pickle('val_word_embedder.pickle')

# Create model
model_bidaf = BiDAF(char_embedder, train_word_embedder, val_word_embedder, use_constraint=True, use_dropout=False).to(DEVICE)
# Load the model state; map_location lets a checkpoint saved on GPU be restored
# when DEVICE falls back to the CPU
model_bidaf.load_state_dict(torch.load(os.path.join('best_model', 'bidaf_test.pt'), map_location=DEVICE))
# Switch to inference mode (last expression, so the model summary is displayed)
model_bidaf.eval()

BiDAF(
  (char_embedder): CharEmbedder(
    (embedding): Embedding(101, 100)
    (conv_layer): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1), padding=(1, 0), bias=False)
    (fc1): Linear(in_features=100, out_features=64, bias=False)
    (fc2): Linear(in_features=64, out_features=50, bias=False)
  )
  (train_word_embedder): WordEmbedder(
    (embedding): Embedding(114839, 50)
  )
  (eval_word_embedder): WordEmbedder(
    (embedding): Embedding(131821, 50)
  )
  (highway_net): ConvolutionalHighwayNetwork(
    (conv1): Conv2d(1, 1, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (gate1): Conv2d(1, 1, kernel_size=(5, 100), stride=(1, 1), padding=(2, 0))
    (conv2): Conv2d(1, 1, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (gate2): Conv2d(1, 1, kernel_size=(5, 100), stride=(1, 1), padding=(2, 0))
  )
  (ctx_rnn): GRU(100, 100, batch_first=True, bidirectional=True)
  (w_s): Linear(in_features=600, out_features=1, bias=False)
  (mod_rnn): GRU(800, 100, num_layers=2, ba

In [8]:
from model.tensor_maker import TensorMaker

# Restore the object stored in best_model/tensor_maker.pickle (presumably a
# TensorMaker instance - confirm); generate_evaluation_json reads it as a
# module-level global.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files
tensor_maker = None
with open(os.path.join('best_model', 'tensor_maker.pickle'), 'rb') as f:
    tensor_maker = pickle.load(f)

In [20]:
# Retrieve original contexts
contexts = df['context_index'].apply(lambda x: context_list[x])

# Each question needs its raw context, its context tokens and its query tokens
assert len(contexts) == len(context_tokenized) == len(query_tokenized), \
    'contexts and tokenized data are out of sync'

# Pair the values positionally: contexts[i] would be a label-based lookup, which
# only matches the row position while df keeps its default integer index
evaluation_data = list(zip(contexts.tolist(), context_tokenized, query_tokenized))

In [23]:
def _extract_answer(context: str,
                    spans: List[Tuple[int, int]],
                    start_word_idx: int,
                    end_word_idx: int) -> str:
    """Return the substring of ``context`` covered by the predicted token range."""
    if not spans:
        return ''
    # Clamp in case a prediction lands past the last real token
    # NOTE(review): presumably only possible on padded positions - confirm the model masks them
    last_token = len(spans) - 1
    start_char_idx = spans[min(start_word_idx, last_token)][0]
    end_char_idx = spans[min(end_word_idx, last_token)][1]
    # Span ends are exclusive offsets, so no "+1" is needed here
    return context[start_char_idx:end_char_idx]


def generate_evaluation_json(model,
                             evaluation_data: List[Tuple[str, List[str], List[str]]],
                             spans_list: List[List[Tuple[int, int]]],
                             id_list: List[str],
                             filename: str,
                             batch_size: int = 4,
                             maker=None) -> None:
    """Predict an answer span for every sample and write ``{id: answer}`` as JSON.

    Args:
        model: callable taking (context words, context chars, query words,
            query chars) tensors and returning the start and end scores, one
            row per sample.
        evaluation_data: one ``(context, context_tokens, query_tokens)`` tuple
            per question.
        spans_list: for each sample, the ``(start, end)`` character offsets of
            its context tokens (``end`` exclusive).
        id_list: question id of each sample.
        filename: path of the JSON file to write.
        batch_size: number of samples sent to the model at once.
        maker: object exposing ``get_tensor(batch)``; defaults to the
            notebook-level ``tensor_maker``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1, or if ``spans_list`` or
            ``id_list`` is shorter than ``evaluation_data``.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if len(spans_list) < len(evaluation_data) or len(id_list) < len(evaluation_data):
        raise ValueError('spans_list and id_list must cover every evaluation sample')
    if maker is None:
        maker = tensor_maker

    predictions = {}

    with torch.no_grad():
        # Slice instead of padding the last batch: every row fed to the model
        # is a real sample, so row j always maps to sample batch_start + j.
        for batch_start in range(0, len(evaluation_data), batch_size):
            batch = evaluation_data[batch_start:batch_start + batch_size]

            # Extract samples
            batch_context = [sample[0] for sample in batch]
            batch_context_tokenized = tuple(sample[1] for sample in batch)
            batch_query_tokenized = tuple(sample[2] for sample in batch)

            context_word_tensor, context_char_tensor, _ = maker.get_tensor(batch_context_tokenized)
            query_word_tensor, query_char_tensor, _ = maker.get_tensor(batch_query_tokenized)

            # Make prediction
            p_soft_start, p_soft_end = model(context_word_tensor, context_char_tensor,
                                             query_word_tensor, query_char_tensor)

            # Argmax (one transfer per batch instead of one .item() per sample)
            p_start = torch.argmax(p_soft_start, dim=1).tolist()
            p_end = torch.argmax(p_soft_end, dim=1).tolist()

            for offset, (context, start_word_idx, end_word_idx) in enumerate(zip(batch_context, p_start, p_end)):
                sample_idx = batch_start + offset
                predictions[id_list[sample_idx]] = _extract_answer(
                    context, spans_list[sample_idx], start_word_idx, end_word_idx)

    with open(filename, "w") as f:
        json.dump(predictions, f)

In [24]:
# Run the loaded model on every (context, question) pair and write the
# {question id: answer} mapping as JSON to predictions.txt
generate_evaluation_json(model_bidaf, evaluation_data, spans_list, id_list, "predictions.txt")

KeyboardInterrupt: 