# qa-nlp
Question answering neural model based on the SQuAD dataset.

Authors:
- Lorenzo Mario Amorosa
- Andrea Espis
- Mattia Orlandi
- Giacomo Pinardi

## 0. Environment setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Library to read json
import json

# Numeric and data manipulation tools
import pandas as pd
import numpy as np
import random

# Deep learning framework
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Natural language tools
import nltk
from nltk.tokenize import TreebankWordTokenizer
import gensim
import gensim.downloader as gloader

# Other tools
from tqdm.notebook import tqdm
from collections import OrderedDict, Counter
from time import time
from itertools import zip_longest

# automatic mixed precision training:
from torch.cuda.amp import autocast 
from torch.cuda.amp import GradScaler

# Type hint
from typing import Optional, Callable, Tuple, Dict, List, Union

nltk.download('punkt')
nltk.download('stopwords')

# from sklearn.model_selection import train_test_split

# Use GPU acceleration if possible
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Using this device:', DEVICE)

if not(torch.cuda.is_available()):
    raise Exception('Switch to runtime GPU, otherwise the code won\'t work properly')
   
# to avoid memory problems:
torch.backends.cudnn.enabled = False

[nltk_data] Downloading package punkt to /home/nihil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/nihil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using this device: cuda:0


In [3]:
# Set seed for reproducibility
def fix_random(seed: int):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

fix_random(42)

In [4]:
# Use GPU acceleration if possible
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("using this device:", DEVICE)

# Define special tokens
PAD = '<PAD>'
UNK = '<UNK>'

using this device: cuda:0


In [5]:
# Lambda for computing the mean of a list
mean: Callable[[List[float]], float] = lambda l: sum(l) / len(l)

# Lambda for transforming a list of tuples into a tuple of lists
to_tuple_of_lists: Callable[[List[Tuple]], Tuple[List]] = lambda list_of_tuples: tuple(map(list, zip(*list_of_tuples)))

# Lambda for transforming a tuple of lists into a list of tuples
to_list_of_tuples: Callable[[Tuple[List]], List[Tuple]] = lambda tuple_of_lists: list(zip(*tuple_of_lists))

# Lambda for iterating with batches (if the length of the sequences does not match with the batch size, tuples of empty lists are appended)
batch_iteration: Callable[[List[Tuple]], zip] = lambda data, batch_size: zip_longest(*[iter(data)] * batch_size, fillvalue=([], [], []))

## 1. Dataset preparation

In [6]:
"""
json structure:

data []
|---title
|---paragraphs []
|   |---context
|   |---qas []
|   |   |---answers []
|   |   |   |---answer_start
|   |   |   |---text
|   |   |---question
|   |   |---id
version

"""

filename = 'training_set.json'

with open(filename, 'r') as f:
    raw_data = f.readlines()[0]

parsed_data = json.loads(raw_data)['data']

context_list = []
context_index = -1
paragraph_index = -1

dataset = {'paragraph_index': [], 'context_index': [], 'question': [], 'id': [], 'answer_start': [], 'answer_end': [], 'answer_text': []}

for i in range(len(parsed_data)):
    paragraph_index += 1
    for j in range(len(parsed_data[i]['paragraphs'])):
        context_list.append(parsed_data[i]['paragraphs'][j]['context'])
        context_index += 1

        for k in range(len(parsed_data[i]['paragraphs'][j]['qas'])):
            question = parsed_data[i]['paragraphs'][j]['qas'][k]['question']
            id = parsed_data[i]['paragraphs'][j]['qas'][k]['id']

            for l in range(len(parsed_data[i]['paragraphs'][j]['qas'][k]['answers'])): 
                answer_start = parsed_data[i]['paragraphs'][j]['qas'][k]['answers'][l]['answer_start']
                answer_text = parsed_data[i]['paragraphs'][j]['qas'][k]['answers'][l]['text']

                answer_end = answer_start + len(answer_text)

                dataset['paragraph_index'].append(paragraph_index)
                dataset['context_index'].append(context_index)
                dataset['question'].append(question)
                dataset['id'].append(id)
                dataset['answer_start'].append(answer_start)
                dataset['answer_end'].append(answer_end)
                dataset['answer_text'].append(answer_text)

df = pd.DataFrame.from_dict(dataset)

df.head()

Unnamed: 0,paragraph_index,context_index,question,id,answer_start,answer_end,answer_text
0,0,0,To whom did the Virgin Mary allegedly appear i...,5733be284776f41900661182,515,541,Saint Bernadette Soubirous
1,0,0,What is in front of the Notre Dame Main Building?,5733be284776f4190066117f,188,213,a copper statue of Christ
2,0,0,The Basilica of the Sacred heart at Notre Dame...,5733be284776f41900661180,279,296,the Main Building
3,0,0,What is the Grotto at Notre Dame?,5733be284776f41900661181,381,420,a Marian place of prayer and reflection
4,0,0,What sits on top of the Main Building at Notre...,5733be284776f4190066117e,92,126,a golden statue of the Virgin Mary


In [7]:
# Some examples of contexts and questions:
for i in range(0, 4000, 100):
    # print('Title:   ', title_list[df['title_index'][i]])
    print('Context: ', context_list[df['context_index'][i]])
    print('Question:', df['question'][i], "\n")

fur may be no less serious and heinous than genocide."
Question: What has been widely debated as a possible act of genocide in Sudan? 

Context:  The majority of studies indicate antibiotics do interfere with contraceptive pills, such as clinical studies that suggest the failure rate of contraceptive pills caused by antibiotics is very low (about 1%). In cases where antibacterials have been suggested to affect the efficiency of birth control pills, such as for the broad-spectrum antibacterial rifampicin, these cases may be due to an increase in the activities of hepatic liver enzymes' causing increased breakdown of the pill's active ingredients. Effects on the intestinal flora, which might result in reduced absorption of estrogens in the colon, have also been suggested, but such suggestions have been inconclusive and controversial. Clinicians have recommended that extra contraceptive measures be applied during therapies using antibacterials that are suspected to interact with oral cont

In [8]:
# Define split ratios
test_ratio = 0.2
val_ratio = 0.2

# Build array of paragraphs indexes and shuffle them
paragraph_indexes = df['paragraph_index'].unique()
np.random.shuffle(paragraph_indexes)
n_samples = len(paragraph_indexes)

# Reserve indexes for test set
test_size = int(test_ratio * n_samples)
train_val_size = n_samples - test_size
test_indexes = paragraph_indexes[-test_size:]
# Reserve indexes for validation set
val_size = int(val_ratio * train_val_size)
train_size = train_val_size - val_size
val_indexes = paragraph_indexes[-(test_size + val_size):-test_size]
# Reserve indexes for training set
train_indexes = paragraph_indexes[:train_size]

assert train_size == len(train_indexes), 'Something went wrong with train set slicing'
assert val_size == len(val_indexes), 'Something went wrong with val set slicing'
assert test_size == len(test_indexes), 'Something went wrong with test set slicing'

print('Number of train paragraphs:', train_size)
print('Number of validation paragraphs:', val_size)
print('Number of test paragraphs:', test_size)

# Split dataframe
df_train = df[np.in1d(df['paragraph_index'], train_indexes)]
df_val = df[np.in1d(df['paragraph_index'], val_indexes)]
df_test = df[np.in1d(df['paragraph_index'], test_indexes)]

print('\nNumber of train samples:', len(df_train))
print('Number of validation samples:', len(df_val))
print('Number of test samples:', len(df_test))

Number of train paragraphs: 284
Number of validation paragraphs: 70
Number of test paragraphs: 88

Number of train samples: 57451
Number of validation samples: 12921
Number of test samples: 17227



## 2. Embeddings

In [9]:
print('Downloading GloVe model...')
emb_dim = 50
glove_model = gloader.load('glove-wiki-gigaword-' + str(emb_dim))
print('\nDownload completed.')

Downloading GloVe model...

Download completed.


In [10]:
if True:
    df_train = pd.concat([df_train , df_val], axis=0) 
    df_val = df_test

In [11]:
def tokenize_corpus(df: pd.DataFrame, context_list: List[str]):
    twt = TreebankWordTokenizer()
    
    t_start = time()
    # Retrieve contexts
    contexts = df['context_index'].apply(lambda x: context_list[x])
    # Tokenize both contexts and queries
    x_ctx = contexts.apply(lambda x: twt.tokenize(x)).tolist()
    x_qry = df['question'].apply(lambda x: twt.tokenize(x)).tolist()
    # Get indexes to start_end characters
    y_char = [(start, end) for start, end in zip(df['answer_start'].tolist(), df['answer_end'].tolist())]
    # Get spans of tokens
    spans_list = contexts.apply(lambda x: twt.span_tokenize(x)).tolist()
    # Convert indexes s.t. the point to start/end tokens
    y = []
    for spans, (char_start, char_end) in zip(spans_list, y_char):
        token_start, token_end = None, None
        for i, span in enumerate(spans):
            if span[0] <= char_start <= span[1]:
                token_start = i
            if span[0] <= char_end <= span[1]:
                token_end = i
        y.append((token_start, token_end))
    print(f'[{time() - t_start:.3f} s]')
    
    return x_ctx, x_qry, y

def build_vocabulary(corpus: List[List[str]],
                     old_word_listing: Optional[List[str]] = None) -> (Dict[int, str], Dict[int, str], List[str]):
    flat_tokens = [x for sub in corpus for x in sub]
    
    if old_word_listing is None:  # standard case
        word_listing = [PAD] + list(OrderedDict.fromkeys(flat_tokens))
    else:  # case in which we extend an already existing vocabulary
        word_listing = list(OrderedDict.fromkeys(old_word_listing + flat_tokens))
        
    idx_to_word = {i: w for i, w in enumerate(word_listing)}
    word_to_idx = {w: i for i, w in enumerate(word_listing)}

    return idx_to_word, word_to_idx, word_listing

# Tokenize corpus
print('Tokenizing training corpus...', end=' ')
X_trainC, X_trainQ, Y_train = tokenize_corpus(df_train, context_list)
train_corpus = X_trainC + X_trainQ

print('Tokenizing validation corpus...', end=' ')
X_valC, X_valQ, Y_val = tokenize_corpus(df_val, context_list)
val_corpus = X_valC + X_valQ

print('Tokenizing test corpus...', end=' ')
X_testC, X_testQ, Y_test = tokenize_corpus(df_test, context_list)
test_corpus = X_testC + X_testQ

# Get word and char mappings for each set
train_i2w, train_w2i, train_wl = build_vocabulary(train_corpus)
val_i2w, val_w2i, val_wl = build_vocabulary(val_corpus, train_wl)
test_i2w, test_w2i, test_wl = build_vocabulary(test_corpus, val_wl)

print('-' * 50)
print('Words in training set:', len(train_wl))
print('Words in validation set:', len(val_wl))
print('Words in test set:', len(test_wl))

Tokenizing training corpus... [51.999 s]
Tokenizing validation corpus... [13.527 s]
Tokenizing test corpus... [12.769 s]
--------------------------------------------------
Words in training set: 114839
Words in validation set: 131821
Words in test set: 131821


In [12]:
x = 0

print(X_trainC[x][Y_train[x][0]:Y_train[x][1]])
print(X_trainQ[x])

['Saint', 'Bernadette']
['To', 'whom', 'did', 'the', 'Virgin', 'Mary', 'allegedly', 'appear', 'in', '1858', 'in', 'Lourdes', 'France', '?']


In [13]:
train_oov_words = [word for word in train_wl if word not in glove_model.vocab and word != PAD]
val_oov_words = [word for word in val_wl if word not in glove_model.vocab and word != PAD]
test_oov_words = [word for word in test_wl if word not in glove_model.vocab and word != PAD]

print(f'Total OOV terms in training set: {len(train_oov_words)} ({float(len(train_oov_words)) / len(train_wl) * 100:.2f}%)')
print(f'Total OOV terms in validation set: {len(val_oov_words)} ({float(len(val_oov_words)) / len(val_wl) * 100:.2f}%)')
print(f'Total OOV terms in test set: {len(test_oov_words)} ({float(len(test_oov_words)) / len(test_wl) * 100:.2f}%)')

Total OOV terms in training set: 72875 (63.46%)
Total OOV terms in validation set: 85981 (65.23%)
Total OOV terms in test set: 85981 (65.23%)


In [14]:
def build_word_embedding_matrix(embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors,
                                word_to_idx: Dict[str, int],
                                oov_words: List[str],
                                old_word_embedding_matrix: Optional[np.ndarray] = None):
    # Initialize embedding matrix with all zeros
    embedding_matrix = np.zeros((len(word_to_idx), embedding_model.vector_size))
    
    # Analyze embeddings to get mean and standard deviation
    mean_list, std_list = [], []
    for word in tqdm(word_to_idx.keys(), leave=False):
        if word not in oov_words and word != PAD:
            embed = embedding_model[word]
            # Compute mean and std
            mean_list.append(np.mean(embed))
            std_list.append(np.std(embed))

    embedding_mean = mean(mean_list)
    embedding_std = mean(std_list)

    for word, idx in tqdm(word_to_idx.items(), leave=False):
        # If word is PAD no action is performed (it will be assigned the zero vector)
        if word not in oov_words and word != PAD:
            embedding_matrix[idx] = embedding_model[word]
        elif word in oov_words:
            oov_idx = word_to_idx[word]
            if old_word_embedding_matrix is None or oov_idx >= len(old_word_embedding_matrix):
                embedding_matrix[idx] = np.random.normal(loc=embedding_mean, scale=embedding_std, size=embedding_model.vector_size)
            else:
                embedding_matrix[idx] = old_word_embedding_matrix[oov_idx]
            
    return embedding_matrix

# Build word embedding matrix based only on the training set (for training)
train_emb_mtx = build_word_embedding_matrix(glove_model, train_w2i, train_oov_words)
print('Shape of word embedding matrix (training set):', train_emb_mtx.shape)

# Build word embedding matrix based on training + validation set (for validation)
val_emb_mtx = build_word_embedding_matrix(glove_model, val_w2i, val_oov_words, train_emb_mtx)
print('Shape of word embedding matrix (validation set):', val_emb_mtx.shape)

# Build word embedding matrix based on training + validation + test set (for test)
#test_emb_mtx = build_word_embedding_matrix(glove_model, test_w2i, test_oov_words, val_emb_mtx)
#print('Shape of word embedding matrix (test set):', test_emb_mtx.shape)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=114839.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=114839.0), HTML(value='')))

Shape of word embedding matrix (training set): (114839, 50)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=131821.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=131821.0), HTML(value='')))

Shape of word embedding matrix (validation set): (131821, 50)


In [15]:
def build_char_embedding_matrix(corpus: List[str],
                                enc_dim: Optional[int] = 100):
    # Flatten to obtain single characters
    flat_chars = [c for sent in corpus for word in sent for c in word]
    
    # Sort characters by occurrences
    unique_chars = Counter(flat_chars)
    char_listing = sorted(unique_chars, key=unique_chars.get, reverse=True)
    # Select only the enc_dim most frequent ones
    if len(char_listing) > enc_dim - 1:
        char_listing = char_listing[:enc_dim - 1]
    char_listing = [PAD] + char_listing + [UNK]  # add PAD and UNK tokens
    
    idx_to_char = {i: c for i, c in enumerate(char_listing)}
    char_to_idx = {c: i for i, c in enumerate(char_listing)}
    
    # Create one-hot vectors, reserving the last one for UNK (0...0, 1)
    one_hot_chars = np.zeros((len(char_listing) - 1, enc_dim))
    np.fill_diagonal(one_hot_chars, 1)
    one_hot_chars = np.vstack([np.zeros((1, enc_dim)), one_hot_chars])  # stack zero vector on top for PAD
    
    return idx_to_char, char_to_idx, char_listing, one_hot_chars

# Build char embedding matrix based only on the training set, and use it for validation and test too:
# in fact, we can assume that characters appear uniformly in the three splits;
# for those rare case in which this does not happen, we assign the UNK vector
i2c, c2i, cl, char_emb_mtx = build_char_embedding_matrix(train_corpus)
print('Shape of char embedding matrix (training set):', char_emb_mtx.shape)

Shape of char embedding matrix (training set): (101, 100)


## 3. Training


In [16]:
from utils.bidaf_train_utils import training_loop

import matplotlib.pyplot as plt
%matplotlib inline


def plot_history(history):    
    # this function is simply used to plot and save the image (and the dictionary) about the train and val loss and accuracy during the training
    
    
    fig1, axes = plt.subplots(nrows=1, ncols=1, figsize=(7.5, 5))
    plt.suptitle('loss', size='xx-large')
    plt.tight_layout(rect=[0, 0.03, 1, 0.])

    axes.plot(history['loss'], label='train_loss')
    axes.plot(history['val_loss'], label='val_loss')
    axes.set_title('loss')
    axes.set(xlabel='# Epochs')
    axes.grid()
    axes.legend();

    fig2, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
    plt.suptitle('scores', size='xx-large')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    axes[0].plot(history['exact_score'], label='train_exact_score')
    axes[0].plot(history['val_exact_score'], label='val_exact_score')
    axes[0].set_title('exact_score')
    axes[0].set(xlabel='# Epochs')
    axes[0].grid()
    axes[0].legend();

    axes[1].plot(history['f1_score'], label='train_f1_score')
    axes[1].plot(history['val_f1_score'], label='val_f1_score')
    axes[1].set_title('f1_score')
    axes[1].set(xlabel='# Epochs')
    axes[1].grid()
    axes[1].legend();
    
    fig3, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
    plt.suptitle('distances', size='xx-large')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    axes[0].plot(history['distance_end'], label='train_distance_end')
    axes[0].plot(history['val_distance_end'], label='val_distance_end')
    axes[0].set_title('distance_end')
    axes[0].set(xlabel='# Epochs')
    axes[0].grid()
    axes[0].legend();

    axes[1].plot(history['distance_start'], label='train_distance_start')
    axes[1].plot(history['val_distance_start'], label='val_distance_start')
    axes[1].set_title('distance_start')
    axes[1].set(xlabel='# Epochs')
    axes[1].grid()
    axes[1].legend();

# jojonki:

In [25]:
# clear gpu memory before another training:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [20]:
from model.bidaf import BiDAF
from model.char_embedder import CharEmbedder
from model.word_embedder import WordEmbedder
from model.tensor_maker import TensorMaker

char_embedder = CharEmbedder(init_emb = torch.FloatTensor(char_emb_mtx),
                             out_char_emb_dim = emb_dim,
                             hidden_dim = 64,
                             input_channels = 1,
                             output_channels = 100,
                             kernel_height = 5,
                             trainable = False)
# char_embedder = CharEmbedder(c_embd_size = 8, vocab_size_c = len(c2i), out_chs = 100, filters = [[1, 5]])

train_word_embedder = WordEmbedder(init_emb = torch.FloatTensor(train_emb_mtx))
val_word_embedder = WordEmbedder(init_emb = torch.FloatTensor(val_emb_mtx))

# model_bidaf = BiDAF(char_embedder, train_word_embedder, val_word_embedder, use_constraint = True).to(DEVICE)

#train_tensor_maker = TensorMaker(train_w2i, c2i, device=DEVICE)
val_tensor_maker = TensorMaker(val_w2i, c2i, device=DEVICE)

In [21]:
import pickle

# Import from pickle files
#char_embedder: CharEmbedder = None
#with open(os.path.join('best_model', 'char_emb2.pickle'), 'rb') as f:
#    char_embedder = pickle.load(f)
#train_word_embedder: WordEmbedder = None
#with open(os.path.join('best_model', 'train_word_embedder.pickle'), 'rb') as f:
#    train_word_embedder = pickle.load(f)
#val_word_embedder: WordEmbedder = None
#with open(os.path.join('best_model', 'val_word_embedder.pickle'), 'rb') as f:
#    val_word_embedder = pickle.load(f)

# Create model
model_bidaf = BiDAF(char_embedder, train_word_embedder, val_word_embedder, use_constraint=True, use_dropout=False).to(DEVICE)
# Load the model state
model_bidaf.load_state_dict(torch.load(os.path.join('best_model', 'bidaf_test.pt')))
model_bidaf.eval()
# Load tensor maker
#tensor_maker = None
#with open(os.path.join('best_model', 'tensor_maker.pickle'), 'rb') as f:
#    tensor_maker = pickle.load(f)

BiDAF(
  (char_embedder): CharEmbedder(
    (embedding): Embedding(101, 100)
    (conv_layer): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1), padding=(1, 0), bias=False)
    (fc1): Linear(in_features=100, out_features=64, bias=False)
    (fc2): Linear(in_features=64, out_features=50, bias=False)
  )
  (train_word_embedder): WordEmbedder(
    (embedding): Embedding(114839, 50)
  )
  (eval_word_embedder): WordEmbedder(
    (embedding): Embedding(131821, 50)
  )
  (highway_net): ConvolutionalHighwayNetwork(
    (conv1): Conv2d(1, 1, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (gate1): Conv2d(1, 1, kernel_size=(5, 100), stride=(1, 1), padding=(2, 0))
    (conv2): Conv2d(1, 1, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (gate2): Conv2d(1, 1, kernel_size=(5, 100), stride=(1, 1), padding=(2, 0))
  )
  (ctx_rnn): GRU(100, 100, batch_first=True, bidirectional=True)
  (w_s): Linear(in_features=600, out_features=1, bias=False)
  (mod_rnn): GRU(800, 100, num_layers=2, ba

In [22]:
from utils.bidaf_train_utils import evaluate
from utils.squad_utils import squad_loss

val_data = to_list_of_tuples((X_valC, X_valQ, Y_val))
evaluate(model_bidaf, val_data, 4, squad_loss, val_tensor_maker, verbose=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4307.0), HTML(value='')))

Start (p): 101, End (p): 102, Start (T): 49, End (T): 52


(3.46501502778017,
 10.017240378475648,
 10.489638358390899,
 0.42061879607592734,
 0.5999495749533893)

In [19]:
import pickle

train_word_embedder = WordEmbedder(init_emb = torch.FloatTensor(train_emb_mtx))
val_word_embedder = WordEmbedder(init_emb = torch.FloatTensor(val_emb_mtx))

with open('char_emb2.pickle', 'wb') as f:
    pickle.dump(char_embedder, f)
#with open('train_word_embedder.pickle', 'wb') as f:
#    pickle.dump(train_word_embedder, f)
#with open('val_word_embedder.pickle', 'wb') as f:
#    pickle.dump(val_word_embedder, f)

In [None]:
from utils.squad_utils import squad_loss
from utils.bidaf_train_utils import training_loop

train_data = to_list_of_tuples((X_trainC, X_trainQ, Y_train))
val_data = to_list_of_tuples((X_valC, X_valQ, Y_val))

EP = 5
BS = 8

#optimizer = torch.optim.Adam(model_bidaf.parameters(), lr=5e-3)
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model_bidaf.parameters()))
#optimizer = torch.optim.Adadelta(model_bidaf.parameters(), lr=0.5, rho=0.999, eps=1e-06, weight_decay=0) # slower in time and in loss
criterion = squad_loss


history_noconst_jojonki_nohigh_1k = training_loop(model=model_bidaf,
                        train_data=train_data,
                        optimizer=optimizer,
                        epochs=EP,
                        batch_size=BS,
                        criterion=criterion,
                        train_tensor_maker=train_tensor_maker,
                        val_tensor_maker=val_tensor_maker,
                        val_data=val_data,
                        early_stopping=True,
                        patience = 15,
                        checkpoint_path='bidaf_gru_noconstraint.pt',
                        mix_scale = True)

# eps =1e-7, #jojonki, # use_constraint, # BS = 8, #1ktrain, 3kval, lr=5e-3, patience = 30, EP = 50, emb = 100
# adam optimizer # train+val

In [None]:
plot_history(history_noconst_jojonki_nohigh_1k)

# our model:

In [31]:
# clear gpu memory before another training:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [44]:
from model.bidaf import BiDAF
from model.char_embedder import CharEmbedder
from model.word_embedder import WordEmbedder
from model.tensor_maker import TensorMaker

char_embedder = CharEmbedder(init_emb = torch.FloatTensor(char_emb_mtx),
                             out_char_emb_dim = emb_dim,
                             hidden_dim = 64,
                             input_channels = 1,
                             output_channels = 100,
                             kernel_height = 5,
                             trainable = False)

train_word_embedder = WordEmbedder(init_emb = torch.FloatTensor(train_emb_mtx))
val_word_embedder = WordEmbedder(init_emb = torch.FloatTensor(val_emb_mtx))
model_bidaf = BiDAF(char_embedder, train_word_embedder, val_word_embedder, use_constraint = True, use_dropout = False).to(DEVICE)
train_tensor_maker = TensorMaker(train_w2i, c2i, device=DEVICE)
val_tensor_maker = TensorMaker(val_w2i, c2i, device=DEVICE)



In [45]:
from utils.squad_utils import squad_loss
from utils.bidaf_train_utils import training_loop

train_data = to_list_of_tuples((X_trainC, X_trainQ, Y_train))
val_data = to_list_of_tuples((X_valC, X_valQ, Y_val))

EP = 5
BS = 8

optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model_bidaf.parameters()))
# optimizer = torch.optim.Adam(model_bidaf.parameters(), lr=5e-3)
# optimizer = torch.optim.Adadelta(model_bidaf.parameters(), lr=0.5, rho=0.999, eps=1e-06, weight_decay=0) # slower in time and in loss
criterion = squad_loss

history_drop = training_loop(model=model_bidaf,
                        train_data=train_data,
                        optimizer=optimizer,
                        epochs=EP,
                        batch_size=BS,
                        criterion=criterion,
                        train_tensor_maker=train_tensor_maker,
                        val_tensor_maker=val_tensor_maker,
                        val_data=val_data,
                        early_stopping=True,
                        patience = 15,
                        checkpoint_path='bidaf_gru_constraint.pt',
                        mix_scale = True)

# p_end > p_start

----------------------------------------------------------------------------------------------------
Epoch 1/5


HBox(children=(FloatProgress(value=0.0, max=125.0), HTML(value='')))

TypeError: ignored

In [46]:
torch.save(model_bidaf, "bidaf_test.pt")

In [None]:
plot_history(history_drop)

# to do :

- dropout
- explanation for bad results: BS too small
- our model: trainable embedder = True
- torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
- propose distance as new metric (normalized wrt the number of characters in the context)



our takes 2 hours per epoch

jojonki takes 

