## Imports

In [1]:
%%capture
# The %%capture cell magic above captures this cell's output (pip log included) instead of displaying it

# Whether the notebook is run within Google Colab or not
colab = 'google.colab' in str(get_ipython())

# General imports
import numpy as np
import pandas as pd
import torch
# Install needed dependencies on Colab; this has to happen before the
# transformers import right below, which needs the package to be installed
if colab:
    !pip install transformers
from transformers import DistilBertModel#, DistilBertTokenizerFast

# Enable GPU acceleration, whenever available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Automatically reimport modules at each execution, so that edits to
# imported .py files are picked up without restarting the kernel
%reload_ext autoreload
%autoreload 2

In [2]:
if colab:
    !git clone 'https://github.com/michimichiamo/question-answering'

## Read data

In [3]:
# Execute this only to load the dataset in csv format if not already done
# from read_dataset import read_dataset

# dataset = read_dataset(path='training_set.json', validation_set_perc=20)
# train_df = pd.DataFrame(dataset[0], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# train_df.to_csv('train_df.csv')
# val_df = pd.DataFrame(dataset[1], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# val_df.to_csv('val_df.csv')

In [4]:
# Base folder: the cloned repository on Colab, the working directory otherwise
directory = './question-answering/' if colab else './'

# Paths of the tokenized training and validation archives
train_filename = f'{directory}data/tokenized/train.npz'
val_filename = f'{directory}data/tokenized/val.npz'

# Open the two .npz archives
train_data = np.load(train_filename)
val_data = np.load(val_filename)

In [5]:
# Read the four training arrays out of the archive in one pass:
# token ids, attention mask, answer start and answer end
input_ids, attention_mask, answer_start, answer_end = (
    train_data[key]
    for key in ('input_ids', 'attention_mask', 'answer_start', 'answer_end')
)

In [6]:
#train_df = pd.DataFrame()
#train_df['input_ids'] = [i for i in train_data['input_ids']]
#train_df['attention_mask'] = [i for i in train_data['attention_mask']]
#train_df['answer_start'] = train_data['answer_start']
#train_df['answer_end'] = train_data['answer_end']

In [7]:
#val_df = pd.DataFrame()
#val_df['input_ids'] = [i for i in val_data['input_ids']]
#val_df['attention_mask'] = [i for i in val_data['attention_mask']]
#val_df['answer_start'] = val_data['answer_start']
#val_df['answer_end'] = val_data['answer_end']

## Embedding

In [8]:
## Load tokenizer and transformers

#tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased-distilled-squad')
#model = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad')

In [9]:
## Tokenize questions and contexts
#max_length = model.config.max_position_embeddings
#doc_stride = 256
#
#train_tokenized = tokenizer(
#   train_questions,
#   train_contexts,
#   max_length=max_length,
#   truncation="only_second",
#   return_overflowing_tokens=True,
#   return_offsets_mapping=True,
#   stride=doc_stride,
#   return_attention_mask=True,
#   padding='max_length'
#)
#
#val_tokenized = tokenizer(
#   val_questions,
#   val_contexts,
#   max_length=max_length,
#   truncation="only_second",
#   return_overflowing_tokens=True,
#   return_offsets_mapping=True,
#   stride=doc_stride,
#   return_attention_mask=True,
#   padding='max_length'
#)

In [10]:
## Move to device
#bert_dict = {}
#
#bert_dict['input_ids'] = torch.IntTensor(tokenized['input_ids']).to(device)
#bert_dict['attention_mask'] = torch.IntTensor(tokenized['attention_mask']).to(device)
#
#model = model.to(device)

In [11]:
## Network structure

#transformed = model(**bert_dict)
#dropped = torch.nn.Dropout(0.3)(transformed[0])
#logits = torch.nn.Linear(768, 2, device=device)(dropped)
#start_logits, end_logits = logits.split(1, dim=-1)
#start_logits = start_logits.squeeze(-1)
#end_logits = end_logits.squeeze(-1)
#outputs = (start_logits, end_logits)

In [12]:
#tokenizer.decode(tokenized['input_ids'][0])

In [13]:
#train_df['question'].apply(lambda x: len(x.strip().split(' '))).max()

## TODO

- `tokenized['offset_mapping'][0]` restituisce le tuple (start, end), espresse come offset in caratteri, di ogni token dell'input (query, context)

- Problema: splittare i contesti online (nel Dataloader) produce batch di lunghezza variabile
    - Prima soluzione: eliminare gli split che non contengono la domanda
    - Seconda soluzione: creare dataframe con contesti già splittati usando il tokenizer (lunghezza 512, overlapping 256) invece che farlo online nel Dataloader

- Problema: risposte sono presenti in un solo split, cosa fare con gli altri?
    - Una [soluzione](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb#scrollTo=iLekL6Un9D70&line=24&uniqifier=1): riscalare tuple di contesti tagliati (invece che 0, allinearli alla risposta)

    - Un'altra soluzione: 
        - riscalare answer_start e answer_end per ogni contesto
        - lo split che contiene la risposta mantiene answer_start e answer_end, gli altri split dello stesso contesto vanno trattati (assegniamo (0,0)? oppure scartiamo)
        - Possibile [soluzione](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb#scrollTo=v86c_RApFdNG) (in grassetto parte interessante): *Now let's put everything together in one function we will apply to our training set. In the case of impossible answers (the answer is in another feature given by an example with a long context), **we set the cls index for both the start and end position**. We could also simply discard those examples from the training set if the flag allow_impossible_answers is False. Since the preprocessing is already complex enough as it is, we've kept is simple for this part.*

- N.B. sul token `[CLS]` preso da [qui](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb#scrollTo=kv1iD9E6FdND):
> *The very first token ([CLS]) has (0, 0) because it doesn't correspond to any part of the question/answer, then the second token is the same as the characters 0 to 3 of the question*

## Network

In [14]:
class QA(torch.nn.Module):
    """Span-prediction head on top of a pretrained DistilBERT encoder.

    The encoder output goes through dropout and a single linear layer that
    produces ``num_labels`` scores per token; ``forward`` splits those scores
    into start logits and end logits.

    Args:
        hidden_size: Input width of the linear layer. It is applied to the
            encoder's first output, so it has to match that output's last
            dimension.
        num_labels: Output width of the linear layer. ``forward`` unpacks the
            split logits into exactly two tensors, so only the default of 2
            works with the current ``forward``.
        dropout_rate: Probability passed to ``torch.nn.Dropout``.
    """

    def __init__(self, hidden_size=768, num_labels=2, dropout_rate=0.5):
        super().__init__()
        # Device the layers are created on; kept as a public attribute for
        # backward compatibility (forward() no longer relies on it)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Parameters
        self.hidden_size = hidden_size
        self.num_labels = num_labels

        # Layers
        #self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased-distilled-squad')
        self.transformers = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad').to(self.device)
        self.dropout = torch.nn.Dropout(dropout_rate)
        #self.extra_linear = torch.nn.Linear(self.hidden_size, self.hidden_size)
        #self.extra_linear_tanh = torch.nn.Tanh()
        self.dense = torch.nn.Linear(self.hidden_size, self.num_labels, device=self.device)

    def forward(self, inputs):
        """Compute start and end logits for a batch.

        Args:
            inputs: Pair ``(input_ids, attention_mask)`` of tensors. Both are
                moved to the device the model currently lives on.

        Returns:
            Tuple ``(start_logits, end_logits)``; each has the label
            dimension squeezed away, i.e. shape ``(batch, seq_len)``.
        """
        # Unpack inputs
        input_ids, attention_mask = inputs

        # Follow the module's actual placement instead of the string cached in
        # __init__: after net.to(...) / net.cpu() that string is stale and
        # would send the inputs to a different device than the weights
        target_device = self.dense.weight.device
        input_ids = input_ids.to(target_device)
        attention_mask = attention_mask.to(target_device)

        # Encoder; element 0 of its output is fed to the head
        transformed = self.transformers(input_ids=input_ids, attention_mask=attention_mask)
        # Dropout
        dropped = self.dropout(transformed[0])
        # Obtain logits
        logits = self.dense(dropped)  # (batch, seq_len, hidden_size) -> (batch, seq_len, num_labels)
        # One chunk of width 1 per label; unpacking requires exactly two chunks
        start_logits, end_logits = logits.split(1, dim=-1)  # (batch, seq_len, 1) each
        start_logits = start_logits.squeeze(-1)  # (batch, seq_len)
        end_logits = end_logits.squeeze(-1)  # (batch, seq_len)

        return (start_logits, end_logits)

In [15]:
# Build the network; QA.__init__ loads the pretrained DistilBERT weights
net = QA()

In [16]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with answer-span labels."""

    def __init__(self, input_ids, attention_masks, answer_starts, answer_ends):
        """Keep references to the four indexable sequences (expected to be index-aligned)."""
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.answer_starts = answer_starts
        self.answer_ends = answer_ends

    def __len__(self):
        """Return the number of samples, measured on ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return sample ``index`` as ``((input_id, attention_mask), (answer_start, answer_end))``."""
        # Network inputs for this sample
        features = tuple(
            field[index] for field in (self.input_ids, self.attention_masks)
        )
        # Target span boundaries for this sample
        targets = tuple(
            field[index] for field in (self.answer_starts, self.answer_ends)
        )
        return features, targets

In [17]:
# Wrap the training arrays into a dataset object
data = Dataset(
    input_ids=input_ids,
    attention_masks=attention_mask,
    answer_starts=answer_start,
    answer_ends=answer_end,
)

In [18]:
# Serve the dataset in batches of 32 samples
# (options left disabled: num_workers=2, pin_memory=True)
generator = torch.utils.data.DataLoader(
    dataset=data,
    batch_size=32,
)

In [19]:
# Smoke test: push the first batch through the network and show
# its inputs, labels and outputs
for inputs, labels in generator:
    print(inputs)
    print(labels)
    # Call the module itself rather than .forward(): nn.Module.__call__ runs
    # the registered hooks around the forward pass, .forward() skips them
    outputs = net(inputs)
    print(outputs)
    break

[tensor([[  101,  1706,  2292,  ...,     0,     0,     0],
        [  101,  1327,  1110,  ...,     0,     0,     0],
        [  101,  1109, 19349,  ...,     0,     0,     0],
        ...,
        [  101,  1327,  8918,  ...,     0,     0,     0],
        [  101,  1327,  1110,  ...,     0,     0,     0],
        [  101,  1130,  1184,  ...,     0,     0,     0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])]
[tensor([136,  53,  83, 101,  34,  67, 103, 128,  39, 189,  36,  44,  59,  85,
        153, 110,  22,  43,  77,  47, 112,  28,  47, 136,  90, 133, 232,  38,
        167,  15, 118,  78]), tensor([142,  57,  85, 107,  40,  68, 103, 129,  39, 189,  36,  46,  60,  88,
        161, 110,  22,  46,  77,  48, 114,  28,  52, 143,  90, 135, 235,  38,
        172,  25, 125,  78])]
(tensor([[ 0.0841,  1.3260,  0.9043,  ...,  