## Imports

In [1]:
%%capture
# Suppress output

# Whether the notebook is run within Google Colab or not
colab = 'google.colab' in str(get_ipython())

# General imports
import numpy as np
import pandas as pd
import torch
# Install needed dependencies on Colab
if colab:
    !pip install transformers
from transformers import DistilBertModel#, DistilBertTokenizerFast

# Enable GPU acceleration, whenever available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Automatically reimport modules at each execution
%reload_ext autoreload
%autoreload 2

In [2]:
if colab:
    !git clone 'https://github.com/michimichiamo/question-answering'

fatal: destination path 'question-answering' already exists and is not an empty directory.


## Read data

In [3]:
# Execute this only to load the dataset in csv format if not already done
# from read_dataset import read_dataset

# dataset = read_dataset(path='training_set.json', validation_set_perc=20)
# train_df = pd.DataFrame(dataset[0], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# train_df.to_csv('train_df.csv')
# val_df = pd.DataFrame(dataset[1], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# val_df.to_csv('val_df.csv')

In [4]:
directory='./' if not colab else './question-answering/'

train_filename = directory+'data/tokenized/train.npz'
#val_filename = directory+'data/tokenized/val.npz'

train_data = np.load(train_filename)
#val_data = np.load(val_filename)

In [5]:
input_ids = train_data['input_ids']
attention_mask = train_data['attention_mask']
answer_start = train_data['answer_start']
answer_end = train_data['answer_end']

## Network

In [6]:
class QA(torch.nn.Module):

    def __init__(self, hidden_size=768, num_labels=2, dropout_rate=0.5):
        super(QA, self).__init__()
        # Device
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        # Parameters
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        
        # Layers
        #self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased-distilled-squad')
        self.transformers = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad').to(self.device)
        self.transformers.requires_grad_(False)
        self.dropout = torch.nn.Dropout(dropout_rate)
        #self.extra_linear = torch.nn.Linear(self.hidden_size, self.hidden_size)
        #self.extra_linear_tanh = torch.nn.Tanh()
        self.dense = torch.nn.Linear(self.hidden_size, self.num_labels, device=self.device)

    def forward(self, inputs):
        # Unpack inputs
        input_ids, attention_mask = inputs
        
        # Put to device
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        
        # Transformers 
        transformed = self.transformers(input_ids=input_ids, attention_mask=attention_mask)
        # Dropout
        dropped = self.dropout(transformed[0])
        # Obtain logits
        logits = self.dense(dropped) #(None, seq_len, hidden_size)*(hidden_size, 2)=(None, seq_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)    #(None, seq_len, 1), (None, seq_len, 1)
        start_logits = start_logits.squeeze(-1)  #(None, seq_len)
        end_logits = end_logits.squeeze(-1)    #(None, seq_len)
        # --- 4) Prepare output tuple
        outputs = (start_logits, end_logits)
        
        return outputs

In [7]:
net = QA()
net.to(net.device)

QA(
  (transformers): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features

In [8]:
class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, input_ids, attention_masks, answer_starts, answer_ends):
        'Initialization'
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.answer_starts = answer_starts
        self.answer_ends = answer_ends

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.input_ids)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        input_id = self.input_ids[index]
        attention_mask = self.attention_masks[index]
        answer_start = self.answer_starts[index]
        answer_end = self.answer_ends[index]

        # Pack input and output
        X = (input_id, attention_mask)
        y = (answer_start, answer_end)

        return X, y

In [9]:
data = Dataset(input_ids, attention_mask, answer_start, answer_end)

In [19]:
dataloader = torch.utils.data.DataLoader(data, batch_size=batch_size)#, num_workers=2, pin_memory=True)

In [11]:
import gc

del input_ids, attention_mask, answer_start, answer_end

gc.collect()

488

In [22]:
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

#@title Hyperparameters
batch_size = 32 #@param ["32", "64", "128"] {type:"raw"}
learning_rate = 0.001 #@param ["0.00001", "0.0001", "0.001", "0.01", "0.1", "1"] {type:"raw"}
epochs = 5 #@param {type:"slider", min:5, max:200, step:5}

loss_fn = CrossEntropyLoss()
optimizer = Adam(net.parameters(), lr=learning_rate, weight_decay=0.01)
n_iter = round(len(dataloader)/(batch_size))

In [None]:
loss_history = []

for epoch in range(epochs):
    
    for iter, (inputs, targets) in enumerate(dataloader):
        # Unpack targets
        start_logits, end_logits = targets
        # Forward pass
        optimizer.zero_grad()
        start_out, end_out = net.forward(inputs)
        # Loss function
        loss = -loss_fn(start_logits, start_out) -loss_fn(end_logits, end_out)
        # Gradient update
        loss.backward()
        optimizer.step()

        # Track loss
        print(f'iter {iter}/{n_iter}: loss = {loss}')
        loss_history.append(loss)

iter 0/69.6875: loss = 49588.09863547748
iter 1/69.6875: loss = 51021.03443966964
iter 2/69.6875: loss = 51907.26640214165
iter 3/69.6875: loss = 51935.4375278352
iter 4/69.6875: loss = 51964.82127585965
iter 5/69.6875: loss = 51333.421006549295
iter 6/69.6875: loss = 50412.28318748968
iter 7/69.6875: loss = 48619.23014565387
iter 8/69.6875: loss = 47881.48497888457
