## Imports

In [90]:
%%capture
# Suppress output

# Whether the notebook is run within Google Colab or not
colab = 'google.colab' in str(get_ipython())

# General imports
import numpy as np
import pandas as pd
import torch
# Install needed dependencies on Colab
if colab:
    !pip install transformers
    !pip install torchmetrics
from transformers import DistilBertModel#, DistilBertTokenizerFast

# Enable GPU acceleration, whenever available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Automatically reimport modules at each execution
%reload_ext autoreload
%autoreload 2

In [91]:
if colab:
    !git clone 'https://github.com/michimichiamo/question-answering'

fatal: destination path 'question-answering' already exists and is not an empty directory.


## Read data

In [92]:
# Execute this only to load the dataset in csv format if not already done
# from read_dataset import read_dataset

# dataset = read_dataset(path='training_set.json', validation_set_perc=20)
# train_df = pd.DataFrame(dataset[0], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# train_df.to_csv('train_df.csv')
# val_df = pd.DataFrame(dataset[1], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# val_df.to_csv('val_df.csv')

In [93]:
directory='./' if not colab else './question-answering/'

train_filename = directory+'data/tokenized/train.npz'
val_filename = directory+'data/tokenized/val.npz'

train_data = np.load(train_filename)
val_data = np.load(val_filename)

In [94]:
train_input_ids = train_data['input_ids'].astype('int32')
train_attention_mask = train_data['attention_mask'].astype('int32')
train_answer_start = train_data['answer_start'].astype('int32')
train_answer_end = train_data['answer_end'].astype('int32')

val_input_ids = val_data['input_ids'].astype('int32')
val_attention_mask = val_data['attention_mask'].astype('int32')
val_answer_start = val_data['answer_start'].astype('int32')
val_answer_end = val_data['answer_end'].astype('int32')

## Network

In [95]:
class QA(torch.nn.Module):

    def __init__(self, hidden_size=768, num_labels=2, dropout_rate=0.5):
        super(QA, self).__init__()
        # Device
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        # Parameters
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        
        # Layers
        #self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased-distilled-squad')
        self.transformers = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad').to(self.device)
        self.transformers.requires_grad_(False)
        self.dropout = torch.nn.Dropout(dropout_rate)
        #self.extra_linear = torch.nn.Linear(self.hidden_size, self.hidden_size)
        #self.extra_linear_tanh = torch.nn.Tanh()
        self.dense = torch.nn.Linear(self.hidden_size, self.num_labels, device=self.device, dtype=torch.float32)

    def forward(self, inputs):
        # Unpack inputs
        input_ids, attention_mask = inputs
        
        # Put to device
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        
        # Transformers 
        transformed = self.transformers(input_ids=input_ids, attention_mask=attention_mask)
        # Dropout
        dropped = self.dropout(transformed[0])
        # Obtain logits
        logits = self.dense(dropped) #(None, seq_len, hidden_size)*(hidden_size, 2)=(None, seq_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)    #(None, seq_len, 1), (None, seq_len, 1)
        start_logits = start_logits.squeeze(-1)  #(None, seq_len)
        end_logits = end_logits.squeeze(-1)    #(None, seq_len)
        # --- 4) Prepare output tuple
        outputs = (start_logits, end_logits)
        
        return outputs

In [108]:
%%capture
net = QA()
net.device='cpu'
net.to(net.device)

In [97]:
class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, input_ids, attention_masks, answer_starts, answer_ends):
        'Initialization'
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.answer_starts = answer_starts
        self.answer_ends = answer_ends

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.input_ids)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        input_id = self.input_ids[index]
        attention_mask = self.attention_masks[index]
        answer_start = self.answer_starts[index]
        answer_end = self.answer_ends[index]

        # Pack input and output
        X = (input_id, attention_mask)
        y = (answer_start, answer_end)

        return X, y

In [98]:
#@title Hyperparameters
batch_size = 32 #@param ["32", "64", "128"] {type:"raw"}
learning_rate = 0.001 #@param ["0.00001", "0.0001", "0.001", "0.01", "0.1", "1"] {type:"raw"}
epochs = 5 #@param {type:"slider", min:5, max:200, step:5}


In [99]:
train_dataset = Dataset(train_input_ids, train_attention_mask, train_answer_start, train_answer_end)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)#, num_workers=2, pin_memory=True)

In [112]:
val_dataset = Dataset(val_input_ids, val_attention_mask, val_answer_start, val_answer_end)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, )#, num_workers=2, pin_memory=True)
val_dataloader = iter(val_dataloader)

In [101]:
import gc

del train_input_ids, train_attention_mask, train_answer_start, train_answer_end
del val_input_ids, val_attention_mask, val_answer_start, val_answer_end

gc.collect()

1973

In [102]:
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

loss_fn = CrossEntropyLoss()
optimizer = Adam(net.parameters(), lr=learning_rate, weight_decay=0.01)
n_iter = round(len(train_dataloader)/(batch_size))

In [121]:
from torchmetrics import AveragePrecision, F1
#from torch.nn.functional import softmax

f1_score = F1(num_classes=net.transformers.config.max_position_embeddings, mdmc_average='global')
average_precision = AveragePrecision(pos_label=1, num_classes=net.transformers.config.max_position_embeddings)

#TOFIX: F1 sempre uguale
#TOCHECK: AVG_PREC
def evaluate(model, inputs, targets):
    # Set evaluation mode
    model.eval()
    # Obtain predictions
    start_preds, end_preds = model.forward(inputs)
    # Unpack targets
    start_logits, end_logits = targets

    # Get F1 scores
    
    # Extract IntTensors
    #start_logits, end_logits = torch.IntTensor(start_logits), torch.IntTensor(end_logits)

    start_out, end_out = torch.zeros_like(start_preds, dtype=torch.int16), torch.zeros_like(end_preds, dtype=torch.int16)
    start_out[torch.tensor(range(start_preds.size()[0])), torch.argmax(start_preds, axis=1)] = 1
    end_out[torch.tensor(range(end_preds.size()[0])), torch.argmax(end_preds, axis=1)] = 1

    f1_start = f1_score(start_out, start_logits)
    f1_end = f1_score(end_out, end_logits)
    f1 = f1_start + f1_end
    
    # Get Average Precision scores

    avg_start = average_precision(start_out, torch.argmax(start_logits, axis=1))
    avg_end = average_precision(end_out, torch.argmax(end_logits, axis=1))
    avg = avg_start + avg_end

    print(f'f1 score: {f1}')
    print(f'average precision: {avg}')
    return f1, avg




In [122]:
loss_history = []
f1_history = []
avg_prec_history = []

for epoch in range(epochs):
    #net.train()
    for iteration, (train_inputs, train_targets) in enumerate(train_dataloader):
        #net.train()
        ## Unpack targets and cast to float
        #start_logits, end_logits = train_targets
        #start_logits, end_logits = torch.tensor(start_logits, dtype=torch.float32), torch.tensor(end_logits, dtype=torch.float32)
        ## Forward pass
        #optimizer.zero_grad()
        #start_out, end_out = net.forward(train_inputs)
        ## Loss function
        #loss = -loss_fn(start_logits, start_out) -loss_fn(end_logits, end_out)
        ## Gradient update
        #loss.backward()
        #optimizer.step()
#
        ## Track loss
        #print(f'iteration {iteration+1}/{n_iter}')
        #print(f'loss = {loss}')
        #loss_history.append(loss)
    
    #if epoch % 5 == 0:
        net.eval()
        val_inputs, val_targets = next(val_dataloader)
        f1, avg_prec = evaluate(net, val_inputs, val_targets)
        f1_history.append(f1)
        avg_prec_history.append(avg_prec)






f1 score: 1.9921875
average precision: 0.07407407462596893
f1 score: 1.9921875
average precision: 0.0770370364189148


KeyboardInterrupt: ignored