## Imports

In [None]:
%%capture
# Suppress output

# Whether the notebook is run within Google Colab or not
colab = 'google.colab' in str(get_ipython())

# General imports
import numpy as np
import pandas as pd
import torch
import sys
from numpy.core.numeric import full

from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

import os
from datetime import datetime, timezone


# Install needed dependencies on Colab
if colab:
    !pip install transformers
    !pip install torchmetrics==0.6
#from transformers import DistilBertModel#, DistilBertTokenizerFast
import gc
from transformers import DistilBertTokenizerFast

# Automatically reimport modules at each execution
%reload_ext autoreload
%autoreload 2

## Read data

In [2]:
# On Colab, clone the project repository and change into it; later cells
# import from the project-local `util` package, presumably provided by this
# checkout (verify against the repository layout).
if colab:
    !git clone 'https://github.com/michimichiamo/question-answering'
    %cd /content/question-answering

Cloning into 'question-answering'...
remote: Enumerating objects: 401, done.[K
remote: Counting objects: 100% (388/388), done.[K
remote: Compressing objects: 100% (324/324), done.[K
remote: Total 401 (delta 214), reused 173 (delta 63), pack-reused 13[K
Receiving objects: 100% (401/401), 134.42 MiB | 15.29 MiB/s, done.
Resolving deltas: 100% (217/217), done.
Checking out files: 100% (36/36), done.
/content/question-answering


In [3]:
# Execute this only to load the dataset in csv format if not already done
# from read_dataset import read_dataset

# dataset = read_dataset(path='training_set.json', validation_set_perc=20)
# train_df = pd.DataFrame(dataset[0], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# train_df.to_csv('train_df.csv')
# val_df = pd.DataFrame(dataset[1], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# val_df.to_csv('val_df.csv')

In [4]:
# Load the train and validation splits through the project helper. Each call
# is unpacked into five objects, named here: ids, inputs, attention masks,
# start targets and end targets.
from util.model import read_npz
tr_id, tr_input, tr_attention_mask, tr_start, tr_end = read_npz(split='train')
val_id, val_input, val_attention_mask, val_start, val_end = read_npz(split='val')

ModuleNotFoundError: ignored

## Prepare for training

### Hyperparameters

In [None]:
#@title Hyperparameters
# The `#@param` annotations are Colab form fields; outside Colab they are
# ordinary comments and the literal values below are used as-is.
batch_size = 256 #@param ["32", "64", "128", "256", "512"] {type:"raw"}
learning_rate = 0.001 #@param ["0.00001", "0.0001", "0.001", "0.01", "0.1", "1"] {type:"raw"}
epochs = 100 #@param {type:"slider", min:5, max:200, step:5}

### Dataloaders

In [None]:
from util.model import Dataset
# Wrap the training arrays and batch them. No `shuffle` argument is passed,
# so the DataLoader default applies and batches come in dataset order.
tr_dataset = Dataset(tr_id, tr_input, tr_attention_mask, tr_start, tr_end)
tr_dataloader = torch.utils.data.DataLoader(tr_dataset, batch_size=batch_size, pin_memory=True)

In [None]:
# Validation counterpart of the training dataset/dataloader.
val_dataset = Dataset(val_id, val_input, val_attention_mask, val_start, val_end)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, pin_memory=True)
# NOTE: the name is rebound to a one-shot iterator. The training loop draws
# one batch per evaluation with next(); once exhausted it is not restarted.
val_dataloader = iter(val_dataloader)

In [None]:
# Drop the notebook-level names of the raw arrays and run a garbage
# collection pass.
del tr_id, tr_input, tr_attention_mask, tr_start, tr_end
del val_id, val_input, val_attention_mask, val_start, val_end

gc.collect()

### Create model

In [None]:
from util.model import QA, evaluate, define_metrics

# Create model and move it to the device it exposes as `net.device`
net = QA()
net.to(net.device)
# Optimizer, loss function
optimizer = Adam(net.parameters(), lr=learning_rate, weight_decay=0.01)
#scheduler = StepLR(optimizer, step_size=20, gamma=0.1)
loss_fn = CrossEntropyLoss()
# Number of batches per epoch; only used in the training progress messages
n_iter = len(tr_dataloader)
# Metrics (a mapping; the evaluation code looks up 'F1' and 'ExactMatch')
metrics = define_metrics(net)

### Setup to store log and results

In [None]:
# Root folder under which logs, TensorBoard runs and checkpoints are stored.
homedir = None
if not colab:
    # Local run: keep results inside the working tree
    homedir = './data/logs/'
else:
    # Colab run: mount Google Drive and store results there
    from google.colab import drive
    drive.mount('/content/gdrive')
    homedir = '/content/gdrive/My Drive/QA project/'

In [None]:
# Log paths

# One result folder per UTC day; re-running on the same day reuses it.
utc_string = datetime.now(timezone.utc).strftime("%Y%m%d")
res_dir = homedir+utc_string
# Trailing slashes are kept on purpose: other cells build file paths by
# plain string concatenation with these names.
runs_dir = res_dir+'/runs/'                # TensorBoard event files
logs_dir = res_dir+'/logs/'                # plain-text training log
checkpoints_dir = res_dir+'/checkpoints/'  # saved weights

# os.makedirs also creates missing parent folders (e.g. a fresh
# './data/logs/') and, with exist_ok=True, is a no-op for existing ones.
for directory in (res_dir, runs_dir, logs_dir, checkpoints_dir):
    os.makedirs(directory, exist_ok=True)

# Tensorboard init
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(runs_dir)

## Training

In [None]:
# Open TensorBoard inline on the folder the SummaryWriter writes to.
%reload_ext tensorboard
%tensorboard --logdir "$runs_dir"

In [None]:
#Training function
def train(model, save=True, load=False, filename=None):
    """Train `model` on `tr_dataloader` for `epochs` epochs and return the history.

    The function reads these notebook-level names: `epochs`, `n_iter`,
    `tr_dataloader`, `val_dataloader` (a one-shot iterator), `optimizer`,
    `loss_fn`, `metrics`, `evaluate`, `writer`, `logs_dir`,
    `checkpoints_dir` and `utc_string`.

    Parameters
    ----------
    model : object
        Network to optimise. It must expose `device` and a `dense`
        sub-module; only `dense` is loaded from / saved to checkpoints.
    save : bool
        When True, log to TensorBoard and to `log.txt`, and write a
        checkpoint at every evaluation epoch.
    load : bool
        When True, initialise `model.dense` from `checkpoints_dir + filename`.
    filename : str or None
        Checkpoint file name; required when `load` is True.

    Returns
    -------
    dict
        'loss' maps to the per-iteration loss values (Python floats); every
        key of `metrics` maps to the scores gathered at evaluation epochs.

    Raises
    ------
    ValueError
        If `load` is True but `filename` is not given.
    """
    history = {'loss': []}
    for metric in metrics.keys():
        history[metric] = []

    if load:
        if filename is None:
            raise ValueError('`filename` must be provided when load=True')
        path = checkpoints_dir + filename
        # map_location lets a checkpoint saved on another device be restored
        model.dense.load_state_dict(torch.load(path, map_location=model.device))

    for epoch in range(epochs):
        # Evaluation switches to eval mode, so training mode is restored at
        # the start of every epoch
        model.train()
        loss_value = float('nan')
        for iteration, (tr_X, tr_Y) in enumerate(tr_dataloader):
            # Unpack targets and cast to long, as CrossEntropyLoss expects
            # class indices
            tr_start, tr_end = tr_Y
            tr_start = tr_start.to(device=model.device, dtype=torch.long)
            tr_end = tr_end.to(device=model.device, dtype=torch.long)
            # Forward pass
            optimizer.zero_grad()
            _, tr_input, tr_attention_mask = tr_X
            pred_start, pred_end = model((tr_input, tr_attention_mask))
            # Loss: sum of the start-position and end-position losses
            loss = loss_fn(pred_start, tr_start) + loss_fn(pred_end, tr_end)
            # Gradient update
            loss.backward()
            optimizer.step()

            # Track loss as a plain float: keeping the tensor itself would
            # keep every iteration's autograd graph alive
            loss_value = loss.item()
            history['loss'].append(loss_value)
            if iteration%100 == 0:
                print(f'Epoch: {epoch+1} ({iteration+1}/{n_iter}) | Loss: {loss_value}\n', end='')
            if save:
                writer.add_scalar('Loss/train-iterations', loss_value, epoch*len(tr_dataloader)+iteration)

        # Save logs (loss of the last iteration of the epoch)
        if save:
            with open(logs_dir +'log.txt', 'a+') as f:
                f.write(f"Epoch: {epoch} [Loss: {loss_value}]\n")

        # Evaluation on one validation batch, every third epoch
        if (epoch+1) % 3 == 0:
            print('Checkpoint reached. Starting evaluation...')
            # Set evaluation mode
            model.eval()
            try:
                val_X, val_Y = next(val_dataloader)
            except StopIteration:
                # The validation iterator is one-shot; running out of batches
                # should not abort an otherwise healthy training run
                print('Validation batches exhausted. Skipping evaluation.')
            else:
                _, val_input, val_attention_mask = val_X
                # Compute scores without building an autograd graph
                with torch.no_grad():
                    scores = evaluate(model, (val_input, val_attention_mask), val_Y, metrics)
                # Track scores
                for metric, score in zip(metrics.keys(), scores):
                    history[metric].append(score)
                    if save:
                        writer.add_scalar(f'{metric}/train-epochs', score, epoch)
            if save:
                # Save weights
                print('Saving weights')
                weights_save_path = checkpoints_dir + 'weights' + utc_string + '_' + str(epoch)
                torch.save(model.dense.state_dict(), weights_save_path)

        # Learning Rate scheduler
        #scheduler.step()

    return history


In [None]:
# Run training with the defaults: logs and checkpoints are saved, and no
# checkpoint is loaded beforehand.
history = train(net)

## Evaluation

In [None]:
from util.evaluation import evaluate_model

# `res_dir` is the dated result folder defined in the logging setup cell.
# NOTE(review): train() writes its checkpoints under `checkpoints_dir`, not
# under '<res_dir>/weights/' - confirm this path points at the intended file.
evaluate_model(model=net, dataloader=val_dataloader, weights_path=res_dir+'/weights/weights20220119_39')

## Monte Carlo uncertainty

In [None]:
def montecarlo_uncertainty(input, model, forward_passes, load=False):
  """Estimate predictions and their uncertainty with Monte Carlo dropout.

  The model is run `forward_passes` times with only its `dropout` sub-module
  in training mode; the softmax outputs of the passes are averaged and the
  entropy of the averaged distribution is used as uncertainty estimate.

  Parameters
  ----------
  input : object
      Whatever `model` accepts as its single call argument (the notebook
      passes an `(input_ids, attention_mask)` tuple).
  model : object
      Network exposing a `dropout` sub-module and returning a
      `(start_logits, end_logits)` pair of shape (n_samples, n_classes).
      `dense` and `device` are only needed when `load` is True.
  forward_passes : int
      Number of stochastic forward passes (must be >= 1).
  load : bool
      When True, first load `homedir + 'best_model/best_model'` into
      `model.dense`.

  Returns
  -------
  mean_start, mean_end : torch.Tensor
      Class probabilities averaged over the passes, shape
      (n_samples, n_classes), with the dtype and device of the model output.
  entropy_start, entropy_end : numpy.float64 or numpy.ndarray
      Entropy of the averaged distributions: a scalar when the batch holds a
      single sample, otherwise an array of shape (n_samples,).

  Raises
  ------
  ValueError
      If `forward_passes` is smaller than 1.
  """
  if forward_passes < 1:
    raise ValueError('forward_passes must be at least 1')
  if load:
    path = homedir + 'best_model/best_model'
    model.dense.load_state_dict(torch.load(path, map_location=model.device))

  softmax = torch.nn.Softmax(dim=1)
  start_samples, end_samples = [], []

  # Everything in eval mode except dropout, which must stay stochastic
  model.eval()
  model.dropout.train()
  try:
    with torch.no_grad():
      for _ in range(forward_passes):
        output_start, output_end = model(input)
        start_samples.append(softmax(output_start).cpu())
        end_samples.append(softmax(output_end).cpu())
  finally:
    # Do not leave dropout active for whoever uses the model next
    model.dropout.eval()

  # Average over passes in float64: the tiny epsilon below would underflow
  # to zero in float32 and let log(0) through
  mean_start = torch.stack(start_samples).double().mean(dim=0)  # (n_samples, n_classes)
  mean_end = torch.stack(end_samples).double().mean(dim=0)

  epsilon = sys.float_info.min

  def _entropy(mean_probs):
    probs = mean_probs.numpy()
    values = -np.sum(probs*np.log(probs + epsilon), axis=-1)  # (n_samples,)
    # A single-sample batch yields a scalar, which is what callers that
    # format the score with round() rely on
    return values[0] if values.shape[0] == 1 else values

  entropy_start = _entropy(mean_start)
  entropy_end = _entropy(mean_end)

  mean_start = mean_start.to(device=output_start.device, dtype=output_start.dtype)
  mean_end = mean_end.to(device=output_end.device, dtype=output_end.dtype)
  return mean_start, mean_end, entropy_start, entropy_end


# Compute the Monte Carlo uncertainty for a given number of validation contexts.
# The tokenizer is used to decode predicted and target token ids back to text.
tokenizer=DistilBertTokenizerFast.from_pretrained("distilbert-base-cased-distilled-squad")

def compute_mc_uncertainty(mc_batch_dim, val_dataset, tokenizer, load=False, model=None, forward_passes=5):
  """Print predicted vs. actual answers with a Monte Carlo uncertainty score.

  Parameters
  ----------
  mc_batch_dim : int
      Number of samples to inspect, taken in dataset order. Fewer are
      processed if the dataset is shorter.
  val_dataset : object
      Dataset yielding `((id, input_ids, attention_mask), (start, end))`.
  tokenizer : object
      Tokenizer whose `decode` turns token ids back into text.
  load : bool
      Forwarded to `montecarlo_uncertainty` for the first sample only, so
      the saved weights are read from disk once.
  model : object or None
      Model to evaluate; defaults to the notebook-level `net`.
  forward_passes : int
      Number of stochastic forward passes per sample.
  """
  if model is None:
    model = net
  # Dataloader with batch_size=1: one context per inference
  loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, pin_memory=True)

  # zip() stops at the shorter iterable, so requesting more samples than the
  # dataset holds simply ends early
  for sample_idx, (features, target) in zip(range(mc_batch_dim), loader):
    _, inp, att = features
    mean_start, mean_end, entropy_start, entropy_end = montecarlo_uncertainty(
        (inp, att), model, forward_passes, load=load and sample_idx == 0)
    start_pred = torch.argmax(mean_start, dim=1).item()
    end_pred = torch.argmax(mean_end, dim=1).item()

    # Extract answer from context
    context = inp.detach().cpu().numpy().reshape(-1,)
    target_start, target_end = int(target[0]), int(target[1])
    pred_decoded = tokenizer.decode(context[start_pred:end_pred+1])
    target_decoded = tokenizer.decode(context[target_start:target_end+1])
    uncertainty_score = (entropy_start+entropy_end)/2
    # NOTE(review): 512/e is only a loose upper bound for the entropy of a
    # 512-way distribution (the tight bound is ln(512)); kept as is so that
    # printed scores stay comparable with earlier runs.
    print(f'Predicted answer: {pred_decoded}\nActual answer: {target_decoded}\nUncertainty score on the prediction: {round((uncertainty_score*100)/(512/np.exp(1)), 3)}%\n\n')

In [None]:
# Inspect 30 validation samples (taken in dataset order) with the saved
# best-model weights loaded.
compute_mc_uncertainty(30, val_dataset, tokenizer, load=True)

## Jaccard Index
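Custom evaluation metrics computed on token spans: the percentage of the actual answer covered by the predicted answer, and the Jaccard index (intersection over union) between the predicted and the actual answer.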


In [None]:
def jaccard_accuracy_metric(actual_start, actual_end, predicted_start_array, predicted_end_array):
  """Compare predicted and actual answer spans (inclusive token ranges).

  Parameters
  ----------
  actual_start, actual_end : torch.Tensor
      Integer positions of the actual answers, one per sample.
  predicted_start_array, predicted_end_array : torch.Tensor
      Scores of shape (n_samples, n_positions); the predicted positions are
      their argmax along dimension 1.

  Returns
  -------
  tuple of float
      `(coverage, jaccard)`: the mean percentage (0-100) of actual-answer
      tokens present in the prediction, and the mean intersection-over-union
      (0-1) of the two spans. A sample whose actual span is empty (end before
      start) contributes 0 to coverage, a sample where both spans are empty
      contributes 0 to the Jaccard index, and an empty batch yields
      `(0.0, 0.0)`.
  """
  n_samples = actual_start.size()[0]
  if n_samples == 0:
    return 0.0, 0.0

  pred_start = torch.argmax(predicted_start_array, dim=1).reshape(-1).tolist()
  pred_end = torch.argmax(predicted_end_array, dim=1).reshape(-1).tolist()
  gold_start = actual_start.reshape(-1).tolist()
  gold_end = actual_end.reshape(-1).tolist()

  tot_percentage = 0.0
  jaccard_metric = 0.0
  for a_s, a_e, p_s, p_e in zip(gold_start, gold_end, pred_start, pred_end):
    a_s, a_e, p_s, p_e = int(a_s), int(a_e), int(p_s), int(p_e)
    # Spans are inclusive integer intervals, so sizes follow from the end
    # points without materialising the token lists
    n_actual = max(0, a_e - a_s + 1)
    n_pred = max(0, p_e - p_s + 1)
    n_common = max(0, min(a_e, p_e) - max(a_s, p_s) + 1)
    n_union = n_actual + n_pred - n_common
    if n_union > 0:
      jaccard_metric += n_common/n_union
    if n_actual > 0:
      tot_percentage += n_common/n_actual

  return (tot_percentage/n_samples)*100, jaccard_metric/n_samples

## F1 and Exact Match

In [None]:
def post_train_eval(model, inp, att, target):
    """Run `model` on one batch and score its span predictions.

    Parameters
    ----------
    model : object
        Network called with an `(inp, att)` tuple and returning a
        `(start_scores, end_scores)` pair; must expose `device`.
    inp, att : torch.Tensor
        Inputs and attention mask of the batch.
    target : tuple
        `(start_target, end_target)` tensors.

    Returns
    -------
    tuple
        `(percentage_accuracy, jaccard, start_preds, end_preds,
        start_target, end_target)`, where the first two values come from
        `jaccard_accuracy_metric` and all tensors are on `model.device`.
    """
    # Inference only: no autograd graph is needed for the predictions
    with torch.no_grad():
        start_preds, end_preds = model((inp, att))

    # Unpack targets and send everything to the model device
    start_target, end_target = target
    start_target = start_target.to(model.device)
    end_target = end_target.to(model.device)
    start_preds = start_preds.to(model.device)
    end_preds = end_preds.to(model.device)

    percentage_accuracy, jaccard = jaccard_accuracy_metric(start_target, end_target, start_preds, end_preds)
    return percentage_accuracy, jaccard, start_preds, end_preds, start_target, end_target

def final_evaluation(model, batch_size, full_val_dataset=False):
    """Evaluate the saved best model on validation samples, one at a time.

    Loads `homedir + 'best_model/best_model'` into `model.dense`, runs
    `post_train_eval` on single-sample batches of the notebook-level
    `val_dataset` and reports coverage, Jaccard, F1 and Exact Match (the
    last two taken from the notebook-level `metrics` mapping).

    Parameters
    ----------
    model : object
        Network to evaluate; must expose `dense` and `device`.
    batch_size : int
        Number of samples to evaluate, capped at the dataset size.
    full_val_dataset : bool
        When True, `batch_size` is ignored and every sample is evaluated.

    Returns
    -------
    tuple
        `(f1, em)`: the averages of the start and end F1 scores and of the
        start and end Exact Match scores.

    Raises
    ------
    ValueError
        If there is no sample to evaluate.
    """
    path = homedir + 'best_model/best_model'
    model.dense.load_state_dict(torch.load(path, map_location=model.device))
    # Dataloader with batch_size=1 for per-sample inference
    loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, pin_memory=True)
    # Set evaluation mode
    model.eval()

    n_available = len(loader)
    n_samples = n_available if full_val_dataset else min(batch_size, n_available)
    if n_samples <= 0:
        raise ValueError('No validation samples to evaluate')

    # A prediction counts as correct when it covers at least this percentage
    # of the actual answer
    correct_threshold = 70
    correct_answers_counter = 0
    jaccard_list = []
    start_pred_rows, end_pred_rows = [], []
    start_target_list, end_target_list = [], []

    with torch.no_grad():
        for _, (features, target) in zip(range(n_samples), loader):
            _, inp, att = features
            perc, jaccard, st_pred, end_pred, st_tar, end_tar = post_train_eval(model, inp, att, target)

            if perc >= correct_threshold:
                correct_answers_counter += 1

            jaccard_list.append(jaccard)
            start_pred_rows.append(st_pred.cpu().detach().numpy())
            end_pred_rows.append(end_pred.cpu().detach().numpy())
            start_target_list.append(st_tar.item())
            end_target_list.append(end_tar.item())

    # Stack once at the end; float64 keeps the dtype handed to the metrics
    # unchanged
    start_preds_all = torch.tensor(np.vstack(start_pred_rows).astype(np.float64))
    end_preds_all = torch.tensor(np.vstack(end_pred_rows).astype(np.float64))
    start_targets_all = torch.tensor(np.array(start_target_list))
    end_targets_all = torch.tensor(np.array(end_target_list))

    f1_score = metrics['F1']
    exact_match = metrics['ExactMatch']

    # Get F1 scores
    f1_start = f1_score(start_preds_all, start_targets_all)
    f1_end = f1_score(end_preds_all, end_targets_all)
    f1 = (f1_start + f1_end)/2
    f1 = f1.to('cpu')

    # Get Exact Match scores
    em_start = exact_match(start_preds_all, start_targets_all)
    em_end = exact_match(end_preds_all, end_targets_all)
    em = (em_start + em_end)/2

    print('Evaluation completed.')
    print(f'Correct answers: {correct_answers_counter} out of {n_samples} total')
    # The mean is taken over every evaluated answer, not only the correct ones
    print(f'Average Jaccard index over all evaluated answers: {np.mean(np.array(jaccard_list))}')
    print(f'F1: {f1}, Exact Match: {em}')

    return f1, em

# Evaluate on the whole validation set: with full_val_dataset=True the
# sample count passed as second argument is overridden.
final_evaluation(net, 2000, full_val_dataset=True)

In [None]:
# Show TensorBoard for a specified training run.
# NOTE(review): hard-coded Drive path; the runs written by this notebook go
# to `runs_dir` (homedir + date + '/runs/') - confirm this folder is the
# intended one.
tens_dir = '/content/gdrive/My Drive/QA project/runs/'
%reload_ext tensorboard
%tensorboard --logdir "$tens_dir"