## Imports

In [1]:
#%%capture
# Suppress output (the cell magic above is commented out, so output is currently shown)

# Whether the notebook is run within Google Colab or not
# (detected from the string representation of the active IPython shell)
colab = 'google.colab' in str(get_ipython())

# General imports
import numpy as np
import pandas as pd
import torch
# Install needed dependencies on Colab
# (this must happen before the `transformers` import right below)
if colab:
    !pip install transformers
    !pip install torchmetrics==0.6
from transformers import DistilBertModel#, DistilBertTokenizerFast

# Enable GPU acceleration, whenever available
# (`device` is a plain string, either 'cuda' or 'cpu')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Automatically reimport modules at each execution
%reload_ext autoreload
%autoreload 2

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 65.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 17.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 74.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  A

In [2]:
# On Colab, clone the project repository: the data paths used on Colab
# point inside the resulting './question-answering/' directory
if colab:
    !git clone 'https://github.com/michimichiamo/question-answering'

Cloning into 'question-answering'...
remote: Enumerating objects: 155, done.[K
remote: Counting objects: 100% (142/142), done.[K
remote: Compressing objects: 100% (117/117), done.[K
remote: Total 155 (delta 73), reused 72 (delta 25), pack-reused 13[K
Receiving objects: 100% (155/155), 39.38 MiB | 5.74 MiB/s, done.
Resolving deltas: 100% (76/76), done.


## Read data

In [3]:
# Run this cell only once, to export the JSON dataset to the CSV files below (skip it if they already exist)
# from read_dataset import read_dataset

# dataset = read_dataset(path='training_set.json', validation_set_perc=20)
# train_df = pd.DataFrame(dataset[0], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# train_df.to_csv('train_df.csv')
# val_df = pd.DataFrame(dataset[1], columns=['id', 'title', 'context_id', 'context', 'question', 'start', 'end'])
# val_df.to_csv('val_df.csv')

In [4]:
# Root folder of the project: the cloned repository on Colab, the working directory otherwise
directory = './question-answering/' if colab else './'

# Paths of the pre-tokenized training and validation archives
train_filename = f'{directory}data/tokenized/train.npz'
val_filename = f'{directory}data/tokenized/val.npz'

# Open both archives (training first, validation second)
train_data, val_data = np.load(train_filename), np.load(val_filename)

In [5]:
# Arrays read from each archive, in the order they are unpacked below
_ARRAY_KEYS = ('input_ids', 'attention_mask', 'answer_start', 'answer_end')

# Training split: every array is cast to int32
(train_input_ids, train_attention_mask,
 train_answer_start, train_answer_end) = (train_data[key].astype('int32') for key in _ARRAY_KEYS)

# Validation split: same arrays, same cast
(val_input_ids, val_attention_mask,
 val_answer_start, val_answer_end) = (val_data[key].astype('int32') for key in _ARRAY_KEYS)

## Network

In [6]:
class QA(torch.nn.Module):
    """Question-answering head on top of a frozen DistilBERT encoder.

    The pretrained encoder has its gradients disabled; only the final linear
    layer, which maps every token representation to ``num_labels`` scores,
    is trainable.

    Args:
        hidden_size: Size of the encoder token representations fed to the
            linear layer.
        num_labels: Number of scores produced per token. ``forward`` unpacks
            exactly two of them (start and end).
        dropout_rate: Dropout probability applied to the encoder output.
    """

    def __init__(self, hidden_size=768, num_labels=2, dropout_rate=0.5):
        super(QA, self).__init__()
        # Initial placement: GPU whenever available, CPU otherwise
        initial_device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Parameters
        self.hidden_size = hidden_size
        self.num_labels = num_labels

        # Layers
        self.transformers = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad').to(initial_device)
        # Freeze the encoder: its weights receive no gradient
        self.transformers.requires_grad_(False)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.dense = torch.nn.Linear(self.hidden_size, self.num_labels, device=initial_device, dtype=torch.float32)

    @property
    def device(self):
        """Device currently holding the trainable layer.

        Computed from the parameters rather than stored at construction time,
        so it stays correct after the module is moved with ``.to()``,
        ``.cpu()`` or ``.cuda()``. Returned as a ``torch.device``, which is
        accepted wherever the former device string was passed to ``.to()``.
        """
        return self.dense.weight.device

    def forward(self, inputs):
        """Compute start and end logits for a batch.

        Args:
            inputs: Pair ``(input_ids, attention_mask)`` of tensors; both are
                moved to the module's device before use.

        Returns:
            Tuple ``(start_logits, end_logits)``; each keeps the leading
            dimensions of the encoder output with the label dimension
            squeezed out, i.e. ``(batch, seq_len)``.
        """
        # Unpack inputs
        input_ids, attention_mask = inputs

        # Put inputs where the weights currently live
        target_device = self.device
        input_ids = input_ids.to(target_device)
        attention_mask = attention_mask.to(target_device)

        # Transformers: element 0 of the output holds the per-token hidden states
        transformed = self.transformers(input_ids=input_ids, attention_mask=attention_mask)
        # Dropout
        dropped = self.dropout(transformed[0])
        # Obtain logits
        logits = self.dense(dropped)  # (batch, seq_len, hidden_size) -> (batch, seq_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)  # two tensors of shape (batch, seq_len, 1)
        # Drop the trailing singleton dimension -> (batch, seq_len)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)

In [7]:
class Dataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset pairing tokenized inputs with answer targets."""

    def __init__(self, input_ids, attention_masks, answer_starts, answer_ends):
        """Keep references to the four collections, which are read with a shared index."""
        self.input_ids, self.attention_masks = input_ids, attention_masks
        self.answer_starts, self.answer_ends = answer_starts, answer_ends

    def __len__(self):
        """Return the number of samples, taken from ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return ``((input_id, attention_mask), (answer_start, answer_end))`` for ``index``."""
        # Network inputs for the requested sample
        features = (self.input_ids[index], self.attention_masks[index])
        # Matching targets
        targets = (self.answer_starts[index], self.answer_ends[index])
        return features, targets

In [8]:
#@title Hyperparameters
# The `#@title` / `#@param` annotations are Colab form markers; outside Colab they are plain comments.
# batch_size: number of samples per batch, passed to both DataLoaders
batch_size = 128 #@param ["32", "64", "128", "256"] {type:"raw"}
# learning_rate: step size passed to the Adam optimizer
learning_rate = 0.001 #@param ["0.00001", "0.0001", "0.001", "0.01", "0.1", "1"] {type:"raw"}
# epochs: number of passes over the training DataLoader
epochs = 5 #@param {type:"slider", min:5, max:200, step:5}


In [9]:
# Wrap the training arrays in a map-style dataset
train_dataset = Dataset(train_input_ids, train_attention_mask, train_answer_start, train_answer_end)
# Reshuffle at every epoch: without it, each epoch replays the very same batches in the
# very same order, which ties the gradient updates to the storage order of the samples
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)#, num_workers=2, pin_memory=True)

In [10]:
def _cycle_batches(loader):
    """Yield batches from ``loader`` indefinitely, restarting it after each full pass.

    Nothing is cached: the loader is simply iterated again once exhausted.
    Iteration stops only when a pass produces no batch at all, so an empty
    loader cannot cause an endless loop.
    """
    while True:
        produced_any = False
        for batch in loader:
            produced_any = True
            yield batch
        if not produced_any:
            return


# Wrap the validation arrays in a map-style dataset
val_dataset = Dataset(val_input_ids, val_attention_mask, val_answer_start, val_answer_end)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)#, num_workers=2, pin_memory=True)
# Validation batches are drawn one at a time with `next(val_dataloader)`.
# A bare `iter(loader)` is exhausted after a single pass and then raises StopIteration,
# whereas this generator restarts from the first batch, so any number of draws is supported.
val_dataloader = _cycle_batches(val_loader)

In [11]:
import gc

# Remove the notebook-level names of the raw arrays now that the datasets are built.
# NOTE(review): train_dataset / val_dataset were constructed from these same arrays and
# store them as attributes, so this presumably releases little memory — confirm.
del train_input_ids, train_attention_mask, train_answer_start, train_answer_end
del val_input_ids, val_attention_mask, val_answer_start, val_answer_end

# Force a garbage-collection pass; its return value is what the cell displays
gc.collect()

429

In [12]:
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

# Number of training batches per epoch
n_iter = len(train_dataloader)

# Instantiate the network and place it on the device it reports
net = QA()
net.to(net.device)

# Criterion, then optimizer (Adam with weight decay)
loss_fn = CrossEntropyLoss()
optimizer = Adam(net.parameters(), weight_decay=0.01, lr=learning_rate)

Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased-distilled-squad were not used when initializing DistilBertModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
import torchmetrics
from torchmetrics import AveragePrecision, F1

# Number of classes for both metrics: the encoder's maximum number of positions
n_positions = net.transformers.config.max_position_embeddings

# Build each score and send it to the device in one step
f1_score = F1(num_classes=n_positions, mdmc_average='global').to(device)
average_precision = AveragePrecision(pos_label=1, num_classes=n_positions).to(device)

def _one_hot_argmax(scores):
    """Return an int16 tensor shaped like ``scores`` with a single 1 per row, at the row's argmax."""
    one_hot = torch.zeros_like(scores, dtype=torch.int16)
    # Build the row index directly on the scores' device instead of on the host
    rows = torch.arange(scores.size(0), device=scores.device)
    one_hot[rows, torch.argmax(scores, dim=1)] = 1
    return one_hot


def evaluate(model, inputs, targets):
    """Score ``model`` on one batch with the notebook-level F1 and average-precision metrics.

    The model is switched to evaluation mode (and left in it) and the whole
    computation runs with gradient tracking disabled.

    Args:
        model: Network exposing a ``device`` attribute and returning a pair
            ``(start_scores, end_scores)`` when called on ``inputs``.
        inputs: Batch inputs, forwarded to ``model`` unchanged.
        targets: Pair ``(start_targets, end_targets)``. They are compared
            element-wise with the one-hot predictions for F1 and reduced with
            ``argmax`` along dimension 1 for average precision.

    Returns:
        Tuple ``(f1, avg)`` on the CPU, each being the sum of the start and
        end scores.
    """
    # Set evaluation mode
    model.eval()
    # Unpack targets
    start_targets, end_targets = targets

    # No gradient is needed for evaluation: skip autograd bookkeeping altogether
    with torch.no_grad():
        # Obtain predictions
        start_preds, end_preds = model(inputs)
        # Send targets to the model's device
        start_targets = start_targets.to(model.device)
        end_targets = end_targets.to(model.device)

        # One-hot integer predictions (created on the same device as the raw scores)
        start_out = _one_hot_argmax(start_preds)
        end_out = _one_hot_argmax(end_preds)

        # Get F1 scores
        f1_start = f1_score(start_out, start_targets)
        f1_end = f1_score(end_out, end_targets)
        f1 = f1_start + f1_end

        # Get Average Precision scores
        avg_start = average_precision(start_out, torch.argmax(start_targets, axis=1))
        avg_end = average_precision(end_out, torch.argmax(end_targets, axis=1))
        avg = avg_start + avg_end

    print(f'f1 score: {f1:.10f}')
    print(f'average precision: {avg:.5f}')
    return f1.to('cpu'), avg.to('cpu')




In [14]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard event writer; logs go to the local "log/" directory
writer = SummaryWriter("log/")

In [None]:
# Per-iteration training loss (Python floats) and per-epoch validation scores
loss_history = []
f1_history = []
avg_prec_history = []

# Number of batches per epoch, used to build a step counter that keeps growing across epochs
steps_per_epoch = len(train_dataloader)

for epoch in range(epochs):
    # evaluate() leaves the network in eval mode: switch back to training mode at every epoch
    net.train()
    for iteration, (train_inputs, train_targets) in enumerate(train_dataloader):
        # Unpack targets, move them to the device and cast to float
        # (`.to` converts the existing tensors; `torch.tensor(tensor)` is a copy-construct that warns)
        start_targets, end_targets = train_targets
        start_targets = start_targets.to(device=device, dtype=torch.float32)
        end_targets = end_targets.to(device=device, dtype=torch.float32)
        # Forward pass
        optimizer.zero_grad()
        start_out, end_out = net(train_inputs)
        # Loss function
        # NOTE(review): the targets are float tensors; this presumably relies on CrossEntropyLoss
        # accepting per-class probabilities as targets — the original "TOCHECK" is still open
        loss = loss_fn(start_out, start_targets) + loss_fn(end_out, end_targets)
        # Gradient update
        loss.backward()
        optimizer.step()

        # Track loss as a plain float so that no tensor is kept alive by the history,
        # with a step index that does not restart at 0 on every epoch
        loss_value = loss.item()
        global_step = epoch * steps_per_epoch + iteration
        writer.add_scalar('Loss/train', loss_value, global_step)
        loss_history.append(loss_value)

    # Validation on a single batch at the end of every epoch
    net.eval()
    val_batch = next(val_dataloader, None)
    if val_batch is None:
        # Do not lose a finished epoch because the validation iterator ran dry
        print(f'epoch {epoch + 1}/{epochs}: no validation batch left, skipping evaluation')
        continue
    val_inputs, val_targets = val_batch
    f1, avg_prec = evaluate(net, val_inputs, val_targets)
    f1_history.append(f1)
    avg_prec_history.append(avg_prec)
    # These scores come from a validation batch, hence the '/val' tags
    writer.add_scalar('Accuracy/val', avg_prec, epoch)
    writer.add_scalar('F1/val', f1, epoch)




In [None]:
#pip install tensorboard
# (Re)load the TensorBoard notebook extension, then start TensorBoard on the "log" directory
%reload_ext tensorboard
%tensorboard --logdir log

In [None]:
from tensorboard import notebook
notebook.list() # View open TensorBoard instances



# Control TensorBoard display. If no port is provided,
# the most recently launched TensorBoard is used
# (here the port is given explicitly: 6006, with height=1000)
notebook.display(port=6006, height=1000)