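This is an extractive QA system obtained by fine-tuning a pretrained BERT-style encoder (described here as bert_base_uncased; note that the experiment selected below is `distilroberta_extra_linear`). Some code is adapted from https://github.com/alexpod1000/SQuAD-QA.git.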

In [None]:
# Run the following cells only if using Colab
if 'google.colab' in str(get_ipython()):
    # Clone repository
    !git clone https://github.com/alexpod1000/SQuAD-QA.git
    # Change current working directory to match project
    %cd SQuAD-QA/
    !pwd

    !pip install transformers

Cloning into 'SQuAD-QA'...
remote: Enumerating objects: 406, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 406 (delta 12), reused 17 (delta 6), pack-reused 372[K
Receiving objects: 100% (406/406), 9.16 MiB | 7.34 MiB/s, done.
Resolving deltas: 100% (237/237), done.
/content/SQuAD-QA/SQuAD-QA
/content/SQuAD-QA/SQuAD-QA


In [None]:
# External imports
import copy
import nltk
import numpy as np
import pandas as pd
import string
import torch
import json

from functools import partial
from nltk.tokenize import TreebankWordTokenizer, SpaceTokenizer
from transformers import AutoTokenizer
from typing import Tuple, List, Dict, Any, Union

# Project imports
from squad_data.parser import SquadFileParser
from squad_data.utils import build_mappers_and_dataframe_bert
from models import possible_models_dict
from evaluation.evaluate import evaluate_predictions
from evaluation.utils import build_evaluation_dict_bert
from utils import split_dataframe

[RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692)
The research paper describing RoBERTa, an improved variant of the BERT model.

In [None]:
# Key into possible_models_dict selecting which experiment configuration to run
current_selected_experiment = "distilroberta_extra_linear"
# Configuration of that experiment; later cells read "tokenizer_url",
# "tokenizer_max_length", "span_model" and "train_params" from it
params_dict = possible_models_dict[current_selected_experiment]

### Parse the JSON files and load the data (SQuAD v1.1)

In [None]:
# Paths (relative to the project root) of the training and test JSON files
train_file_json = "squad_data/data/training_set.json"
test_file_json = "squad_data/data/dev-v1.1.json"

# One parser per file
train_parser = SquadFileParser(train_file_json)
test_parser = SquadFileParser(test_file_json)

# Parsed documents; fed to build_mappers_and_dataframe_bert in later cells
train_data = train_parser.parse_documents()
test_data = test_parser.parse_documents()

### Prepare the mappers and dataframe

In [None]:
def bert_tokenizer_fn(question, paragraph, tokenizer, max_length=384, doc_stride=128):
    """Tokenize a (question, paragraph) pair with the given tokenizer.

    The tokenizer is called with the question as first sequence and the
    paragraph as second sequence, padding to ``max_length`` and requesting
    overflowing tokens (with a stride of ``doc_stride``) and offset mappings.

    Args:
        question: Question text, passed as the first sequence.
        paragraph: Paragraph text, passed as the second sequence.
        tokenizer: Callable tokenizer exposing a ``padding_side`` attribute.
        max_length: Value forwarded as ``max_length``.
        doc_stride: Value forwarded as ``stride``.

    Returns:
        Whatever the tokenizer call returns.
    """
    # Truncate the second sequence when the tokenizer pads on the right,
    # otherwise the first one.
    # NOTE(review): the argument order is not swapped for left-padding
    # tokenizers, so "only_first" would truncate the question — confirm intended.
    if tokenizer.padding_side == "right":
        truncation_mode = "only_second"
    else:
        truncation_mode = "only_first"

    return tokenizer(
        question,
        paragraph,
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

In [None]:
# Load the tokenizer named by the selected experiment's configuration
tokenizer = AutoTokenizer.from_pretrained(params_dict["tokenizer_url"])
# Two pre-bound tokenizer callables: the preprocessing one uses a max_length 3 tokens
# shorter than the training one.
# NOTE(review): the -3 presumably leaves headroom for special tokens — confirm.
tokenizer_fn_preprocess = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=params_dict["tokenizer_max_length"]-3)
tokenizer_fn_train = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=params_dict["tokenizer_max_length"])

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Build the paragraph mapper and the question/answer dataframe from the parsed training data
paragraphs_mapper, df = build_mappers_and_dataframe_bert(tokenizer, tokenizer_fn_preprocess, train_data, limit_answers=1)
# Show the first mapped paragraph, then the first dataframe rows
print(paragraphs_mapper[next(iter(paragraphs_mapper))])
df.head()

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


Unnamed: 0,doc_id,paragraph_id,question_id,answer_id,answer_start,answer_text,question_text,tokenizer_answer_start,tokenizer_answer_end
0,0,0_0,5733be284776f41900661182,0,515,Saint Bernadette Soubirous,To whom did the Virgin Mary allegedly appear i...,135,143
1,0,0_0,5733be284776f4190066117f,0,188,a copper statue of Christ,What is in front of the Notre Dame Main Building?,54,59
2,0,0_0,5733be284776f41900661180,0,279,the Main Building,The Basilica of the Sacred heart at Notre Dame...,81,84
3,0,0_0,5733be284776f41900661181,0,381,a Marian place of prayer and reflection,What is the Grotto at Notre Dame?,96,103
4,0,0_0,5733be284776f4190066117e,0,92,a golden statue of the Virgin Mary,What sits on top of the Main Building at Notre...,35,42


In [None]:
# Split the dataframe into training and validation parts (train_ratio=0.9)
df_train, df_val = split_dataframe(df, train_ratio=0.9)

In [None]:
# Report the size of the full dataframe and of each split
print(f"Total samples: {len(df)}, Train samples: {len(df_train)}, Validation samples: {len(df_val)}")

Total samples: 88579, Train samples: 80992, Validation samples: 7587


### DataConverter and CustomQADataset

In [None]:
from data_loading.utils import bert_padder_collate_fn
from data_loading.qa_dataset import CustomQADatasetBERT

# Smoke test: build a dataset/loader over the training split and inspect one batch
datasetQA = CustomQADatasetBERT(tokenizer_fn_train, df_train, paragraphs_mapper)
data_loader = torch.utils.data.DataLoader(datasetQA, collate_fn = bert_padder_collate_fn, batch_size=10, shuffle=True)

test_batch = next(iter(data_loader))
# Shapes of the token ids and of the ground-truth labels (columns 0/1 are read as
# span start/end by the training code)
print(test_batch["input_ids"].shape)
print(test_batch["y_gt"].shape)

torch.Size([10, 384])
torch.Size([10, 2])


# Model train

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers

from timeit import default_timer as timer
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup
from transformers.optimization import AdamW

from models.utils import SpanExtractor

In [None]:
# Whether to enable Automatic Mixed Precision (passed to the GradScaler created below)
use_amp = True
# Prefer the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"The device is {device}")
print(f"Automatic Mixed Precision Enabled: {use_amp}")

The device is cuda
Automatic Mixed Precision Enabled: True


Model:

(input_ids, attention_mask) -> (answer_start, answer_end) // for each token in input_ids

In [None]:
def train_step(model, scaler, optimizer, loss_function, dataloader, scheduler=None, device="cpu", show_progress=False):
    """Run one training pass over ``dataloader`` and return batch-averaged metrics.

    Args:
        model: Span model; put in train mode and called with the whole batch
            dict, returning a pair ``(start_scores, end_scores)``.
        scaler: Gradient scaler used for the backward pass and optimizer step;
            its ``is_enabled()`` flag also switches autocast on or off.
        optimizer: Optimizer stepped (through the scaler) once per batch.
        loss_function: Callable ``(scores, targets) -> loss`` applied separately
            to the start and end predictions; the two losses are averaged.
        dataloader: Iterable of batch dicts containing a ``"y_gt"`` tensor whose
            columns 0 and 1 are the ground-truth start and end indexes.
        scheduler: Optional LR scheduler, stepped once per batch.
        device: Device the ground-truth indexes are moved to.
        show_progress: Wrap the dataloader in ``tqdm`` when True.

    Returns:
        dict with ``"loss"``, ``"accuracy_start"``, ``"accuracy_end"`` (means of
        the per-batch values) and ``"time"`` (elapsed seconds). The three means
        are NaN when the dataloader yields no batch.
    """
    acc_loss = 0
    acc_start_accuracy = 0
    acc_end_accuracy = 0
    count = 0

    time_start = timer()

    model.train()
    wrapped_dataloader = tqdm(dataloader) if show_progress else dataloader
    for batch in wrapped_dataloader:
        # NOTE: we'll pass directly the batch dict to the model for inputs.
        answer_spans_start = batch["y_gt"][:, 0]
        answer_spans_end = batch["y_gt"][:, 1]
        # Clear gradients
        model.zero_grad()
        # Place to right device
        answer_spans_start = answer_spans_start.to(device)
        answer_spans_end = answer_spans_end.to(device)
        # Use Automatic Mixed Precision if enabled
        with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):
            # Run forward pass
            pred_answer_start_scores, pred_answer_end_scores = model(batch)
            # Mean of the start-index loss and the end-index loss
            loss = (loss_function(pred_answer_start_scores, answer_spans_start) + loss_function(pred_answer_end_scores, answer_spans_end))/2.0
        scaler.scale(loss).backward()
        # Optimizer step (via scaler)
        scaler.step(optimizer)
        scaler.update()
        # Update LR scheduler
        # NOTE(review): the scheduler advances even on iterations where the scaler
        # skipped the optimizer step — confirm this is intended.
        if scheduler is not None:
            scheduler.step()
        # --- Compute metrics ---
        # Get span indexes
        pred_span_start_idxs, pred_span_end_idxs = SpanExtractor.extract_most_probable(pred_answer_start_scores, pred_answer_end_scores)
        gt_start_idxs = answer_spans_start.cpu().detach()
        gt_end_idxs = answer_spans_end.cpu().detach()
        # Fraction of exactly matching start / end indexes in this batch
        start_accuracy = torch.sum(gt_start_idxs == pred_span_start_idxs) / len(pred_span_start_idxs)
        end_accuracy = torch.sum(gt_end_idxs == pred_span_end_idxs) / len(pred_span_end_idxs)
        # Gather stats
        acc_loss += loss.item()
        acc_start_accuracy += start_accuracy.item()
        acc_end_accuracy += end_accuracy.item()
        count += 1
    time_end = timer()

    if count == 0:
        # No batch was processed: report NaN instead of dividing by zero
        mean_loss = mean_start_accuracy = mean_end_accuracy = float("nan")
    else:
        mean_loss = acc_loss / count
        mean_start_accuracy = acc_start_accuracy / count
        mean_end_accuracy = acc_end_accuracy / count
    return {
        "loss": mean_loss,
        "accuracy_start": mean_start_accuracy,
        "accuracy_end": mean_end_accuracy,
        "time": time_end - time_start
    }

In [None]:
@torch.no_grad()
def validation_step(model, scaler, loss_function, dataloader, device="cpu", show_progress=False):
    """Evaluate ``model`` over ``dataloader`` without gradients and return batch-averaged metrics.

    Args:
        model: Span model; put in eval mode and called with the whole batch
            dict, returning a pair ``(start_scores, end_scores)``.
        scaler: Gradient scaler; only its ``is_enabled()`` flag is read, to
            switch autocast on or off.
        loss_function: Callable ``(scores, targets) -> loss`` applied separately
            to the start and end predictions; the two losses are averaged.
        dataloader: Iterable of batch dicts containing a ``"y_gt"`` tensor whose
            columns 0 and 1 are the ground-truth start and end indexes.
        device: Device the ground-truth indexes are moved to.
        show_progress: Wrap the dataloader in ``tqdm`` when True.

    Returns:
        dict with ``"loss"``, ``"accuracy_start"``, ``"accuracy_end"`` (means of
        the per-batch values) and ``"time"`` (elapsed seconds). The three means
        are NaN when the dataloader yields no batch, so an empty validation
        split does not abort a training run.
    """
    acc_loss = 0
    acc_start_accuracy = 0
    acc_end_accuracy = 0
    count = 0

    time_start = timer()
    wrapped_dataloader = tqdm(dataloader) if show_progress else dataloader

    model.eval()
    for batch in wrapped_dataloader:
        answer_spans_start = batch["y_gt"][:, 0]
        answer_spans_end = batch["y_gt"][:, 1]
        # Place to right device
        answer_spans_start = answer_spans_start.to(device)
        answer_spans_end = answer_spans_end.to(device)
        # Use Automatic Mixed Precision if enabled
        with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):
            # Run forward pass
            pred_answer_start_scores, pred_answer_end_scores = model(batch)
            # Mean of the start-index loss and the end-index loss
            loss = (loss_function(pred_answer_start_scores, answer_spans_start) + loss_function(pred_answer_end_scores, answer_spans_end))/2.0
        # --- Compute metrics ---
        # Get span indexes
        pred_span_start_idxs, pred_span_end_idxs = SpanExtractor.extract_most_probable(pred_answer_start_scores, pred_answer_end_scores)
        gt_start_idxs = answer_spans_start.cpu().detach()
        gt_end_idxs = answer_spans_end.cpu().detach()
        # Fraction of exactly matching start / end indexes in this batch
        start_accuracy = torch.sum(gt_start_idxs == pred_span_start_idxs) / len(pred_span_start_idxs)
        end_accuracy = torch.sum(gt_end_idxs == pred_span_end_idxs) / len(pred_span_end_idxs)
        # Gather stats
        acc_loss += loss.item()
        acc_start_accuracy += start_accuracy.item()
        acc_end_accuracy += end_accuracy.item()
        count += 1
    time_end = timer()

    if count == 0:
        # No batch was processed: report NaN instead of dividing by zero
        mean_loss = mean_start_accuracy = mean_end_accuracy = float("nan")
    else:
        mean_loss = acc_loss / count
        mean_start_accuracy = acc_start_accuracy / count
        mean_end_accuracy = acc_end_accuracy / count
    return {
        "loss": mean_loss,
        "accuracy_start": mean_start_accuracy,
        "accuracy_end": mean_end_accuracy,
        "time": time_end - time_start
    }

In [None]:
def get_params_for_optimizer(model, no_decay, weight_decay=0.0001):
    """Split the model's parameters into two optimizer parameter groups.

    Args:
        model: Object exposing ``named_parameters()``.
        no_decay: Iterable of substrings; a parameter whose name contains any
            of them is exempt from weight decay.
        weight_decay: Decay value assigned to the non-exempt group.

    Returns:
        A two-element list of dicts with keys ``'params'`` and
        ``'weight_decay'``: first the decayed parameters, then the exempt ones
        (with ``weight_decay`` 0.0). Parameter order follows
        ``named_parameters()``.
    """
    decayed_params = []
    exempt_params = []
    for param_name, param in model.named_parameters():
        if any(marker in param_name for marker in no_decay):
            exempt_params.append(param)
        else:
            decayed_params.append(param)

    return [
        {'params': decayed_params, 'weight_decay': weight_decay},
        {'params': exempt_params, 'weight_decay': 0.0},
    ]

In [None]:
# Define baseline model
# NOTE(review): 768 and 2 are passed positionally to the span-model constructor; presumably the
# encoder hidden size and the number of per-token outputs (start/end) — confirm against models/.
model = params_dict["span_model"](768, 2, params_dict, dropout_rate=params_dict["train_params"]["dropout_rate"]).to(device)

# Substrings of parameter names that are exempt from weight decay
# (every other parameter gets the configured decay)
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
if params_dict["train_params"]["weight_decay"] > 0.0:
    model_params_optimizer = get_params_for_optimizer(model, no_decay, weight_decay=params_dict["train_params"]["weight_decay"])
else:
    # No decay configured: hand all parameters to the optimizer as a single group
    model_params_optimizer = model.parameters()

# Define optimizer
optimizer = AdamW(
    model_params_optimizer,
    lr=params_dict["train_params"]["initial_lr"],
    correct_bias=False
)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Number of optimizer steps for the LR scheduler: one per batch, every epoch.
# The last batch of an epoch may be partial, so the per-epoch count is rounded up;
# rounding down would make the schedule reach its end before training does.
batch_size_train = params_dict["train_params"]["batch_size_train"]
steps_per_epoch = (len(df_train) + batch_size_train - 1) // batch_size_train
num_train_steps = steps_per_epoch * params_dict["train_params"]["epochs"]

num_warmup_steps = int(num_train_steps * 0.1) # 10% of warmup steps

# LR scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_steps
)

In [None]:
# Loss applied separately to the start and end predictions in train/validation steps
loss_function = nn.CrossEntropyLoss()
# Gradient scaler for mixed precision; enabled according to use_amp
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

In [None]:
# Datasets over the training and validation splits; both share the training tokenizer
# callable and the paragraph mapper built from the training data
dataset_train_QA = CustomQADatasetBERT(tokenizer_fn_train, df_train, paragraphs_mapper)
dataset_val_QA = CustomQADatasetBERT(tokenizer_fn_train, df_val, paragraphs_mapper)

In [None]:
# Training batches are reshuffled every epoch
train_data_loader = torch.utils.data.DataLoader(
    dataset_train_QA, collate_fn = bert_padder_collate_fn,
    batch_size=params_dict["train_params"]["batch_size_train"], shuffle=True
)
# Validation needs no shuffling: a fixed order keeps the batch composition, and
# therefore the batch-averaged metrics, reproducible from run to run
val_data_loader = torch.utils.data.DataLoader(
    dataset_val_QA, collate_fn = bert_padder_collate_fn,
    batch_size=params_dict["train_params"]["batch_size_val"], shuffle=False
)

In [None]:
# Per-epoch metric history, one list per (split, metric) pair
history = {
    f"{split_name}_{metric_name}": []
    for split_name in ("train", "val")
    for metric_name in ("loss", "acc_start", "acc_end")
}
loop_start = timer()
for epoch in range(params_dict["train_params"]["epochs"]):
    # One pass over the training data, then one over the validation data
    train_dict = train_step(
        model, scaler, optimizer, loss_function, train_data_loader,
        scheduler=scheduler, device=device, show_progress=True,
    )
    val_dict = validation_step(
        model, scaler, loss_function, val_data_loader,
        device=device, show_progress=True,
    )
    # Learning rate of the first parameter group after this epoch
    cur_lr = optimizer.param_groups[0]['lr']
    print(f'Epoch: {epoch}, '
          f'lr: {cur_lr}, '
          f'Train loss: {train_dict["loss"]:.4f}, '
          f'Train acc start: {train_dict["accuracy_start"]:.4f}, '
          f'Train acc end: {train_dict["accuracy_end"]:.4f}, '
          f'Val loss: {val_dict["loss"]:.4f}, '
          f'Val acc start: {val_dict["accuracy_start"]:.4f}, '
          f'Val acc end: {val_dict["accuracy_end"]:.4f}, '
          f'Time: {train_dict["time"]:.4f}')
    # Record this epoch's metrics for both splits
    for split_name, split_metrics in (("train", train_dict), ("val", val_dict)):
        history[f"{split_name}_loss"].append(split_metrics["loss"])
        history[f"{split_name}_acc_start"].append(split_metrics["accuracy_start"])
        history[f"{split_name}_acc_end"].append(split_metrics["accuracy_end"])
loop_end = timer()
print(f"Elapsed time: {(loop_end - loop_start):.4f}")

100%|██████████| 10124/10124 [22:08<00:00,  7.62it/s]
100%|██████████| 949/949 [00:37<00:00, 25.06it/s]


Epoch: 0, lr: 1.666593503072871e-05, Train loss: 1.4321, Train acc start: 0.6053, Train acc end: 0.6267, Val loss: 1.0005, Val acc start: 0.6949, Val acc end: 0.7388, Time: 1329.0028


100%|██████████| 10124/10124 [22:00<00:00,  7.66it/s]
100%|██████████| 949/949 [00:37<00:00, 25.42it/s]

Epoch: 1, lr: 0.0, Train loss: 0.8309, Train acc start: 0.7326, Train acc end: 0.7641, Val loss: 0.9437, Val acc start: 0.7118, Val acc end: 0.7541, Time: 1320.9023
Elapsed time: 2725.1345





In [None]:
# Save the fine-tuned weights to disk (comment out the line below to skip saving)
torch.save(model.state_dict(), "distilroberta_extralinear_google_2_epochs.pt")

# Evaluation

In [None]:
# Uncomment the line below to load previously saved weights instead of training
# (the path should point to a checkpoint written by the save cell above)
#model.load_state_dict(torch.load("distilroberta_extralinear_google_2_epochs.pt"))

## Quantitative evaluation

In [None]:
# Build the paragraph mapper and dataframe for the test data, the same way as for training
test_paragraphs_mapper, test_df = build_mappers_and_dataframe_bert(tokenizer, tokenizer_fn_preprocess, test_data, limit_answers=1)

In [None]:
# Dataset and loader over the test split
dataset_test_QA = CustomQADatasetBERT(tokenizer_fn_train, test_df, test_paragraphs_mapper)
# Evaluation does not benefit from shuffling; iterate in a fixed, reproducible order
test_data_loader = torch.utils.data.DataLoader(
    dataset_test_QA, collate_fn = bert_padder_collate_fn,
    batch_size=params_dict["train_params"]["batch_size_test"], shuffle=False
)

In [None]:
# Load the raw test JSON, which the evaluation function takes alongside the predictions
with open(test_file_json, "r") as f:
    dataset_json = json.load(f)
# Run the model over the test loader to build the predictions, then score them
pred_dict = build_evaluation_dict_bert(model, scaler, test_data_loader, test_paragraphs_mapper, tokenizer, device, show_progress=True)
eval_results = evaluate_predictions(dataset_json, pred_dict)
print(eval_results)

100%|██████████| 1349/1349 [00:54<00:00, 24.88it/s]


{
  "exact": 80.74739829706716,
  "f1": 87.88149012937704,
  "total": 10570,
  "HasAns_exact": 80.74739829706716,
  "HasAns_f1": 87.88149012937704,
  "HasAns_total": 10570
}


## Simple qualitative evaluation

In [None]:
def get_answer_span_helper(context, question, model, tokenizer_fn, tokenizer, device="cpu"):
    """Predict the answer text for a single (context, question) pair.

    Args:
        context: Paragraph text in which the answer is searched.
        question: Question text.
        model: Span model; called with a dict holding ``"input_ids"`` and
            ``"attention_mask"`` tensors and returning (start_scores, end_scores).
        tokenizer_fn: Callable ``(question, context)`` returning the tokenized
            input with ``"input_ids"`` and ``"attention_mask"`` entries.
        tokenizer: Tokenizer used to decode the predicted token span.
        device: Device the input tensors are moved to.

    Returns:
        The decoded text of tokens ``[start:end]`` of the first tokenized
        window, with special tokens skipped.

    Note:
        ``.item()`` is called on the predicted indexes, so this only works
        when a single span is predicted (one tokenized window).
    """
    tokenized_input = tokenizer_fn(question, context)
    # Inference only: disable dropout and skip autograd bookkeeping
    model.eval()
    with torch.no_grad():
        output_span = model({
            "input_ids": torch.tensor(tokenized_input["input_ids"]).to(device),
            "attention_mask": torch.tensor(tokenized_input["attention_mask"]).to(device)
        })
    start, end = SpanExtractor.extract_most_probable(output_span[0], output_span[1])
    start = start.item()
    end = end.item()
    return tokenizer.decode(tokenized_input["input_ids"][0][start:end], skip_special_tokens=True)

In [None]:
context = "This is a test message, written to see if our model can correctly predict its outputs."
question = "Who needs to predict its outputs?"
# Use the device selected earlier in the notebook instead of assuming CUDA is available
pred_answer = get_answer_span_helper(context, question, model, tokenizer_fn_train, tokenizer, device=device)
print(pred_answer)

 our model


In [None]:
context = "This is a test message, I had an apple for dinner."
question = "What did I eat?"
# Use the device selected earlier in the notebook instead of assuming CUDA is available
pred_answer = get_answer_span_helper(context, question, model, tokenizer_fn_train, tokenizer, device=device)
print(pred_answer)

 apple
