**Purpose:**

Fine tuned DistilBERT base model **only** on the Covid-QA dataset. 

This model was **not** finetuned on SQuAD for any question answering task prior to this training.

In [None]:
!pip install transformers
import time
import os
import contextlib
import torch
import nltk
nltk.download('punkt')

from google.colab import drive
drive.mount('/content/drive')

In [2]:
#!wget http://participants-area.bioasq.org/MRQA2019/ -P '/content/drive/My Drive/colab_files/data/BioASQ/'

In [3]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('/content/drive/My Drive/colab_files/data/Covid-QA/Covid-QA-train.json')
val_contexts, val_questions, val_answers = read_squad('/content/drive/My Drive/colab_files/data/Covid-QA/Covid-QA-val.json')

In [4]:
#s = read_squad('/content/drive/My Drive/colab_files/data/Covid-QA/Covid-QA.json')

In [6]:
train_contexts[34]

'Controlled efficacy trial confirming toltrazuril resistance in a field isolate of ovine Eimeria spp.\n\nhttps://www.ncbi.nlm.nih.gov/pmc/articles/PMC6034276/\n\nSHA: ef000d8cdab3895e2321286f16cce2b8aea458d1\n\nAuthors: Odden, Ane; Enemark, Heidi L.; Ruiz, Antonio; Robertson, Lucy J.; Ersdal, Cecilie; Nes, Silje K.; Tømmerberg, Vibeke; Stuen, Snorre\nDate: 2018-07-05\nDOI: 10.1186/s13071-018-2976-4\nLicense: cc-by\n\nAbstract: BACKGROUND: Coccidiosis due to Eimeria spp. infections in lambs causes increased mortality and substantial production losses, and anticoccidials are important for control of the infection. Anticoccidial resistance has been reported in poultry and swine, and we recently described reduced toltrazuril efficacy in ovine Eimeria spp. in some Norwegian sheep farms using a newly developed faecal oocyst count reduction test (FOCRT). The aim of the present study was to use a controlled efficacy trial to assess the efficacy of toltrazuril against a field isolate suspected 

In [7]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [8]:
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', return_token_type_ids = True)
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [9]:
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [10]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [11]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [12]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

In [13]:
print(device)
print(next(model.parameters()).device)

cuda
cuda:0


In [14]:
model.save_pretrained('/content/drive/My Drive/colab_files/trained_models/Only_CovidQA_Model')
tokenizer.save_pretrained('/content/drive/My Drive/colab_files/trained_models/Only_CovidQA_Model')

('/content/drive/My Drive/colab_files/trained_models/Only_CovidQA_Model/vocab.txt',
 '/content/drive/My Drive/colab_files/trained_models/Only_CovidQA_Model/special_tokens_map.json',
 '/content/drive/My Drive/colab_files/trained_models/Only_CovidQA_Model/added_tokens.json')

Inference

In [15]:
# Inference:
import time
start_time = time.time()
context = 'The models suggested that without a vaccine, school closures would be unlikely to affect the pandemic, an estimated 35 000 to 60 000 ventilators would be needed, up to an estimated 7.3 billion surgical masks or respirators would be required, and perhaps most important, if vaccine development did not start before the virus was introduced, it was unlikely that a significant number of hospitalizations and deaths could be averted due to the time it takes to develop, test, manufacture, and distribute a vaccine.'
question = "How many surgical masks will be required as per the model?"

encoding = tokenizer.encode_plus(question, context)

input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

start_scores, end_scores = model(torch.tensor([input_ids]).to(device), attention_mask=torch.tensor([attention_mask]).to(device))

ans_tokens = input_ids[torch.argmax(start_scores) : torch.argmax(end_scores)+1]
answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens , skip_special_tokens=True)

print ("\nQuestion ",question)
print ("\nAnswer Tokens: ")
print (answer_tokens)

#Here, decode() works similar to doing self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))
#Should use self.convert_tokens_to_string() when not using a Fast Tokenizer
answer_tokens_to_string = tokenizer.decode(ans_tokens)

print ("\nAnswer : ",answer_tokens_to_string)

end_time = time.time()

print("\nExecution Time: {} seconds.".format(end_time - start_time))


Question  How many surgical masks will be required as per the model?

Answer Tokens: 
['35', '000', 'to', '60', '000']

Answer :  35 000 to 60 000

Execution Time: 0.02510857582092285 seconds.
