In [None]:
# Install the `transformers` package into the notebook runtime (shell escape).
!pip install transformers

In [None]:
import contextlib
import os
import sys
import time

import torch
from transformers import (
    DistilBertForQuestionAnswering,
    DistilBertTokenizer,
    DistilBertTokenizerFast,
)

In [None]:
# Load the "fast" tokenizer: the preprocessing further down maps answer
# character offsets to token indices with `encodings.char_to_token(...)`,
# which is only implemented for fast tokenizers. The former
# `return_token_type_ids` argument is dropped because DistilBERT has no
# segment embeddings and never consumes token type ids.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# DistilBERT with a span-prediction (question answering) head, initialised from
# the 'distilbert-base-uncased-distilled-squad' checkpoint.
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#!wget http://participants-area.bioasq.org/MRQA2019/ -P '/content/drive/My Drive/colab_files/data/BioASQ/'

In [None]:
import json
from pathlib import Path

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    The file is expected to have the layout
    ``data -> paragraphs -> (context, qas -> (question, answers))``.
    One entry is produced per answer, so a question with several answers
    appears several times (with its context repeated as well).

    Args:
        path: Location of the JSON file (string or path-like).

    Returns:
        A tuple ``(contexts, questions, answers)`` of equally long lists;
        each element of ``answers`` is the answer dict as stored in the file.
    """
    with open(Path(path), 'rb') as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in payload['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa_entry in paragraph['qas']:
                query = qa_entry['question']
                golds = list(qa_entry['answers'])
                # Repeat context/question once per gold answer to keep the
                # three lists aligned index by index.
                contexts.extend([passage_text] * len(golds))
                questions.extend([query] * len(golds))
                answers.extend(golds)

    return contexts, questions, answers

# Read the Covid-QA train and validation splits from the mounted Drive folder
# into parallel context / question / answer lists.
train_contexts, train_questions, train_answers = read_squad('/content/drive/My Drive/colab_files/data/Covid-QA/Covid-QA-train.json')
val_contexts, val_questions, val_answers = read_squad('/content/drive/My Drive/colab_files/data/Covid-QA/Covid-QA-val.json')

In [None]:
#s = read_squad('/content/drive/My Drive/colab_files/data/Covid-QA/Covid-QA.json')

In [None]:
train_contexts[0]

'Preparation for Possible Sustained Transmission of 2019 Novel Coronavirus\nLessons From Previous Epidemics\nhttps://jamanetwork.com/journals/jama/fullarticle/2761285\nFebruary 11, 2020\nDavid L. Swerdlow, MD1; Lyn Finelli, DrPH, MS2\nAuthor Affiliations Article Information\nJAMA. 2020;323(12):1129-1130. doi:10.1001/jama.2020.1960\nCOVID-19 Resource Center\nrelated articles icon Related\nArticles\nauthor interview icon Interviews\nAudio Interview (25:53)\nCOVID-19 Update From China\nTransmissibility and severity are the 2 most critical factors that determine the effect of an epidemic. Neither the 2009 pandemic influenza A(H1N1) virus ([H1N1]pdm09) pandemic or the severe acute respiratory syndrome coronavirus (SARS-CoV) or the Middle East respiratory syndrome coronavirus (MERS-CoV) epidemics had the combination of both high transmissibility and severity. Control strategies are driven by this combination. R0, the basic reproduction number, is a commonly used measure of transmissibility a

In [None]:
def add_end_idx(answers, contexts):
    """Add an 'answer_end' character offset to every answer dict, in place.

    For each (answer, context) pair the end offset is computed as
    ``answer_start + len(text)``. SQuAD-style annotations are sometimes off
    by one or two characters, so the span is also tried shifted left by 1
    and by 2; when a shifted span reproduces the gold text, 'answer_start'
    is corrected accordingly.

    If none of the candidate spans matches the gold text, the unshifted
    offsets are kept as a best effort so that 'answer_end' is always set
    (previously such answers were left without the key, which caused a
    KeyError in later processing).

    Args:
        answers: List of dicts with at least 'text' and 'answer_start'.
        contexts: List of context strings, aligned with ``answers``.

    Returns:
        None. The dicts in ``answers`` are modified in place.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Pick the first left-shift (0, 1 or 2 chars) whose span equals the
        # gold text. Shifts that would produce a negative start are skipped,
        # since a negative slice index would wrap around to the end of the
        # context. Fall back to shift 0 when nothing matches.
        shift = next(
            (
                candidate
                for candidate in (0, 1, 2)
                if start_idx - candidate >= 0
                and context[start_idx - candidate:end_idx - candidate] == gold_text
            ),
            0,
        )
        answer['answer_start'] = start_idx - shift
        answer['answer_end'] = end_idx - shift

# Annotate the answers of both splits in place with 'answer_end' offsets.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
#from transformers import DistilBertTokenizerFast
#tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode (context, question) pairs, with the context passed as the first
# sequence of each pair; truncation and padding are both enabled.
# NOTE(review): the later `encodings.char_to_token(...)` lookups presumably
# require a "fast" tokenizer -- confirm `tokenizer` is one before running.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [None]:
def add_token_positions(encodings, answers):
    """Store answer start/end token positions on ``encodings``, in place.

    Each answer's 'answer_start' and ('answer_end' - 1) character offsets are
    translated to token indices with ``encodings.char_to_token``. Whenever
    that lookup yields ``None`` (the original code attributes this to the
    answer passage having been truncated), ``tokenizer.model_max_length`` is
    stored instead.

    Args:
        encodings: Object providing ``char_to_token(i, char_index)`` and
            ``update(mapping)``.
        answers: List of dicts with 'answer_start' and 'answer_end', aligned
            with the examples in ``encodings``.

    Returns:
        None. ``encodings`` gains 'start_positions' and 'end_positions'.
    """
    def locate(example_idx, char_idx):
        # Token index for a character offset, or the fallback when unmapped.
        token_idx = encodings.char_to_token(example_idx, char_idx)
        return tokenizer.model_max_length if token_idx is None else token_idx

    first_tokens = []
    last_tokens = []
    for example_idx in range(len(answers)):
        span = answers[example_idx]
        first_tokens.append(locate(example_idx, span['answer_start']))
        # 'answer_end' is exclusive, so the last answer character is at end - 1.
        last_tokens.append(locate(example_idx, span['answer_end'] - 1))

    encodings.update({'start_positions': first_tokens, 'end_positions': last_tokens})

# Attach 'start_positions' / 'end_positions' to both sets of encodings (in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset exposing tokenizer encodings one example at a time.

    ``encodings`` must offer ``items()`` yielding (field name, per-example
    sequence) pairs and an ``input_ids`` attribute whose length is the number
    of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One example per row of input ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build a {field: tensor} dict for the requested example.
        example = {}
        for field, column in self.encodings.items():
            example[field] = torch.tensor(column[idx])
        return example

# Wrap the train and validation encodings as torch datasets.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [None]:
#from transformers import DistilBertForQuestionAnswering
#model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the chosen device and switch it to training mode.
model.to(device)
model.train()

# Shuffled mini-batches of 16 training examples.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

# Fine-tune for two passes over the training data.
for epoch in range(2):
    for batch in train_loader:
        # Reset gradients accumulated by the previous step.
        optim.zero_grad()
        # Move this batch's tensors to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        # The first output element is used as the training loss.
        loss = outputs[0]
        loss.backward()
        optim.step()

# Back to evaluation mode once training has finished.
model.eval()

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

In [None]:
def extract_answer_phrase(question, context):
    '''
    Takes a `question` string and a `context` string (which contains the
    answer), and identifies the words within the `context` that are the
    answer.

    Relies on the notebook-level `tokenizer` and `model` objects.

    Args:
        question: The question text.
        context: The passage expected to contain the answer.

    Returns:
        The predicted answer span as a string, with '##' sub-word tokens
        merged back into the preceding token.

    Side effects:
        Prints the number of tokens in the encoded (question, context) pair.
    '''
    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    # Only the context is truncated, so the pair never exceeds the maximum
    # sequence length the tokenizer reports for the model.
    input_ids = tokenizer.encode(
        question,
        context,
        truncation='only_second',
        max_length=tokenizer.model_max_length,
    )

    # Report how long the input sequence is.
    print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Evaluate ========
    # DistilBERT has no segment embeddings: its forward() does not accept
    # `token_type_ids`, and passing them raises a TypeError, so only the
    # input ids are supplied. The input must live on the same device as the
    # model (which may have been moved to the GPU for training).
    model_device = next(model.parameters()).device
    input_tensor = torch.tensor([input_ids], device=model_device)

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(input_tensor)

    # Index instead of tuple-unpacking so this works whether the model returns
    # a plain tuple or a dict-like output object (unpacking the latter would
    # yield its keys). Without labels the first two entries are the start and
    # end scores.
    start_scores, end_scores = outputs[0], outputs[1]

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = int(torch.argmax(start_scores))
    answer_end = int(torch.argmax(end_scores))

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
    answer = tokens[answer_start]

    # Append the remaining answer tokens. If the predicted end precedes the
    # start, this slice is empty and only the start token is returned.
    for token in tokens[answer_start + 1:answer_end + 1]:
        if token.startswith('##'):
            # Sub-word piece: glue it onto the previous token.
            answer += token[2:]
        else:
            # New word: separate it with a space.
            answer += ' ' + token

    return answer

In [None]:
train_contexts[0]

'Preparation for Possible Sustained Transmission of 2019 Novel Coronavirus\nLessons From Previous Epidemics\nhttps://jamanetwork.com/journals/jama/fullarticle/2761285\nFebruary 11, 2020\nDavid L. Swerdlow, MD1; Lyn Finelli, DrPH, MS2\nAuthor Affiliations Article Information\nJAMA. 2020;323(12):1129-1130. doi:10.1001/jama.2020.1960\nCOVID-19 Resource Center\nrelated articles icon Related\nArticles\nauthor interview icon Interviews\nAudio Interview (25:53)\nCOVID-19 Update From China\nTransmissibility and severity are the 2 most critical factors that determine the effect of an epidemic. Neither the 2009 pandemic influenza A(H1N1) virus ([H1N1]pdm09) pandemic or the severe acute respiratory syndrome coronavirus (SARS-CoV) or the Middle East respiratory syndrome coronavirus (MERS-CoV) epidemics had the combination of both high transmissibility and severity. Control strategies are driven by this combination. R0, the basic reproduction number, is a commonly used measure of transmissibility a

In [None]:
train_questions[0]

'How many surgical masks or respirators have past studies projected will be required for a pandemic in the United States?'

In [None]:
train_answers[0]

{'answer_end': 6407, 'answer_start': 6383, 'text': 'an estimated 7.3 billion'}

In [None]:
context1 = 'After a new influenza virus (H7N9) was identified in China in 2013, a series of modeling articles described the effect of, and level of preparedness for, a severe, single-wave pandemic in the United States.7 In scenarios that used clinical attack rates (the proportion of individuals who become ill with or die from a disease in a population initially uninfected) of 20% to 30% (for comparison the clinical attack rate was 20% in the first year of the 2009 H1N1 pandemic), depending on severity there would be an estimated 669 000 to 4.3 million hospitalizations and an estimated 54 000 to 538 000 deaths without any interventions in the United States. The models suggested that without a vaccine, school closures would be unlikely to affect the pandemic, an estimated 35 000 to 60 000 ventilators would be needed, up to an estimated 7.3 billion surgical masks or respirators would be required, and perhaps most important, if vaccine development did not start before the virus was introduced, it was unlikely that a significant number of hospitalizations and deaths could be averted due to the time it takes to develop, test, manufacture, and distribute a vaccine.'

In [None]:
extract_answer_phrase(train_questions[0], context1)

Query has 282 tokens.



TypeError: ignored