In [1]:
!pip install --quiet transformers

In [2]:
# !python --version

In [3]:
# !python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

In [4]:
# !which python3

In [5]:
# !which jupyter

In [6]:
import numpy

In [7]:
import pickle # reading in our data

import torch # pytorch
from torch.utils.data import DataLoader # this helps us iterate over our data efficiently
from tqdm import tqdm

In [8]:
import transformers

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
cd /content/drive/MyDrive/datasets


/content/drive/MyDrive/datasets


## Load Data

### Load training data

In [11]:
with open('/content/drive/MyDrive/datasets/covidqa/covid_train_contexts.pkl', 'rb') as f1:
  train_contexts = pickle.load(f1)

In [12]:
with open('/content/drive/MyDrive/datasets/covidqa/covid_train_questions.pkl', 'rb') as f2:
  train_questions = pickle.load(f2)

In [13]:
with open('/content/drive/MyDrive/datasets/covidqa/covid_train_answers.pkl', 'rb') as f3:
  train_answers = pickle.load(f3)

### Load validation data

In [14]:
with open('/content/drive/MyDrive/datasets/covidqa/covid_val_contexts.pkl', 'rb') as f1:
  val_contexts = pickle.load(f1)

In [15]:
with open('/content/drive/MyDrive/datasets/covidqa/covid_val_questions.pkl', 'rb') as f2:
  val_questions = pickle.load(f2)

In [16]:
with open('/content/drive/MyDrive/datasets/covidqa/covid_val_answers.pkl', 'rb') as f3:
  val_answers = pickle.load(f3)

In [17]:
def add_end_index(answers, contexts):

  # loop over each context-answer pair
  for answer, context in zip(answers, contexts):

    # the actual answer
    expected_answer = answer['text']

    # start index of the answer
    start_index = answer['answer_start']

    # end index
    end_index = start_index + len(expected_answer)

    # take into account if answer is off by a couple characters
    if context[start_index:end_index] == expected_answer:
      answer['answer_end'] = end_index
    else:
      # if answer is off by 1 or 2 tokens
      for i in [1, 2]:
        if context[start_index-i:end_index-i] == expected_answer:
          answer['answer_start'] = start_index - i
          answer['answer_end'] = end_index - i

In [18]:
# apply our function above to dig up all the answer_start and end for each context-answer pair
add_end_index(train_answers, train_contexts)
add_end_index(val_answers, val_contexts)

In [19]:
train_answers[:3]

[{'answer_end': 466,
  'answer_start': 370,
  'text': 'Mother-to-child transmission (MTCT) is the main cause of HIV-1 infection in children worldwide. '},
 {'answer_end': 2129,
  'answer_start': 2003,
  'text': 'DC-SIGNR plays a crucial role in MTCT of HIV-1 and that impaired placental DC-SIGNR expression increases risk of transmission.'},
 {'answer_end': 2408,
  'answer_start': 2291,
  'text': 'more than 400,000 children were infected worldwide, mostly through MTCT and 90% of them lived in sub-Saharan Africa. '}]

In [20]:
# from transformers import DistilBertTokenizerFast
from transformers import RobertaTokenizerFast
# defining the tokenizer we are going to use
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [21]:
# apply tokenizer to our training and validation datasets
train_tokenized = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_tokenized = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [43]:
print(train_tokenized[0])
print(val_tokenized[0])

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [22]:
tokenizer.decode(train_tokenized['input_ids'][0])

'<s>Functional Genetic Variants in DC-SIGNR Are Associated with Mother-to-Child Transmission of HIV-1\n\nhttps://www.ncbi.nlm.nih.gov/pmc/articles/PMC2752805/\n\nBoily-Larouche, Geneviève; Iscache, Anne-Laure; Zijenah, Lynn S.; Humphrey, Jean H.; Mouland, Andrew J.; Ward, Brian J.; Roger, Michel\n2009-10-07\nDOI:10.1371/journal.pone.0007211\nLicense:cc-by\n\nAbstract: BACKGROUND: Mother-to-child transmission (MTCT) is the main cause of HIV-1 infection in children worldwide. Given that the C-type lectin receptor, dendritic cell-specific ICAM-grabbing non-integrin-related (DC-SIGNR, also known as CD209L or liver/lymph node–specific ICAM-grabbing non-integrin (L-SIGN)), can interact with pathogens including HIV-1 and is expressed at the maternal-fetal interface, we hypothesized that it could influence MTCT of HIV-1. METHODS AND FINDINGS: To investigate the potential role of DC-SIGNR in MTCT of HIV-1, we carried out a genetic association study of DC-SIGNR in a well-characterized cohort of 

In [23]:
def add_token_positions(tokenized_data, answers):
  # let's define a couple lists to keep track of our start/end tokens
  start_positions = [] # aggregate all start positions for each sample
  end_positions = [] # aggregate all end positions for each sample

  # loop over the answers list
  for i in range(len(answers)):
    # add the start and end encodings to our lists
    start_positions.append(tokenized_data.char_to_token(i, answers[i]['answer_start']))
    end_positions.append(tokenized_data.char_to_token(i, answers[i]['answer_end']))

    # if no start position, the answer passage was truncated
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length
    
    # if we can't find the end positions, then we want to shift characters left until we do find it
    shift = 1
    while end_positions[-1] is None:
      end_positions[-1] = tokenized_data.char_to_token(i, answers[i]['answer_end'] - shift)
      shift += 1
  
  tokenized_data.update({'start_positions': start_positions, 'end_positions': end_positions})

In [24]:
# apply function to train and validations data
add_token_positions(train_tokenized, train_answers)
add_token_positions(val_tokenized, val_answers)

In [25]:
train_tokenized.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [26]:
class CovidDataset(torch.utils.data.Dataset):
  # constructor
  def __init__(self, encodings):
    self.encodings = encodings
  
  # Query to get a specific item at an index in our dataset
  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} # comprehension (shortcut)

  # returns the length of our dataset
  def __len__(self):
    return len(self.encodings.input_ids)

In [27]:
# Datasets for the training and validations sets
train_data = CovidDataset(train_tokenized)
val_data = CovidDataset(val_tokenized)

In [28]:
# Used to help us iterate over our dataset conveniently, especially when we have batches
train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=8, shuffle=True)

In [29]:
from transformers import RobertaForQuestionAnswering # This is the BERT model used for question answering

# BERT model already finetuned for question answering
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

In [30]:
from transformers import AdamW

optim = AdamW(model.parameters(), lr=5e-5)



In [31]:
if torch.cuda.is_available():
  device = torch.device("cuda") # this is the device our model will use for computing
  print(f'There are {torch.cuda.device_count()} GPUs available')
  print(f'Device name:', torch.cuda.get_device_name(0))
else:
  print('No GPU available, using CPU instead')
  device = torch.device('cpu')

There are 1 GPUs available
Device name: Tesla T4


In [32]:
model = model.to(device)

In [33]:
def train(model, train_dataloader):

  # run loop 3 times
  for epoch in range(1):
    # first, set the mode of the model to train
    model.train()

    # progress bar
    loop = tqdm(train_dataloader, leave=True)

    # loop over each batch 
    for batch in loop:
      # zero out the optimizer
      optim.zero_grad()

      # grab the batch attributes and attach them to the GPU device
      input_ids, attention_mask, start_positions, end_positions = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['start_positions'].to(device), batch['end_positions'].to(device)

      # Feedforward!!!
      outputs = model(input_ids, attention_mask, start_positions=start_positions, end_positions=end_positions)

      # Compute the Loss
      loss = outputs[0]

      # BACKPROPAGATION!!!!
      loss.backward()

      # update parameters for the next run
      optim.step()


      # display some stuff here
      loop.set_description(f'Epoch {epoch}')
      loop.set_postfix(loss=loss.item())


In [34]:
train(model, train_dataloader)

Epoch 0: 100%|██████████| 202/202 [02:39<00:00,  1.27it/s, loss=3.91]


In [35]:
def evaluate(model, val_dataloader):
    # set to evaluate mode
    model.eval()
    
    # keep track of accuracies
    accuracies = []
    
    # loop through batches
    for batch in val_dataloader:
        # grab the batch attributes and attach them to the GPU device
        input_ids, attention_mask, start_positions, end_positions = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['start_positions'].to(device), batch['end_positions'].to(device)
        
        # make prediction
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # get the prediction for the start and end token for answer
        start_probs = outputs['start_logits']
        end_probs = outputs['end_logits']
        
        start_pred = torch.argmax(start_probs, dim=1)
        end_pred = torch.argmax(end_probs, dim=1)
        
        # append the accuracy
        accuracies.append(((start_pred == start_positions).sum() / len(start_pred)).item())
        accuracies.append(((end_pred == end_positions).sum() / len(end_pred)).item())
    
    avg_acc = sum(accuracies) / len(accuracies)
    
    return avg_acc
    

In [36]:
avg_acc = evaluate(model, val_dataloader)

In [37]:
avg_acc

0.44362745098039214