In [None]:
#!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

### Load the data

In [10]:
# Download the CoQA v1.0 training file from the URL below and parse the JSON
# into a DataFrame (requires network access).
raw_data = pd.read_json('http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json')

### Clean the data

In [11]:
# Display the raw frame: one row per passage, with the nested record in `data`.
raw_data

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."
...,...,...
7194,1,"{'source': 'gutenberg', 'id': '34j10vatjfyw0ao..."
7195,1,"{'source': 'cnn', 'id': '3vj40nv2qinjocrcy7k4z..."
7196,1,"{'source': 'race', 'id': '3rjsc4xj10uw0to3vq0v..."
7197,1,"{'source': 'wikipedia', 'id': '3gs6s824sqxty8v..."


In [12]:
def clean_data(df):
    """Flatten nested CoQA records into one row per question/answer pair.

    Each entry of ``df["data"]`` is expected to be a mapping with a ``"story"``
    string and parallel ``"questions"`` / ``"answers"`` lists whose items carry
    an ``"input_text"`` field.

    The input frame is NOT modified. (The previous version deleted the
    ``version`` column in place, which made the cell fail with ``KeyError``
    when run a second time on the same frame.)

    Args:
        df: DataFrame with a ``data`` column of nested records.

    Returns:
        A new DataFrame with columns ``text``, ``question`` and ``answer``.

    Raises:
        IndexError: if a record has more questions than answers.
    """
    records = []
    for entry in df["data"]:
        story = entry["story"]
        answers = entry["answers"]
        # Questions and answers are paired by position within a record.
        for i, question in enumerate(entry["questions"]):
            records.append((story, question["input_text"], answers[i]["input_text"]))

    return pd.DataFrame(records, columns=["text", "question", "answer"])

In [13]:
# Flatten the raw records into (text, question, answer) rows and preview the first few.
data = clean_data(raw_data)
data.head()

Unnamed: 0,text,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


In [None]:
# Report how many question/answer rows the flattened frame contains.
print("Number of question and answers: ", len(data))

Number of question and answers:  108647


In [18]:
# Load the question-answering model and its tokenizer from the same checkpoint
# name so the vocabulary matches; the files are downloaded on first use
# (see the progress output of this cell).
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [36]:
def question_answer(question, text):
    """Print the model's extractive answer to ``question`` given ``text``.

    Uses the module-level ``tokenizer`` and ``model``. The question and the
    passage are encoded as a sentence pair, the most likely start and end
    tokens are selected, and the tokens in between are joined back into a
    string (word pieces prefixed with ``##`` are glued to the previous token).

    If the predicted span is invalid (end before start) or starts at the
    ``[CLS]`` token, a fixed "can't answer" message is printed instead.

    Args:
        question: The question string.
        text: The passage that should contain the answer.

    Returns:
        None. The answer is written to stdout.
    """
    # Calling the tokenizer (instead of `encode`) also yields the segment ids
    # and attention mask, so the model can tell the question from the passage.
    # Only the passage is truncated, to stay within BERT's 512-token limit.
    encoding = tokenizer(
        question,
        text,
        truncation="only_second",
        max_length=512,
        return_tensors="pt",
    )
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())

    # The model may have been moved to another device by a later cell.
    encoding = encoding.to(model.device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        output = model(**encoding)

    start = int(torch.argmax(output.start_logits, dim=1)[0])
    end = int(torch.argmax(output.end_logits, dim=1)[0])

    # Default covers both failure modes: an inverted span and a span that
    # begins at [CLS].
    pred_answer = "I can't answer this question."
    if end >= start and not tokens[start].startswith('[CLS]'):
        pred_answer = tokens[start]
        # connect the words
        for token in tokens[start + 1:end + 1]:
            pred_answer += token[2:] if token.startswith("##") else " " + token

    print()
    print('Answer:\n{}'.format(pred_answer))

In [24]:
# Use the first row of the cleaned frame as a sample question/passage pair.
question = data["question"][0]
text = data["text"][0]

In [26]:
# Show the sample question.
question

'When was the Vat formally opened?'

In [27]:
# Show the sample passage.
text

'The Vatican Apostolic Library (), more commonly called the Vatican Library or simply the Vat, is the library of the Holy See, located in Vatican City. Formally established in 1475, although it is much older, it is one of the oldest libraries in the world and contains one of the most significant collections of historical texts. It has 75,000 codices from throughout history, as well as 1.1 million printed books, which include some 8,500 incunabula. \n\nThe Vatican Library is a research library for history, law, philosophy, science and theology. The Vatican Library is open to anyone who can document their qualifications and research needs. Photocopies for private study of pages from books published between 1801 and 1990 can be requested in person or by mail. \n\nIn March 2014, the Vatican Library began an initial four-year project of digitising its collection of manuscripts, to be made available online. \n\nThe Vatican Secret Archives were separated from the library at the beginning of t

In [None]:

# Interactive session: read one passage, then answer questions about it until
# the user declines to continue.
text = input("Please enter your text: \n")
question = input("\nPlease enter your question: \n")
while True:
    question_answer(question, text)

    # Keep asking until the reply is a recognisable yes/no. Surrounding
    # whitespace and letter case are ignored so "y" / " n " also work.
    reply = ""
    while reply not in ("Y", "N"):
        reply = input("\nAnother question? (Y/N)? ").strip().upper()

    if reply == "N":
        print("\nGood bye! See you :)")
        break

    question = input("\nPlease enter your question: \n")

Please enter your text: 
The Vatican Apostolic Library (), more commonly called the Vatican Library or simply the Vat, is the library of the Holy See, located in Vatican City. Formally established in 1475, although it is much older, it is one of the oldest libraries in the world and contains one of the most significant collections of historical texts. It has 75,000 codices from throughout history, as well as 1.1 million printed books, which include some 8,500 incunabula. \n\nThe Vatican Library is a research library for history, law, philosophy, science and theology. The Vatican Library is open to anyone who can document their qualifications and research needs. Photocopies for private study of pages from books published between 1801 and 1990 can be requested in person or by mail. \n\nIn March 2014, the Vatican Library began an initial four-year project of digitising its collection of manuscripts, to be made available online. \n\nThe Vatican Secret Archives were separated from the libra

### Experiment with the model: fine-tune DistilBERT on SQuAD 2.0

In [None]:
from collections import Counter

In [None]:
def compute_f1_score(prediction, ground_truth):
    """Token-level F1 between two whitespace-tokenised strings.

    Tokens are compared as a multiset, so a repeated word only counts as
    many times as it appears in both strings. No normalisation (case,
    punctuation) is applied.

    Args:
        prediction: Predicted answer string.
        ground_truth: Reference answer string.

    Returns:
        The F1 score as a float, or ``0`` when the strings share no token.
    """
    predicted_tokens = prediction.split()
    reference_tokens = ground_truth.split()

    # Multiset intersection gives the per-token overlap counts.
    shared = Counter(predicted_tokens) & Counter(reference_tokens)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(predicted_tokens)
    recall = 1.0 * overlap / len(reference_tokens)
    return (2 * precision * recall) / (precision + recall)


def compute_em_score(prediction, ground_truth):
    """Exact-match score: ``1.0`` if both values compare equal, else ``0.0``.

    The comparison is a plain ``==``; no normalisation is applied.
    """
    if prediction == ground_truth:
        return 1.0
    return 0.0

In [None]:
import requests
from pathlib import Path

# Fetch the SQuAD 2.0 train/dev files into the working directory.
for file in ['train-v2.0.json', 'dev-v2.0.json']:
    target = Path(file)
    # Skip files that are already present so re-running the notebook does not
    # download them again; delete the file to force a fresh download.
    if target.exists():
        continue

    res = requests.get(
        f'https://rajpurkar.github.io/SQuAD-explorer/dataset/{file}',
        stream=True,
        timeout=60,
    )
    # Fail loudly instead of saving an HTML error page as if it were JSON.
    res.raise_for_status()

    # Write to a temporary name and rename at the end, so an interrupted
    # download never leaves a truncated file under the final name.
    partial = target.with_name(target.name + '.part')
    with open(partial, 'wb') as f:
        # iter_content() defaults to 1-byte chunks; use a sensible block size.
        for chunk in res.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
    partial.replace(target)

In [None]:
import json

def read(filename):
    """Load a SQuAD-style JSON file into three parallel lists.

    One entry is produced per answer: the paragraph context, the question
    text, and the answer object as stored in the file. When a question has a
    ``plausible_answers`` key those answers are used; otherwise its
    ``answers`` list is used.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A tuple ``(contexts, questions, answers)`` of equal-length lists.
    """
    with open(filename, 'rb') as handle:
        squad = json.load(handle)

    def _triples():
        # Walk articles -> paragraphs -> questions -> answers.
        for article in squad['data']:
            for paragraph in article['paragraphs']:
                passage = paragraph['context']
                for qa in paragraph['qas']:
                    prompt = qa['question']
                    if 'plausible_answers' in qa:
                        field = 'plausible_answers'
                    else:
                        field = 'answers'
                    for answer in qa[field]:
                        yield passage, prompt, answer

    contexts, questions, answers = [], [], []
    for passage, prompt, answer in _triples():
        contexts.append(passage)
        questions.append(prompt)
        answers.append(answer)

    return contexts, questions, answers

# Parse the downloaded train and dev files into parallel context / question /
# answer lists (one entry per answer).
train_contexts, train_questions, train_answers = read('train-v2.0.json')
val_contexts, val_questions, val_answers = read('dev-v2.0.json')

In [None]:
# Spot-check the first parsed training question.
train_questions[0]

'When did Beyonce start becoming popular?'

In [None]:
def add_answer_end(answers, contexts):
    """Add an exclusive ``answer_end`` character offset to every answer, in place.

    SQuAD start offsets are occasionally off by one or two characters. Each
    answer is therefore checked against its context: if the answer text is not
    found at ``answer_start`` but is found one or two characters earlier,
    ``answer_start`` is corrected before ``answer_end`` is computed. If the
    text is not found at any of those positions the original offset is kept.

    Args:
        answers: List of dicts with ``text`` and ``answer_start`` keys; each
            dict is updated in place.
        contexts: List of context strings, parallel to ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start = answer['answer_start']
        length = len(gold_text)

        # Try the recorded offset first, then 1 and 2 characters earlier.
        for shift in (0, 1, 2):
            candidate = start - shift
            if candidate >= 0 and context[candidate:candidate + length] == gold_text:
                start = candidate
                break

        answer['answer_start'] = start
        answer['answer_end'] = start + length
      
# Annotate the train and dev answer dicts in place with their 'answer_end' offsets.
add_answer_end(train_answers, train_contexts)
add_answer_end(val_answers, val_contexts)

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 43.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 43.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 36.9 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created whe

In [None]:
from transformers import DistilBertTokenizerFast
# NOTE: this rebinds the module-level `tokenizer` that `question_answer` reads;
# calls to `question_answer` made after this cell will use this tokenizer.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# Encode (context, question) pairs; the context is passed as the first sequence.
# Over-long pairs are truncated and every example is padded to a common length.
train_tokens = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_tokens = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [None]:
def find_start_positions(encodings, answers):
    """Map each answer's start character offset to a token index.

    Args:
        encodings: Tokenizer output exposing ``char_to_token(batch_index,
            char_index)``.
        answers: List of dicts with an ``answer_start`` character offset, in
            the same order as the examples in ``encodings``.

    Returns:
        A list with one token index per answer. When the character has no
        token (``char_to_token`` returns ``None``, e.g. because the passage
        was truncated), the module-level ``tokenizer.model_max_length`` is
        used as a placeholder.
    """
    start_positions = []
    for i, answer in enumerate(answers):
        start_position = encodings.char_to_token(i, answer['answer_start'])
        if start_position is None:
            # The original misspelled this variable, so the fallback was never
            # stored and a stale value from the previous example was appended.
            start_position = tokenizer.model_max_length
        start_positions.append(start_position)
    return start_positions

# Token-level start positions for the train and dev answers.
train_start_positions = find_start_positions(train_tokens, train_answers)
val_start_positions = find_start_positions(val_tokens, val_answers)

In [None]:
def find_end_positions(encodings, answers):
    """Map each answer's end character offset to a token index.

    ``answer_end`` is treated as exclusive (start + length), so the search
    begins at ``answer_end - 1`` and walks backwards, no further than
    ``answer_start``, until a character that maps to a token is found.

    The previous version appended inside the ``while ... is None`` loop, so it
    recorded only ``None`` values (zero or several per answer), never the token
    that was found, and could loop without bound for truncated passages.

    Args:
        encodings: Tokenizer output exposing ``char_to_token(batch_index,
            char_index)``.
        answers: List of dicts with ``answer_end`` (and normally
            ``answer_start``) character offsets, in the same order as the
            examples in ``encodings``.

    Returns:
        A list with exactly one token index per answer. When no character of
        the answer maps to a token, the module-level
        ``tokenizer.model_max_length`` is used as a placeholder.
    """
    end_positions = []
    for i, answer in enumerate(answers):
        # Never search before the start of the answer (or of the context).
        lower_bound = max(answer.get('answer_start', 0), 0)

        end_position = None
        for char_index in range(answer['answer_end'] - 1, lower_bound - 1, -1):
            end_position = encodings.char_to_token(i, char_index)
            if end_position is not None:
                break

        if end_position is None:
            end_position = tokenizer.model_max_length
        end_positions.append(end_position)
    return end_positions

# Token-level end positions for the train and dev answers.
train_end_positions = find_end_positions(train_tokens, train_answers)
val_end_positions = find_end_positions(val_tokens, val_answers)

In [None]:
import torch
from torch.utils.data import Dataset

class Encode(Dataset):
    """Dataset yielding one tokenised QA example per index.

    Args:
        tokens: Mapping with ``input_ids`` and ``attention_mask`` entries, each
            a sequence with one item per example.
        start_positions: Sequence of answer start token indices, one per example.
        end_positions: Sequence of answer end token indices, one per example.
    """

    def __init__(self, tokens, start_positions, end_positions):
        self.tokens = tokens
        # No trailing comma here: the original accidentally wrapped the list
        # in a one-element tuple.
        self.start_positions = start_positions
        self.end_positions = end_positions

    def __getitem__(self, index):
        """Return the tensors for example ``index`` only (not the whole dataset)."""
        return {
            'input_ids': torch.tensor(self.tokens['input_ids'][index]),
            'attention_mask': torch.tensor(self.tokens['attention_mask'][index]),
            'start_positions': torch.tensor(self.start_positions[index]),
            'end_positions': torch.tensor(self.end_positions[index]),
        }

    def __len__(self):
        """Number of examples."""
        return len(self.tokens['input_ids'])

# Wrap the tokenised examples and their answer positions as datasets.
train_set = Encode(train_tokens, train_start_positions, train_end_positions)
val_set = Encode(val_tokens, val_start_positions, val_end_positions)

In [None]:
from transformers import DistilBertForQuestionAnswering
# NOTE: this rebinds the module-level `model` that `question_answer` reads.
# Per the warning printed by this cell, the QA head (`qa_outputs`) is newly
# initialised, so the model must be trained before its answers are meaningful.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# Select the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Move the model's parameters to the selected device.
model.to(device)
# Put the model in training mode.
model.train()
# AdamW optimizer over all model parameters with a learning rate of 5e-5.
optim = AdamW(model.parameters(), lr=5e-5)

# Shuffled training batches of 4 examples each.
train_loader = DataLoader(train_set, batch_size=4, shuffle=True)



  0%|          | 0/32580 [00:08<?, ?it/s]


KeyboardInterrupt: ignored

In [None]:
def train(model, training_loader, optimizer, device=None):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Model called with ``input_ids``, ``attention_mask``,
            ``start_positions`` and ``end_positions`` keyword arguments; the
            loss must be the first element of its output.
        training_loader: Iterable of batches, each a mapping with the four
            keys above holding tensors.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batch tensors are moved to. Defaults to the device
            of the model's first parameter, so batches always land where the
            model lives.

    Returns:
        The average loss over all batches as a float, or ``0.0`` when the
        loader yields no batches.
    """
    if device is None:
        device = next(model.parameters()).device

    model.train()
    epoch_loss = 0.0
    num_batches = 0
    # Each item is a batch dict. The original `for _, batch in ...` tried to
    # unpack that dict into two names and failed on the first batch.
    for batch in training_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)

        # When label positions are supplied, the loss is the first output.
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Avoid dividing by zero for an empty loader.
    if num_batches == 0:
        return 0.0
    return epoch_loss / num_batches

In [None]:
def evaluate(model, training_loader, optimizer):
    model.eval()
    acc = []

    loop = tqdm(val_loader)
    # loop through batches
    for batch in loop:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            start_pred = torch.argmax(outputs['start_logits'], dim=1)
            end_pred = torch.argmax(outputs['end_logits'], dim=1)
            acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
            acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
    # calculate average accuracy in total
    acc = sum(acc)/len(acc)