In [None]:
import requests

for file in ['train-v2.0.json', 'dev-v2.0.json']:
    res = requests.get(f'https://rajpurkar.github.io/SQuAD-explorer/dataset/{file}')
    with open(file, 'wb') as f:
        for chunk in res.iter_content():
            f.write(chunk)

In [1]:
import json

def read(filename):
    with open(filename, 'rb') as f:
        json_dict = json.load(f)

    context_list, question_list, answer_list = [], [], []
    for key in json_dict['data']:
        for paragraph in key['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answer_type = 'plausible_answers' if 'plausible_answers' in qa.keys() else 'answers'
                for answer in qa['answers']:
                    context_list.append(context)
                    question_list.append(question)
                    answer_list.append(answer)

    return context_list, question_list, answer_list

train_contexts, train_questions, train_answers = read('train-v2.0.json')
val_contexts, val_questions, val_answers = read('dev-v2.0.json')

In [2]:
train_questions[0]

'When did Beyonce start becoming popular?'

In [3]:
def add_answer_end(answers, contexts):
    for answer, context in zip(answers, contexts):
        answer['answer_end'] = answer['answer_start'] + len(answer['text'])
      
add_answer_end(train_answers, train_contexts)
add_answer_end(val_answers, val_contexts)

In [4]:
#!pip install transformers

In [5]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_tokens = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_tokens = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [6]:
def find_start_positions(encodings, answers):
    start_positions = []
    for i in range(len(answers)):
        if encodings.char_to_token(i, answers[i]['answer_start']) is None:
            start_postition = tokenizer.model_max_length
        else:
            start_position = encodings.char_to_token(i, answers[i]['answer_start'])
        start_positions.append(start_position)
    return start_positions

train_start_positions = find_start_positions(train_tokens, train_answers)
val_start_positions = find_start_positions(val_tokens, val_answers)

In [7]:
def find_end_positions(encodings, answers):
    end_positions = []
    for i in range(len(answers)):
        j = 0
        while encodings.char_to_token(i, answers[i]['answer_end'] - j) is None:
            j += 1
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - j))
    return end_positions

train_end_positions = find_end_positions(train_tokens, train_answers)
val_end_positions = find_end_positions(val_tokens, val_answers)

In [8]:
#!pip install torch

In [9]:
train_tokens['start_positions'] = train_start_positions 
train_tokens['end_positions'] = train_end_positions
val_tokens['start_positions'] = val_start_positions 
val_tokens['end_positions'] = val_end_positions

In [10]:
import torch
from torch.utils.data import Dataset

class Encode(Dataset):
    def __init__(self, tokens, start_positions, end_positions):
        self.tokens = tokens
        self.start_positions = start_positions,
        self.end_positions = end_positions

    def __getitem__(self, index):
        return {
            'input_ids': torch.tensor(self.tokens['input_ids'][index]),
            'attention_mask': torch.tensor(self.tokens['attention_mask'][index]),
            'start_positions': torch.tensor(self.tokens['start_positions'][index]),
            'end_positions': torch.tensor(self.tokens['end_positions'][index])
        }

    def __len__(self):
        return len(self.tokens.input_ids)

train_set = Encode(train_tokens, train_start_positions, train_end_positions)
val_set = Encode(val_tokens, val_start_positions, val_end_positions)

In [11]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [26]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [27]:
def train(model, train_loader, optimizer):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids=input_ids, 
                        attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    
    return epoch_loss/len(train_loader)

In [28]:
def evaluate(model, val_loader):
    model.eval()
    acc = []

    for batch in tqdm(val_loader):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)
            
            outputs = model(
                input_ids, 
                attention_mask=attention_mask
            )
            
            start_pred = torch.argmax(outputs['start_logits'], dim=1)
            end_pred = torch.argmax(outputs['end_logits'], dim=1)
            
            start_acc = ((start_pred == start_true).sum()
            start_acc /= len(start_pred)).item()
            acc.append(start_acc)
            end_acc = ((end_pred == end_true).sum()
            end_acc /= len(end_pred)).item()
            acc.append(end_acc)
            
    acc = sum(acc)/len(acc)
    
    return acc

SyntaxError: invalid syntax (Temp/ipykernel_10212/3899133424.py, line 21)

In [29]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
train_loader = DataLoader(train_set, 
                          batch_size=16, 
                          shuffle=True)
val_loader = DataLoader(val_set, 
                        batch_size=16, 
                        shuffle=True)
N_EPOCHS = 3

cuda


In [30]:
import time

for epoch in range(N_EPOCHS):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch+1, N_EPOCHS))

    start_time = time.time()

    train_loss = train(model, train_loader, optimizer)
    valid_acc = evaluate(model, val_loader)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bert-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal. Acc: {valid_acc*100:.2f}%')




100%|██████████████████████████████████████████████████████████████████████████████| 5427/5427 [17:06<00:00,  5.29it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1269/1269 [01:21<00:00, 15.60it/s]


NameError: name 'valid_loss' is not defined

In [24]:
print(f'\tTrain Loss: {train_loss:.3f}')
print(f'\tVal. Acc: {valid_acc*100:.2f}%')

	Train Loss: 0.653
	Val. Acc: 65.92%
