# Transformers Question Answering (Hugging Face method)

----
### Importing main libraries

In [54]:
import os
import pandas as pd
import numpy as np
import torch
import transformers
import matplotlib

### Downloading and managing the SQuAD dataset

In [5]:
# Create the local 'squad' directory. exist_ok=True keeps this cell re-runnable:
# os.mkdir raises FileExistsError when the directory is already there.
os.makedirs('squad', exist_ok=True)

In [6]:
# Base URL of the SQuAD files; the JSON file names are appended to it when downloading.
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/' # SQuAD base URL

In [55]:
import requests

In [12]:
# Download the SQuAD v2.0 train and dev files into the local 'squad' directory.

for file in ['train-v2.0.json', 'dev-v2.0.json']:
    # stream=True avoids holding the whole file in memory; the timeout stops a
    # stalled connection from hanging the cell forever.
    with requests.get(f'{url}{file}', stream=True, timeout=60) as res:
        # Fail loudly on an HTTP error instead of saving an error page as JSON.
        res.raise_for_status()
        with open(f'squad/{file}', 'wb') as f:
            # 4-byte chunks meant millions of tiny writes; 8 KiB chunks are far cheaper.
            for chunk in res.iter_content(chunk_size=8192):
                f.write(chunk)

### Data Pre-processing

In [25]:
import json

In [26]:
# Extracting contexts, questions and answers from squad dataset

def read_squad(path):
    """Load a SQuAD-format JSON file and flatten it into three parallel lists.

    Every answer entry yields one (context, question, answer) triple, so a
    question with several answers appears several times. When a question has
    a 'plausible_answers' key, those entries are used instead of 'answers'.

    Returns:
        (contexts, questions, answers): three lists of equal length.
    """
    with open(path, 'rb') as handle:
        raw = json.load(handle)

    contexts, questions, answers = [], [], []

    for article in raw['data']:
        for paragraph in article['paragraphs']:
            ctx = paragraph['context']
            for item in paragraph['qas']:
                qst = item['question']
                key = 'plausible_answers' if 'plausible_answers' in item else 'answers'
                entries = item[key]
                # Repeat the context/question once per answer entry.
                contexts.extend([ctx] * len(entries))
                questions.extend([qst] * len(entries))
                answers.extend(entries)

    return contexts, questions, answers

In [27]:
# Flatten the downloaded train and dev files into parallel lists of
# contexts, questions and answers (one entry per answer).

train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [28]:
# Display one training context as a quick sanity check of the extracted data

train_contexts[1]

SyntaxError: invalid syntax (3211899444.py, line 3)

### Tokenization

In [4]:
# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [30]:
# Tokenize every (question, context) pair of the train and val datasets.
# truncation="only_second" shortens only the context, never the question.
# return_offsets_mapping=True keeps each token's character offsets, which
# add_token_positions later uses to locate the answer tokens.

train_encodings = tokenizer(train_questions, train_contexts, return_offsets_mapping=True, truncation="only_second", padding=True)
val_encodings = tokenizer(val_questions, val_contexts, return_offsets_mapping=True, truncation="only_second", padding=True)

In [31]:
# Show which fields the tokenizer produced
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping'])

In [32]:
# Remove the offset mappings from the encodings and keep them separately:
# they are only needed to compute the answer token positions below.

train_offset_mapping = train_encodings.pop("offset_mapping")
val_offset_mapping = val_encodings.pop("offset_mapping")

In [33]:
# Adds start and end positions to the tokenized text to prepare it for finetune

def add_token_positions(encodings, answers, offset_mapping):
    """Add answer token spans to ``encodings`` as training labels.

    For every example the answer's character span (``answer_start`` up to
    ``answer_start + len(text)``) is converted into the indices of the first
    and last context tokens that cover it, using that example's offsets.

    Args:
        encodings: tokenizer output; must provide ``sequence_ids(i)`` and
            ``update(...)``. It is modified in place.
        answers: one dict per example with ``"answer_start"`` and ``"text"``.
        offset_mapping: per example, one ``(char_start, char_end)`` pair for
            every token.

    Returns:
        None. ``'start_positions'`` and ``'end_positions'`` are added to
        ``encodings``. Examples whose answer is not fully contained in the
        (possibly truncated) context are labelled ``(0, 0)``.
    """
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])
        sequence_ids = encodings.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # The bounds check covers a context that runs up to the very last token.
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # The answer must lie entirely inside the context. Comparing the context
        # start with start_char and the context end with end_char also catches
        # answers that were cut in half by truncation; otherwise such examples
        # would get an end label on the token right after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token whose start offset is <= start_char.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) whose end offset is >= end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })

# Adds 'start_positions' and 'end_positions' to both encodings in place
add_token_positions(train_encodings, train_answers, train_offset_mapping)
add_token_positions(val_encodings, val_answers, val_offset_mapping)

In [34]:
# Check the keys again to confirm that the start and end positions were added
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [35]:
#Creates datasets from a dictionary

import torch

class SquadDataset(torch.utils.data.Dataset):
    """Dataset view over tokenized question-answering encodings.

    ``encodings`` maps field names (it must include ``'input_ids'``) to
    per-example sequences. Both tokenizer outputs and plain dicts are accepted.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the ``'input_ids'`` field."""
        # Key lookup instead of attribute access, so a plain dict works as well.
        return len(self.encodings['input_ids'])

In [36]:
# Wrap the train and val encodings so a DataLoader can batch them
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

### Fine-tuning

In [25]:
# Load DistilBERT with a (PyTorch) question-answering head.
# The class used below has to be the one that is imported: the previous
# TensorFlow import left DistilBertForQuestionAnswering undefined.
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [37]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [38]:
# Check whether a CUDA GPU is available for training
import torch
torch.cuda.is_available()

True

In [43]:
# Pick the training device (GPU when available, otherwise CPU), move the model
# there, switch it to training mode and create the optimizer.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to mirror that class's defaults, because
# torch's own defaults differ.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [40]:
# Training batches: 16 examples each, with shuffling enabled
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [30]:
#Training model

for epoch in range(3):  # number of passes over the training set
    loop = tqdm(train_loader, leave=True)  # progress bar over the batches
    for batch in loop:
        # Move the four tensors the model needs onto the training device.
        model_inputs = {
            name: batch[name].to(device)
            for name in ('input_ids', 'attention_mask',
                         'start_positions', 'end_positions')
        }

        # Clear gradients left over from the previous step.
        optim.zero_grad()

        # Passing the start/end positions makes the model return its loss first.
        outputs = model(**model_inputs)
        loss = outputs[0]

        # Backpropagate and update the weights.
        loss.backward()
        optim.step()

        # Show the current epoch and batch loss on the progress bar.
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 8145/8145 [53:59<00:00,  2.51it/s, loss=1.13] 
Epoch 1: 100%|██████████| 8145/8145 [53:31<00:00,  2.54it/s, loss=0.61] 
Epoch 2: 100%|██████████| 8145/8145 [53:35<00:00,  2.53it/s, loss=0.568] 


In [31]:
# Save the fine-tuned model and its tokenizer to the same local directory,
# so both can be reloaded together from that path.

model_path = 'model/distilbert-custom-huggingface'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('model/distilbert-custom-huggingface\\tokenizer_config.json',
 'model/distilbert-custom-huggingface\\special_tokens_map.json',
 'model/distilbert-custom-huggingface\\vocab.txt',
 'model/distilbert-custom-huggingface\\added_tokens.json',
 'model/distilbert-custom-huggingface\\tokenizer.json')

## Model Evaluation

In [41]:
# Reload the fine-tuned model and tokenizer from the local directory

from transformers import AutoModelForQuestionAnswering

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('model/distilbert-custom-huggingface')

model = AutoModelForQuestionAnswering.from_pretrained('model/distilbert-custom-huggingface')

# Set the model to evaluation mode (as the last expression, this also displays the model)
model.eval()

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [44]:
# Evaluate the model on the validation set: per-batch exact-match accuracy of
# the predicted start and end token positions.

val_loader = DataLoader(val_dataset, batch_size=16)

# A freshly reloaded model lives on the CPU, while the batches below are moved
# to `device`. Put the model on the same device and make sure it is in
# evaluation mode (dropout off) before predicting.
model.to(device)
model.eval()

acc = []

# Progress bar over the validation batches
loop = tqdm(val_loader, leave=True)
for batch in loop:

    # No gradients are needed for evaluation.
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        # Predicted position = token with the highest logit
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Fraction of exact matches in this batch, for start and end separately
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())

100%|██████████| 1640/1640 [03:52<00:00,  7.05it/s]


In [52]:
sum(acc)/len(acc) # Total accuracy: mean of the per-batch start and end accuracies

0.6082888719512195

In [49]:
start_true # Expected start positions of the last evaluated batch

tensor([173, 173, 173, 173,  11,  34,  62, 111], device='cuda:0')

In [50]:
start_pred # Predicted start positions of the last evaluated batch

tensor([ 82,  82, 193, 186,  11,  33,  61, 110], device='cuda:0')

## Running the Application

In [6]:
# Load the custom fine-tuned model and tokenizer from the local directory

from transformers import AutoModelForQuestionAnswering

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('model/distilbert-custom-huggingface')# replace with your model directory

model = AutoModelForQuestionAnswering.from_pretrained('model/distilbert-custom-huggingface')# replace with your model directory

In [7]:
from transformers import pipeline

In [8]:
# Build the question-answering pipeline from the `model` and `tokenizer`
# already loaded from the model directory, instead of reading both from disk again.
question_answerer = pipeline("question-answering",
                             model=model,
                             tokenizer=tokenizer)

In [9]:
import tkinter as tk
from tkinter import ttk

In [12]:
# Create the main window: 800x500 pixels, with the application title
root = tk.Tk()
root.geometry("800x500")
root.title("Bert Question Answering")

# Use the "clam" ttk theme

style = ttk.Style(root)
style.theme_use("clam")



#Model input/output 
def Take_input():
    """Answer the question typed in the GUI and show the result.

    Reads the context and question text boxes, runs the question-answering
    pipeline on them and writes the answer into the answer text box. When
    either box is empty, a hint is shown instead of calling the pipeline,
    which rejects empty input.
    """
    # "end-1c" drops the trailing newline that a Text widget always appends.
    context = textbox.get("1.0", "end-1c")
    question = textbox2.get("1.0", "end-1c")

    if not context.strip() or not question.strip():
        answer_text = "Please enter both a context and a question."
    else:
        final_answer = question_answerer(question=question, context=context)
        answer_text = final_answer['answer']

    # Replace whatever the answer box currently shows.
    textbox3.delete("1.0", tk.END)
    textbox3.insert(tk.END, answer_text)

    
# GUI objects, packed top to bottom in the order they are created

## Context label
label = tk.Label(root, text="Context", font=('Arial',11))
label.pack(padx=10, pady=10, anchor="w")

## Context textbox (read by Take_input)
textbox = tk.Text(root, height=4, font=('Arial',10))
textbox.pack(padx=10, pady=10, anchor="w", expand=True, fill=tk.BOTH)

## Question label
label2 = tk.Label(root, text="Question", font=('Arial',11))
label2.pack(padx=10, pady=10, anchor="w")


## Question textbox (read by Take_input)
textbox2 = tk.Text(root, height=2, font=('Arial',10))
textbox2.pack(padx=10, pady=10, anchor="w", expand=True, fill=tk.BOTH)

## Enter button: runs Take_input when clicked
button = tk.Button(root, text="Enter", height=2, width=11, font=('Arial', 10) , command=lambda:Take_input())
button.pack(padx=10, pady=10,  anchor="e")

## Answer label
label3 = tk.Label(root, text="Answer", font=('Arial',11))
label3.pack(padx=10, pady=10, anchor="w")

## Answer textbox (written by Take_input)
textbox3 = tk.Text(root, height=3, font=('Arial',10))
textbox3.pack(padx=10, pady=10, anchor="w", expand=True, fill=tk.BOTH)


# Start the Tk event loop; this call blocks until the window is closed
root.mainloop()