<div align="center"> <h1 align="center"> Chatbot Assignment: QA Chatbot with WikiQA </h1> </div>

<div align="center"> <h3 align="center"> December 12, 2021 </h3> </div>
<div align="center"> <h3 align="center"> Maria Aroca </h3> </div>
<br />

#### Code Adapted from
https://towardsdatascience.com/how-to-fine-tune-a-q-a-transformer-86f91ec92997
https://colab.research.google.com/github/fastforwardlabs/ff14_blog/blob/master/_notebooks/2020-05-19-Getting_Started_with_QA.ipynb?pli=1&authuser=1#scrollTo=bgYVkF2RmHPK

In [1]:
# Load WikiQA Dataset from huggingface library

from datasets import load_dataset

dataset = load_dataset("wiki_qa")

Using custom data configuration default
Reusing dataset wiki_qa (C:\Users\User\.cache\huggingface\datasets\wiki_qa\default\0.1.0\d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c)


  0%|          | 0/3 [00:00<?, ?it/s]

## Preparing the Data

In [2]:
#Extract correct answers from training and validation WikiQA dataset (only answers with label ==1)

train_questions = []
train_answers = []
train_q_ids = []

val_questions = []
val_answers = []
val_q_ids = []

for i in range(0, len(dataset['validation'])):
    label = dataset['validation']['label'][i]
    question = dataset['validation']['question'][i]
    answer = dataset['validation']['answer'][i]
    q_id = dataset['validation']['question_id'][i]
    if label==1:
        val_questions.append(question)
        val_answers.append(answer)
        val_q_ids.append(q_id)
        
for i in range(0, len(dataset['train'])):
    label = dataset['train']['label'][i]
    question = dataset['train']['question'][i]
    answer = dataset['train']['answer'][i]
    q_id = dataset['train']['question_id'][i]
    if label==1:
        train_questions.append(question)
        train_answers.append(answer)
        train_q_ids.append(q_id)

In [3]:
# Create 'context' by concatenating strings in possible answers from WikiQA

train_contexts = []
val_contexts = []

for q_id in val_q_ids:
    list_question_id = [i for i,_ in enumerate(dataset['validation']) if _['question_id'] == q_id]
    val_contexts.append(' '.join(dataset['validation']['answer'][list_question_id[0]:list_question_id[-1]+1]))
    
for q_id in train_q_ids:
    list_question_id = [i for i,_ in enumerate(dataset['train']) if _['question_id'] == q_id]
    train_contexts.append(' '.join(dataset['train']['answer'][list_question_id[0]:list_question_id[-1]+1]))

In [6]:
#Remove one problematic line (encoding problem)
del val_answers[32]
del val_questions[32]
del val_contexts[32]
del train_answers[314]
del train_questions[314]
del train_contexts[314]

In [9]:
import re
import pandas as pd
# Get start and end positions from answers in context
train_answer_start=[]
train_answer_end=[]
for i in range(0, len(train_answers)):
    for match in re.finditer(train_answers[i], train_contexts[i]):
        answer_start= match.start()
        answer_end = match.end()
    train_answer_start.append(answer_start)
    train_answer_end.append(answer_end)
        
val_answer_start=[]
val_answer_end=[]
for i in range(0, len(val_answers)):
    for match in re.finditer(val_answers[i], val_contexts[i]):
        answer_start= match.start()
        answer_end = match.end()
    val_answer_start.append(answer_start)
    val_answer_end.append(answer_end)
    
train_answers2 = pd.DataFrame(list(zip(train_answers, train_answer_start, train_answer_end)),
               columns =['text', 'answer_start', 'answer_end']).to_dict(orient='records')
val_answers2 = pd.DataFrame(list(zip(val_answers, val_answer_start, val_answer_end)),
               columns =['text', 'answer_start', 'answer_end']).to_dict(orient='records')

## Encoding

In [25]:
# Fine-tuning on https://huggingface.co/deepset/roberta-base-squad2
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2") 
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

In [26]:
def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        # end position cannot be found, char_to_token found space, so shift position until found
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# apply function to our data
add_token_positions(train_encodings, train_answers2)
add_token_positions(val_encodings, val_answers2)

In [27]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [28]:
import torch

class WikiQADataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# build datasets for both our training and validation sets
train_dataset = WikiQADataset(train_encodings)
val_dataset = WikiQADataset(val_encodings)

## Fine-Tuning

In [32]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# setup GPU/CPU
device =  torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model
model.train()
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(model.parameters(), lr=5e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    # set model to train mode
    model.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # extract loss
        loss = outputs[0]
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 65/65 [21:44<00:00, 20.08s/it, loss=3.01]
Epoch 1: 100%|██████████| 65/65 [21:36<00:00, 19.95s/it, loss=3.21]
Epoch 2: 100%|██████████| 65/65 [21:35<00:00, 19.93s/it, loss=1.96] 


In [33]:
# Save Model
model_path = 'models/roberta-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('models/roberta-custom\\tokenizer_config.json',
 'models/roberta-custom\\special_tokens_map.json',
 'models/roberta-custom\\vocab.json',
 'models/roberta-custom\\merges.txt',
 'models/roberta-custom\\added_tokens.json',
 'models/roberta-custom\\tokenizer.json')

## Model Evaluation

In [34]:
# Load Model
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [35]:
start_pred = torch.argmax(outputs['start_logits'], dim=1)
end_pred = torch.argmax(outputs['end_logits'], dim=1)

In [36]:
# switch model out of training mode
model.eval()
# initialize validation set data loader
val_loader = DataLoader(val_dataset, batch_size=16)
# initialize list to store accuracies
acc = []
# loop through batches
for batch in val_loader:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # we will use true positions for accuracy calc
        start_true = batch['start_positions']
        end_true = batch['end_positions']
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # pull prediction tensors out and argmax to get predicted tokens
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # calculate accuracy for both and append to accuracy list
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
# calculate average accuracy in total
acc = sum(acc)/len(acc)

In [37]:
acc

0.46496212151315475

In [43]:
print("T/F\tstart\tend\n")
for i in range (len(start_true)):
    print(f"true\t{start_true[i]}\t{end_true[i]}\n"
         f"pred\t{start_pred[i]}\t{end_pred[i]}\n")

T/F	start	end

true	1	32
pred	1	32

true	131	165
pred	151	165

true	1	24
pred	1	24

true	1	38
pred	45	155

true	36	57
pred	36	57

true	31	56
pred	1	11

true	1	58
pred	1	58

true	1	30
pred	1	115

true	70	121
pred	70	121

true	8	39
pred	1	39

true	5	36
pred	46	89



## Test

In [48]:
model = AutoModelForQuestionAnswering.from_pretrained(model_path, return_dict=False)
tokenizer = AutoTokenizer.from_pretrained(model_path)
outputs = model(**inputs)

In [64]:
question = "how are glacier caves formed?"

context = """A partly submerged glacier cave on Perito Moreno Glacier . 
The ice facade is approximately 60 m high Ice formations in the Titlis glacier cave. A glacier cave is a 
cave formed within the ice of a glacier . Glacier caves are often called ice caves , 
but this term is properly used to describe bedrock caves that contain year-round ice."""


# 1. TOKENIZE THE INPUT
# note: if you don't include return_tensors='pt' you'll get a list of lists which is easier for 
# exploration but you cannot feed that into a model. 
inputs = tokenizer.encode_plus(question, context, return_tensors="pt") 

# 2. OBTAIN MODEL SCORES
# the AutoModelForQuestionAnswering class includes a span predictor on top of the model. 
# the model returns answer start and end scores for each word in the text
answer_start_scores, answer_end_scores = model(**inputs)
answer_start = torch.argmax(answer_start_scores)  # get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # get the most likely end of answer with the argmax of the score

# 3. GET THE ANSWER SPAN
# once we have the most likely start and end tokens, we grab all the tokens between them
# and convert tokens back to words!
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

In [67]:
print(question)
print(answer)

how are glacier caves formed?
 A glacier cave is a 
cave formed within the ice of a glacier .


In [68]:
question = "how much is 1 tablespoon of water?"

context = """This tablespoon has a capacity of about 15 mL. 
Measuring Spoons In the US and parts of Canada, a tablespoon is the largest type of spoon used for eating from a bowl. 
In the UK, Europe and most Commonwealth countries, a tablespoon is a type of large spoon usually used for serving. 
In countries where a tablespoon is a serving spoon, the nearest equivalent to the US tablespoon is either the dessert spoon or the soup spoon . 
A tablespoonful, nominally the capacity of one tablespoon, is commonly used as a measure of volume in cooking . It is abbreviated as T, tb, tbs, tbsp, tblsp, or tblspn. The capacity of ordinary tablespoons is not regulated by law and is subject to considerable variation. In the USA one tablespoon (measurement unit) is approximately 15 mL; the capacity of an actual tablespoon (dining utensil) ranges from 7 mL to 14 mL. 
In Australia one tablespoon (measurement unit) is 20 mL."""


# 1. TOKENIZE THE INPUT
# note: if you don't include return_tensors='pt' you'll get a list of lists which is easier for 
# exploration but you cannot feed that into a model. 
inputs = tokenizer.encode_plus(question, context, return_tensors="pt") 

# 2. OBTAIN MODEL SCORES
# the AutoModelForQuestionAnswering class includes a span predictor on top of the model. 
# the model returns answer start and end scores for each word in the text
answer_start_scores, answer_end_scores = model(**inputs)
answer_start = torch.argmax(answer_start_scores)  # get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # get the most likely end of answer with the argmax of the score

# 3. GET THE ANSWER SPAN
# once we have the most likely start and end tokens, we grab all the tokens between them
# and convert tokens back to words!
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

In [71]:
print('question:',question)
print('answer:',answer)

question: how much is 1 tablespoon of water?
answer: This tablespoon has a capacity of about 15 mL.


In [60]:
train_questions[2]

'how much is 1 tablespoon of water'

In [61]:
train_contexts[2]

'This tablespoon has a capacity of about 15 mL. Measuring Spoons In the US and parts of Canada, a tablespoon is the largest type of spoon used for eating from a bowl. In the UK, Europe and most Commonwealth countries, a tablespoon is a type of large spoon usually used for serving. In countries where a tablespoon is a serving spoon, the nearest equivalent to the US tablespoon is either the dessert spoon or the soup spoon . A tablespoonful, nominally the capacity of one tablespoon, is commonly used as a measure of volume in cooking . It is abbreviated as T, tb, tbs, tbsp, tblsp, or tblspn. The capacity of ordinary tablespoons is not regulated by law and is subject to considerable variation. In the USA one tablespoon (measurement unit) is approximately 15 mL; the capacity of an actual tablespoon (dining utensil) ranges from 7 mL to 14 mL. In Australia one tablespoon (measurement unit) is 20 mL.'

In [77]:
question = "how are cholera and typhus transmitted and prevented?"

context = """Cholera is an infection in the small intestine caused by the bacterium Vibrio cholerae . 
The main symptoms are watery diarrhea and vomiting . Transmission occurs primarily by drinking water or eating food 
that has been contaminated by the feces (waste product) of an infected person, including one with no apparent symptoms. 
The severity of the diarrhea and vomiting can lead to rapid dehydration and electrolyte imbalance, and death in some cases. 
The primary treatment is oral rehydration therapy , typically with oral rehydration solution , to replace water and electrolytes. If this is not tolerated or does not provide improvement fast enough, intravenous fluids can also be used. Antibacterial drugs are beneficial in those with severe disease to shorten its duration and severity. Worldwide, it affects 3–5 million people and causes 100,000–130,000 deaths a year . 
Cholera was one of the earliest infections to be studied by epidemiological methods."""


# 1. TOKENIZE THE INPUT
# note: if you don't include return_tensors='pt' you'll get a list of lists which is easier for 
# exploration but you cannot feed that into a model. 
inputs = tokenizer.encode_plus(question, context, return_tensors="pt") 

# 2. OBTAIN MODEL SCORES
# the AutoModelForQuestionAnswering class includes a span predictor on top of the model. 
# the model returns answer start and end scores for each word in the text
answer_start_scores, answer_end_scores = model(**inputs)
answer_start = torch.argmax(answer_start_scores)  # get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # get the most likely end of answer with the argmax of the score

# 3. GET THE ANSWER SPAN
# once we have the most likely start and end tokens, we grab all the tokens between them
# and convert tokens back to words!
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

In [78]:
print('question:',question)
print('answer:',answer)

question: how are cholera and typhus transmitted and prevented?
answer:  Transmission occurs primarily by drinking water or eating food 


In [39]:
import wikipedia as wiki
import pprint as pp

In [99]:
question = "how old was monica lewinsky during the affair?"

context = """Monica Samille Lewinsky (born July 23, 1973) is an American woman with whom United States President Bill Clinton 
admitted to having had an "improper relationship" while she worked at the White House in 1995 and 1996. 
The affair and its repercussions (which included Clinton\'s impeachment ) became known as the Lewinsky scandal ."""


# 1. TOKENIZE THE INPUT
# note: if you don't include return_tensors='pt' you'll get a list of lists which is easier for 
# exploration but you cannot feed that into a model. 
inputs = tokenizer.encode_plus(question, context, return_tensors="pt") 

# 2. OBTAIN MODEL SCORES
# the AutoModelForQuestionAnswering class includes a span predictor on top of the model. 
# the model returns answer start and end scores for each word in the text
answer_start_scores, answer_end_scores = model(**inputs)
answer_start = torch.argmax(answer_start_scores)  # get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # get the most likely end of answer with the argmax of the score

# 3. GET THE ANSWER SPAN
# once we have the most likely start and end tokens, we grab all the tokens between them
# and convert tokens back to words!
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

In [100]:
print('question:',question)
print('answer:',answer)

question: how old was monica lewinsky during the affair?
answer: Monica Samille Lewinsky (born July 23, 1973) is an American woman with whom United States President Bill Clinton 
admitted to having had an "improper relationship" while she worked at the White House in 1995 and 1996.


In [127]:
import wikipedia as wiki
import pprint as pp

question = 'Why is the sky blue?'
results = wiki.search(question)
page = wiki.page(results[0])
text = page.content

In [128]:
context = text[:model.config.max_position_embeddings]


# 1. TOKENIZE THE INPUT
# note: if you don't include return_tensors='pt' you'll get a list of lists which is easier for 
# exploration but you cannot feed that into a model. 
inputs = tokenizer.encode_plus(question, context, return_tensors="pt") 

# 2. OBTAIN MODEL SCORES
# the AutoModelForQuestionAnswering class includes a span predictor on top of the model. 
# the model returns answer start and end scores for each word in the text
answer_start_scores, answer_end_scores = model(**inputs)
answer_start = torch.argmax(answer_start_scores)  # get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # get the most likely end of answer with the argmax of the score

# 3. GET THE ANSWER SPAN
# once we have the most likely start and end tokens, we grab all the tokens between them
# and convert tokens back to words!
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

In [129]:
print('question:',question)
print('answer:',answer)

question: Why is the sky blue?
answer: Diffuse sky radiation is solar radiation reaching the Earth's surface after having been scattered from the direct solar beam by molecules or particulates in the atmosphere.


In [143]:
question = 'How did Edgar Allan Poe die?'
results = wiki.search(question)
page = wiki.page(results[0])
text = page.content
context = text[:model.config.max_position_embeddings]
inputs = tokenizer.encode_plus(question, context, return_tensors="pt") 
answer_start_scores, answer_end_scores = model(**inputs)
answer_start = torch.argmax(answer_start_scores)  # get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # get the most likely end of answer with the argmax of the score
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

In [144]:
print('question:',question)
print('answer:',answer)

question: How did Edgar Allan Poe die?
answer:  He was taken to the Washington College Hospital, where he died at 5 a.m. on Sunday, October 7.


In [145]:
train_questions[300]

'what does the president of the usa do'

In [146]:
question = 'what does the president of the usa do?'
results = wiki.search(question)
page = wiki.page(results[0])
text = page.content
context = text[:model.config.max_position_embeddings]
inputs = tokenizer.encode_plus(question, context, return_tensors="pt") 
answer_start_scores, answer_end_scores = model(**inputs)
answer_start = torch.argmax(answer_start_scores)  # get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # get the most likely end of answer with the argmax of the score
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

In [147]:
print('question:',question)
print('answer:',answer)

question: what does the president of the usa do?
answer:  The president directs the executive branch of the federal government and is the commander-in-chief of the United States Armed Forces.
