Question Answering using Transformers

In [1]:
# import the dataset

import pandas as pd

df = pd.read_json('train-v1.1.json')

In [93]:
# data format in json file: {data: {title, paragraphs: [{context, qas: [{ answers: [{answer_start, text}], questions, id}]}]}}

In [2]:
# normalizing/flattening the dataset (preprocessing part 1)

flat_rows = []

for item in df['data']:
    for para in item['paragraphs']:
        context = para['context']
        for qa in para['qas']:
            question = qa['question']
            qa_id = qa['id']
            answers = qa['answers']
            if len(answers) > 0:
                answer = answers[0]
                flat_rows.append({
                    'id': qa_id,
                    'context': context,
                    'question': question,
                    'answer': answer['text'],
                    'answer_start': answer['answer_start'],
                    'answer_end': answer['answer_start'] + len(answer['text'])
                })

df = pd.DataFrame(flat_rows)
print(df.head())

                         id  \
0  5733be284776f41900661182   
1  5733be284776f4190066117f   
2  5733be284776f41900661180   
3  5733be284776f41900661181   
4  5733be284776f4190066117e   

                                             context  \
0  Architecturally, the school has a Catholic cha...   
1  Architecturally, the school has a Catholic cha...   
2  Architecturally, the school has a Catholic cha...   
3  Architecturally, the school has a Catholic cha...   
4  Architecturally, the school has a Catholic cha...   

                                            question  \
0  To whom did the Virgin Mary allegedly appear i...   
1  What is in front of the Notre Dame Main Building?   
2  The Basilica of the Sacred heart at Notre Dame...   
3                  What is the Grotto at Notre Dame?   
4  What sits on top of the Main Building at Notre...   

                                    answer  answer_start  answer_end  
0               Saint Bernadette Soubirous           515         541

In [3]:
# convert dataset to huggingface dataset

from datasets import Dataset

dataset = Dataset.from_pandas(df)

In [4]:
# tokenize and align the data (preprocessing part 2)

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')

def tokenize_and_align(dataset):
    tokenized_inputs = tokenizer(
        dataset['question'],
        dataset['context'],
        truncation = 'only_second',
        max_length = 384,
        stride = 128,
        padding = 'max_length',
        return_overflowing_tokens = False,
        return_offsets_mapping = True,
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized_inputs['offset_mapping']):
        answer_start = dataset['answer_start'][i]
        answer_end = dataset['answer_end'][i]
        sequence_ids = tokenized_inputs.sequence_ids(i)

        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        if not (answer_start >= offsets[context_start][0] and answer_end <= offsets[context_end][1]):
            start_positions.append(tokenizer.cls_token_id)
            end_positions.append(tokenizer.cls_token_id)
        else:
            for idx in range(context_start, context_end + 1):
                if offsets[idx][0] <= answer_start < offsets[idx][1]:
                    start_positions.append(idx)
                    break
            for idx in range(context_end, context_start - 1, -1):
                if offsets[idx][0] < answer_end <= offsets[idx][1]:
                    end_positions.append(idx)
                    break

    tokenized_inputs['start_positions'] = start_positions
    tokenized_inputs['end_positions'] = end_positions

    tokenized_inputs.pop('offset_mapping')
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

In [5]:
# load distilbert model

from transformers import AutoModelForQuestionAnswering

distilbert_model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

In [6]:
# setting up parameters/arguments for training

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./bert-qa-finetuned',
    overwrite_output_dir=True,
    learning_rate=2e-5,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=800,
    weight_decay=0.05,
    save_strategy='steps',
    save_steps=2000,
    logging_steps=100,
    fp16=True,
    gradient_accumulation_steps=2,
    report_to='none',
    eval_strategy='no',
)


In [7]:
# fine-tuning the model

from transformers import Trainer

trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    eval_dataset=tokenized_dataset,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
100,0.6978
200,0.6697
300,0.6678
400,0.6435
500,0.6907
600,0.6787
700,0.6997
800,0.7273
900,0.7006
1000,0.685


TrainOutput(global_step=1369, training_loss=0.6853472289593225, metrics={'train_runtime': 755.1734, 'train_samples_per_second': 115.999, 'train_steps_per_second': 1.813, 'total_flos': 8583810682277376.0, 'train_loss': 0.6853472289593225, 'epoch': 1.0})

In [8]:
# save the fine-tuned model

trainer.save_model("./bert-qa-finetuned")

In [41]:
# model preparation for evaluation

import torch

def load_model(model_path, tokenizer_path):
    model = AutoModelForQuestionAnswering.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model.eval()
    return model, tokenizer

model, tokenizer = load_model('./bert-qa-finetuned', 'distilbert-base-uncased-distilled-squad')

In [42]:
# feeding the model both context and question for finding the answer span

def predict_answer(question, context):
    inputs = tokenizer(question, context, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**inputs)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    tokens = inputs.input_ids[0][start_index:end_index+1]
    answer = tokenizer.decode(tokens, skip_special_tokens=True)
    return answer, start_index, end_index

question = 'Where is Pakistan located?'
context = 'Pakistan, officially the Islamic Republic of Pakistan, is a country in South Asia.'
answer, start_index, end_index = predict_answer(question, context)

print(f'Answer: {answer}\nAnswer Span: Start -> {start_index}, End -> {end_index}')

Answer: south asia
Answer Span: Start -> 20, End -> 21


In [43]:
# finding out the exact match and f1 score

from collections import Counter

def exact_match_score(prediction, actual):
    return prediction.strip().lower() == actual.strip().lower()

def f1_score(prediction, actual):
    pred_tokens = prediction.split()
    truth_tokens = actual.split()
    common = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)

def evaluate(questions, context, answers):
    total = len(questions)
    exact_match = 0
    f1 = 0

    for question, answer in zip(questions, answers):
        pred, _, _ = predict_answer(question, context)
        exact_match += exact_match_score(pred, answer)
        f1 += f1_score(pred.lower(), answer.lower())

        print(f'Q: {question}')
        print(f'Actual: {answer}')
        print(f'Predicted: {pred}\n')

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    print(f'Exact Match: {exact_match:.2f}%')
    print(f'F1 Score: {f1:.2f}%')

questions = [
    'What is the capital city of Pakistan?',
    'With which countries does Pakistan share borders?',
    'Where is Pakistan located?'
]

context = '''
      Pakistan, officially the Islamic Republic of Pakistan, is a country in South Asia.
      It shares borders with India, Afghanistan, Iran, and China.
      The capital city of Pakistan is Islamabad, which was built during the 1960s to replace Karachi as the capital.
      Islamabad is known for its high standard of living, safety, and abundant greenery.
      It hosts important government buildings, embassies, and cultural landmarks, making it the political and administrative center of the country.
    '''

answers = [
    'Islamabad',
    'India, Afghanistan, Iran, and China',
    'South Asia'
]

evaluate(questions, context, answers)

Q: What is the capital city of Pakistan?
Actual: Islamabad
Predicted: islamabad

Q: With which countries does Pakistan share borders?
Actual: India, Afghanistan, Iran, and China
Predicted: india, afghanistan, iran, and china

Q: Where is Pakistan located?
Actual: South Asia
Predicted: south asia

Exact Match: 100.00%
F1 Score: 100.00%


In [44]:
# bonus: comparing with tinybert

model, tokenizer = load_model('deepset/roberta-base-squad2', 'deepset/roberta-base-squad2')

evaluate(questions, context, answers)

Q: What is the capital city of Pakistan?
Actual: Islamabad
Predicted:  Islamabad

Q: With which countries does Pakistan share borders?
Actual: India, Afghanistan, Iran, and China
Predicted:  India, Afghanistan, Iran, and China

Q: Where is Pakistan located?
Actual: South Asia
Predicted:  South Asia

Exact Match: 100.00%
F1 Score: 100.00%


In [45]:
model, tokenizer = load_model('./bert-qa-finetuned', 'distilbert-base-uncased-distilled-squad')

while True:

    context = input('Enter new context: ')
    question = input('Enter new question: ')
    if context.lower() == 'exit' or question.lower() == 'exit':
        break

    answer, _, _ = predict_answer(question, context)
    print(f'Answer: {answer}\n')

Enter new context: Pakistan is a country, located in South Asia.
Enter new question: Where is Pakistan located?
Answer: south asia

Enter new context: Sundar Pichai is the CEO of Google.
Enter new question: Who is the CEO of Google?
Answer: sundar pichai

Enter new context: Clash Royale is a game by Supercell
Enter new question: What is Clash Royale?
Answer: a game by supercell

Enter new context: exit
Enter new question: exit
