# Question Answering with Transformers

This notebook walks through end-to-end extractive question answering on SQuAD with BERT/DistilBERT, covering the following steps:
- Load the dataset
- Preprocess and tokenize
- Train / evaluate (Exact Match & F1)
- Run inference: passage + question → answer span

## 1. Setup and imports

In [1]:
import os
import sys
import json

import pandas as pd
from tqdm import tqdm

# Resolve the project root: when the working directory is named `src`, the
# root is its parent; otherwise the working directory itself is the root.
_cwd = os.getcwd()
if os.path.basename(_cwd) == 'src':
    ROOT = os.path.abspath(os.path.join(_cwd, '..'))
else:
    ROOT = _cwd
src_path = os.path.join(ROOT, 'src')

# Put both locations at the front of sys.path (src_path ends up ahead of ROOT)
# so the project modules imported below can be resolved.
for _entry in (ROOT, src_path):
    if _entry not in sys.path:
        sys.path.insert(0, _entry)

from data_loader import load_squad_data
from preprocess import prepare_train_features, squad_metrics, normalize_answer

  from .autonotebook import tqdm as notebook_tqdm


## 2. Load SQuAD data

In [3]:
# Read the SQuAD train/dev splits from the project's data directory.
data_dir = os.path.join(ROOT, 'data')
train_examples, dev_examples = load_squad_data(data_dir=data_dir)

# Report the size of each split, then show one training example.
print(
    f'Train examples: {len(train_examples)}',
    f'Dev examples   : {len(dev_examples)}',
    '\nSample example:',
    sep='\n',
)

ex = train_examples[0]
context_preview = ex['context'][:200]
print('Context (first 200 chars):', context_preview, '...')
print('Question:', ex['question'])
first_answer = ex['answers'][0]
print('Answer  :', first_answer['text'])

Train examples: 87599
Dev examples   : 10570

Sample example:
Context (first 200 chars): Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper sta ...
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer  : Saint Bernadette Soubirous


## 3. Tokenizer and preprocessing

In [4]:
from transformers import AutoTokenizer

# Checkpoint name reused by the later cells, and its matching tokenizer.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run the preprocessing on the first two training examples and inspect the result.
batch = train_examples[:2]
out = prepare_train_features(
    batch,
    tokenizer,
    max_length=384,
    doc_stride=128,
)

print('Returned keys:', [*out.keys()])
print('Number of tokenized samples (with overflow):', len(out['input_ids']))

# Answer span boundaries produced for each tokenized sample.
for label, key in (
    ('start_positions:', 'start_positions'),
    ('end_positions  :', 'end_positions'),
):
    print(label, out[key])

Returned keys: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
Number of tokenized samples (with overflow): 2
start_positions: [130, 52]
end_positions  : [137, 56]


## 4. Load model and train on a small subset

In [5]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, default_data_collator
from datasets import Dataset
import torch

from train import build_dataset, run_evaluation

# Keep the experiment quick: only the first examples of each split are used.
n_train, n_eval = 200, 50
train_small, dev_small = train_examples[:n_train], dev_examples[:n_eval]

# Question-answering model loaded from the `model_name` checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Tokenized training set built from the small training subset.
train_dataset = build_dataset(
    train_small,
    tokenizer,
    max_length=384,
    doc_stride=128,
    batch_size=8,
)

print(f'Train dataset size after tokenization: {len(train_dataset)}')

Loading weights: 100%|██████████| 100/100 [00:00<00:00, 319.32it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
DistilBertForQuestionAnswering LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
qa_outputs.bias         | MISSING    | 
qa_outputs.weight       | MISSING    | 

Notes:
- UNEXPECTED: can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING: those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
Tokenizing: 100%|██████████| 25/25 [00:00<00:00, 134.40it/s]

Train dataset size after tokenization: 200





In [None]:
import math

from transformers import DataCollatorWithPadding

# Train and save to the same folder that the Flask app expects: ROOT/outputs/final
output_root = os.path.join(ROOT, 'outputs')
os.makedirs(output_root, exist_ok=True)

num_train_epochs = 2
per_device_train_batch_size = 8

# transformers reports `warmup_ratio` as deprecated in favour of `warmup_steps`,
# so the 10% linear warmup is passed as an explicit step count instead.
# NOTE(review): the count assumes a single device and no gradient accumulation
# (one optimizer step per batch) -- adjust if either changes.
steps_per_epoch = math.ceil(len(train_dataset) / per_device_train_batch_size)
total_train_steps = steps_per_epoch * num_train_epochs
warmup_steps = math.ceil(total_train_steps / 10)

training_args = TrainingArguments(
    output_dir=output_root,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    learning_rate=5e-5,
    warmup_steps=warmup_steps,
    logging_steps=10,
    save_strategy='epoch',
    # Mixed precision is only enabled when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

trainer.train()

# Save the model and tokenizer side by side so the directory can be loaded
# on its own by the inference code.
final_dir = os.path.join(output_root, 'final')
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


Step,Training Loss
10,5.762893


## 5. Evaluation: Exact Match and F1

In [None]:
# Evaluate on the GPU when one is available, otherwise on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

# Score the model on the small dev subset.
metrics = run_evaluation(
    model,
    tokenizer,
    dev_small,
    device,
    max_length=384,
    doc_stride=128,
    batch_size=8,
)

for label, key in (('Exact Match:', 'exact_match'), ('F1 score   :', 'f1')):
    print(label, metrics[key])

## 6. Inference: passage + question → answer

In [None]:
from inference import QAInference

# Location where the training cell saves the fine-tuned model
# (ROOT/outputs/final). The path is built from ROOT rather than from the
# training cell's `output_root` so this cell also runs when training was
# skipped, which is the case the fallback below exists for.
model_path = os.path.join(ROOT, 'outputs', 'final')
if os.path.isdir(model_path):
    qa = QAInference(model_path=model_path)
else:
    # No saved model on disk: fall back to the base `model_name` checkpoint.
    # NOTE(review): the load report for that checkpoint lists the QA head
    # weights as newly initialized, so answers from this path are presumably
    # not meaningful -- confirm how QAInference loads it.
    qa = QAInference(model_name=model_name)

# Use the first dev example as the demo passage/question pair.
sample = dev_examples[0]
context = sample['context']
question = sample['question']
answer_text, score, start_char, end_char = qa.predict(question, context)

print('Question     :', question)
print('Predicted ans:', answer_text)
print('Score        :', score)
print('Ground truth :', sample['answers'][0]['text'])