# Question Answering with BERT (SQuAD 1.1)

This notebook walks through exploratory data analysis (EDA), preprocessing, fine-tuning BERT, evaluation, and inference on SQuAD 1.1.
It uses the Hugging Face `datasets` and `transformers` libraries, along with helper utilities in `src/`.

In [None]:
# Optional one-time setup: uncomment to install the project's dependencies.
# !pip install -r ../requirements.txt
import os
import sys

# The project root is the parent of the current working directory. It goes at
# the front of sys.path so that the `src` package can be imported.
PROJECT_ROOT = os.path.abspath(os.pardir)
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)
print('Project root:', PROJECT_ROOT)

In [None]:
from src.data import load_squad, question_type_distribution, answer_length_distribution, sample_qas
from src.preprocess import prepare_train_features, prepare_validation_features
from src.train import train
from src.infer import QAInference
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style to the plots in this notebook.
# set_theme() is the preferred interface; sns.set() is only an alias for it
# that seaborn documents as possibly being removed in the future.
sns.set_theme(style='whitegrid')

# Load SQuAD 1.1 through the project helper. The result is indexed by split
# name in later cells (ds['train']).
ds = load_squad('1.1')
ds

## EDA: Question types and answer length distribution

In [None]:
# Number of most frequent question types to show in the table.
TOP_N_QUESTION_TYPES = 15

# question_type_distribution() returns a mapping (it is iterated with .items());
# the table below labels its keys 'Question_First_Token' and its values 'Count'.
qd = question_type_distribution(ds['train'])

# Rank the (key, count) pairs by count, highest first, and keep the top N.
qd_sorted = sorted(qd.items(), key=lambda kv: kv[1], reverse=True)[:TOP_N_QUESTION_TYPES]

# pandas is already imported as `pd` in the notebook's imports cell, so it is
# not re-imported here.
pd.DataFrame(qd_sorted, columns=['Question_First_Token', 'Count'])

In [None]:
# Plot the answer-length distribution of the training split
# (x: answer length in words, y: frequency, as labelled on the axes).
al = answer_length_distribution(ds['train'])
# Sort by index so the x-axis runs in increasing order.
al_series = pd.Series(al).sort_index()

fig, ax = plt.subplots(figsize=(8, 4))
sns.lineplot(x=al_series.index, y=al_series.values, ax=ax)
ax.set_title('Answer length distribution (first answer words)')
ax.set_xlabel('Answer length (words)')
ax.set_ylabel('Frequency')
plt.show()

## Print answers for 5 context-question pairs (ground truth)

In [None]:
# Print sampled (context, question, answers) triples from the training split;
# sample_qas() is asked for n=5 of them and examples are numbered from 1.
separator = '-'*80
for i, (passage, query, gold_answers) in enumerate(sample_qas(ds['train'], n=5), start=1):
    print(f'Example {i}')
    print('Question:', query)
    print('Answer(s):', gold_answers)
    # Keep the output compact: first 200 characters, newlines flattened to spaces.
    snippet = passage[:200].replace('\n', ' ') + '...'
    print('Context snippet:', snippet)
    print(separator)

## Fine-tune BERT on a small subset (for quick demo)

In [None]:
# Directory handed to train() as output_dir.
model_output_dir = os.path.join(
    PROJECT_ROOT, 'question-answering-system-with-BERT', 'models', 'qa-bert'
)

# Quick-demo settings: a single epoch over a reduced number of samples.
train_kwargs = dict(
    model_name='bert-base-uncased',
    output_dir=model_output_dir,
    epochs=1,
    batch_size=8,
    learning_rate=3e-5,
    squad_version='1.1',
    train_samples=500,  # kept small for a quick run
    eval_samples=100,   # kept small for a quick run
)
out = train(**train_kwargs)
out

## Inference: Ask a question

In [None]:
# Build the inference helper from the model directory under the project root.
model_dir = os.path.join(PROJECT_ROOT, 'question-answering-system-with-BERT', 'models', 'qa-bert')
qa = QAInference(model_dir=model_dir)

# Example passage and a question about it.
context = (
    'BERT stands for Bidirectional Encoder Representations from Transformers. '
    'It is a transformer-based machine learning technique for NLP developed by Google.'
)
question = 'What does BERT stand for?'

# The last expression is displayed as the cell's output.
qa.predict(context, question)

## Web App
Run the Flask app from a terminal, starting in the project root (the paths below are relative to it):

```bash
export QA_MODEL_DIR=question-answering-system-with-BERT/models/qa-bert
python question-answering-system-with-BERT/webapp/app.py
```

Then open http://127.0.0.1:5000 in a browser and enter your own context and question.