# Import Libraries

In [1]:
pip install torch transformers datasets evaluate tqdm

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import evaluate
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

# Data Loading & Preprocessing

In [3]:
# Load the SQuAD v1.1 dataset
squad_dataset = load_dataset('squad', 'plain_text')

# Load a tokenizer (using DistilBERT as an example)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Define a preprocessing function
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        start_char = answers[i]["answer_start"][0]
        end_char = start_char + len(answers[i]["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply the preprocessing function to the dataset
processed_squad_dataset = squad_dataset.map(preprocess_function, batched=True, remove_columns=squad_dataset["train"].column_names)

# Set the format of the processed dataset to 'torch'
processed_squad_dataset.set_format("torch")

# Print the processed dataset
print(processed_squad_dataset)

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 10570
    })
})


# Answer Question Function

In [4]:
def answer_question(question, context, tokenizer, model, device):
    # Tokenize the question and context together
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        return_tensors="pt",
        truncation="only_second",
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )

    # Move inputs to the specified device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Extract offset mapping and remove it from inputs for model forward pass
    offset_mapping = inputs.pop("offset_mapping").squeeze().tolist()

    # Pass the tokenized inputs through the loaded model
    # torch.no_grad() is used because we are only doing inference, not training
    with torch.no_grad():
        outputs = model(**inputs)

    # Determine the predicted start and end token indices from the logits
    # Use argmax to get the token with the highest score for start and end positions
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    answer_start = torch.argmax(answer_start_scores, dim=1).item()
    answer_end = torch.argmax(answer_end_scores, dim=1).item()

    # Use the predicted start and end indices to reconstruct the answer
    # Get the sequence IDs to distinguish between question and context tokens
    sequence_ids = inputs.sequence_ids(0)

    # Find the start and end token indices of the context in the input sequence
    context_start = sequence_ids.index(1)
    # The context ends right before the second [SEP] token (which has sequence_id None)
    # or at the end of the sequence if there's no second [SEP]
    try:
        context_end = sequence_ids[context_start:].index(None) + context_start - 1
    except ValueError:
         context_end = len(sequence_ids) - 1 # Handle cases without a second SEP token

    # Ensure the predicted answer span is within the context
    # If the predicted start or end is outside the context, it's likely an invalid answer
    if answer_start < context_start or answer_end > context_end or answer_start > answer_end:
        return "Could not find a satisfactory answer within the context."

    # Use offset mapping to get the character start and end positions of the predicted answer
    # Ensure the predicted indices are within the bounds of the offset_mapping list
    if answer_start < len(offset_mapping) and answer_end < len(offset_mapping):
        start_char = offset_mapping[answer_start][0]
        end_char = offset_mapping[answer_end][1]

        # Extract the answer text from the original context string using character offsets
        predicted_answer = context[start_char:end_char]

        # Handle cases where the extracted answer is just whitespace or empty
        if not predicted_answer.strip():
            return "Could not extract a valid answer."

        return predicted_answer
    else:
        # This case should ideally not happen if answer_start and answer_end are within valid range
        return "Error in extracting answer span."

# Evaluation Function

In [5]:
def evaluate_model(model, tokenizer, dataset, device):
    exact_match_metric = evaluate.load("squad")
    f1_metric = evaluate.load("squad")

    predictions = []
    references = []

    validation_set = dataset['validation']

    # Iterate through the validation split
    for example in tqdm(validation_set):
        question = example['question']
        context = example['context']
        ground_truth_answers = example['answers']
        example_id = example['id']

        # Get the predicted answer using the answer_question function
        predicted_answer = answer_question(question, context, tokenizer, model, device)

        # Prepare prediction in the required format
        predictions.append({'prediction_text': predicted_answer, 'id': example_id})

        # Prepare reference in the required format
        references.append({'answers': ground_truth_answers, 'id': example_id})

    # Compute the Exact Match and F1 scores
    exact_match_results = exact_match_metric.compute(predictions=predictions, references=references)
    f1_results = f1_metric.compute(predictions=predictions, references=references)

    return {
        "exact_match": exact_match_results["exact_match"],
        "f1": f1_results["f1"]
    }

# Compare Different Models

In [6]:
def answer_question(question, context, tokenizer, model, device):
    # Tokenize the question and context together
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        return_tensors="pt",
        truncation="only_second",
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )

    # Move inputs to the specified device while preserving BatchEncoding structure
    inputs = inputs.to(device)

    # Extract offset mapping and remove it from inputs for model forward pass
    offset_mapping = inputs.pop("offset_mapping").squeeze().tolist()

    # Pass the tokenized inputs through the loaded model
    with torch.no_grad():
        outputs = model(**inputs)

    # Determine the predicted start and end token indices from the logits
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    answer_start = torch.argmax(answer_start_scores, dim=1).item()
    answer_end = torch.argmax(answer_end_scores, dim=1).item()

    # Use the predicted start and end indices to reconstruct the answer
    sequence_ids = inputs.sequence_ids(0)

    # Find the start and end token indices of the context in the input sequence
    context_start = sequence_ids.index(1)
    try:
        context_end = sequence_ids[context_start:].index(None) + context_start - 1
    except ValueError:
         context_end = len(sequence_ids) - 1

    # Ensure the predicted answer span is within the context
    if answer_start < context_start or answer_end > context_end or answer_start > answer_end:
        return "Could not find a satisfactory answer within the context."

    # Use offset mapping to get the character start and end positions of the predicted answer
    if answer_start < len(offset_mapping) and answer_end < len(offset_mapping):
        start_char = offset_mapping[answer_start][0]
        end_char = offset_mapping[answer_end][1]

        predicted_answer = context[start_char:end_char]

        if not predicted_answer.strip():
            return "Could not extract a valid answer."

        return predicted_answer
    else:
        return "Error in extracting answer span."

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# BERT model
bert_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
bert_model = AutoModelForQuestionAnswering.from_pretrained(bert_model_name).to(device)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

# RoBERTa model
roberta_model_name = "csarron/roberta-base-squad-v1"
roberta_model = AutoModelForQuestionAnswering.from_pretrained(roberta_model_name).to(device)
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)

# ALBERT model
albert_model_name = "albert-base-v2"
albert_model = AutoModelForQuestionAnswering.from_pretrained(albert_model_name).to(device)
albert_tokenizer = AutoTokenizer.from_pretrained(albert_model_name)

# Use the evaluate_model function defined previously to evaluate each of the loaded models
bert_scores = evaluate_model(bert_model, bert_tokenizer, squad_dataset, device)
roberta_scores = evaluate_model(roberta_model, roberta_tokenizer, squad_dataset, device)
albert_scores = evaluate_model(albert_model, albert_tokenizer, squad_dataset, device)

print("BERT Scores:")
print(f"  Exact Match: {bert_scores['exact_match']:.2f}")
print(f"  F1 Score: {bert_scores['f1']:.2f}")
print("\nRoBERTa Scores:")
print(f"  Exact Match: {roberta_scores['exact_match']:.2f}")
print(f"  F1 Score: {roberta_scores['f1']:.2f}")
print("\nALBERT Scores:")
print(f"  Exact Match: {albert_scores['exact_match']:.2f}")
print(f"  F1 Score: {albert_scores['f1']:.2f}")

Using device: cuda


config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at csarron/roberta-base-squad-v1 were not used when initializing RobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

  0%|          | 0/10570 [00:00<?, ?it/s]

  0%|          | 0/10570 [00:00<?, ?it/s]

  0%|          | 0/10570 [00:00<?, ?it/s]

BERT Scores:
  Exact Match: 86.28
  F1 Score: 92.43

RoBERTa Scores:
  Exact Match: 79.22
  F1 Score: 86.30

ALBERT Scores:
  Exact Match: 0.01
  F1 Score: 3.81


In [7]:
def simple_qa_interface(tokenizer, model, device):
    print("Simple Question Answering Interface")
    print("Enter 'quit' in context or question to exit.")

    while True:
        context = input("\nEnter the context (passage): ")
        if context.lower() == 'quit':
            break

        question = input("Enter the question: ")
        if question.lower() == 'quit':
            break

        if not context or not question:
            print("Please provide both context and a question.")
            continue

        try:
            # Use the improved answer_question function which takes device as argument
            answer = answer_question(question, context, tokenizer, model, device)
            print(f"\nAnswer: {answer}")
        except Exception as e:
            print(f"An error occurred: {e}")

# Assuming bert_tokenizer and bert_model are loaded from previous steps
# and device is set
simple_qa_interface(bert_tokenizer, bert_model, device)

Simple Question Answering Interface
Enter 'quit' in context or question to exit.


KeyboardInterrupt: Interrupted by user

# Interactive Interface


In [8]:
!pip install gradio transformers pandas -q

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import gradio as gr
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BERT model
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)

# For storing history
history = []

def get_answer(context, question):
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True
    )
    inputs = inputs.to(device)
    offset_mapping = inputs.pop("offset_mapping").squeeze().tolist()

    with torch.no_grad():
        outputs = model(**inputs)

    start_logits, end_logits = outputs.start_logits, outputs.end_logits
    start_probs = F.softmax(start_logits, dim=-1)
    end_probs = F.softmax(end_logits, dim=-1)

    start_idx = torch.argmax(start_probs, dim=1).item()
    end_idx = torch.argmax(end_probs, dim=1).item()

    confidence = (start_probs[0, start_idx] * end_probs[0, end_idx]).item() * 100

    sequence_ids = inputs.sequence_ids(0)
    context_start = sequence_ids.index(1)
    try:
        context_end = sequence_ids[context_start:].index(None) + context_start - 1
    except ValueError:
        context_end = len(sequence_ids) - 1

    if start_idx < context_start or end_idx > context_end or start_idx > end_idx:
        return "No valid answer", 0.0

    start_char, end_char = offset_mapping[start_idx][0], offset_mapping[end_idx][1]
    answer = context[start_char:end_char]

    if not answer.strip():
        return "No valid answer", 0.0

    return answer, round(confidence, 2)

def qa_interface(context, question):
    answer, confidence = get_answer(context, question)
    history.append({
        "Context": context,
        "Question": question,
        "Answer": answer,
        "Confidence": confidence
    })
    return answer, confidence

def reset_fields():
    return "", "", "", 0

def download_csv():
    if not history:
        return None
    df = pd.DataFrame(history)
    file_path = "/tmp/qa_history.csv"
    df.to_csv(file_path, index=False)
    return file_path

with gr.Blocks() as demo:
    gr.Markdown("## 🤖 Question Answering with Transformers")
    gr.Markdown("Paste a passage, ask a question, and BERT will answer with a confidence score.")

    with gr.Row():
        context = gr.Textbox(label="Context (Passage)", lines=6, placeholder="Paste your passage here...")
        question = gr.Textbox(label="Question", lines=3, placeholder="Type your question here...")

    answer = gr.Textbox(label="Answer", lines=4, interactive=False)
    confidence = gr.Number(label="Confidence (%)", interactive=False, precision=2)

    with gr.Row():
        btn = gr.Button("Get Answer")
        reset_btn = gr.Button("🔄 Reset")
        download_btn = gr.Button("⬇ Download CSV")

    file_output = gr.File(label="Download QA History")

    btn.click(fn=qa_interface, inputs=[context, question], outputs=[answer, confidence])
    reset_btn.click(fn=reset_fields, outputs=[context, question, answer, confidence])
    download_btn.click(fn=download_csv, outputs=file_output)

demo.launch(share=True)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a357af973012fd58c5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


