In [1]:
import json
import os
from pathlib import Path
from typing import Optional
from sklearn.metrics import f1_score
from collections import Counter
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file (if one exists) into the process
# environment; run_openai() later reads OPENAI_API_KEY from os.environ.
load_dotenv()

True

In [27]:
# `__file__` is not defined inside a notebook kernel, so the project root is
# expressed relative to the notebook's working directory instead.
PROJECT_ROOT = Path("..")

# Validation split used for every model in this notebook.
VALIDATION_PATH = PROJECT_ROOT.joinpath("QA-dataset", "autotrain_data", "validation.json")

# Locally stored fine-tuned checkpoints.
BERT_MODEL_PATH = PROJECT_ROOT.joinpath("notebooks", "autotrain-bert-ex-qa1")
F_ROBERTA_PATH = PROJECT_ROOT.joinpath("notebooks", "autotrain-roberta-ex-qa1")

# Model identifier handed to `from_pretrained` for the RoBERTa SQuAD2 baseline.
ROBERTA_SQUAD2 = "deepset/roberta-base-squad2"

In [4]:
def load_validation_data(limit: Optional[int] = None) -> list:
    """Load validation examples from the JSON file at ``VALIDATION_PATH``.

    Args:
        limit: If given, keep only the first ``limit`` examples. ``None``
            (the default) returns the whole file; ``0`` returns an empty list.

    Returns:
        The parsed JSON content, truncated to ``limit`` items when requested.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (JSON text is UTF-8 by convention).
    with open(VALIDATION_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Compare against None rather than truthiness so that limit=0 means
    # "no examples" instead of silently returning the full dataset.
    if limit is not None:
        data = data[:limit]
    return data


def normalize_answer(text: str) -> str:
    """Lower-case ``text`` and collapse every whitespace run to one space.

    Leading and trailing whitespace is dropped, so answers that differ only
    in case or spacing compare equal.
    """
    # str.split() with no separator already discards leading/trailing
    # whitespace and treats consecutive whitespace as a single delimiter.
    words = text.lower().split()
    return " ".join(words)

In [5]:
def compute_f1(prediction: str, ground_truth: str) -> float:
    """Token-level (SQuAD-style) F1 between one prediction and one reference.

    Both strings are normalized and split on whitespace. Shared tokens are
    counted with multiplicity via a Counter intersection, and the score itself
    is delegated to sklearn's ``f1_score`` on synthetic label vectors.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()

    # When either side has no tokens, only "both empty" counts as a match.
    if not (predicted and expected):
        return 1.0 if predicted == expected else 0.0

    overlap = sum((Counter(predicted) & Counter(expected)).values())
    if overlap == 0:
        return 0.0

    spurious = len(predicted) - overlap  # predicted tokens not in the reference
    missed = len(expected) - overlap     # reference tokens not in the prediction

    # One label pair per token: shared -> (1, 1), spurious -> (0, 1), missed -> (1, 0).
    y_true = [1] * overlap + [0] * spurious + [1] * missed
    y_pred = [1] * (overlap + spurious) + [0] * missed
    return float(f1_score(y_true, y_pred, zero_division=0))

In [6]:
def compute_exact_match(prediction: str, ground_truths: list[str]) -> float:
    """Return 1.0 when the normalized prediction equals any normalized reference.

    Returns 0.0 otherwise, including when ``ground_truths`` is empty.
    """
    target = normalize_answer(prediction)
    matched = any(normalize_answer(candidate) == target for candidate in ground_truths)
    return 1.0 if matched else 0.0

In [7]:
def compute_metrics(predictions: list, references: list) -> dict:
    """Compute exact match and F1 (both as percentages) over the dataset.

    Args:
        predictions: Predicted answer strings, one per example.
        references: One entry per example; the gold answer strings are read
            from ``ref["answers"]["text"]``.

    Returns:
        ``{"exact_match": float, "f1": float}`` averaged over all examples and
        scaled to 0-100. Both are 0.0 when there are no examples.

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    # zip() would silently drop the unmatched tail and report plausible but
    # wrong averages, so a length mismatch is rejected up front.
    if len(predictions) != len(references):
        raise ValueError(
            "predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )

    exact_matches = []
    f1_scores = []
    for pred, ref in zip(predictions, references):
        gt_texts = ref["answers"]["text"]
        exact_matches.append(compute_exact_match(pred, gt_texts))
        # Score against the best-matching reference; no references -> 0.0.
        f1_scores.append(max((compute_f1(pred, gt) for gt in gt_texts), default=0.0))

    return {
        "exact_match": 100.0 * sum(exact_matches) / len(exact_matches) if exact_matches else 0.0,
        "f1": 100.0 * sum(f1_scores) / len(f1_scores) if f1_scores else 0.0,
    }

In [8]:
def run_bert_hf(
    validation: list,
    model_path: str,
    device: Optional[str] = None,
) -> list:
    """Answer every validation example with a Hugging Face extractive-QA model.

    Args:
        validation: Examples, each providing "question" and "context" entries.
        model_path: Value handed to ``from_pretrained`` for both the tokenizer
            and the ``AutoModelForQuestionAnswering`` model.
        device: Device forwarded to the pipeline. When ``None`` it is chosen
            here: ``0`` if CUDA is available, else ``"mps"`` if that backend is
            available, else ``-1``; ``-1`` is also used if torch cannot be
            imported.

    Returns:
        One answer string per example, in input order ("" when the pipeline
        returns a falsy result).
    """
    # Imported lazily so the rest of the notebook works without transformers.
    from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForQuestionAnswering.from_pretrained(model_path)

    if device is None:
        device = -1  # fallback, replaced below when an accelerator is detected
        try:
            import torch
            if torch.cuda.is_available():
                device = 0
            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                device = "mps"
        except ImportError:
            pass

    qa_pipeline = pipeline(
        "question-answering",
        model=model,
        tokenizer=tokenizer,
        device=device,
    )

    # Examples are fed one at a time, lazily, in dataset order.
    outputs = (
        qa_pipeline(question=example["question"], context=example["context"])
        for example in validation
    )
    return [output["answer"] if output else "" for output in outputs]


In [28]:
def run_bert(validation: list, device: Optional[str] = None) -> list:
    """Evaluate the local autotrain-bert-ex-qa1 checkpoint on the validation set.

    Raises:
        FileNotFoundError: If ``BERT_MODEL_PATH`` does not exist.
    """
    if BERT_MODEL_PATH.exists():
        return run_bert_hf(validation, str(BERT_MODEL_PATH), device)
    raise FileNotFoundError(f"BERT model not found at {BERT_MODEL_PATH}")

def run_f_robert(validation: list, device: Optional[str] = None) -> list:
    """Evaluate the local autotrain-roberta-ex-qa1 checkpoint on the validation set.

    Raises:
        FileNotFoundError: If ``F_ROBERTA_PATH`` does not exist.
    """
    checkpoint = F_ROBERTA_PATH
    if checkpoint.exists():
        return run_bert_hf(validation, str(checkpoint), device)
    raise FileNotFoundError(f"ROBERTA model not found at {checkpoint}")

def run_robertasquad2(validation: list, device: Optional[str] = None) -> list:
    """Evaluate the ``ROBERTA_SQUAD2`` model (deepset/roberta-base-squad2) on the validation set.

    Unlike the local checkpoints, no path check is done here: the identifier
    is passed straight to ``run_bert_hf``.
    """
    return run_bert_hf(validation=validation, model_path=ROBERTA_SQUAD2, device=device)

In [10]:
def run_openai(
    validation: list,
    model_id: str,
) -> list:
    """Run an OpenAI chat model on extractive QA.

    Args:
        validation: Examples, each providing "context" and "question" entries.
        model_id: Model name passed to the chat completions endpoint.

    Returns:
        One answer string per example, in input order. If a request raises,
        a warning is printed and "" is recorded for that example.

    Raises:
        ImportError: If the ``openai`` package is not installed.
        ValueError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    try:
        from openai import OpenAI
    except ImportError as exc:
        raise ImportError("Please install openai: pip install openai") from exc

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY environment variable is not set")

    client = OpenAI(api_key=api_key)
    predictions = []

    for item in validation:
        # Build the prompt from flush-left pieces. A triple-quoted literal
        # indented to match the code would leak that indentation into the text
        # sent to the model, so backends would receive different prompts.
        prompt = (
            "You are an extractive question-answering system. "
            "Answer the question using ONLY the exact words from the context. "
            "Do not paraphrase or add information. "
            "If the answer is not in the context, respond with an empty string.\n\n"
            f"Context: {item['context']}\n\n"
            f"Question: {item['question']}\n\n"
            "Answer (exact extract from context):"
        )

        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
            answer = (response.choices[0].message.content or "").strip()
            # Remove quotes if model wrapped the answer
            if answer.startswith('"') and answer.endswith('"'):
                answer = answer[1:-1]
            predictions.append(answer)
        except Exception as e:
            # Best effort: one failed request must not abort the whole run,
            # but note that it is scored as an empty (wrong) prediction.
            print(f"  [WARN] OpenAI error for {model_id}: {e}")
            predictions.append("")

    return predictions

In [11]:
# --- Ollama (local, free) ---


def run_ollama(
    validation: list,
    model_id: str,
    host: str = "http://localhost:11434",
) -> list:
    """Run an Ollama chat model on extractive QA. Requires Ollama running locally.

    Args:
        validation: Examples, each providing "context" and "question" entries.
        model_id: Model name passed to the Ollama chat call.
        host: Address the Ollama client is created with.

    Returns:
        One answer string per example, in input order. If a request raises,
        a warning is printed and "" is recorded for that example.

    Raises:
        ImportError: If the ``ollama`` package is not installed.
    """
    try:
        from ollama import Client
    except ImportError as exc:
        raise ImportError("Please install ollama: pip install ollama") from exc

    client = Client(host=host)
    predictions = []

    for item in validation:
        # Build the prompt from flush-left pieces. A triple-quoted literal
        # indented to match the code would leak that indentation into the text
        # sent to the model, so backends would receive different prompts.
        prompt = (
            "You are an extractive question-answering system. "
            "Answer the question using ONLY the exact words from the context. "
            "Do not paraphrase or add information. "
            "If the answer is not in the context, respond with an empty string.\n\n"
            f"Context: {item['context']}\n\n"
            f"Question: {item['question']}\n\n"
            "Answer (exact extract from context):"
        )

        try:
            response = client.chat(
                model=model_id,
                messages=[{"role": "user", "content": prompt}],
                options={"temperature": 0, "num_predict": 100},
            )
            answer = (response.message.content or "").strip()
            # Remove quotes if model wrapped the answer
            if answer.startswith('"') and answer.endswith('"'):
                answer = answer[1:-1]
            predictions.append(answer)
        except Exception as e:
            # Best effort: one failed request must not abort the whole run,
            # but note that it is scored as an empty (wrong) prediction.
            print(f"  [WARN] Ollama error for {model_id}: {e}")
            predictions.append("")

    return predictions

In [29]:
# Short key -> name used as the label in printed output and in `results`.
# For the OpenAI and Ollama entries this value is also the model id that is
# sent to the backend; the local Hugging Face entries are loaded from the
# path constants instead.
model_ids = {
    # Local fine-tuned BERT checkpoint
    "bert": "autotrain-bert-ex-qa1",
    # OpenAI chat models
    "gpt-5.2": "gpt-5.2",
    "gpt-4o": "gpt-4o",
    # Ollama model
    "qwen2.5": "qwen2.5:7b-instruct",
    # RoBERTa SQuAD2 baseline and local fine-tuned RoBERTa checkpoint
    "roberta": "deepset/roberta-base-squad2",
    "f-roberta": "autotrain-roberta-ex-qa1",
}

In [30]:
# Read the whole validation split (no limit) and report how many examples
# were loaded; `validation` is the input for every model run below.
print("Loading validation data...")
validation = load_validation_data()
print(f"  Loaded {len(validation)} examples\n")

Loading validation data...
  Loaded 627 examples



In [31]:
# Keep only each example's gold "answers" entry, in dataset order, so that
# references[i] lines up with the i-th prediction; compute_metrics() reads
# the gold strings from ref["answers"]["text"].
references = [{"answers": item["answers"]} for item in validation]

In [32]:
# Per-model metrics keyed by display name; a failed model stores {"error": ...}.
results = {}

# Keys served by local Hugging Face runners. The Ollama key is listed
# separately, and every remaining key is sent to the OpenAI backend.
hf_runners = {
    "bert": run_bert,
    "roberta": run_robertasquad2,
    "f-roberta": run_f_robert,
}
ollama_keys = {"qwen2.5"}

# Edit this list to evaluate only a subset of the models.
for model_key in ["bert", "gpt-5.2", "gpt-4o", "qwen2.5", "roberta", "f-roberta"]:
    display_name = model_ids[model_key]
    print(f"Evaluating {display_name}...")
    try:
        if model_key in hf_runners:
            # device=None lets run_bert_hf pick the device itself (CUDA, then
            # MPS, then CPU) instead of hard-coding 'mps', which fails on
            # machines without Apple's MPS backend.
            predictions = hf_runners[model_key](validation, device=None)
        elif model_key in ollama_keys:
            predictions = run_ollama(validation, model_id=display_name)
        else:
            predictions = run_openai(validation, model_id=display_name)
        metrics = compute_metrics(predictions, references)
        results[display_name] = metrics
        print(f"  EM: {metrics['exact_match']:.2f}%  F1: {metrics['f1']:.2f}%")
    except Exception as e:
        # Deliberately broad: a failure in one backend (missing checkpoint,
        # missing API key, server not running) must not stop the other models.
        print(f"  [ERROR] {e}")
        results[display_name] = {"error": str(e)}
    print()

Device set to use mps


Evaluating autotrain-roberta-ex-qa1...
  EM: 79.74%  F1: 81.27%

