In [1]:
import json
from typing import TypedDict


class LLMAnswer(TypedDict):
    """One model's recorded answer to a trivia question, plus its stored label."""
    # Free-text answer produced by the model.
    answer: str
    # Label stored with the answer; the evaluation loop prints it as the
    # "Expected evaluation" to compare against the checker's verdict.
    is_correct: bool

class Question(TypedDict):
    """One trivia item together with the LLM answers recorded for it."""
    # Numeric identifier. The sample data stores it as an int (e.g. 1), so the
    # annotation is `int` rather than `str`.
    id: int
    # The question text shown to the model.
    question: str
    # Reference answer that model answers are validated against.
    expected_answer: str
    # Topic label, e.g. 'History'.
    category: str
    # Difficulty label, e.g. 'easy'.
    difficulty: str
    # Recorded answers keyed by a model label (e.g. 'gpt3.5', 'claude').
    llm_answers: dict[str, LLMAnswer]


# Load the sample trivia set. The context manager closes the file handle
# deterministically, and the explicit encoding avoids depending on the
# platform default when the answers contain non-ASCII characters.
with open("../frontend/data/sample_trivia.json", encoding="utf-8") as trivia_file:
    questions: list[Question] = json.load(trivia_file)['questions']

In [2]:

# Near-miss probe: the user answer ("Louis XVI") differs from the expected one
# ("Louis XIV") only by the order of the last two numerals.
test_case = dict(
    question="Who has built the Chateau de Versailles?",
    expected_answer="Louis XIV",
    user_answer="Louis XVI",
)


### Test distilbert/distilbert-base-uncased-finetuned-sst-2-english (SST-2 sentiment classifier)

In [3]:
# Import required libraries
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification


# Text-classification pipeline backed by the DistilBERT SST-2 checkpoint.
classifier = pipeline(
    task="text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# Function to format input and classify
def check_answer(question: str, expected_answer: str, user_answer: str):
    """Ask the module-level ``classifier`` pipeline to judge a user answer.

    The question, expected answer and user answer are folded into a single
    prompt string, which is passed to ``classifier``.

    Note: a later cell in this notebook defines another ``check_answer``,
    which replaces this one once that cell has run.

    Args:
        question: The quiz question.
        expected_answer: The reference answer.
        user_answer: The answer to validate.

    Returns:
        The first element of the pipeline's result. In the recorded run this
        is a dict with ``'label'`` and ``'score'`` keys, where the label is a
        class name such as ``'NEGATIVE'`` rather than a correct/incorrect flag.
    """
    prompt = (
        f"Question: {question} "
        f"Expected answer: {expected_answer}. "
        f"User answer: {user_answer}. "
        "Is the user answer correct? "
    )
    predictions = classifier(prompt)
    return predictions[0]



# test_case's keys are exactly check_answer's parameter names, so it can be
# unpacked straight into the call.
result = check_answer(**test_case)
print(result)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


{'label': 'NEGATIVE', 'score': 0.9953383207321167}


### Hybrid Approach: Containment Check + Semantic Similarity (all-mpnet-base-v2)

In [5]:
import re
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model (all-mpnet-base-v2) used for the semantic-similarity
# fallback of the hybrid checker.
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

def normalize_text(text: str) -> str:
    """Return ``text`` lowercased, trimmed, with whitespace runs collapsed.

    Leading and trailing whitespace is removed and every internal run of
    whitespace characters becomes a single space.
    """
    lowered = text.lower()
    trimmed = lowered.strip()
    return re.sub(r'\s+', ' ', trimmed)

def check_containment(expected_answer: str, user_answer: str) -> bool:
    """
    Check whether the expected answer appears in the user answer.

    Both strings are first passed through ``normalize_text``. Three tests are
    tried in order and the first one that succeeds returns True:

    1. The normalized strings are equal.
    2. The expected answer occurs in the user answer as a whole phrase, i.e.
       it is not glued to a letter, digit or underscore on either side
       ("Au" matches "is Au, derived" but not "Australia"; "5" does not
       match "15").
    3. The expected answer has at most three distinct words and each of them
       is one of the user answer's whitespace-separated tokens, in any order.

    Args:
        expected_answer: The reference answer.
        user_answer: The answer being validated.

    Returns:
        True if any test above succeeds, False otherwise. An empty expected
        answer only matches an equally empty user answer.
    """
    expected_norm = normalize_text(expected_answer)
    user_norm = normalize_text(user_answer)

    # Exact match
    if expected_norm == user_norm:
        return True

    # An empty string is a substring of everything; without this guard a blank
    # expected answer would accept any user answer.
    if not expected_norm:
        return False

    # Whole-phrase match. Lookarounds are used instead of \b so that answers
    # beginning or ending with punctuation (e.g. "c++") still anchor correctly.
    phrase = re.compile(r'(?<!\w)' + re.escape(expected_norm) + r'(?!\w)')
    if phrase.search(user_norm):
        return True

    # Short answers (<= 3 distinct words): accept if every word is present,
    # regardless of order.
    expected_words = set(expected_norm.split())
    user_words = set(user_norm.split())
    return len(expected_words) <= 3 and expected_words.issubset(user_words)

def check_answer(question: str, expected_answer: str, user_answer: str,
                 containment_threshold: int = 3, similarity_threshold: float = 0.55) -> bool:
    """
    Hybrid check: lexical containment first, then embedding similarity.

    Strategy:
    1. If ``check_containment`` accepts the pair, return True.
    2. If the user answer is blank after normalization, return False.
    3. Otherwise prepend the question to each answer, embed both texts with
       the module-level ``model`` and compare their cosine similarity against
       ``similarity_threshold``.

    Args:
        question: The quiz question; used only as context for the embeddings.
        expected_answer: The correct answer.
        user_answer: The user/LLM answer to validate.
        containment_threshold: Currently unused; accepted so existing callers
            keep working. The word limit for the containment check is fixed
            inside ``check_containment``.
        similarity_threshold: Cosine similarity that must be strictly exceeded
            for step 3 to accept the answer (default 0.55).

    Returns:
        True if the answer is judged correct, False otherwise.
    """
    # Strategy 1: containment check (suits short answers like "1896", "Au",
    # "George Washington").
    if check_containment(expected_answer, user_answer):
        return True

    # A blank answer has nothing to compare. Both encoded texts start with the
    # same question, so without this guard a blank answer would be scored
    # largely on that shared text.
    if not normalize_text(user_answer):
        return False

    # Strategy 2: semantic similarity with the question as context (for longer,
    # paraphrased answers).
    # NOTE(review): the shared question prefix may also lift the score of short
    # wrong answers; validate the threshold against labelled data.
    expected_text = f"{question} {expected_answer}"
    user_text = f"{question} {user_answer}"

    expected_vec = model.encode(expected_text, convert_to_tensor=True)
    user_vec = model.encode(user_text, convert_to_tensor=True)

    similarity = util.pytorch_cos_sim(expected_vec, user_vec).item()

    return similarity > similarity_threshold

# Sanity check: feed the expected answer ("Louis XIV") back in as the user
# answer instead of test_case["user_answer"].
check_answer(
    question=test_case["question"],
    expected_answer=test_case["expected_answer"],
    user_answer="Louis XIV",
)

True

In [6]:
# Run the checker over every recorded LLM answer and print its verdict next to
# the label stored in the data set.
for question in questions:
    for model_name, llm_answer in question["llm_answers"].items():
        correct = check_answer(
            question=question["question"],
            expected_answer=question["expected_answer"],
            user_answer=llm_answer["answer"],
        )
        print(f"------{question['question']}------")
        print(
            f"Expected Answer: {question['expected_answer']} \n"
            f" LLM Answer: {llm_answer['answer']} \n"
            f"Expected evaluation: {llm_answer['is_correct']} \n"
            f" Model evaluation: {correct}"
        )
        

------Who was the first President of the United States?------
Expected Answer: George Washington 
 LLM Answer: George Washington, who served as the first President from 1789 to 1797 
Expected evaluation: True 
 Model evaluation: True
------Who was the first President of the United States?------
Expected Answer: George Washington 
 LLM Answer: George Washington was the first President of the United States 
Expected evaluation: True 
 Model evaluation: True
------What is the chemical symbol for gold?------
Expected Answer: Au 
 LLM Answer: Au 
Expected evaluation: True 
 Model evaluation: True
------What is the chemical symbol for gold?------
Expected Answer: Au 
 LLM Answer: The chemical symbol for gold is Au, derived from the Latin word 'aurum' 
Expected evaluation: True 
 Model evaluation: True
------Which is the largest ocean on Earth?------
Expected Answer: Pacific Ocean 
 LLM Answer: The Pacific Ocean 
Expected evaluation: True 
 Model evaluation: True
------Which is the largest oc

In [18]:
# Display whatever `question` is currently bound to in the kernel.
# NOTE(review): presumably left over from a `for question in questions` loop,
# so the value depends on which cells ran last — confirm on a fresh run.
question

{'id': 1,
 'category': 'History',
 'question': 'Who was the first President of the United States?',
 'expected_answer': 'George Washington',
 'llm_answers': {'gpt3.5': {'answer': 'George Washington, who served as the first President from 1789 to 1797',
   'is_correct': True},
  'claude': {'answer': 'George Washington was the first President of the United States',
   'is_correct': True}},
 'difficulty': 'easy'}