In [1]:
import json
from typing import TypedDict


class LLMAnswer(TypedDict):
    """A single model's answer to a trivia question plus its stored grade."""

    # Free-text answer produced by the model.
    answer: str
    # Correctness label stored next to the answer in the data file.
    is_correct: bool

class Question(TypedDict):
    """One trivia question together with the graded answers of each LLM."""

    # The sample data stores the identifier as a JSON number (e.g. 1), so it
    # is an int after json.load, not a str.
    id: int
    question: str
    expected_answer: str
    category: str
    difficulty: str
    # Maps a model name (e.g. 'gpt3.5', 'claude') to that model's graded answer.
    llm_answers: dict[str, LLMAnswer]


# Load the sample trivia set. The context manager guarantees the file handle
# is closed, and the explicit encoding keeps the read independent of the
# platform's default text encoding.
with open("../frontend/data/sample_trivia.json", encoding="utf-8") as trivia_file:
    questions: list[Question] = json.load(trivia_file)["questions"]

In [4]:

# Near-miss fixture: the user answer ("Louis XVI") differs from the expected
# answer ("Louis XIV") only by the order of two numerals.
test_case: dict[str, str] = dict(
    question="Who has built the Chateau de Versailles?",
    expected_answer="Louis XIV",
    user_answer="Louis XVI",
)


### Test Xenova/distilbert-base-uncased-finetuned-sst2 (loaded here as distilbert/distilbert-base-uncased-finetuned-sst-2-english)

In [None]:
# Import required libraries
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification


# Text-classification pipeline backed by the DistilBERT checkpoint whose model
# id says it was fine-tuned on SST-2.
classifier = pipeline(
    task="text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

def check_answer(question: str, expected_answer: str, user_answer: str):
    """Run the module-level ``classifier`` on a prompt built from the inputs.

    The three strings are folded into a single prompt, the prompt is passed to
    ``classifier``, and the first element of the returned sequence is handed
    back unchanged.

    NOTE: a later cell in this notebook defines another ``check_answer`` that
    replaces this one once it has been executed.

    Args:
        question: The trivia question text.
        expected_answer: The reference answer.
        user_answer: The answer to be judged.

    Returns:
        The first item of the classifier output for the prompt.
    """
    # Adjacent literals are concatenated into one string; the trailing space
    # after the final question mark is part of the prompt.
    prompt = (
        f"Question: {question} "
        f"Expected answer: {expected_answer}. "
        f"User answer: {user_answer}. "
        "Is the user answer correct? "
    )
    predictions = classifier(prompt)
    return predictions[0]



# The keys of test_case match check_answer's parameter names, so the fixture
# can be unpacked straight into the call.
result = check_answer(**test_case)
print(result)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


### Test all-MiniLM-L6-v2

In [11]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used by the similarity-based check_answer below.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

def check_answer(
    question: str,
    expected_answer: str,
    user_answer: str,
    threshold: float = 0.8,
) -> bool:
    """Judge an answer by embedding similarity to the expected answer.

    Both answers are encoded with the module-level ``model`` and compared via
    cosine similarity; the answer counts as correct when the similarity is
    strictly greater than ``threshold``.

    The evaluation loop output in this notebook shows that long-form answers
    which contain the expected answer inside a full sentence can still score
    below the default threshold.

    Args:
        question: The trivia question. Not used by this comparison; kept so
            the signature stays interchangeable with the classifier-based
            ``check_answer``.
        expected_answer: The reference answer.
        user_answer: The answer to be judged.
        threshold: Similarity cut-off. Defaults to 0.8.

    Returns:
        True if the cosine similarity exceeds ``threshold``, otherwise False.
    """
    # Encode both answers as tensors.
    expected_vec = model.encode(expected_answer, convert_to_tensor=True)
    user_vec = model.encode(user_answer, convert_to_tensor=True)

    # .item() unwraps the similarity value into a plain Python number.
    similarity = util.pytorch_cos_sim(expected_vec, user_vec).item()

    return similarity > threshold

# Sanity check: the user answer passed here is the same string as the
# fixture's expected answer.
check_answer(
    question=test_case["question"],
    expected_answer=test_case["expected_answer"],
    user_answer="Louis XIV",
)

True

In [12]:
# For every stored LLM answer, print the question, the expected answer, the
# model's answer, the label stored in the data and the verdict computed by
# check_answer, in that order.
for question in questions:
    expected = question["expected_answer"]
    for model_name, llm_answer in question["llm_answers"].items():
        correct = check_answer(question["question"], expected, llm_answer["answer"])
        print(
            f"{question['question']} - Answer: {expected} \n"
            f" {llm_answer['answer']} - {llm_answer['is_correct']} - {correct}"
        )

Who was the first President of the United States? - Answer: George Washington 
 George Washington, who served as the first President from 1789 to 1797 - True - False
Who was the first President of the United States? - Answer: George Washington 
 George Washington was the first President of the United States - True - False
What is the chemical symbol for gold? - Answer: Au 
 Au - True - True
What is the chemical symbol for gold? - Answer: Au 
 The chemical symbol for gold is Au, derived from the Latin word 'aurum' - True - False
Which is the largest ocean on Earth? - Answer: Pacific Ocean 
 The Pacific Ocean - True - True
Which is the largest ocean on Earth? - Answer: Pacific Ocean 
 The Atlantic Ocean is the largest ocean on Earth - False - False
Which actor played Tony Stark in the Marvel Cinematic Universe? - Answer: Robert Downey Jr. 
 Robert Downey Jr. - True - True
Which actor played Tony Stark in the Marvel Cinematic Universe? - Answer: Robert Downey Jr. 
 Robert Downey Jr. portr

In [18]:
# Inspect the first record explicitly instead of relying on whatever value the
# `question` loop variable happened to hold when this cell was run.
questions[0]

{'id': 1,
 'category': 'History',
 'question': 'Who was the first President of the United States?',
 'expected_answer': 'George Washington',
 'llm_answers': {'gpt3.5': {'answer': 'George Washington, who served as the first President from 1789 to 1797',
   'is_correct': True},
  'claude': {'answer': 'George Washington was the first President of the United States',
   'is_correct': True}},
 'difficulty': 'easy'}