In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import json
import faiss
import os
import openai


# Take the OpenAI key from the environment; the value is None when the variable is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [2]:
# Load the question records that the ids returned by the index are looked up in.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open('data/index_to_question.json', 'r', encoding='utf-8') as f:
    questions = json.load(f)

# Load the prebuilt FAISS index from disk
index = faiss.read_index('data/questions_index.faiss')

# Load the sentence-embedding model used to encode queries
# NOTE(review): presumably the same model that produced the vectors stored in
# the index — confirm against the notebook/script that built it.
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")

### Method 1: Simple prompt (GPT-3.5 Turbo)

In [3]:
def evaluate_generated_answer(query, generated_answer, ground_truth_answer):
    """Ask GPT-3.5 Turbo to grade a generated answer against the reference answer.

    Args:
        query: The question that was asked.
        generated_answer: The LLM answer to be graded.
        ground_truth_answer: The answer presented to the grader as correct.

    Returns:
        The grader's reply text with surrounding whitespace removed. The prompt
        requests a JSON object with "Score" and "Explanation" keys, but the
        reply is returned as-is, without being parsed or validated here.
    """
    # Doubled braces are literal braces of the JSON skeleton shown to the grader.
    grading_prompt = """
You are to grade the following LLM answer on a scale from 0 to 10.

**Question:**
{query}

**LLM Answer:**
{generated_answer}

**Correct Answer:**
{ground_truth_answer}

**Task:**
- Assign a score from 0 (completely incorrect) to 10 (completely correct).
- Provide a brief justification for the score.

**Response Format:**
Provide your evaluation in parsable JSON without using code blocks:

{{
  "Score": [0-10],
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".format(
        query=query,
        generated_answer=generated_answer,
        ground_truth_answer=ground_truth_answer,
    ).strip()

    # temperature=0 keeps the grading as repeatable as the API allows.
    request = {
        "model": 'gpt-3.5-turbo',
        "messages": [{"role": "user", "content": grading_prompt}],
        "max_tokens": 150,
        "temperature": 0,
    }
    completion = openai.ChatCompletion.create(**request)

    reply_text = completion.choices[0].message.content
    return reply_text.strip()

In [4]:
def search_question(query, index, questions, embedding_model, k=5):
    """Retrieve the questions whose embeddings are nearest to ``query``.

    Args:
        query: Text to search for.
        index: Search index exposing ``search(embeddings, k)`` and returning
            ``(distances, indices)`` arrays with one row per query.
        questions: Collection indexed by the ids that ``index`` returns.
        embedding_model: Object exposing ``encode(list_of_texts)``.
        k: Maximum number of neighbours to request.

    Returns:
        A tuple ``(retrieved_questions, ids)``. ``ids`` holds the valid ids
        from the first result row and ``retrieved_questions[i]`` is
        ``questions[ids[i]]``. Fewer than ``k`` entries are returned when the
        index reports fewer than ``k`` neighbours.
    """
    # Encode the query as a batch of one
    query_embedding = embedding_model.encode([query])

    # Search the index; the distances are not used
    _distances, indices = index.search(query_embedding, k)

    # FAISS pads a result row with -1 when it finds fewer than k neighbours.
    # Indexing a list with -1 would silently return its last element, so the
    # padding is dropped instead of being treated as a hit.
    hit_ids = indices[0]
    hit_ids = hit_ids[hit_ids >= 0]

    # Retrieve the matching questions, in ranking order
    retrieved_questions = [questions[idx] for idx in hit_ids]
    return retrieved_questions, hit_ids

def search_function(query):
    """Return the 5 nearest questions to ``query`` from the notebook-level index.

    Thin wrapper that calls ``search_question`` with the module-level
    ``index``, ``questions`` and ``embedding_model``, and passes its return
    value through unchanged.
    """
    return search_question(
        query=query,
        index=index,
        questions=questions,
        embedding_model=embedding_model,
        k=5,
    )

# Load the evaluation set; each item is later read through its 'query' and
# 'ground_truth_answer' keys. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
with open('data/ground_truth.json', 'r', encoding='utf-8') as f:
    ground_truth = json.load(f)


def generate_answer(query, retrieved_docs, model='gpt-3.5-turbo', max_tokens=150, temperature=0.7):
    """Answer ``query`` with a chat model, using the retrieved questions as context.

    Args:
        query: The user's question.
        retrieved_docs: Iterable of mappings with ``'question'`` and
            ``'correct_options'`` keys; each becomes one ``Q:``/``A:`` pair
            in the prompt context.
        model: Chat model name passed to the OpenAI API.
        max_tokens: Upper bound on the length of the generated reply.
        temperature: Sampling temperature passed to the OpenAI API.

    Returns:
        The model's reply text with surrounding whitespace removed.

    The defaults reproduce the original hard-coded settings, so existing
    two-argument calls behave exactly as before.
    NOTE(review): the GPT-4o mini variants in this notebook sample with
    temperature=0 while this default is 0.7, so scores are not compared
    under identical sampling settings — confirm this is intended.
    """
    context = "\n".join(
        f"Q: {doc['question']}\nA: {doc['correct_options']}" for doc in retrieved_docs
    )
    # The prompt text is kept exactly as it was (including its indentation) so
    # previously reported scores stay comparable.
    prompt = f"""
    You are an AI assistant helping users prepare for the German driving theory exam.

    Context:
    {context}

    Question:
    {query}

    Answer the question using the context above, don't include the letter at the beginning of each correct option. If the answer is not in the context, say "I'm sorry, I don't have enough information to answer that question."
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0].message.content.strip()

# Run retrieve -> generate -> grade on the first five ground-truth items only.
evaluations = []
for item in ground_truth[:5]:
    query, ground_truth_answer = item['query'], item['ground_truth_answer']
    retrieved_docs, _ = search_function(query)
    generated_answer = generate_answer(query, retrieved_docs)
    evaluations.append(
        evaluate_generated_answer(query, generated_answer, ground_truth_answer)
    )

In [5]:
# Display the raw grader replies (unparsed strings) collected by the five-item run.
evaluations

['{\n  "Score": 10,\n  "Explanation": "The answer is completely correct and provides the accurate information that truck drivers in Germany are not permitted to have any alcohol in their system while operating a vehicle."\n}',
 '{\n  "Score": 2,\n  "Explanation": "The answer provided is incorrect as the legal limit for blood alcohol concentration for truck drivers in Germany is 0.3 grams per liter, not 0.02%. The answer shows a lack of accuracy and understanding of the topic."\n}',
 '{\n  "Score": 8,\n  "Explanation": "The answer correctly identifies the action to take when approaching a sharp curve while driving a truck, which is to reduce speed before entering the curve. However, it could have been more detailed by mentioning the importance of maintaining control of the vehicle and preventing rollovers or loss of control."\n}',
 '{\n  "Score": 8,\n  "Explanation": "The answer provided the correct legal limit for blood alcohol concentration (BAC) for truck drivers in Germany. However,

In [6]:
# Run retrieve -> generate -> grade over the whole ground-truth set (Method 1).
evaluations = []
for item in ground_truth:
    query, ground_truth_answer = item['query'], item['ground_truth_answer']
    retrieved_docs, _ = search_function(query)
    generated_answer = generate_answer(query, retrieved_docs)
    evaluations.append(
        evaluate_generated_answer(query, generated_answer, ground_truth_answer)
    )

In [7]:
# Each grader reply is parsed as JSON; the mean of the "Score" fields is reported.
print(
    "Avg. score: "
    f"{np.mean([json.loads(reply)['Score'] for reply in evaluations])}"
)

Avg. score: 8.033333333333333


### Method 2: Chain-of-Thought (GPT-3.5 Turbo)

In [8]:
def generate_answer_cot(query, retrieved_docs, model='gpt-3.5-turbo', max_tokens=150, temperature=0.7):
    """Answer ``query`` with a step-by-step (chain-of-thought) prompt.

    Args:
        query: The user's question.
        retrieved_docs: Iterable of mappings with ``'question'`` and
            ``'correct_options'`` keys; each becomes one ``Q:``/``A:`` pair
            in the prompt context.
        model: Chat model name passed to the OpenAI API.
        max_tokens: Upper bound on the length of the generated reply.
        temperature: Sampling temperature passed to the OpenAI API.

    Returns:
        The model's reply text with surrounding whitespace removed.

    The defaults reproduce the original hard-coded settings, so existing
    two-argument calls behave exactly as before.
    NOTE(review): the prompt asks for step-by-step, detailed output while the
    default caps the reply at 150 tokens; longer answers may be cut off before
    the conclusion — confirm the cap is intended.
    """
    context = "\n".join(
        f"Q: {doc['question']}\nA: {doc['correct_options']}" for doc in retrieved_docs
    )
    # The prompt text is kept exactly as it was (including its indentation) so
    # previously reported scores stay comparable.
    prompt = f"""
    You are an AI assistant helping users prepare for the German driving theory exam.
    
    Context:
    {context}
    
    Question:
    {query}
    
    Think step-by-step and provide a detailed answer based on the context. If the answer is not in the context, say "I'm sorry, I don't have enough information to answer that question."
    """

    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0].message.content.strip()

In [9]:
# Run retrieve -> generate (chain-of-thought) -> grade over the ground-truth set.
evaluations_cot = []
for item in ground_truth:
    query, ground_truth_answer = item['query'], item['ground_truth_answer']
    retrieved_docs, _ = search_function(query)
    generated_answer = generate_answer_cot(query, retrieved_docs)
    evaluations_cot.append(
        evaluate_generated_answer(query, generated_answer, ground_truth_answer)
    )

In [10]:
# Each grader reply is parsed as JSON; the mean of the "Score" fields is reported.
print(
    "Avg. score: "
    f"{np.mean([json.loads(reply)['Score'] for reply in evaluations_cot])}"
)

Avg. score: 8.033333333333333


### Method 3: Simple prompt (GPT-4o mini)

In [11]:
def generate_answer_4o_mini(query, retrieved_docs, model='gpt-4o-mini', max_tokens=150, temperature=0):
    """Answer ``query`` with GPT-4o mini, using the retrieved questions as context.

    Args:
        query: The user's question.
        retrieved_docs: Iterable of mappings with ``'question'`` and
            ``'correct_options'`` keys; each becomes one ``Q:``/``A:`` pair
            in the prompt context.
        model: Chat model name passed to the OpenAI API.
        max_tokens: Upper bound on the length of the generated reply.
        temperature: Sampling temperature passed to the OpenAI API.

    Returns:
        The model's reply text with surrounding whitespace removed.

    The defaults reproduce the original hard-coded settings, so existing
    two-argument calls behave exactly as before.
    """
    context = "\n".join(
        f"Q: {doc['question']}\nA: {doc['correct_options']}" for doc in retrieved_docs
    )
    # The prompt text is kept exactly as it was (including its indentation) so
    # previously reported scores stay comparable.
    prompt = f"""
    You are an AI assistant helping users prepare for the German driving theory exam.

    Context:
    {context}

    Question:
    {query}

    Answer the question using the context above, don't include the letter at the beginning of each correct option. If the answer is not in the context, say "I'm sorry, I don't have enough information to answer that question."
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0].message.content.strip()

In [12]:
# Run retrieve -> generate (GPT-4o mini) -> grade over the ground-truth set.
evaluations_4o = []
for item in ground_truth:
    query, ground_truth_answer = item['query'], item['ground_truth_answer']
    retrieved_docs, _ = search_function(query)
    generated_answer = generate_answer_4o_mini(query, retrieved_docs)
    evaluations_4o.append(
        evaluate_generated_answer(query, generated_answer, ground_truth_answer)
    )

In [13]:
# Each grader reply is parsed as JSON; the mean of the "Score" fields is reported.
print(
    "Avg. score: "
    f"{np.mean([json.loads(reply)['Score'] for reply in evaluations_4o])}"
)

Avg. score: 8.3


### Method 4: Chain-of-Thought (GPT-4o mini)

In [14]:
def generate_answer_cot_4o_mini(query, retrieved_docs, model='gpt-4o-mini', max_tokens=150, temperature=0):
    """Answer ``query`` with GPT-4o mini using a step-by-step (chain-of-thought) prompt.

    Args:
        query: The user's question.
        retrieved_docs: Iterable of mappings with ``'question'`` and
            ``'correct_options'`` keys; each becomes one ``Q:``/``A:`` pair
            in the prompt context.
        model: Chat model name passed to the OpenAI API.
        max_tokens: Upper bound on the length of the generated reply.
        temperature: Sampling temperature passed to the OpenAI API.

    Returns:
        The model's reply text with surrounding whitespace removed.

    The defaults reproduce the original hard-coded settings, so existing
    two-argument calls behave exactly as before.
    NOTE(review): the prompt asks for step-by-step, detailed output while the
    default caps the reply at 150 tokens; longer answers may be cut off before
    the conclusion — confirm the cap is intended.
    """
    context = "\n".join(
        f"Q: {doc['question']}\nA: {doc['correct_options']}" for doc in retrieved_docs
    )
    # The prompt text is kept exactly as it was (including its indentation) so
    # previously reported scores stay comparable.
    prompt = f"""
    You are an AI assistant helping users prepare for the German driving theory exam.
    
    Context:
    {context}
    
    Question:
    {query}
    
    Think step-by-step and provide a detailed answer based on the context. If the answer is not in the context, say "I'm sorry, I don't have enough information to answer that question."
    """

    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0].message.content.strip()

In [15]:
# Run retrieve -> generate (GPT-4o mini, chain-of-thought) -> grade over the ground-truth set.
evaluations_cot_4o = []
for item in ground_truth:
    query, ground_truth_answer = item['query'], item['ground_truth_answer']
    retrieved_docs, _ = search_function(query)
    generated_answer = generate_answer_cot_4o_mini(query, retrieved_docs)
    evaluations_cot_4o.append(
        evaluate_generated_answer(query, generated_answer, ground_truth_answer)
    )

In [16]:
# Each grader reply is parsed as JSON; the mean of the "Score" fields is reported.
print(
    "Avg. score: "
    f"{np.mean([json.loads(reply)['Score'] for reply in evaluations_cot_4o])}"
)

Avg. score: 7.9


In [None]:
### Method 5: Few-Shot