### Prepare Embeddings

In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import json

# Load the generated questions. JSON is UTF-8 by specification, so the
# encoding is pinned instead of being left to the platform default.
with open('data/generated_questions.json', 'r', encoding='utf-8') as f:
    questions = json.load(f)

# Initialize the embedding model
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

# Build one text per question from its theme, chapter and question fields,
# then encode them all in a single call so the model can batch the work
# instead of being invoked once per question.
question_texts = [
    f"{q['theme']} {q['chapter']} {q['question']}" for q in questions
]
embeddings = np.array(embedding_model.encode(question_texts))

In [2]:
# Sanity check: number of rows in the embedding matrix (one per question)
embeddings.shape[0]

30

### Create and Save FAISS Index

In [3]:
import faiss

# Vector width comes from the second axis of the embedding matrix
dimension = embeddings.shape[1]

# Flat L2 index holding every question embedding
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Persist the index to disk
faiss.write_index(index, 'data/questions_index.faiss')

# Persist the questions next to it: entry i of this list is the record whose
# embedding was added to the index at position i
with open('data/index_to_question.json', 'w') as mapping_file:
    json.dump(questions, mapping_file)

### Retrieval Function

In [4]:
def search_question(query, index, questions, embedding_model, k=5):
    """Return the stored questions nearest to ``query``.

    Args:
        query: Free-text query string.
        index: Index object exposing ``search(vectors, k)`` and returning a
            ``(distances, ids)`` pair of 2-D arrays (as a FAISS index does).
        questions: Sequence of question records; the ids returned by
            ``index`` are used directly as positions in this sequence.
        embedding_model: Object whose ``encode`` accepts a list of strings
            and returns one embedding row per string.
        k: Number of neighbours to request from the index.

    Returns:
        A ``(retrieved_questions, ids)`` tuple. ``ids`` holds the positions of
        the hits in ranked order and ``retrieved_questions`` the matching
        records. Both contain at most ``k`` entries, and fewer when the index
        has fewer than ``k`` vectors.
    """
    # Encode as a one-element batch so the index receives a 2-D array
    query_embedding = embedding_model.encode([query])

    # Distances are not needed by callers; only the ranked ids are used
    _, indices = index.search(query_embedding, k)

    # FAISS fills missing neighbours with -1 when it cannot find k results.
    # Indexing a Python list with -1 would silently return the LAST question
    # as if it were a hit, so those placeholders are dropped here.
    hits = indices[0]
    valid_hits = hits[hits >= 0]

    retrieved_questions = [questions[idx] for idx in valid_hits]
    return retrieved_questions, valid_hits

### RAG 

In [5]:
import os
import openai


# Read the API key from the environment (None when the variable is unset)
openai.api_key = os.environ.get('OPENAI_API_KEY')

def generate_answer(query, retrieved_docs, model='gpt-4o-mini', max_tokens=150, temperature=0.7):
    """Answer ``query`` with an LLM, using the retrieved questions as context.

    Args:
        query: The user's question.
        retrieved_docs: Iterable of records; each must provide the keys
            ``'question'`` and ``'correct_options'``, which are rendered as
            ``Q:`` / ``A:`` pairs in the prompt context.
        model: Chat model name passed to the OpenAI API.
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature passed to the OpenAI API.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    context = "\n".join(
        f"Q: {doc['question']}\nA: {doc['correct_options']}" for doc in retrieved_docs
    )
    prompt = f"""
    You are an AI assistant helping users prepare for the German driving theory exam.

    Context:
    {context}

    Question:
    {query}

    Answer the question using the context above, don't include the letter at the beginning of each correct option. If the answer is not in the context, say "I'm sorry, I don't have enough information to answer that question."
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0].message.content.strip()

In [6]:
# Load questions
with open('data/index_to_question.json', 'r') as f:
    questions = json.load(f)

# Load FAISS index
index = faiss.read_index('data/questions_index.faiss')

# Retrieval uses the row ids returned by the index as positions in
# `questions`, so the two artifacts must hold the same number of entries.
# Fail loudly here rather than hand back mismatched questions later.
assert index.ntotal == len(questions), (
    f"FAISS index holds {index.ntotal} vectors but {len(questions)} questions were loaded"
)

# Load embedding model
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")

# Set OpenAI API key
openai.api_key = os.getenv('OPENAI_API_KEY')

In [7]:
user_query = "What is the maximum allowed blood alcohol concentration for truck drivers in Germany?"

# Retrieve the five nearest stored questions, then answer from them
retrieved_docs, _ = search_question(
    query=user_query,
    index=index,
    questions=questions,
    embedding_model=embedding_model,
    k=5,
)
answer = generate_answer(query=user_query, retrieved_docs=retrieved_docs)
print("Answer:", answer)

Answer: The maximum allowed blood alcohol concentration for truck drivers in Germany is 0.0%.


In [8]:
user_query = "What does a red triangular sign indicate?"

# Retrieve the five nearest stored questions, then answer from them
retrieved_docs, _ = search_question(
    query=user_query,
    index=index,
    questions=questions,
    embedding_model=embedding_model,
    k=5,
)
answer = generate_answer(query=user_query, retrieved_docs=retrieved_docs)
print("Answer:", answer)

Answer: I'm sorry, I don't have enough information to answer that question.


In [9]:
user_query = "What can happen if you ignore signs of tiredness?"

# Retrieve the five nearest stored questions, then answer from them
retrieved_docs, _ = search_question(
    query=user_query,
    index=index,
    questions=questions,
    embedding_model=embedding_model,
    k=5,
)
answer = generate_answer(query=user_query, retrieved_docs=retrieved_docs)
print("Answer:", answer)

Answer: I'm sorry, I don't have enough information to answer that question.


In [10]:
user_query = "What could cause the vehicle to leave the road?"

# Retrieve the five nearest stored questions, then answer from them
retrieved_docs, _ = search_question(
    query=user_query,
    index=index,
    questions=questions,
    embedding_model=embedding_model,
    k=5,
)
answer = generate_answer(query=user_query, retrieved_docs=retrieved_docs)
print("Answer:", answer)

Answer: I'm sorry, I don't have enough information to answer that question.


### Retrieval Evaluation

In [11]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains a relevant item.

    Args:
        relevance_total: One iterable of booleans per query, where a truthy
            entry marks a relevant result at that rank.

    Returns:
        Number of queries with at least one truthy entry divided by the
        number of queries, or ``0.0`` when there are no queries.
    """
    # An empty evaluation set has no hits; avoid dividing by zero
    if len(relevance_total) == 0:
        return 0.0
    cnt = sum(1 for relevance in relevance_total if any(relevance))
    return cnt / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One iterable of booleans per query, ordered by rank,
            where a truthy entry marks a relevant result.

    Returns:
        The average over all queries of ``1 / rank`` of the first truthy
        entry (ranks start at 1). A query with no truthy entry contributes 0.
        Returns ``0.0`` when there are no queries.
    """
    # An empty evaluation set scores zero; avoid dividing by zero
    if len(relevance_total) == 0:
        return 0.0
    total_score = 0.0
    for relevance in relevance_total:
        for rank, rel in enumerate(relevance, start=1):
            if rel:
                # Only the first relevant result counts for this query
                total_score += 1 / rank
                break
    return total_score / len(relevance_total)

In [12]:
# Evaluation set: read the whole file, then parse it as JSON
with open('data/ground_truth.json', 'r') as gt_file:
    ground_truth = json.loads(gt_file.read())

In [13]:
def search_function(query):
    """Search the notebook-level index for ``query`` and return the top 5.

    Thin adapter around ``search_question`` that binds the module-level
    ``index``, ``questions`` and ``embedding_model`` so callers only pass the
    query text. Returns whatever ``search_question`` returns.
    """
    return search_question(
        query=query,
        index=index,
        questions=questions,
        embedding_model=embedding_model,
        k=5,
    )

from tqdm import tqdm

def evaluate(ground_truth, search_function):
    """Score a retrieval function against labelled queries.

    Args:
        ground_truth: Iterable of records, each with a ``'query'`` string and
            the ``'relevant_doc_id'`` expected among the results.
        search_function: Callable taking a query and returning a
            ``(results, indices)`` pair; only ``indices`` is inspected.

    Returns:
        Dict with the keys ``'hit_rate'`` and ``'mrr'`` computed over the
        per-query relevance flags.
    """
    def relevance_flags(item):
        # One boolean per returned id: does it match the expected document?
        query = item['query']
        relevant_doc_id = item['relevant_doc_id']
        _, indices = search_function(query)
        return [idx == relevant_doc_id for idx in indices]

    relevance_total = [relevance_flags(item) for item in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Score the retriever on the ground-truth set and report both metrics
results = evaluate(ground_truth, search_function)
print(f"Hit Rate: {results['hit_rate']}")
print(f"MRR: {results['mrr']}")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 38.75it/s]

Hit Rate: 1.0
MRR: 0.8666666666666666





## RAG Evaluation (LLM-as-a-judge)

In [17]:
def evaluate_generated_answer(query, generated_answer, ground_truth_answer,
                              model='gpt-3.5-turbo', max_tokens=150, temperature=0):
    """Ask an LLM judge to grade a generated answer against the reference.

    Args:
        query: The question that was asked.
        generated_answer: The answer produced by the RAG pipeline.
        ground_truth_answer: The reference answer to grade against.
        model: Chat model name used as the judge.
        max_tokens: Upper bound on the length of the judge's reply.
        temperature: Sampling temperature passed to the OpenAI API.

    Returns:
        The judge's raw reply with surrounding whitespace stripped. The
        prompt requests a JSON object with ``Score`` and ``Explanation``
        keys, but the reply is returned unparsed and unvalidated.
    """
    prompt = f"""
You are to grade the following LLM answer on a scale from 0 to 10.

**Question:**
{query}

**LLM Answer:**
{generated_answer}

**Correct Answer:**
{ground_truth_answer}

**Task:**
- Assign a score from 0 (completely incorrect) to 10 (completely correct).
- Provide a brief justification for the score.

**Response Format:**
Provide your evaluation in parsable JSON without using code blocks:

{{
  "Score": [0-10],
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0].message.content.strip()

In [22]:
# For every ground-truth item: retrieve, generate an answer, then have the
# judge grade it against the reference answer. Raw judge replies are kept.
evaluations = []
for item in ground_truth:
    query, ground_truth_answer = item['query'], item['ground_truth_answer']
    retrieved_docs, _ = search_function(query)
    generated_answer = generate_answer(query, retrieved_docs)
    evaluation = evaluate_generated_answer(
        query, generated_answer, ground_truth_answer
    )
    evaluations.append(evaluation)

In [23]:
# Peek at the first five raw judge replies
evaluations[0:5]

['{\n  "Score": 10,\n  "Explanation": "The answer is completely correct and provides the accurate information that truck drivers in Germany are not permitted to have any alcohol in their system while operating a vehicle."\n}',
 '{\n  "Score": 2,\n  "Explanation": "The answer provided is incorrect as the legal limit for blood alcohol concentration for truck drivers in Germany is 0.3 grams per liter, not 0.02%. The answer shows a lack of accuracy and understanding of the topic."\n}',
 '{\n  "Score": 8,\n  "Explanation": "The answer correctly identifies the action of reducing speed before entering a sharp curve while driving a truck. However, it could be improved by mentioning the specific reason for reducing speed, such as maintaining control of the vehicle and preventing rollovers or loss of control."\n}',
 '{\n  "Score": 8,\n  "Explanation": "The answer provided the correct legal limit for blood alcohol concentration (BAC) for truck drivers in Germany. However, it could have been impro

In [24]:
# Parse each judge reply as JSON and average the "Score" fields
scores = [json.loads(raw_reply)['Score'] for raw_reply in evaluations]
print(f"Avg. score: {np.mean(scores)}")

Avg. score: 8.1
