### Prepare Embeddings

In [44]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import json

# Load your generated questions
with open('data/generated_questions.json', 'r') as f:
    questions = json.load(f)

# Initialize the embedding model
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

# Encode the questions
embeddings = []
for q in questions:
    q_text = f"{q['theme']} {q['chapter']} {q['question']}"
    embedding = embedding_model.encode(q_text)
    embeddings.append(embedding)

embeddings = np.array(embeddings)

In [45]:
len(embeddings)

30

### Create and Save FAISS Index

In [46]:
import faiss

# Define the dimension
dimension = embeddings.shape[1]

# Create FAISS index
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save the index
faiss.write_index(index, 'data/questions_index.faiss')

# Save the mapping of index to questions
with open('data/index_to_question.json', 'w') as f:
    json.dump(questions, f)

### Retrieval Function

In [47]:
def search_question(query, index, questions, embedding_model, k=5):
    # Encode the query
    query_embedding = embedding_model.encode([query])

    # Search the index
    distances, indices = index.search(query_embedding, k)

    # Retrieve the top k questions
    retrieved_questions = [questions[idx] for idx in indices[0]]
    return retrieved_questions, indices[0]

### RAG 

In [48]:
import os
import openai


openai.api_key = os.getenv('OPENAI_API_KEY')

def generate_answer(query, retrieved_docs):
    context = "\n".join([f"Q: {doc['question']}\nA: {doc['correct_options']}" for doc in retrieved_docs])
    prompt = f"""
    You are an AI assistant helping users prepare for the German driving theory exam.

    Context:
    {context}

    Question:
    {query}

    Answer the question using the context above, don't include the letter at the beginning of each correct option. If the answer is not in the context, say "I'm sorry, I don't have enough information to answer that question."
    """
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',  # Use 'gpt-4' if available
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
        temperature=0.7,
    )
    return response.choices[0].message.content.strip()

In [49]:
# Load questions
with open('data/index_to_question.json', 'r') as f:
    questions = json.load(f)

# Load FAISS index
index = faiss.read_index('data/questions_index.faiss')

# Load embedding model
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")

# Set OpenAI API key
openai.api_key = os.getenv('OPENAI_API_KEY')

In [50]:
user_query = "What is the maximum allowed blood alcohol concentration for truck drivers in Germany?"

retrieved_docs, _ = search_question(user_query, index, questions, embedding_model, k=5)
answer = generate_answer(user_query, retrieved_docs)
print(f"Answer: {answer}")

Answer: The maximum allowable blood alcohol concentration for truck drivers in Germany is 0.02%.


In [51]:
user_query = "What does a red triangular sign indicate?"

retrieved_docs, _ = search_question(user_query, index, questions, embedding_model, k=5)
answer = generate_answer(user_query, retrieved_docs)
print(f"Answer: {answer}")

Answer: A red triangular sign indicates "B. Yield" according to German traffic signs.


In [52]:
user_query = "What can happen if you ignore signs of tiredness?"

retrieved_docs, _ = search_question(user_query, index, questions, embedding_model, k=5)
answer = generate_answer(user_query, retrieved_docs)
print(f"Answer: {answer}")

Answer: Ignoring signs of tiredness while driving can lead to decreased reaction times, impaired judgement, and an increased risk of accidents. It is important to prioritize rest and take breaks when feeling tired to ensure safe driving practices.


In [53]:
user_query = "What could cause the vehicle to leave the road?"

retrieved_docs, _ = search_question(user_query, index, questions, embedding_model, k=5)
answer = generate_answer(user_query, retrieved_docs)
print(f"Answer: {answer}")

Answer: Possible causes for a vehicle to leave the road could include distractions, drowsiness, oversteering, or understeering.


### Retrieval Evaluation

In [54]:
def hit_rate(relevance_total):
    cnt = sum(1 for relevance in relevance_total if any(relevance))
    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0
    for relevance in relevance_total:
        for rank, rel in enumerate(relevance):
            if rel:
                total_score += 1 / (rank + 1)
                break
    return total_score / len(relevance_total)

In [55]:
with open('ground_truth.json', 'r') as f:
    ground_truth = json.load(f)

In [56]:
def search_function(query):
    return search_question(query, index, questions, embedding_model, k=5)

from tqdm import tqdm

def evaluate(ground_truth, search_function):
    relevance_total = []

    for item in tqdm(ground_truth):
        query = item['query']
        relevant_doc_id = item['relevant_doc_id']
        results, indices = search_function(query)
        relevance = [idx == relevant_doc_id for idx in indices]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

results = evaluate(ground_truth, search_function)
print("Hit Rate:", results['hit_rate'])
print("MRR:", results['mrr'])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 97.87it/s]

Hit Rate: 1.0
MRR: 0.9333333333333333





## RAG Evaluation (LLM-as-a-jugde)

In [65]:
def evaluate_generated_answer(query, generated_answer, ground_truth_answer):
    prompt = """
    You are to grade the following LLM answer on a scale from 0 to 10.

    **Question:**
    {query}

    **LLM Answer:**
    {generated_answer}

    **Correct Answer:**
    {ground_truth_answer}

    **Task:**
    - Assign a score from 0 (completely incorrect) to 10 (completely correct).
    - Provide a brief justification for the score.

    **Response Format:**
    Provide your evaluation in parsable JSON without using code blocks:

    {{
      "Score": [0-10],
      "Explanation": "[Provide a brief explanation for your evaluation]"
    }}
""".strip()
    
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
        temperature=0,
    )
    return response.choices[0].message.content.strip()

In [66]:
evaluations = []
for item in ground_truth[:5]:
    query = item['query']
    ground_truth_answer = item['ground_truth_answer']
    retrieved_docs, _ = search_function(query)
    generated_answer = generate_answer(query, retrieved_docs)
    evaluation = evaluate_generated_answer(query, generated_answer, ground_truth_answer)
    evaluations.append(evaluation)



In [67]:
evaluations

['{\n  "Score": 7,\n  "Explanation": "The LLM answer provides relevant information and addresses the question, but there are some inaccuracies and missing details. It demonstrates a good understanding of the topic but could be improved with more specific and accurate information."\n}',
 '{\n  "Score": 5,\n  "Explanation": "The LLM answer partially addresses the question but lacks depth and accuracy. It provides some relevant information but fails to fully answer the query. There is room for improvement in providing a more comprehensive and accurate response."\n}',
 '{\n  "Score": 7,\n  "Explanation": "The LLM answer provides relevant information and addresses the question, but there are some inaccuracies and missing details compared to the correct answer. It demonstrates a good understanding of the topic but could be more precise."\n}',
 '{\n  "Score": 5,\n  "Explanation": "The LLM answer partially addresses the question but lacks depth and accuracy. It provides some relevant informati