### Retrieval method 1: minsearch

In [1]:
import minsearch

# minsearch index over the three text fields of each question record;
# no keyword fields are configured.
ms_index = minsearch.Index(text_fields=["theme", "chapter", "question"], keyword_fields=[])

In [2]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm

# Load the generated questions.
# The encoding is given explicitly so that parsing does not depend on the
# platform's default locale encoding (JSON files are UTF-8).
with open('data/generated_questions.json', 'r', encoding='utf-8') as f:
    questions = json.load(f)

# Fit the minsearch index on the loaded question records
ms_index.fit(questions)

<minsearch.Index at 0x10479e950>

In [3]:
def search_minsearch(query, num_results=5):
    """Search the notebook-level ``ms_index`` with fixed per-field boosts.

    Args:
        query: Query text passed to ``ms_index.search``.
        num_results: Maximum number of results to request. Defaults to 5,
            which is the value that was previously hard-coded.

    Returns:
        Whatever ``ms_index.search`` returns for the query.
    """
    # Matches on the question text are weighted highest, then theme, then chapter.
    boost = {'question': 3.0, 'theme': 2.0, 'chapter': 1.0}

    return ms_index.search(
        query=query,
        boost_dict=boost,
        num_results=num_results,
    )

In [4]:
# Try the minsearch retrieval on a sample query
search_minsearch(query="What does a red triangular sign indicate?")

[{'theme': 'Traffic Signs',
  'chapter': 'Understanding German Traffic Signs',
  'question': 'What does a triangular traffic sign with a red border and a white background indicate?',
  'options': ['A. Priority road', 'B. Give way', 'C. No entry'],
  'correct_options': ['B. Give way'],
  'comments': 'A triangular traffic sign with a red border and a white background indicates that drivers must give way to vehicles on the main road.'},
 {'theme': 'Traffic Signs',
  'chapter': 'Understanding German Traffic Signs',
  'question': 'What does a blue traffic sign with a white arrow pointing upwards indicate?',
  'options': ['A. One-way street',
   'B. Mandatory direction of travel',
   'C. No entry'],
  'correct_options': ['B. Mandatory direction of travel'],
  'comments': 'This sign indicates that drivers must follow the direction of the arrow and proceed in that direction.'},
 {'theme': 'Traffic Signs',
  'chapter': 'Understanding German Traffic Signs',
  'question': 'What does a blue circul

### Evaluate

In [5]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: Sized collection with one entry per query; each entry
            is an iterable of truthy/falsy relevance flags in rank order.

    Returns:
        The share of queries with any truthy flag, or 0.0 when
        ``relevance_total`` is empty (instead of raising ZeroDivisionError).
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    n_hits = sum(1 for relevance in relevance_total if any(relevance))
    return n_hits / n_queries

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: Sized collection with one entry per query; each entry
            is an iterable of truthy/falsy relevance flags in rank order.

    Returns:
        The average over queries of ``1 / rank`` of the first truthy flag
        (ranks start at 1). A query without any truthy flag contributes 0.
        Returns 0.0 when ``relevance_total`` is empty (instead of raising
        ZeroDivisionError).
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    total_score = 0.0
    for relevance in relevance_total:
        # Only the first relevant position counts; later hits are ignored.
        total_score += next(
            (1 / rank for rank, rel in enumerate(relevance, start=1) if rel),
            0.0,
        )
    return total_score / n_queries

In [6]:
relevance_total = []

# Read the ground truth with an explicit encoding so that parsing does not
# depend on the platform's default locale encoding (JSON files are UTF-8).
with open('data/ground_truth.json', 'r', encoding='utf-8') as f:
    ground_truth = json.load(f)

# For every ground-truth query, record (in rank order) whether each returned
# document's question text equals the expected question text.
for q in tqdm(ground_truth):
    doc_id = q['ground_truth_question']
    results = search_minsearch(query=q['query'])
    relevance = [d['question'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 818.44it/s]


In [7]:
# (hit rate, MRR) of the minsearch retrieval over the ground-truth queries
(hit_rate(relevance_total), mrr(relevance_total))

(0.9666666666666667, 0.9055555555555554)

### Retrieval method 2: FAISS (Euclidean / L2)

In [8]:
from sentence_transformers import SentenceTransformer

# Initialize the embedding model
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

# Build one text per question from its theme, chapter and question fields
question_texts = [f"{q['theme']} {q['chapter']} {q['question']}" for q in questions]

# Encode all texts in a single call so the model can batch them, instead of
# invoking encode() once per question. The result has one row per question.
embeddings = np.asarray(embedding_model.encode(question_texts))

In [9]:
import faiss

# The index width is read from the embedding matrix (rows = questions)
_, dimension = embeddings.shape

# Flat L2 index holding every question embedding
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)

In [10]:
def search_faiss(query, index, questions, embedding_model, k=5):
    """Retrieve the questions closest to ``query`` from a FAISS index.

    Args:
        query: Query text to embed.
        index: Index to search; must expose ``search(vectors, k)`` returning
            ``(distances, ids)`` arrays with one row per query vector.
        questions: Sequence of question records, positionally aligned with
            the vectors stored in ``index``.
        embedding_model: Model exposing ``encode(list_of_texts)``.
        k: Number of neighbours to request.

    Returns:
        ``(retrieved_questions, ids)``: the matching records in rank order and
        the array of their positions in ``questions``.
    """
    # Encode the query (as a one-element batch)
    query_embedding = embedding_model.encode([query])

    # Search the index that was passed in, rather than a notebook-level global
    _, indices = index.search(query_embedding, k)

    # FAISS fills missing neighbours with id -1 when fewer than k vectors are
    # available; drop those so they are not resolved to questions[-1].
    hit_ids = indices[0]
    hit_ids = hit_ids[hit_ids >= 0]

    # Retrieve the matching questions
    retrieved_questions = [questions[idx] for idx in hit_ids]
    return retrieved_questions, hit_ids

In [11]:
user_query = "What does a red triangular sign indicate?"

# Only the retrieved documents are shown; the returned row ids are not needed here
retrieved_docs = search_faiss(user_query, faiss_index, questions, embedding_model, k=5)[0]
retrieved_docs

[{'theme': 'Traffic Signs',
  'chapter': 'Understanding German Traffic Signs',
  'question': 'What does a triangular traffic sign with a red border and a white background indicate?',
  'options': ['A. Priority road', 'B. Give way', 'C. No entry'],
  'correct_options': ['B. Give way'],
  'comments': 'A triangular traffic sign with a red border and a white background indicates that drivers must give way to vehicles on the main road.'},
 {'theme': 'Traffic Signs',
  'chapter': 'Understanding German Traffic Signs',
  'question': 'What does a blue circular sign with a white arrow pointing upwards indicate?',
  'options': ['A. Mandatory direction of travel',
   'B. No entry',
   'C. Priority road'],
  'correct_options': ['A. Mandatory direction of travel'],
  'comments': 'This sign indicates that drivers must follow the direction of the arrow and proceed straight ahead.'},
 {'theme': 'Traffic Signs',
  'chapter': 'Understanding German Traffic Signs',
  'question': 'What does a blue circular 

### Retrieval method 3: FAISS (Cosine)

In [12]:
# Inner-product index; with unit-length vectors the inner product is the cosine similarity
index = faiss.IndexFlatIP(dimension)

# Scale every embedding row to unit L2 norm before adding it to the index
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings_normalized = embeddings / row_norms
index.add(embeddings_normalized)

### Evaluation: FAISS (Euclidean vs. Cosine)

In [13]:
def search_question(query, index, questions, embedding_model, k=5):
    """Retrieve the questions closest to ``query`` from the given index.

    Args:
        query: Query text to embed.
        index: Index to search; must expose ``search(vectors, k)`` returning
            ``(distances, ids)`` arrays with one row per query vector.
        questions: Sequence of question records, positionally aligned with
            the vectors stored in ``index``.
        embedding_model: Model exposing ``encode(list_of_texts)``.
        k: Number of neighbours to request.

    Returns:
        ``(retrieved_questions, ids)``: the matching records in rank order and
        the array of their positions in ``questions``.
    """
    # Encode the query (as a one-element batch)
    query_embedding = embedding_model.encode([query])

    # Search the index; the distances are not used
    _, indices = index.search(query_embedding, k)

    # FAISS fills missing neighbours with id -1 when fewer than k vectors are
    # available; drop those so they are not resolved to questions[-1].
    hit_ids = indices[0]
    hit_ids = hit_ids[hit_ids >= 0]

    # Retrieve the matching questions
    retrieved_questions = [questions[idx] for idx in hit_ids]
    return retrieved_questions, hit_ids

def search_function(query):
    """Run ``search_question`` for ``query`` against the notebook-level
    ``index``, ``questions`` and ``embedding_model``, requesting 5 results."""
    return search_question(
        query=query,
        index=index,
        questions=questions,
        embedding_model=embedding_model,
        k=5,
    )

def evaluate(ground_truth, search_function):
    """Score a retrieval function against ground-truth queries.

    Args:
        ground_truth: Iterable of dicts with the keys ``'query'`` and
            ``'relevant_doc_id'``.
        search_function: Callable taking a query and returning
            ``(results, indices)``; only ``indices`` is compared against
            ``'relevant_doc_id'``.

    Returns:
        Dict with ``'hit_rate'`` and ``'mrr'`` computed over all queries.
    """
    def _relevance_flags(item):
        # Read both fields up front, then search, then flag each returned id.
        query_text, expected_id = item['query'], item['relevant_doc_id']
        _, returned_ids = search_function(query_text)
        return [returned_id == expected_id for returned_id in returned_ids]

    relevance_total = [_relevance_flags(item) for item in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

def evaluate_retrieval_methods():
    """Evaluate FAISS retrieval with an L2 index and with a cosine index.

    Reads the notebook-level ``dimension``, ``embeddings``, ``ground_truth``,
    ``questions`` and ``embedding_model``. For each method a fresh index is
    built and scored with ``evaluate``.

    Returns:
        Dict mapping ``'Euclidean'`` and ``'Cosine'`` (in that order) to the
        result of ``evaluate`` for that method.
    """
    def _euclidean_index():
        # Flat L2 index over the embeddings as they are
        built = faiss.IndexFlatL2(dimension)
        built.add(embeddings)
        return built

    def _cosine_index():
        # Inner-product index over row-normalized embeddings (cosine similarity)
        built = faiss.IndexFlatIP(dimension)
        row_norms = np.linalg.norm(embeddings, axis=1)[:, np.newaxis]
        built.add(embeddings / row_norms)
        return built

    index_builders = {'Euclidean': _euclidean_index, 'Cosine': _cosine_index}

    scores = {}
    for method_name, build_index in index_builders.items():
        method_index = build_index()
        # Bind the index as a default argument so the search closure is tied
        # to this method's index.
        scores[method_name] = evaluate(
            ground_truth,
            lambda q, idx=method_index: search_question(q, idx, questions, embedding_model),
        )

    return scores

In [14]:
evaluate_retrieval_methods()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 32.15it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 73.26it/s]


{'Euclidean': {'hit_rate': 1.0, 'mrr': 0.8666666666666666},
 'Cosine': {'hit_rate': 1.0, 'mrr': 0.8666666666666666}}