In [1]:
pip install -U minsearch qdrant_client rouge

Collecting minsearch
  Downloading minsearch-0.0.4-py3-none-any.whl.metadata (8.1 kB)
Collecting qdrant_client
  Downloading qdrant_client-1.15.0-py3-none-any.whl.metadata (11 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting pandas (from minsearch)
  Downloading pandas-2.3.1-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn (from minsearch)
  Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn->minsearch)
  Downloading scipy-1.16.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn->minsearch)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->minsearch)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading minsearch-0.0.4-py3-none-any.whl (11 kB)
Downloading qdrant_client-1.15.0-py3-none-any.whl (337 kB)
Downloading rouge-1.0.1-py3-none

In [2]:
import requests
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# Install required libraries
# !pip install -U minsearch qdrant_client rouge

# Load the data
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

# Evaluation functions
def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt = cnt + 1
    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)
    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
from minsearch import Index
import inspect

# Check the signature of the search method
print(inspect.signature(Index.search))

# Create a basic index
index = Index(
    text_fields={
        'question': 1.5,
        'text': 1.0,
        'section': 0.1
    },
    keyword_fields={'course'}
)
index.fit(documents)

# Try a basic search without any parameters
results = index.search("How to train a model?")
print(f"Number of results: {len(results)}")
print(f"First result: {results[0] if results else None}")

# Define search function based on what we learn
def search_function_q1(q):
    # We'll adjust this based on what we learn about the API
    results = index.search(q['question'])
    # If there's no limit parameter, we'll slice the results
    return results[:5]

# Evaluate
results_q1 = evaluate(ground_truth, search_function_q1)
print(f"Hit rate: {results_q1['hit_rate']}")
print(f"MRR: {results_q1['mrr']}")

(self, query, filter_dict=None, boost_dict=None, num_results=10, output_ids=False)
Number of results: 10
First result: {'text': "What if there were hundreds of columns? How do you get the columns only with numeric or object data in a more concise way?\ndf.select_dtypes(include=np.number).columns.tolist()\ndf.select_dtypes(include='object').columns.tolist()\nAdded by Gregory Morris", 'section': '1. Introduction to Machine Learning', 'question': 'How to select column by dtype', 'course': 'machine-learning-zoomcamp', 'id': 'ff4da2b6'}


100%|██████████| 1830/1830 [00:08<00:00, 212.87it/s]

Hit rate: 0.726775956284153
MRR: 0.6128051001821502





In [12]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Create embeddings for questions
texts = []
for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

# Create and fit the vector index
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

# Define search function
def search_function_q2(q):
    q_vector = pipeline.transform([q['question']])
    # Check if VectorSearch.search accepts 'limit' parameter
    try:
        return vindex.search(q_vector[0], limit=5)
    except TypeError:
        # If 'limit' is not accepted, try without it and slice the results
        results = vindex.search(q_vector[0])
        return results[:5]

# Evaluate
results_q2 = evaluate(ground_truth, search_function_q2)
print(f"Hit rate: {results_q2['hit_rate']}")
print(f"MRR: {results_q2['mrr']}")

100%|██████████| 1830/1830 [00:04<00:00, 454.61it/s]

Hit rate: 0.3994535519125683
MRR: 0.29087431693989035





In [13]:
# Create embeddings for question + answer
texts = []
for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

# Create and fit the vector index
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

# Define search function
def search_function_q3(q):
    q_vector = pipeline.transform([q['question']])
    # Check if VectorSearch.search accepts 'limit' parameter
    try:
        return vindex.search(q_vector[0], limit=5)
    except TypeError:
        # If 'limit' is not accepted, try without it and slice the results
        results = vindex.search(q_vector[0])
        return results[:5]

# Evaluate
results_q3 = evaluate(ground_truth, search_function_q3)
print(f"Hit rate: {results_q3['hit_rate']}")
print(f"MRR: {results_q3['mrr']}")

100%|██████████| 1830/1830 [00:08<00:00, 205.59it/s]

Hit rate: 0.773224043715847
MRR: 0.6104826958105655





In [15]:
# Install the sentence-transformers package
!pip install sentence-transformers

from qdrant_client import QdrantClient
from qdrant_client.http import models
from sentence_transformers import SentenceTransformer

# Initialize the model
model_handle = "jinaai/jina-embeddings-v2-small-en"
model = SentenceTransformer(model_handle)

# Create texts combining question and answer
texts = []
for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

# Generate embeddings
embeddings = model.encode(texts)

# Initialize Qdrant client (using in-memory storage for this example)
client = QdrantClient(":memory:")

# Create collection
client.create_collection(
    collection_name="faq",
    vectors_config=models.VectorParams(
        size=model.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    )
)

# Upload vectors
client.upload_points(
    collection_name="faq",
    points=[
        models.PointStruct(
            id=idx,
            vector=embedding.tolist(),
            payload=doc
        )
        for idx, (doc, embedding) in enumerate(zip(documents, embeddings))
    ]
)

# Define search function
def search_function_q4(q):
    query_vector = model.encode(q['question']).tolist()
    results = client.search(
        collection_name="faq",
        query_vector=query_vector,
        limit=5
    )
    
    # Make sure we're returning the payload in the expected format
    try:
        return [point.payload for point in results]
    except AttributeError:
        # If results are returned in a different format
        if isinstance(results[0], dict) and 'payload' in results[0]:
            return [point['payload'] for point in results]
        else:
            # Try to adapt to the structure we have
            print("Warning: Unexpected result format from Qdrant. Check the structure:")
            print(results[0])
            # Return a best guess
            return results

# Evaluate
results_q4 = evaluate(ground_truth, search_function_q4)
print(f"Hit rate: {results_q4['hit_rate']}")
print(f"MRR: {results_q4['mrr']}")

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.7.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch>=1.11.0->sentence-transformers)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch>=1.11.0->sentence-transformers)
  Using cached MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl.metadata (4.1 kB)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.int

Hit rate: 0.1366120218579235
MRR: 0.08619307832422587





In [16]:
import numpy as np

# Load results
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

# Define cosine similarity function with error handling
def cosine(u, v):
    try:
        u_norm = np.sqrt(u.dot(u))
        v_norm = np.sqrt(v.dot(v))
        
        # Check for zero division
        if u_norm == 0 or v_norm == 0:
            return 0.0
            
        return u.dot(v) / (u_norm * v_norm)
    except Exception as e:
        print(f"Error calculating cosine similarity: {e}")
        return 0.0  # Return 0 for problematic vectors

# Create embeddings pipeline
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Fit the pipeline on all text data
all_texts = df_results.answer_llm.fillna('') + ' ' + df_results.answer_orig.fillna('') + ' ' + df_results.question.fillna('')
pipeline.fit(all_texts)

# Calculate cosine similarity for each pair
cosine_similarities = []
for _, row in df_results.iterrows():
    # Handle potential NaN values
    llm_answer = row.answer_llm if pd.notna(row.answer_llm) else ''
    orig_answer = row.answer_orig if pd.notna(row.answer_orig) else ''
    
    # Skip empty answers
    if not llm_answer or not orig_answer:
        continue
        
    v_llm = pipeline.transform([llm_answer])[0]
    v_orig = pipeline.transform([orig_answer])[0]
    sim = cosine(v_llm, v_orig)
    cosine_similarities.append(sim)

# Calculate average cosine similarity
if cosine_similarities:
    avg_cosine = np.mean(cosine_similarities)
    print(f"Average cosine similarity: {avg_cosine}")
else:
    print("No valid cosine similarities calculated")

Average cosine similarity: 0.8415841233490402


In [17]:
from rouge import Rouge
import numpy as np

# Initialize Rouge scorer
rouge_scorer = Rouge()

# Check the Rouge score for the 10th document
try:
    r = df_results.iloc[10]
    if pd.notna(r.answer_llm) and pd.notna(r.answer_orig) and r.answer_llm and r.answer_orig:
        scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
        print(f"Rouge scores for 10th document: {scores}")
        print(f"Document ID: {r.get('doc_id', 'N/A')}")
    else:
        print("Cannot calculate Rouge score for 10th document: Empty or NaN answers")
except Exception as e:
    print(f"Error calculating Rouge score for 10th document: {e}")

# Calculate Rouge scores for all pairs
rouge1_f1_scores = []
skipped_count = 0

for idx, row in df_results.iterrows():
    try:
        # Handle potential NaN values
        llm_answer = row.answer_llm if pd.notna(row.answer_llm) else ''
        orig_answer = row.answer_orig if pd.notna(row.answer_orig) else ''
        
        # Skip empty answers
        if not llm_answer or not orig_answer:
            skipped_count += 1
            continue
            
        scores = rouge_scorer.get_scores(llm_answer, orig_answer)[0]
        rouge1_f1_scores.append(scores['rouge-1']['f'])
    except Exception as e:
        skipped_count += 1
        # Print error for debugging but only for the first few occurrences
        if skipped_count <= 5:
            print(f"Error calculating Rouge score for row {idx}: {e}")

# Calculate average Rouge-1 F1 score
if rouge1_f1_scores:
    avg_rouge1_f1 = np.mean(rouge1_f1_scores)
    print(f"Average Rouge-1 F1 score: {avg_rouge1_f1}")
    print(f"Calculated scores for {len(rouge1_f1_scores)} out of {len(df_results)} rows")
    print(f"Skipped {skipped_count} rows due to errors or empty answers")
else:
    print("No valid Rouge scores calculated")

Rouge scores for 10th document: {'rouge-1': {'r': 0.45454545454545453, 'p': 0.45454545454545453, 'f': 0.45454544954545456}, 'rouge-2': {'r': 0.21621621621621623, 'p': 0.21621621621621623, 'f': 0.21621621121621637}, 'rouge-l': {'r': 0.3939393939393939, 'p': 0.3939393939393939, 'f': 0.393939388939394}}
Document ID: N/A
Average Rouge-1 F1 score: 0.3516946452113943
Calculated scores for 1830 out of 1830 rows
Skipped 0 rows due to errors or empty answers
