In [1]:
!pip show minsearch

Name: minsearch
Version: 0.0.4
Summary: Minimalistic text search engine that uses sklearn and pandas
Home-page: https://github.com/alexeygrigorev/minsearch
Author: 
Author-email: Alexey Grigorev <alexey@datatalks.club>
License: WTFPL
Location: C:\tools\miniconda3\envs\llm-zoomcamp\Lib\site-packages
Requires: numpy, pandas, scikit-learn
Required-by: 


## Evaluation data

In [2]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()                   #documents from the FAQ database

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url) 
ground_truth = df_ground_truth.to_dict(orient='records')    #generated question-answer pairs

In [4]:
# evaluation retrieval
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

## Q1. Minsearch text

In [5]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

index.fit(documents)



<minsearch.minsearch.Index at 0x2419e4db850>

In [6]:
boost = {'question': 1.5, 'section': 0.1}

def search(q):
    query = q['question']
    course = q['course']

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

# evaluate the search function
results = evaluate(ground_truth, search)
print(f"Hit Rate: {results['hit_rate']:.2f}")
print(f"(MRR: {results['mrr']:.2f})")

  0%|          | 0/4627 [00:00<?, ?it/s]

Hit Rate: 0.85
(MRR: 0.73)


In [7]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [8]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [9]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x241b6acf9d0>

## Q2 Vector search for 'question'

In [10]:
#evaluate the vector search function
def vector_search(q):
    query = q['question']
    course = q['course']

    query_vector = pipeline.transform([query])
    results = vindex.search(
        query_vector=query_vector,
        filter_dict={'course': course},
        num_results=5
    )

    return results
results = evaluate(ground_truth, vector_search)

print(f"MRR: {results['mrr']:.2f}")
print(f"(Hit Rate: {results['hit_rate']:.2f})")

  0%|          | 0/4627 [00:00<?, ?it/s]

MRR: 0.36
(Hit Rate: 0.48)


## Q3. Vector search for 'question and answer'

In [11]:
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)    

<minsearch.vector.VectorSearch at 0x2419ce7c450>

In [12]:
results = evaluate(ground_truth, vector_search)

print(f"(MRR: {results['mrr']:.2f})")
print(f"Hit Rate: {results['hit_rate']:.2f}")

  0%|          | 0/4627 [00:00<?, ?it/s]

(MRR: 0.67)
Hit Rate: 0.82


## Q4. Qdrant

start DockerDesktop app, then run the server command:
docker run -p 6333:6333 qdrant/qdrant

In [None]:
%pip install sentence-transformers

from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

# Connect to Qdrant (assuming local instance)
client = QdrantClient("localhost", port=6333)

# Use Jina embeddings model
# Something beats me here. With the model suggested in the homework 'jinaai/jina-embeddings-v2-small-en' MRR is 0.15; 
# if I change the model name's "small" to "base" that MRR jumps to 0.86 (same code everywhere else)!
model = SentenceTransformer("jinaai/jina-embeddings-v2-base-en", trust_remote_code=True)

# Prepare texts for embedding (improved: more robust and readable)
texts = [
    f"{doc.get('question', '')} {doc.get('text', '')}".strip()
    for doc in documents
]

embeddings = model.encode(texts, show_progress_bar=True)

# Create collection if not exists
collection_name = "faq_documents"
#if not client.collection_exists(collection_name):
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(size=embeddings.shape[1], distance="Cosine")
)



Note: you may need to restart the kernel to use updated packages.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/275M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

True

In [16]:
from qdrant_client.models import PointStruct, Filter, FieldCondition, MatchValue
import uuid

#index documents with embeddings
points = []
for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
    points.append(
        PointStruct(
            id=i,
            vector=embedding.tolist(),
            payload=doc
        )
    )


# Upload documents with embeddings
client.upsert(
    collection_name=collection_name,
    points=points,
    wait=True  # Ensures operation completes before proceeding
)

def qdrant_search(q, limit=5):
    # Use both question and text if that's how you indexed
    query_text = q['question'] + ' ' + q.get('text', '')
    course = q['course']
    query_vector = model.encode([query_text])[0]
    
    # Search in Qdrant with course filter
    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_vector.tolist(),
        query_filter=Filter(
            must=[
                FieldCondition(
                    key="course",
                    match=MatchValue(value=course)
                )
            ]
        ),
        limit=limit
    )
    
    # Convert results to the expected format
    results = []
    for hit in search_result:
        results.append(hit.payload)
    
    return results

# Evaluate
results_q = evaluate(ground_truth, qdrant_search)

print(f"MRR: {results_q['mrr']:.2f}")
print(f"(Hit Rate: {results_q['hit_rate']:.2f})")

  0%|          | 0/4627 [00:00<?, ?it/s]

  search_result = client.search(


MRR: 0.86
(Hit Rate: 0.93)


## Q5. Cosine simiarity

In [None]:
import numpy as np

def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [19]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [20]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [21]:
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [23]:
import numpy as np
from itertools import combinations

# Transform texts to get embeddings
embeddings = pipeline.transform(texts)

# Calculate cosine similarity for each pair

cosines = []
for i, j in combinations(range(embeddings.shape[0]), 2):
    cos_sim = cosine(embeddings[i], embeddings[j])
    cosines.append(cos_sim)

# Compute the average cosine similarity
average_cosine = np.mean(cosines)
print(f"Average cosine similarity: {average_cosine:.4f}")

Average cosine similarity: 0.1960
