### Preparation

In [None]:
# %pip install -U minsearch qdrant_client

In [3]:
import numpy as np
import pandas as pd
import requests

# Base location of the course's evaluation assets on GitHub.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents to index. Bound the wait with a timeout and fail loudly on an HTTP
# error instead of trying to parse an error page as JSON.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth rows as a list of dicts; later cells read the 'question',
# 'course' and 'document' keys of each record.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [7]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose relevance list contains True.

    relevance_total holds one list of relevance flags per query.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    relevance_total holds one list of relevance flags per query, ordered by
    rank. Each query contributes 1 / rank of its first relevant result
    (ranks start at 1), or 0 when no result is relevant.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not truthiness) mirrors hit_rate's `True in line` test.
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this a ranking
                # with several relevant entries would score above 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run search_function on every ground-truth record and score the rankings.

    Each record supplies the expected document id under 'document'; a result
    is relevant when its 'id' equals that id. Returns a dict holding the
    'hit_rate' and 'mrr' computed over all records.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(record)
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    scores = {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }
    return scores

  from .autonotebook import tqdm as notebook_tqdm


### Q1. Minsearch text

In [4]:
# evaluate our usual minsearch approach

import minsearch

# Keyword-search index: three free-text fields plus two keyword fields
# ('course' is what minsearch_search filters on).
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

# Build the index over the downloaded documents.
index.fit(documents)

<minsearch.minsearch.Index at 0x76b71851b0b0>

In [5]:
def minsearch_search(query, course):
    """Return the top 5 matches for query from the minsearch index, limited to course.

    The 'question' field is boosted by 1.5 and the 'section' field by 0.1.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [6]:
# Score the keyword-search baseline on every ground-truth question.
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

100%|██████████| 4627/4627 [00:14<00:00, 328.20it/s]


{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

In [None]:
# Q: What's the hit rate for this approach?

# A: 0.8487

In [7]:
# Embeddings
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [22]:
# Represent every document by its question text only.
texts = [doc['question'] for doc in documents]

# TF-IDF (min_df=3) followed by a 128-component truncated SVD; the fixed
# random_state keeps the projection reproducible between runs.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

### Q2. Vector search for question

In [24]:
# Vector index over the embeddings in X; 'course' is kept as a keyword field
# so searches can be filtered by course.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x76b716572de0>

In [36]:
def minsearch_vector_search(query, course):
    """Return the top 5 vector-search matches for query, limited to course.

    The query is embedded with the module-level `pipeline` and looked up in
    the module-level `vindex`; both are resolved when the function is called.
    """
    query_vector = pipeline.transform([query])
    course_filter = {'course': course}
    return vindex.search(query_vector, filter_dict=course_filter, num_results=5)

In [37]:
# Score vector search over the question-only embeddings.
evaluate(ground_truth, lambda q: minsearch_vector_search(q['question'], q['course']))

100%|██████████| 4627/4627 [00:06<00:00, 673.86it/s]


{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}

In [None]:
# Q: Evaluate this search method. What's the MRR for it?

# A: 0.357

### Q3. Vector search for question and answer

In [38]:
# Represent every document by its question followed by its answer text.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Same TF-IDF + SVD configuration as for the question-only embeddings.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [42]:
# Rebuild the vector index over the question+answer embeddings in X.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

# minsearch_vector_search (defined in the Q2 section) looks up `pipeline` and
# `vindex` when it is called, so it already uses the objects rebuilt for Q3;
# redefining it here would only duplicate the code.
evaluate(ground_truth, lambda q: minsearch_vector_search(q['question'], q['course']))

100%|██████████| 4627/4627 [00:07<00:00, 650.60it/s]


{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

In [None]:
# Using the same pipeline (`min_df=3` for the TF-IDF vectorizer and `n_components=128` for SVD), evaluate the performance of this approach.

# Q: What's the hitrate?

# A: 0.821

### Q4. Qdrant

In [None]:
# Start a Qdrant container in detached mode, publishing ports 6333 and 6334
# and mounting ./qdrant_storage as the container's storage directory.
!docker run -d -p 6333:6333 -p 6334:6334 \
   -v "./qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant

In [15]:
from qdrant_client import QdrantClient, models

# Connect to the local Qdrant instance and list its collections as a quick
# connectivity check.
client = QdrantClient("http://localhost:6333")
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='zoomcamp-dense')])

In [28]:
# Drop the existing "zoomcamp-dense" collection so the next cell can recreate it.
client.delete_collection(collection_name="zoomcamp-dense")

True

In [29]:
import uuid

# Create the collection with a single dense vector configuration
# (512 dimensions, cosine distance).
# NOTE(review): size=512 is presumably the output dimension of the embedding
# model named below — confirm if the model changes.
client.create_collection(
    collection_name="zoomcamp-dense",
    vectors_config=models.VectorParams(
            size=512,
            distance=models.Distance.COSINE,
        )
)


# Upload one point per document into the newly created collection.
client.upsert(
    collection_name="zoomcamp-dense",
    points=[
        models.PointStruct(
            # Random point id: re-running this cell without recreating the
            # collection adds a second copy of every document.
            id=uuid.uuid4().hex,
            # The text to embed is the question followed by the answer.
            vector=models.Document(
                    text=doc['question'] + ' ' + doc['text'],
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
            # "id" is what the Qdrant evaluate() compares with the ground truth.
            payload={
                "text": doc["text"],
                "section": doc["section"],
                "question": doc["question"],
                "id": doc["id"],
            }
        )
        for doc in documents
    ]
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [30]:
# Embedding model used for queries; the same model name is passed when the
# documents are upserted.
model_handle = "jinaai/jina-embeddings-v2-small-en"
# NOTE(review): search() declares its own `limit` default, so this module-level
# value does not appear to be read in the visible cells — confirm before removing.
limit = 5

def search(query, limit=5):
    """Return the `limit` closest points to `query` in the "zoomcamp-dense" collection.

    The query text is wrapped in a models.Document tagged with `model_handle`,
    and payloads are requested so callers can read each point's metadata.
    """
    query_document = models.Document(text=query, model=model_handle)
    response = client.query_points(
        collection_name="zoomcamp-dense",
        query=query_document,
        limit=limit,
        with_payload=True,
    )
    return response.points

In [31]:
def evaluate(ground_truth, search_function):
    """Run search_function on every ground-truth record and score the rankings.

    Each record supplies the expected document id under 'document'. Results
    may be plain dicts (as compared via d['id'] earlier in the notebook) or
    objects exposing a `.payload` dict (as returned by search()). Accepting
    both means this redefinition does not break the earlier minsearch
    evaluation cells when they are re-run.

    Returns a dict holding the 'hit_rate' and 'mrr' over all records.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [
            (d['id'] if isinstance(d, dict) else d.payload['id']) == doc_id
            for d in results
        ]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [32]:
# Score Qdrant dense search; note that no course filter is applied here.
evaluate(ground_truth, lambda q: search(q['question']))

100%|██████████| 4627/4627 [00:51<00:00, 90.22it/s] 


{'hit_rate': 0.9118219148476334, 'mrr': 0.8246271882429232}

In [None]:
# Q: What's the MRR?

# A: 0.8246

In [88]:
# Release the Qdrant client; the remaining sections do not use it.
client.close()

### Q5. Cosine similarity

In [89]:
def cosine(u, v):
    """Return the cosine similarity of the 1-D vectors u and v.

    Both vectors are scaled to unit length and then dotted. A zero vector has
    no direction; normalize() leaves it unchanged, so the result is 0.0
    rather than NaN.
    """
    u = normalize(u)
    v = normalize(v)
    return u.dot(v)


def normalize(u):
    """Scale u to unit Euclidean length; a zero-length vector is returned as is."""
    norm = np.sqrt(u.dot(u))
    # Dividing by a zero norm would yield NaN and poison any later average.
    if norm == 0:
        return u
    return u / norm

In [4]:
# Load the RAG evaluation results; later cells read the answer_llm,
# answer_orig and question columns.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [93]:
# Same TF-IDF + SVD configuration as in the search sections.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Fit on the LLM answer, original answer and question concatenated per row,
# so the fitted vocabulary covers all three texts.
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [None]:
# Now use the transform method of the pipeline to create the embeddings and
# calculate the cosine similarity between each pair.

v_orig = pipeline.transform(df_results.answer_orig)
v_llm = pipeline.transform(df_results.answer_llm)

# One score per row: LLM answer embedding against the original answer embedding.
similarity = [cosine(v_llm[i], v_orig[i]) for i in range(len(v_orig))]

In [113]:
import numpy as np
# Average cosine similarity across all answer pairs.
np.mean(similarity)

np.float64(0.8415841233490402)

In [None]:
# Q: What's the average cosine?

# A: 0.8416

### Q6. Rouge

In [None]:
# %pip install rouge

In [5]:
from rouge import Rouge
rouge_scorer = Rouge()

# Try the scorer on a single row (position 10) before scoring the whole frame.
r = df_results.iloc[10]
# get_scores returns a list; its first element is the dict of
# rouge-1 / rouge-2 / rouge-l scores displayed below.
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [13]:
# Rouge-1 F1 between the LLM answer and the original answer, computed row by
# row and stored in a new 'rouge' column.
df_results['rouge'] = df_results.apply(lambda x: rouge_scorer.get_scores(x.answer_llm, x.answer_orig)[0]['rouge-1']['f'], axis=1)

In [14]:
# Average Rouge-1 F1 over all rows.
df_results['rouge'].mean()

np.float64(0.3516946452113943)

In [None]:
# Q: What's the average Rouge-1 F1?

# A: 0.35169