In [1]:
import requests
import pandas as pd
from tqdm.auto import tqdm
import minsearch
import json
import requests 

  from .autonotebook import tqdm as notebook_tqdm


# Dataset

In [2]:
# Base location of the evaluation assets in the course repository.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents to search over; their 'id' field is compared against the
# ground truth during evaluation.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() surfaces HTTP errors (404, 5xx) here instead of as a
# confusing JSON decode failure on the error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth queries as a list of dicts (one per CSV row); the evaluation
# code reads the 'question', 'course' and 'document' keys from each record.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

# Metric calculation functions

In [3]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: One sequence of booleans per query; a True entry
            marks a returned result that matches the expected document.

    Returns:
        The share of queries whose sequence contains True, as a float.
        Returns 0.0 when ``relevance_total`` is empty.
    """
    # With no queries there are no hits; avoid a ZeroDivisionError.
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the first relevant result counts: it contributes
    1 / rank (rank starting at 1). Queries without a relevant result
    contribute 0.

    Args:
        relevance_total: One sequence of booleans per query, ordered by
            result rank; True marks a result matching the expected document.

    Returns:
        The average reciprocal rank as a float in [0, 1]. Returns 0.0 when
        ``relevance_total`` is empty.
    """
    # With no queries there is nothing to average; avoid a ZeroDivisionError.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit: counting later matches as well would
                # let a single query score more than 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth queries.

    Each ground-truth record is passed as-is to ``search_function``; every
    returned result is then flagged by whether its 'id' equals the record's
    'document' value.

    Args:
        ground_truth: Iterable of records with at least a 'document' key.
        search_function: Callable taking one record and returning an
            iterable of results that each have an 'id' key.

    Returns:
        A dict with the 'hit_rate' and 'mrr' computed over all queries.
    """
    def relevance_for(query):
        # Read the expected id before searching, then flag each result.
        expected_id = query['document']
        return [hit['id'] == expected_id for hit in search_function(query)]

    relevance_total = [relevance_for(query) for query in tqdm(ground_truth)]

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

# Search functions

In [4]:
import os
from mistralai import Mistral

In [5]:
# Read the key from the environment so it never lands in the notebook;
# os.environ[...] raises KeyError right here if the variable is unset.
api_key = os.environ["MISTRAL_API_KEY"]
# Model name referenced by the (currently commented-out) chat example below.
model = "mistral-large-latest"

client = Mistral(api_key=api_key)

In [6]:
# chat_response = client.chat.complete(
#     model= model,
#     messages = [
#         {
#             "role": "user",
#             "content": "is it too late to join the course?",
#         },
#     ]
# )
# print(chat_response.choices[0].message.content)

## Minsearch

In [7]:
# Text-search index over the documents; 'course' is declared as a keyword
# field and is what the search function filters on.
# NOTE(review): 'id' is listed as a text field here — confirm this is
# intended rather than treating it as a keyword field.
index = minsearch.Index(
    text_fields=["question", "text", "section", "id"],
    keyword_fields=["course"]
)

In [8]:
index.fit(documents)

<minsearch.minsearch.Index at 0x75d9e3c78260>

In [9]:
def search(query):
    """Run the boosted minsearch text search for one ground-truth record.

    The record's 'question' is the query text and its 'course' restricts the
    results; matches in the 'question' field are weighted 1.5 and matches in
    'section' 0.1. At most 5 results are requested.
    """
    return index.search(
        query=query['question'],
        filter_dict={'course': query['course']},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

# Homework

## Q1

In [10]:
# What's the hit rate for this approach?

In [11]:
evaluate(ground_truth, search)

100%|█████████████████████████████████████████████████████████████████████| 4627/4627 [00:17<00:00, 270.14it/s]


{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

## Q2

In [12]:
# Evaluate embedding vector search method. What's MRR for it?

In [13]:
from minsearch import VectorSearch

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [15]:
# Build the embeddings from the "question" field of every document.
texts = [doc['question'] for doc in documents]

# TF-IDF (min_df=3) followed by a 128-component truncated SVD; the fixed
# random_state keeps the projection reproducible between runs.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [16]:
# let's index these embeddings with minsearch
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x75d9e28e9130>

In [17]:
# Embed every ground-truth question with the fitted pipeline and store the
# result on its record under 'vector', which is where vector_search reads it.
# Note: this mutates the dicts inside ground_truth in place.
X_q = pipeline.transform([q['question'] for q in ground_truth])
for question, vector in zip(ground_truth, X_q):
    question['vector'] = vector

In [18]:
def vector_search(query):
    """Look up the 5 nearest documents for one ground-truth record.

    Uses the embedding stored under the record's 'vector' key and restricts
    the results to the record's 'course'. Searches whatever ``vindex``
    currently refers to at call time.
    """
    query_vector = query['vector']
    course_filter = {'course': query['course']}

    return vindex.search(
        query_vector=query_vector,
        filter_dict=course_filter,
        num_results=5,
    )

In [19]:
evaluate(ground_truth, vector_search)

100%|████████████████████████████████████████████████████████████████████| 4627/4627 [00:04<00:00, 1141.14it/s]


{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}

## Q3

In [20]:
# We can use both question and answer. What's the hit rate?

In [21]:
# This time each document is represented by its question followed by its
# answer text, separated by a single space.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Same TF-IDF + truncated SVD settings as before, fitted on the longer texts.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [22]:
# let's index these embeddings with minsearch
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x75d9e304f6e0>

In [23]:
# Re-embed the ground-truth questions with the pipeline that was just re-fit
# on question + answer text, overwriting the 'vector' stored on each record
# so the query vectors come from the same pipeline as the indexed documents.
X_q = pipeline.transform([q['question'] for q in ground_truth])
for question, vector in zip(ground_truth, X_q):
    question['vector'] = vector

In [24]:
evaluate(ground_truth, vector_search)

100%|████████████████████████████████████████████████████████████████████| 4627/4627 [00:04<00:00, 1121.95it/s]


{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}

## Q4

In [25]:
# let's evaluate the following settings in Qdrant. What's the MRR?

In [26]:
from qdrant_client import QdrantClient, models

In [27]:
qd_client = QdrantClient("http://localhost:6333")

In [28]:
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [29]:
collection_name = "evaluating_ground_truth"

In [32]:
qd_client.delete_collection(collection_name=collection_name)

True

In [33]:
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [34]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [35]:
# One point per document: the position in `documents` is the point id, the
# vector is declared as a Document (question + answer text with the model
# handle), and the whole document dict is kept as the payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=doc['question'] + ' ' + doc['text'],
            model=model_handle,
        ),
        payload=doc,
    )
    for position, doc in enumerate(documents)
]

In [36]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files: 100%|██████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.83s/it]


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [37]:
def qdrant_vector_search(question):
    """Query the Qdrant collection for one ground-truth record.

    The record's 'question' text is sent as a Document for ``model_handle``,
    results are restricted to points whose 'course' payload equals the
    record's 'course', and at most 5 points are requested.

    Returns:
        A list with the payload of each returned point, in result order.
    """
    course_condition = models.FieldCondition(
        key="course",
        match=models.MatchValue(value=question['course']),
    )

    response = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question['question'], model=model_handle),
        query_filter=models.Filter(must=[course_condition]),
        limit=5,
        with_payload=True,
    )

    return [point.payload for point in response.points]

In [38]:
evaluate(ground_truth, qdrant_vector_search)

100%|██████████████████████████████████████████████████████████████████████| 4627/4627 [00:54<00:00, 84.41it/s]


{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}

## Q5

In [39]:
# Let's calculate cosine similarity. What's the average cosine?

In [40]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [41]:
import numpy as np

def cosine(u, v):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``u`` and ``v`` divided by the product of
    their Euclidean norms. Both arguments must provide a ``.dot`` method
    (e.g. NumPy arrays).
    """
    norm_product = np.sqrt(u.dot(u)) * np.sqrt(v.dot(v))
    return u.dot(v) / norm_product

In [42]:
# New TF-IDF + truncated SVD pipeline for comparing answers. Note that this
# rebinds `pipeline`, replacing the one fitted for the search sections.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Fit on one text per row: LLM answer, original answer and question joined
# with spaces, so all three fields contribute to the fitted vocabulary.
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [43]:
df_results['v_llm'] = pipeline.transform(df_results['answer_llm']).tolist()
df_results['v_orig'] = pipeline.transform(df_results['answer_orig']).tolist()

In [44]:
# Row-wise cosine similarity between the LLM-answer and original-answer
# embeddings (stored as plain lists, hence the conversion back to arrays).
df_results['cosine_similarity'] = [
    cosine(np.array(llm_vector), np.array(orig_vector))
    for llm_vector, orig_vector in zip(df_results['v_llm'], df_results['v_orig'])
]

In [45]:
df_results.head(2)

Unnamed: 0,answer_llm,answer_orig,document,question,course,v_llm,v_orig,cosine_similarity
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp,"[0.15549858795799829, 0.11219644369710809, -0....","[0.22746772878326757, 0.12079641681716827, -0....",0.463526
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp,"[0.14894279479454858, 0.176792136462119, -0.16...","[0.22746772878326757, 0.12079641681716827, -0....",0.781565


In [46]:
df_results['cosine_similarity'].mean()

np.float64(0.8415841233490402)

## Q6

In [47]:
# Let's compute the ROUGE score between the answers of our dataframe

In [48]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [49]:
full_score = rouge_scorer.get_scores(df_results['answer_llm'], df_results['answer_orig'])

In [50]:
# What's the average Rouge-1 F1?
# Average each ROUGE-1 component over all answer pairs. The keys match the
# sample output above ('r', 'p', 'f'); 'f' is the F1 value asked about here.
metrics = ['r', 'p', 'f']
for metric in metrics:
    print(f"{metric}_avg: {np.mean([record['rouge-1'][metric] for record in full_score])}")
# [record['rouge-1'][metric] for record in full_score for metric in record['rouge-1']]

r_avg: 0.34043594697723023
p_avg: 0.4299569796022711
f_avg: 0.3516946452113943
