In [1]:
import marimo as mo

In [2]:
import requests
import pandas as pd
import minsearch

from tqdm.auto import tqdm
from qdrant_client import QdrantClient, models
#from fastembed import TextEmbedding

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the FAQ documents and the ground-truth questions used for search evaluation.

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'

# Fail fast on a hung connection or an HTTP error instead of trying to parse
# an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)

# One dict per CSV row, so each ground-truth entry can be indexed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

**Here, documents contains the documents from the FAQ database with unique IDs, and ground_truth contains generated question-answer pairs.**

In [4]:
documents[0:3], ground_truth[0:3]  # show first 3 items of each list

([{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
   'section': 'General course-related questions',
   'question': 'Course - When will the course start?',
   'course': 'data-engineering-zoomcamp',
   'id': 'c02e79ef'},
  {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
   'section': 'General course-related questions',
   'question': 'Course - What are the prerequisites for this course?',
   'course': 'data-engineering-zoomcamp',
   'id': '1f6520ca'},
  {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks

In [5]:
len(documents), len(ground_truth)  # show the number of items in each list

(948, 4627)

In [6]:
# define metrics and evaluation function

def hit_rate(relevance_total):
    """Return the fraction of queries whose relevance list contains a hit.

    relevance_total is a sequence of per-query relevance lists; a query counts
    as a hit when `True` appears anywhere in its list. Raises
    ZeroDivisionError when relevance_total is empty.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    relevance_total is a sequence of per-query relevance lists ordered by
    rank. Each query contributes 1 / rank of its FIRST relevant result
    (ranks start at 1), or 0 when it has none, so the score stays in [0, 1]
    even if a result list contains several relevant entries.

    Raises ZeroDivisionError when relevance_total is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the highest-ranked relevant result counts for MRR.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth queries.

    For every ground-truth record, search_function is called with the record
    and each returned result's 'id' is compared with the record's 'document'
    value. Returns a dict with the 'hit_rate' and 'mrr' of those comparisons.
    """
    relevance_total = []

    for query in tqdm(ground_truth):
        expected_id = query['document']
        hits = search_function(query)
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

### Text search (minsearch) ###

In [7]:
# create an index over the documents with minsearch

# "question", "text" and "section" are registered as text fields; "course" and
# "id" as keyword fields. minsearch_search later passes "course" in filter_dict
# to restrict results to a single course.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7f763e7f5f50>

In [8]:
# define a search function that uses the index
def minsearch_search(query, course):
    """Run a boosted text search on the module-level `index` for one course.

    query is the search text and course is the value used to filter on the
    'course' field. Returns what `index.search` yields for the top 5 results.
    """
    # An earlier experiment used boosts of {'question': 3.0, 'section': 0.5}.
    return index.search(
        query=query,
        filter_dict=dict(course=course),
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [9]:
 # evaluate the search function with minsearch index
# NOTE(review): this module-level `course` is not used by the evaluation below
# (each query is filtered by its own q['course']) and no other visible cell
# reads it — confirm it can be dropped.
course = 'data-engineering-course'
# Each ground-truth record supplies both the query text and the course filter.
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

100%|██████████| 4627/4627 [00:18<00:00, 254.20it/s]


{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

### Vector search (minsearch) ###

In [10]:
from minsearch import VectorSearch

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [12]:
# Embed each document's question: TF-IDF features reduced to 128 components with SVD.
texts = [entry['question'] for entry in documents]

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [13]:
X.shape  # shape of the embeddings matrix

(948, 128)

In [14]:
X[0]

array([ 0.20189188, -0.19028114, -0.10261914,  0.16435334, -0.14004852,
       -0.19928493,  0.0326298 ,  0.03152187,  0.11015991, -0.25056714,
       -0.2297715 , -0.08275686,  0.0107985 ,  0.01912367, -0.03904959,
        0.04858485, -0.03590806,  0.00200877, -0.20405168, -0.01187959,
        0.07217801,  0.21314061,  0.0352884 ,  0.09334844,  0.00800627,
        0.02730576, -0.05747045, -0.08794382,  0.04599191,  0.09568683,
        0.10378307, -0.12981451, -0.03935688,  0.03076194,  0.02946738,
       -0.02071025,  0.09501766,  0.05341492, -0.02582382,  0.08743149,
       -0.03647388, -0.168532  , -0.08957893,  0.03547496,  0.11095151,
        0.13033041, -0.07362053,  0.13634367,  0.09826041, -0.05042163,
        0.10989516,  0.00500543, -0.06848177,  0.0502295 ,  0.06014098,
        0.11383368,  0.05322441, -0.02371468, -0.0975518 ,  0.02646635,
        0.00609063, -0.00198324,  0.11985142,  0.08729537,  0.08522016,
        0.01295767,  0.03040857,  0.04797036, -0.00390723, -0.06

In [15]:
# create a vector search index with minsearch
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7f763c5816d0>

In [16]:
def vector_search(q):
    """Embed a ground-truth record's question and search `vindex` with it.

    The question is transformed with the module-level `pipeline`, and the
    search is filtered to the record's course. Returns what `vindex.search`
    yields for the top 5 results.
    """
    embedded_question = pipeline.transform([q['question']])[0]
    course_filter = {'course': q['course']}
    return vindex.search(
        query_vector=embedded_question,
        filter_dict=course_filter,
        num_results=5,
    )

# Evaluate the vector search
evaluate(ground_truth, vector_search)

100%|██████████| 4627/4627 [00:06<00:00, 677.68it/s]


{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}

In [17]:
# Combine each document's question with its answer text, separated by a space.
texts2 = [entry['question'] + ' ' + entry['text'] for entry in documents]

In [18]:
# create embeddings for the combined question and answer

# fit the pipeline on the combined texts
# NOTE(review): this re-fits the same `pipeline` object that was fitted on the
# question-only texts. vector_search also calls pipeline.transform, so
# re-running it after this cell presumably embeds queries with a fit that no
# longer matches X / vindex — confirm, or fit a separate pipeline here.
X2 = pipeline.fit_transform(texts2)

In [19]:
vindex2 = VectorSearch(keyword_fields={'course'})
vindex2.fit(X2, documents)

<minsearch.vector.VectorSearch at 0x7f763c545a50>

In [20]:
def vector_search2(q):
    """Embed a ground-truth record's question and search `vindex2` with it.

    The question is transformed with the module-level `pipeline`, and the
    search is filtered to the record's course. Returns what `vindex2.search`
    yields for the top 5 results.
    """
    embedded_question = pipeline.transform([q['question']])[0]
    course_filter = {'course': q['course']}
    return vindex2.search(
        query_vector=embedded_question,
        filter_dict=course_filter,
        num_results=5,
    )

# Evaluate the vector search
evaluate(ground_truth, vector_search2)

100%|██████████| 4627/4627 [00:07<00:00, 612.86it/s]


{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}

### Vector search (qdrant) ###

In [45]:
qd_client = QdrantClient("http://localhost:6333")

In [46]:
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [47]:
collection_name = "hw3-2025"

In [48]:
# delete the collection if it already exists so that it can be recreated
qd_client.delete_collection(collection_name=collection_name)

True

In [49]:
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [50]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [51]:
# One point per document: the id is the document's position in `documents`,
# the vector is a Document built from question + answer text and model_handle,
# and the payload is the full document dict.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=entry['question'] + ' ' + entry['text'],
            model=model_handle,
        ),
        payload=entry,
    )
    for position, entry in enumerate(documents)
]

In [52]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [53]:
def vector_search3(q):
    """Query the Qdrant collection for a ground-truth record.

    q is a dict from ground_truth: its 'question' string is the query text
    and its 'course' value restricts the results through a payload filter.
    Returns the payloads of the points returned by the query (limit 5).
    """
    # Read both fields from the record so the course is never hard-coded.
    question_text, course_name = q['question'], q['course']

    course_condition = models.FieldCondition(
        key="course",
        match=models.MatchValue(value=course_name),
    )

    response = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question_text, model=model_handle),
        query_filter=models.Filter(must=[course_condition]),
        limit=5,
        with_payload=True,
    )

    return [hit.payload for hit in response.points]

In [54]:
# Evaluate the vector search
evaluate(ground_truth, vector_search3)

  0%|          | 0/4627 [00:00<?, ?it/s]

100%|██████████| 4627/4627 [01:09<00:00, 66.55it/s]


{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}

### RAG offline evaluation: cosine similarity ###

In [31]:
import numpy as np
def cosine(u, v):
    """Return the cosine similarity of two 1-D numpy vectors.

    Computed as u·v / (|u| * |v|). When either vector has zero norm the
    similarity is undefined (0/0); 0.0 is returned in that case instead of
    NaN so one empty vector does not propagate NaN into later aggregates.
    """
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    denominator = u_norm * v_norm

    if denominator == 0:
        return 0.0

    return u.dot(v) / denominator

In [32]:
# load data from gpt-4o-mini evaluation (llm as a judge)

results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [33]:
df_results.head(3)  # show first 3 rows of the dataframe

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp


In [34]:
pipeline

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [35]:
# Fit a dedicated pipeline (same hyperparameters) for the answer-similarity
# vectors instead of re-fitting the shared `pipeline`: vector_search and
# vector_search2 still call pipeline.transform, so its fit must stay in sync
# with the indexes built from it.
X1 = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
).fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [36]:
results_gpt4o_mini = df_results.to_dict(orient='records')

In [37]:
def compute_similarity(record):
    """Return the cosine similarity between a record's LLM and original answers.

    Both record['answer_llm'] and record['answer_orig'] are embedded with the
    module-level fitted `X1`, and the two vectors are compared with `cosine`.
    """
    reference_text, generated_text = record['answer_orig'], record['answer_llm']

    generated_vec = X1.transform([generated_text])[0]
    reference_vec = X1.transform([reference_text])[0]

    return cosine(generated_vec, reference_vec)

In [38]:
# Cosine similarity for every evaluation record, in order; tqdm shows progress.
similarity_4o_mini = [
    compute_similarity(row) for row in tqdm(results_gpt4o_mini)
]

100%|██████████| 1830/1830 [00:03<00:00, 560.41it/s]


In [39]:
similarity_4o_mini[:3]  # show first 3 similarity scores

[np.float64(0.46352620160029906),
 np.float64(0.7815651064829413),
 np.float64(0.8891577173455298)]

In [40]:
# Attach the per-record similarity scores as a new column of df_results.
df_results['cosine_similarity'] = similarity_4o_mini
df_results.head(3)  # show first 3 rows of the dataframe with cosine similarity

Unnamed: 0,answer_llm,answer_orig,document,question,course,cosine_similarity
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp,0.463526
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp,0.781565
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp,0.889158


In [41]:
df_results['cosine_similarity'].mean()  # mean cosine similarity score

np.float64(0.8415841233490402)

### Alternative text similarity: Rouge ###

In [42]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [43]:
scores['rouge-1']['f']  # F1 score for Rouge-1

0.45454544954545456

In [44]:
# ROUGE-1 F1 for every (LLM answer, original answer) pair, in row order.
tot_scores = [
    rouge_scorer.get_scores(llm_answer, orig_answer)[0]['rouge-1']['f']
    for llm_answer, orig_answer in zip(df_results.answer_llm, df_results.answer_orig)
]

np.mean(tot_scores)  # mean Rouge-1 F1 score


np.float64(0.3516946452113943)