In [33]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [34]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [35]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200')
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    },
}

index_name = "data-engineering-zoomcamp"
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'data-engineering-zoomcamp'})

In [36]:
from tqdm.auto import tqdm
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:01<00:00, 639.58it/s]


In [37]:
def elastic_search(query, course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^1.5", "section^0.1", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [38]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [39]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 4627/4627 [00:04<00:00, 937.91it/s] 


In [40]:
relevance_total

[[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [],
 [],
 [],
 [],
 [],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, F

In [42]:
hit_rate(relevance_total)
mrr(relevance_total)

0.8013291549600181

In [44]:
import minsearch
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)
index.fit(documents)

<minsearch.minsearch.Index at 0x15817c230>

In [46]:
def minsearch_search(query, course):
    boost = boost = {'question': 1.5, 'section': 0.1}
    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )
    return results

In [47]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 4627/4627 [00:05<00:00, 882.75it/s]


In [None]:
hit_rate(relevance_total)

0.7288235717887772

# Q2. Vector search for question

In [None]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)


<minsearch.vector.VectorSearch at 0x1592d9940>

In [57]:
def vector_search(query, course):
    # Transform the query using the same pipeline
    query_vector = pipeline.transform([query])
    
    results = vindex.search(
        query_vector=query_vector[0],  # Get the first (and only) transformed query
        filter_dict={'course': course},
        num_results=5
    )
    return results
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = vector_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)
mrr(relevance_total)

100%|██████████| 4627/4627 [00:02<00:00, 1781.46it/s]


0.3573085512571141

# Q3. Vector search for question and answer

In [58]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)


<minsearch.vector.VectorSearch at 0x159352450>

In [62]:
def vector_search(query, course):
    # Transform the query using the same pipeline
    query_vector = pipeline.transform([query])
    
    results = vindex.search(
        query_vector=query_vector[0],  # Get the first (and only) transformed query
        filter_dict={'course': course},
        num_results=5
    )
    return results
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = vector_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)
hit_rate(relevance_total)

100%|██████████| 4627/4627 [00:04<00:00, 1085.67it/s]


0.8210503566025502

# Q4. Qdrant

In [72]:
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models
import numpy as np

# Initialize the embedding model
embedding_model = TextEmbedding('jinaai/jina-embeddings-v2-small-en')

# Initialize Qdrant client
client = QdrantClient("localhost", port=6333)

# Create collection (if not exists)
collection_name = "data-engineering-zoomcamp"
client.delete_collection(collection_name=collection_name)
# Create collection
try:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=512,
            distance=models.Distance.COSINE
        )
    )
except Exception as e:
    print(f"Collection might already exist: {e}")

# Prepare and upload points
points = []
for id, doc in enumerate(documents):  # Using your existing documents
    text = doc['question'] + ' ' + doc['text']
    # vector = list(embedding_model.embed([text]))[0]
    
    point = models.PointStruct(
        id=id,
        vector=models.Document(text=text, model='jinaai/jina-embeddings-v2-small-en'),
        payload={
            "id": doc.get('id', str(id)),
            "text": doc['text'],
            "question": doc['question'],
            "section": doc['section'],
            "course": doc['course']
        }
    )
    points.append(point)

# Upload points to Qdrant
client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
def qdrant_search(query, course):
    # Create embedding for the query. In hw2 we're creating the query embedding in the function call
    query_embedding = list(embedding_model.embed([query]))[0]
    
    # Search in Qdrant
    search_results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=5
    )
    
    # Convert to the same format as your vector search results
    results = []
    for hit in search_results:
        results.append({
            'id': hit.payload.get('id'),  # Make sure you include 'id' in your payload when uploading
            'text': hit.payload['text'],
            'question': hit.payload['question'],
            'section': hit.payload['section'],
            'course': hit.payload['course'],
            'score': hit.score
        })
    
    return results
# Calculate MRR for Qdrant
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = qdrant_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

# Calculate MRR
mrr_score = mrr(relevance_total)
print(f"MRR: {mrr_score}")

  search_results = client.search(
100%|██████████| 4627/4627 [00:36<00:00, 127.68it/s]

MRR: 0.8517722066133576





# Q5. Cosine Similarity

In [74]:
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [75]:
v_llm = pipeline.transform(df_results.answer_llm)
v_orig = pipeline.transform(df_results.answer_orig)
cosines = []
for i in range(len(df_results)):
    cos_val = cosine(v_llm[i], v_orig[i])
    cosines.append(cos_val)
average_cosine = np.mean(cosines)
print("Average cosine similarity:", average_cosine)

Average cosine similarity: 0.8415841233490402


# Q6. Rouge

In [77]:
from rouge import Rouge
rouge_scorer = Rouge()
vals = []
for i in range(len(df_results)):
    r = df_results.iloc[i]
    scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
    vals.append(scores['rouge-1']['f'])
np.mean(vals)

np.float64(0.3516946452113943)