In [None]:
import requests
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import minsearch
from minsearch import VectorSearch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

from qdrant_client import QdrantClient, models
from rouge import Rouge



In [None]:
# Base URL of the course's evaluation materials on GitHub.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents to index: a JSON list of records. Later cells read their
# 'question', 'text', 'section', 'course' and 'id' keys.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth queries: one record per row. Later cells read the
# 'question', 'course' and 'document' (expected document id) fields.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [None]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: one sequence per query, holding a boolean per
            returned result (True when that result is the expected document).

    Returns:
        Share of queries whose sequence contains a True, as a float;
        0.0 when ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # No queries were evaluated: report 0.0 rather than dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query only the first relevant result counts: it contributes
    1 / rank (rank starting at 1). Queries without a relevant result
    contribute 0.

    Args:
        relevance_total: one sequence per query, holding a boolean per
            returned result in ranked order.

    Returns:
        The average reciprocal rank as a float; 0.0 when
        ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # No queries were evaluated: report 0.0 rather than dividing by zero.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit; otherwise a query with several
                # matching results would add more than one reciprocal rank.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth query and score the results.

    Args:
        ground_truth: iterable of query records; each record's 'document'
            entry is the id of the document that should be retrieved.
        search_function: callable that takes one query record and returns
            an iterable of result dicts, each with an 'id' key.

    Returns:
        Dict with the 'hit_rate' and 'mrr' computed over all queries.
    """
    per_query_relevance = []

    for query in tqdm(ground_truth):
        expected_id = query['document']
        hits = search_function(query)
        per_query_relevance.append([hit['id'] == expected_id for hit in hits])

    scores = {}
    scores['hit_rate'] = hit_rate(per_query_relevance)
    scores['mrr'] = mrr(per_query_relevance)
    return scores

## Evaluating MinSearch

In [None]:
# Text index over the documents: free-text fields are searched,
# keyword fields are available for exact-match filtering.
index = minsearch.Index(
    keyword_fields=['course', 'id'],
    text_fields=['question', 'section', 'text'],
)
index.fit(documents)

<minsearch.minsearch.Index at 0x7d645d569150>

In [None]:
def minsearch_search(qustion, course, minsearch_index):
    """Query ``minsearch_index`` for the top 5 results within one course.

    Args:
        qustion: the query text.
        course: course name used as an exact-match filter.
        minsearch_index: object exposing a ``search`` method that accepts
            ``query``, ``filter_dict``, ``boost_dict`` and ``num_results``.

    Returns:
        Whatever ``minsearch_index.search`` returns.
    """
    return minsearch_index.search(
        query=qustion,
        filter_dict={'course': course},
        # Matches in 'question' weigh more, matches in 'section' far less.
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )


In [None]:
# Score every ground-truth query against the text index. Each entry of
# relevance_total flags, per returned result, whether it is the expected document.
relevance_total = []

for query in tqdm(ground_truth):
    expected_id = query['document']
    hits = minsearch_search(query['question'], query['course'], index)
    relevance_total.append([hit['id'] == expected_id for hit in hits])

  0%|          | 0/4627 [00:00<?, ?it/s]

In [None]:
# Display (hit rate, MRR) for the relevance flags collected in relevance_total.
hit_rate(relevance_total) , mrr(relevance_total)

(0.848714069591528, 0.7288235717887772)

So the answer for Q1 is
Hit rate: **0.84** (0.8487, truncated to two decimals)

## Embedding

In [None]:
# Embed only the question text of each document.
texts = [document['question'] for document in documents]

# TF-IDF (terms must appear in at least 3 texts) reduced to 128 dimensions
# with a truncated SVD; random_state is fixed for reproducible vectors.
tfidf = TfidfVectorizer(min_df=3)
svd = TruncatedSVD(n_components=128, random_state=1)
pipeline = make_pipeline(tfidf, svd)

X = pipeline.fit_transform(texts)

## Vector search for question

In [None]:
# Vector index over the embeddings in X; 'course' is kept as a keyword field
# so searches can be filtered by course.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7d645c87bfd0>

In [None]:
def minsearch_search(qustion, course, minsearch_index):
    """Query ``minsearch_index`` for the top 5 results within one course.

    NOTE: this repeats the ``minsearch_search`` definition from the
    "Evaluating MinSearch" section; re-running this cell simply rebinds
    the same behavior.

    Args:
        qustion: the query text.
        course: course name used as an exact-match filter.
        minsearch_index: object exposing a ``search`` method that accepts
            ``query``, ``filter_dict``, ``boost_dict`` and ``num_results``.

    Returns:
        Whatever ``minsearch_index.search`` returns.
    """
    search_kwargs = {
        'query': qustion,
        'filter_dict': {'course': course},
        'boost_dict': {'question': 1.5, 'section': 0.1},
        'num_results': 5,
    }
    return minsearch_index.search(**search_kwargs)

In [None]:
def vector_search(qustion, course, vector_index):
    """Query ``vector_index`` for the top 5 results within one course.

    Args:
        qustion: the query embedding; despite the name, it is forwarded as
            ``query_vector`` rather than as text.
        course: course name used as an exact-match filter.
        vector_index: object exposing a ``search`` method that accepts
            ``query_vector``, ``filter_dict`` and ``num_results``.

    Returns:
        Whatever ``vector_index.search`` returns.
    """
    return vector_index.search(
        query_vector=qustion,
        filter_dict={'course': course},
        num_results=5,
    )

In [None]:
# Score every ground-truth query against the vector index. Each entry of
# relevance_total flags, per returned result, whether it is the expected document.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    # The pipeline transforms a batch, so wrap the question in a list
    # and take the single resulting row.
    embdded_question = pipeline.transform([q['question']])[0]
    results = vector_search(embdded_question, q['course'], vindex)
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [None]:
# Display (hit rate, MRR) for the relevance flags collected in relevance_total.
hit_rate(relevance_total) , mrr(relevance_total)

(0.48173762697212014, 0.3571284489590088)

So, the answer for this question is
MRR: **0.35** (0.3571, truncated to two decimals)

## Vector search for question and answer

In [None]:
# Embed the question together with the answer text of each document.
# A plain loop is kept so that `doc` stays bound afterwards (the next cell inspects it).
texts = []
for doc in documents:
    texts.append(' '.join((doc['question'], doc['text'])))

# Same TF-IDF + truncated-SVD setup as for the question-only embeddings,
# refitted on the longer texts.
tfidf = TfidfVectorizer(min_df=3)
svd = TruncatedSVD(n_components=128, random_state=1)
pipeline = make_pipeline(tfidf, svd)

X = pipeline.fit_transform(texts)

In [None]:
# Inspect one record: `doc` is the loop variable left bound by the most
# recently run `for doc in documents` loop, i.e. the last document.
doc

{'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
 'section': 'Module 6: Best practices',
 'question': 'How to destroy infrastructure created via GitHub Actions',
 'course': 'mlops-zoomcamp',
 'id': '886d1617'}

In [None]:
# Rebuild the vector index on the current X (question + text embeddings);
# 'course' is kept as a keyword field so searches can be filtered by course.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7d6459e2f110>

In [None]:
# Score every ground-truth query against the rebuilt vector index. Each entry of
# relevance_total flags, per returned result, whether it is the expected document.
relevance_total = []

for query in tqdm(ground_truth):
    expected_id = query['document']
    # The pipeline transforms a batch: pass a one-element list, keep row 0.
    question_vector = pipeline.transform([query['question']])[0]
    hits = vector_search(question_vector, query['course'], vindex)
    relevance_total.append([hit['id'] == expected_id for hit in hits])

  0%|          | 0/4627 [00:00<?, ?it/s]

In [None]:
# Display (hit rate, MRR) for the relevance flags collected in relevance_total.
hit_rate(relevance_total) , mrr(relevance_total)

(0.8210503566025502, 0.6717707657949719)

So the answer for this question is hit rate: **0.82**

## Qdrant

In [None]:
# Run Qdrant in local (in-memory) mode, which is suitable for experiments.
client = QdrantClient(":memory:")

# Name of the embedding model handed to Qdrant wherever this notebook
# builds a `models.Document` (indexing and querying).
model_name = "jinaai/jina-embeddings-v2-small-en"

In [None]:
# Create the collection; its vector size is taken from the embedding model
# and similarity is measured with cosine distance.
client.create_collection(
    collection_name="qdrant_search",
    vectors_config=models.VectorParams(
        distance=models.Distance.COSINE,
        size=client.get_embedding_size(model_name),
    ),
)

True

In [None]:
# One point per document: the whole record is stored as payload and the
# question + answer text is handed to Qdrant for embedding with model_name.
# enumerate supplies the sequential point ids (0, 1, 2, ...) without a
# manual counter named `id`, which would shadow the builtin at notebook scope.
points = [
    models.PointStruct(
        id=point_id,
        payload=doc,
        vector=models.Document(text=doc['question'] + ' ' + doc['text'], model=model_name),
    )
    for point_id, doc in enumerate(documents)
]

In [None]:
# Insert all points into the "qdrant_search" collection.
# NOTE(review): the download progress in the recorded output suggests the
# embedding model is fetched when this first runs. Confirm before relying on it offline.
client.upsert(collection_name="qdrant_search", points=points)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
def qdrant_search(query):
    """Return the payloads of the 5 best matches for ``query`` in Qdrant.

    The query text is wrapped in a ``models.Document`` with the notebook's
    ``model_name`` and sent to the 'qdrant_search' collection through the
    module-level ``client``.

    Args:
        query: the query text.

    Returns:
        List with the payload of each returned point, in result order.
    """
    query_document = models.Document(text=query, model=model_name)
    response = client.query_points(
        collection_name='qdrant_search',
        query=query_document,
        limit=5,
        with_payload=True,
    )

    payloads = []
    for point in response.points:
        payloads.append(point.payload)
    return payloads


In [None]:
# Score every ground-truth query against Qdrant. Each entry of relevance_total
# flags, per returned payload, whether it is the expected document.
# Qdrant embeds the raw question text itself, so no TF-IDF/SVD transform is
# needed here.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = qdrant_search(q['question'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [None]:
# Display (hit rate, MRR) for the relevance flags collected in relevance_total.
hit_rate(relevance_total) , mrr(relevance_total)

(0.9120380376053598, 0.8246524025646579)

The MRR for this question is **0.82** (0.8247); the closest answer option is **0.85**.

## Cosine similarity

In [None]:
def cosine(u, v):
    """Return the cosine similarity between two 1-D numpy vectors.

    Args:
        u: first vector (must support ``.dot``).
        v: second vector, same length as ``u``.

    Returns:
        ``u·v / (|u| * |v|)``; 0.0 when either vector has zero norm, since
        the similarity is undefined there and the plain formula would
        yield NaN.
    """
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    denominator = u_norm * v_norm

    if denominator == 0:
        # An all-zero vector would give 0/0 = NaN, which then poisons any
        # mean taken over the similarities.
        return 0.0

    return u.dot(v) / denominator

In [None]:
# CSV of RAG results; later cells read its 'answer_llm', 'answer_orig'
# and 'question' columns.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [None]:
# Fit a fresh TF-IDF + truncated-SVD pipeline on the LLM answer, the original
# answer and the question of every row, joined into one text per row.
tfidf = TfidfVectorizer(min_df=3)
svd = TruncatedSVD(n_components=128, random_state=1)
pipeline = make_pipeline(tfidf, svd)
pipeline.fit(df_results['answer_llm'] + ' ' + df_results['answer_orig'] + ' ' + df_results['question'])

In [None]:
# Embed both answers of every row with the fitted pipeline.
answer_llm_vectors = pipeline.transform(df_results['answer_llm'])
answer_orig_vectors = pipeline.transform(df_results['answer_orig'])

# Row-wise cosine similarity between the LLM answer and the original answer.
cosine_similarities = [
    cosine(llm_vector, orig_vector)
    for llm_vector, orig_vector in zip(answer_llm_vectors, answer_orig_vectors)
]

print(np.mean(cosine_similarities))

0.8415841233490402


Now we can say the answer for this question is **Cosine Similarity = 0.84**

## Rouge

In [None]:
# Single ROUGE scorer instance, reused for every answer pair below.
rouge_scorer = Rouge()

In [None]:
# ROUGE-1 F1 between the LLM answer and the original answer of every row.
rouge_scores = []
# itertuples() is a generator with no length, so pass `total` explicitly;
# without it tqdm can only show a running count instead of a progress bar.
for r in tqdm(df_results.itertuples(), total=len(df_results)):
    rouge_1_f1 = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]['rouge-1']['f']
    rouge_scores.append(rouge_1_f1)
print(np.mean(rouge_scores))

0it [00:00, ?it/s]

0.3516946452113943


Now the answer for this question is **AVG F1 Score = 0.35**