In [1]:
from sentence_transformers import SentenceTransformer
from lancedb.pydantic import LanceModel, Vector
from tqdm import tqdm
import pandas as pd
import requests
import lancedb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# URI handed to lancedb.connect(); a bare relative name like this presumably
# resolves to a local on-disk database directory — TODO confirm.
# NOTE(review): "zoomcap" may be a typo for "zoomcamp"; it is left unchanged
# because renaming it would point the notebook at a different database.
uri = "llm-zoomcap-lancedb"
# Database handle used by the later cell that creates the table.
db = lancedb.connect(uri)

In [3]:
# Name of the SentenceTransformer model used to embed both the documents and
# the search queries in this notebook.
model_name = "multi-qa-distilbert-cos-v1"
# Loaded once here and reused by the embedding loop, the table schema
# (for the vector dimension) and lance_search().
embedding_model = SentenceTransformer(model_name)

In [4]:
# Location of the FAQ documents (JSON) in the llm-zoomcamp repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# requests has no default timeout, so bound the call explicitly, and fail
# loudly on an HTTP error instead of letting .json() choke on an error page.
docs_response = requests.get(docs_url, timeout=60)
docs_response.raise_for_status()
documents = docs_response.json()

# Keep only the documents of the course evaluated in this notebook.
documents = [doc for doc in documents if doc['course']=="machine-learning-zoomcamp"]

# Rename the 'text' key to 'answer' so each record matches the field names
# used by the table schema.
for doc in documents:
    if 'text' in doc:
        doc['answer'] = doc.pop('text')

# Embed "<question> <answer>" for every document. A single batched encode()
# call lets the model batch the inputs instead of running one forward pass
# per document.
qa_texts = [f'{doc["question"]} {doc["answer"]}' for doc in documents]
vectors = embedding_model.encode(qa_texts)
for doc, vector in zip(documents, vectors):
    doc["vector"] = vector

In [5]:
class CourseQuestions(LanceModel):
    """Row schema passed to ``db.create_table`` for the embedded course FAQ documents."""
    # Plain-text fields carried over from each source document.
    section: str
    question: str
    course: str
    # Document identifier; evaluate() compares it with the ground-truth 'document' value.
    id: str
    # Document text; the loading cell renames the source 'text' key to 'answer'.
    answer: str
    # Embedding column, sized to the dimension reported by the loaded embedding model.
    vector: Vector(embedding_model.get_sentence_embedding_dimension())

In [6]:
# Create the "course_questions_managed" table using the CourseQuestions schema.
# NOTE(review): mode="overwrite" presumably replaces an existing table of the
# same name, which would make this cell repeatable — confirm in the LanceDB docs.
table = db.create_table("course_questions_managed", schema=CourseQuestions, mode="overwrite")
# Insert the embedded documents as rows.
table.add(documents)
# Drop the in-memory list once the rows have been added. Re-running this cell
# on its own will therefore fail until the loading cell is run again.
del documents

In [7]:
def lance_search(query, num_results=5, metric="cosine"):
    """Run a vector search over the course-questions table.

    Args:
        query: Text to search for; it is embedded with ``embedding_model``.
        num_results: Maximum number of rows to return.
        metric: Distance metric name forwarded to the query builder
            (the notebook calls this with "cosine", "l2" and "dot").

    Returns:
        A pandas DataFrame with the selected ``id``, ``question`` and
        ``answer`` columns (the recorded output also shows a ``_distance``
        column added by the search).
    """
    query_vector = embedding_model.encode(query)

    search = table.search(query_vector, query_type="vector")
    search = search.metric(metric)
    # NOTE(review): nprobes presumably only matters when the table has an ANN
    # index; no index is created in the visible cells — confirm.
    search = search.nprobes(10000)
    search = search.select(["id", "question", "answer"])
    search = search.limit(num_results)
    return search.to_pandas()

In [8]:
# Example query used to sanity-check the search before running the evaluation.
user_question = "I just discovered the course. Can I still join it?"

In [9]:
# Show the top matches (default num_results=5) for the example question.
lance_search(user_question, metric="cosine")

Unnamed: 0,id,question,answer,_distance
0,ee58a693,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some...",0.349343
1,0a278fb2,I just joined. What should I do next? How can ...,Welcome to the course! Go to the course page (...,0.528292
2,6ba259b1,"I filled the form, but haven't received a conf...","The process is automated now, so you should re...",0.541
3,9f261648,"Can I do the course in other languages, like R...","Technically, yes. Advisable? Not really. Reaso...",0.54301
4,e7ba6b8a,The course videos are from the previous iterat...,We won’t re-record the course videos. The focu...,0.549441


#### Evaluating LanceDB

In [10]:
# Location of the ground-truth CSV in the llm-zoomcamp repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# pandas reads the CSV directly from the URL.
df_ground_truth = pd.read_csv(ground_truth_url)
# Keep only the rows of the course whose documents were indexed above.
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
# One dict per row; evaluate() reads the 'question' and 'document' keys of each.
ground_truth = df_ground_truth.to_dict(orient='records')

In [11]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: One sequence of booleans per query, where each
            boolean marks whether the result at that rank is relevant.

    Returns:
        A float in [0, 1]. Returns 0.0 when ``relevance_total`` is empty
        instead of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [12]:
def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query, only the first relevant result counts and contributes
    1 / rank (ranks start at 1). A query with no relevant result contributes 0.

    Args:
        relevance_total: One sequence of booleans per query, where each
            boolean marks whether the result at that rank is relevant.

    Returns:
        A float in [0, 1]. Returns 0.0 when ``relevance_total`` is empty
        instead of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not truthiness) keeps the same notion of "relevant"
            # as the `True in line` membership test in hit_rate.
            if is_relevant == True:  # noqa: E712
                total_score += 1 / rank
                # Reciprocal rank uses the first hit only; without this break
                # a line with several hits could score above 1.
                break

    return total_score / len(relevance_total)

In [13]:
def evaluate(metric):
    """Score lance_search against the ground-truth questions.

    Every ground-truth entry is searched with the given distance metric, and
    each returned row is marked relevant when its ``id`` equals the entry's
    ``document`` value.

    Args:
        metric: Distance metric name forwarded to ``lance_search``.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    def relevance_for(entry):
        # Boolean per returned row, in rank order.
        expected_id = entry['document']
        found_rows = lance_search(entry['question'], metric=metric).to_dict(orient='records')
        return [row['id'] == expected_id for row in found_rows]

    relevance_total = [relevance_for(entry) for entry in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }


In [14]:
# Retrieval quality over the ground-truth questions with the cosine metric.
evaluate(metric="cosine")

100%|██████████| 1830/1830 [01:04<00:00, 28.46it/s]


{'hit_rate': 0.9398907103825137, 'mrr': 0.8516484517304189}

In [15]:
# Same evaluation with the "l2" metric.
# NOTE(review): the recorded scores match the cosine run exactly; that would be
# expected if the embeddings are unit-normalised — confirm.
evaluate(metric="l2")

100%|██████████| 1830/1830 [01:10<00:00, 25.78it/s]


{'hit_rate': 0.9398907103825137, 'mrr': 0.8516484517304189}

In [16]:
# Same evaluation with the "dot" metric.
# NOTE(review): the recorded scores match the cosine run exactly; that would be
# expected if the embeddings are unit-normalised — confirm.
evaluate(metric="dot")

100%|██████████| 1830/1830 [01:11<00:00, 25.65it/s]


{'hit_rate': 0.9398907103825137, 'mrr': 0.8516484517304189}