In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the SentenceTransformer identified by this model name; later cells
# call `embedding_model.encode(...)` to turn text into vectors.
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")

In [None]:
# Sample user query, encoded into a vector with the embedding model.
user_question = "I just discovered the course. Can I still join it?"
embedding_vector = embedding_model.encode(user_question)
# Display the first component of the query embedding.
embedding_vector[0]

In [None]:
import requests 

# Location of the documents file in the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Surface HTTP failures (404, 5xx, ...) as a clear error here instead of an
# opaque JSON decoding error on the next line.
docs_response.raise_for_status()
documents = docs_response.json()

In [None]:
# Inspect a single record to see what a document looks like.
documents[1]

In [None]:
# Number of documents that were downloaded.
len(documents)

In [None]:
# Course whose documents are kept for the rest of the notebook.
course_name = 'machine-learning-zoomcamp'

# Retain only the documents whose 'course' field matches the selected course.
filtered_documents = [
    candidate for candidate in documents if candidate['course'] == course_name
]

In [None]:
# Number of documents left after filtering by course.
len(filtered_documents)

In [None]:
# Inspect a single record from the filtered set.
filtered_documents[1]

In [None]:
import numpy as np

# Embed "question + answer text" for every course document.
# `embeddings` feeds the in-memory matrix X; `operations` holds the payloads
# that are later sent to Elasticsearch.
embeddings = []
operations = []
for doc in filtered_documents:
    qa_text = f"{doc['question']} {doc['text']}"
    embedding = embedding_model.encode(qa_text)
    embeddings.append(embedding)
    # Attach the vector to a shallow copy rather than to `doc` itself: the
    # source dicts are shared with `documents`/`filtered_documents`, and
    # mutating them would make every later search result carry (and display)
    # the full vector.
    operations.append({**doc, "qa_vector": embedding.tolist()})

# One row per document, one column per embedding dimension.
X = np.array(embeddings)
X.shape

In [None]:
# Dot product of the query embedding with itself.
dot_product = embedding_vector.dot(embedding_vector)
dot_product

In [None]:
# Score every document row against the query embedding and show the best score.
scores = np.dot(X, embedding_vector)
np.max(scores)

In [None]:
# Rebind the query embedding as a NumPy array with dtype `float`
# (Python float, i.e. NumPy's 64-bit floating point type).
embedding_vector = np.array(embedding_vector, dtype=float)

In [None]:
class VectorSearchEngine():
    """Brute-force vector search over an in-memory embedding matrix.

    Documents are ranked by the dot product between their embedding row and
    the query vector, highest score first.
    """

    def __init__(self, documents, embeddings):
        """Store the documents together with their embeddings.

        Args:
            documents: sequence of documents; `documents[i]` corresponds to
                row `i` of `embeddings`.
            embeddings: array-like with one row per document. Lists are
                accepted and converted to a NumPy array.

        Raises:
            ValueError: if the number of documents and embedding rows differ.
        """
        # Accept plain lists as well as arrays; an ndarray is kept as-is.
        embeddings = np.asarray(embeddings)
        if len(documents) != len(embeddings):
            raise ValueError(
                f"documents and embeddings must align: got {len(documents)} "
                f"documents and {len(embeddings)} embedding rows"
            )
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return up to `num_results` documents with the highest scores.

        Args:
            v_query: query vector with the same dimensionality as the
                embedding rows.
            num_results: maximum number of documents to return; values
                less than or equal to zero yield an empty list.

        Returns:
            List of documents ordered by descending dot-product score.
        """
        if num_results <= 0:
            # A negative slice bound would otherwise drop results from the end.
            return []
        scores = self.embeddings.dot(v_query)
        # Negate so that argsort's ascending order puts the best score first.
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]
# Build the in-memory engine over the course documents and their embedding matrix,
# then retrieve the five best matches for the sample query.
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)
search_engine.search(embedding_vector, num_results=5)

In [None]:
import pandas as pd

# Ground-truth evaluation data, fetched from the course repository.
base_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main"
relative_url = "03-vector-search/eval/ground-truth-data.csv"
ground_truth_url = f"{base_url}/{relative_url}?raw=1"

# Read the CSV and keep only the rows belonging to the evaluated course.
df_ground_truth = pd.read_csv(ground_truth_url).loc[
    lambda frame: frame["course"] == "machine-learning-zoomcamp"
]
# One dict per remaining row.
ground_truth = df_ground_truth.to_dict(orient="records")

In [None]:
def calculate_hitrate(search_engine, ground_truth, num_results):
    """Return the fraction of ground-truth queries whose expected document is retrieved.

    Each question is encoded with the notebook-level `embedding_model`; a
    query counts as a hit when any returned result has an 'id' equal to the
    record's 'document' value.

    Args:
        search_engine: object exposing `search(vector, num_results=...)` that
            returns an iterable of dicts with an 'id' key.
        ground_truth: sequence of dicts with 'question' and 'document' keys.
        num_results: number of results requested for each query.

    Returns:
        The hit rate as a float; 0.0 when `ground_truth` is empty.
    """
    total_queries = len(ground_truth)
    if total_queries == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    correct_matches = 0
    for query in ground_truth:
        embedding_vector = embedding_model.encode(query['question'])
        results = search_engine.search(embedding_vector, num_results=num_results)
        # `any` stops at the first matching result, like the original `break`.
        if any(result['id'] == query['document'] for result in results):
            correct_matches += 1

    return correct_matches / total_queries

# Hit rate of the in-memory search engine when five results are requested per query.
hitrate = calculate_hitrate(search_engine, ground_truth, num_results=5)
print(hitrate)

In [None]:
# Length of the vector the model produces for a sentence; the `dims` value of the
# dense_vector mapping used for Elasticsearch must agree with it.
len(embedding_model.encode("This is a simple sentence"))

In [None]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch instance expected at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Request cluster information; this also confirms the server is reachable.
es_client.info()

In [None]:
# Index definition: a single shard without replicas, full-text fields for the
# document content, a keyword field for the course, and a dense vector field
# for the question+answer embedding compared with cosine similarity.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{name: {"type": "text"} for name in ("text", "section", "question")},
            "course": {"type": "keyword"},
            "qa_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine",
            },
        }
    },
}

In [None]:
index_name = "course-questions"

# Start from a clean slate: delete the index (ignore_unavailable=True is passed so
# a missing index is tolerated), then create it from `index_settings`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
# Index every prepared document. Indexing stays best-effort (one failure does
# not abort the load), but each failure is tied to its document id and a
# summary is printed so a partially populated index does not go unnoticed.
failed_doc_ids = []
for doc in operations:
    try:
        es_client.index(index=index_name, body=doc)
    except Exception as e:
        failed_doc_ids.append(doc.get("id"))
        print(f"Failed to index document {doc.get('id')!r}: {e}")

if failed_doc_ids:
    print(f"{len(failed_doc_ids)} of {len(operations)} documents were not indexed")

In [None]:
# Query text and its embedding, used for the Elasticsearch kNN search below.
search_term = "I just discovered the course. Can I still join it?"
vector_search_term = embedding_model.encode(search_term)

In [None]:
# kNN request against the `qa_vector` field, returning the five nearest documents.
body = {
    "knn": {
        "field": "qa_vector",
        # Send a plain list of floats, as done when the documents were indexed,
        # so the request does not depend on the client serializing NumPy arrays.
        "query_vector": vector_search_term.tolist(),
        "k": 5,
        "num_candidates": 10000,
    },
    "size": 5,
}

In [None]:
# Execute the kNN query, requesting only the listed fields for each hit.
res = es_client.search(index=index_name, body=body, _source=["text", "section", "question", "course"])
# Show the returned hits.
res["hits"]["hits"]

In [None]:
def calculate_hitrate_es(search_engine, ground_truth, embedding_model: "SentenceTransformer", num_results):
    """Return the fraction of ground-truth queries whose expected document is retrieved.

    Each question is encoded with the supplied `embedding_model`; a query
    counts as a hit when any returned result has an 'id' equal to the
    record's 'document' value.

    Args:
        search_engine: object exposing `search(vector, num_results=...)` that
            returns an iterable of dicts with an 'id' key.
        ground_truth: sequence of dicts with 'question' and 'document' keys.
        embedding_model: model whose `encode(text)` produces the query vector.
        num_results: number of results requested for each query.

    Returns:
        The hit rate as a float; 0.0 when `ground_truth` is empty.
    """
    total_queries = len(ground_truth)
    if total_queries == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    correct_matches = 0
    for query in ground_truth:
        embedding_vector = embedding_model.encode(query['question'])
        results = search_engine.search(embedding_vector, num_results=num_results)
        # `any` stops at the first matching result, like the original `break`.
        if any(result['id'] == query['document'] for result in results):
            correct_matches += 1

    return correct_matches / total_queries

class ElasticsearchVectorSearchEngine:
    """Adapter giving Elasticsearch kNN search the `search(v_query, num_results)` interface.

    This lets `calculate_hitrate_es` evaluate the Elasticsearch index instead
    of the in-memory engine.
    """

    def __init__(self, es_client, index_name, vector_field="qa_vector", num_candidates=10000):
        self.es_client = es_client
        self.index_name = index_name
        self.vector_field = vector_field
        self.num_candidates = num_candidates

    def search(self, v_query, num_results=10):
        """Return the `_source` (containing 'id') of the nearest indexed documents."""
        knn_body = {
            "knn": {
                "field": self.vector_field,
                # Plain list of floats so the request is JSON-serializable.
                "query_vector": np.asarray(v_query).tolist(),
                "k": num_results,
                "num_candidates": self.num_candidates,
            },
            "size": num_results,
        }
        response = self.es_client.search(index=self.index_name, body=knn_body, _source=["id"])
        return [hit["_source"] for hit in response["hits"]["hits"]]


# Make sure every indexed document is visible to search before evaluating.
es_client.indices.refresh(index=index_name)

# Evaluate the Elasticsearch index itself; passing the in-memory `search_engine`
# here would only reproduce the earlier hit rate.
es_search_engine = ElasticsearchVectorSearchEngine(es_client, index_name)
hitrate_es = calculate_hitrate_es(es_search_engine, ground_truth, embedding_model, num_results=5)
print(hitrate_es)