In [None]:
from sentence_transformers import SentenceTransformer
import requests
import numpy as np
from tqdm.auto import tqdm
import pandas as pd
from elasticsearch import Elasticsearch

# Initialize model
# A single sentence-embedding model instance; the later cells encode both the
# documents and the evaluation questions through this same `model` object.
model_name = 'multi-qa-distilbert-cos-v1'
model = SentenceTransformer(model_name)

# User question
# Example query, embedded once here and reused by the similarity and search
# cells further down.
user_question = "I just discovered the course. Can I still join it?"
user_question_embedding = model.encode(user_question)

# Print only the first element of the embedding as a quick sanity check.
print(user_question_embedding[0])

In [None]:
# Download the FAQ documents (later cells look records up by their `id` field)
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors (404, rate limiting, ...) right here
# instead of as a confusing JSON decode failure on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Keep only the documents that belong to the course under evaluation
documents = [doc for doc in documents_raw if doc.get('course') == 'machine-learning-zoomcamp']

print(f"Total documents filtered: {len(documents)}")

In [None]:
# Generate document embeddings
# Every document is represented by its question and its answer text joined
# with a single space, and is encoded on its own (one encode call per document).
embeddings = [
    model.encode(f'{doc["question"]} {doc["text"]}')
    for doc in tqdm(documents, desc="Encoding documents")
]

# Stack the per-document vectors into one matrix: row i belongs to documents[i]
X = np.array(embeddings)
print(X.shape)

In [None]:
# Calculate similarity scores
# Matrix-vector product: one dot-product score per row (document) of X
scores = X @ user_question_embedding
highest_score = scores.max()
print(f"Highest score: {highest_score}")

# Search engine class
class VectorSearchEngine:
    """Brute-force vector search over an in-memory embedding matrix.

    Row ``i`` of ``embeddings`` is treated as the vector of ``documents[i]``;
    relevance is the dot product between a row and the query vector.
    """

    def __init__(self, documents, embeddings):
        """Keep references to the documents and their embedding matrix."""
        self.embeddings = embeddings
        self.documents = documents

    def search(self, v_query, num_results=10):
        """Return up to ``num_results`` documents, highest dot product first."""
        similarity = self.embeddings.dot(v_query)
        # Sorting the negated scores ascending yields a descending ranking.
        ranking = np.argsort(-similarity)
        top_docs = []
        for position in ranking[:num_results]:
            top_docs.append(self.documents[position])
        return top_docs

# Initialize search engine
# Wrap the documents and their embedding matrix, then fetch the single best
# match for the example user question.
search_engine = VectorSearchEngine(documents, X)
search_results = search_engine.search(user_question_embedding, 1)
print(search_results)

# Load ground truth
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Read the full evaluation set, then keep only the rows of the course that
# the documents were filtered to.
df_ground_truth_all = pd.read_csv(ground_truth_url)
course_mask = df_ground_truth_all['course'] == 'machine-learning-zoomcamp'
df_ground_truth = df_ground_truth_all[course_mask]
ground_truth = df_ground_truth.to_dict(orient='records')

# Function to calculate hitrate
def calculate_hitrate(ground_truth, search_engine, num_results, encoder=None):
    """Compute the hit-rate of ``search_engine`` on a ground-truth set.

    A record counts as a hit when the id of its source document appears among
    the top ``num_results`` search results for the record's question.

    Args:
        ground_truth: Iterable of dicts with a ``'question'`` string and a
            ``'document'`` id.
        search_engine: Object with ``search(vector, num_results)`` returning
            dicts that carry an ``'id'`` key.
        num_results: Number of results requested per query.
        encoder: Optional object with an ``encode(text)`` method. Defaults to
            the notebook-level ``model``, so existing three-argument calls
            keep working unchanged.

    Returns:
        Fraction of records that were hits, as a float in ``[0, 1]``;
        ``0.0`` when ``ground_truth`` is empty.
    """
    if encoder is None:
        # Fall back to the notebook's global embedding model.
        encoder = model

    # Materialise once so generators are supported and the total is reliable.
    records = list(ground_truth)
    if not records:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    hits = 0
    for record in tqdm(records, desc="Calculating hitrate"):
        query_vector = encoder.encode(record['question'])
        results = search_engine.search(query_vector, num_results)
        if any(result['id'] == record['document'] for result in results):
            hits += 1

    return hits / len(records)

# Evaluate the in-memory dot-product search engine on the ground-truth
# questions; a hit means the source document is among the top 5 results.
hitrate = calculate_hitrate(ground_truth, search_engine, num_results=5)
print(f"Hit-rate: {hitrate}")

In [None]:
# Configure Elasticsearch client
index_name = "machine-learning-zoomcamp-2"
es_client = Elasticsearch('http://localhost:9200')

# Configure index
# Custom analyzer used by the `title` text field: standard tokenizer followed
# by lowercasing.
vector_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase"],
}

index_config = {
    "index": {"number_of_shards": 1, "number_of_replicas": 1},
    "analysis": {"analyzer": {"vector_analyzer": vector_analyzer}},
}

# `dims` has to equal the length of the vectors stored in the `vector` field.
field_mappings = {
    "properties": {
        "vector": {"type": "dense_vector", "dims": 768},
        "title": {"type": "text", "analyzer": "vector_analyzer"},
    }
}

settings = {"settings": index_config, "mappings": field_mappings}

# Create index
# Creation is skipped when the index is already present, so re-running this
# cell does not fail on an existing index.
index_already_exists = es_client.indices.exists(index=index_name)
if not index_already_exists:
    es_client.indices.create(index=index_name, body=settings)

# Index documents
# X[i] is the embedding of documents[i], so each document is stored together
# with its own vector and its id. (Pairing X with the ground-truth rows would
# attach every vector to an unrelated question.) Using the document id as the
# Elasticsearch _id makes a re-run overwrite entries instead of appending
# duplicates that would crowd the top-k results.
for doc, vector in zip(documents, X):
    body = {
        "vector": vector.tolist(),
        "title": doc["question"],
        "id": doc["id"]
    }
    es_client.index(index=index_name, id=doc["id"], body=body)

# Make the freshly indexed documents visible to the searches that follow.
es_client.indices.refresh(index=index_name)

# Function to calculate hitrate in Elasticsearch
def calculate_elastic_hitrate(ground_truth, es_client, index_name, model, num_results):
    """Compute the hit-rate of an Elasticsearch vector search.

    For every ground-truth record the question is embedded with ``model`` and
    all indexed documents are ranked by cosine similarity to that embedding.
    The record is a hit when the id of its source document is among the top
    ``num_results`` hits, which is the same criterion ``calculate_hitrate``
    applies to the in-memory search engine.

    Args:
        ground_truth: Iterable of dicts with a ``'question'`` string and a
            ``'document'`` id.
        es_client: Elasticsearch client used to run the searches.
        index_name: Name of the index whose documents carry a ``vector`` and
            an ``id`` field.
        model: Object whose ``encode(text)`` returns an array with ``tolist()``.
        num_results: Number of hits requested per query.

    Returns:
        Fraction of records that were hits, as a float in ``[0, 1]``;
        ``0.0`` when ``ground_truth`` is empty.
    """
    # Materialise once so generators are supported and the total is reliable.
    records = list(ground_truth)
    if not records:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    cnt = 0
    for record in tqdm(records, desc="Calculating Elasticsearch hitrate"):
        query_vector = model.encode(record['question']).tolist()

        script_query = {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    # Elasticsearch rejects negative scores, hence the +1.0
                    # shift of the cosine similarity.
                    "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                    "params": {"query_vector": query_vector}
                }
            }
        }

        res = es_client.search(
            index=index_name,
            # Only the id is needed to decide a hit; skip returning the vectors.
            body={"query": script_query, "size": num_results, "_source": ["id"]},
        )

        # .get() tolerates stale entries that were indexed without an id.
        if any(hit['_source'].get('id') == record['document'] for hit in res['hits']['hits']):
            cnt += 1

    return cnt / len(records)

# Calculate hitrate using Elasticsearch
# Uses the same ground-truth records and num_results=5 as the earlier
# calculate_hitrate call on the in-memory search engine.
elastic_hitrate = calculate_elastic_hitrate(ground_truth, es_client, index_name, model, num_results=5)
print(f"Elasticsearch Hit-rate: {elastic_hitrate}")

# Compare with previous hitrate
# `hitrate` is the value returned earlier by calculate_hitrate.
print(f"Exact Search Hit-rate: {hitrate}")