In [32]:
# Import necessary libraries
from sentence_transformers import SentenceTransformer
import requests
import numpy as np
import pandas as pd
from tqdm.auto import tqdm 
from elasticsearch import Elasticsearch
from numpy.linalg import norm

In [2]:
# Name of the sentence-transformers model used for every embedding in this notebook
model = "multi-qa-distilbert-cos-v1"

# Instantiate the encoder from that model name; later cells call
# `embedding_model.encode(...)` to turn text into vectors
embedding_model = SentenceTransformer(model)

In [3]:
# Encode the sample user question into an embedding `v`;
# later cells reuse `v` as the query vector
query = 'I just discovered the course. Can I still join it?'
v = embedding_model.encode(query)

# Q1: print the first component of the query embedding, formatted to 3 decimals
print (f"Q1: The first value of the resulting vector is {v[0]:.3f}")

Q1: The first value of the resulting vector is 0.078


In [4]:
# Download the documents (JSON) from the course repository
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# A timeout keeps the cell from hanging forever on a stalled connection
docs_response = requests.get(docs_url, timeout=60)

# Fail loudly on an HTTP error instead of letting `.json()` choke on an
# error page with a confusing decode error
docs_response.raise_for_status()
documents = docs_response.json()

In [5]:
# Keep only the documents that belong to the selected course.
# `course_name` is also the default course of `elastic_search_knn`.
course_name = "machine-learning-zoomcamp"
filtered_documents = [doc for doc in documents if doc["course"] == course_name]  

In [6]:
# Number of documents left after the course filter (a bare expression, so the
# notebook displays the value as the cell output)
len(filtered_documents)

375

In [7]:
# Embed every filtered document: the question and the answer text are joined
# with a single space and encoded as one string, giving one vector per
# document in the same order as `filtered_documents`
embedding_qa = [
    embedding_model.encode(f"{entry['question']} {entry['text']}")
    for entry in tqdm(filtered_documents)
]

  0%|          | 0/375 [00:00<?, ?it/s]

In [8]:
# Stack the per-document embeddings into a single numpy array
# (one row per document, in the order of `filtered_documents`)
x = np.array(embedding_qa)

# Q2: print the shape of the resulting array
print("Q2: Shape of X is", x.shape)

Q2: Shape of X is (375, 768)


In [9]:
# Dot product of every document embedding (rows of `x`) with the query
# vector `v`: one similarity score per document
scores = x.dot(v)

In [10]:
# Q3: largest dot-product score across all documents
print(f"Q3: The highest score is: {np.max(scores)}")

Q3: The highest score is: 0.6506574153900146


In [11]:
# Define the VectorSearchEngine class
class VectorSearchEngine:
    """Brute-force vector search over a fixed set of documents.

    Parameters
    ----------
    documents : sequence
        Documents to return from searches; ``documents[i]`` corresponds to
        row ``i`` of ``embeddings``.
    embeddings : numpy.ndarray
        Array with one embedding per document; it is scored against a query
        with ``embeddings.dot(v_query)``.
    """

    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=5):
        """Return the documents with the highest dot-product score.

        Parameters
        ----------
        v_query : numpy.ndarray
            Query embedding.
        num_results : int, default 5
            Maximum number of documents to return. When fewer documents are
            available, all of them are returned.

        Returns
        -------
        list
            Up to ``num_results`` documents, best score first. Empty when
            ``num_results`` is not positive or there are no documents.
        """
        scores = self.embeddings.dot(v_query)

        # Never ask for more results than there are candidates.
        num_results = min(num_results, len(scores))
        if num_results <= 0:
            return []

        if num_results < len(scores):
            # argpartition needs kth < len(scores); it selects the top
            # candidates without fully sorting every score.
            idx = np.argpartition(-scores, num_results)[:num_results]
        else:
            # Every document is a candidate, so rank them all.
            idx = np.arange(len(scores))

        # Order the selected candidates by descending score.
        top_idx = idx[np.argsort(-scores[idx])]
        return [self.documents[i] for i in top_idx]
    
# Search engine over the course-filtered documents and their embedding matrix `x`
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=x)

In [12]:
# Load the ground-truth evaluation set (CSV) from the course repository
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)

# Filter with `course_name` (the same variable used to filter the documents)
# rather than a second hard-coded literal, so the evaluation questions and the
# indexed documents always cover the same course
df_ground_truth = df_ground_truth[df_ground_truth.course == course_name]

# One dict per ground-truth row; later cells read the 'question' and
# 'document' keys
ground_truth = df_ground_truth.to_dict(orient='records')

In [13]:
def hit_rate(relevance_total):
    """Fraction of queries with at least one relevant result.

    Parameters
    ----------
    relevance_total : list of list of bool
        One inner list per query; each flag says whether the result at that
        rank was the expected document.

    Returns
    -------
    float or int
        Number of queries with any true flag divided by the number of
        queries, or ``0`` when there are no queries.
    """
    query_count = len(relevance_total)
    if query_count <= 0:
        # No queries: avoid dividing by zero
        return 0

    matched = 0
    for flags in relevance_total:
        if any(flags):
            matched += 1
    return matched / query_count

In [14]:
# For every ground-truth question, run the in-memory vector search and record,
# per returned document, whether it is the expected one
relevance_total = []

for record in tqdm(ground_truth):
    query_vector = embedding_model.encode(record['question'])
    retrieved = search_engine.search(query_vector)
    relevance_total.append([hit['id'] == record['document'] for hit in retrieved])

  0%|          | 0/1830 [00:00<?, ?it/s]

In [15]:
# Q4: hit rate computed from whatever `relevance_total` currently holds, shown with 2 decimals
print(f"Q4: Hit rate for vector search {hit_rate(relevance_total):.2f}")

Q4: Hit rate for vector search 0.94


In [16]:
# Elasticsearch client for a node at localhost:9200; the cells below expect
# that node to be reachable
es_client = Elasticsearch('http://localhost:9200') 

In [17]:
# Name of the index that will hold the documents and their embeddings
index_name = "course-questions"

# Field mappings: `course` and `id` are exact-match keywords, the free-text
# fields are analysed text, and `embedding` is a 768-dimensional dense vector
field_mappings = {
    "course": {"type": "keyword"},
    "question": {"type": "text"},
    "text": {"type": "text"},
    "embedding": {"type": "dense_vector", "dims": 768},
    "id": {"type": "keyword"},
    "section": {"type": "text"},
}

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_mappings},
}

# Start from a clean slate: drop any existing index with this name (a missing
# index is ignored), then always create it again with the settings above
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [18]:
# Attach each document's embedding (row of `x` at the same position) to the
# document dict itself, under the 'embedding' key. This mutates the dicts in
# `filtered_documents` in place.
for position, entry in enumerate(filtered_documents):
    entry['embedding'] = x[position]

In [19]:
# Define the function to index documents with a custom ID
def index_documents(documents, index_name):
    """Index every document into Elasticsearch under its own ``id``.

    Parameters
    ----------
    documents : iterable of dict
        Documents to index; each must have an ``"id"`` key, which is used as
        the Elasticsearch document id. The whole dict is sent as the body.
    index_name : str
        Name of the target index.

    Returns
    -------
    list
        The response of each ``es_client.index`` call, in input order.
    """
    return [
        es_client.index(index=index_name, id=entry["id"], body=entry)
        for entry in documents
    ]

# Index the course-filtered documents and keep the per-document API responses
responses = index_documents(filtered_documents, index_name)

In [37]:
# Define the function to perform KNN search and find the ID of the document with the highest score
def elastic_search_knn(vector, course=course_name, k=5):
    """Return the top ``k`` Elasticsearch hits for a query vector.

    Despite the name, this issues a ``script_score`` query: documents matching
    the course are scored with
    ``cosineSimilarity(query_vector, 'embedding') + 1.0``.

    Parameters
    ----------
    vector : array-like
        Query embedding, passed to the script as ``params.query_vector``.
    course : str
        Course to restrict the search to. The default is the value
        ``course_name`` had when this function was defined; later
        reassignments of ``course_name`` do not change it.
    k : int, default 5
        Number of hits requested (the query's ``size``).

    Returns
    -------
    list of dict
        The ``hits.hits`` list of the search response; each hit's ``_source``
        holds only ``question``, ``text``, ``id`` and ``section``.
    """
    similarity_script = {
        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
        "params": {"query_vector": vector},
    }
    course_filter = {"match": {"course": course}}

    search_body = {
        "_source": ["question", "text", "id", "section"],
        "query": {
            "script_score": {
                "query": course_filter,
                "script": similarity_script,
            }
        },
        "size": k,
    }

    return es_client.search(index=index_name, body=search_body)['hits']['hits']

# Search with the sample query vector `v`; note that `response` is the list of
# hits returned by the function, not the raw Elasticsearch response object
response = elastic_search_knn(v, course_name)

In [38]:
# Q5: `_id` of the first hit. Assumes Elasticsearch returned the hits in
# descending score order (no explicit sort is set in the query) — TODO confirm
print(f"Q5: The ID of the document with the highest score is: {response[0]['_id']}")

Q5: The ID of the document with the highest score is: ee58a693


In [39]:
# For every ground-truth question, run the Elasticsearch search and record,
# per returned hit, whether it is the expected document.
# NOTE: this reuses (overwrites) the `relevance_total` name.
relevance_total = []

for record in tqdm(ground_truth):
    query_vector = embedding_model.encode(record['question'])
    hits = elastic_search_knn(query_vector)
    relevance_total.append([hit['_source']['id'] == record['document'] for hit in hits])

  0%|          | 0/1830 [00:00<?, ?it/s]

In [40]:
# Q6: hit rate computed from whatever `relevance_total` currently holds, shown with 2 decimals
print(f"Q6: Hit rate for elastic search is {hit_rate(relevance_total):.2f}")

Q6: Hit rate for elastic search is 0.94
