### Q1. Getting the embeddings model

In [117]:
from sentence_transformers import SentenceTransformer

# Model identifier handed to SentenceTransformer; every embedding in this
# notebook is produced by the resulting `embedding_model`.
model_name= 'multi-qa-distilbert-cos-v1'
embedding_model = SentenceTransformer(model_name)


In [116]:
# Sample question; its embedding `v` is reused as the query vector by the
# search cells further down.
user_question = "I just discovered the course. Can I still join it?"
v = embedding_model.encode(user_question)
print(v.shape)
# Last expression of the cell: display the first component of the embedding.
v[0]

(768,)


0.07822262

### Download the raw data and prepare the documents

In [84]:
import requests

# Location of the FAQ documents (with stable ids) in the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# Bound the request so a stalled connection cannot hang the notebook, and fail
# with a clear HTTP error instead of a JSON decode error on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Keep only the documents that belong to the machine-learning-zoomcamp course.
filtered_documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']
print(len(filtered_documents))


375


In [86]:
# Display the first filtered document with every key currently stored on it.
filtered_documents[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872',
 'qa_text': array([ 8.80591124e-02,  1.55936480e-02,  7.92557895e-02,  2.52757445e-02,
         7.55764693e-02, -3.90596390e-02, -4.13813218e-02,  2.52916981e-02,
         2.43242122e-02,  3.62592028e-03, -7.28290388e-03, -3.28751244e-02,
         6.12956285e-02, -5.71100675e-02,  1.16774803e-02, -1.79440919e-02,
         4.49205711e-02, -5.41606098e-02, -1.92255375e-03,  1.48329586e-02,
         7.91353080e-03, -3.43126729e-02,  1.2148064

#### Q2. Prepare the index for the raw data and create the embeddings

In [132]:
from tqdm.auto import tqdm  
import numpy as np  


def create_embeddings(filtered_documents):
    """Embed each document's question and text and return the vectors.

    For every document the ``'question'`` and ``'text'`` values are joined
    with a single space and passed to ``embedding_model.encode``. The
    resulting vector is also written back onto the document under the
    ``'qa_text'`` key, so the input dictionaries are modified in place.

    Args:
        filtered_documents: iterable of dicts providing ``'question'`` and
            ``'text'`` entries.

    Returns:
        A list holding one encoded vector per document, in input order.
    """
    vectors = []

    for record in tqdm(filtered_documents):
        # Question first, then the answer text, separated by one space.
        combined = f"{record['question']} {record['text']}"
        vector = embedding_model.encode(combined)

        # Store the vector on the document as well as in the returned list.
        record['qa_text'] = vector
        vectors.append(vector)

    return vectors
        
# Create the embeddings for our documents. create_embeddings also stores each
# vector on its document under 'qa_text'.
embeddings = create_embeddings(filtered_documents)  

# Stack the per-document vectors into one array; the cell displays its shape.
X = np.array(embeddings)  # Convert the list of embeddings to a NumPy array
X.shape  

  0%|          | 0/375 [00:00<?, ?it/s]

(375, 768)

#### Q3. Search

In [88]:
# Dot product of every document embedding with the query vector `v`.
# NOTE(review): this equals cosine similarity only if the vectors are
# unit-length -- presumably the model normalises its output; confirm.
cosine_similarities = np.dot(X, v)
print('Index of max simiarity is :: ', cosine_similarities.argmax())
print('Highest score of similarity is: ', cosine_similarities.max())

Index of max simiarity is ::  14
Highest score of similarity is:  0.65065753


### Vector search

In [108]:
class VectorSearchEngine():
    """Exhaustive dot-product search over a fixed set of embeddings.

    ``documents[i]`` is the item returned when row ``i`` of ``embeddings``
    ranks among the best matches for a query.
    """

    def __init__(self, documents, embeddings):
        """Keep references to the documents and their embedding matrix.

        Args:
            documents: sequence indexed by the row number of ``embeddings``.
            embeddings: object exposing ``.dot`` (a NumPy array in this
                notebook), one row per document.
        """
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return the documents whose embeddings score highest against a query.

        Args:
            v_query: query vector to multiply with every stored embedding.
            num_results: maximum number of documents to return.

        Returns:
            A list of documents ordered from highest to lowest dot product.
        """
        similarity = self.embeddings.dot(v_query)

        # Sorting the negated scores ascending yields a descending ranking.
        ranking = np.argsort(-similarity)
        best_rows = ranking[:num_results]

        matches = []
        for row in best_rows:
            matches.append(self.documents[row])
        return matches

def VectorSearchEngine_helper(query):
    """Search the notebook's in-memory index and return the top 5 documents.

    A new ``VectorSearchEngine`` over the module-level ``filtered_documents``
    and embedding matrix ``X`` is built on every call.

    Args:
        query: query vector forwarded to ``VectorSearchEngine.search``.

    Returns:
        The list of up to five best-matching documents.
    """
    engine = VectorSearchEngine(embeddings=X, documents=filtered_documents)
    return engine.search(query, num_results=5)
    
# Run the sample query vector `v` through the in-memory search helper.
result = VectorSearchEngine_helper(v)

# Show each returned document id, followed by a ******** separator line.
for i in result:
    print(i['id'], "********", sep="\n")

ee58a693
********
0a278fb2
********
6ba259b1
********
9f261648
********
e7ba6b8a
********


### Load the ground truth data

In [76]:
import pandas as pd

# Ground-truth question -> document pairs from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = base_url + '/' + relative_url + '?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
# Restrict the evaluation set to the machine-learning-zoomcamp course.
df_ground_truth = df_ground_truth.loc[df_ground_truth['course'] == 'machine-learning-zoomcamp']
# One dict per row, the shape consumed by `evaluate`.
ground_truth = df_ground_truth.to_dict('records')
len(ground_truth)

1830

In [128]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: list with one entry per query; each entry is the
            list of per-result relevance flags for that query.

    Returns:
        Number of entries containing ``True`` divided by the number of entries.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)


def mrr(relevance_total):
    """Compute the mean reciprocal rank over a set of ranked result lists.

    Each query contributes ``1 / rank`` of its first relevant result
    (1-based), or 0 when none of its results is relevant. Only the first hit
    counts: without that rule a list containing the relevant document more
    than once would add several reciprocal ranks and could push the score
    above 1.

    Args:
        relevance_total: list with one entry per query; each entry is the
            list of per-result relevance flags, best-ranked result first.

    Returns:
        The sum of the per-query reciprocal ranks divided by the number of
        queries.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # `== True` mirrors the `True in line` test used by hit_rate.
            if is_relevant == True:
                total_score += 1 / rank
                break  # later hits must not add to this query's score

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function):
    """Score a search function against ground-truth question/document pairs.

    Every ground-truth question is encoded with ``embedding_model`` and the
    vector is passed to ``search_function``. Each returned document is marked
    relevant when its ``'id'`` equals the record's ``'document'`` value.

    Args:
        ground_truth: iterable of dicts with ``'question'`` and ``'document'``.
        search_function: callable taking a query vector and returning a list
            of dicts that each carry an ``'id'``.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(embedding_model.encode(record['question']))
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    scores = {}
    scores['hit_rate'] = hit_rate(relevance_total)
    scores['mrr'] = mrr(relevance_total)
    return scores
       

#### Q4. Hit-rate for our vector search engine

In [109]:
# Score the in-memory vector search (the helper returns 5 results per query)
# against the ground-truth set and print the metrics.
dic = evaluate(ground_truth,VectorSearchEngine_helper)
print(dic)

  0%|          | 0/1830 [00:00<?, ?it/s]

{'hit_rate': 0.9398907103825137, 'mrr': 0.8516484517304189}


### Q5. Indexing the document embeddings with Elasticsearch

In [122]:
from elasticsearch import Elasticsearch

def connect_to_es(url="http://localhost:9200", retries=10, delay=10):
    """Return an Elasticsearch client once the server answers a ping.

    The connection is attempted up to ``retries`` times. An attempt fails
    either when building the client or pinging raises, or when ``ping()``
    returns a falsy value; both cases are reported and followed by the same
    wait before the next attempt.

    Args:
        url: address of the Elasticsearch node.
        retries: maximum number of connection attempts.
        delay: seconds to wait between two attempts.

    Returns:
        The connected ``Elasticsearch`` client.

    Raises:
        Exception: if no attempt produced a successful ping.
    """
    # Imported here because this cell never imported `time`; without it the
    # retry path failed with NameError instead of waiting.
    import os
    import time

    # NOTE(review): the fallback password is a credential committed to the
    # notebook. Set ELASTIC_USER / ELASTIC_PASSWORD in the environment and
    # rotate this value.
    username = os.environ.get("ELASTIC_USER", "elastic")
    password = os.environ.get("ELASTIC_PASSWORD", "DkIedPPSCb")

    for attempt in range(1, retries + 1):
        try:
            es = Elasticsearch(url, basic_auth=(username, password))
            if es.ping():
                return es
            # An unsuccessful ping does not raise, so it has to be treated as
            # a failed attempt explicitly.
            reason = "ping returned False"
        except Exception as e:
            reason = e

        print(f"Connection failed, retrying... ({reason})")
        if attempt < retries:
            time.sleep(delay)

    raise Exception("Failed to connect to Elasticsearch after several retries")
    
# Shared client used by the indexing and search cells below.
es_client = connect_to_es()

# Index definition: one shard, no replicas, and one field per document key.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Embedding of question + text; 768 matches the length of the
            # vectors produced by the embedding model in this notebook.
            "qa_text": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

# Drop any existing index of that name first so the cell can be re-run, then
# create it again from the definition above.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

  es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [131]:
from tqdm.auto import tqdm

def create_embeddings_index(filtered_documents):
    """Embed every document and send it to the Elasticsearch index.

    Works in two passes over ``filtered_documents``: the first stores the
    encoding of ``"<question> <text>"`` on each document under ``'qa_text'``
    (modifying the dicts in place), the second indexes each document into
    ``index_name`` through ``es_client``.

    Args:
        filtered_documents: iterable of dicts providing ``'question'`` and
            ``'text'`` entries; it is iterated twice.
    """
    # Pass 1: attach the embedding to each document.
    for record in tqdm(filtered_documents):
        combined = f"{record['question']} {record['text']}"
        record['qa_text'] = embedding_model.encode(combined)

    # Pass 2: index the enriched documents one by one.
    for record in tqdm(filtered_documents):
        es_client.index(document=record, index=index_name)
        
# Embed the filtered documents and index them into `index_name`.
create_embeddings_index(filtered_documents)

  0%|          | 0/375 [00:00<?, ?it/s]

  0%|          | 0/375 [00:00<?, ?it/s]

#### Q6. Compute Hit-rate for Elasticsearch

In [126]:
def elastic_search_knn(vector):
    """Run a kNN query on the ``qa_text`` field and return the matching sources.

    The request asks for ``k = 5`` neighbours out of 10000 candidates and
    limits the returned ``_source`` to the text, section, question, course
    and id fields.

    Args:
        vector: query vector sent as ``query_vector``.

    Returns:
        A list with the ``_source`` dict of every hit, in response order.
    """
    response = es_client.search(
        index=index_name,
        body={
            "knn": {
                "field": 'qa_text',
                "query_vector": vector,
                "k": 5,
                "num_candidates": 10000,
            },
            "_source": ["text", "section", "question", "course", "id"],
        },
    )

    return [hit['_source'] for hit in response['hits']['hits']]

In [127]:
# Score the Elasticsearch kNN search against the same ground-truth set and
# print the metrics.
dic = evaluate(ground_truth,elastic_search_knn)
print(dic)

  0%|          | 0/1830 [00:00<?, ?it/s]

  es_results = es_client.search(


{'hit_rate': 0.9398907103825137, 'mrr': 0.8516484517304189}
