# Environment Setup

In [59]:
# generally used throughout the notebook
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [87]:
# Getting the documents

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# Bound the wait and fail loudly on an HTTP error; otherwise a failed download
# only shows up later as a confusing JSON decoding error.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()

# Keep only the FAQ entries of the course this notebook evaluates.
dedicated_course_name = 'machine-learning-zoomcamp'
documents = [doc for doc in docs_response.json() if doc['course'] == dedicated_course_name]

len(documents)

375

In [89]:
# Load the gold-standard (ground-truth) questions and keep a single course

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = '/'.join([base_url, relative_url]) + '?raw=1'

# Read the CSV and filter by course in one chained expression.
df_ground_truth = (
    pd.read_csv(ground_truth_url)
    .loc[lambda frame: frame['course'] == 'machine-learning-zoomcamp']
)

# One dict per row, for plain-Python iteration in the evaluation loop.
ground_truth = df_ground_truth.to_dict(orient='records')

len(ground_truth)

1830

In [5]:
# prepare the transformer
# (the same model instance encodes the user query, the documents and the
# ground-truth questions further down)

transformer = SentenceTransformer('multi-qa-distilbert-cos-v1')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Q1. Getting the embeddings model

In [6]:
# Sample query; it is embedded in the next cell and scored against the documents in Q3.
user_question = "I just discovered the course. Can I still join it?"

In [23]:
# Encode the question into a single vector (the cells below inspect its first value and its length).
query_embedding = transformer.encode(user_question)

In [24]:
# First component of the query embedding.
query_embedding[0]

0.07822261

In [26]:
# Length of the embedding vector produced by the model.
query_embedding.shape[0]

768

# Q2. Creating the embeddings

In [11]:
# Peek at one record to see the available fields (text, section, question, course, id).
# NOTE(review): the saved output below shows a 'data-engineering-zoomcamp' record,
# which the course filter in the document-loading cell would exclude. The output
# looks stale (cell executed before the filter was applied) -- re-run after
# Restart & Run All to refresh it.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [51]:
# One embedding per machine-learning-zoomcamp document, built from the
# question and the answer text joined by a single space.
embeddings = [
  transformer.encode(f"{doc['question']} {doc['text']}")
  for doc in tqdm(documents)
  if doc['course'] == 'machine-learning-zoomcamp'
]

100%|██████████| 375/375 [00:03<00:00, 108.08it/s]


In [52]:
# Stack the per-document vectors into one 2-D array (one row per document).
X = np.array(embeddings)

In [53]:
# (number of embedded documents, embedding size)
X.shape

(375, 768)

# Q3. Search

In [54]:
# Dot product of every document embedding with the query embedding: one score per document.
# NOTE(review): presumably equals cosine similarity if the model returns unit-length
# vectors -- TODO confirm for 'multi-qa-distilbert-cos-v1'.
score = X.dot(query_embedding)

In [55]:
# Highest score across all documents.
score.max()

0.6506574

# Implementing custom search engine

In [56]:
class VectorSearchEngine():
    """Brute-force search that ranks documents by dot product with a query vector.

    Row ``i`` of ``embeddings`` must be the vector of ``documents[i]``.
    """

    def __init__(self, documents, embeddings):
        """Store the documents and their embedding matrix.

        Args:
            documents: Sequence of documents, indexable by integer position.
            embeddings: Array-like with one row per document. An ndarray or a
                plain list of vectors is accepted.

        Raises:
            ValueError: If the number of embeddings differs from the number of
                documents (the rows could no longer be mapped back safely).
        """
        # Coerce up front so a plain list of vectors works too (lists have no .dot).
        embeddings = np.asarray(embeddings)
        if len(embeddings) != len(documents):
            raise ValueError(
                f"got {len(embeddings)} embeddings for {len(documents)} documents"
            )
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return the ``num_results`` documents with the highest scores.

        Args:
            v_query: Query vector with the same length as one embedding row.
            num_results: Maximum number of documents to return.

        Returns:
            List of documents ordered from highest to lowest score.
        """
        scores = self.embeddings.dot(v_query)
        # Negate so argsort's ascending order puts the best match first.
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

# Q4. Hit-rate for our search engine

In [96]:
search_engine = VectorSearchEngine(documents, X)

# Count the ground-truth questions whose source document shows up in the top 5 results.
counter = 0
for item in tqdm(ground_truth):
  question_vector = transformer.encode(item['question'])
  res = search_engine.search(question_vector, num_results=5)
  res_ids = [ele['id'] for ele in res]
  # A bool adds as 0 or 1, so this increments only on a hit.
  counter += int(item['document'] in res_ids)

100%|██████████| 1830/1830 [00:21<00:00, 84.47it/s] 


In [98]:
# calculating hit-rate: the share of ground-truth questions whose source
# document was among the top-5 search results counted in the loop above
counter / len(ground_truth)

0.9398907103825137

# Notes

In [83]:
from collections import defaultdict

# Count the ground-truth rows per document id once, instead of re-filtering the
# whole DataFrame for every document.
questions_per_doc = df_ground_truth['document'].value_counts()

# Maps "number of ground-truth questions" -> ids of the documents having that many.
repeatings = defaultdict(list)

for doc in documents:
  # Documents with no ground-truth row get 0; int() keeps the dict keys plain Python ints.
  ln = int(questions_per_doc.get(doc['id'], 0))

  repeatings[ln].append(doc['id'])

repeatings.keys()

dict_keys([5, 1])

In [84]:
# Number of documents that have exactly 5 ground-truth questions.
len(repeatings[5])

365

In [85]:
# Number of documents that have exactly 1 ground-truth question.
len(repeatings[1])

10