In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np
import requests
import json

  from .autonotebook import tqdm as notebook_tqdm


### Q1

In [2]:
# Load the pretrained sentence-embedding model by name; it is reused by the cells below.
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name_or_path=model_name)

In [6]:
# Example user query; it is embedded below and compared against the FAQ documents.
user_question = "I just discovered the course. Can I still join it?"

In [7]:
# Encode the user question into its embedding vector.
v = embedding_model.encode(sentences=user_question)

In [8]:
# Display the first component of the question embedding.
v[0]

0.078222655

In [9]:
# Download the FAQ documents (with ids) from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
# `?raw=1` makes GitHub serve the raw file instead of the HTML viewer page.
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents = docs_response.json()

In [11]:
# Keep only the entries whose course is the Machine Learning Zoomcamp.
documents = [
    entry
    for entry in documents
    if entry['course'] == "machine-learning-zoomcamp"
]

In [13]:
# Display how many documents remain after the course filter.
len(documents)

375

In [14]:
# Display the first remaining document to see which fields it carries.
documents[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

### Q2

In [15]:
# Embed every document as "question + answer text", one vector per document.
embeddings = []

for doc in documents:
    question, text = doc["question"], doc["text"]
    qa_text = f'{question} {text}'
    vector = embedding_model.encode(qa_text)
    embeddings.append(vector)

In [16]:
# Stack the per-document vectors into one matrix (one row per document) and report its shape.
X = np.array(embeddings)
print(np.shape(X))

(375, 768)


### Q3

In [22]:
# Dot product of every document embedding with the question embedding.
scores = np.dot(X, v)

In [23]:
# Display the score of the first document.
scores[0]

0.28921726

In [None]:
class VectorSearchEngine():
    """Brute-force vector search over precomputed document embeddings.

    Documents are ranked by the dot product between their embedding and the
    query vector, highest score first.
    """

    def __init__(self, documents, embeddings):
        """Store the documents together with their embeddings.

        Args:
            documents: Indexable collection of documents; ``documents[i]`` is
                returned for row ``i`` of ``embeddings``.
            embeddings: Array-like with one row per document. A list of
                vectors is converted to an ndarray; an existing ndarray is
                kept as-is (no copy).
        """
        self.documents = documents
        # `search` relies on ndarray.dot, so accept any array-like here.
        self.embeddings = np.asarray(embeddings)

    def search(self, v_query, num_results=10):
        """Return the documents whose embeddings score highest against a query.

        Args:
            v_query: Query vector with the same dimensionality as one
                embedding row.
            num_results: Maximum number of documents to return. ``None``
                returns every document, ranked.

        Returns:
            List of documents ordered from highest to lowest score.

        Raises:
            ValueError: If ``num_results`` is negative.
        """
        # A negative value would slice as `[:-k]` and silently drop the k
        # lowest-ranked documents instead of limiting the result size.
        if num_results is not None and num_results < 0:
            raise ValueError(
                f"num_results must be non-negative, got {num_results}"
            )

        scores = self.embeddings.dot(v_query)
        # Negate so that argsort's ascending order yields the best match first.
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

# Build the engine over the document embeddings and display the five best matches for the question.
search_engine = VectorSearchEngine(embeddings=X, documents=documents)
search_engine.search(v_query=v, num_results=5)

### Q4