In [1]:
# Embedding model: multi-qa-distilbert-cos-v1
from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-distilbert-cos-v1'
# Instantiate the model once; the cells below call `embedding_model.encode(...)` on it.
embedding_model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Example user query that the similarity-search cells below work with.
user_question = "I just discovered the course. Can I still join it?"

In [3]:
# Encode the user query with the embedding model loaded above.
user_question_embed = embedding_model.encode(user_question)

In [4]:
# Number of components in the query embedding.
len(user_question_embed)

768

In [5]:
# Peek at the first five components of the query embedding.
user_question_embed[0:5]

array([ 0.07822262, -0.04013116,  0.03861362, -0.00017896,  0.08923467],
      dtype=float32)

### ans.1 0.07

In [6]:
# Load the documents-with-ids JSON file
import requests

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail with the HTTP status instead of an opaque JSON decode error on a 4xx/5xx response.
docs_response.raise_for_status()
documents = docs_response.json()

In [7]:
# Inspect one record to see which fields each document carries.
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '0bbf41ec'}

In [8]:
# Keep only the documents that belong to the machine-learning-zoomcamp course
hw3_documents = [
    doc for doc in documents
    if doc['course'] == 'machine-learning-zoomcamp'
]

len(hw3_documents)

375

In [9]:
from tqdm.auto import tqdm

# Embed the concatenation "question + answer text" of every filtered document
embeddings = []

for doc in tqdm(hw3_documents):
    qa_text = f"{doc['question']} {doc['text']}"
    qa_vector = embedding_model.encode(qa_text)
    # Also keep the vector on the document itself, so it travels with the record
    # when the documents are indexed later on.
    doc['question_text_vector'] = qa_vector
    embeddings.append(qa_vector)


100%|████████████████████████████████████████████████████████| 375/375 [00:04<00:00, 92.74it/s]


In [10]:
import numpy as np

# Stack the per-document embeddings into a single 2-D array (one row per document).
X = np.array(embeddings)
X.shape

(375, 768)

### ans.2 (375, 768)

In [11]:
# Dot product of one embedding with itself (its squared norm). A result of ~1 means
# the row is unit-length, in which case a dot product equals cosine similarity.
X[1].dot(X[1])

1.0000001

In [12]:
# Similarity scores between the user query and all filtered documents, via dot product
v = user_question_embed

scores = X.dot(v)
# Vectorised maximum: the builtin max() would walk the array element by element in Python.
scores.max()

0.6506573

### ans.3 0.65

In [13]:
# A minimal in-memory vector search engine
class VectorSearchEngine():
    """Exhaustive dot-product search over a matrix of document embeddings.

    `documents` is an indexable collection and `embeddings` an array whose i-th row
    is the vector of `documents[i]`.
    """

    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return up to `num_results` documents, highest dot-product score first."""
        similarity = self.embeddings.dot(v_query)
        # argsort is ascending, so sort the negated scores to get best matches first.
        ranked_positions = np.argsort(-similarity)
        best_positions = ranked_positions[:num_results]
        return [self.documents[pos] for pos in best_positions]

In [14]:
# Initialize the search engine with the filtered documents and their embedding matrix
search_engine = VectorSearchEngine(documents=hw3_documents, embeddings=X)

# Retrieve the 5 documents most relevant to the user query vector `v`
search_engine.search(v, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693',
  'question_text_vector': array([ 8.06286186e-02, -6.66388720e-02,  2.52730921e-02, -1.30044259e-02,
          7.58728012e-02, -5.94626516e-02, -2.18838528e-02,  2.90003093e-03,
          7.92887702e-04, -5.22236573e-03, -3.36518139e-02, -2.79135965e-02,
          5.81165105e-02,  3.97483185e-02,  5.44185676e-02, -3.82516533e-02,
          6.30573854e-02, -3.89698669e-02,  4.33604961e-04, -3.55844982e-02,
          1.75641582e-03

In [15]:
# Load the ground-truth dataset

import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
# Keep only the rows for the course whose documents were embedded above.
df_ground_truth = df_ground_truth.loc[df_ground_truth['course'] == 'machine-learning-zoomcamp']
# One dict per row.
ground_truth = df_ground_truth.to_dict(orient='records')

In [16]:
# Number of ground-truth questions kept for the selected course.
len(ground_truth)

1830

In [17]:
# Preview the first five ground-truth records.
ground_truth[0:5]

[{'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Can you provide a link to sign up?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Is there an FAQ for this Machine Learning course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Does this course have a GitHub repository for the sign-up link?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'How can I structure my questions and answers for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'}]

In [18]:
# Run the search engine on every ground-truth question and record, for each
# retrieved document, whether it is the document the question was generated from.

relevance_total = []

for q in tqdm(ground_truth):
    expected_id = q['document']
    query_vector = embedding_model.encode(q['question'])
    retrieved = search_engine.search(query_vector, num_results=5)
    relevance_total.append([d['id'] == expected_id for d in retrieved])

100%|█████████████████████████████████████████████████████| 1830/1830 [00:17<00:00, 106.34it/s]


In [19]:
# Preview the relevance flags of the first five queries (one boolean per retrieved document).
relevance_total[0:5]

[[False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False]]

In [20]:
# Hit-rate evaluation metric for the search engine
def hit_rate(relevance_total):
    """Return the fraction of queries whose results contain a relevant document.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            sequence of booleans, one per retrieved document, that is True where
            the retrieved document is the expected one.

    Returns:
        A float between 0 and 1. An empty `relevance_total` yields 0.0 instead of
        raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [21]:
# Number of evaluated queries.
len(relevance_total)

1830

In [22]:
# Hit rate of the in-memory vector search engine over the ground-truth questions.
hit_rate(relevance_total)

0.9398907103825137

### ans.4 0.93

In [23]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

index_name = "hw3-course-questions"

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # The three vector fields share one mapping definition:
            # 768-dimensional, indexed, compared with cosine similarity.
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 768,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in ("question_vector", "text_vector", "question_text_vector")
            },
        }
    },
}

# Delete the index first (ignoring the case where it does not exist), then create it anew.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'hw3-course-questions'})

In [24]:
# Index the documents, including the question_text_vector field used for vector search
for doc in tqdm(hw3_documents):
    es_client.index(index=index_name, document=doc)

# Make the freshly indexed documents visible to search right away; without an explicit
# refresh, a query issued immediately afterwards can miss recently indexed documents.
es_client.indices.refresh(index=index_name);

100%|████████████████████████████████████████████████████████| 375/375 [00:03<00:00, 96.33it/s]


In [25]:
# kNN search parameters: look up the 5 nearest neighbours of the query vector `v`
# in the `question_text_vector` field.
knn_query = {
    "field": "question_text_vector",
    "query_vector": v,
    "k": 5,
    "num_candidates": 10000, 
}

In [27]:
# Run the kNN search, returning only the listed source fields (the stored vectors are left out).
res = es_client.search(index=index_name, knn=knn_query, source=["text", "section", "question", "course", "id"])
res["hits"]["hits"]

[{'_index': 'hw3-course-questions',
  '_id': 'FfWIxpABwNnko1CMGwt3',
  '_score': 0.8253289,
  '_source': {'question': 'The course has already started. Can I still join it?',
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
   'id': 'ee58a693'}},
 {'_index': 'hw3-course-questions',
  '_id': 'GPWIxpABwNnko1CMGwuW',
  '_score': 0.7358538,
  '_source': {'question': 'I just joined. What should I do next? How can I access course materials?',
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Welcome to the course! Go

In [28]:
# Elasticsearch `_id` of the top-scoring hit. No `id=` was passed to `es_client.index`
# above, so this is an auto-generated value and will differ from run to run.
res["hits"]["hits"][0]['_id']

'FfWIxpABwNnko1CMGwt3'

### ans.5 'FfWIxpABwNnko1CMGwt3'

In [46]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10):
    """Run a course-filtered kNN search against the Elasticsearch index.

    Args:
        field: name of the dense-vector field to search on.
        vector: query embedding to compare against `field`.
        course: value the `course` field of a hit must match exactly.
        k: number of nearest neighbours to return (defaults to the previously
            hard-coded 5).
        num_candidates: number of candidates considered by the kNN search
            (defaults to the previously hard-coded 10).

    Returns:
        A list with the `_source` dict of each hit, in the order returned by
        Elasticsearch, limited to the text, section, question, course and id fields.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {"term": {"course": course}},
    }

    # Same keyword-argument form as the direct `es_client.search(...)` call used
    # elsewhere in this notebook, instead of a raw request `body`.
    es_results = es_client.search(
        index=index_name,
        knn=knn,
        source=["text", "section", "question", "course", "id"],
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [47]:
def question_text_vector_knn(q):
    """Embed the question of ground-truth record `q` and search `question_text_vector`.

    `q` must provide the keys 'question' and 'course'; the search is restricted
    to that course. Returns whatever `elastic_search_knn` returns.
    """
    question, course = q['question'], q['course']
    query_vector = embedding_model.encode(question)
    return elastic_search_knn('question_text_vector', query_vector, course)

In [48]:
def evaluate(ground_truth, search_function):
    """Score `search_function` against the ground-truth records.

    For every record, `search_function(record)` is called and each returned
    document is flagged according to whether its 'id' equals the record's
    'document'. Returns a dict with the resulting 'hit_rate'.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        expected_id = q['document']
        retrieved = search_function(q)
        relevance_total.append([d['id'] == expected_id for d in retrieved])

    # NOTE(review): an 'mrr' entry was commented out here; no `mrr` helper is
    # visible in this notebook, so only the hit rate is reported.
    metrics = {'hit_rate': hit_rate(relevance_total)}
    return metrics

In [49]:
# Evaluate Elasticsearch kNN retrieval on the ground-truth questions.
evaluate(ground_truth, question_text_vector_knn)

100%|██████████████████████████████████████████████████████| 1830/1830 [00:28<00:00, 64.17it/s]


{'hit_rate': 0.9393442622950819}