## Question 1

In [1]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
user_question = "I just discovered the course. Can I still join it?"

res = embedding_model.encode(user_question)

res[0]

0.07822266

## Question 2

In [3]:
import numpy as np

In [4]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [5]:
documents_list = []

for course_dict in documents:
    if course_dict['course'] == "machine-learning-zoomcamp":
        documents_list.append(course_dict)

In [6]:
len(documents_list)

375

In [7]:
documents_list[1]

{'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
 'section': 'General course-related questions',
 'question': 'Is it going to be live? When?',
 'course': 'machine-learning-zoomcamp',
 'id': '39fda9f0'}

In [8]:
#created the dense vector using the pre-trained model
embeddings = []
for doc in documents_list:
    # create an embedding for both question and answer fields
    qa_text = f"{doc['question']} {doc['text']}"
    embeddings.append(embedding_model.encode(qa_text))

In [9]:
X = np.array(embeddings)
X.shape

(375, 768)

## Question 3

In [10]:
v = embedding_model.encode('I just discovered the course. Can I still join?')
scores = X.dot(v)

max(scores)

0.6456067

## Question 4

In [18]:
from tqdm.auto import tqdm

In [13]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

In [12]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [24]:
ground_truth[4]

{'question': 'How can I structure my questions and answers for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [16]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [25]:
search_engine = VectorSearchEngine(documents=documents, embeddings=X)

In [26]:
def question_vector(q):
    question = q['question']
    course = q['course']
    
    v_q = embedding_model.encode(question)
    # search_engine.search(v, num_results=5)
    return  search_engine.search(v_q, num_results=5)

In [87]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total)
    }

In [114]:
evaluate(documents_list, question_vector)

100%|██████████| 375/375 [00:10<00:00, 34.96it/s]


{'hit_rate': 0.0}

## Question 5

In [30]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': 'a0d5d8c2e59a', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'LZSU30K0SGWVXclN8z184A', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [62]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "question_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

In [63]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [64]:
documents_list[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872',
 'text_vector': [0.01943064294755459,
  -0.00365401990711689,
  0.04042728990316391,
  0.06813301146030426,
  0.07546192407608032,
  -0.06538787484169006,
  -0.07440778613090515,
  0.04882429540157318,
  0.012547523714601994,
  -0.0032986134756356478,
  -0.013346663676202297,
  -0.0361684150993824,
  0.028790242969989777,
  -0.06288651376962662,
  0.0243467316031456,
  -0.004165361635386944,
  0.015515316277742386,
  -0.09215954691171646,


In [65]:
#created the dense vector using the pre-trained model
operations = []
for doc in documents_list:
    # Transforming the title into an embedding using the model
    doc["text_vector"] = embedding_model.encode(doc["text"]).tolist()
    operations.append(doc)

In [66]:
for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [73]:

vector_search_term = embedding_model.encode(user_question)

knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

response = es_client.search(
    index=index_name,
    query={
        "match": {"section": "General course-related questions"},
    },
    knn=knn_query,
    size=5
)
response["hits"]["hits"][0]

{'_index': 'course-questions',
 '_id': 'NaSgxpABTC1PeLII4jOn',
 '_score': 12.367736,
 '_source': {'text': 'The course is available in the self-paced mode too, so you can go through the materials at any time. But if you want to do it as a cohort with other students, the next iterations will happen in September 2023, September 2024 (and potentially other Septembers as well).',
  'section': 'General course-related questions',
  'question': 'When does the next iteration start?',
  'course': 'machine-learning-zoomcamp',
  'id': '636f55d5',
  'text_vector': [0.014466939494013786,
   -0.014389874413609505,
   0.03587263822555542,
   -0.003268664935603738,
   0.06494203954935074,
   -0.04171321913599968,
   0.021197177469730377,
   0.06547830998897552,
   -0.027138356119394302,
   0.03279392048716545,
   -0.044186558574438095,
   -0.05132470279932022,
   0.07049799710512161,
   -0.025836152955889702,
   0.030564289540052414,
   0.024795833975076675,
   0.046714551746845245,
   -0.0986777245998

## Question 6

In [74]:
def elastic_search_knn(field, vector, course):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [111]:
def question_vector_knn(q):
    question = q['question']
    course = q['course']

    v_q = embedding_model.encode(question)
    return elastic_search_knn('text_vector', v_q, course)

In [112]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total)
    }

In [113]:
evaluate(documents_list, question_vector_knn)

100%|██████████| 375/375 [00:36<00:00, 10.34it/s]


{'hit_rate': 0.8266666666666667}