# Question 1

In [1]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")

In [2]:
user_question = "I just discovered the course. Can I still join it?"

In [3]:
user_question_embedding = embedding_model.encode(user_question)

In [4]:
user_question_embedding[0]

0.07822261

# Question 2

In [5]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [6]:
filtered_documents = [doc for doc in documents if doc.get('course') == 'machine-learning-zoomcamp']

In [7]:
len(filtered_documents)

375

In [8]:
import numpy as np

In [9]:
embeddings = []

for doc in filtered_documents:
    question = doc['question']
    text = doc['text']
    qa_text = f'{question} {text}'
    embedding = embedding_model.encode(qa_text)
    embeddings.append(embedding)

X = np.array(embeddings)

In [10]:
X.shape

(375, 768)

# Question 3

In [11]:
v = user_question_embedding

In [12]:
scores = X.dot(v)

In [13]:
highest_score = np.max(scores)
highest_score

0.6506574

# Quesiton 4

In [14]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nOr you c

# Load the ground dataset

In [15]:
# import pandas as pd

# base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
# relative_url = '03-vector-search/eval/ground-truth-data.csv'
# ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# df_ground_truth = pd.read_csv(ground_truth_url)
# df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
# ground_truth = df_ground_truth.to_dict(orient='records')

In [16]:
import pandas as pd

# Manually download the file using requests
import requests

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/raw/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}'

response = requests.get(ground_truth_url)
with open('ground-truth-data.csv', 'wb') as f:
    f.write(response.content)

# Read the CSV file locally
df_ground_truth = pd.read_csv('ground-truth-data.csv')


In [17]:
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [18]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = search_engine.search(v, num_results=5) # elastic_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

NameError: name 'tqdm' is not defined

In [32]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [33]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [34]:
from tqdm.auto import tqdm

In [35]:
def evaluate(ground_truth, search_engine, num_results=5):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_engine.search(v, num_results=num_results)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [36]:
evaluate(ground_truth, search_engine)

  0%|          | 0/1830 [00:00<?, ?it/s]

{'hit_rate': 0.01366120218579235, 'mrr': 0.0062386156648451705}

In [37]:
# Create embeddings for filtered documents
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')
embeddings = []

for doc in filtered_documents:
    qa_text = f"{doc['question']} {doc['text']}"
    embedding = embedding_model.encode(qa_text)
    embeddings.append(embedding)

X = np.array(embeddings)

# Initialize search engine
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)

# Define evaluation functions
def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt += 1
    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score += 1 / (rank + 1)
                break
    return total_score / len(relevance_total)

# Define evaluate function
def evaluate(ground_truth, search_engine, embedding_model, num_results=5):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        query_embedding = embedding_model.encode(q['question'])
        results = search_engine.search(query_embedding, num_results=num_results)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Perform evaluation
evaluation_results = evaluate(ground_truth, search_engine, embedding_model, num_results=5)
print("Hit Rate:", evaluation_results['hit_rate'])
print("MRR:", evaluation_results['mrr'])


# Inspect some search results to debug
query_example = ground_truth[0]['question']
query_embedding = embedding_model.encode(query_example)
search_results = search_engine.search(query_embedding, num_results=5)
print(f"Query: {query_example}")
for result in search_results:
    print(f"Document ID: {result['id']}, Text: {result['text'][:100]}...")



  0%|          | 0/1830 [00:00<?, ?it/s]

Hit Rate: 0.9398907103825137
MRR: 0.8502823315118397
Query: Where can I sign up for the course?
Document ID: 0a278fb2, Text: Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going t...
Document ID: 39fda9f0, Text: The course videos are pre-recorded, you can start watching the course right now.
We will also occasi...
Document ID: ee58a693, Text: Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the ...
Document ID: 4930aa19, Text: Here’s how you join a in Slack: https://slack.com/help/articles/205239967-Join-a-channel
Click “All ...
Document ID: 67e2fd13, Text: Approximately 4 months, but may take more if you want to do some extra activities (an extra project,...


# Q5. Indexing with Elasticsearch

In [20]:
from elasticsearch import Elasticsearch

In [21]:
es_client = Elasticsearch('http://localhost:9200')

In [22]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 375,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 375,
                "index": True,
                "similarity": "cosine"
            },
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 375,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

# Delete index if exists and create a new one
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [26]:
len(df_ground_truth)

1830

In [39]:
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
from tqdm.auto import tqdm
from elasticsearch import Elasticsearch
import requests

# Initialize Elasticsearch
es_client = Elasticsearch('http://localhost:9200')

# Adjust index settings for 768 dimensions
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

# Delete index if exists and create a new one
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

# Index documents in Elasticsearch with partial debug information
for doc, embedding in tqdm(zip(filtered_documents, X), total=len(filtered_documents)):
    doc_body = {
        "text": doc['text'],
        "section": doc.get('section', ''),
        "question": doc['question'],
        "course": doc['course'],
        "id": doc['id'],
        "question_vector": embedding.tolist(),
        "text_vector": embedding.tolist(),  # Assuming the same embedding for simplicity
        "question_text_vector": embedding.tolist()  # Assuming the same embedding for simplicity
    }

    try:
        es_client.index(index=index_name, body=doc_body)
    except Exception as e:
        print(f"Error indexing document ID {doc['id']}: {e}")
        print(f"Document text: {doc['text'][:100]}...")
        print(f"Document question: {doc['question']}")
        print(f"Embedding length: {len(embedding)}")

# Define search function
def elastic_search_knn(field, vector, course):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    result_docs = []

    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

# Perform search for a specific query
query_example = ground_truth[0]['question']
query_embedding = embedding_model.encode(query_example).tolist()
search_results = elastic_search_knn("question_vector", query_embedding, "machine-learning-zoomcamp")

# Print the ID of the document with the highest score
print(f"Query: {query_example}")
print(f"Top Document ID: {search_results[0]['id']}")


  0%|          | 0/375 [00:00<?, ?it/s]

Query: Where can I sign up for the course?
Top Document ID: 0a278fb2


# Question 6

In [38]:
def evaluate_elastic(ground_truth, es_client, embedding_model, num_results=5):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        query_embedding = embedding_model.encode(q['question']).tolist()
        results = elastic_search_knn("question_vector", query_embedding, q['course'])
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Perform evaluation using Elasticsearch
evaluation_results_elastic = evaluate_elastic(ground_truth, es_client, embedding_model, num_results=5)
print("Elastic Hit Rate:", evaluation_results_elastic['hit_rate'])
print("Elastic MRR:", evaluation_results_elastic['mrr'])

  0%|          | 0/1830 [00:00<?, ?it/s]

Elastic Hit Rate: 0.9398907103825137
Elastic MRR: 0.8502823315118397
