In [1]:
from elasticsearch import Elasticsearch
import json
import sys,os,os.path
from openai import OpenAI

# Use Embedding to Support Semantic Search 

## Gather Dataset

In [2]:
with open('documents.json','rt') as f:
    docs_raw = json.load(f)

In [3]:
documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [4]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp'}

## Create Embeddings using Pretrained Models

In [5]:
#!pip install sentence_transformers==2.7.0

In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
#!pip uninstall numpy
#!pip uninstall torch
#!pip install numpy = 1.26.4
#!pip install torch

In [8]:
model = SentenceTransformer('all-mpnet-base-v2')



In [9]:
len(model.encode('encode this word'))

768

In [10]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [11]:
operations = []
for doc in documents:
    doc['text_vector'] = model.encode(doc['text'])
    operations.append(doc)

In [12]:
operations[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': array([-4.10303585e-02,  2.58341245e-02, -3.68019156e-02, -2.08983012e-02,
        -2.05962416e-02,  9.35374666e-03, -3.33163445e-03, -9.49193351e-03,
         3.01179346e-02,  1.90821160e-02,  1.26900887e-02, -1.70788169e-02,
        -1.63238146e-03,  1.29972577e-01,  3.09692957e-02, -2.58236825e-02,
         2.78230421e-02,  2.51597390e-02, -8.08122009e-02, -3.61742009e-03,
        -8.90200026e-03,  3.40489321e-03, -2.30093114e-02, -3.40453349e-02,
         2.45986190e-02,  1.35456128e-02, -2.54389830e-02,  1.19510842e-02,
        -2.05401015e-02, -1.00774709e-02,  2.05753352e-02,  4.38897982e-02,
         1.43946512e-02,  1.82718430e-02,  1.71625845e-06, -1.92925166e-02,
        -1.57902297e-02,  8.83050449e-03,  4.14086170e-02,  1.7854562

## Setup ElasticSearch Connection

In [13]:
from elasticsearch import Elasticsearch

In [14]:
es_client = Elasticsearch('http://127.0.0.1:9200')

In [15]:
es_client.info()

ObjectApiResponse({'name': '741e032df576', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'V2xgg5AYRgm0oJyx5TrxaA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

## Create Mappings and Index

In [16]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "text_vector":{"type":"dense_vector","dims":768,"index":True,"similarity":"cosine"}
        }
    }
}
index_name = 'semantic_course_question'
es_client.indices.delete(index=index_name,ignore_unavailable=True)
es_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'semantic_course_question'})

## Add Index to Your Document

In [17]:
for doc in operations:
    try:
        es_client.index(index=index_name,document=doc)
    except:
        print(e)

In [18]:
query = 'can I use windows in this course'
query_vector = model.encode(query)

In [19]:
query = {
    "field":"text_vector",
    "query_vector":query_vector,
    "k":5,
    "num_candidates":10000
}

In [20]:
results = es_client.search(index=index_name,knn=query,source=["text","section","question","course"])

In [21]:
results = es_client.search(
    index=index_name,
    knn=query,
    query={
        "match":{
            "course":"data-engineering-zoomcamp"
        }
    },
    source=["text","section","question","course"],
    explain=True)
results["hits"]["hits"]

[{'_shard': '[semantic_course_question][0]',
  '_node': 'J_B72dnxRmi8U-nrbbH2jQ',
  '_index': 'semantic_course_question',
  '_id': 'FarKNZEBy-8yx8j45kgs',
  '_score': 1.5656364,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'},
  '_explanation': {'value': 1.5656364,
   'description': 'sum of:',
   'details': [{'value': 0.8134191,
     'description': 'within top k documents',
     'details': []},
    {'value': 0.7522173,
     'description': 'weight(course:data-engineering-zoomcamp in 13) [PerFieldSimilarity], result of:',
     'details': [{'value': 0.7522173,
       'description': 'score(freq=1.0), computed as boost * idf * tf from:',
       'details': [{'value': 2.2, 'description': 'boost', 'details': []},
        {'value': 0.7522173,
 

# Ground Truth Dataset

In [35]:
with open('documents.json','rt') as f:
    docs_raw = json.load(f)

In [36]:
documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

## Add Id Columns

### Simple Id

In [26]:
n = len(documents)
for i in range(n):
    documents[i]['id']=i

In [27]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': 1}

### Hash Id with course, question and text

In [37]:
import hashlib

def generate_document_id(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [38]:
for doc in documents:
    doc['id']=generate_document_id(doc)

In [39]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '0bbf41ec'}

In [41]:
from collections import defaultdict

In [43]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [44]:
len(hashes),len(documents)

(947, 948)

In [45]:
for k,values in hashes.items():
    if len(values) > 1:
        print(k,len(values))

593f7569 2


In [46]:
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

duplicate detected

In [55]:
with open('documents-with-ids.json','wt') as f:
    json.dump(documents,f,indent=2)

## Generate Question

In [7]:
api_key = os.environ['OPEN_AI_API_KEY']
client = OpenAI(api_key=api_key)

In [54]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [66]:
api_key = os.environ['OPEN_AI_API_KEY']
client = OpenAI(api_key=api_key)
def llm_open_api(prompt):
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(model='gpt-3.5-turbo-16k',messages = [{'role':'user',"content":prompt}])
    return response.choices[0].message.content

In [86]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    response = client.chat.completions.create(model='gpt-3.5-turbo-16k',messages = [{'role':'user',"content":prompt}])
    json_response = response.choices[0].message.content
    return json_response

In [77]:
prompt = prompt_template.format(**documents[0])
response = client.chat.completions.create(model='gpt-3.5-turbo-16k',messages = [{'role':'user',"content":prompt}])
json_response = response.choices[0].message.content

In [80]:
print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the course Telegram channel with announcements.
Don’t forget to register in DataTalks.Club's Slack and join the channel.

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [79]:
print(json_response)

[
  "When will the course start?",
  "How can I subscribe to the course public Google Calendar?",
  "What should I do before the course starts?",
  "How can I join the course Telegram channel?",
  "Where should I register and join the channel for DataTalks.Club's Slack?"
]


In [82]:
from tqdm.auto import tqdm

In [87]:
results = {}
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue
    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/948 [00:00<?, ?it/s]

In [91]:
results

{'c02e79ef': '["When will the course start?", "How do I subscribe to the course Google Calendar?", "What should I do before the course starts?", "How can I join the course Telegram channel?", "Where should I register and join the channel in Slack?"]',
 '1f6520ca': '["What are the prerequisites for this course?", "Where can I find the prerequisites for this course?", "Are there any prerequisites for this course?", "Can you tell me the requirements for this course?", "What are the necessary qualifications to take this course?"]',
 '7842b56a': '[\n  "Can I join the course after the start date?",\n  "Am I eligible to submit the homeworks if I don\'t register?",\n  "Are there deadlines for turning in the final projects?",\n  "What should I be aware of if I don\'t register?",\n  "Should I leave everything for the last minute?"\n]',
 '0bbf41ec': '["When can I expect to receive the confirmation email?",\n "Do I need to register for the course?",\n "Is registration necessary to start learning a

In [92]:
with open('results.json','wt') as f:
    json.dump(results,f)

In [88]:
import pickle

In [93]:
#with open('result.bin','rb') as f_in:
#    results = pickle.load(f_in)

In [112]:
parsed_results = {}
for doc_id, json_questions in results.items():
    json_questions = json_questions.strip()
    
    # Check if the json_questions is a JSON object
    if json_questions.startswith('{'):
        # Parse as a JSON object and extract the questions list
        try:
            questions = json.loads(json_questions).get("questions", [])
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON object for doc_id {doc_id}: {e}")
            questions = []
    # Check if it's a single JSON array
    elif json_questions.startswith('['):
        try:
            # Parse the whole JSON array
            questions = json.loads(json_questions)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON array for doc_id {doc_id}: {e}")
            questions = []
    else:
        # If not a JSON object or a single JSON array, parse line by line
        try:
            questions_list = json_questions.split('\n')
            questions = [json.loads(question)[0] for question in questions_list]
        except json.JSONDecodeError as e:
            print(f"Error parsing line by line for doc_id {doc_id}: {e}")
            questions = []

    # Store the parsed questions in the parsed_results dictionary
    parsed_results[doc_id] = questions

Error parsing JSON array for doc_id c9375c56: Extra data: line 2 column 1 (char 68)
Error parsing JSON array for doc_id 593a85ba: Expecting ',' delimiter: line 3 column 66 (char 240)
Error parsing JSON array for doc_id 50bd1a71: Expecting ',' delimiter: line 1 column 106 (char 105)
Error parsing JSON array for doc_id cd0f9300: Expecting ',' delimiter: line 1 column 183 (char 182)
Error parsing JSON array for doc_id c91ad8f2: Invalid \escape: line 1 column 61 (char 60)
Error parsing JSON array for doc_id aa6f52b8: Extra data: line 3 column 1 (char 93)
Error parsing JSON array for doc_id 29f84a82: Extra data: line 2 column 1 (char 92)
Error parsing JSON array for doc_id 5a712a20: Expecting ',' delimiter: line 1 column 82 (char 81)
Error parsing JSON array for doc_id f3adb937: Extra data: line 2 column 1 (char 101)
Error parsing line by line for doc_id 0952abde: Extra data: line 1 column 2 (char 1)
Error parsing JSON array for doc_id 6e1a0834: Extra data: line 3 column 1 (char 98)
Error p

In [113]:
parsed_results

{'c02e79ef': ['When will the course start?',
  'How do I subscribe to the course Google Calendar?',
  'What should I do before the course starts?',
  'How can I join the course Telegram channel?',
  'Where should I register and join the channel in Slack?'],
 '1f6520ca': ['What are the prerequisites for this course?',
  'Where can I find the prerequisites for this course?',
  'Are there any prerequisites for this course?',
  'Can you tell me the requirements for this course?',
  'What are the necessary qualifications to take this course?'],
 '7842b56a': ['Can I join the course after the start date?',
  "Am I eligible to submit the homeworks if I don't register?",
  'Are there deadlines for turning in the final projects?',
  "What should I be aware of if I don't register?",
  'Should I leave everything for the last minute?'],
 '0bbf41ec': ['When can I expect to receive the confirmation email?',
  'Do I need to register for the course?',
  'Is registration necessary to start learning and 

In [114]:
doc_index = {d['id']: d for d in documents}

In [115]:
doc_index

{'c02e79ef': {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 '1f6520ca': {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 '7842b56a': {'text': "Yes, even if you don't register, you're still eligibl

In [117]:
final_results = []

for doc_id,questions in parsed_results.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q,course,doc_id))

In [118]:
final_results[:10]

[('When will the course start?', 'data-engineering-zoomcamp', 'c02e79ef'),
 ('How do I subscribe to the course Google Calendar?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('What should I do before the course starts?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('How can I join the course Telegram channel?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('Where should I register and join the channel in Slack?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('What are the prerequisites for this course?',
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ('Where can I find the prerequisites for this course?',
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ('Are there any prerequisites for this course?',
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ('Can you tell me the requirements for this course?',
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ('What are the necessary qualifications to take this course?',
  'data-engineering-zoomcamp',
  '1f6520ca')]

In [119]:
import pandas as pd

In [124]:
df = pd.DataFrame(final_results,columns=['question','course','document'])

In [125]:
df

Unnamed: 0,question,course,document
0,When will the course start?,data-engineering-zoomcamp,c02e79ef
1,How do I subscribe to the course Google Calendar?,data-engineering-zoomcamp,c02e79ef
2,What should I do before the course starts?,data-engineering-zoomcamp,c02e79ef
3,How can I join the course Telegram channel?,data-engineering-zoomcamp,c02e79ef
4,Where should I register and join the channel i...,data-engineering-zoomcamp,c02e79ef
...,...,...,...
4309,How can I destroy infrastructure created using...,mlops-zoomcamp,886d1617
4310,What is the solution for destroying infrastruc...,mlops-zoomcamp,886d1617
4311,How do I configure terraform init to destroy i...,mlops-zoomcamp,886d1617
4312,What command should I use to destroy infrastru...,mlops-zoomcamp,886d1617


In [126]:
df.to_csv('ground_truth_dataset.csv',index=False)

# Evaluation Text Retrieval Technique

In [5]:
from elasticsearch import Elasticsearch
import json
import sys,os,os.path
from openai import OpenAI
import pandas as pd

In [6]:
es_client = Elasticsearch('http://127.0.0.1:9200')

In [7]:
es_client.info()

ObjectApiResponse({'name': 'e3b5a3a180fc', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'ByIDpcoFT-Of9FI7ZHMA5Q', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [16]:
with open('documents-with-ids.json','rt') as f:
    documents = json.load(f)

In [17]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [21]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id":{"type":"keyword"}
        }
    }
}
index_name = 'text_retrieval'
es_client.indices.delete(index=index_name,ignore_unavailable=True)
es_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'text_retrieval'})

In [23]:
from tqdm.auto import tqdm
for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [78]:
def elastic_query(query,course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name,body=search_query)
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs

In [35]:
elastic_query(query = "Can I use windows to join this course?",course="data-engineering-zoomcamp")

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on your computer will make it easy for you to access the instructors’ code and make pull requests (if you want to add your own notes or make changes to the course content).\nYou will probably also create your own repositories that host your notes, versions of your file, to do this. Here is a great tutorial that shows you how to do this: https://www.atlassia

In [8]:
df_ground_truth = pd.read_csv('ground_truth_dataset.csv')

In [11]:
df_ground_truth.head(5)

Unnamed: 0,question,course,document
0,When will the course start?,data-engineering-zoomcamp,c02e79ef
1,How do I subscribe to the course Google Calendar?,data-engineering-zoomcamp,c02e79ef
2,What should I do before the course starts?,data-engineering-zoomcamp,c02e79ef
3,How can I join the course Telegram channel?,data-engineering-zoomcamp,c02e79ef
4,Where should I register and join the channel i...,data-engineering-zoomcamp,c02e79ef


In [36]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [39]:
ground_truth[1]

{'question': 'How do I subscribe to the course Google Calendar?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [49]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_query(query=q['question'],course=q['course'])
    relevance = [d['id']== doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4314 [00:00<?, ?it/s]

In [55]:
relevance_total

[[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [True, False, False, False, False],
 [False, False, False, False,

- **Hit-Rate (recall)** : Number of relevant document retrieved out of total number of relevant document
- **Mean Reciprocal Rank (MRR)**

In [59]:
def hit_rate(relevance_total):
    cnt= 0
    for line in relevance_total:
        if True in line:
            cnt=cnt+1
    return cnt/len(relevance_total)

In [61]:
def mrr(relevance_total):
    total_score= 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank]==True:
                total_score = total_score+1/(rank+1)
    return total_score/len(relevance_total)

In [64]:
hit_rate(relevance_total),mrr(relevance_total)

(0.6710709318497914, 0.5499575027043733)

### Using Minsearch

In [66]:
import minsearch

In [67]:
index = minsearch.Index(
    text_fields = {'question','text','section'},
    keyword_fields=['course']
)
index.fit(documents)
boost = {'question':3.0, 'section':0.5}

In [69]:
def minsearch_search(query,course):
    boost = {'question':3.0, 'section':0.5}
    search_results = index.search(
        query=query,
        boost_dict = boost,
        filter_dict = {'couse':course},
        num_results=5
    )
    return search_results

In [70]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'],course=q['course'])
    relevance = [d['id']== doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4314 [00:00<?, ?it/s]

In [71]:
hit_rate(relevance_total),mrr(relevance_total)

(0.6270282800185443, 0.5380041724617518)

In [72]:
def evaluate (ground_truth,search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id']== doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate':hit_rate(relevance_total),
        'mrr':mrr(relevance_total)
    }

In [74]:
evaluate(ground_truth,lambda q:elastic_query(q['question'],q['course']))

  0%|          | 0/4314 [00:00<?, ?it/s]

{'hit_rate': 0.6710709318497914, 'mrr': 0.5499575027043733}

In [75]:
evaluate(ground_truth,lambda q:minsearch_search(query=q['question'],course=q['course']))

  0%|          | 0/4314 [00:00<?, ?it/s]

{'hit_rate': 0.6270282800185443, 'mrr': 0.5380041724617518}

In [77]:
evaluate(ground_truth,lambda q:elastic_query(q['question'],q['course']))

  0%|          | 0/4314 [00:00<?, ?it/s]

{'hit_rate': 0.5725544738062124, 'mrr': 0.48751352186679003}

## Evaluation with Vector Search

In [99]:
from sentence_transformers import SentenceTransformer

In [100]:
model = SentenceTransformer('all-mpnet-base-v2')



In [81]:
operations = []
for doc in tqdm(documents):
    doc['text_vector'] = model.encode(doc['text'])
    operations.append(doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [101]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "text_vector":{"type":"dense_vector","dims":768,"index":True,"similarity":"cosine"}
        }
    }
}
index_name = 'semantic_course_question'
es_client.indices.delete(index=index_name,ignore_unavailable=True)
es_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'semantic_course_question'})

In [102]:
for doc in operations:
    try:
        es_client.index(index=index_name,document=doc)
    except:
        print(e)

In [105]:
def vector_search(query,course):
    query_vector = model.encode(query)
    query_setting = {
    "field":"text_vector",
    "query_vector":query_vector,
    "k":5,
    "num_candidates":10000
}
    response = es_client.search(
        index=index_name,
        knn=query_setting,
        query={
            "match":{
                "course":course
            }
        })
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs

In [106]:
vector_search(query = "Can I use windows to join this course?",course="data-engineering-zoomcamp")

[{'text': "It's up to you which platform and environment you use for the course.\nGithub codespaces or GCP VM are just possible options, but you can do the entire course from your laptop.",
  'section': 'General course-related questions',
  'question': 'Environment - Do we really have to use GitHub codespaces? I already have PostgreSQL & Docker installed.',
  'course': 'data-engineering-zoomcamp',
  'id': '251218fc',
  'text_vector': [-0.023726243525743484,
   -0.03184004873037338,
   -0.025577044114470482,
   0.011106034740805626,
   0.0005554416566155851,
   0.022152984514832497,
   0.07371804118156433,
   0.011269566603004932,
   0.01180771179497242,
   0.013761002570390701,
   0.01555496547371149,
   0.034056372940540314,
   -0.017209680750966072,
   0.06464901566505432,
   0.05755431205034256,
   -0.05429821088910103,
   0.0006437752163037658,
   0.013905134983360767,
   -0.06946833431720734,
   0.008833124302327633,
   0.06048642471432686,
   0.004764921497553587,
   -0.062804132

In [107]:
evaluate(ground_truth,lambda q:vector_search(q['question'],q['course']))

  0%|          | 0/4314 [00:00<?, ?it/s]

{'hit_rate': 0.7114047287899861, 'mrr': 0.6048460714837628}

### question, text, and question text encoding

In [110]:
for doc in tqdm(documents):
    qt = doc['question']+doc['text']
    doc['text_vector'] = model.encode(doc['text'])
    doc['question_vector'] = model.encode(doc['question'])
    doc['question_text_vector'] = model.encode(qt)

  0%|          | 0/948 [00:00<?, ?it/s]

In [118]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
             "text_vector":{"type":"dense_vector","dims":768,"index":True,"similarity":"cosine"},
            "question_vector":{"type":"dense_vector","dims":768,"index":True,"similarity":"cosine"},
            "question_text_vector":{"type":"dense_vector","dims":768,"index":True,"similarity":"cosine"}
        }
    }
}
index_name = 'question_text_encoding'
es_client.indices.delete(index=index_name,ignore_unavailable=True)
es_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'question_text_encoding'})

In [119]:
for doc in tqdm(documents):
    try:
        es_client.index(index=index_name,document=doc)
    except:
        print(e)

  0%|          | 0/948 [00:00<?, ?it/s]

In [124]:
def vector_search_knn(field,vector,course):
    query_setting = {
    "field":field,
    "query_vector": vector,
    "k":5,
    "num_candidates":10000}
    response = es_client.search(
        index=index_name,
        knn=query_setting,
        query={
            "match":{
                "course":course
            }
        })
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs

In [125]:
query = "Can I use windows to join this course?"
field = 'question_vector'
v_q = model.encode(query)
vector_search_knn(field,v_q,'data-engineering-zoomcamp')

[{'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
  'section': 'General course-related questions',
  'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
  'course': 'data-engineering-zoomcamp',
  'id': '7c700adb',
  'text_vector': [-0.02696543186903,
   -0.0006260390509851277,
   -0.016629500314593315,
   0.052851445972919464,
   0.054765306413173676,
   -0.0313398651778698,
   0.02994263358414173,
   -0.04808565601706505,
   0.04467547684907913,
   0.00583947217091918,
   0.016233114525675774,
   0.012001129798591137,
   -0.031222324818372726,
   0.016600564122200012,
   -0.04886898025870323,
   -0.06496302783489227,
   0.046434223651885986,
   -0.009297681972384453,
   -0.0642528161406517,
   -0.01373268011957407,
   -0.015976207330822945,
   0.008629539981484413,
   -0.02447897382080555,
   -0.005980636924505234,
   0.016313808038830757,
   -0.02634182572364807,
   -0.07652203738689423,
   

In [135]:
def question_vector_knn(q):
    question = q['question']
    course = q['course']
    v_q = model.encode(question)
    return vector_search_knn('question_vector', v_q, course)

In [136]:
def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)  # Ensure the function is called with argument `q`
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total)
    }

In [137]:
evaluate(ground_truth, lambda q: question_vector_knn(q))

  0%|          | 0/4314 [00:00<?, ?it/s]

{'hit_rate': 0.650904033379694, 'mrr': 0.5699878947097307}

In [140]:
def text_vector_knn(q):
    question = q['question']
    course = q['course']
    v_q = model.encode(question)
    return vector_search_knn('text_vector', v_q, course)

In [141]:
evaluate(ground_truth, lambda q: text_vector_knn(q))

  0%|          | 0/4314 [00:00<?, ?it/s]

{'hit_rate': 0.7111729253592953, 'mrr': 0.6050784268273838}

In [142]:
def question_text_vector_knn(q):
    question = q['question']
    course = q['course']
    v_q = model.encode(question)
    return vector_search_knn('question_text_vector', v_q, course)

In [143]:
evaluate(ground_truth, lambda q: question_text_vector_knn(q))

  0%|          | 0/4314 [00:00<?, ?it/s]

{'hit_rate': 0.7911451089476125, 'mrr': 0.6975807816558861}