### Importing the necessary libraries

In [1]:
import json
import pandas as pd
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm
import warnings

# Silence every warning category for the rest of the session so that library
# notices do not clutter the cell outputs.
# NOTE(review): this is a blanket filter, so deprecation notices from the
# libraries imported above are hidden as well - consider narrowing it.
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange


### Data Loading - `documents_with_ids.json`

In [2]:
# Read the FAQ documents (each one already carries an `id` field) from disk.
# The file is opened in binary mode and the raw bytes are handed to the JSON parser.
with open("documents_with_ids.json", "rb") as file:
    documents = json.loads(file.read())

In [3]:
# peek at the first record to see which fields each document carries
documents[0]

{'course': 'data-engineering-zoomcamp',
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'id': '7000acaa'}

### Initialising and Populating Elasticsearch

In [4]:
# Create the Elasticsearch client for the local node and report whether it answers a ping.
es_client = Elasticsearch("http://localhost:9200")

print("Connected to ElasticSearch!" if es_client.ping() else "Connection Failed.")

Connected to ElasticSearch!


In [5]:
# Define the index schema and (re)create the index.

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # full-text fields
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # exact-match fields (used for filtering and for matching ground-truth ids)
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # One dense vector per embedded view of a document. `dims` has to equal
            # the length of the vectors written into these fields when the
            # documents are embedded later in the notebook.
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                # `dot_product` is the other common choice; Elasticsearch only
                # accepts it for unit-length vectors, in which case the ranking
                # matches cosine similarity.
                "similarity": "cosine",
            },
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

index_name = "course-questions"

# Drop a previous copy of the index (if any) so the notebook can be re-run from
# scratch. `index_name` is used for every call so that changing the name above
# cannot leave the exists/delete check pointing at a different index than the
# one being created.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
# With the index schema in place, create the vector embeddings that will be stored in it.

# Load a pretrained sentence-transformer model that turns questions and answer
# texts into vector embeddings.
model = SentenceTransformer("all-MiniLM-L12-v2")

# Add the three embeddings to every document in place, keeping the original
# fields. An explicit loop is used instead of a list comprehension because the
# work is a side effect on each dict, and tqdm shows progress for this slow step.
for doc in tqdm(documents, desc="Embedding documents"):
    question, text = doc["question"], doc["text"]
    doc["question_vector"] = model.encode(question)
    doc["text_vector"] = model.encode(text)
    doc["question_text_vector"] = model.encode(question + '-' + text)

In [7]:
# Sanity check: ask the cluster for every index/alias it knows about and
# print the index names, one per line.

indices = es_client.indices.get_alias(index="*")

for existing_index in indices:
    print(existing_index)

course-questions


In [8]:
# Populate the index with a single bulk request.
# The bulk API takes a flat list that alternates an action line with its payload;
# only the `index` action is used here (others include delete, update and create).
index = {'index': {'_index': index_name}}

operations = [item for doc in documents for item in (index, doc)]

# refresh=True makes the new documents visible to search as soon as the call
# returns, so the evaluation cells below never query a half-populated index.
response = es_client.bulk(operations=operations, refresh=True)

# A bulk request can succeed as a whole while individual items fail; surface
# that instead of silently evaluating against an incomplete index.
if response["errors"]:
    failed = [item["index"] for item in response["items"] if "error" in item["index"]]
    raise RuntimeError(
        f"{len(failed)} of {len(documents)} documents failed to index; "
        f"first error: {failed[0]['error']}"
    )

In [9]:
# search helper used by the evaluation functions below
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a course-filtered kNN search against the index and return the hits.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding to compare against ``field``.
        course: Value the ``course`` field must equal (applied as a term filter).
        k: Number of nearest neighbours to return. Defaults to 5.
        num_candidates: Number of candidates to consider during the kNN
            search. Defaults to 10000.

    Returns:
        A list with the ``_source`` of every hit, restricted to the fields
        ``text``, ``section``, ``question``, ``course`` and ``id``, in the
        order Elasticsearch returned them.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than ``num_candidates``.
    """
    if k < 1 or k > num_candidates:
        raise ValueError("k must satisfy 1 <= k <= num_candidates")

    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        # leave the stored vectors out of the response; only the readable fields are needed
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

### Evaluating the Elasticsearch retrieval against the ground-truth dataset (GTD) - Hit Rate and MRR

In [10]:
# Read the ground-truth questions (comma-separated, parsed with the Python engine)
# and show the first five rows.
ground_truth_df = pd.read_csv("ground-truth-data.csv", sep=',', engine='python')

ground_truth_df.head(n=5)

Unnamed: 0,Course,document_ID,Question
0,data-engineering-zoomcamp,7000acaa,When will the course start?
1,data-engineering-zoomcamp,7000acaa,What is the purpose of this document?
2,data-engineering-zoomcamp,7000acaa,How can I subscribe to the course public Googl...
3,data-engineering-zoomcamp,7000acaa,How can I register before the course starts?
4,data-engineering-zoomcamp,7000acaa,How can I join the course Telegram channel?


In [11]:
# Drop every row that has a missing value in any column; the frame is modified in place.
# (Author's note: there is 1 null row that needs to be removed.)
ground_truth_df.dropna(inplace=True)

In [12]:
# With the ground-truth dataset loaded, the retrieval system can be evaluated.
# First metric: hit rate - the share of queries for which the expected
# document shows up among the returned results.

def hit_rate_elasticserach(index_field:str, course:str, doc_id:str, query:str):
    """Return 1 if the expected document is among the search results, else 0.

    The query is embedded with ``model``, searched against ``index_field``
    (restricted to ``course``), and the ``id`` of every returned document is
    compared with ``doc_id``.
    """
    query_embedding = model.encode(query)
    retrieved = elastic_search_knn(index_field, vector=query_embedding, course=course)

    return int(any([doc['id'] == doc_id for doc in retrieved]))



In [13]:
# second metric: the reciprocal rank of the expected document for one query
def mrr_elasticsearch(index_field:str,course:str, doc_id:str, query:str) -> float:
    """Return the reciprocal rank of the expected document for a single query.

    The query is embedded with ``model`` and searched against ``index_field``
    (restricted to ``course``). Averaging this value over many queries gives
    the mean reciprocal rank (MRR).

    Args:
        index_field: Name of the dense-vector field to search.
        course: Course the search is restricted to.
        doc_id: ``id`` of the document that should be retrieved.
        query: Natural-language question to search for.

    Returns:
        ``1 / rank`` of the first result whose ``id`` equals ``doc_id``
        (rank 1 is the top result), or ``0.0`` if it is not in the results.
    """
    query_res = elastic_search_knn(index_field, vector=model.encode(query), course=course)

    # ranks are 1-based, so the top result contributes 1.0, the second 0.5, ...
    for rank, res in enumerate(query_res, start=1):
        if res['id'] == doc_id:
            return 1 / rank

    return 0.0

In [14]:
def evaluate(index_field, df):
    """Compute hit rate and mean reciprocal rank for one vector field.

    Every row of ``df`` is searched once. The row's reciprocal rank comes from
    ``mrr_elasticsearch``; a row counts as a hit exactly when that reciprocal
    rank is greater than zero, so no second search per row is needed.
    ``df`` itself is not modified.

    Args:
        index_field: Name of the dense-vector field to evaluate.
        df: Ground-truth frame with the columns ``Course``, ``document_ID``
            and ``Question``.

    Returns:
        A dict with the keys ``'hit rate for <index_field>'`` and
        ``'mrr for <index_field>'``. Both values are NaN when ``df`` has no rows.
    """
    recip_ranks = pd.Series(
        [
            mrr_elasticsearch(index_field, course, doc_id, question)
            for course, doc_id, question in zip(df['Course'], df['document_ID'], df['Question'])
        ],
        dtype="float64",
    )

    # the expected document was retrieved iff its reciprocal rank is non-zero
    hit_rate = (recip_ranks > 0).mean()
    mrr = recip_ranks.mean()

    return {
        f'hit rate for {index_field}' : float(hit_rate),
        f'mrr for {index_field}' : float(mrr)
    }

In [15]:
# score each of the three embedded views of a document against the ground truth
for index_field in ("question_vector", "text_vector", "question_text_vector"):
    field_metrics = evaluate(index_field, ground_truth_df)
    print(field_metrics)

{'hit rate for question_vector': 0.7293868921775899, 'mrr for question_vector': 0.6268745595489782}
{'hit rate for text_vector': 0.779492600422833, 'mrr for text_vector': 0.6500211416490486}
{'hit rate for question_text_vector': 0.8674418604651163, 'mrr for question_text_vector': 0.7651832276250881}
