### Importing the necessary libraries

In [1]:
import json
import pandas as pd
from elasticsearch import Elasticsearch
import minsearch
from tqdm.auto import tqdm
import warnings

warnings.filterwarnings("ignore")

### Data Loading

In [2]:
# let's start by loading our documents ids file
with open("documents_with_ids.json", "rb") as file:
    documents = json.load(file)

In [3]:
documents[0]

{'course': 'data-engineering-zoomcamp',
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'id': '7000acaa'}

### Initialising and Populating ElasticSearch

In [4]:
# remember to run `sudo chown -R 1000:1000 /tmp/elasticsearch_data` after docker-compose up
es_client = Elasticsearch("http://localhost:9200")

if es_client.ping():
    print("Connected to ElasticSearch!")
else:
    print("Connection Failed.")

Connected to ElasticSearch!


In [5]:
# now to create a new index as well as defining the index settings

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

index_name = "course-questions"

# lets delete and create a new index if it exists for ease of re-runs
if es_client.indices.exists(index="course-questions"):
    es_client.indices.delete(index="course-questions")

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
# next we simply have to populate our index using the bulk method
# this method is typically used as a pipeline, hence we need to mention `index` as action 
# you can define multiple 
index = {'index':{
    '_index': index_name}
    }

operations = [item for doc in documents for item in (index, doc)]

es_client.bulk(operations=operations)

ObjectApiResponse({'took': 72, 'errors': False, 'items': [{'index': {'_index': 'course-questions', '_id': 'MfSeZ5IBV2uAZK5sC9w2', '_version': 1, 'result': 'created', '_shards': {'total': 1, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'course-questions', '_id': 'MvSeZ5IBV2uAZK5sC9w2', '_version': 1, 'result': 'created', '_shards': {'total': 1, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'course-questions', '_id': 'M_SeZ5IBV2uAZK5sC9w2', '_version': 1, 'result': 'created', '_shards': {'total': 1, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'course-questions', '_id': 'NPSeZ5IBV2uAZK5sC9w2', '_version': 1, 'result': 'created', '_shards': {'total': 1, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'course-questions', '_id': 'NfSeZ5IBV2uAZK5sC9w2', '_version': 1,

In [7]:
# now to see if the index was indeed created

indices = es_client.indices.get_alias(index="*")

for index in indices:
    print(index)

course-questions


In [8]:
# last step for this function would be to define the search query function for elasticsearch
def elasticsearch_query(query, course):
    seach_query = {
        "size": 5, # five closest responses to query
        "query": {
            "bool":{
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["questions^3","text","section"], # giving more importance to questions
                        "type": "best_fields"
                    }
                },
                "filter":{
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name,
                                body=seach_query)
    
    result_docs = [hit for hit in response['hits']['hits']]

    return result_docs

### Evaluating `elasticsearch` search engine using GTD - Hit Rate and MRR

In [9]:
# loading the ground truth dataset
ground_truth_df = pd.read_csv("ground-truth-data.csv", delimiter=',', engine='python')

ground_truth_df.head()

Unnamed: 0,Course,document_ID,Question
0,data-engineering-zoomcamp,7000acaa,When will the course start?
1,data-engineering-zoomcamp,7000acaa,What is the purpose of this document?
2,data-engineering-zoomcamp,7000acaa,How can I subscribe to the course public Googl...
3,data-engineering-zoomcamp,7000acaa,How can I register before the course starts?
4,data-engineering-zoomcamp,7000acaa,How can I join the course Telegram channel?


In [10]:
ground_truth_df.dropna(inplace=True)

In [11]:
# now that we have loaded our ground truth dataset, we can evaluate our retrieval system
# lets proceed to define a funtion that returns the first metric, hit rate
# hit rate is the percentage of successful results where the system returns at least one relevant document

def hit_rate_elasticserach(course:str, doc_id:str, query:str):
    query_res = elasticsearch_query(query=query, course=course)

    hit_rate = [True if res["_source"]['id'] == doc_id else False for res in query_res]

    if any(hit_rate):
        return 1
    else:
        return 0


In [12]:
ground_truth_df['hit_rate_boolean'] = ground_truth_df.apply(lambda x : hit_rate_elasticserach(x['Course'],x['document_ID'],x['Question']), axis=1)

ground_truth_df.head()

Unnamed: 0,Course,document_ID,Question,hit_rate_boolean
0,data-engineering-zoomcamp,7000acaa,When will the course start?,0
1,data-engineering-zoomcamp,7000acaa,What is the purpose of this document?,0
2,data-engineering-zoomcamp,7000acaa,How can I subscribe to the course public Googl...,0
3,data-engineering-zoomcamp,7000acaa,How can I register before the course starts?,0
4,data-engineering-zoomcamp,7000acaa,How can I join the course Telegram channel?,0


In [13]:
ground_truth_df.hit_rate_boolean.value_counts()

hit_rate_boolean
1    3134
0    1596
Name: count, dtype: int64

In [14]:
# time to calculate hit-rate
ground_truth_df['hit_rate_boolean'].sum() / ground_truth_df['hit_rate_boolean'].count() 

0.6625792811839324

In [15]:
# now to define a function that returns the rank of the relevant results for a particular query
def mrr_elasticsearch(course:str, doc_id:str, query:str) -> int:
    query_res = elasticsearch_query(query=query, course=course)

    id_res = [res['_source']['id'] for res in query_res]

    for index, id in enumerate(id_res):
        if id == doc_id:
            return 1/(index+1)
            
    return 0

In [16]:
ground_truth_df['reciprocal_rank'] = ground_truth_df.apply(lambda x : mrr_elasticsearch(x['Course'],x['document_ID'],x['Question']), axis=1)

ground_truth_df.head()

Unnamed: 0,Course,document_ID,Question,hit_rate_boolean,reciprocal_rank
0,data-engineering-zoomcamp,7000acaa,When will the course start?,0,1.0
1,data-engineering-zoomcamp,7000acaa,What is the purpose of this document?,0,1.0
2,data-engineering-zoomcamp,7000acaa,How can I subscribe to the course public Googl...,0,1.0
3,data-engineering-zoomcamp,7000acaa,How can I register before the course starts?,0,1.0
4,data-engineering-zoomcamp,7000acaa,How can I join the course Telegram channel?,0,1.0


In [17]:
ground_truth_df.reciprocal_rank.value_counts()

reciprocal_rank
1.000000    3129
0.000000     974
0.500000     327
0.333333     135
0.250000      97
0.200000      68
Name: count, dtype: int64

In [18]:
# time to calculate mrr
ground_truth_df['reciprocal_rank'].sum() / ground_truth_df['reciprocal_rank'].count() 

0.7136046511627907

### Evaluating `minsearch` search engine using GTD - Hit Rate and MRR

In [19]:
# now to fit the documents_with_ids JSON into our minsearch index

index = minsearch.Index(
    text_fields=["question","text","section"],
    keyword_fields=["course","id"]
)

index.fit(documents)

<minsearch.Index at 0x7e168e97b770>

In [20]:
# next we want to define a function that searches through our minsearch index

def minsearch_query(query, course):
    boost = {"question": 3.0, "section": 0.5} # giving more weight to these fields in documents (i.e. our knowledge_base)

    res = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return res

In [21]:
# before evaluating, lets convert our ground truth df into a dictionary

ground_truth_df.drop(columns=['hit_rate_boolean','reciprocal_rank'], inplace=True)
ground_truth = ground_truth_df.to_dict(orient='records')

ground_truth[0]

{'Course': 'data-engineering-zoomcamp',
 'document_ID': '7000acaa',
 'Question': 'When will the course start?'}

In [22]:
# so same as before, we now want to define a function that returns the hit rate for minsearch

def hit_rate_minsearch(doc_index):
    relevance_total = []
    for doc in doc_index:
        res_list = minsearch_query(query=doc["Question"], course=doc["Course"])
        relevance = [True if res['id'] == doc['document_ID'] else False for res in res_list]
        if any(relevance):
            relevance_total.append(1)
        else:
            relevance_total.append(0)
            
    hit_rate = sum(relevance_total)/len(relevance_total)

    return hit_rate

In [23]:
hit_rate_minsearch(ground_truth)

0.7340380549682876

In [24]:
# now for a similar function for MRR

def mrr_minsearch(doc_index):
    rank_total = []
    for doc in doc_index:
        res_list = minsearch_query(query=doc["Question"], course=doc["Course"])
        try:
            reciprocal_rank = 1 / sum([(index+1) if res["id"] == doc["document_ID"] else 0 for index, res in enumerate(res_list)])
        except:
            reciprocal_rank = 0
            
        rank_total.append(reciprocal_rank)

    return sum(rank_total)/len(rank_total)
        

In [25]:
mrr_minsearch(ground_truth)

0.633971106412967

### Summary - Evaluation Function

In [26]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [27]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [28]:
# it would be much easier to combine all of our steps into one function as follows:

def evaluate_elasticsearch(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document_ID']
        results = search_function(q)
        relevance = [d['_source']['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [29]:
evaluate_elasticsearch(ground_truth, lambda q: elasticsearch_query(q['Question'], q['Course']))

  0%|          | 0/4730 [00:00<?, ?it/s]

{'hit_rate': 0.7940803382663848, 'mrr': 0.7136046511627907}

In [30]:
def evaluate_minsearch(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document_ID']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [31]:

evaluate_minsearch(ground_truth, lambda q: minsearch_query(q['Question'], q['Course']))

  0%|          | 0/4730 [00:00<?, ?it/s]

{'hit_rate': 0.7340380549682876, 'mrr': 0.6339711064129675}