In [1]:
import json

with open('documents-with-ids.json', 'rt') as f_input:
    documents = json.load(f_input)

In [3]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200')

index_setting = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id":{"type": "keyword"
            },
        }
    }
}

index_name = "course_questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_setting)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course_questions'})

In [12]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:23<00:00, 39.54it/s]


In [16]:
def elastic_search(query, course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": [
                    {
                        "multi_match": {
                            "query": query,
                            "fields": ["question^3", "text", "section"],
                            "type": "best_fields"
                        }
                    }
                ],
                "filter": [
                    {
                        "term": {
                            "course": course
                        }
                    }
                ]
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []

    for hit in response["hits"]["hits"]:
        result_docs.append(hit['_source'])

    return result_docs


In [18]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)


[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerrafor

In [19]:
import pandas as pd

In [23]:
df_ground_truth = pd.read_csv("ground-truth-data.csv")
df_ground_truth.head(2)

Unnamed: 0,question,course,document
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef


In [25]:
ground_truth = df_ground_truth.to_dict(orient='records')
ground_truth

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I receive course announcements?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Where do I join the Slack channel?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Where can I find the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'question': 'How do I check the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'question': 'Where are the course prerequisites listed?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520c

In [27]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'],course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:14<00:00, 310.02it/s]


In [28]:
relevance

[True, True, False, False, False]

In [29]:
relevance_total

[[True, True, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, True, False, False, False],
 [True, True, False, False, False],
 [True, True, False, False, False],
 [True, True, False, False, False],
 [True, True, False, False, False],
 [False, False, False, False, True],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, True, False, False, False],
 [True, True, False, False, False],
 [True, True, False, False, False],
 [True, True, False, False, False],
 [True, True, False, False, False],
 [True, True, False, False, False],
 [],
 [],
 [],
 [],
 [],
 [True, True, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, True],
 [False, False, True, True, False],
 [False, False, True, True, False],
 [True, True, False

#### creating hit-rate(recall) and mean reciprocal rank (mrr) with a sample size

In [30]:
#creating hit-rate(recall) 
example = [
 [True, True, False, False, False],#2
 [False, False, False, False, False],#0
 [False, False, False, False, False],#0
 [False, False, False, False, False],#0
 [False, False, False, False, False],#0
 [True, True, False, False, False],#2
 [True, True, False, False, False],#2
 [True, True, False, False, False],#2
 [True, True, False, False, False],#2
 [True, True, False, False, False],#2
 [False, False, False, False, True],#1
 [False, False, False, False, False],#0
]

# mean reciprocal rank (mrr)
# 1 => 1
# 2 => 1 / 2 = 0.5
# 3 => 1 / 3 = 0.3333
# 4 => 0.25
# 5 => 0.2
# rank => 1 / rank
# none => 0

In [31]:
len(example)

12

In [33]:
#hit-rate(recall)
7 / len(example)

0.5833333333333334

In [34]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1
    return cnt / len(relevance_total)

In [35]:
hit_rate(example)

0.5833333333333334

In [36]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank +1)
    return total_score / len(relevance_total)

In [37]:
mrr(example)

0.7666666666666666

In [38]:
hit_rate(relevance_total), mrr(relevance_total)

(0.6725740220445213, 0.8524890137598172)

In [41]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.Index at 0x72248990aa10>

In [42]:
def minsearch_search(query, course):
    boost = {'question': 3.0, 'section':0.5}

    results = index.search(
        query=query,
        filter_dict={'course':course},
        boost_dict=boost,
        num_results=5
    )


    return results

In [43]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'],course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:16<00:00, 287.76it/s]


In [44]:
hit_rate(relevance_total), mrr(relevance_total)

(0.7722066133563864, 0.661454506159499)

comparing the results with ElasticSearch results:
```
(0.6725740220445213, 0.8524890137598172)
```

In [45]:
#refactoring the above codes

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }
        

In [46]:
evaluate(ground_truth, lambda q: elastic_search(q['question'],q['course']))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:11<00:00, 412.86it/s]


{'hit_rate': 0.6725740220445213, 'mrr': 0.8524890137598172}

In [47]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'],q['course']))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:15<00:00, 289.75it/s]


{'hit_rate': 0.7722066133563864, 'mrr': 0.661454506159499}