In [1]:
from elasticsearch import Elasticsearch
import json

# Use Embeddings to Support Semantic Search

## Gather Dataset

In [2]:
# Load the raw documents file. The encoding is passed explicitly because
# open() otherwise falls back to the platform's locale encoding, which is
# not guaranteed to be UTF-8 (e.g. on Windows).
with open('documents.json', 'rt', encoding='utf-8') as f:
    docs_raw = json.load(f)

In [3]:
# Flatten the per-course structure into one list of documents, tagging each
# document (in place) with the name of the course it belongs to.
documents = []
for course_entry in docs_raw:
    course_name = course_entry['course']
    for record in course_entry['documents']:
        record['course'] = course_name
    documents.extend(course_entry['documents'])

In [4]:
# Display one flattened record to confirm it now carries the 'course' key.
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp'}

## Create Embeddings using Pretrained Models

In [5]:
#!pip install sentence_transformers==2.7.0

In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
#!pip uninstall numpy
#!pip uninstall torch
#!pip install numpy==1.26.4
#!pip install torch

In [8]:
# Load the pretrained sentence-embedding model by name; it is used below to
# embed both the document texts and the search query.
model = SentenceTransformer('all-mpnet-base-v2')



In [9]:
# Length of one embedding; this must match the `dims` value of the
# dense_vector field in the index mapping defined later.
len(model.encode('encode this word'))

768

In [10]:
# Display a record before the embedding is attached to it.
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [11]:
# Attach an embedding of the 'text' field to every document (in place), then
# expose the same dict objects, in the same order, as `operations`.
for record in documents:
    record['text_vector'] = model.encode(record['text'])
operations = list(documents)

In [12]:
# Display the same record again; it now includes the 'text_vector' array.
operations[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': array([-4.10303585e-02,  2.58341245e-02, -3.68019156e-02, -2.08983012e-02,
        -2.05962416e-02,  9.35374666e-03, -3.33163445e-03, -9.49193351e-03,
         3.01179346e-02,  1.90821160e-02,  1.26900887e-02, -1.70788169e-02,
        -1.63238146e-03,  1.29972577e-01,  3.09692957e-02, -2.58236825e-02,
         2.78230421e-02,  2.51597390e-02, -8.08122009e-02, -3.61742009e-03,
        -8.90200026e-03,  3.40489321e-03, -2.30093114e-02, -3.40453349e-02,
         2.45986190e-02,  1.35456128e-02, -2.54389830e-02,  1.19510842e-02,
        -2.05401015e-02, -1.00774709e-02,  2.05753352e-02,  4.38897982e-02,
         1.43946512e-02,  1.82718430e-02,  1.71625845e-06, -1.92925166e-02,
        -1.57902297e-02,  8.83050449e-03,  4.14086170e-02,  1.7854562

## Setup ElasticSearch Connection

In [13]:
from elasticsearch import Elasticsearch

In [14]:
# Create the Elasticsearch client against a local HTTP endpoint.
# NOTE(review): the URL is hardcoded; consider a config constant or environment variable.
es_client = Elasticsearch('http://127.0.0.1:9200')

In [15]:
# Query the cluster's basic info; the response doubles as a connectivity check.
es_client.info()

ObjectApiResponse({'name': '741e032df576', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'V2xgg5AYRgm0oJyx5TrxaA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

## Create Mappings and Index

In [16]:
# Name of the index that will hold the documents together with their embeddings.
index_name = 'semantic_course_question'

# Field mappings: three free-text fields, a keyword field for the course, and
# a 768-dimensional dense vector that is indexed with cosine similarity.
field_properties = {
    "text": {"type": "text"},
    "section": {"type": "text"},
    "question": {"type": "text"},
    "course": {"type": "keyword"},
    "text_vector": {
        "type": "dense_vector",
        "dims": 768,
        "index": True,
        "similarity": "cosine",
    },
}

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_properties},
}

# Remove any previous copy of the index (ignore_unavailable=True is passed so
# a missing index is tolerated), then create it with the settings above.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'semantic_course_question'})

## Add Documents to the Index

In [17]:
# Index each embedded document one at a time. Indexing stays best-effort:
# a failing document is reported and the loop moves on to the next one.
for position, doc in enumerate(operations):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        # Bind the exception explicitly: the previous bare `except:` printed an
        # undefined name `e`, so any indexing error surfaced as a NameError.
        # Catching `Exception` also lets KeyboardInterrupt propagate, so the
        # cell can still be interrupted.
        print(f"Failed to index document #{position}: {e!r}")

In [18]:
# Free-text user question and its embedding, produced by the same model that
# embedded the documents.
# NOTE(review): the name `query` is reassigned to the kNN clause dict in the next cell.
query = 'can I use windows in this course'
query_vector = model.encode(query)

In [19]:
# kNN clause: search the `text_vector` field for the 5 nearest neighbours of
# the query embedding, considering up to 10000 candidates.
query = dict(
    field="text_vector",
    query_vector=query_vector,
    k=5,
    num_candidates=10000,
)

In [20]:
# Run the pure kNN search, returning only the human-readable fields
# (the stored vector itself is left out of `_source`).
returned_fields = ["text", "section", "question", "course"]
results = es_client.search(
    index=index_name,
    knn=query,
    source=returned_fields,
)

In [21]:
# Combine the kNN clause with a `match` query on the course field.
# explain=True asks Elasticsearch to attach a score explanation to each hit.
course_match = {"match": {"course": "data-engineering-zoomcamp"}}
returned_fields = ["text", "section", "question", "course"]
results = es_client.search(
    index=index_name,
    knn=query,
    query=course_match,
    source=returned_fields,
    explain=True,
)
results["hits"]["hits"]

[{'_shard': '[semantic_course_question][0]',
  '_node': 'J_B72dnxRmi8U-nrbbH2jQ',
  '_index': 'semantic_course_question',
  '_id': 'FarKNZEBy-8yx8j45kgs',
  '_score': 1.5656364,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'},
  '_explanation': {'value': 1.5656364,
   'description': 'sum of:',
   'details': [{'value': 0.8134191,
     'description': 'within top k documents',
     'details': []},
    {'value': 0.7522173,
     'description': 'weight(course:data-engineering-zoomcamp in 13) [PerFieldSimilarity], result of:',
     'details': [{'value': 0.7522173,
       'description': 'score(freq=1.0), computed as boost * idf * tf from:',
       'details': [{'value': 2.2, 'description': 'boost', 'details': []},
        {'value': 0.7522173,
 

# Ground Truth Dataset

In [35]:
# Load the raw documents from disk again. The encoding is passed explicitly
# because open() otherwise falls back to the platform's locale encoding,
# which is not guaranteed to be UTF-8 (e.g. on Windows).
with open('documents.json', 'rt', encoding='utf-8') as f:
    docs_raw = json.load(f)

In [36]:
# Rebuild the flat document list from the freshly loaded data, tagging each
# document (in place) with its course name.
documents = []
for course_entry in docs_raw:
    course_name = course_entry['course']
    for record in course_entry['documents']:
        record['course'] = course_name
    documents.extend(course_entry['documents'])

## Add Id Columns

### Simple Id

In [26]:
# Simplest possible id: each document's position in the list.
for position, record in enumerate(documents):
    record['id'] = position

In [27]:
# Display a record to confirm it now carries a positional 'id'.
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': 1}

### Hash Id from course, question and the first 10 characters of text

In [37]:
import hashlib

def generate_document_id(doc):
    """Return a short, deterministic identifier for a document.

    The identifier is the first 8 hexadecimal characters of the MD5 digest
    of the string "<course>-<question>-<first 10 characters of text>".
    Documents that agree on all three of those parts receive the same id.

    Args:
        doc: mapping with 'course', 'question' and 'text' keys.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    # Only a 10-character prefix of the text takes part in the fingerprint.
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

In [38]:
# Give every document its content-derived id, replacing any 'id' it already has.
for record in documents:
    record.update(id=generate_document_id(record))

In [39]:
# Display a record to confirm its 'id' is now the 8-character hash.
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '0bbf41ec'}

In [41]:
from collections import defaultdict

In [43]:
# Group the documents by id so that ids shared by several documents can be spotted.
hashes = defaultdict(list)
for record in documents:
    hashes[record['id']].append(record)

In [44]:
# Distinct ids vs. total documents: a smaller first number means at least
# two documents share an id.
len(hashes),len(documents)

(947, 948)

In [45]:
# Report every id that is shared by more than one document, with its count.
for doc_id, group in hashes.items():
    group_size = len(group)
    if group_size > 1:
        print(doc_id, group_size)

593f7569 2


In [46]:
# Inspect the documents behind the colliding id reported above.
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

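Duplicate detected: these two entries have the same course, the same question, and the same first 10 characters of text ("They both "), so they hash to the same id. They differ only in the attribution line at the end of the first entry's text.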

In [52]:
# Persist the documents, now carrying their ids, as 2-space indented JSON.
serialized = json.dumps(documents, indent=2)
with open('documents-with-ids.json','wt') as out_file:
    out_file.write(serialized)