In [12]:
import json

In [13]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 because the texts
# contain non-ASCII characters (e.g. typographic apostrophes) and the
# platform default encoding used by open() is not UTF-8 everywhere.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)


# Flatten the per-course structure into a single list, tagging every
# document with the course it belongs to.
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [14]:
# Inspect the first entry of `documents` to check which fields it carries.
documents[0]

{'text': "dThe purpose of this document is to capture frequently asked technical questions\nThe next cohort starts in Jan 2025. More info at DTC Article.\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start？',
 'course': 'data-engineering-zoomcamp'}

### Create embeddings with a pre-trained model

In [15]:
from sentence_transformers import SentenceTransformer

# Pre-trained sentence-embedding model, loaded by name; used below to encode
# both the document texts and the search query.
model = SentenceTransformer("all-MiniLM-L6-v2")



In [16]:
# Attach an embedding of each document's text (converted to a plain list)
# to the document itself, and collect the enriched documents for indexing.
indexd_text = []

for doc in documents:
    embedding = model.encode(doc['text'])
    doc['text_vector'] = embedding.tolist()
    indexd_text.append(doc)

#### Elasticsearch connection

In [24]:
from elasticsearch import Elasticsearch

# Elasticsearch is set up in Docker here and is reached on localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

# Display the cluster information returned by the server.
es_client.info()

ObjectApiResponse({'name': '934a4bc7a925', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'UGFBXMLRQrqh9_e0sByKvA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

## Create mapping and index

In [21]:
# Index definition: a single shard without replicas. The three free-text
# fields are mapped as "text", the course identifier as "keyword", and the
# embedding as a 384-dimensional dense vector indexed with cosine similarity.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{
                field: {"type": "text"}
                for field in ("text", "section", "question")
            },
            "course": {"type": "keyword"},
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
            },
        },
    },
}

## Best Practice Note
- As a best practice, before creating an index (or re-creating it after changing the mapping),   
- first delete any existing index with the same name to avoid an error.  

In [22]:
index_name = "course-questions"

# Best practice before (re)creating an index: first delete any existing index
# with the same name so that the creation below does not fail.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body= index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

## Add documents to the index (Elasticsearch)
- Push every document into the index.

In [23]:
# Push every embedded document into the index. Indexing stays best-effort
# (one bad document does not abort the loop), but each failure is reported
# together with the position and question of the document and collected in
# `failed_docs`, so a partially populated index cannot go unnoticed.
failed_docs = []

for position, doc in enumerate(indexd_text):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        failed_docs.append(doc)
        print(f"Failed to index document {position} ({doc.get('question')!r}): {e}")

# Freshly indexed documents only become searchable after an index refresh.
# Force one here so that searches issued right after this cell (e.g. on
# "Restart & Run All") already see every document.
es_client.indices.refresh(index=index_name)

if failed_docs:
    print(f"{len(failed_docs)} of {len(indexd_text)} documents could not be indexed")

In [27]:
# Free-text question to search for, and its embedding produced by `model`.
query_term = " Tensorflow or Pytorch"
query_vector_term = model.encode(query_term)

In [28]:
# kNN clause: compare the query embedding against the "text_vector" field,
# examining 10000 candidates and keeping the 5 nearest documents.
query = dict(
    field="text_vector",
    query_vector=query_vector_term,
    k=5,
    num_candidates=10000,
)

In [29]:
# Run the kNN search and return only the human-readable fields in `_source`.
source_fields = ["text", "section", "question", "course"]
result = es_client.search(
    index=index_name,
    knn=query,
    source=source_fields,
)
result["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'm5QzWpEBOrmxiPAXaDbs',
  '_score': 0.79801464,
  '_source': {'question': 'Can we use pytorch for this lesson/homework ?',
   'course': 'machine-learning-zoomcamp',
   'section': '8. Neural Networks and Deep Learning',
   'text': 'Pytorch is also a deep learning framework that allows to do equivalent tasks as keras. Here is a tutorial to create a CNN from scratch using pytorch :\nhttps://blog.paperspace.com/writing-cnns-from-scratch-in-pytorch/\nThe functions have similar goals. The syntax can be slightly different. For the lessons and the homework, we use keras, but one can feel free to make a pull request with the equivalent with pytorch for lessons and homework!\nMélanie Fouesnard'}},
 {'_index': 'course-questions',
  '_id': 'u5QzWpEBOrmxiPAXazbB',
  '_score': 0.76735115,
  '_source': {'question': 'Using Tensorflow 2.15 for AWS deployment',
   'course': 'machine-learning-zoomcamp',
   'section': '9. Serverless Deep Learning',
   'text': 'Usin

### Keyword search with Elasticsearch

In [31]:
# Scoring part: match the question text against several fields
# ("best_fields" multi_match).
multi_match_clause = {
    "multi_match": {
        "query": "Is pytorch better than Tensorflow",
        "fields": ["text", "section", "question", "course"],
        "type": "best_fields",
    }
}

# Filter part: restrict the results to a single course.
course_filter = {"term": {"course": "machine-learning-zoomcamp"}}

response = es_client.search(
    index=index_name,
    query={
        "bool": {
            "must": multi_match_clause,
            "filter": course_filter,
        }
    },
)

In [None]:
# Show the list of hits contained in the search response.
response["hits"]["hits"]

In [35]:
# Semantic (kNN) search restricted to one course.
#
# The course restriction is placed in the kNN clause's own "filter". A
# top-level `query` passed alongside `knn` does not filter the nearest
# neighbours: Elasticsearch treats the two as a hybrid search and returns
# the union of both hit sets, so documents from other courses could still
# be returned. With the filter in the kNN clause, the k nearest documents
# are chosen only among those of the requested course.
# "course" is mapped as a keyword, hence an exact "term" filter.
knn_query = {
    "field": "text_vector",
    "query_vector": query_vector_term,
    "k": 5,
    "num_candidates": 10000,
    "filter": {
        "term": {"course": "machine-learning-zoomcamp"}
    },
}

response = es_client.search(
    index=index_name,
    knn=knn_query,
    size=5,
    # Include the score explanation for every hit in the response.
    explain=True,
)

In [None]:
# Show the list of hits contained in the search response.
response["hits"]["hits"]