# Semantic Search

In [1]:
import json

In [3]:
with open('documents.json', 'rt') as file_input:
    docs_raw = json.load(file_input)

documents=[]

for course_dict in docs_raw:
    for doc in course_dict["documents"]:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [4]:
# Sentence Transformers (a.k.a. SBERT) is the go-to Python module for accessing, using, and training state-of-the-art text and image-embedding models
# Sentence Transformer is used for semantic search in this project
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
model.encode("this is a simple logic")

array([-6.92723738e-03, -4.76815924e-02,  3.69697735e-02,  1.53370267e-02,
       -2.05732360e-02,  4.13800031e-02,  5.61225868e-04, -2.58686673e-02,
        4.54159267e-02,  3.81226055e-02,  6.08133301e-02,  6.79076761e-02,
        3.56978849e-02,  4.24183486e-03,  3.83592993e-02, -4.09212895e-02,
        1.98478866e-02,  2.09019370e-02,  1.71326064e-02,  4.93581640e-03,
        1.37594193e-02,  8.86246096e-03, -3.09024621e-02, -5.15327342e-02,
        4.06162776e-02,  1.30062439e-02,  8.05857927e-02, -7.81587791e-03,
       -4.11503389e-02, -2.33079772e-02, -7.90117607e-02, -4.28807996e-02,
       -1.03251981e-02, -2.14723106e-02,  1.80705069e-06, -5.83427772e-03,
        1.15270009e-02,  7.28333695e-03, -4.02601436e-02, -5.76059381e-03,
       -5.09200953e-02,  3.42412405e-02, -2.74241529e-02,  2.65143942e-02,
       -2.88266204e-02,  4.36347723e-02,  5.78207569e-03,  2.08436828e-02,
       -8.87339655e-03,  3.22287763e-03, -4.96385014e-03, -4.12691198e-02,
       -3.30867618e-02, -

In [6]:
# Checking the dimensionality of the selected model of the Sentence Transformer
len(model.encode("this is a simple logic"))

768

In [7]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
#applying Sentence Transformer to all the "text" columns in the document(it can be apply to the "question" columns as "section" and 
# "course" are categorical responses)
operations = []
for doc in tqdm(documents):
    doc["text_vector"] = model.encode(doc["text"]).tolist()
    operations.append(doc)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [04:58<00:00,  3.18it/s]


In [13]:
operations[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': [-0.041030403226614,
  0.025834161788225174,
  -0.036801841109991074,
  -0.020898321643471718,
  -0.020596304908394814,
  0.009353742003440857,
  -0.003331671468913555,
  -0.009491903707385063,
  0.030117977410554886,
  0.01908210851252079,
  0.012690035626292229,
  -0.017078785225749016,
  -0.0016324761090800166,
  0.12997251749038696,
  0.030969230458140373,
  -0.025823738425970078,
  0.0278230682015419,
  0.025159770622849464,
  -0.0808122381567955,
  -0.0036173474509269,
  -0.008902025409042835,
  0.003404824063181877,
  -0.0230092890560627,
  -0.03404529020190239,
  0.024598615244030952,
  0.013545555993914604,
  -0.025439025834202766,
  0.011951087042689323,
  -0.020540112629532814,
  -0.010077380575239658,
  0.020575348287820816,
  0.0

In [10]:
#importing ElasticSearch from docker
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200')
es_client.info()

ObjectApiResponse({'name': '76cfc3f67e2e', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'yxczCUD5SHCqxIJzjEksSw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [11]:
#Creating mapping and index: we can compare mapping to a database schema in how it describes the fields and properties that documents hold, the
#datatype of each field(e.g., string,integer, or date), and how those field should be indexed and stored.
index_setting = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "text_vector":{"type":"dense_vector","dims":768,"index":True,"similarity":"cosine"
            },
        }
    }
}


In [12]:
# deleting if indices exist and ignores if it doesn't, 
# create an indices 
index_name = "course_questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_setting)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course_questions'})

In [14]:
#adding documents to index
for doc in tqdm(operations):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)
    

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:21<00:00, 43.66it/s]


In [15]:
# Creating an end-user query
# encoding the query provided by the end-user
search_term = "windows or mac?"
vector_search_term = model.encode(search_term)

In [17]:
#setting up the query parameters
query = {
    "field" : "text_vector",
    "query_vector": vector_search_term,
    "k" : 5,
    "num_candidates" : 10000,
}

In [18]:
# Searching through the documents to provide the top 5 semantic searches based on the query provided ("windows or mac?").
# The ranking of the score of the semantic search is done by cosine similarity

response = es_client.search(index=index_name, knn=query, source=["text","section","question","course"])
response["hits"]["hits"]

[{'_index': 'course_questions',
  '_id': 'SQ4n7pABxI8c3hwtGTQv',
  '_score': 0.7350748,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
 {'_index': 'course_questions',
  '_id': 'XA4n7pABxI8c3hwtXje1',
  '_score': 0.6213174,
  '_source': {'question': 'WSL instructions',
   'course': 'mlops-zoomcamp',
   'section': 'Module 1: Introduction',
   'text': 'If you wish to use WSL on your windows machine, here are the setup instructions:\nCommand: Sudo apt install wget\nGet Anaconda download address here. wget <download address>\nTurn on Docker Desktop WFree Download | AnacondaSL2\nCommand: git clone <github repository address>\nVSCODE on WSL\nJupyter: pip3 install jupyter\nAdded by Gregory Morris (gwm1980@gmail.com)\nAll in all softwares at

# Perform Keyword search with Semantic Search(Hybrid/Advanced Search)

In [28]:
#Keyword Search 
response1 = es_client.search(
    index=index_name,
    query={
        "bool": {
            "must": {
                "multi_match":
                            {"query": "windows or python?",
                             "fields": ["text", "question", "course", "title"],
                             "type": "best_fields"
                            }
                    },
            "filter": {
                "term": {
                        "course": "data-engineering-zoomcamp"
            }
        }
        }
    }

)
    
                

In [29]:
response1["hits"]["hits"]

[{'_index': 'course_questions',
  '_id': 'jw4n7pABxI8c3hwtHzSb',
  '_score': 7.728908,
  '_source': {'text': 'Problem: If you have already installed pgcli but bash doesn\'t recognize pgcli\nOn Git bash: bash: pgcli: command not found\nOn Windows Terminal: pgcli: The term \'pgcli\' is not recognized…\nSolution: Try adding a Python path C:\\Users\\...\\AppData\\Roaming\\Python\\Python39\\Scripts to Windows PATH\nFor details:\nGet the location: pip list -v\nCopy C:\\Users\\...\\AppData\\Roaming\\Python\\Python39\\site-packages\n3. Replace site-packages with Scripts: C:\\Users\\...\\AppData\\Roaming\\Python\\Python39\\Scripts\nIt can also be that you have Python installed elsewhere.\nFor me it was under c:\\python310\\lib\\site-packages\nSo I had to add c:\\python310\\lib\\Scripts to PATH, as shown below.\nPut the above path in "Path" (or "PATH") in System Variables\nReference: https://stackoverflow.com/a/68233660',
   'section': 'Module 1: Docker and Terraform',
   'question': 'PGCLI - pg

In [30]:
#Advanced Semantic Search
knn_query = {
    "field" : "text_vector",
    "query_vector": vector_search_term,
    "k" : 5,
    "num_candidates" : 10000,
}

response2 = es_client.search(
    index=index_name,
    query={
        "match": {
                "course": "data-engineering-zoomcamp"
            },
        },
    knn=knn_query,
    size=5,
    explain=True
)
    

In [31]:
response2["hits"]["hits"]

[{'_shard': '[course_questions][0]',
  '_node': '0JysseKaTGyIu4Q3LsJoeQ',
  '_index': 'course_questions',
  '_id': 'SQ4n7pABxI8c3hwtGTQv',
  '_score': 1.5139887,
  '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
   'section': 'General course-related questions',
   'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'text_vector': [-0.026965461671352386,
    -0.000626126304268837,
    -0.01662949100136757,
    0.05285150930285454,
    0.05476527288556099,
    -0.03133990615606308,
    0.029942581430077553,
    -0.04808562621474266,
    0.04467551037669182,
    0.005839474033564329,
    0.016233040019869804,
    0.012001154012978077,
    -0.031222281977534294,
    0.016600528731942177,
    -0.04886901378631592,
    -0.06496307998895645,
    0.046434223651885986,
    -0.009297756478190422,
    -0.0642528235912323,
    -0.01373267825692892,


##### Note:
- Our codes in the Advanced semantic search is using Elasticsearch with the k-nearest neighbors (k-NN) search functionality for querying embeddings generated by a Sentence Transformer model. The "_score" value of advanced semantic search returned by Elasticsearch in this context is not necessarily confined to the 0-1 range, unlike normalized similarity scores such as cosine similarity.
- The "_score" of the advanced semantic search is greater than 1 this can be rectified by normalizing the "_score" 