In [None]:
#### Requires Python 3.10+
# Report the interpreter the kernel itself is running. `!python --version` resolves
# `python` through the shell PATH, which may be a different installation.
import sys

print(sys.version)
if sys.version_info < (3, 10):
    print("Warning: this notebook requires Python 3.10 or newer.")

In [None]:
!pip uninstall nltk

In [None]:
conda install -c conda-forge ipywidgets

In [None]:
!pip install --no-cache-dir opensearch-py python-dotenv boto3 tqdm h5py matplotlib ipywidgets jedi ipython sentence_transformers


In [None]:
# Enable greedy completion in the IPython completer.
# In Jupyter, press Tab to autocomplete and Shift+Tab to show the signature/docstring tooltip.
%config IPCompleter.greedy=True

In [None]:
# Download a dataset Scifact

!curl -o scifact.zip -L https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip
!unzip scifact.zip


In [None]:
# Read the dataset: load the corpus and the queries from their JSON Lines files.

from tqdm.notebook import tqdm
import json

corpus_file = "./scifact/corpus.jsonl"
queries_file = "./scifact/queries.jsonl"


def _count_lines(path):
    """Return the number of lines in `path` (used only to size the progress bar)."""
    # The context manager closes the handle; a bare open() inside sum() would leak it.
    with open(path, 'rb') as handle:
        return sum(1 for _ in handle)


def _load_jsonl(path, fields):
    """Load a JSON Lines file into a dict keyed by each record's "_id".

    Args:
        path: location of the .jsonl file.
        fields: names of the record fields to keep for every entry.

    Returns:
        dict mapping "_id" -> {field: value}; a field missing from a record is stored as None.
    """
    records = {}
    with open(path, encoding='utf8') as fIn:
        for raw_line in tqdm(fIn, total=_count_lines(path)):
            if not raw_line.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            entry = json.loads(raw_line)
            records[entry.get("_id")] = {field: entry.get(field) for field in fields}
    return records


print("Loading dataset... ")
corpus = _load_jsonl(corpus_file, ("text", "title"))
num_lines = len(corpus)
print(f"Dataset size is : {num_lines}")

print("Loading queries... ")
queries = _load_jsonl(queries_file, ("text",))
num_lines = len(queries)
print(f"Queries size is : {num_lines}")

In [None]:
# Sentence Transformer example: load the model and embed a few sample sentences.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample sentences to encode.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# model.encode() returns one embedding per input sentence.
sentence_embeddings = model.encode(sentences)

# Show every sentence next to its embedding.
for position, sentence in enumerate(sentences):
    print("Sentence:", sentence)
    print("Embedding:", sentence_embeddings[position])
    print("")

print(model)

# The embedding dimension is needed later to define the vector field of the index.
dimension = model.get_sentence_embedding_dimension()
print(f"Model dimension is : {dimension}")

In [None]:
from dotenv import load_dotenv
from opensearchpy import OpenSearch, RequestsHttpConnection
import os


# Read connection settings from environment.txt into the process environment.
res = load_dotenv("environment.txt")
if not res:
    # Not fatal: the variables may already be set in the process environment.
    print("Warning: no variables were loaded from environment.txt; relying on the existing environment.")

OS_HOST = os.getenv('OS_HOST')
OS_PORT = os.getenv('OS_PORT')
OS_USER = os.getenv('USER_NAME')
OS_PASSWORD = os.getenv('PASSWORD')

# Fail early with a clear message instead of an obscure connection error later on.
if not OS_HOST:
    raise ValueError("OS_HOST is not set; define it in environment.txt or in the environment.")


client = OpenSearch(
    hosts = [{'host': OS_HOST, 'port': OS_PORT}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (OS_USER, OS_PASSWORD),
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    timeout=6000, # request timeout, in seconds
    pool_maxsize = 20
)

# Connectivity check: raises if the cluster cannot be reached with these settings.
client.info()
hybrid_search_index_name = os.getenv('HYBRID_SEARCH_INDEX_NAME', "hybrid_search_index")
print(f"hybrid search index name from env is : {hybrid_search_index_name}")


In [None]:
def create_index(index_name, dimension):
    """(Re)create a k-NN enabled index with a single vector field named "vec".

    An existing index with the same name is deleted first. Uses the notebook-level
    `client` and prints the responses of the delete and create calls.

    Args:
        index_name: name of the index to create.
        dimension: dimension of the vectors stored in the "vec" field.
    """
    knn_index_settings = {
        "knn": True,
        # Higher ef_search improves recall and precision but increases latency;
        # lower values do the opposite.
        "knn.algo_param.ef_search": 128,
    }
    vector_field = {
        "type": "knn_vector",
        "dimension": dimension,
        "index": "true",
        "method": {
            "name": "hnsw",
            "space_type": "l2",  # l2 for SIFT, cosinesimil for typical
            "engine": "nmslib",
            "parameters": {"ef_construction": 128},
        },
    }
    index_mappings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            #"refresh_interval": "-1",
            "index": knn_index_settings,
        },
        "mappings": {"properties": {"vec": vector_field}},
    }

    # Start from a clean slate: drop any previous index with this name.
    index_already_exists = client.indices.exists(index=index_name)
    if index_already_exists:
        response = client.indices.delete(index=index_name)
        print(f"Deleting the index. Response : {response}")

    response = client.indices.create(index=index_name, body=index_mappings)
    print(f"Creating the index. Response : {response}")

create_index(hybrid_search_index_name, dimension)

In [None]:
# Force a refresh of the index. An explicit refresh is required when the index's
# refresh_interval is set to -1 (automatic refresh disabled).
# NOTE(review): the refresh_interval setting in create_index is currently commented out,
# so the default refresh interval applies — confirm whether this cell is still needed here.
client.indices.refresh(index=hybrid_search_index_name)

In [None]:
# Convert corpus into embeddings
# All texts are encoded in a single batched call, which is considerably faster than
# calling model.encode() once per document. This cell can still take a while to run.
corpus_texts = [doc['text'] for doc in corpus.values()]
corpus_vectors = model.encode(corpus_texts, show_progress_bar=True)

# Attach each embedding to its document as a plain list so it can be serialized to JSON.
for doc, vector in zip(corpus.values(), corpus_vectors):
    doc['vector'] = vector.tolist()

In [None]:
# Convert queries to vectors
# All query texts are encoded in a single batched call instead of one call per query.
query_texts = [query['text'] for query in queries.values()]
query_vectors = model.encode(query_texts, show_progress_bar=True)

# Attach each embedding to its query as a plain list so it can be serialized to JSON.
for query, vector in zip(queries.values(), query_vectors):
    query['vector'] = vector.tolist()

In [None]:
# Preview the first `limit` queries (text and vector).
limit = 2
for query_record in list(queries.values())[:limit]:
    print(query_record)

In [None]:
# Preview the first `limit` corpus records.

limit = 2
for shown, corpus_record in enumerate(corpus.values()):
    if shown >= limit:
        break
    print(corpus_record)


In [None]:
# Ingest data in the index
import time
from opensearchpy.helpers import bulk
from tqdm.notebook import tqdm


def index_data_gen(corpus):
    """Yield one bulk action per corpus entry, targeting `hybrid_search_index_name`.

    Args:
        corpus: dict mapping document id -> dict with "vector", "text" and "title".

    Yields:
        dict with the target index, the document id (as a string) and the document fields.
    """
    for key, value in corpus.items():
        yield {
            "_index": hybrid_search_index_name,
            "_id": str(key),
            "vec": value["vector"],
            "text": value['text'],
            "title": value['title'],
        }


def _ingest_batch(batch):
    """Send `batch` through the bulk helper and return the elapsed time in seconds.

    Raises:
        RuntimeError: if the bulk helper reports per-document errors.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by wall-clock changes.
    start = time.perf_counter()
    (_, errors) = bulk(client, batch)
    elapsed = time.perf_counter() - start
    if errors:
        # Stop instead of silently continuing with a partially ingested corpus.
        raise RuntimeError(f"Bulk ingestion reported {len(errors)} error(s): {errors}")
    return elapsed


bulk_size = 100

total_time_to_ingest = 0.
ingest_latency = []
data_to_ingest = []

for data in tqdm(index_data_gen(corpus), total=len(corpus)):
    data_to_ingest.append(data)
    if len(data_to_ingest) < bulk_size:
        continue
    elapsed = _ingest_batch(data_to_ingest)
    total_time_to_ingest += elapsed
    ingest_latency.append(elapsed)
    data_to_ingest = []

# Flush the last, partially filled batch and record its latency like every other batch.
if data_to_ingest:
    elapsed = _ingest_batch(data_to_ingest)
    total_time_to_ingest += elapsed
    ingest_latency.append(elapsed)
    data_to_ingest = []

# Make the ingested documents visible to searches and to the index statistics.
client.indices.refresh(index=hybrid_search_index_name)

# Guard against an empty corpus to avoid a division by zero.
average_time_per_document = total_time_to_ingest / len(corpus) if corpus else 0.0
print(f"Ingestion completed. Total time to ingest = {total_time_to_ingest} seconds, average time per document: {average_time_per_document}")


In [None]:
# Check index details; the document count should match the number of ingested corpus records.
index_overview = client.cat.indices(index=hybrid_search_index_name)
print(index_overview)

print("Segments Info After refresh...")

# JSON output is requested here so the segments can be counted.
segments = client.cat.segments(index=hybrid_search_index_name, params={"format": "json"})

print(f"Total segments are: {len(segments)}")

# Second request: tabular output with a header row (v=true) for display.
segments_table = client.cat.segments(index=hybrid_search_index_name, params={'format': 'csv', 'v': 'true'})
print(f"Printing Segment info : \n{segments_table}")


In [None]:
# Now run the hybrid query using a Bool query clause.
# First, preview the first `limit` queries that will be sent.
limit = 2
for query_record in list(queries.values())[:limit]:
    print(query_record)


def generate_query_clause(queries, size=10, k=10):
    """Yield one hybrid (lexical + k-NN) search body per query.

    Each body is a bool query with two "should" clauses: a "match" on the "text"
    field and a "knn" clause on the "vec" field.

    Args:
        queries: dict mapping query id -> dict with "text" and "vector".
        size: number of hits requested per search (default 10).
        k: number of nearest neighbours requested from the knn clause (default 10).

    Yields:
        dict usable as the body of a search request.
    """
    for query in queries.values():
        lexical_clause = {"match": {"text": query['text']}}
        vector_clause = {"knn": {"vec": {"vector": query['vector'], "k": k}}}
        yield {
            "size": size,
            "query": {
                "bool": {
                    "should": [lexical_clause, vector_clause]
                }
            }
        }


# Run every hybrid query and record two latencies per request.
search_latency = []  # client-side round-trip time, in seconds
took_time = []       # "took" value reported in the search response

for query_body in tqdm(generate_query_clause(queries), total=len(queries)):
    # perf_counter is monotonic and high resolution, which suits interval measurement;
    # time.time() can jump when the wall clock is adjusted.
    start = time.perf_counter()
    search_response = client.search(body=query_body, index=hybrid_search_index_name, _source=False, docvalue_fields=["_id"], stored_fields="_none_")
    end = time.perf_counter()
    search_latency.append(end - start)
    took_time.append(search_response["took"])



In [None]:
import numpy as np


def _print_latency_summary(name, samples, scale=1):
    """Print the average and the p50/p90/p99 of `samples`, each multiplied by `scale`.

    Args:
        name: label used in the printed lines.
        samples: sequence of latency measurements.
        scale: factor applied to every statistic (e.g. 1000 to convert seconds to ms).
    """
    if len(samples) == 0:
        # Percentiles are undefined without data; report that instead of failing.
        print(f"{name}: no samples recorded")
        return
    print(f"average {name}(ms): {np.average(samples) * scale}")
    for percentile in (50, 90, 99):
        print(f"p{percentile} {name}(ms): {np.percentile(samples, percentile) * scale}")


print("========================== Search Metrics ===================================")
print("\n\n========================== Server Side Latency ===================================")
# The 99th percentile is now labelled p99 (it was previously printed as "p90").
_print_latency_summary("took_time", took_time)


print("\n\n========================== Client side latency ===================================")
# search_latency is measured in seconds; scale to milliseconds for display.
_print_latency_summary("Latency", search_latency, scale=1000)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Scatter plot of the per-query took_time values collected by the search loop.
df = pd.DataFrame({"took_time": took_time})

avg_latency = sum(took_time) / len(took_time)

fig, ax = plt.subplots(figsize=(15, 5))
ax.scatter(df.index, df['took_time'], label="took_time", color='blue', s=5)
ax.set_title(f"Search took_time | avg {avg_latency:.2f} ms")
ax.set_xlabel("Search Run")
ax.set_ylabel("took_time (milliseconds)")
ax.set_xlim(0, len(queries))  # widen or narrow to change the range of search runs shown
ax.set_ylim(0, 30)  # raise this limit if points are cut off; values above it are out of view
ax.legend()
ax.grid(True)

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Scatter plot of the client-side latency per query, converted from seconds to milliseconds.
search_latency_ms = [(lat*1000) for lat in search_latency]
df = pd.DataFrame({"search_latency": search_latency_ms})

avg_latency = sum(search_latency_ms) / len(search_latency)

fig, ax = plt.subplots(figsize=(15, 5))
ax.scatter(df.index, df['search_latency'], label="search_latency", color='blue', s=5)
ax.set_title(f"Search search_latency | avg {avg_latency:.2f} ms")
ax.set_xlabel("Search Run")
ax.set_ylabel("search_latency (milliseconds)")
ax.set_xlim(0, len(queries))  # widen or narrow to change the range of search runs shown
ax.set_ylim(0, 400)  # raise this limit if points are cut off; values above it are out of view
ax.legend()
ax.grid(True)

plt.show()