In [None]:
# Print the version of the `python` executable that the shell resolves.
!python --version

In [None]:
conda install -c conda-forge ipywidgets

In [None]:
!pip install --no-cache-dir opensearch-py python-dotenv boto3 tqdm h5py matplotlib ipywidgets jedi ipython sentence_transformers

In [None]:
# Turn on IPython's greedy tab-completion mode for this session.
%config IPCompleter.greedy=True

In [None]:
# Example: encoding sentences with a SentenceTransformer model
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model; later cells reuse it as `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample inputs to embed.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# A single encode() call embeds the whole list.
sentence_embeddings = model.encode(sentences)

# Show every sentence next to the embedding produced for it.
for position, text in enumerate(sentences):
    print("Sentence:", text)
    print("Embedding:", sentence_embeddings[position])
    print("")

print(model)

# Embedding size reported by the model; it is passed to create_index() below.
dimension = model.get_sentence_embedding_dimension()
print(f"Model dimension is : {dimension}")

In [None]:
from dotenv import load_dotenv
from opensearchpy import OpenSearch, RequestsHttpConnection
import os


# Load connection settings from environment.txt into the process environment.
res = load_dotenv("environment.txt")

OS_HOST = os.getenv('OS_HOST')
OS_PORT = os.getenv('OS_PORT')
OS_USER = os.getenv('USER_NAME')
OS_PASSWORD = os.getenv('PASSWORD')

# Fail fast with a readable message: a missing host would otherwise only
# surface later as an opaque client/connection error.
if not OS_HOST:
    raise RuntimeError(
        "OS_HOST is not set. Define it in environment.txt or in the process "
        f"environment (load_dotenv('environment.txt') returned {res})."
    )

# Environment values are always strings; hand the client a real integer port.
# An unset or empty OS_PORT is passed on as None.
OS_PORT = int(OS_PORT) if OS_PORT else None


client = OpenSearch(
    hosts = [{'host': OS_HOST, 'port': OS_PORT}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (OS_USER, OS_PASSWORD),
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    timeout=6000,
    pool_maxsize = 20
)

# Calling info() doubles as a connectivity/credentials check.
print(client.info())

# Index name is configurable via the environment, with a fixed fallback.
hybrid_search_index_name = os.getenv('HYBRID_SEARCH_INDEX_NAME', "hybrid_search_index")
print(f"hybrid search index name from env is : {hybrid_search_index_name}")


In [None]:
def create_index(index_name, dimension):
    """Create a k-NN enabled index, replacing any index of the same name.

    The index has k-NN turned on and maps a single ``embeddings`` field as a
    ``knn_vector`` (HNSW method, ``l2`` space, ``faiss`` engine).

    Args:
        index_name: name of the index to (re)create.
        dimension: value used for the ``dimension`` of the ``embeddings`` field.

    Uses the module-level ``client`` and prints the responses of the delete
    (if one happened) and create calls.
    """
    vector_field = {
        "type": "knn_vector",
        "dimension": dimension,
        "method": {"name": "hnsw", "space_type": "l2", "engine": "faiss"},
    }
    index_body = {
        "settings": {"index": {"knn": True}},
        "mappings": {"properties": {"embeddings": vector_field}},
    }

    # Start from a clean slate: drop a pre-existing index before creating it.
    already_present = client.indices.exists(index=index_name)
    if already_present:
        response = client.indices.delete(index=index_name)
        print(f"Deleting the index. Response : {response}")

    response = client.indices.create(index=index_name, body=index_body)
    print(f"Creating the index. Response : {response}")

# (Re)create the index, sized to the embedding model's output dimension.
create_index(index_name=hybrid_search_index_name, dimension=dimension)

In [None]:
# Sample data for Hybrid Search
dataset = [
    {
      "text": "The quick brown fox jumps over the lazy dog.",
    },
    {
      "text": "A journey of a thousand miles begins with a single step.",
    },
    {
      "text": "In the midst of winter, I found there was, within me, an invincible summer.",
    },
    {
      "text": "To be yourself in a world that is constantly trying to make you something else is the greatest accomplishment.",
    },
    {
      "text": "The only limit to our realization of tomorrow will be our doubts of today.",
    },
    {
      "text": "Success is not final, failure is not fatal: It is the courage to continue that counts.",
    },
    {
      "text": "Life is really simple, but we insist on making it complicated.",
    },
    {
      "text": "Believe you can and you're halfway there.",
    }
]

# Attach an embedding to every record. `.tolist()` turns the NumPy array into
# plain Python floats, so the records are JSON-serializable without relying on
# a NumPy-aware serializer (list(ndarray) would keep numpy.float32 scalars).
for data in dataset:
    data['embedding'] = model.encode(data['text']).tolist()


In [None]:
# Dump every record (text plus its embedding) for a quick visual check.
for record in dataset:
    print(record)

In [None]:
# Ingest data
from tqdm.notebook import tqdm
from opensearchpy.helpers import bulk

# One bulk action per record; document ids are "1".."N" in dataset order.
actions = (
    {
        "_index": hybrid_search_index_name,
        "_id": str(doc_number),
        "embeddings": record["embedding"],
        "text": record["text"],
    }
    for doc_number, record in enumerate(dataset, start=1)
)

# Stream all actions through a single bulk() call instead of issuing one bulk
# request per document. raise_on_error=False makes the helper return the
# per-document failures instead of raising on the first one, so they can
# actually be collected and reported here.
res, global_errors = bulk(
    client,
    tqdm(actions, total=len(dataset)),
    raise_on_error=False,
)

print(f"Ingestion completed. Errors: {global_errors}")


In [None]:
# Refresh the index so the documents ingested above are visible to search.
client.indices.refresh(index=hybrid_search_index_name)

In [None]:
# Basic hybrid-search example built on a bool query; the query vector is
# generated outside of OpenSearch.
#
# Each entry pairs a natural-language query with the document text that the
# search is expected to return as its top hit.
queries = [
    {"query": question, "expectedResponse": expected_text}
    for question, expected_text in (
        (
            "Give me inspirational quotes about overcoming doubts.",
            "The only limit to our realization of tomorrow will be our doubts of today.",
        ),
        (
            "Quotes about a simple life without complications.",
            "Life is really simple, but we insist on making it complicated.",
        ),
    )
]

def pretty_print_search_response(query, query_response):
    """Print a query, its expected answer and the top hit actually returned.

    Args:
        query: dict with the keys ``query`` and ``expectedResponse``.
        query_response: search response dict; the top hit's text is read from
            ``query_response['hits']['hits'][0]['fields']['text'][0]``.

    If the response contains no hits, a ``<no hits>`` placeholder is printed
    as the actual response instead of raising ``IndexError``.
    """
    print(f"Query : {query['query']}")
    print(f"Expected Response: {query['expectedResponse']}")

    hits = query_response['hits']['hits']
    # An empty hit list is a legitimate search outcome, not an error.
    actual_text = hits[0]['fields']['text'][0] if hits else "<no hits>"
    print(f"Actual Response : {actual_text}")
    print('\n')

for query in queries:
    # Embed the query text; a plain list of floats keeps the request body
    # JSON-serializable as-is.
    query_vector = model.encode(query['query']).tolist()

    query_body = {
        "size": 1,
        "query": {
            "bool": {
                # Two "should" clauses: a lexical match on the text field and a
                # k-NN lookup on the embeddings field.
                "should": [
                    {
                        "match": {
                            "text": query["query"]
                        }
                    },
                    {
                        "knn": {
                            "embeddings" : {
                                "vector": query_vector,
                                "k": 10
                            }
                        }
                    }
                ]
            }
        },
        # Must be the boolean False to disable _source; the string "false" is
        # treated as a source-include pattern instead.
        "_source": False,
        "fields": ["text"]
    }

    response = client.search(index=hybrid_search_index_name, body=query_body)

    pretty_print_search_response(query=query, query_response=response)

