In [3]:
#### Require python 3.10+
!python --version

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python 3.10.12


In [4]:
conda install -c conda-forge ipywidgets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.14.0
  latest version: 23.7.4

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.

Retrieving notices: ...working... done

Note: you may need to restart the kernel to use updated packages.


In [5]:
!pip install --no-cache-dir opensearch-py python-dotenv boto3 tqdm h5py matplotlib ipywidgets jedi ipython sentence_transformers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# Turn on the `greedy` option of IPython's completer (IPCompleter) for this session.
# The original note says to use shift+tab for autocomplete.
# NOTE(review): in the Jupyter UI, Tab usually triggers completion while Shift+Tab
# opens the docstring tooltip — confirm which shortcut was intended here.
%config IPCompleter.greedy=True

In [3]:
# Download a dataset Scifact

!curl -o scifact.zip -L https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip
!unzip scifact.zip


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2750k  100 2750k    0     0   961k      0  0:00:02  0:00:02 --:--:--  968k
Archive:  scifact.zip
   creating: scifact/
   creating: scifact/qrels/
  inflating: scifact/qrels/train.tsv  
  inflating: scifact/qrels/test.tsv  
  inflating: scifact/corpus.jsonl    
  inflating: scifact/queries.jsonl   


In [21]:
# Read Data set

from tqdm.notebook import tqdm
import json
import pathlib, os




corpus_file = "./scifact/corpus.jsonl"
queries_file = "./scifact/queries.jsonl"


def _load_jsonl(path):
    """Parse a JSON Lines file into a list of objects, showing a progress bar.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one parsed object per line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    # First pass only counts lines so tqdm can display a total. The context manager
    # closes the handle; the previous bare open() inside sum() never closed it.
    with open(path, 'rb') as counting_handle:
        total = sum(1 for _ in counting_handle)

    with open(path, encoding='utf8') as fIn:
        return [json.loads(raw_line) for raw_line in tqdm(fIn, total=total)]


print("Loading dataset... ")
corpus_records = _load_jsonl(corpus_file)
# Map document id -> the two fields used downstream (text and title).
corpus = {
    record.get("_id"): {
        "text": record.get("text"),
        "title": record.get("title"),
    }
    for record in corpus_records
}
print(f"Dataset size is : {len(corpus_records)}")


print("Loading queries... ")
query_records = _load_jsonl(queries_file)
# Map query id -> query text.
queries = {record.get("_id"): record.get("text") for record in query_records}

# Kept so that any later cell reading `num_lines` still finds the query count.
num_lines = len(query_records)
print(f"Queries size is : {num_lines}")

Loading dataset... 


  0%|          | 0/5183 [00:00<?, ?it/s]

Dataset size is : 5183
Loading queries... 


  0%|          | 0/1109 [00:00<?, ?it/s]

Queries size is : 1109


In [None]:
# Using sentence Transformer model Example
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample inputs: the encoder takes a plain list of strings.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# One call to encode() produces one embedding per input sentence.
sentence_embeddings = model.encode(sentences)

# Show every sentence next to the embedding computed for it.
for position in range(min(len(sentences), len(sentence_embeddings))):
    print("Sentence:", sentences[position])
    print("Embedding:", sentence_embeddings[position])
    print("")

In [9]:
from dotenv import load_dotenv
from opensearchpy import OpenSearch, RequestsHttpConnection
import os


# Load connection settings from environment.txt into the process environment.
# load_dotenv reports whether it set anything; a False result usually means the
# file is missing or empty, so say so instead of silently continuing.
res = load_dotenv("environment.txt")
if not res:
    print('Warning: no variables were loaded from "environment.txt"; '
          'using values already present in the process environment.')

OS_HOST = os.getenv('OS_HOST')
OS_PORT = os.getenv('OS_PORT')
OS_USER = os.getenv('USER_NAME')
OS_PASSWORD = os.getenv('PASSWORD')

# Fail fast with the names of the missing settings rather than letting a None
# host/port/credential surface later as an obscure connection error.
_required_settings = {
    'OS_HOST': OS_HOST,
    'OS_PORT': OS_PORT,
    'USER_NAME': OS_USER,
    'PASSWORD': OS_PASSWORD,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing OpenSearch connection settings: " + ", ".join(_missing_settings)
        + '. Define them in "environment.txt" or in the process environment.'
    )


client = OpenSearch(
    hosts = [{'host': OS_HOST, 'port': OS_PORT}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (OS_USER, OS_PASSWORD),
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    timeout=6000,
    pool_maxsize = 20
)

# Round-trip to the cluster to confirm the connection works; the response is
# displayed as the cell output.
client.info()



{'name': '428d6ce63b054e8d1dd55d36eb0ea810',
 'cluster_name': '199552501713:go-daddy-xlarge',
 'cluster_uuid': 'Sc_tsdMeQ-6R8gdYNCY1FQ',
 'version': {'distribution': 'opensearch',
  'number': '2.7.0',
  'build_type': 'tar',
  'build_hash': 'unknown',
  'build_date': '2023-08-08T16:51:18.396423063Z',
  'build_snapshot': False,
  'lucene_version': '9.5.0',
  'minimum_wire_compatibility_version': '7.10.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'The OpenSearch Project: https://opensearch.org/'}

In [57]:
hybrid_search_index_name = os.getenv('HYBRID_SEARCH_INDEX_NAME', "hybrid_search_index")

print(f"hybrid search index name from env is : {hybrid_search_index_name}")

# Size of the vectors stored in the "vec" field. It is taken from the embedding model
# loaded earlier in the notebook so the mapping always matches the vectors the model
# produces; previously `dimension` was used here without being defined in any cell.
dimension = model.get_sentence_embedding_dimension()

index_mappings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        #"refresh_interval": "-1",
        "index": {
          "knn": True,
          "knn.algo_param.ef_search": 128 # Adjust to improve precision. Higher improves recall & precision but increases latency. Lower degrades recall & precision but improves latency.
        }
    },
    "mappings": {
        "properties": {
            "vec": {
                "type": "knn_vector",
                "dimension": dimension,
                "index": "true",
                "method": {
                    "name": "hnsw",
                    "space_type": "l2", # l2 for SIFT, cosinesimil for typical
                    "engine": "nmslib",
                    "parameters": {
                        "ef_construction": 128
                    }
                }
            }
        }
    }
}

# Start from a clean slate: drop any existing index with this name before creating it.
# This deletes all documents previously stored in that index.
if client.indices.exists(index=hybrid_search_index_name):
    response = client.indices.delete(index=hybrid_search_index_name)
    print(f"Deleting the index. Response : {response}")

response = client.indices.create(index=hybrid_search_index_name, body=index_mappings)
print(f"Creating the index. Response : {response}")



vector name from env is : test_vector
Deleting the index. Response : {'acknowledged': True}
Creating the index. Response : {'acknowledged': True, 'shards_acknowledged': True, 'index': 'test_vector'}


In [62]:
# Explicitly refresh the index.
# NOTE(review): the earlier wording said the refresh interval was set to -1, but the
# "refresh_interval" entry in the index settings is commented out — confirm whether
# this manual refresh is still needed.
client.indices.refresh(index=hybrid_search_index_name)



{'_shards': {'total': 1, 'successful': 1, 'failed': 0}}

In [None]:
# Parallelization for Search