In [1]:
#!pip install opensearch-py==2.3.1

[0m

In [1]:
import pickle

import numpy as np
from datasets import Dataset,load_dataset
from opensearchpy import OpenSearch
from sentence_transformers import SentenceTransformer, util as STutil
from tqdm.notebook import tqdm

In [2]:
# Connect to the OpenSearch node and print the version the server reports.
# Client usage follows the opensearch-py user guide:
# https://github.com/opensearch-project/opensearch-py/blob/main/USER_GUIDE.md
host, port = 'ai-search-opensearch-node', 9200
client = OpenSearch(
    hosts=[
        {'host': host, 'port': port},
    ],
)

# client.info() is a request to the server, so it doubles as a connectivity check.
info = client.info()
print(f"Welcome to {info['version']['distribution']} {info['version']['number']}!")

Welcome to opensearch 2.8.0!


## Create the OpenSearch Index

In [3]:
def create_index(name="ai-search",filename="schema.json",delete=False):
    """Create an OpenSearch index whose body is read from a file.

    Uses the module-level ``client`` and prints the response of the create
    request.

    Args:
        name: Name of the index to create.
        filename: Path of the file holding the index body; its raw text is
            sent unchanged as the body of the create request.
        delete: When True, delete an existing index called ``name`` first so
            the index is recreated from scratch. A missing index is not
            treated as an error.

    Raises:
        OSError: If ``filename`` cannot be opened or read.
        Exception: Any error raised by ``client`` for a failed request is
            propagated (other than "index not found" on the optional delete).
    """
    with open(filename) as fd:
        index_body = fd.read()

    if delete:
        # ignore=[404] keeps the "delete only if it exists" behaviour while
        # letting real failures (connection, permissions, ...) surface
        # instead of being silently swallowed.
        client.indices.delete(index=name, ignore=[404])

    response = client.indices.create(index=name, body=index_body)

    print(response)

In [4]:
# Recreate the default "ai-search" index from schema.json; delete=True asks
# create_index to remove an existing index of that name first.
create_index(delete=True)
# Alternative call that does not delete an existing index first:
#create_index()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ai-search'}


## Use the same model and method to get the query embedding, with some defaults changed
Remember, the model is `intfloat/e5-small-v2` and we need to prefix any query with 'query:'

In [5]:
# Load the sentence-embedding model once at module level; get_embeddings() reads this global.
#The E5 models expect 'query:' and 'passage:' prefixes
model = SentenceTransformer('intfloat/e5-small-v2')
def get_embeddings(texts,prefix="query:"):
    """Embed one or more texts with the module-level ``model``.

    Args:
        texts: A single string, or an iterable of strings (list, tuple,
            generator, ...).
        prefix: String prepended verbatim to every text before encoding.
            The E5 models expect either a 'query:' or a 'passage:' prefix.

    Returns:
        The result of ``model.encode`` for the prefixed texts.
    """
    # NOTE(review): the E5 model card writes the prefixes with a trailing
    # space ('query: '); the prefix is concatenated as-is here -- confirm it
    # matches the way the indexed embeddings were generated.

    # A bare string is a single text, not a sequence of characters; any other
    # iterable is consumed item by item.
    if isinstance(texts, str):
        texts = [texts]
    prefixed = [prefix + text for text in texts]
    return model.encode(prefixed, show_progress_bar=False)

## Get our dataset and title_embeddings

In [6]:
# Load the first 50k records of the 'cc_news' train split from Hugging Face
dataset = load_dataset("cc_news",split='train[0:50000]')

In [7]:
# Load the title_embeddings we generated in 02-sentence-transformers.
# NOTE: pickle.load can execute arbitrary code while loading, so only open a
# file produced by this project -- never a pickle from an untrusted source.
with open('cc_news_title_embeddings_50000.pkl', 'rb') as fd:
    title_embeddings = pickle.load(fd)

## Index the data in bulk into our OpenSearch index

In [8]:
def _first_value(value):
    """Return the single value of a one-row column, or ``value`` if it is already a scalar."""
    # Strings are indexable too, so they must be recognised as scalars first;
    # otherwise a plain row would silently be reduced to first characters.
    if value is None or isinstance(value, str):
        return value
    return value[0]


def index_one(title_embedding,record,index_name="ai-search"):
    """Index a single news record, with its title embedding, into OpenSearch.

    The document is stored through the module-level ``client`` under the
    record's URL as document id.

    Args:
        title_embedding: Embedding stored in the 'title_embedding' field.
        record: Mapping-like object providing 'title', 'text', 'domain',
            'date', 'description', 'url' and 'image_url'. Each value may be
            a one-element column (as in ``dataset.select([i])``), in which
            case its first element is used, or a plain string.
        index_name: Name of the target index.

    Raises:
        KeyError: If ``record`` lacks one of the expected fields (for
            dict-like records).
    """
    fields = ('title', 'text', 'domain', 'date', 'description', 'url', 'image_url')
    document = {'title_embedding': title_embedding}
    document.update((field, _first_value(record[field])) for field in fields)
    # The URL doubles as the document id, so indexing the same article again
    # targets the same id rather than a new one.
    client.index(index=index_name, id=document['url'], body=document)

In [9]:
# Index a single document: the first title embedding together with the
# one-row selection holding the first dataset record.
# (presumably title_embeddings[i] belongs to dataset row i -- TODO confirm)
index_one(title_embeddings[0],dataset.select([0]))

In [None]:
# TODO: unfinished sketch of bulk indexing, kept commented out until completed.
# Open points visible in the draft below:
#   * get_batch_body has no body yet and `data` is still a placeholder.
#   * range(0,count//batch_size) leaves out the final partial batch whenever
#     count is not a multiple of batch_size.
#   * client.bulk(data) sits after the loop, so only the last batch would be sent.
#def get_batch_body(title_embeddings,dataset):
#    
#def index_bulk(title_embeddings,dataset,batch_size=100):
#    index = 0
#    count = len(title_embeddings)
#    for batch in range(0,count//batch_size):
#        data = [???]
#
#    response = client.bulk(data)