In [1]:
import numpy as np
import datetime
from opensearchpy import helpers
from sentence_transformers import SentenceTransformer
from opensearchpy import OpenSearch, RequestsHttpConnection

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Endpoint handed to get_client() when the connection is opened.
SERVER_URL = "http://localhost:9200"
# Name of the index that the cells below delete, re-create and fill.
INDEX_NAME = "llama-mix-index"
# Sentence-embedding model; its encode() output is what gets stored as the vector field.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2') 


In [3]:

def normalize_data(data):
    """Scale ``data`` to unit L2 norm.

    Args:
        data: Array-like vector to normalise.

    Returns:
        ``data`` divided by its 2-norm. An all-zero (or empty) input has no
        direction, so it is returned as zeros instead of being divided by
        zero, which would produce NaNs and a RuntimeWarning.
    """
    vector = np.asarray(data)
    norm = np.linalg.norm(vector, ord=2)
    if norm == 0:
        # Dividing by 1.0 keeps the float result dtype of the normal path
        # while leaving the zeros untouched.
        return vector / 1.0
    return vector / norm

In [33]:
def store_index(index_name: str, data: np.array, metadata: list, os_client: OpenSearch):
    """Embed every metadata text and bulk-index the results into ``index_name``.

    Args:
        index_name: Target index; it is also the index refreshed after upload.
        data: Sized iterable that decides how many records are indexed. Its
            items are not read; position ``i`` selects ``metadata[i]``.
        metadata: Records holding the source string under the ``"text"`` key.
        os_client: Client passed to ``helpers.bulk`` and used for the refresh.

    Each document uses its position as ``_id``, the normalised embedding as
    ``description_vector`` and the raw string as ``text_field``.
    """
    documents = []
    for index_num, _ in enumerate(data):
        text_field = metadata[index_num]["text"]
        norm_text_vector_np = normalize_data(model.encode(text_field))
        documents.append(
            {
                "_index": index_name,
                "_id": index_num,
                "description_vector": norm_text_vector_np.tolist(),
                "text_field": text_field,
            }
        )

    if not documents:
        # Nothing to upload: skip the bulk call and the progress message,
        # which would otherwise reference an index number that never existed.
        return

    helpers.bulk(os_client, documents, request_timeout=1800)
    print(f"bulk {len(documents) - 1} indexed successfully")
    # Refresh the index that was actually written to (the parameter, not the
    # notebook-level constant) so the new documents become searchable.
    os_client.indices.refresh(index=index_name)

In [14]:
def get_vector_dimension(metadata: list):
    """Return the length of the embedding produced for the first record's text.

    Args:
        metadata: Records with a ``"text"`` key; only the first one is encoded.
    """
    first_text = metadata[0]["text"]
    return len(model.encode(first_text))

In [7]:
def create_index(index_name: str, os_client: OpenSearch, metadata: list):
    """Create ``index_name`` with a vector field and a text field.

    Args:
        index_name: Name of the index to create.
        os_client: Client used to issue the create request.
        metadata: Records with a ``"text"`` key; the first one is encoded to
            find out the vector dimension for the mapping.

    Returns:
        Whatever ``os_client.indices.create`` returns, so callers can inspect
        the server's response.
    """
    vector_dimension = get_vector_dimension(metadata)
    mapping = {
        "mappings": {
            "properties": {
                "description_vector": {
                    "type": "knn_vector",
                    "dimension": vector_dimension,
                },
                "text_field": {
                    "type": "text",
                    "analyzer": "standard",
                }
            }
        },
        "settings": {
            "index": {
                "number_of_shards": "1",
                # NOTE(review): index-level k-NN is switched off here even
                # though the field is a knn_vector; confirm this is intended
                # for how the index is queried.
                "knn": "false",
                "number_of_replicas": "0"
            }
        }

    }
    return os_client.indices.create(index=index_name, body=mapping)

In [8]:
def delete_index(index_name: str, os_client: OpenSearch):
    """Delete the index called ``index_name`` using ``os_client``.

    The name is passed by keyword, the same way ``create_index`` passes it,
    so the call does not depend on the positional layout of the client method.
    """
    os_client.indices.delete(index=index_name)

In [9]:
def get_client(server_url: str) -> OpenSearch:
    """Build an OpenSearch client for ``server_url`` and log the connection time.

    Args:
        server_url: Address of the server to connect to.

    Returns:
        A client configured without SSL or certificate verification, using
        ``RequestsHttpConnection`` as its connection class.
    """
    # Connect to the URL the caller asked for rather than the notebook-level
    # constant, so the function works for any server.
    os_client_instance = OpenSearch(server_url, use_ssl=False, verify_certs=False,
                                    connection_class=RequestsHttpConnection)
    print("OS connected")
    print(datetime.datetime.now())
    return os_client_instance

In [26]:
def load_file(file_path):
    """Read a text file into a list of ``{"text": line}`` records.

    Args:
        file_path: Path of the text file to read.

    Returns:
        One dict per non-blank line, in file order, with surrounding
        whitespace stripped from the text.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as data_file:
        stripped_lines = (raw_line.strip() for raw_line in data_file)
        data_objects = [{"text": text} for text in stripped_lines if text != ""]
    print("Done")
    return data_objects


In [34]:
os_client = get_client(SERVER_URL)
# Drop the index only when it is already there, so this cell also succeeds
# on a fresh server where the index has never been created.
if os_client.indices.exists(index=INDEX_NAME):
    delete_index(INDEX_NAME, os_client)

OS connected
2023-09-26 13:06:29.418495


In [35]:
# Text file to index; load_file() turns each non-blank line into one record.
DATA_PATH = "data-bak/paul_graham_essay.txt"
metadata = load_file(DATA_PATH)
# Create the index; the vector dimension in its mapping is derived from the
# first loaded record.
res = create_index(INDEX_NAME, os_client, metadata)
# The records are passed twice: once as `data` (iterated to count the
# documents) and once as `metadata` (the source of each text).
store_index(INDEX_NAME, metadata, metadata, os_client)

Done
bulk 170 indexed successfully
