# 2) Index ESCI Data

After the OpenSearch preparations are done we can move towards indexing products.

We're using the [ESCI](https://github.com/amazon-science/esci-data) dataset.

In [17]:
import pandas as pd
import numpy as np
import mercury as mr
import requests
import json
from opensearchpy import OpenSearch

# Load Data

In [18]:
# Load the full ESCI products table (all locales) from the local parquet file
df_products = pd.read_parquet('../data/shopping_queries_dataset_products.parquet')

In [19]:
# Preview the first five rows to see which columns are available
df_products.head(5)

Unnamed: 0,product_id,product_title,product_description,product_bullet_point,product_brand,product_color,product_locale
0,B079VKKJN7,"11 Degrees de los Hombres Playera con Logo, Ne...",Esta playera con el logo de la marca Carrier d...,11 Degrees Negro Playera con logo\nA estrenar ...,11 Degrees,Negro,es
1,B079Y9VRKS,Camiseta Eleven Degrees Core TS White (M),,,11 Degrees,Blanco,es
2,B07DP4LM9H,11 Degrees de los Hombres Core Pull Over Hoodi...,La sudadera con capucha Core Pull Over de 11 G...,11 Degrees Azul Core Pull Over Hoodie\nA estre...,11 Degrees,Azul,es
3,B07G37B9HP,11 Degrees Poli Panel Track Pant XL Black,,,11 Degrees,,es
4,B07LCTGDHY,11 Degrees Gorra Trucker Negro OSFA (Talla úni...,,,11 Degrees,Negro (,es


In [20]:
# Total number of product rows across all locales
df_products.shape[0]

1814924

In [21]:
# Keep only the rows whose product_locale is 'us', then show how many remain
is_us_locale = df_products['product_locale'].eq('us')
df_products_us = df_products.loc[is_us_locale]
df_products_us.shape[0]

1215854

In [22]:
# Drop the reference to the full table; the cells below only use df_products_us
del df_products

## Augment the existing ecommerce schema
Read in the existing ecommerce schema and augment it with the knn settings and the new ingestion pipeline for generating embeddings.
This ensures that when we reindex data, we are running the `nlp-ingest-pipeline` to get the embeddings on the title field.


Note that the number of dimensions is set to 384 — this is the third time we encounter this number. It makes sure the field can store the embeddings generated by the model.

In [23]:
# Location of the base index schema (JSON) that gets augmented below
json_file_path = '../configs/schema.json'

with open(json_file_path, 'r') as schema_file:
    schema = json.load(schema_file)

# Index-level additions: turn on k-NN for the index and route every indexed
# document through the embedding ingest pipeline by default.
new_settings = {
    "settings": {"index.knn": True, "default_pipeline": "nlp-ingest-pipeline"}
}

# Mapping addition: a 384-dimensional knn_vector field holding the title embedding.
new_mappings = {
    "mappings": {
        "properties": {
            "title_embedding": {
                "type": "knn_vector",
                "dimension": 384,
                "method": {
                    "engine": "lucene",
                    "space_type": "l2",
                    "name": "hnsw",
                    "parameters": {},
                },
            }
        }
    }
}

# Start from the additions and layer the entries from the file on top, so a key
# that already exists in schema.json keeps the value defined in the file.
settings = dict(new_settings['settings'])
settings.update(schema['settings'])

properties = dict(new_mappings['mappings']['properties'])
properties.update(schema['mappings']['properties'])

# Write the merged sections back and display the resulting schema
schema['settings'] = settings
schema['mappings']['properties'] = properties
mr.JSON(schema, level=2)

In [24]:
# Endpoint of the index that is dropped and recreated with the augmented schema
url = "http://localhost:9200/ecommerce"

headers = {'Content-Type': 'application/json'}

payload = schema

# Remove any existing index first; the outcome of the DELETE is not inspected
requests.delete(url, headers=headers)

# Create the index from the augmented schema and display the server's reply
response = requests.put(url, headers=headers, data=json.dumps(payload))
mr.JSON(response.json(), level=4)

In [34]:
# Create OpenSearch client
host = 'localhost'
port = 9200

client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # Enable HTTP compression
    use_ssl=False,       # Set to True if SSL is enabled on your cluster
    verify_certs=False   # Set to True if SSL certificates should be verified
)

# Indexing DataFrame into OpenSearch
index_name = 'ecommerce'

# One dict per product row, keyed by column name
docs = df_products_us.to_dict(orient='records')

# Each entry is one bulk operation: an action line naming the target index and
# document id, a newline, then the document source. The action line is built
# with json.dumps (not string formatting) so that a quote or backslash in a
# product_id is escaped and the line stays valid JSON. The id is passed through
# str() so it is always sent as a JSON string.
actions = [
    json.dumps({"index": {"_index": index_name, "_id": str(doc["product_id"])}})
    + "\n"
    + json.dumps(doc)
    for doc in docs
]

len(actions)

1215854

In [35]:
def split_into_batches(actions, batch_size):
    """Yield consecutive slices of ``actions`` holding at most ``batch_size`` items.

    Slices are produced lazily and in order; the final slice is shorter when
    the length of ``actions`` is not a multiple of ``batch_size``.
    """
    total = len(actions)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield actions[start:stop]

In [40]:
batch_size = 10_000  # Set the batch size

# Send the actions to the bulk endpoint one batch at a time.
for batch in split_into_batches(actions, batch_size):
    response = client.bulk(index=index_name, body=batch)
    # The bulk API reports per-document failures in the response body instead
    # of raising, so the `errors` flag has to be inspected explicitly.
    if response.get('errors'):
        failed = [
            item['index'] for item in response.get('items', [])
            if 'error' in item.get('index', {})
        ]
        raise RuntimeError(
            f"{len(failed)} document(s) failed to index; first failures: {failed[:3]}"
        )

# Make the newly indexed documents visible to search right away, so a count
# query run immediately afterwards sees all of them.
client.indices.refresh(index=index_name)

## Check that all documents were indexed


In [41]:
# Number of rows that were prepared for indexing (the expected document count)
df_products_us.shape[0]

1215854

In [42]:
# Search endpoint of the index that was just populated
url = "http://localhost:9200/ecommerce/_search"

# Match every document, request that the total hit count be tracked, and
# return no hits in the body (size 0) since only the total is of interest.
payload = {
    "query": {"match_all": {}},
    "track_total_hits": "true",
    "size": 0,
}

response = requests.post(url, headers=headers, data=json.dumps(payload))
mr.JSON(response.json(), level=3)

In [43]:
# Compare the DataFrame row count with the total hit count reported by the search response
df_products_us.shape[0] == response.json()['hits']['total']['value']

True