# 2) Index ESCI Data

After the OpenSearch preparations are done we can move towards indexing products.

We're using the ESCI dataset.

In [58]:
import json
import os

import mercury as mr
import numpy as np
import pandas as pd
import requests
from opensearchpy import OpenSearch

In [59]:
# Root directory of the ESCI shopping queries dataset.
# Set the ESCI_DATA_DIR environment variable to point at your own checkout;
# without it the original local path is used, so existing setups keep working.
# os.path.join(..., '') guarantees a trailing path separator, because the
# directory is combined with file names without inserting a separator.
DATA_DIR = os.path.join(
    os.environ.get(
        'ESCI_DATA_DIR',
        '/Users/danielwrigley/work/Testing/git_repos/esci-data/shopping_queries_dataset/',
    ),
    '',
)

# Load Data

In [60]:
# Read the product table of the ESCI dataset from its parquet file.
products_path = f"{DATA_DIR}shopping_queries_dataset_products.parquet"
df_products = pd.read_parquet(products_path)

In [61]:
# Preview the first five products (five rows is the default of head()).
df_products.head()

Unnamed: 0,product_id,product_title,product_description,product_bullet_point,product_brand,product_color,product_locale
0,B079VKKJN7,"11 Degrees de los Hombres Playera con Logo, Ne...",Esta playera con el logo de la marca Carrier d...,11 Degrees Negro Playera con logo\nA estrenar ...,11 Degrees,Negro,es
1,B079Y9VRKS,Camiseta Eleven Degrees Core TS White (M),,,11 Degrees,Blanco,es
2,B07DP4LM9H,11 Degrees de los Hombres Core Pull Over Hoodi...,La sudadera con capucha Core Pull Over de 11 G...,11 Degrees Azul Core Pull Over Hoodie\nA estre...,11 Degrees,Azul,es
3,B07G37B9HP,11 Degrees Poli Panel Track Pant XL Black,,,11 Degrees,,es
4,B07LCTGDHY,11 Degrees Gorra Trucker Negro OSFA (Talla úni...,,,11 Degrees,Negro (,es


In [62]:
# Number of products across all locales.
len(df_products)

1814924

In [63]:
# Keep only the products of the 'us' locale and show how many remain.
is_us_locale = df_products['product_locale'].eq('us')
df_products_us = df_products.loc[is_us_locale]
len(df_products_us)

1215854

In [64]:
# Small sample: the first ten US products.
df_sample = df_products_us.iloc[:10]

## Augment the existing ecommerce schema
Read in the existing ecommerce schema and augment it with the knn settings and the new ingestion pipeline for generating embeddings.
This ensures that when we reindex data, we are running the `nlp-ingest-pipeline` to get the embeddings on the title field.


Note that the number of dimensions is set to 384 – this is the third time we encounter this number. It ensures that the index can store the embeddings generated by the model.

In [65]:
# Path to the JSON file holding the existing ecommerce index schema
json_file_path = './schema.json'

with open(json_file_path, 'r') as file:
    schema = json.load(file)

# Index settings needed for neural search: enable k-NN on the index and send
# every indexed document through the embedding ingest pipeline.
new_settings = {
  "settings": {
    "index.knn": True,
    "default_pipeline": "nlp-ingest-pipeline"
  }
}

# Vector field that stores the title embeddings. The dimension has to be the
# size of the vectors generated by the embedding model.
new_mappings = {
  "mappings": {
    "properties": {
      "title_embedding": {
        "type": "knn_vector",
        "dimension": 384,
        "method": {
          "engine": "lucene",
          "space_type": "l2",
          "name": "hnsw",
          "parameters": {}
        }
      }
    }
  }
}

# Merge the additions into the loaded schema. The additions are unpacked last
# so they take precedence: a stale "index.knn" or "default_pipeline" value in
# schema.json must not silently disable the embedding setup. Using .get() /
# setdefault() keeps the cell working when schema.json has no "settings" or
# "mappings.properties" section yet.
settings = {**schema.get('settings', {}), **new_settings['settings']}
mappings = schema.setdefault('mappings', {})
properties = {**mappings.get('properties', {}), **new_mappings['mappings']['properties']}

schema['settings'] = settings
mappings['properties'] = properties
mr.JSON(schema, level=2)

In [66]:
# Re-create the `ecommerce` index from scratch with the augmented schema.
url = "http://localhost:9200/ecommerce"

headers = {
    'Content-Type': 'application/json'
}

payload = schema

# Drop a previous version of the index. A 404 only means there was nothing to
# delete; any other failure is reported instead of being ignored.
response = requests.request("DELETE", url, headers=headers)
if response.status_code not in (200, 404):
    raise RuntimeError(f"Deleting the index failed ({response.status_code}): {response.text}")

response = requests.request("PUT", url, headers=headers, data=json.dumps(payload))
# Stop here if the schema was rejected. Continuing would most likely let the
# indexing step auto-create the index without the k-NN mapping.
if not response.ok:
    raise RuntimeError(f"Creating the index failed ({response.status_code}): {response.text}")
mr.JSON(response.json(), level=4)

In [67]:
# Connection details of the local OpenSearch node.
# No credentials are sent; pass http_auth=(user, password) to OpenSearch(...)
# if your cluster requires basic auth.
host = 'localhost'
port = 9200

client = OpenSearch(
    hosts=[dict(host=host, port=port)],
    http_compress=True,   # compress request bodies
    use_ssl=False,        # switch on when the cluster is served over SSL
    verify_certs=False,   # switch on to verify SSL certificates
)

# Target index for the product documents
index_name = 'ecommerce'

# Send the US products to OpenSearch one request per row, using the
# product_id as the document id.
for _, product in df_products_us.iterrows():
    doc = product.to_dict()
    response = client.index(index=index_name, id=doc['product_id'], body=doc)

In [68]:
# Look up the id of a model known to the ML Commons plugin.
url = "http://localhost:9200/_plugins/_ml/models/_search"

payload = {
  "query": {
    "match_all": {}
  },
  "size": 1
}

response = requests.request("POST", url, headers=headers, data=json.dumps(payload))

# Fail with a readable message instead of an IndexError/KeyError when the
# search returned an error body or no model at all.
hits = response.json().get('hits', {}).get('hits', [])
if not hits:
    raise RuntimeError(f"No ML model found ({response.status_code}): {response.text}")

# Prefer the model_id stored in the hit's _source. If the hit carries no such
# field, fall back to the document id.
# NOTE(review): presumably that hit is the model document itself, whose _id is
# the model id - confirm against your OpenSearch version.
first_hit = hits[0]
model_id = first_hit.get('_source', {}).get('model_id', first_hit['_id'])

In [69]:

# Create (or overwrite) the search pipeline used for hybrid queries.
url = "http://localhost:9200/_search/pipeline/hybrid-search-pipeline"

print(f"Setting default model id to: {model_id}")
payload = {
  # Lets neural queries omit the model id: it is filled in from these defaults.
  "request_processors": [
    {
      "neural_query_enricher" : {
        "description": "Sets the default model ID at index and field levels",
        "default_model_id": model_id,
        "neural_field_default_id": {
           # Must be the name of the vector field as defined in the index
           # mapping and used in the neural query: "title_embedding".
           "title_embedding": model_id
        }
      }
    }
  ],
  # Normalizes the scores of the sub-queries and combines them with a
  # weighted arithmetic mean; the weights follow the order of the sub-queries
  # in the hybrid query.
  "phase_results_processors": [
    {
      "normalization-processor": {
        "normalization": {
          "technique": "min_max"
        },
        "combination": {
          "technique": "arithmetic_mean",
          "parameters": {
            "weights": [
              0.3,
              0.7
            ]
          }
        }
      }
    }
  ]
}


response = requests.request("PUT", url, headers=headers, data=json.dumps(payload))
mr.JSON(response.json(), level=4)


Setting default model id to: zZrgMpIBFSlgWAuG9zO3


In [70]:
# Run a hybrid (keyword + neural) search through the hybrid search pipeline.
url = "http://localhost:9200/ecommerce/_search?search_pipeline=hybrid-search-pipeline"

# Both sub-queries search for the same text.
query_text = "ELECTRONICS"

# Lexical part: full-text match on the title text field.
keyword_clause = {"match": {"title_text": {"query": query_text}}}

# Semantic part: k-NN search on the title embedding, 50 nearest neighbours.
neural_clause = {"neural": {"title_embedding": {"query_text": query_text, "k": 50}}}

payload = {
  # Leave the embedding vectors out of the returned documents.
  "_source": {"excludes": ["title_embedding"]},
  "query": {"hybrid": {"queries": [keyword_clause, neural_clause]}}
}

response = requests.get(url, headers=headers, data=json.dumps(payload))
mr.JSON(response.json(), level=5)