In [1]:
# Import required libraries
import os
from elasticsearch import Elasticsearch
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

import warnings
warnings.filterwarnings("ignore")

In [2]:
ELASTIC_USERNAME = os.environ.get("ELASTIC_USERNAME")
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD")

ELASTIC_CERT_FILE_LOCATION = os.environ.get("ELASTIC_CERT_FILE_LOCATION")

HUGGING_FACE_MODEL = os.environ.get("HUGGING_FACE_MODEL")

ES_INDEX = os.environ.get("ES_INDEX")

In [3]:
es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=(ELASTIC_USERNAME,ELASTIC_PASSWORD),
    ca_certs=ELASTIC_CERT_FILE_LOCATION
)
es.ping()

True

### Prepare the data

In [4]:
data = pd.read_csv("myntra_products_catalog.csv")
data.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White


In [5]:
data.shape

(12491, 8)

In [6]:
data.isnull().sum()

ProductID         0
ProductName       0
ProductBrand      0
Gender            0
Price (INR)       0
NumImages         0
Description       0
PrimaryColor    894
dtype: int64

In [7]:
not_null_mask = data.notnull().all(axis=1)
not_null_df = data[not_null_mask]

print(not_null_df.shape)

(11597, 8)


In [8]:
df = not_null_df.sample(1000)

In [9]:
df.isnull().value_counts()

ProductID  ProductName  ProductBrand  Gender  Price (INR)  NumImages  Description  PrimaryColor
False      False        False         False   False        False      False        False           1000
Name: count, dtype: int64

### Convert the relevant field to Vector using BERT model

In [10]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(HUGGING_FACE_MODEL)

.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<?, ?B/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 10.3MB/s]
config.json: 100%|██████████| 571/571 [00:00<00:00, 615kB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 117kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 422kB/s]
pytorch_model.bin: 100%|██████████| 438M/438M [01:21<00:00, 5.40MB/s] 
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 235kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 817kB/s]
tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 330kB/s]
train_script.py: 100%|██████████| 13.1k/13.1k [00:00<00:00, 13.6MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 289kB/s]
modules.json: 100%|██████████| 349/349 [00:00<?, ?B/s] 


In [11]:
df["Description_Embeddings"] = df["Description"].apply(lambda x: model.encode(x))
df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor,Description_Embeddings
6373,10156229,Indian Terrain Men Off-White Slim Fit Solid Ca...,Indian Terrain,Men,2239,5,"Off-White solid casual shirt, has a spread col...",White,"[-0.022425951, 0.012131211, 0.03328043, 0.0308..."
119,10013025,Sera Women White Self Design Top,Sera,Women,714,5,"White self-design knitted regular top, has a ...",White,"[-0.030529924, 0.057417456, -0.005600156, 0.00..."
6613,10176783,Tommy Hilfiger Men Red & Navy Blue Printed Tru...,Tommy Hilfiger,Men,1049,4,"Red and Navy Blue printed trunks, has an outer...",Blue,"[0.009835771, -0.02389972, 0.0019814908, -0.03..."
1360,10030413,Geox Men Navy Blue Suede Sneakers,Geox,Men,4999,5,"A pair of round-toe navy blue sneakers, has re...",Blue,"[-0.011293902, 0.013711613, 0.0060762265, 0.01..."
11262,10255561,Pepe Jeans Boys Off-White & Black Printed Roun...,Pepe Jeans,Boys,499,4,"Off-white, red and black printed T-shirt, has ...",Black,"[0.028025815, 0.0016927083, 0.00023940434, 0.0..."


### Create new index in ElasticSearch!

In [12]:
from index_mapping import indexMapping

es.indices.create(index=ES_INDEX, mappings=indexMapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'myntra_products'})

### Ingest the data into index

In [13]:
record_list = df.to_dict("records")

In [14]:
def index_records(records):
    try:
        for record in records:
            es.index(index=ES_INDEX, document=record, id=record["ProductID"])
    except Exception as e:
        raise Exception(f"Exception while ingesting data into ElasticSearch: {e}")
        

index_records(record_list)

### Search the data

In [15]:
input_keyword = "Polo Collar T-Shirt"
input_embeddings = model.encode(input_keyword)

query = {
    "field" : "Description_Embeddings",
    "query_vector" : input_embeddings,
    "k" : 2,
    "num_candidates" : 1000, 
}

res = es.knn_search(index=ES_INDEX, knn=query , source=["ProductName","Description"])
res["hits"]["hits"]

[{'_index': 'myntra_products',
  '_id': '10255641',
  '_score': 0.92506945,
  '_source': {'ProductName': 'Pepe Jeans Boys Navy Blue & White Striped Pure Cotton Polo Collar T-shirt',
   'Description': 'Navy blue and white striped polo collar T-shirt with embroidered detail, has a polo collar, long sleeves and button closure'}},
 {'_index': 'myntra_products',
  '_id': '10143889',
  '_score': 0.9104985,
  '_source': {'ProductName': 'Chkokko Men White Solid Polo Collar T-shirt',
   'Description': 'White solid T-shirt, has a polo collar, button closure and short sleeves'}}]