In [None]:
!pip3 -q install eland sentence_transformers transformers elasticsearch


In [2]:
import time

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from sentence_transformers import SentenceTransformer

## Clean 

In [38]:
import pandas as pd

# Read the product catalogue from disk and preview its first rows.
products_path = './dataset/clean_products.csv'
df = pd.read_csv(products_path)
df.head()

Unnamed: 0,id,category,title,description,brand,mrp,price,offers,stock_availibility,product_asin,image_urls
0,eb49cc038190f6f03c272f79fbbce894,Skin Care,Lee posh Lactic Acid 60% Anti ageing Pigmenta...,PROFESSIONAL GRADE Face Peel: this peel stimul...,Lee Posh,2000.0,799.0,60.05%,True,B072BGHNJ1,https://images-na.ssl-images-amazon.com/images...
1,1657cc30c438affede6a5060d6847363,Skin Care,Branded SLB Works New 1.5mm Titanium 1200 nee...,Item name: 1.5mm titanium 1200 needles microne...,SLB Works,2040.0,2040.0,0%,True,B07QDTZYSJ,https://images-na.ssl-images-amazon.com/images...
2,41654633cce38c8650690f6dbac01fd3,Skin Care,Generic 1 Pc brand snail eye cream remove dar...,"Use: eye, item type: cream, net wt: 20g, gzzz:...",Generic,1824.0,1042.0,42.87%,True,B07DCSN8MP,https://images-na.ssl-images-amazon.com/images...
3,08b1bd85c3efc2d7aa556fd79b073382,Skin Care,Generic Anti Snoring Snore Stopper Sleep Apne...,Prevent the tongue from dropping backward or b...,Generic,2185.0,1399.0,35.97%,True,B07GLW9VQN,https://images-na.ssl-images-amazon.com/images...
4,3ac3f213732512d1d11bb73ab3b1900f,Grocery & Gourmet Foods,Harveys Crunchy & Creame Gourmet Delicacies C...,Harvey's wafer Cream Wafer 110g. Made in India,Harveys,594.0,570.0,4.04%,True,B07NFYYLF1,https://images-na.ssl-images-amazon.com/images...


In [39]:
# Display the dtype pandas inferred for every column of the product frame.
df.dtypes

id                    object
category              object
title                 object
description           object
brand                 object
mrp                   object
price                 object
offers                object
stock_availibility      bool
product_asin          object
image_urls            object
dtype: object

In [45]:
# Count how many products fall into each category.
# NOTE(review): the saved output shows "and" in place of "&", so this cell
# appears to have been executed after the normalisation cell that follows it;
# confirm the intended order before relying on Restart & Run All.
df['category'].value_counts()

Skin Care                    13656
Grocery and Gourmet Foods     7369
Hair Care                     2261
Fragrance                     1966
Bath and Shower               1960
Detergents and Dishwash        176
Name: category, dtype: int64

In [44]:
# Normalise category labels by spelling out "&" as "and"
# (e.g. "Grocery & Gourmet Foods" -> "Grocery and Gourmet Foods").
# The vectorised .str accessor leaves missing categories as NaN, whereas
# calling str.replace through apply() raises AttributeError on a NaN value.
# regex=False makes the literal (non-regex) replacement explicit.
df["category"] = df["category"].str.replace("&", "and", regex=False)

In [49]:
# Convert the price column to numbers; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
price_numeric = pd.to_numeric(df['price'], errors='coerce')
df['price'] = price_numeric

## Encode 

In [5]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model used for both indexing and querying.
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [53]:
# Build the text that gets embedded: the product title followed by its
# description, each introduced by a short natural-language prefix.
name_part = 'Product name is ' + df['title']
description_part = '. With description: ' + df['description']
df['title_description'] = name_part + description_part

In [54]:
# Embed every product text with the sentence-transformer model.
# Encoding the whole column in one call lets the model batch its forward
# passes, which is far faster than one encode() call per row.
# Rows whose text is missing are skipped and end up with NaN in
# "description_vector" instead of being passed to the model.
texts = df["title_description"]
has_text = texts.notna()
embeddings = model.encode(texts[has_text].tolist(), show_progress_bar=True)
# One 1-D vector per row, aligned back onto the rows that were encoded.
df["description_vector"] = pd.Series(
    list(embeddings), index=texts.index[has_text], dtype=object
)


## Indexing data

In [3]:
# Connect to the local Elasticsearch node and check that it responds.
es_url = 'http://localhost:9200'
client = Elasticsearch(es_url)
client.ping()

True

In [65]:
# Drop any previous copy of the index so the notebook can rebuild it from scratch.
# ignore_unavailable=True keeps this cell from failing on a fresh cluster where
# the index does not exist yet (otherwise Restart & Run All breaks here).
client.indices.delete(index='ecommerce_search', ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

In [66]:
# Derive the vector size from the loaded model instead of hard-coding it:
# Elasticsearch rejects documents whose vector length differs from the
# mapping's "dims", and the size depends on the model in use
# (e.g. 384 for all-MiniLM-L6-v2, 768 for all-mpnet-base-v2).
embedding_dims = model.get_sentence_embedding_dimension()

# Index mapping: product attributes as analysed text (price as a number,
# stock flag as a boolean) plus the dense vector used for kNN search.
mapping = {
    "properties": {
        "id": {"type": "text"},
        "description": {"type": "text"},
        "category": {"type": "text"},
        "title": {"type": "text"},
        "brand": {"type": "text"},
        "mrp": {"type": "text"},
        "price": {"type": "double"},
        "offers": {"type": "text"},
        "stock_availibility": {"type": "boolean"},
        "product_asin": {"type": "text"},
        "image_urls": {"type": "text"},
        "description_vector": {  # embedding of "title_description"
            "type": "dense_vector",
            "dims": embedding_dims,
            "index": True,  # required for approximate kNN search
            # Similarity function used to compare vectors during kNN search.
            "similarity": "l2_norm",
        },
    },
}
client.indices.create(index="ecommerce_search", mappings=mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ecommerce_search'})

In [62]:
# Keep only rows in which every column has a value.
df1 = df.dropna(axis=0, how='any')

In [67]:
# Turn the frame into a list of {column: value} dicts, one per product.
record_list = df1.to_dict(orient="records")

In [68]:
# Index all products with the bulk helper: documents are sent in batches
# instead of one HTTP round-trip per product.
# Each document keeps its product "id" as the Elasticsearch _id, so re-running
# this cell overwrites documents rather than duplicating them.
bulk_actions = (
    {"_index": "ecommerce_search", "_id": record["id"], "_source": record}
    for record in record_list
)
# Best-effort like a per-document try/except: failures are collected and
# reported below rather than aborting the whole run.
indexed_count, failures = helpers.bulk(
    client,
    bulk_actions,
    raise_on_error=False,
    raise_on_exception=False,
)
print(f"Indexed {indexed_count} documents; {len(failures)} failed")
for failure in failures[:10]:  # show a sample, not thousands of lines
    print(failure)
    

In [69]:
# Refresh first so documents indexed moments ago are visible; without it the
# count can under-report when this cell runs right after indexing.
client.indices.refresh(index='ecommerce_search')
client.count(index='ecommerce_search')

ObjectApiResponse({'count': 27328, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

## Test 

In [8]:
# Hybrid search smoke test: a lexical match on the title combined with a kNN
# search on the description embedding, restricted to one category.
input_keyword = "Lee posh Lactic Acid 60%"
start_time = time.time()
res = client.search(
    index='ecommerce_search',
    from_=0,
    size=12,
    # Price (descending) is the primary sort key; _score only breaks ties,
    # so the printed hits are NOT ordered by relevance.
    sort=[{'price': {'order': 'desc'}}, '_score'],
    query={
        "match": {
            "title": {
                "query": input_keyword,
                "boost": 0.2,
            }
        }
    },
    post_filter={
        "match": {
            "category": {
                "query": "Skin Care",
                # "category" is an analysed text field and match defaults to
                # OR between terms, which would also admit e.g. "Hair Care".
                # Requiring every term keeps the filter to the one category.
                "operator": "and",
            }
        }
    },
    fields=['title', 'description', 'category', 'price'],
    knn={
        "field": "description_vector",
        # Plain list of floats so the request body is JSON-serialisable
        # regardless of the client's numpy support.
        "query_vector": model.encode(input_keyword).tolist(),
        "k": 10,
        "num_candidates": 50,
        "boost": 0.8,
    },
)
# Measure the search round-trip only, not the time spent printing results.
elapsed = time.time() - start_time

hits = res["hits"]["hits"]
if not hits:
    print("No matches found")
else:
    for hit in hits:
        source = hit["_source"]
        score = hit["_score"]
        product = source["title"]
        category = source["category"]
        price = source["price"]

        print(
            f"\nScore: {score}\nProduct: {product}\nCategory: {category}\n Price: {price}"
        )

print("***************************")
print(elapsed)
# Not used in this cell; kept in case a later cell reads it.
t = "Ecoplanet Aromatherapy"


Score: 0.72413087
Product:  Kopari Sundaze Mineral Face Sunscreen Lotion SPF 30 | Fragrance Free Zinc Oxide Mineral-Based Daily Sunscreen with Hyaluronic Acid and Coconut Water 
Category: Skin Care
 Price: 9670.0

Score: 1.6433403
Product:  Exfoliating Body Lotion – 12% Lactic Acid body lotion, provides immediate moisture and gentle exfoliation to treat symptoms associated with keratosis pilaris, dry skin, flakiness, chicken skin, and other dry skin conditions. 
Category: Skin Care
 Price: 9110.0

Score: 1.0679411
Product:  Replenix Pure Hydration Hyaluronic Acid Serum - 1 fl oz 
Category: Skin Care
 Price: 8767.0

Score: 0.9624846
Product:  YASHUS Hyaluronic Acid Face Cream Skin Care Whitening Reverse Aging Repair Moisture 
Category: Skin Care
 Price: 8736.0

Score: 1.1926383
Product:  60 pcs/bottle Gold Crystal Collagen Eye Patches Eye Mask 
Category: Skin Care
 Price: 8190.0

Score: 0.8264655
Product:  SWD prime Hyaluronic Cleanser Amino acid gentle facial cleaning formula Gently c