In [1]:
%cd ../data/ecommerce/home-depot-product-search-relevance/
!pwd

/Users/felicildaloveme/personal_projects/data/ecommerce/home-depot-product-search-relevance
/Users/felicildaloveme/personal_projects/data/ecommerce/home-depot-product-search-relevance


In [2]:
import warnings
from collections import defaultdict
from random import choice

import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

warnings.filterwarnings("ignore")

In [3]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [4]:
# prod_ngrams = pd.read_json('./prod_ngrams.json', lines=True)
# prod_ngrams.head(3)

In [5]:
data = pd.read_pickle("./cleaned_train_data.pkl")
data.head(3)

Unnamed: 0,product_uid,product_title,search_term,relevance,combined_attr,brand,product_description,cleaned_title,cleaned_brand,cleaned_description,cleaned_attributes,cleaned_search,corrected_search
0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,Bullet01 Versatile connector for various 90Â° ...,Simpson Strong-Tie,"Not only do angles make joints stronger, they ...",simpson strong tie 12 gauge angle,simpson strong tie,angles make joints stronger also provide consi...,versatile connector various 90â connections ho...,angle bracket,angle bracket
1,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,Bullet01 Versatile connector for various 90Â° ...,Simpson Strong-Tie,"Not only do angles make joints stronger, they ...",simpson strong tie 12 gauge angle,simpson strong tie,angles make joints stronger also provide consi...,versatile connector various 90â connections ho...,l bracket,l bracket
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,"Application Method Brush,Roller,Spray Assemble...",BEHR Premium Textured DeckOver,BEHR Premium Textured DECKOVER is an innovativ...,behr premium textured deckover 1 gallon sc 141...,behr premium textured deckover,behr premium textured deckover innovative soli...,application method brush roller spray assemble...,deck,deck oven


In [6]:
product_data = (
    data[
        [
            "product_uid",
            "cleaned_title",
            "cleaned_description",
            "cleaned_attributes",
            "cleaned_brand",
        ]
    ]
    .drop_duplicates(subset=["product_uid"])
    .reset_index(drop=True)
)

product_data.head()

Unnamed: 0,product_uid,cleaned_title,cleaned_description,cleaned_attributes,cleaned_brand
0,100001,simpson strong tie 12 gauge angle,angles make joints stronger also provide consi...,versatile connector various 90â connections ho...,simpson strong tie
1,100002,behr premium textured deckover 1 gallon sc 141...,behr premium textured deckover innovative soli...,application method brush roller spray assemble...,behr premium textured deckover
2,100005,delta vero 1 handle shower faucet trim kit chr...,update bathroom delta vero single handle showe...,bath faucet type combo tub shower built water ...,delta
3,100006,whirlpool 1 9 cu feet range convection microwa...,achieving delicious results almost effortless ...,appliance type range microwave assembled depth...,whirlpool
4,100007,lithonia lighting quantum 2 light black led em...,quantum adjustable 2 light led black emergency...,battery power type ni cad battery size built b...,lithonia lighting


In [7]:
corpus = product_data.apply(
    lambda row: " ".join(
        [
            row["cleaned_title"],
            row["cleaned_description"],
            row["cleaned_attributes"],
            row["cleaned_brand"],
        ]
    ),
    axis=1,
).tolist()
inverted_index = defaultdict(set)

for docid, c in enumerate(corpus):
    for word in c.split():
        inverted_index[word].add(docid)

In [8]:
def process_and_search(query):
    matched_documents = set()
    for word in query.split():
        matches = inverted_index.get(word)
        if matches:
            matched_documents |= matches
    return matched_documents

In [9]:
# descrip_embeddings = embedder.encode(corpus, convert_to_tensor=True)
# torch.save(descrip_embeddings, './home_depo_descrip_embeddings.pt')
descrip_embeddings = torch.load("./home_depo_descrip_embeddings.pt")

In [10]:
high_relevance_queries = data[data.relevance == 3].corrected_search.tolist()

In [11]:
sample_query = choice(high_relevance_queries)

top_k = 10
query_vector = embedder.encode(sample_query, convert_to_tensor=True)
descrip_vectors = descrip_embeddings[list(process_and_search(sample_query))]
cos_scores = util.pytorch_cos_sim(query_vector, descrip_vectors)[0]
top_results = torch.topk(cos_scores, k=top_k)

print(
    "\n\n================================================================================="
)
print("Query:", sample_query)
print("\nTop 5 most similar products:\n")

product_titles = product_data.cleaned_title.tolist()
for score, idx in zip(top_results[0].tolist(), top_results[1].tolist()):
    print("\t", product_titles[idx], "(Score: {:.4f})".format(score))
print(
    "====================================================================================="
)



Query: satin nickel pull

Top 5 most similar products:

	 gibraltar mailboxes tuff body post mount mailbox black (Score: 0.5224)
	 amana 11 500 btu 230 208 volt wall heat pump 3 5 kw electric heat remote (Score: 0.5046)
	 duraheat plastic siphon pump (Score: 0.4983)
	 elegant home fashions albion 22 inch w mdf white wall cabinet (Score: 0.4978)
	 dewalt 20 volt max lithium ion cordless combo kit 4 tool 6 1 2 inch circular saw tool (Score: 0.4943)
	 viagrow 25 feet mylar 2 mil reflective film (Score: 0.4854)
	 echo 165 mph 391 cfm low noise handheld gas blower (Score: 0.4814)
	 honda 21 inch push mover walk behind gas mower california compliant (Score: 0.4782)
	 rust oleum stops rust 12 ounce protective enamel satin hunter green spray paint 6 pack (Score: 0.4661)
	 mayfair lift soft round closed front toilet seat black (Score: 0.4574)
