In [1]:
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,BRAND,PRODUCT_CATEGORY,IS_CHILD_CATEGORY_TO,OFFER,RETAILER
0,gillette venus,hair removal,health & wellness,gillette venus ® for pubic hair & skin spend $20,gillette venus
1,gillette venus,hair removal,health & wellness,gillette venus ® for pubic hair & skin,gillette venus
2,gillette venus,hair removal,health & wellness,"gillette venus ® for pubic hair & skin, spend $20",gillette venus
3,burts bees,hair removal,health & wellness,"burt's bees sensitive lotions and creams, sele...",walmart
4,burts bees,hair removal,health & wellness,"burt's bee's® facial wipes, select varieties, ...",walmart


In [3]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [4]:
def get_similarity_score(query, offer):
    # Tokenize and encode the query and offer
    tokens = tokenizer([query, offer], padding=True, truncation=True, return_tensors='pt')
    outputs = model(**tokens)
    embeddings = outputs.last_hidden_state
    
    # Get the embeddings for each sentence
    query_embedding = embeddings[0].mean(dim=1).squeeze().detach().numpy()
    offer_embedding = embeddings[1].mean(dim=1).squeeze().detach().numpy()
    
    # Reshape the embeddings to 2D arrays
    query_embedding = query_embedding.reshape(1, -1)
    offer_embedding = offer_embedding.reshape(1, -1)
    
    # Calculate cosine similarity
    similarity = cosine_similarity(query_embedding, offer_embedding)[0][0]
    return similarity


In [5]:
def search_offers(query, df):
    # Filter the DataFrame based on multiple columns and query
    mask = df.apply(lambda row: any(query.lower() in str(row[col]).lower() for col in ['BRAND', 'PRODUCT_CATEGORY', 'IS_CHILD_CATEGORY_TO', 'RETAILER']), axis=1)
    filtered_df = df[mask]
    
    # Calculate similarity score for each offer and add it to the DataFrame
    filtered_df['SIMILARITY_SCORE'] = filtered_df['OFFER'].apply(lambda x: get_similarity_score(query, x))
    
    # Sort by similarity score in descending order
    filtered_df = filtered_df.sort_values(by='SIMILARITY_SCORE', ascending=False)
    
    # Return the filtered DataFrame
    return filtered_df


In [6]:
query = input("Enter category, brand, or retailer: ")

results = search_offers(query, df)
print(len(results))
results.head(15)


37


Unnamed: 0,BRAND,PRODUCT_CATEGORY,IS_CHILD_CATEGORY_TO,OFFER,RETAILER,SIMILARITY_SCORE
259,snickers,candy,snacks,"snickers® chocolate candy bar, select varieties",snickers,0.968787
258,snickers,candy,snacks,"snickers®, select sizes, buy 1",snickers,0.96089
269,lifesavers,candy,snacks,"lifesavers®, any size, buy 1",lifesavers,0.95827
257,m&ms,candy,snacks,"m&m's® chocolate candies, select varieties",m&ms,0.951028
736,extra,gum,candy,"extra®, select varieties",extra,0.949993
261,twix,candy,snacks,"twix®, select varieties",twix,0.949412
256,m&ms,candy,snacks,"m&m's®, select sizes, buy 1",m&ms,0.949025
270,starburst,candy,snacks,"starburst®, select sizes, buy 1",starburst,0.947556
274,starburst,gum,candy,"starburst®, select sizes, buy 1",starburst,0.947556
738,orbit,gum,candy,"orbit®, select sizes, buy 1",orbit,0.94728
