<a href="https://colab.research.google.com/github/mohitchauhan/ml-samples/blob/main/similar_products.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow



In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import scipy.sparse as sp


In [15]:


url = "https://drive.google.com/uc?export=download&id=1PkF-wuF8cfV0PWt5L-z5_4PxuZxZeIjg"
df = pd.read_csv(url)

# Build one free-text field per product: title, description, product type and
# vendor joined by single spaces, with missing values replaced by "".
df["text"] = df["title"].fillna("").str.cat(
    [df[column].fillna("") for column in ("description", "product_type", "vendor")],
    sep=" ",
)


## Analyse data


In [6]:
# Quick sanity checks on the loaded frame: sample rows, schema, summary
# statistics and per-column non-null counts.
print("Data loaded successfully. First 5 rows:")
display(df.head())

print("\nData types and non-null counts:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appended a stray "None" to the cell output.
df.info()

print("\nDescribe")
display(df.describe())

print("\nCount")
display(df.count())

Data loaded successfully. First 5 rows:


Unnamed: 0,id,title,description,product_type,min_variant_price_amount,vendor,text
0,7793540956310,CRP 1015 L shape Sofa Set,Frame: Teak Frame Finish: Melamine Polish Upho...,Sofa,5500.0,Vendor 1,CRP 1015 L shape Sofa Set Frame: Teak Frame Fi...
1,8191747784854,RFP 1399 High Back Chair-Set of 2,Frame: Teak With Cane Frame Finish: PU Polish ...,Bedroom Chair,43000.0,Vendor 2,RFP 1399 High Back Chair-Set of 2 Frame: Teak ...
2,7445034336406,NS 295 Cushion Bed Head,Cushioned Bed Head and Tail (2 pieces) The pro...,Cushion Bed Head,7500.0,Vendor 3,NS 295 Cushion Bed Head Cushioned Bed Head and...
3,8072457748630,Customer Requested Product,"Side Table CP Teak Wood 21"" Dia x Height 27""-RAW",Furniture,5015.0,Vendor 1,Customer Requested Product Side Table CP Teak ...
4,8303328624790,Customer Requested Product,Manual recliner -FINISHED,Furniture,18000.0,Vendor 1,Customer Requested Product Manual recliner -FI...



Data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15476 entries, 0 to 15475
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        15476 non-null  int64  
 1   title                     15476 non-null  object 
 2   description               15476 non-null  object 
 3   product_type              15476 non-null  object 
 4   min_variant_price_amount  15476 non-null  float64
 5   vendor                    15347 non-null  object 
 6   text                      15476 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 846.5+ KB


None


Describe


Unnamed: 0,id,min_variant_price_amount
count,15476.0,15476.0
mean,7894514000000.0,20907.526102
std,272115200000.0,22626.495126
min,7360670000000.0,100.0
25%,7731253000000.0,6883.75
50%,7805606000000.0,14909.5
75%,8126922000000.0,26500.0
max,8449779000000.0,300000.0



Count


Unnamed: 0,0
id,15476
title,15476
description,15476
product_type,15476
min_variant_price_amount,15476
vendor,15347
text,15476


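## Compute similarity

Each product is represented by the TF-IDF features of its combined text plus its min-max-scaled price; products are then compared pairwise with cosine similarity.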

In [7]:
# TF-IDF representation
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
text_features = vectorizer.fit_transform(df["text"])

# Normalize price
scaler = MinMaxScaler()
price_features = scaler.fit_transform(df[["min_variant_price_amount"]].fillna(0))

# Combine text + price
product_features = sp.hstack([text_features, price_features])

similarity_matrix = cosine_similarity(product_features)


In [8]:
def recommend(product_id, top_n=5, data=None, similarity=None):
    """Return the ``top_n`` products most similar to ``product_id``.

    Parameters
    ----------
    product_id
        Value to look up in the ``id`` column of ``data``.
    top_n : int, default 5
        Maximum number of recommendations. Values below 1 yield an empty frame.
    data : pandas.DataFrame, optional
        Product table. Defaults to the notebook-level ``df``.
    similarity : numpy.ndarray, optional
        2-D array in which row ``i`` holds the similarity of the product at
        row position ``i`` of ``data`` to every product. Defaults to the
        notebook-level ``similarity_matrix``.

    Returns
    -------
    pandas.DataFrame
        The recommended rows, most similar first, restricted to the columns
        ``id``, ``title``, ``product_type``, ``min_variant_price_amount`` and
        ``vendor``. The query product itself is never included.

    Raises
    ------
    IndexError
        If no row of ``data`` has ``id == product_id``.
    """
    if data is None:
        data = df
    if similarity is None:
        similarity = similarity_matrix

    # Work with row *positions* throughout: both the similarity matrix and
    # .iloc are positional, whereas data.index holds labels.
    positions = (data["id"] == product_id).to_numpy().nonzero()[0]
    if positions.size == 0:
        raise IndexError(f"product_id {product_id!r} not found in the 'id' column")
    query_pos = int(positions[0])

    scores = similarity[query_pos]
    # Stable sort on the negated scores gives descending similarity with ties
    # kept in row order.
    ranked = (-scores).argsort(kind="stable")
    # Drop the query row explicitly. Skipping "the first hit" is wrong when
    # another product ties with the query's self-similarity and sorts ahead
    # of it — the query would then be recommended to itself.
    ranked = ranked[ranked != query_pos][: max(int(top_n), 0)]

    return data.iloc[ranked][["id", "title", "product_type", "min_variant_price_amount", "vendor"]]


In [9]:
# Example query. Any value from df["id"] works here, e.g. df["id"].iloc[12].
query_product_id = 7528416870550
print(f"Query product ID: {query_product_id}")
# display() renders the DataFrame as a single rich table; print() wrapped the
# columns across two plain-text blocks.
display(recommend(product_id=query_product_id, top_n=5))


Query product ID: 7528416870550
                  id                                          title  \
13693  7527894614166  Teak Wood Arm Chair (Set Of 2) In Teal Colour   
8013   7660481413270                 RFP 262 Bedroom Chair Set of 2   
10052  7528423850134                   Armchair In Honey Oak Finish   
13778  7831664099478              RFPM 767 Bedroom Chair - Set of 2   
14549  7700879474838              RFPM 026 Bedroom Chair - set of 2   

        product_type  min_variant_price_amount     vendor  
13693  Bedroom Chair                   23502.0  Vendor 67  
8013   Bedroom Chair                   34000.0   Vendor 2  
10052  Bedroom Chair                    9426.0  Vendor 67  
13778  Bedroom Chair                   25650.0   Vendor 2  
14549  Bedroom Chair                   16000.0   Vendor 2  
