# Problem 3 - Use TF-IDF and cosine similarity to find similar product descriptions in an e-commerce dataset.


In [None]:
import json

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Location of the dataset; kept in one place so the peek and the load below
# cannot drift apart.
DATA_PATH = '/content/flipkart_fashion_products_dataset.json'

# Peek at the first few raw lines to confirm the file layout before parsing.
# The encoding is explicit so decoding does not depend on the platform default.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    for _ in range(5):
        print(f.readline())

# Parse the whole document in one go (the file is a single JSON array of records).
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Now convert to DataFrame
df = pd.DataFrame(data)

# Preview
df.head()


[

    {

        "_id": "fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a", 

        "actual_price": "2,999", 

        "average_rating": "3.9", 



Unnamed: 0,_id,actual_price,average_rating,brand,category,crawled_at,description,discount,images,out_of_stock,pid,product_details,seller,selling_price,sub_category,title,url
0,fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a,2999,3.9,York,Clothing and Accessories,"02/10/2021, 20:11:51",Yorker trackpants made from 100% rich combed c...,69% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EA7H5FYZH,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises,921,Bottomwear,Solid Men Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
1,893e6980-f2a0-531f-b056-34dd63fe912c,1499,3.9,York,Clothing and Accessories,"02/10/2021, 20:11:52",Yorker trackpants made from 100% rich combed c...,66% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EJZV2UVRZ,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises,499,Bottomwear,Solid Men Blue Track Pants,https://www.flipkart.com/yorker-solid-men-blue...
2,eb4c8eab-8206-59d0-bcd1-a724d96bf74f,2999,3.9,York,Clothing and Accessories,"02/10/2021, 20:11:52",Yorker trackpants made from 100% rich combed c...,68% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EHFCY5Z4Y,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises,931,Bottomwear,Solid Men Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
3,3f3f97bb-5faf-57df-a9ff-1af24e2b1045,2999,3.9,York,Clothing and Accessories,"02/10/2021, 20:11:53",Yorker trackpants made from 100% rich combed c...,69% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9ESZZ7YWEF,"[{'Style Code': '1005COMBO3'}, {'Closure': 'El...",Shyam Enterprises,911,Bottomwear,Solid Men Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
4,750caa3d-6264-53ca-8ce1-94118a1d8951,2999,3.9,York,Clothing and Accessories,"02/10/2021, 20:11:53",Yorker trackpants made from 100% rich combed c...,68% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EVXKBSUD7,"[{'Style Code': '1005COMBO1'}, {'Closure': 'Dr...",Shyam Enterprises,943,Bottomwear,"Solid Men Brown, Grey Track Pants",https://www.flipkart.com/yorker-solid-men-brow...


In [None]:
# Normalise the text column before vectorising: missing descriptions become
# empty strings, every value is coerced to str, and the text is lowercased.
# The astype(str) matters because .str.lower() yields NaN for non-string
# entries (e.g. a numeric description), which the vectorizer cannot tokenize.
df['description'] = df['description'].fillna('').astype(str).str.lower()

In [None]:
# Build the TF-IDF vectorizer, dropping scikit-learn's built-in English stop
# words so common filler words do not contribute to the vocabulary.
tfidf = TfidfVectorizer(stop_words = 'english')
# Learn the vocabulary from the descriptions and encode them in one pass.
# Row i corresponds to df row i; columns line up with tfidf.get_feature_names_out().
tfidf_matrix = tfidf.fit_transform(df['description'])

In [None]:
# Pairwise cosine similarity between every pair of description vectors.
# Passing only tfidf_matrix compares its rows against themselves, so
# cosine_sim[i, j] is the similarity between products i and j.
# NOTE: the result has one row and one column per product, so its size grows
# quadratically with the number of products.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
def get_similar_products(product_index, top_n=5):
    """Return the products whose descriptions are most similar to one product.

    Reads the notebook-level ``cosine_sim`` similarity matrix and the ``df``
    product frame.

    Args:
        product_index: Row position of the reference product. Negative values
            count from the end, as with normal sequence indexing.
        top_n: Maximum number of neighbours to return. Values <= 0 yield an
            empty frame.

    Returns:
        DataFrame holding the ``title``, ``description`` and ``url`` columns of
        the matches, ordered from most to least similar. Products with equal
        scores keep their row order.

    Raises:
        IndexError: If ``product_index`` is outside the similarity matrix.
    """
    n_products = len(cosine_sim)
    if not -n_products <= product_index < n_products:
        raise IndexError(
            f"product_index {product_index} is out of range for {n_products} products"
        )
    position = product_index % n_products  # normalise negative indices

    scores = cosine_sim[position]
    # Drop the query product by position rather than assuming it sorts first:
    # duplicate descriptions tie with it at the top score, and an empty
    # description (or a zeroed diagonal) gives it a self-similarity of 0.
    candidates = [i for i in range(len(scores)) if i != position]
    # list.sort is stable, so equally similar products stay in row order.
    candidates.sort(key=lambda i: scores[i], reverse=True)
    top_indices = candidates[:max(top_n, 0)]
    return df.iloc[top_indices][['title', 'description', 'url']]

# Demo: print the reference product (row 0), then its closest matches by description.
print("🔹 Original Product:\n")
print(df[['title', 'description', 'url']].iloc[0])
print("\n🔸 Top 5 Similar Products:\n")
print(get_similar_products(0, top_n=5))

🔹 Original Product:

title                           Solid Men Multicolor Track Pants
description    yorker trackpants made from 100% rich combed c...
url            https://www.flipkart.com/yorker-solid-men-mult...
Name: 0, dtype: object

🔸 Top 5 Similar Products:

                               title  \
1         Solid Men Blue Track Pants   
2   Solid Men Multicolor Track Pants   
3   Solid Men Multicolor Track Pants   
4  Solid Men Brown, Grey Track Pants   
5   Solid Men Multicolor Track Pants   

                                         description  \
1  yorker trackpants made from 100% rich combed c...   
2  yorker trackpants made from 100% rich combed c...   
3  yorker trackpants made from 100% rich combed c...   
4  yorker trackpants made from 100% rich combed c...   
5  yorker trackpants made from 100% rich combed c...   

                                                 url  
1  https://www.flipkart.com/yorker-solid-men-blue...  
2  https://www.flipkart.com/yorker-solid-men-

In [None]:
# Vocabulary learned by the vectorizer (one term per matrix column)
feature_names = tfidf.get_feature_names_out()

# Dense 1-D vector of the first product's TF-IDF weights
row = tfidf_matrix[0].toarray()[0]

# Pair each term with its weight, keep only the terms that occur in this
# description, and rank them from heaviest to lightest
tfidf_df = (
    pd.DataFrame({'term': feature_names, 'tfidf': row})
    .loc[lambda frame: frame['tfidf'] > 0]
    .sort_values(by='tfidf', ascending=False)
)

print("🔍 Top TF-IDF words for product index 0:")
print(tfidf_df.head(10))


🔍 Top TF-IDF words for product index 0:
            term     tfidf
4412        rich  0.425922
6008      yorker  0.324557
2828        itch  0.286673
5465  trackpants  0.280705
2331      giving  0.235939
5998        year  0.225927
2237    friendly  0.222559
4109     proudly  0.220056
1130      combed  0.211760
5777   waistband  0.203589


In [None]:
# Show the five highest-weighted terms for each of the first three products
for i in range(3):  # First 3 products
    # Dense weight vector for product i, aligned with feature_names
    row = tfidf_matrix[i].toarray()[0]
    tfidf_df = pd.DataFrame({'term': feature_names, 'tfidf': row})
    # Keep terms present in this description, heaviest first, and cut to five
    top_words = (
        tfidf_df[tfidf_df['tfidf'] > 0]
        .sort_values(by='tfidf', ascending=False)
        .iloc[:5]
    )
    print(f"\n🔸 Product {i} Title: {df.iloc[i]['title']}")
    print("Top words:", ', '.join(top_words['term']))



🔸 Product 0 Title: Solid Men Multicolor Track Pants
Top words: rich, yorker, itch, trackpants, giving

🔸 Product 1 Title: Solid Men Blue Track Pants
Top words: rich, yorker, itch, trackpants, giving

🔸 Product 2 Title: Solid Men Multicolor Track Pants
Top words: rich, yorker, itch, trackpants, giving


In [None]:
def top_similar_pairs(similarity, top_k=5):
    """Return the ``top_k`` most similar (i, j, score) pairs with i < j.

    Scans the upper triangle of ``similarity`` one row at a time, so memory
    stays proportional to ``n * top_k`` instead of materialising all
    n * (n - 1) / 2 pairs as Python tuples. The matrix is not modified.

    Args:
        similarity: Square 2-D NumPy array of pairwise similarity scores.
        top_k: Number of pairs to return; values <= 0 give an empty list.

    Returns:
        List of ``(i, j, score)`` tuples sorted by descending score. Ties are
        ordered by ascending ``i`` and then ascending ``j``, the same order a
        stable sort over every (i, j) pair would produce.
    """
    n_items = similarity.shape[0]
    if top_k <= 0 or n_items < 2:
        return []

    candidates = []
    for i in range(n_items - 1):
        # Columns right of the diagonal only: this skips self-similarity and
        # the mirrored (j, i) duplicates without touching the matrix.
        row_scores = similarity[i, i + 1:]
        # Stable sort on the negated scores = descending score, ties by column.
        best = np.argsort(-row_scores, kind='stable')[:top_k]
        # A pair outside its own row's top_k cannot be in the global top_k.
        candidates.extend(
            (i, i + 1 + int(offset), row_scores[offset]) for offset in best
        )

    # Stable sort keeps the (i, j) order among equal scores.
    candidates.sort(key=lambda pair: pair[2], reverse=True)
    return candidates[:top_k]


# Get the top 5 pairs (cosine_sim is left untouched for the other cells)
similar_pairs = top_similar_pairs(cosine_sim, top_k=5)

print("\n🔗 Top 5 Most Similar Product Pairs:")
for i, j, score in similar_pairs:
    print(f"\nProduct {i}: {df.iloc[i]['title']}")
    print(f"Product {j}: {df.iloc[j]['title']}")
    print(f"Similarity: {score:.4f}")
