In [2]:
!pip install openpyxl



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score
import re

<pre>
Preprocess Data: Extract product names and transform them into numerical representations using TF-IDF (Term Frequency-Inverse Document Frequency).
Compute Similarities: Use cosine similarity to measure how similar products are based on their names.
Recommend Products: Given a product, find the most similar products.
    
</pre>

In [4]:
# Load the product listing from the workbook and preview the first rows.
DATA_FILE = 'Air Conditioners.xlsx'
df = pd.read_excel(DATA_FILE)
df.head()

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/31UISB90sY...,https://www.amazon.in/Lloyd-Inverter-Convertib...,4.2,2255,"â‚¹32,999","â‚¹58,990"
1,LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctD...,https://www.amazon.in/LG-Convertible-Anti-Viru...,4.2,2948,"â‚¹46,490","â‚¹75,990"
2,LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctD...,https://www.amazon.in/LG-Inverter-Convertible-...,4.2,1206,"â‚¹34,490","â‚¹61,990"
3,LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctD...,https://www.amazon.in/LG-Convertible-Anti-Viru...,4.0,69,"â‚¹37,990","â‚¹68,990"
4,Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/41lrtqXPiW...,https://www.amazon.in/Carrier-Inverter-Split-C...,4.1,630,"â‚¹34,490","â‚¹67,790"


In [5]:
# Content-Based Filtering using TF-IDF
# Each product name is turned into one TF-IDF row; English stop words are ignored.
product_names = df['name']
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(product_names)
vectorizer

In [6]:
# Display the sparse TF-IDF matrix summary (dtype, stored elements, shape).
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9008 stored elements and shape (720, 1188)>

In [7]:
# Pairwise cosine similarity between the TF-IDF rows of all products.
# Passing a single matrix compares its rows against each other.
cosine_sim = cosine_similarity(tfidf_matrix)

In [8]:
# Preprocessing Prices
def clean_price(price):
    """Convert a raw price value into a float.

    Parameters
    ----------
    price : str, number or None
        A price as read from the sheet. Strings may carry a currency symbol
        and thousands separators (e.g. "32,999" or "1,299.50").

    Returns
    -------
    float
        The numeric price, or ``np.nan`` when the value is ``None`` or a
        string that contains no number.
    """
    if isinstance(price, str):
        # Take the first numeric token and keep its decimal part, so that
        # "1,299.50" becomes 1299.5 instead of 129950.
        match = re.search(r'\d[\d,]*(?:\.\d+)?', price)
        if match is None:
            return np.nan
        return float(match.group().replace(',', ''))
    if price is None:
        return np.nan
    # Numeric input (including 0 and NaN) passes straight through.
    return float(price)

# Normalise both price columns to floats.
for price_column in ('discount_price', 'actual_price'):
    df[price_column] = df[price_column].apply(clean_price)


In [9]:
def recommend_products(product_name, top_n=5, data=None, similarity=None):
    """Return the products most similar to ``product_name``.

    Parameters
    ----------
    product_name : str
        Exact value of the ``name`` column identifying the query product.
        If several rows share the name, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return.
    data : pandas.DataFrame, optional
        Catalog to search; defaults to the notebook-level ``df``.
    similarity : 2-D array, optional
        Pairwise similarity matrix; defaults to the notebook-level
        ``cosine_sim``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``ratings`` and ``discount_price`` of the
        recommended products, most similar first.

    Raises
    ------
    IndexError
        If no row has the given name.

    Notes
    -----
    Assumes the catalog's index labels are the row positions of the
    similarity matrix (true for the default integer index of the frame the
    matrix was built from). Rows later removed from the catalog are skipped
    rather than shifting the lookup onto the wrong products.
    """
    catalog = df if data is None else data
    sim_matrix = cosine_sim if similarity is None else similarity

    matches = catalog.index[(catalog['name'] == product_name).to_numpy()]
    if len(matches) == 0:
        raise IndexError(f"Product not found in catalog: {product_name!r}")
    idx = matches[0]

    scores = sim_matrix[idx]
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)

    # Exclude the query by identity instead of slicing off the first hit:
    # an identical listing at a lower position would otherwise outrank it
    # and the query would end up recommending itself.
    available = set(catalog.index)
    product_indices = [
        pos for pos in ranked if pos != idx and pos in available
    ][:top_n]

    # Label-based lookup keeps rows aligned even if the catalog was filtered.
    return catalog.loc[product_indices, ['name', 'ratings', 'discount_price']]

In [10]:
# Demo: recommend products similar to the first product in the dataset.
example_product = df['name'].iat[0]
recommendations = recommend_products(example_product)
print("Recommended Products:", recommendations, sep="\n")

Recommended Products:
                                                 name ratings  discount_price
6   Lloyd 1.0 Ton 3 Star Inverter Split Ac (5 In 1...     4.2         29999.0
7   Lloyd 1.5 Ton 5 Star Inverter Split Ac (5 In 1...     4.3         39990.0
16  Lloyd 1.0 Ton 5 Star Inverter Split Ac (5 In 1...     4.1         34000.0
59  Lloyd 2.0 Ton 5 Star Inverter Split Ac (5 In 1...       4         52090.0
69  Lloyd 2.0 Ton 3 Star Inverter Split Ac (5 In 1...     NaN         46000.0


 Using High-Rated Products as Ground Truth
We assume that products with a high rating (4.0 or above, matching `RATING_THRESHOLD` in the code) are relevant to users.
If a recommended product has a high rating, we count it as a True Positive (TP).

In [12]:
from sklearn.metrics import precision_score, recall_score

# Define threshold for relevant products
RATING_THRESHOLD = 4.0
df['ratings'] = pd.to_numeric(df['ratings'], errors='coerce')  # Convert to float

# recommend_products maps positions in cosine_sim back onto rows of df, so the
# two must always describe the same products in the same order.
assert len(df) == cosine_sim.shape[0], "df and cosine_sim are out of sync; re-run the notebook from the top"

# Remove products whose rating could not be converted, dropping the matching
# rows/columns from the similarity structures at the same time so that
# position i keeps referring to the same product everywhere.
has_rating = df['ratings'].notna().to_numpy()
if not has_rating.all():
    df = df[has_rating].reset_index(drop=True)
    tfidf_matrix = tfidf_matrix[has_rating]
    cosine_sim = cosine_sim[np.ix_(has_rating, has_rating)]

# Simulated ground truth: High-rated products
ground_truth = set(df.loc[df['ratings'] >= RATING_THRESHOLD, 'name'])

def evaluate_recommendations(recommended_products, relevant=None):
    """Score a set of recommendations against a set of relevant product names.

    Parameters
    ----------
    recommended_products : pandas.DataFrame
        Recommendations; only the ``name`` column is read. Duplicate names
        are counted once.
    relevant : iterable of str, optional
        Names considered relevant. Defaults to the notebook-level
        ``ground_truth`` set.

    Returns
    -------
    tuple of (float, float)
        ``(precision, recall)``. Each is ``0.0`` when its denominator is zero.
    """
    relevant_names = set(ground_truth if relevant is None else relevant)
    recommended_set = set(recommended_products['name'])

    TP = len(relevant_names & recommended_set)  # True Positives
    FP = len(recommended_set - relevant_names)  # False Positives
    FN = len(relevant_names - recommended_set)  # False Negatives

    # Use float zeros so both metrics always have the same type.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0

    return precision, recall

# Example evaluation: score the recommendations for the first product in df.
example_product = df['name'].iat[0]
recommended_products = recommend_products(example_product)
precision, recall = evaluate_recommendations(recommended_products)

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")


Precision: 0.80, Recall: 0.02


Precision = 0.80 (80%)
→ 80% of the recommended products are actually relevant (i.e., they meet the criteria for being "correct" recommendations, such as high ratings).

Recall = 0.02 (2%)
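→ Your system is only identifying 2% of all the relevant products that exist in the dataset. This means it is missing a large number of potentially good recommendations. Note that this figure is largely a consequence of the setup: only five products are recommended per query, so recall can never exceed 5 divided by the total number of high-rated products, however good the recommendations are.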