# Simple Amazon Product Recommendation System

This notebook implements a simple recommendation system using TF-IDF and cosine similarity to find similar products.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from IPython.display import HTML, display
import pandas as pd

pd.set_option('display.max_colwidth', None)

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nouha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nouha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nouha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nouha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 1. Load and Prepare Data

In [2]:
# Load the cleaned dataset
df = pd.read_csv('../data/cleaned_amazon_data.csv')
print("Dataset shape:", df.shape)
# df.head()

Dataset shape: (1465, 21)


## 2. Text Preprocessing Functions

In [3]:
# def preprocess_text(text):
#     """
#     Preprocess text data for better recommendation results
#     """
#     if not isinstance(text, str):
#         return ''
    
#     # Convert to lowercase
#     text = text.lower()
    
#     # Remove URLs
#     text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
#     # Remove HTML tags
#     text = re.sub(r'<.*?>', '', text)
    
#     # Remove special characters and numbers
#     text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
#     # Remove extra whitespace
#     text = ' '.join(text.split())
    
#     # Tokenization
#     tokens = word_tokenize(text)
    
#     # Initialize lemmatizer
#     lemmatizer = WordNetLemmatizer()
    
#     # Get stopwords
#     stop_words = set(stopwords.words('english'))
    
#     # Custom words to remove (domain-specific)
#     custom_stops = {'amazon', 'product', 'item', 'buy', 'price', 'review'}
#     stop_words.update(custom_stops)
    
#     # Process tokens
#     processed_tokens = []
#     for token in tokens:
#         if token not in stop_words and len(token) > 2:
#             lemmatized = lemmatizer.lemmatize(token)
#             processed_tokens.append(lemmatized)
    
#     return ' '.join(processed_tokens)

# def clean_text_data(df):
#     """
#     Clean and combine text columns for recommendation
#     """
#     # Create copies of text columns to avoid modifying original data
#     df['clean_name'] = df['product_name'].fillna('').apply(preprocess_text)
#     df['clean_about'] = df['about_product'].fillna('').apply(preprocess_text)
#     df['clean_category'] = df['category'].fillna('').apply(lambda x: ' '.join(x.split('|')).lower())
    
#     # Combine reviews into a single string per product
#     df['clean_reviews'] = df['review_content'].fillna('').apply(lambda x: ' '.join(x.split(',')[:5]))  # Take first 5 reviews
#     df['clean_reviews'] = df['clean_reviews'].apply(preprocess_text)
    
#     # Combine all cleaned text columns with different weights
#     df['combined_features'] = (
#         df['clean_name'] + ' ' +                  # Product name (1x weight)
#         df['clean_name'] + ' ' +                  # Repeat name for higher weight (2x)
#         df['clean_category'] + ' ' +              # Category
#         df['clean_about'] + ' ' +                 # Product description
#         df['clean_about'] + ' ' +                 # Repeat description (2x)
#         df['clean_reviews']                       # Reviews
#     )
    
#     return df

# # Clean and prepare the text data
# df = clean_text_data(df)

# # Display a sample of the cleaned and combined text
# print("\nSample of combined features for first product:")
# print(df['combined_features'].iloc[0][:200], "...")

## 3. Create TF-IDF Matrix and Similarity Matrix

In [4]:
# Create feature matrix using TF-IDF
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2)
)

# Fit and transform the combined features
tfidf_matrix = tfidf.fit_transform(df['combined_features'])
print(f"\nTF-IDF matrix shape: {tfidf_matrix.shape}")

# Calculate similarity matrix
similarity_matrix = cosine_similarity(tfidf_matrix)
print(f"Similarity matrix shape: {similarity_matrix.shape}")


TF-IDF matrix shape: (1465, 5000)
Similarity matrix shape: (1465, 1465)


## 4. Recommendation Function

In [5]:
def get_recommendations(product_idx, similarity_matrix, df, n_recommendations=5):
    """
    Get product recommendations based on similarity scores
    """
    # Get similarity scores for the product
    sim_scores = list(enumerate(similarity_matrix[product_idx]))
    
    # Sort products based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N most similar products (excluding itself)
    sim_scores = sim_scores[1:n_recommendations+1]
    
    # Get product indices and scores
    product_indices = [i[0] for i in sim_scores]
    similarity_scores = [i[1] for i in sim_scores]
    
    # Create recommendations dataframe
    recommendations = pd.DataFrame({
        'Product Name': df['product_name'].iloc[product_indices],
        'Category': df['main_category'].iloc[product_indices],
        'Similarity Score': similarity_scores,
        'Price': df['discounted_price'].iloc[product_indices],
        'Rating': df['rating'].iloc[product_indices]
    })
    
    return recommendations

## 5. Test the Recommendation System

In [6]:
# Function to get recommendations by product name
def get_recommendations_by_name(product_name, df, similarity_matrix):
    # Find the product index
    idx = df[df['product_name'].str.contains(product_name, case=False)].index
    
    if len(idx) == 0:
        return print(f"No product found with name containing '{product_name}'")
    
    # Get the first matching product
    idx = idx[0]
    
    # Print the selected product details
    print("Selected Product:")
    print(f"Name: {df['product_name'].iloc[idx]}")
    print(f"Category: {df['main_category'].iloc[idx]}")
    print(f"Price: ${df['discounted_price'].iloc[idx]}")
    print(f"Rating: {df['rating'].iloc[idx]}\n")
    
    # Get recommendations
    return get_recommendations(idx, similarity_matrix, df)

# # Test with a sample product
# sample_product_idx = 0
# print("Recommendations for:", df['product_name'].iloc[sample_product_idx])
# recommendations = get_recommendations(sample_product_idx, similarity_matrix, df)
# print("\nTop 5 recommendations:")
# print(recommendations)

# # Test with a product name search
# print("\nTrying to find recommendations for a product containing 'USB':")
# recommendations = get_recommendations_by_name('USB', df, similarity_matrix)
# print(recommendations)

In [7]:
def display_recommendations(product_name, recommendations):
    """
    Create a beautiful HTML display for product recommendations
    """
    html_content = f"""
    <div style="background-color: #f8f9fa; padding: 20px; border-radius: 10px; margin: 10px;">
        <h3 style="color: #2c3e50; margin-bottom: 20px;">🔍 Recommendations for:</h3>
        <div style="background-color: white; padding: 15px; border-radius: 5px; margin-bottom: 20px;">
            <p style="color: #34495e; font-size: 16px;">{product_name}</p>
        </div>
        
        <h4 style="color: #2c3e50; margin-top: 20px;">📦 Top 5 Similar Products:</h4>
        <div style="overflow-x: auto;">
        <table style="width: 100%; border-collapse: collapse; margin-top: 10px;">
            <tr style="background-color: #3498db; color: white;">
                <th style="padding: 12px; text-align: left;">Product Name</th>
                <th style="padding: 12px; text-align: center;">Category</th>
                <th style="padding: 12px; text-align: center;">Price</th>
                <th style="padding: 12px; text-align: center;">Rating</th>
                <th style="padding: 12px; text-align: center;">Similarity</th>
            </tr>
    """
    
    for _, row in recommendations.iterrows():
        html_content += f"""
            <tr style="border-bottom: 1px solid #ddd; background-color: white;">
                <td style="padding: 12px; text-align: left;">{row['Product Name'][:100]}...</td>
                <td style="padding: 12px; text-align: center;">{row['Category']}</td>
                <td style="padding: 12px; text-align: center;">₹{row['Price']:.2f}</td>
                <td style="padding: 12px; text-align: center;">{'⭐' * int(row['Rating'])}</td>
                <td style="padding: 12px; text-align: center;">{row['Similarity Score']:.2%}</td>
            </tr>
        """
    
    html_content += """
        </table>
        </div>
    </div>
    """
    
    display(HTML(html_content))

def get_recommendations_by_name(product_name, df, similarity_matrix):
    """
    Get recommendations by product name with beautiful display
    """
    # Find the product index
    idx = df[df['product_name'].str.contains(product_name, case=False)].index
    
    if len(idx) == 0:
        return print(f"❌ No product found with name containing '{product_name}'")
    
    # Get the first matching product
    idx = idx[0]
    product_details = df.iloc[idx]
    
    # Get recommendations
    recommendations = get_recommendations(idx, similarity_matrix, df)
    
    # Display product details and recommendations
    display_recommendations(product_details['product_name'], recommendations)

# Test with a sample product
sample_product_idx = 0
sample_product = df['product_name'].iloc[sample_product_idx]
recommendations = get_recommendations(sample_product_idx, similarity_matrix, df)
display_recommendations(sample_product, recommendations)

# Test with a product name search
print("\nSearching for products containing 'USB':")
get_recommendations_by_name('USB', df, similarity_matrix)

Product Name,Category,Price,Rating,Similarity
"Wayona Nylon Braided USB to Lightning Fast Charging and Data Sync Cable Compatible for iPhone 13, 12...",Computers&Accessories,₹399.00,⭐⭐⭐⭐,100.00%
"Wayona Nylon Braided USB to Lightning Fast Charging and Data Sync Cable Compatible for iPhone 13, 12...",Computers&Accessories,₹399.00,⭐⭐⭐⭐,100.00%
"Wayona Nylon Braided Usb Syncing And Charging Cable Sync And Charging Cable For Iphone, Ipad (3 Ft, ...",Computers&Accessories,₹649.00,⭐⭐⭐⭐,95.81%
"Wayona Nylon Braided 3A Lightning to USB A Syncing and Fast Charging Data Cable for iPhone, Ipad (3 ...",Computers&Accessories,₹399.00,⭐⭐⭐⭐,91.41%
"Wayona Nylon Braided (2 Pack) Lightning Fast Usb Data Cable Fast Charger Cord For Iphone, Ipad Table...",Computers&Accessories,₹649.00,⭐⭐⭐⭐,91.07%



Searching for products containing 'USB':


Product Name,Category,Price,Rating,Similarity
"Wayona Nylon Braided USB to Lightning Fast Charging and Data Sync Cable Compatible for iPhone 13, 12...",Computers&Accessories,₹399.00,⭐⭐⭐⭐,100.00%
"Wayona Nylon Braided USB to Lightning Fast Charging and Data Sync Cable Compatible for iPhone 13, 12...",Computers&Accessories,₹399.00,⭐⭐⭐⭐,100.00%
"Wayona Nylon Braided Usb Syncing And Charging Cable Sync And Charging Cable For Iphone, Ipad (3 Ft, ...",Computers&Accessories,₹649.00,⭐⭐⭐⭐,95.81%
"Wayona Nylon Braided 3A Lightning to USB A Syncing and Fast Charging Data Cable for iPhone, Ipad (3 ...",Computers&Accessories,₹399.00,⭐⭐⭐⭐,91.41%
"Wayona Nylon Braided (2 Pack) Lightning Fast Usb Data Cable Fast Charger Cord For Iphone, Ipad Table...",Computers&Accessories,₹649.00,⭐⭐⭐⭐,91.07%
