<a href="https://colab.research.google.com/github/nancy-kataria/NexTrade/blob/main/product_matching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

=== Imports ===

In [1]:
import kagglehub
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
from sklearn.preprocessing import normalize

In [2]:
# Show every column and allow wide rows when DataFrames are printed as text.
pd.options.display.max_columns = None
pd.options.display.width = 1000

=== 1. Dataset Download ===

In [3]:
# Download latest version of the Superstore dataset and build the path to its CSV file
print("Dowlaod Dataset...")
path = kagglehub.dataset_download("vivek468/superstore-dataset-final")
csv_file_path = os.path.join(path, "Sample - Superstore.csv")
print(f"Dataset downloaded to: {path}", f"Reading data from: {csv_file_path}", sep="\n")

Dowlaod Dataset...
Dataset downloaded to: /kaggle/input/superstore-dataset-final
Reading data from: /kaggle/input/superstore-dataset-final/Sample - Superstore.csv


=== 2. Load & Clean Data ===

In [4]:
# Read the Superstore CSV from csv_file_path using the ISO-8859-1 encoding.
try:
    superstore_data = pd.read_csv(csv_file_path, encoding='ISO-8859-1')
except FileNotFoundError:
    print(f"ERROR: File not found at {csv_file_path}.")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel to
    # shut down (discarding all state) rather than surfacing the failure, while a
    # raised exception stops "Run All" at this cell with a clear traceback.
    raise
else:
    print("Data loaded successfully.")

Data loaded successfully.


In [5]:
# Keep necessary columns
columns_to_keep = ['Order ID', 'Order Date', 'Ship Date', 'Customer ID', 'Product ID', 'Product Name', 'Sales', 'Quantity', 'Category', 'Sub-Category']
# Take an explicit copy: later cells assign new/converted columns into this frame,
# and assigning into a column-subset selection can trigger SettingWithCopyWarning.
superstore_data = superstore_data[columns_to_keep].copy()

In [None]:
# Sanity check: show the first 5 rows of the trimmed frame
print("First 5 rows of data:", superstore_data.head(), sep="\n")

In [6]:
# Parse both date columns into datetime64 so they can be sorted chronologically
for date_col in ('Order Date', 'Ship Date'):
    superstore_data[date_col] = pd.to_datetime(superstore_data[date_col])

In [None]:
# Count missing values per required column to see whether dropna() is even needed
print(superstore_data.loc[:, columns_to_keep].isna().sum())

In [7]:
# Discard any row that lacks a value in one of the required columns
superstore_data = superstore_data.dropna(subset=columns_to_keep)

In [8]:
print("\n--- Finding Customers with Most Transactions ---")

# Every row is one transaction line item, so the group size is the number of
# line items recorded for that customer.
customer_transaction_counts = superstore_data.groupby('Customer ID').size()

# Largest counts first
customer_transaction_counts_sorted = customer_transaction_counts.sort_values(ascending=False)

print("Top 5 Customers by Number of Transaction Entries:")
print(customer_transaction_counts_sorted.head(5))

# The first entry of the sorted series is the customer with the highest count
if customer_transaction_counts_sorted.empty:
    top_customer_id = None  # no data, so there is no top customer
    print("\nCould not determine top customer.")
else:
    top_customer_id = customer_transaction_counts_sorted.index[0]
    top_customer_count = customer_transaction_counts_sorted.iloc[0]
    print(f"\nCustomer with the most transaction entries: '{top_customer_id}' ({top_customer_count} entries)")



--- Finding Customers with Most Transactions ---
Top 5 Customers by Number of Transaction Entries:
Customer ID
WB-21850    37
MA-17560    34
JL-15835    34
PP-18955    34
EH-13765    32
dtype: int64

Customer with the most transaction entries: 'WB-21850' (37 entries)


=== (TEST STAGE) 2b. Evaluation Split (Time-Based) ===

In [15]:
print("\n--- Splitting Data by User (Time-Based) ---")

train_records, test_records = [], []

# Split each customer's own history chronologically: the earliest 80% of their
# rows go to train, the remainder to test. The cut is per customer, so the
# overall train and test date ranges can overlap.
user_groups = superstore_data.groupby('Customer ID')

for _, history in user_groups:
    # A single purchase cannot be divided into a train part and a test part
    if len(history) < 2:
        continue

    chronological = history.sort_values('Order Date')
    cutoff = int(len(chronological) * 0.8)

    train_records.append(chronological.iloc[:cutoff])
    test_records.append(chronological.iloc[cutoff:])

# Stack the per-customer pieces into two flat DataFrames
train_df = pd.concat(train_records).reset_index(drop=True)
test_df = pd.concat(test_records).reset_index(drop=True)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Train period: {train_df['Order Date'].min()} → {train_df['Order Date'].max()}")
print(f"Test period: {test_df['Order Date'].min()} → {test_df['Order Date'].max()}")

# Ground truth for evaluation: customer -> set of product IDs in their test rows
test_user_items = defaultdict(set)
for customer, product in zip(test_df['Customer ID'], test_df['Product ID']):
    test_user_items[customer].add(product)

test_users = list(test_user_items.keys())
print(f"Number of test users: {len(test_users)}")


--- Splitting Data by User (Time-Based) ---
Train shape: (7688, 11)
Test shape: (2301, 11)
Train period: 2014-01-03 00:00:00 → 2017-12-29 00:00:00
Test period: 2014-10-10 00:00:00 → 2017-12-30 00:00:00
Number of test users: 788


=== 3. Precomputation  ===

In [19]:
# 1. Product Popularity: one row per product with its first-seen descriptive
#    fields and the total quantity / sales over all transactions
product_popularity = superstore_data.groupby('Product ID', as_index=False).agg({
    'Product Name': 'first',
    'Category': 'first',
    'Sub-Category': 'first',
    'Quantity': 'sum',
    'Sales': 'sum'
})

# Scale quantity into (0, 1] relative to the best-selling product
product_popularity['popularity_score'] = product_popularity['Quantity'].div(
    product_popularity['Quantity'].max()
)

# 2. Content-Based Info Preparation: "<name> <category> <sub-category>" text per row
superstore_data['product_info'] = superstore_data['Product Name'].astype(str).str.cat(
    [
        superstore_data['Category'].astype(str),
        superstore_data['Sub-Category'].astype(str),
    ],
    sep=' '
)

# One row per product (first occurrence of each Product ID)
products = superstore_data.loc[
    ~superstore_data['Product ID'].duplicated(),
    ['Product ID', 'Product Name', 'Category', 'Sub-Category', 'product_info']
]

# 3. TF-IDF Matrix and Cosine Similarity between product descriptions
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(products['product_info'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 4. Product Index Mapping: Product ID -> row position in `products` / `cosine_sim`
#    (positions 0..n-1 are unique by construction, so no de-duplication is needed)
product_indices = pd.Series(data=range(len(products)), index=products['Product ID'])

# 5. User-Product Matrix and Product Similarity for Collaborative Filtering
# Rows are customers, columns are products, cells hold total purchased quantity
# (0 where the customer never bought the product)
user_product_matrix = superstore_data.pivot_table(
    values='Quantity',
    index='Customer ID',
    columns='Product ID',
    aggfunc='sum'
).fillna(0)

# Item-item similarity: products are similar when the same customers bought them
product_similarity = cosine_similarity(user_product_matrix.T)

# Label both axes with Product ID so scores can be looked up by ID
product_similarity_df = pd.DataFrame(
    product_similarity,
    index=user_product_matrix.columns,
    columns=user_product_matrix.columns
)

=== 4. Recommendation Functions ===

In [10]:
# === Helper Functions ===
def get_customer_orders_and_products(customer_id, df):
    """Return one customer's transaction rows and the distinct products in them.

    Args:
        customer_id (str): The ID of the target customer.
        df (pd.DataFrame): Transaction data with at least the 'Customer ID'
                           and 'Product ID' columns.

    Returns:
        tuple[pd.DataFrame, np.ndarray]:
            - order_history: an independent copy of the rows of ``df`` whose
              'Customer ID' equals ``customer_id`` (empty if there are none).
            - product_ids: the unique 'Product ID' values of those rows
              (empty array if there are none).
    """
    belongs_to_customer = df['Customer ID'] == customer_id
    # Copy so that callers can modify the history without touching ``df``
    order_history = df.loc[belongs_to_customer].copy()
    return order_history, order_history['Product ID'].unique()

def get_unseen_products(customer_id, df, product_df):
    """
    Filter a product table down to the products a customer has not purchased.

    Args:
      customer_id (str): ID of the target customer.
      df (pd.DataFrame): Full transaction data (e.g., superstore_data)
                           used to find customer history.
      product_df (pd.DataFrame): DataFrame of all products to recommend
                                   from (e.g., product_popularity); must have
                                   a 'Product ID' column.

    Returns:
      tuple[pd.DataFrame, np.ndarray]:
        - the rows of product_df whose 'Product ID' the customer never bought
        - the unique Product IDs the customer did buy (for fallback logic)
    """
    purchased = get_customer_orders_and_products(customer_id, df)[1]
    already_bought = product_df['Product ID'].isin(purchased)
    return product_df[~already_bought], purchased

def add_fallback_if_needed(recommendations, product_ids, product_df, n, by):
    """
    Top up a recommendation list with globally popular products when it is short.

    When ``recommendations`` already holds at least ``n`` rows it is returned
    unchanged. Otherwise the top-``n`` globally popular products (ranked by
    ``by``), minus those the customer already purchased, are appended and
    duplicate Product IDs are removed (first occurrence kept).

    Args:
      recommendations: filtered list of unseen, ranked products
      product_ids: already purchased product IDs to keep out of the fallback
      product_df: global product list (e.g., product_popularity); currently
                  not used by this function
      n: number of products we want to recommend
      by: popularity metric ('Quantity' or 'Sales')

    Returns:
     pd.DataFrame: the (possibly extended) recommendations; it is not truncated
                   to n rows here
    """
    # Guard clause: nothing to do when there are already enough candidates
    if len(recommendations) >= n:
        return recommendations

    print(f"Customer has only {len(recommendations)} new products available. Showing global popular items instead.")
    popular = get_global_popular_products(top_n=n, by=by)
    not_yet_bought = popular[~popular['Product ID'].isin(product_ids)]
    return pd.concat([recommendations, not_yet_bought]).drop_duplicates('Product ID')

def get_customer_preferences(customer_id, df):
    """
    Rank the categories and sub-categories a customer buys from most often.

    Frequency is the number of the customer's transaction rows falling in each
    category / sub-category. Used for personalized popularity recommendations.

    Args:
        customer_id (str): The ID of the target customer.
        df (pd.DataFrame): Transaction data including the 'Customer ID',
                           'Category', and 'Sub-Category' columns.

    Returns:
        tuple[list[str], list[str]]:
            - category names, most frequent first
            - sub-category names, most frequent first
            Both lists are empty when the customer has no rows in df.
    """
    order_history, _ = get_customer_orders_and_products(customer_id, df)
    if order_history.empty:
        return [], []

    # value_counts() already orders labels by descending frequency
    return tuple(
        order_history[column].value_counts().index.tolist()
        for column in ('Category', 'Sub-Category')
    )

In [11]:
# === Calculation  Functions ===
def get_global_popular_products(top_n=5, by='Quantity'):
    """
    Return the top-N products of the global ``product_popularity`` table.

    Products are ranked in descending order of the chosen metric. Customer
    history is not considered.

    Args:
        top_n (int, optional): The number of products to return. Defaults to 5.
        by (str, optional): The metric to rank by ('Quantity' or 'Sales').
                            Defaults to 'Quantity'.

    Returns:
        pd.DataFrame: The top_n rows with columns
                      ['Product ID', 'Product Name', 'Category', 'Sub-Category', <by>].

    Raises:
        ValueError: If 'by' is not 'Quantity' or 'Sales'.
    """
    allowed_metrics = ('Quantity', 'Sales')
    if by not in allowed_metrics:
        raise ValueError("Parameter 'by' must be either 'Quantity' or 'Sales'")

    output_columns = ['Product ID', 'Product Name', 'Category', 'Sub-Category', by]
    ranked = product_popularity.sort_values(by=by, ascending=False)
    return ranked.head(top_n)[output_columns]

def get_content_similar_items(product_id, top_n=5):
    """
    Recommends products similar to a given product based on content.

    Uses the precomputed ``cosine_sim`` matrix (TF-IDF over product name,
    category, and sub-category) together with the ``products`` table and the
    ``product_indices`` lookup.

    Args:
        product_id (str): The ID of the product to find similar items for.
        top_n (int, optional): The number of similar products to return.
                               Defaults to 5.

    Returns:
        pd.DataFrame: Up to top_n rows, most similar first, with columns
                      ['Product ID', 'Product Name', 'Category', 'Sub-Category',
                      'Similarity Score']. When the product is unknown the
                      frame is empty but still carries these columns, so
                      callers can select 'Product ID' without a KeyError.
    """
    info_columns = ['Product ID', 'Product Name', 'Category', 'Sub-Category']
    empty_result = pd.DataFrame(columns=info_columns + ['Similarity Score'])

    if product_id not in product_indices.index:
        print(f"Product ID '{product_id}' not found in product indices.")
        return empty_result

    idx = product_indices[product_id]

    if idx >= cosine_sim.shape[0]:
        print(f"Index {idx} is out of bounds for cosine similarity matrix.")
        return empty_result

    scores = np.asarray(cosine_sim[idx])

    # Descending order; a stable sort on the negated scores keeps tied products
    # in their original order, exactly like sorted(..., reverse=True) would.
    ranked_positions = np.argsort(-scores, kind='stable')

    # Drop the query product itself and any position without a row in `products`
    # BEFORE truncating, so the result is not shortened unnecessarily.
    n_products = len(products)
    chosen = [pos for pos in ranked_positions if pos != idx and pos < n_products][:top_n]

    result = products.iloc[chosen][info_columns].reset_index(drop=True)
    return result.assign(**{'Similarity Score': scores[chosen]})

def get_collaborative_similar_items(product_id, top_n=5):
    """Recommends products similar to a given product using item-item collaborative filtering.

    Looks up the product's column in the precomputed ``product_similarity_df``
    (cosine similarity of customer purchase vectors) and returns the highest
    scoring other products.

    Args:
        product_id (str): The ID of the product to find collaboratively similar items for.
        top_n (int, optional): The number of similar products to return. Defaults to 5.

    Returns:
        pd.DataFrame: Up to top_n rows, most similar first, with columns
                      ['Product ID', 'Similarity Score', 'Product Name',
                      'Category', 'Sub-Category']. Empty (with the same
                      columns) if product_id is not in the similarity matrix.
    """
    output_columns = ['Product ID', 'Similarity Score', 'Product Name', 'Category', 'Sub-Category']

    if product_id not in product_similarity_df.columns:
        print(f"Product {product_id} not found in dataset.")
        return pd.DataFrame(columns=output_columns)

    # Remove the query product by label. Skipping "the first sorted entry" is not
    # safe: another product bought by exactly the same customers also scores 1.0
    # and may sort ahead of the query product, which would then be recommended.
    similar_scores = (
        product_similarity_df[product_id]
        .drop(labels=product_id, errors='ignore')
        .sort_values(ascending=False)
    )

    recommended = similar_scores.head(top_n).reset_index()
    recommended.columns = ['Product ID', 'Similarity Score']
    return recommended.merge(
        product_popularity[['Product ID', 'Product Name', 'Category', 'Sub-Category']],
        on='Product ID', how='left'
    )

In [12]:
# === Main Recommendation Functions ===
def recommend_popular(customer_id=None, top_n=5, by='Quantity'):
    """
    Recommends popular products, optionally personalized for a customer.

    Modes:
    1. Global: If customer_id is None, returns globally popular products.
    2. Unseen for Customer: Returns globally popular products not yet purchased by the customer,
       with fallback if fewer than top_n are found.

    Args:
        customer_id (str, optional): The ID of the customer. Defaults to None.
        top_n (int, optional): The number of products to recommend. Defaults to 5.
        by (str, optional): The metric for popularity ('Quantity' or 'Sales').
                            Defaults to 'Quantity'.

    Returns:
        pd.DataFrame: Top-N recommended products with columns
                      ['Product ID', 'Product Name', 'Category', 'Sub-Category', <by>].

    Raises:
        ValueError: If 'by' is not 'Quantity' or 'Sales'.
    """
    if by not in ['Quantity', 'Sales']:
        raise ValueError("Parameter 'by' must be either 'Quantity' or 'Sales'")

    # Case 1: No customer → return top global products.
    # `by` must be forwarded; otherwise by='Sales' silently ranks by Quantity.
    if customer_id is None:
        print("No customer ID provided. Returning global popular products.")
        return get_global_popular_products(top_n=top_n, by=by)

    # Case 2: Exclude products already purchased, then rank what is left
    unseen_products, product_ids = get_unseen_products(customer_id, superstore_data, product_popularity)
    unseen_products = unseen_products.sort_values(by=by, ascending=False)

    # Apply fallback if fewer than top_n unseen products remain
    final = add_fallback_if_needed(unseen_products, product_ids, product_popularity, top_n, by)

    return final.head(top_n)[['Product ID', 'Product Name', 'Category', 'Sub-Category', by]]

def recommend_content_based(customer_id, top_n=5):
    """
    Recommends products similar to the last item purchased by a customer.

    Finds the customer's most recent purchase (latest 'Order Date') and then
    uses content-based similarity (get_content_similar_items) to find similar
    items the customer has not bought yet.

    Args:
        customer_id (str): The ID of the customer. If None, globally popular
                           products are returned instead.
        top_n (int, optional): The number of similar products to recommend.
                               Defaults to 5.

    Returns:
        pd.DataFrame: Up to top_n rows with columns ['Product ID',
                      'Product Name', 'Category', 'Sub-Category']. Empty (with
                      these columns) when the customer has no purchase history
                      or no similar items can be found. For customer_id=None
                      the frame comes from get_global_popular_products.
    """
    output_columns = ['Product ID', 'Product Name', 'Category', 'Sub-Category']

    # Case 1: No customer → return top global products
    if customer_id is None:
        print("No customer ID provided. Returning global popular products.")
        return get_global_popular_products(top_n=top_n)

    order_history, product_ids = get_customer_orders_and_products(customer_id, superstore_data)
    if order_history.empty:
        print(f"No purchase history for customer '{customer_id}'.")
        return pd.DataFrame(columns=output_columns)

    # Get last product bought
    last_purchase = order_history.sort_values('Order Date', ascending=False).iloc[0]
    last_product_id = last_purchase['Product ID']
    last_product_name = last_purchase['Product Name']
    print(f"Based on last product purchased (ID: {last_product_id}): {last_product_name}")

    # Ask for enough candidates that top_n can survive the purchased-item filter:
    # at most len(product_ids) of them can be removed below.
    similar_items = get_content_similar_items(last_product_id, top_n + len(product_ids))

    # The helper may return a frame with no rows (and possibly no columns) when the
    # product is unknown; selecting 'Product ID' from it would raise KeyError.
    if similar_items.empty or 'Product ID' not in similar_items.columns:
        print(f"No content-based candidates found for customer '{customer_id}'.")
        return pd.DataFrame(columns=output_columns)

    # Exclude already purchased
    similar_items = similar_items[~similar_items['Product ID'].isin(product_ids)]

    return similar_items.head(top_n)[output_columns]

def recommend_collaborative(customer_id, top_n=5):
    """Recommends products to a customer based on collaborative filtering.

    Averages the item-item similarity columns (``product_similarity_df``) of
    every product the customer has purchased, removes the purchased products,
    and returns the highest scoring remaining products. If fewer than top_n
    are found, globally popular products are appended with a score of 0.

    Args:
        customer_id (str): The ID of the customer. If None, globally popular
                           products are returned instead.
        top_n (int, optional): The number of products to recommend. Defaults to 5.

    Returns:
        pd.DataFrame: Up to top_n rows, best score first, with columns
                      ['Product ID', 'Product Name', 'Category', 'Sub-Category',
                      'Similarity Score']. Empty (with these columns) when the
                      customer has no history or none of their products is in
                      the similarity matrix. For customer_id=None the frame
                      comes from get_global_popular_products.
    """
    info_columns = ['Product ID', 'Product Name', 'Category', 'Sub-Category']
    output_columns = info_columns + ['Similarity Score']

    # Case 1: No customer → return top global products
    if customer_id is None:
        print("No customer ID provided. Returning global popular products.")
        return get_global_popular_products(top_n=top_n)

    order_history, product_ids = get_customer_orders_and_products(customer_id, superstore_data)
    if order_history.empty:
        print(f"No purchase history for customer '{customer_id}'.")
        return pd.DataFrame(columns=output_columns)

    # Only purchased products that have a column in the similarity matrix count
    known_ids = [pid for pid in product_ids if pid in product_similarity_df.columns]
    if not known_ids:
        print(f"No valid products found for similarity for customer '{customer_id}'.")
        return pd.DataFrame(columns=output_columns)

    # Average similarity of every catalogue product to the customer's purchases
    avg_scores = product_similarity_df[known_ids].mean(axis=1)

    # Remove already purchased products, then keep the best top_n
    avg_scores = avg_scores.drop(labels=product_ids, errors='ignore')
    top_scores = avg_scores.sort_values(ascending=False).head(top_n)

    # Join scores to product details ON Product ID. Assigning the score array to a
    # frame filtered with isin() would pair scores with the wrong products, because
    # that frame keeps product_popularity's row order, not the ranking order.
    recommendations = (
        top_scores.rename('Similarity Score')
        .rename_axis('Product ID')
        .reset_index()
        .merge(product_popularity[info_columns], on='Product ID', how='inner')
    )

    # Fallback if not enough items
    if len(recommendations) < top_n:
        print(f"Only {len(recommendations)} collaborative recommendations found. Adding fallback items.")
        # product_ids is a NumPy array, so `product_ids + some_list` would be
        # element-wise addition, not concatenation; build the exclusion set explicitly.
        excluded = set(product_ids) | set(recommendations['Product ID'])
        fallback = get_global_popular_products(top_n=top_n + len(excluded))
        fallback = (
            fallback[~fallback['Product ID'].isin(list(excluded))]
            .head(top_n - len(recommendations))
            .copy()  # independent frame, safe to add a column to
        )
        if not fallback.empty:
            fallback['Similarity Score'] = 0  # fallback isn't similarity-based
            recommendations = pd.concat([recommendations, fallback])

    return recommendations.head(top_n)[output_columns]

def recommend_hybrid(customer_id, top_n=5, w_content=0.5, w_collab=0.4, w_pop=0.1, show_debug=False):
    """
    Recommends products using a hybrid approach combining content similarity,
    collaborative similarity, and global popularity.

    For every product the customer has not purchased, three scores are computed
    (average content similarity to the purchased products, average item-item
    collaborative similarity to them, and the global popularity score), each is
    min-max scaled to [0, 1] over the candidate products, and they are blended
    as a weighted sum.

    Args:
        customer_id (str): The ID of the customer. If None, globally popular
                           products are returned instead.
        top_n (int, optional): The number of products to recommend. Defaults to 5.
        w_content (float, optional): Weight of the content score. Defaults to 0.5.
        w_collab (float, optional): Weight of the collaborative score. Defaults to 0.4.
        w_pop (float, optional): Weight of the popularity score. Defaults to 0.1.
        show_debug (bool, optional): If True, prints the 10 best candidates with
                                     their individual scores. Defaults to False.

    Returns:
        pd.DataFrame: Up to top_n rows, best first, with columns ['Product ID',
                      'Product Name', 'Category', 'Sub-Category', 'final_score'].
                      An empty DataFrame if the customer has no purchase history.
                      For customer_id=None the frame comes from
                      get_global_popular_products.
    """
    output_columns = ['Product ID', 'Product Name', 'Category', 'Sub-Category', 'final_score']
    score_columns = ['content_score', 'collab_score', 'popularity_score']

    # Case 1: No customer → return top global products
    if customer_id is None:
        print("No customer ID provided. Returning global popular products.")
        return get_global_popular_products(top_n=top_n)

    order_history, product_ids = get_customer_orders_and_products(customer_id, superstore_data)
    if order_history.empty:
        print(f"No purchase history found for customer '{customer_id}'.")
        return pd.DataFrame()

    # --- 1. Calculate Average Content Similarity Scores ---
    purchased_idxs_content = [product_indices[pid] for pid in product_ids if pid in product_indices.index]
    valid_idxs = [idx for idx in purchased_idxs_content if idx < cosine_sim.shape[0]]
    if not purchased_idxs_content:
        print(f"No purchased products for customer '{customer_id}' found in content product index.")
        avg_content_sim_scores = np.zeros(len(products))  # zero score if no history match
    elif not valid_idxs:
        print(f"No valid content-based product indices for customer '{customer_id}'.")
        avg_content_sim_scores = np.zeros(len(products))
    else:
        # Average similarity of every product to the user's purchase history
        avg_content_sim_scores = sum(cosine_sim[idx] for idx in valid_idxs) / len(valid_idxs)

    # Row i of cosine_sim corresponds to row i of `products`; pair them positionally
    content_df = pd.DataFrame({
        'Product ID': products['Product ID'].to_numpy(),
        'content_score': avg_content_sim_scores
    })

    # --- 2. Calculate Average Collaborative Similarity Scores ---
    known_ids = [pid for pid in product_ids if pid in product_similarity_df.columns]
    if not known_ids:
        print(f"No valid products found for collaborative similarity for customer '{customer_id}'.")
        # Assign zero score if no history match in collaborative matrix
        collab_df = pd.DataFrame({'Product ID': product_similarity_df.columns, 'collab_score': 0.0})
    else:
        collab_df = (
            product_similarity_df[known_ids].mean(axis=1)
            .rename('collab_score')
            .rename_axis('Product ID')
            .reset_index()
        )

    # --- 3. Combine All Scores ---
    # Start with all products and their popularity
    combined_df = product_popularity[['Product ID', 'Product Name', 'Category', 'Sub-Category', 'popularity_score']].copy()

    # Merge content scores (products without one get 0)
    combined_df = combined_df.merge(content_df, on='Product ID', how='left')
    combined_df['content_score'] = combined_df['content_score'].fillna(0)

    # Merge collaborative scores (products without one get 0)
    combined_df = combined_df.merge(collab_df, on='Product ID', how='left')
    combined_df['collab_score'] = combined_df['collab_score'].fillna(0)

    # Filter out already purchased items
    combined_df = combined_df[~combined_df['Product ID'].isin(product_ids)].copy()

    # --- 4. Normalize All Scores ---
    # Without scaling the three scores live on different ranges and one dominates.
    # Skip scaling when no candidates remain: the scaler cannot be fit on zero rows.
    if not combined_df.empty:
        scaler = MinMaxScaler()
        combined_df[score_columns] = scaler.fit_transform(combined_df[score_columns])

    # --- 5. Calculate Final Score ---
    combined_df['final_score'] = (
        w_content * combined_df['content_score'] +
        w_collab * combined_df['collab_score'] +
        w_pop * combined_df['popularity_score']
    )

    # --- 6. Show Debug Info (Optional) ---
    if show_debug:
        print("\n[DEBUG] Top products by each score (before final sort):")
        print(combined_df[['Product Name', 'content_score', 'collab_score', 'popularity_score', 'final_score']]
              .sort_values(by='final_score', ascending=False).head(10))

    # --- 7. Sort and take the best top_n ---
    final_recommendations = combined_df.sort_values(by='final_score', ascending=False).head(top_n)

    # --- 8. Fallback Logic ---
    if len(final_recommendations) < top_n:
        print(f"Only {len(final_recommendations)} hybrid recommendations found. Adding fallback items.")
        # product_ids is a NumPy array, so `product_ids + some_list` would be
        # element-wise addition, not concatenation; build the exclusion set explicitly.
        excluded = set(product_ids) | set(final_recommendations['Product ID'])
        # The helper's keyword is `top_n` (passing `n=` raises TypeError)
        fallback = get_global_popular_products(top_n=top_n + len(excluded))
        fallback = (
            fallback[~fallback['Product ID'].isin(list(excluded))]
            .head(top_n - len(final_recommendations))
            .copy()  # independent frame, safe to add a column to
        )
        if not fallback.empty:
            fallback['final_score'] = 0.0  # Neutral fallback score
            final_recommendations = pd.concat([final_recommendations, fallback])

    return final_recommendations[output_columns]

=== 5. Example Usage ===

In [20]:
def print_recommendation_output(customer_id, num_recommendations=5):
    """Print a purchase-history summary and all four recommendation lists for one customer.

    If the customer has no rows in ``superstore_data``, only the globally
    popular products are printed. Otherwise the function prints a summary
    (unique product count, most recent purchase, most frequent items, top
    categories), the full purchase history (newest first), and then the
    Popular, Content-Based, Collaborative and Hybrid recommendations, each
    with a short explanation per product.

    Args:
        customer_id (str): The ID of the customer to report on.
        num_recommendations (int, optional): Number of products requested from
                                             each recommender. Defaults to 5.

    Returns:
        None. Everything is written to standard output.
    """
    banner = "=" * 60
    print(banner)
    print("Example: Personalized Recommendations for One Customer")
    print(banner)

    # Step 1: Show context
    order_history, product_ids = get_customer_orders_and_products(customer_id, superstore_data)

    if order_history.empty:
        print(f"\nNo purchase history found for customer '{customer_id}'. Showing global popular items instead.")
        print(recommend_popular(customer_id=None, top_n=num_recommendations))
        return

    print(f"\n Purchase History Summary for Customer: {customer_id}")
    print(f"  - Total Unique Products Purchased: {len(product_ids)}")

    # Newest-first history; its first row is the most recent purchase
    newest_first = order_history.sort_values('Order Date', ascending=False)
    latest = newest_first.iloc[0]
    print(f"  - Most Recent Purchase: '{latest['Product Name']}' on {latest['Order Date'].date()} — {latest['Category']} / {latest['Sub-Category']}")

    # The three products that appear most often in the customer's rows
    purchase_counts = order_history['Product ID'].value_counts()
    print("  - Most Frequently Purchased Items:")
    for pid, times_bought in purchase_counts.head(3).items():
        sample = superstore_data[superstore_data['Product ID'] == pid].iloc[0]
        print(f"    → '{sample['Product Name']}' ({times_bought} times) — {sample['Category']} / {sample['Sub-Category']}")

    # Category preferences
    top_cats, top_subcats = get_customer_preferences(customer_id, superstore_data)
    print(f"  - Top Categories: {', '.join(top_cats[:3])}")
    print(f"  - Top Sub-Categories: {', '.join(top_subcats[:3])}")

    print("\n Customer Purchase History")
    for _, purchase in newest_first.iterrows():
        print(f"  [{purchase['Order Date'].date()}] {purchase['Product Name']} (ID: {purchase['Product ID']}) — {purchase['Category']} / {purchase['Sub-Category']}")

    print("\n" + banner)
    print(" Recommendation Outputs")
    print(banner)

    def describe(name, df, context_col=None):
        """Print one recommendation list with a per-product explanation line."""
        print(f"\nTop {num_recommendations} {name} Recommendations:")
        if context_col:
            print(f"(Based on {context_col})")
        for _, rec in df.iterrows():
            notes = []
            if 'Similarity Score' in rec:
                # A score of exactly 0 marks a popularity fallback item
                if rec['Similarity Score'] == 0:
                    notes.append("fallback (popular item)")
                else:
                    notes.append(f"similarity score: {rec['Similarity Score']:.4f}")
            if rec['Category'] in top_cats:
                notes.append(f"matches favorite category: {rec['Category']}")
            if rec['Sub-Category'] in top_subcats:
                notes.append(f"matches frequent sub-category: {rec['Sub-Category']}")
            print(f"→ {rec['Product Name']} (ID: {rec['Product ID']}) — {rec['Category']} / {rec['Sub-Category']}")
            explanation = "; ".join(notes)
            if explanation:
                print(f"   Explanation: {explanation}\n")

    # Run every recommender first (they print their own diagnostics), then
    # print the explained lists in the same order.
    sections = [
        ("Popular",
         recommend_popular(customer_id, top_n=num_recommendations),
         "overall purchase frequency across all users"),
        ("Content-Based",
         recommend_content_based(customer_id, top_n=num_recommendations),
         "last product purchased"),
        ("Collaborative",
         recommend_collaborative(customer_id, top_n=num_recommendations),
         "co-purchase patterns of similar users"),
        ("Hybrid",
         recommend_hybrid(customer_id, top_n=num_recommendations, show_debug=True),
         "content + collaborative + popularity"),
    ]
    for label, recs, basis in sections:
        describe(label, recs, context_col=basis)

# Example usage:
print_recommendation_output("WB-21850", num_recommendations=5)


Example: Personalized Recommendations for One Customer

 Purchase History Summary for Customer: WB-21850
  - Total Unique Products Purchased: 36
  - Most Recent Purchase: 'Contract Clock, 14", Brown' on 2017-12-10 — Furniture / Furnishings
  - Most Frequently Purchased Items:
    → 'Fellowes 8 Outlet Superior Workstation Surge Protector' (2 times) — Office Supplies / Appliances
    → 'Fellowes PB200 Plastic Comb Binding Machine' (1 times) — Office Supplies / Binders
    → 'Motorla HX550 Universal Bluetooth Headset' (1 times) — Technology / Phones
  - Top Categories: Office Supplies, Technology, Furniture
  - Top Sub-Categories: Binders, Phones, Furnishings

 Customer Purchase History
  [2017-12-10] Contract Clock, 14", Brown (ID: FUR-FU-10001475) — Furniture / Furnishings
  [2017-12-10] Heavy-Duty E-Z-D Binders (ID: OFF-BI-10000014) — Office Supplies / Binders
  [2017-11-11] Vinyl Coated Wire Paper Clips in Organizer Box, 800/Box (ID: OFF-FA-10004854) — Office Supplies / Fasteners
  [2

=== 6. Evaluation (Precision@K and Recall@K — work in progress) ===

In [17]:
def precision_at_k(recommended, actual_set, k):
    """Fraction of the k recommendation slots filled with relevant items.

    Args:
        recommended (list): Recommended item IDs, best first.
        actual_set (set): Relevant (held-out) item IDs.
        k (int): Cutoff; only the first k recommendations are inspected.

    Returns:
        float: hits / k, or 0.0 when there are no recommendations. The
               denominator is always k, even if fewer than k items were
               recommended.
    """
    if not recommended:
        return 0.0
    relevant_in_top_k = [item for item in recommended[:k] if item in actual_set]
    return len(relevant_in_top_k) / k

def recall_at_k(recommended, actual_set, k):
    """Fraction of the relevant items that appear in the first k recommendations.

    Args:
        recommended (list): Recommended item IDs, best first.
        actual_set (set): Relevant (held-out) item IDs.
        k (int): Cutoff; only the first k recommendations are inspected.

    Returns:
        float: hits / len(actual_set), or 0.0 when actual_set is empty.
    """
    if not actual_set:
        return 0.0
    relevant_in_top_k = [item for item in recommended[:k] if item in actual_set]
    return len(relevant_in_top_k) / len(actual_set)

def evaluate_model(model_func, k=10, model_name="Model"):
    """
    Evaluates a recommendation model using Precision@K and Recall@K.

    Every customer in the global ``test_user_items`` mapping is scored against
    their held-out product set and the per-customer metrics are averaged.
    A recommender that raises for a customer is logged and counted as having
    returned no recommendations for that customer.

    NOTE(review): the recommend_* functions in this notebook read the full
    ``superstore_data`` and remove every product the customer has ever bought,
    which includes the held-out test purchases. With those recommenders a hit
    is therefore impossible and both metrics come out as 0; they would need to
    be fitted on ``train_df`` only for this evaluation to be meaningful.

    Args:
        model_func (function): Recommender function with signature model_func(customer_id, top_n)
                               returning a DataFrame with a 'Product ID' column.
        k (int): Top-K cutoff
        model_name (str): Label for printing results

    Returns:
        None. Prints out average metrics.
    """
    precisions, recalls = [], []

    for user_id, actual_items in test_user_items.items():
        try:
            recs = model_func(user_id, top_n=k)
            rec_items = recs['Product ID'].tolist() if not recs.empty else []
        except Exception as e:
            # Best effort: one failing customer must not abort the whole run
            print(f"Error for user {user_id} in {model_name}: {e}")
            rec_items = []

        precisions.append(precision_at_k(rec_items, actual_items, k))
        recalls.append(recall_at_k(rec_items, actual_items, k))

    # Without test users the averages are undefined (would divide by zero)
    if not precisions:
        print(f"\nNo test users available; skipping evaluation for {model_name}.")
        return

    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)

    print(f"\n📊 Evaluation for {model_name}")
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")


In [21]:
# Score every recommender with the default cutoff, in a fixed order
for model_func, model_label in (
    (recommend_popular, "Popularity-Based"),
    (recommend_content_based, "Content-Based"),
    (recommend_collaborative, "Collaborative Filtering"),
    (recommend_hybrid, "Hybrid"),
):
    evaluate_model(model_func, model_name=model_label)


📊 Evaluation for Popularity-Based
Precision@10: 0.0000
Recall@10: 0.0000


KeyboardInterrupt: 