In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load cleaned retail data
df = pd.read_csv('../data/cleaned_retail.csv')

# Basic filtering: keep only rows with a positive quantity, a known customer
# and a product description. The three conditions are combined into a single
# mask and the selection is copied, so the column assignments below write to
# an independent frame rather than to a slice of the loaded data (assigning
# into such a slice is what raises pandas' SettingWithCopyWarning).
valid_rows = (
    (df['Quantity'] > 0)
    & df['Customer ID'].notna()
    & df['Description'].notna()
)
df = df.loc[valid_rows].copy()

# Standardize column types: integer customer IDs, and descriptions without
# leading/trailing whitespace so identical products share one label.
df['Customer ID'] = df['Customer ID'].astype(int)
df['Description'] = df['Description'].str.strip()


In [3]:
# Build the customer x product purchase matrix: one row per customer, one
# column per product description, each cell holding the summed quantity
# (0 where there is no purchase record for that customer/product pair).
user_item_matrix = pd.pivot_table(
    df,
    values='Quantity',
    index='Customer ID',
    columns='Description',
    aggfunc='sum',
    fill_value=0,
)

# Preview the first few customers.
user_item_matrix.head()


Description,10 COLOUR SPACEBOY PEN,11 PC CERAMIC TEA SET POLKADOT,12 ASS ZINC CHRISTMAS DECORATIONS,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 MINI TOADSTOOL PEGS,...,ZINC STAR T-LIGHT HOLDER,ZINC SWEETHEART SOAP DISH,ZINC SWEETHEART WIRE LETTER RACK,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK,ZINC WIRE KITCHEN ORGANISER,ZINC WIRE SWEETHEART LETTER TRAY
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347,24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12349,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Pairwise cosine similarity between customers; each row of the
# user-item matrix is treated as one customer's purchase vector.
user_similarity = cosine_similarity(user_item_matrix)

# Label both axes of the similarity array with the customer IDs so that
# similarities can be looked up by customer rather than by position.
user_sim_df = pd.DataFrame(data=user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)


In [7]:
def recommend_products(target_customer_id, top_n_customers=5, top_n_products=5):
    """Recommend products based on what the most similar customers bought.

    Reads the module-level ``user_sim_df`` (customer x customer similarity)
    and ``user_item_matrix`` (customer x product quantities).

    Parameters
    ----------
    target_customer_id
        Label of the customer in ``user_sim_df`` / ``user_item_matrix``.
    top_n_customers : int, default 5
        Number of most similar *other* customers whose purchases are pooled.
    top_n_products : int, default 5
        Maximum number of products to return.

    Returns
    -------
    pandas.Series or str
        Series indexed by product, holding the quantity summed over the
        selected neighbours, sorted in descending order. Only products the
        target has not bought and at least one neighbour has bought are
        included, so the result may be shorter than ``top_n_products`` or
        empty. If the customer is unknown, a "not found" message string is
        returned instead (kept for backward compatibility with existing
        callers).
    """
    if target_customer_id not in user_sim_df.index:
        return f"Customer ID {target_customer_id} not found."

    # Exclude the target by label, not by position: the computed
    # self-similarity is not guaranteed to sort first (floating-point error,
    # ties with other customers), so slicing off the first entry can leave
    # the target in its own neighbour set and push out a real neighbour.
    neighbour_scores = user_sim_df[target_customer_id].drop(labels=target_customer_id)
    similar_customers = (
        neighbour_scores
        .sort_values(ascending=False, kind='stable')  # stable => deterministic on ties
        .head(top_n_customers)
        .index
    )

    # Aggregate product quantities purchased by the similar customers.
    similar_customer_data = user_item_matrix.loc[similar_customers].sum()

    # Products the target customer has already purchased.
    already_bought = user_item_matrix.loc[target_customer_id]
    already_bought_products = already_bought[already_bought > 0].index

    # Remove already purchased products, then discard anything none of the
    # neighbours bought; a product with zero quantity is not a recommendation.
    recommended = similar_customer_data.drop(labels=already_bought_products, errors='ignore')
    recommended = recommended[recommended > 0]

    # Return the top N recommendations, highest pooled quantity first.
    return recommended.sort_values(ascending=False, kind='stable').head(top_n_products)


In [8]:
# Spot check: take the first customer in the user-item matrix index and
# print that customer's recommendations using the default neighbourhood
# and result sizes.
sample_customer = user_item_matrix.index[0]
print(f"Recommendations for Customer {sample_customer}:\n")
print(recommend_products(sample_customer))


Recommendations for Customer 12346:

Description
BLUE PUDDING SPOON                     48
RED PUDDING SPOON                      48
WOODEN HEART CHRISTMAS SCANDINAVIAN    48
SMALL CHINESE STYLE SCISSOR            40
LARGE CHINESE STYLE SCISSOR            40
dtype: int64


In [9]:
# Generate top 5 recommendations for all customers
recommendation_dict = {}
failed_customers = {}  # customer id -> repr of the error raised for it

for cid in user_item_matrix.index:
    try:
        recommendation_dict[cid] = recommend_products(cid).index.tolist()
    except Exception as exc:
        # Best effort: one failing customer must not abort the whole export,
        # but the failure is recorded instead of being silently swallowed.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the loop can still be interrupted.
        recommendation_dict[cid] = []
        failed_customers[cid] = repr(exc)

if failed_customers:
    sample_failures = dict(list(failed_customers.items())[:5])
    print(
        f"Warning: recommendations failed for {len(failed_customers)} "
        f"customer(s); first few: {sample_failures}"
    )

# Convert and save: one row per customer, one column per recommendation rank
# (rows with fewer recommendations are padded with missing values).
rec_df = pd.DataFrame.from_dict(recommendation_dict, orient='index')
rec_df.to_csv('../data/customer_recommendations.csv')
