In [1]:
import gc
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the order data from 'instacart.csv' (path is relative to the working directory).
# Later cells read the 'user_id', 'product_id' and 'reordered' columns of this frame.
df = pd.read_csv('instacart.csv')

In [2]:
# # For products
# product_counts = df['product_id'].value_counts()
# min_product_count = product_counts.min()
# max_product_count = product_counts.max()

# # For users
# user_counts = df['user_id'].value_counts()
# min_user_count = user_counts.min()
# max_user_count = user_counts.max()

# print(f"Minimum product count: {min_product_count}, Maximum product count: {max_product_count}")
# print(f"Minimum user purchase count: {min_user_count}, Maximum user purchase count: {max_user_count}")


In [3]:
from scipy.sparse import csr_matrix, vstack
# Randomly sample 50% of users.
# A fixed random_state keeps the subsample identical across kernel restarts, so the
# seeded K-means runs further down actually produce reproducible clusters.
sampled_users = df['user_id'].drop_duplicates().sample(frac=0.50, random_state=42)
df_sampled_by_user = df[df['user_id'].isin(sampled_users)]

# Randomly sample 50% of products. The candidates are all distinct products in df,
# not only those bought by the sampled users; the same fixed seed is used.
sampled_products = df['product_id'].drop_duplicates().sample(frac=0.50, random_state=42)
df_sampled = df_sampled_by_user[df_sampled_by_user['product_id'].isin(sampled_products)]

# Step 1: Data Preparation
# User x product table of 'reordered' values (pivot_table aggregates with its default,
# the mean); user/product pairs that have no rows are filled with 0.
customer_product_matrix = df_sampled.pivot_table(
    values='reordered',
    index='user_id',
    columns='product_id',
    fill_value=0,
)
customer_product_matrix_sparse = csr_matrix(customer_product_matrix.values)

# Center the data: only the per-product (column) mean is removed, no variance scaling.
# The 1 x n_products mean row is flattened, wrapped as a one-row sparse matrix and
# stacked once per user so it has the same shape as the data matrix.
n_users = customer_product_matrix_sparse.shape[0]
mean_vector = np.array(customer_product_matrix_sparse.mean(axis=0)).reshape(-1)
mean_vector_sparse = csr_matrix(mean_vector)
mean_vector_repeated = vstack([mean_vector_sparse for _ in range(n_users)])

# Subtract the stacked means from the data. Note that every column with a non-zero
# mean becomes non-zero in (almost) every row, so the result keeps the sparse type
# but very little actual sparsity.
customer_product_matrix_centered = customer_product_matrix_sparse - mean_vector_repeated

# Ask the garbage collector to reclaim unreachable temporaries.
gc.collect()

# Step 2: Find the optimal number of clusters using the Elbow Method
# Densify the centered matrix once, up front, so every fit below shares the same array
# instead of allocating a fresh dense copy per K. KMeans is left at its default
# copy_x=True, which (per the scikit-learn docs) does not modify the input, so reusing
# the array across fits is safe.
centered_dense = customer_product_matrix_centered.toarray()

costs = []
K_range = range(1, 25)
for i in K_range:
    print(f"Running K-means for K={i}")
    kmeans = KMeans(n_clusters=i, random_state=42, n_init=10)
    kmeans.fit(centered_dense)
    costs.append(kmeans.inertia_)  # within-cluster sum of squares for this K

# Second discrete difference of the inertia curve: knee_point[j] measures how sharply
# the curve bends at K_range[j + 1]. The elbow is the sharpest bend, i.e. the LARGEST
# second difference; the smallest value marks the flattest stretch of the curve.
knee_point = np.diff(np.diff(costs))
optimal_k = K_range[int(np.argmax(knee_point)) + 1]

# Step 3: Run k-means with the optimal number of clusters
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(centered_dense)

# One row per user: the pivot table's index supplies the user ids in the same row
# order as the matrix that was clustered.
clustered_df = pd.DataFrame({'user_id': customer_product_matrix.index, 'group': clusters})

# Step 4: Extract top 3 most ordered products for each user
def get_top_n_products(user_id, n=3, data=None):
    """Return the `n` product ids that occur most often in one user's rows.

    Args:
        user_id: Value matched against the 'user_id' column.
        n: Maximum number of product ids to return (default 3).
        data: DataFrame with 'user_id' and 'product_id' columns. When omitted,
            the notebook-level `df_sampled` frame is used, so existing
            single-argument calls (e.g. via `Series.apply`) behave as before.

    Returns:
        list: Product ids ordered from most to least frequent. Shorter than `n`
        when the user has fewer distinct products; empty when the user has no rows.
    """
    orders = df_sampled if data is None else data
    user_products = orders.loc[orders['user_id'] == user_id, 'product_id']
    # most_common() yields (product_id, count) pairs, highest count first.
    return [product_id for product_id, _ in Counter(user_products).most_common(n)]

# Count every user's products in one grouped pass over df_sampled instead of
# re-filtering the whole frame once per user. Rows keep their original order inside
# each group, so ties are resolved exactly as a per-user Counter would resolve them.
top_3_by_user = {
    user_id: [product_id for product_id, _ in Counter(product_ids).most_common(3)]
    for user_id, product_ids in df_sampled.groupby('user_id', sort=False)['product_id']
}
# Users with no rows in df_sampled fall back to an empty list.
clustered_df['top_3_products'] = clustered_df['user_id'].map(
    lambda user_id: top_3_by_user.get(user_id, [])
)

# Step 5: Extract top 3 products for each cluster/group
def get_top_n_products_for_group(group, n=3, data=None, assignments=None):
    """Return the `n` product ids that occur most often among one group's users.

    Args:
        group: Value matched against the 'group' column of `assignments`.
        n: Maximum number of product ids to return (default 3).
        data: DataFrame with 'user_id' and 'product_id' columns. Defaults to the
            notebook-level `df_sampled` frame.
        assignments: DataFrame with 'user_id' and 'group' columns mapping users to
            groups. Defaults to the notebook-level `clustered_df` frame.

    Returns:
        list: Product ids ordered from most to least frequent across all rows of
        the group's users. Empty when the group has no users or no rows.
    """
    orders = df_sampled if data is None else data
    members = clustered_df if assignments is None else assignments
    member_ids = members.loc[members['group'] == group, 'user_id']
    group_products = orders.loc[orders['user_id'].isin(member_ids), 'product_id']
    # most_common() yields (product_id, count) pairs, highest count first.
    return [product_id for product_id, _ in Counter(group_products).most_common(n)]

# Compute each group's top products once and hand them to the group's members;
# calling the helper per row would repeat the same full scan for every user.
group_top_3 = {
    group: get_top_n_products_for_group(group)
    for group in clustered_df['group'].unique()
}
# list(...) gives every row its own list object rather than a shared reference.
clustered_df['group_top_3_products'] = clustered_df['group'].map(
    lambda group: list(group_top_3[group])
)

# Step 6: Save to CSV
clustered_df = clustered_df.reset_index(drop=True)

output_path = 'path'
output_filename = 'grouped_customers_with_top_3_products.csv'
# Join with the platform's path separator: plain concatenation would yield
# 'pathgrouped_customers_with_top_3_products.csv' whenever output_path has no
# trailing separator.
full_output_path = os.path.join(output_path, output_filename)
# to_csv does not create missing directories, so make sure the target exists
# (skipped for an empty output_path, which means the current directory).
if output_path:
    os.makedirs(output_path, exist_ok=True)
clustered_df.to_csv(full_output_path, index=False)

Running K-means for K=1
Running K-means for K=2
Running K-means for K=3
Running K-means for K=4
Running K-means for K=5
Running K-means for K=6
Running K-means for K=7
Running K-means for K=8
Running K-means for K=9
Running K-means for K=10
Running K-means for K=11
Running K-means for K=12
Running K-means for K=13
Running K-means for K=14
Running K-means for K=15
Running K-means for K=16
Running K-means for K=17
Running K-means for K=18
Running K-means for K=19
Running K-means for K=20
Running K-means for K=21
Running K-means for K=22
Running K-means for K=23
Running K-means for K=24


In [4]:
#### Non-efficient version #####

# # Step 1: Data Preparation
# # Create a customer-product matrix
# customer_product_matrix = pd.pivot_table(df, values='reordered', index='user_id', columns='product_id', fill_value=0)

# # Standardize the data
# scaler = StandardScaler()
# customer_product_matrix_scaled = scaler.fit_transform(customer_product_matrix)
#
# # Step 2: Find the optimal number of clusters using the Elbow Method
# costs = []
# K_range = range(1, 25)  # You can also increase the range for better granularity
# for i in K_range:
#     kmeans = KMeans(n_clusters=i, random_state=42,n_init=10)
#     kmeans.fit(customer_product_matrix_scaled)
#     costs.append(kmeans.inertia_)

# # Identify the elbow point programmatically
# knee_point = np.diff(np.diff(costs))
# optimal_k = K_range[np.where(knee_point == min(knee_point))[0][0] + 1]

# # Step 3: Run k-means with the optimal number of clusters
# kmeans = KMeans(n_clusters=optimal_k, random_state=42,n_init=10)
# clusters = kmeans.fit_predict(customer_product_matrix_scaled)

# # Add the cluster label to the original DataFrame
# customer_product_matrix['cluster'] = clusters

# # Step 4: Extract top 3 most ordered products for each user
# def get_top_n_products(user_id, n=3):
#     user_data = df[df['user_id'] == user_id]
#     product_count = Counter(user_data['product_id'])
#     most_common = product_count.most_common(n)
#     top_n_products = [item[0] for item in most_common]
#     return top_n_products

# clustered_df = pd.DataFrame({'user_id': customer_product_matrix.index, 'group': customer_product_matrix['cluster']})
# clustered_df['top_3_products'] = clustered_df['user_id'].apply(get_top_n_products)

# # Step 5: Extract top 3 products for each cluster/group
# def get_top_n_products_for_group(group, n=3):
#     group_data = df[df['user_id'].isin(clustered_df[clustered_df['group'] == group]['user_id'])]
#     product_count = Counter(group_data['product_id'])
#     most_common = product_count.most_common(n)
#     top_n_products = [item[0] for item in most_common]
#     return top_n_products

# clustered_df['group_top_3_products'] = clustered_df['group'].apply(get_top_n_products_for_group)

# # Step 6: Save to CSV
# clustered_df.reset_index(drop=True, inplace=True)
