In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [5]:
#load
customers = pd.read_csv('/Users/a12345/Desktop/DATA_PT/ML_Project/02_data/Blinkit/blinkit_customers.csv')
orders = pd.read_csv('/Users/a12345/Desktop/DATA_PT/ML_Project/02_data/Blinkit/blinkit_orders.csv')

In [8]:
customers

Unnamed: 0,customer_id,customer_name,email,phone,address,area,pincode,registration_date,customer_segment,total_orders,avg_order_value
0,97475543,Niharika Nagi,ektataneja@example.org,912987579691,"23, Nayar Path, Bihar Sharif-154625",Udupi,321865,2023-05-13,Premium,13,451.92
1,22077605,Megha Sachar,vedant45@example.com,915123179717,"51/302, Buch Chowk\nSrinagar-570271",Aligarh,149394,2024-06-18,Inactive,4,825.48
2,47822591,Hema Bahri,samiazaan@example.com,910034076149,"941\nAnne Street, Darbhanga 186125",Begusarai,621411,2024-09-25,Regular,17,1969.81
3,79726146,Zaitra Vig,ishanvi87@example.org,916264232390,"43/94, Ghosh, Alappuzha 635655",Kozhikode,826054,2023-10-04,New,4,220.09
4,57102800,Januja Verma,atideshpande@example.org,917293526596,"06\nOm, Ambarnath 477463",Ichalkaranji,730539,2024-03-22,Inactive,14,578.14
...,...,...,...,...,...,...,...,...,...,...,...
2495,48002829,Daksh Mandal,varkeymohammed@example.com,919587731286,"28/42, Venkataraman Ganj, Kishanganj-360157",Mumbai,45238,2024-01-25,Inactive,17,754.33
2496,57392064,Lavanya Jain,deshpandeom@example.com,916137420258,"391, Edwin Path, Korba 202726",Udupi,688100,2024-06-23,Regular,4,1540.81
2497,71688530,Umang Dash,gokhalenicholas@example.org,917595275963,"151, Parmer Ganj\nLoni 906431",Kavali,528749,2024-03-18,Regular,1,1541.22
2498,89051578,Zinal Natarajan,vyasfrado@example.com,911994009448,"H.No. 172, Khanna Street, Pudukkottai 610564",Alwar,586734,2024-09-06,Premium,12,1185.50


In [9]:
orders

Unnamed: 0,order_id,customer_id,order_date,promised_delivery_time,actual_delivery_time,delivery_status,order_total,payment_method,delivery_partner_id,store_id
0,1961864118,30065862,2024-07-17 08:34:01,2024-07-17 08:52:01,2024-07-17 08:47:01,On Time,3197.07,Cash,63230,4771
1,1549769649,9573071,2024-05-28 13:14:29,2024-05-28 13:25:29,2024-05-28 13:27:29,On Time,976.55,Cash,14983,7534
2,9185164487,45477575,2024-09-23 13:07:12,2024-09-23 13:25:12,2024-09-23 13:29:12,On Time,839.05,UPI,39859,9886
3,9644738826,88067569,2023-11-24 16:16:56,2023-11-24 16:34:56,2023-11-24 16:33:56,On Time,440.23,Card,61497,7917
4,5427684290,83298567,2023-11-20 05:00:39,2023-11-20 05:17:39,2023-11-20 05:18:39,On Time,2526.68,Cash,84315,2741
...,...,...,...,...,...,...,...,...,...,...
4995,1669690997,62600289,2023-12-25 15:46:20,2023-12-25 16:05:20,2023-12-25 16:10:20,On Time,1132.33,Cash,90914,1587
4996,8340761903,53640286,2023-11-27 09:18:43,2023-11-27 09:38:43,2023-11-27 09:36:43,On Time,2372.01,Cash,27952,3458
4997,5936301790,87059497,2024-06-21 19:09:09,2024-06-21 19:23:09,2024-06-21 19:26:09,On Time,3158.35,Cash,9590,7424
4998,5710579377,67310893,2024-06-06 14:58:13,2024-06-06 15:12:13,2024-06-06 15:10:13,On Time,1918.92,UPI,29940,6128


In [6]:
#mergeing customer data with orders
customer_orders = pd.merge(
    customers, orders, on="customer_id", how="left"
)

#aggregating
customer_behavior = customer_orders.groupby("customer_id").agg(
    total_spent=("order_total", "sum"),
    avg_order_value=("order_total", "mean"),
    order_count=("order_id", "count"),
    avg_delivery_time=("actual_delivery_time", lambda x: pd.to_datetime(x).diff().dt.seconds.mean() if len(x) > 1 else np.nan),
).fillna(0)

In [7]:
customer_behavior

Unnamed: 0_level_0,total_spent,avg_order_value,order_count,avg_delivery_time
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31813,5726.04,2863.020000,2,33993.000000
31826,0.00,0.000000,0,0.000000
61020,7844.90,2614.966667,3,83181.000000
75482,0.00,0.000000,0,0.000000
119099,14768.77,3692.192500,4,54889.333333
...,...,...,...,...
99734256,8571.89,2857.296667,3,55002.500000
99772709,5208.45,1736.150000,3,34572.000000
99784559,2527.14,2527.140000,1,0.000000
99824171,4515.68,2257.840000,2,10245.000000


In [7]:
#normalise
scaler = StandardScaler()
customer_behavior_scaled = scaler.fit_transform(customer_behavior)

#K-Means
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
customer_behavior["cluster"] = kmeans.fit_predict(customer_behavior_scaled)

In [9]:
#Silhouette Score
silhouette_avg = silhouette_score(customer_behavior_scaled, customer_behavior["cluster"])

silhouette_avg

0.42052208066641905

In [10]:
#Inertia
inertia_per_cluster = {i: np.mean(np.linalg.norm(customer_behavior_scaled[kmeans.labels_ == i] - kmeans.cluster_centers_[i], axis=1)) for i in range(kmeans.n_clusters)}

In [11]:
#Prep print
cluster_cohesion = pd.DataFrame({
    "cluster": list(inertia_per_cluster.keys()),
    "avg_inertia": list(inertia_per_cluster.values())
}).set_index("cluster")

#Print
print("Silhouette Score:", silhouette_avg)
print("Cluster Cohesion:")
print(cluster_cohesion)


Silhouette Score: 0.42052208066641905
Cluster Cohesion:
         avg_inertia
cluster             
0           1.182013
1           0.671616
2           1.032820
3           0.885097


In [13]:
# Describe each cluster
cluster_summary = customer_behavior.groupby("cluster").agg(
    avg_total_spent=("total_spent", "mean"),
    avg_order_value=("avg_order_value", "mean"),
    avg_order_count=("order_count", "mean"),
    avg_delivery_time=("avg_delivery_time", "mean"),
    customer_count=("cluster", "count")
)

print(cluster_summary)

         avg_total_spent  avg_order_value  avg_order_count  avg_delivery_time  \
cluster                                                                         
0            9967.281294      2606.137448         3.967146       42477.706351   
1             585.358474       519.498871         0.603424         849.917974   
2            4660.864150      1939.728257         2.467005       50512.303299   
3            3954.475706      3054.717303         1.337786        4440.689885   

         customer_count  
cluster                  
0                   487  
1                   701  
2                   788  
3                   524  


In [15]:
#saving machine and scaler
import joblib

joblib.dump(kmeans, "customer_segmentation_model.pkl")
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']