In [6]:
#importing important libraries
import os
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from src.persistence.database import database_authentication

by default the notebook is running on "notebook folder", hence running "from src.persistence.database import database_authentication" will throw an error. Ensure to change the directory to "customer_segmentation" before running "from src.persistence.database import database_authentication"

In [5]:
os.getcwd()

'e:\\STUDIES\\projects\\customer_segmentation'

In [4]:
os.chdir("../")

In [7]:
collection = database_authentication()

In [8]:
result = collection.find(projection = {"_id": 0})
df = pd.DataFrame(result)

In [11]:
df["cluster_labels"] = df["cluster_labels"].astype(str)

In [12]:
result = collection.aggregate(
        [
            {
                "$group": {"_id": "$cluster_labels", "count":{"$count": {}}}
            }
        ]
)
df = pd.DataFrame(result).rename({"_id":"clusters"}, axis="columns")
df

Unnamed: 0,clusters,count
0,1,59
1,2,59
2,5,58
3,4,59
4,3,58
5,0,58


### Frequency tables per cluster

In [12]:
df_cat = df.select_dtypes("object")
df_cat.columns

Index(['Gender', 'City', 'Membership Type', 'Satisfaction Level',
       'cluster_labels'],
      dtype='object')

In [None]:
df_gender = pd.crosstab(df["cluster_labels"], df["Gender"], normalize="index")* 100
round(df_gender, 1)

Gender,Female,Male
cluster_labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,100.0,0.0
1,98.3,1.7
2,0.0,100.0
3,1.7,98.3
4,98.3,1.7
5,0.0,100.0


: 

In [30]:
df_gender = pd.crosstab(df["cluster_labels"], df["Gender"], normalize="index")
gender_dist = df_gender.reset_index().melt(id_vars="cluster_labels", var_name ="Gender", value_name = "Propotion")
fig = px.bar(gender_dist, x="cluster_labels", y="Propotion", color = "Gender", barmode = "stack")
fig.show()

In [34]:
df_city = pd.crosstab(df["cluster_labels"], df["City"], normalize="index") * 100
round(df_city, 1)

City,Chicago,Houston,Los Angeles,Miami,New York,San Francisco
cluster_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0,100.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,100.0,0.0
2,0.0,0.0,100.0,0.0,0.0,0.0
3,0.0,0.0,0.0,100.0,0.0,0.0
4,98.3,0.0,1.7,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,100.0


cluster 0: has 58 females all from Houston.
cluster 1: has 59 new york customers.
cluster 2: has 59 Los Angeles customers
cluster 3: 58 Miami customers.
cluster 4: 58 Chicago and 1 Los Angeles
cluster 5: has 58 San Francisco customers.

In [33]:
df_membership_type = pd.crosstab(df["cluster_labels"], df["Membership Type"], normalize="index")* 100
round(df_membership_type, 1)

Membership Type,Bronze,Gold,Silver
cluster_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,100.0,0.0,0.0
1,0.0,100.0,0.0
2,0.0,0.0,100.0
3,0.0,0.0,100.0
4,98.3,0.0,1.7
5,0.0,100.0,0.0


cluster 1 and 5: High spend customers.

cluster 0 and 4: moderate spend customers.

cluster 2 and 3: Low spend customers.


In [32]:
df_satisfaction_level = pd.crosstab(df["cluster_labels"], df["Satisfaction Level"], normalize="index") * 100
round(df_satisfaction_level, 1) 

Satisfaction Level,Neutral,Satisfied,Unsatisfied
cluster_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,100.0,0.0,0.0
1,0.0,100.0,0.0
2,86.4,13.6,0.0
3,0.0,0.0,100.0
4,0.0,1.7,98.3
5,0.0,100.0,0.0


In [31]:
satisfaction_level_dist = df_satisfaction_level.reset_index().melt(id_vars="cluster_labels", var_name = "Satisfaction Level", value_name ="Proportion")
fig =px.bar(satisfaction_level_dist, barmode="stack", x="cluster_labels", y="Proportion", color="Satisfaction Level")
fig.show()

**Overal Intepretations**: cluster 1 and 5, are high spenders and loyal customers.
Cluster 3 and 4 are low spenders and unsatisfied customer, at-risk of churning customers.


In [None]:

# numerical analysis
df_num = df.drop(columns=["Customer ID"]).select_dtypes("number")
df_behavior = df_num.groupby("cluster_labels").mean()
df_behavior

Unnamed: 0_level_0,Age,Total Spend,Items Purchased,Average Rating,Days Since Last Purchase
cluster_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,36.706897,446.894828,7.568966,3.193103,22.758621
1,30.711864,1165.035593,15.271186,4.544068,24.59322
2,34.118644,805.491525,11.677966,4.172881,15.271186
3,28.948276,690.389655,11.637931,3.927586,45.5
4,42.254237,498.711864,9.576271,3.466426,40.457627
5,29.12069,1459.772414,20.0,4.808621,11.172414


In [9]:
fig = px.bar(
    df_behavior,
    barmode = "group"
)

fig.show()

In [20]:
ss = StandardScaler()
x_num = ss.fit_transform(df_num)

In [21]:
pca = PCA(n_components=2, random_state=42)
transfromed_features = pca.fit_transform(x_num)

X_pca = pd.DataFrame(transfromed_features, columns = ["pca_1", "pca_2"])
X_pca

Unnamed: 0,pca_1,pca_2
0,1.219825,-0.869477
1,-0.138058,-0.648857
2,-2.216109,1.560912
3,3.116181,0.449516
4,-0.124862,1.391998
...,...,...
346,-2.452635,-1.120424
347,1.338086,-0.660244
348,-0.111866,-0.511962
349,-2.310517,1.910150


In [22]:
fig = px.scatter(
    x = X_pca["pca_1"],
    y = X_pca["pca_2"],
    color = df_num["cluster_labels"].astype(str),
    title = "PCA Representation of Clusters"
    )

fig.update_layout(xaxis_title ="PC1", yaxis_title="PC2")
fig.show()