In [1]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
from category_encoders import OneHotEncoder
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from kmodes.kprototypes import KPrototypes

In [4]:
os.getcwd()

'e:\\STUDIES\\projects\\customer_segmentation'

In [3]:
os.chdir("../")

In [5]:
df= pd.read_csv("artifact\data_ingestion\customer_segmentation.csv")
print(df.shape)
print(df.info())
df.head()

  df= pd.read_csv("artifact\data_ingestion\customer_segmentation.csv")


(280, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               280 non-null    int64  
 1   gender                    280 non-null    object 
 2   age                       280 non-null    int64  
 3   city                      280 non-null    object 
 4   membership_type           280 non-null    object 
 5   total_spend               280 non-null    float64
 6   items_purchased           280 non-null    int64  
 7   average_rating            280 non-null    float64
 8   discount_applied          280 non-null    bool   
 9   days_since_last_purchase  280 non-null    int64  
 10  satisfaction_level        278 non-null    object 
dtypes: bool(1), float64(2), int64(4), object(4)
memory usage: 22.3+ KB
None


Unnamed: 0,customer_id,gender,age,city,membership_type,total_spend,items_purchased,average_rating,discount_applied,days_since_last_purchase,satisfaction_level
0,329,Male,35,Los Angeles,Silver,820.9,12,4.3,False,13,Neutral
1,179,Female,30,New York,Gold,1180.8,16,4.7,True,19,Satisfied
2,191,Female,30,New York,Gold,1190.8,16,4.5,True,20,Satisfied
3,117,Male,26,Miami,Silver,700.6,12,3.7,True,48,Unsatisfied
4,167,Female,32,New York,Gold,1160.3,14,4.4,True,22,Satisfied


In [7]:
df.columns.to_list()

['customer_id',
 'gender',
 'age',
 'city',
 'membership_type',
 'total_spend',
 'items_purchased',
 'average_rating',
 'discount_applied',
 'days_since_last_purchase',
 'satisfaction_level']

In [6]:
#it has meaning information
df.drop(columns =["customer_id"], inplace=True)

### Data Transformation:

### Method 1: PCA on numeric data + OHE on categorical data + KMeans

* The aim for this method is to reduce dimension(not that it big for practice) for numerical variables, then perform OneHotEncoder and then Kmeans.

In [7]:
num_pipeline = Pipeline(steps =[
    ("impute", SimpleImputer(strategy="mean")),
    ("std", StandardScaler()),
    ("pca", PCA(n_components=0.9))
])

cat_pipeline= Pipeline(steps =[
    ("ohe", OneHotEncoder())

])

processor = ColumnTransformer([
    ("num_pipeline", num_pipeline, df.select_dtypes("number").columns.to_list()),
    ("cat_pipeline", cat_pipeline, df.select_dtypes("object").columns.to_list())
])

transformed_df = processor.fit_transform(df)

$ inertia =\sum^{n}_{i=1} \left | \right | x_{i} - c \left | \right | ^ 2 $

* Where c is a centroid.

In [8]:
# fine tuning the model
n_clusters = range(3, 12)
inertia_list_scores = []
inertia_scores = {}
silhouette_list_scores = []
silhouette_scores ={}

for i in n_clusters:
    model = KMeans(n_clusters=i, random_state=42)    
    model.fit(transformed_df)
    
    #inertia_scores
    inertia_scores[i] = model.inertia_
    inertia_list_scores.append(model.inertia_)

    #silhouette_scores
    silhouette_scores[i] = silhouette_score(transformed_df, model.labels_)
    silhouette_list_scores.append(silhouette_score(transformed_df, model.labels_))
    

In [9]:
#inertia vs clusters
fig = px.line(x=n_clusters, y=inertia_scores.values(), title="KMeans Model: Inertia vs number of clusters")
fig.update_layout(xaxis_title="number_ of_clusters", yaxis_title="inertia")
fig.show()

The best inertia scoresis when the number of clusters is 6.

In [10]:
fig=px.line(x=n_clusters, y=silhouette_scores.values(), title="KMeans Model: number of clusters vs Silhouette Scores")
fig.update_layout(xaxis_title="Number of clusters", yaxis_title ="Silhouette Scores")
fig.show()

The best silhouette scores is when the number of clusters is 7 but 7 is a lot of groups.

### Final Model

In [11]:
model=KMeans(n_clusters = 6, random_state=42)
model.fit(transformed_df)

0,1,2
,n_clusters,6
,init,'k-means++'
,n_init,'auto'
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,42
,copy_x,True
,algorithm,'lloyd'


In [12]:
model.inertia_

104.78252190926057

Observations are not that far from their centroids.

In [13]:
silhouette_score(transformed_df, model.labels_)

0.7967643603971416

Fair score.

In [14]:
#number of components
num_components = processor.named_transformers_["num_pipeline"].named_steps["pca"]
num_components.get_feature_names_out()


array(['pca0', 'pca1'], dtype=object)

### Method 2:  Kmeans without PCA/FAMD


In [23]:
num_pipeline_2 = Pipeline(steps =[
    ("impute", SimpleImputer(strategy="mean")),
    ("std", StandardScaler())    
])

cat_pipeline_2= Pipeline(steps =[
    ("ohe", OneHotEncoder())

])

processor_2 = ColumnTransformer([
    ("num_pipeline", num_pipeline_2, df.select_dtypes("number").columns.to_list()),
    ("cat_pipeline", cat_pipeline_2, df.select_dtypes("object").columns.to_list())
])

transformed_df_2 = processor.fit_transform(df)

In [24]:
n_clusters = range(3, 12)
inertia_scores = {}
silhouette_scores ={}

for i in n_clusters:
    model = KMeans(n_clusters=i, random_state=42)    
    model.fit(transformed_df_2)
    
    #inertia_scores
    inertia_scores[i] = model.inertia_
    
    #silhouette_scores
    silhouette_scores[i] = silhouette_score(transformed_df_2, model.labels_)    

In [26]:
#inertia vs clusters
fig = px.line(x=n_clusters, y=inertia_scores.values(), title="KMeans Model: Inertia vs number of clusters")
fig.update_layout(xaxis_title="number_ of_clusters", yaxis_title="inertia")
fig.show()

In [27]:
fig=px.line(x=n_clusters, y=silhouette_scores.values(), title="KMeans Model: number of clusters vs Silhouette Scores")
fig.update_layout(xaxis_title="Number of clusters", yaxis_title ="Silhouette Scores")
fig.show()

### Method 3:  KPrototypes without PCA/FAMD/OHE

In [30]:

si = SimpleImputer(strategy="most_frequent")
transformed_df = si.fit_transform(df)
transformed_df

array([['Female', 29, 'New York', ..., True, 25, 'Satisfied'],
       ['Male', 34, 'Los Angeles', ..., False, 18, 'Neutral'],
       ['Female', 43, 'Chicago', ..., True, 42, 'Unsatisfied'],
       ...,
       ['Female', 30, 'New York', ..., True, 28, 'Satisfied'],
       ['Male', 34, 'Los Angeles', ..., False, 21, 'Neutral'],
       ['Female', 43, 'Chicago', ..., True, 49, 'Unsatisfied']],
      shape=(350, 10), dtype=object)

In [31]:
number_clusters = range(3, 12)
cost = {}
clusters={}

for k in number_clusters:
    kprototype = KPrototypes(n_clusters=k, init="cao", random_state=42)
    clusters[k] = kprototype.fit_predict(transformed_df, 
                                        categorical=[df.columns.get_loc(col) for col in df.select_dtypes("object").columns.to_list()])
    cost[k] = kprototype.cost_

ValueError: Clustering algorithm could not initialize. Consider assigning the initial clusters manually.

In [54]:
fig=px.line(x=n_clusters, y=cost.values(), title="KPrototype Model: number of clusters vs Cost")
fig.update_layout(xaxis_title="Number of clusters", yaxis_title ="Cost")
fig.show()

The best number of cluster is 6.

### Final Model

In [55]:

kprototype = KPrototypes(n_clusters=6, init="cao", random_state=42)
clusters = kprototype.fit_predict(transformed_df, categorical=[df.columns.get_loc(col) for col in df.select_dtypes("object").columns.to_list()])
kprototype.cost_

np.float64(2514076.4483778686)

### Communicaton

In [24]:
model.labels_

array([ 1,  2,  5,  9,  7,  0,  1,  2,  5,  4,  6,  0, 10,  8,  5,  9,  7,
        0,  1,  2,  5,  9,  7,  0,  1,  2,  5,  4,  6,  0, 10,  8,  5,  9,
        7,  0,  1,  2,  5,  9,  7,  0,  1,  2,  5,  4,  6,  0, 10,  2,  5,
        9,  7,  0,  1,  2,  5,  4,  6,  0, 10,  8,  5,  9,  7,  0,  1,  2,
        5,  9,  7,  0,  1,  2,  5,  4,  6,  0, 10,  2,  5,  9,  7,  0,  1,
        2,  5,  4,  6,  0, 10,  8,  5,  9,  7,  0,  1,  2,  5,  9,  7,  0,
        1,  2,  5,  4,  6,  0, 10,  2,  5,  9,  7,  0,  1,  2,  5,  4,  6,
        0, 10,  8,  5,  9,  7,  0,  1,  2,  5,  9,  7,  0,  1,  2,  5,  4,
        6,  0, 10,  2,  5,  9,  7,  0,  1,  2,  5,  4,  6,  0, 10,  2,  5,
        9,  7,  0,  1,  2,  5,  4,  6,  0, 10,  8,  5,  9,  7,  0,  1,  2,
        5,  9,  7,  0,  1,  2,  5,  4,  6,  0, 10,  2,  5,  9,  7,  0,  1,
        2,  5,  4,  6,  0, 10,  8,  5,  9,  7,  0,  1,  2,  9,  7,  0, 10,
        2,  5,  9,  7,  0,  1,  2,  5,  9,  7,  0,  1,  2,  5,  4,  6,  0,
       10,  2,  5,  9,  7

In [23]:
df["cluster_labels"] = model.labels_
df["cluster_labels"].unique()

array([ 1,  2,  5,  9,  7,  0,  4,  6, 10,  8,  3], dtype=int32)

In [21]:
labels = model.labels_
gb = df.groupby(labels).mean()
gb

TypeError: agg function failed [how->mean,dtype->object]

In [15]:
fig = px.bar(
    gb,
    barmode="group",
    title = "Mean"
)
fig.update_layout(xaxis_title="cluster", yaxis_title="")
fig.show()