In [1]:
import pandas as pd
import numpy as np

from pycaret.clustering import ClusteringExperiment


# Directory holding the input CSV files. The trailing slash is required:
# the load cell builds the path with plain string concatenation in an f-string.
data_dir = "./data/"

In [2]:
# Read the insurance CSV from the data directory, then print the frame's
# column / dtype / non-null summary.
df_insurance = pd.read_csv(
    filepath_or_buffer=f"{data_dir}insurance.csv",
)
df_insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [3]:
# Feature Engineering
# Birth year is derived as (current calendar year - age). Subtracting
# age * 365 days from "now" ignores leap days, so for older ages the result
# drifts by a couple of weeks and, when run late in December, rolls over into
# the following year (off-by-one birth year).
# Note: the value depends on the date the notebook is executed.
df_insurance["born_year"] = pd.Timestamp.now().year - df_insurance["age"]
# Boolean flag: does the policy holder have at least one child?
df_insurance["has_children"] = df_insurance["children"] > 0
# One-hot encode the remaining categorical (object) columns; drop_first=True
# drops the first level of each so the dummies are not redundant.
df_insurance = pd.get_dummies(df_insurance, drop_first=True)

df_insurance

Unnamed: 0,age,bmi,children,charges,born_year,has_children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,2005,False,False,True,False,False,True
1,18,33.770,1,1725.55230,2006,True,True,False,False,True,False
2,28,33.000,3,4449.46200,1996,True,True,False,False,True,False
3,33,22.705,0,21984.47061,1991,False,True,False,True,False,False
4,32,28.880,0,3866.85520,1992,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1974,True,True,False,True,False,False
1334,18,31.920,0,2205.98080,2006,False,False,False,False,False,False
1335,18,36.850,0,1629.83350,2006,False,False,False,False,True,False
1336,21,25.800,0,2007.94500,2003,False,False,False,False,False,True


In [4]:
# Build a PyCaret clustering experiment and initialise it on the engineered
# frame. session_id is passed so the run uses a fixed seed.
# NOTE(review): no scaling option is passed here, so distance-based models
# presumably see `charges` on a much larger scale than the other columns —
# consider whether normalisation is wanted; confirm against PyCaret defaults.
exp = ClusteringExperiment()
s = exp.setup(
    data=df_insurance,
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(1338, 11)"
2,Transformed data shape,"(1338, 11)"
3,Numeric features,5
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,CPU Jobs,-1
9,Use GPU,False


In [5]:
# Reference list of the model identifiers accepted by create_model
# (kept as a plain string literal; it has no effect on the computation).
'''
'kmeans' - K-Means Clustering
'ap' - Affinity Propagation
'meanshift' - Mean shift Clustering
'sc' - Spectral Clustering
'hclust' - Agglomerative Clustering
'dbscan' - Density-Based Spatial Clustering
'optics' - OPTICS Clustering
'birch' - Birch Clustering
'kmodes' - K-Modes Clustering
'''

# Fit a K-Means model inside the experiment.
model = exp.create_model(estimator="kmeans")

# Attach the fitted model's cluster label to every row and show the result.
data_cluster = exp.assign_model(model=model)
data_cluster

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6105,6746.0756,0.4657,0,0,0


Unnamed: 0,age,bmi,children,charges,born_year,has_children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,Cluster
0,19,27.900000,0,16884.923828,2005,False,False,True,False,False,True,Cluster 3
1,18,33.770000,1,1725.552246,2006,True,True,False,False,True,False,Cluster 0
2,28,33.000000,3,4449.461914,1996,True,True,False,False,True,False,Cluster 0
3,33,22.705000,0,21984.470703,1991,False,True,False,True,False,False,Cluster 2
4,32,28.879999,0,3866.855225,1992,False,True,False,True,False,False,Cluster 0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.969999,3,10600.547852,1974,True,True,False,True,False,False,Cluster 3
1334,18,31.920000,0,2205.980713,2006,False,False,False,False,False,False,Cluster 0
1335,18,36.849998,0,1629.833496,2006,False,False,False,False,True,False,Cluster 0
1336,21,25.799999,0,2007.944946,2003,False,False,False,False,False,True,Cluster 0


In [6]:
# Show PyCaret's interactive evaluation widget (plot-type toggle buttons) for
# the fitted model. NOTE(review): this is an ipywidgets view, so it presumably
# will not render in a static export of the notebook — confirm if sharing.
exp.evaluate_model(model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…