In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the preprocessed insurance table; the path is relative to the
# notebook's working directory.
df = pd.read_csv('insurance_processed.csv')

In [3]:
# Bin `charges` into three ordinal categories:
#   0 -> [0, 5000]   1 -> (5000, 25000]   2 -> (25000, inf]
# `include_lowest=True` closes the first interval on the left, so a charge of
# exactly 0 still lands in category 0 instead of becoming NaN.
df['charges_cat'] = pd.cut(
    df['charges'],
    bins=[0, 5000, 25000, float('inf')],
    labels=[0, 1, 2],
    include_lowest=True,
)

In [4]:
# Preview the first five rows to check the new `charges_cat` column.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest,charges_cat
0,19,0,27.9,0,1,16884.924,0,0,0,1,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0,0
2,28,1,33.0,3,0,4449.462,0,0,1,0,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0,1
4,32,1,28.88,0,0,3866.8552,0,1,0,0,0


In [5]:
# Persist the frame, including the new `charges_cat` column, without writing
# the row index as an extra column.
df.to_csv('insurance_clustering.csv', index=False)
# Reload from disk so the rest of the notebook works on the saved file.
# NOTE(review): the round-trip also turns the categorical column produced by
# pd.cut into a plain integer column -- presumably intended; confirm.
df = pd.read_csv('insurance_clustering.csv')

In [6]:
# Keep only the predictor columns: `charges_cat` is the reference label and
# `charges` is the value it was binned from, so neither may be clustered on.
insurance_features = df.drop(columns=['charges', 'charges_cat'])

insurance_features.head()

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,0,0,0,1
1,18,1,33.77,1,0,0,0,1,0
2,28,1,33.0,3,0,0,0,1,0
3,33,1,22.705,0,0,0,1,0,0
4,32,1,28.88,0,0,0,1,0,0


In [7]:
# Reference labels (the binned charge category) that the cluster assignments
# are scored against.
insurance_labels = df['charges_cat']

# Random spot-check; no seed is passed, so the rows shown change on every run.
insurance_labels.sample(5)

213    1
706    2
858    1
422    2
41     0
Name: charges_cat, dtype: int64

In [8]:
def build_model(clustering_model, data, labels):
    """Fit a clustering model and print a one-row table of quality scores.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``. The object it returns must
        expose a ``labels_`` attribute holding the cluster assignments.
    data : array-like
        Feature matrix handed to ``clustering_model`` and to the silhouette
        score.
    labels : array-like
        Reference labels the cluster assignments are compared against.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    fitted = clustering_model(data)

    print('homo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print(50 * '-')

    assigned = fitted.labels_

    # These five scores compare the assignments with the reference labels.
    label_based_scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    scores = [scorer(labels, assigned) for scorer in label_based_scorers]

    # Silhouette is computed from the features themselves, not the labels.
    scores.append(metrics.silhouette_score(data, assigned))

    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % tuple(scores))

In [9]:
def k_means(data, n_clusters=5, max_iter=1000, random_state=None):
    """Fit a K-Means model on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix passed straight to ``KMeans.fit``.
    n_clusters : int, default=5
        Number of clusters to form.
    max_iter : int, default=1000
        Upper bound on the iterations of a single K-Means run.
    random_state : int, RandomState instance or None, default=None
        Seed for the randomised centroid initialisation. ``None`` keeps the
        previous unseeded behaviour; pass an integer to make the clustering,
        and every score derived from it, repeatable across runs.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator.
    """
    return KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
    ).fit(data)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.326	0.194	0.243	0.161	0.241	0.321


In [23]:
# Pin the seed: K-Means uses a randomised centroid initialisation, so without
# it the labels, centres and scores in the following cells change between runs.
kmeans_model = KMeans(n_clusters=3, random_state=42).fit(insurance_features)

In [24]:
# Cluster index assigned to each row of `insurance_features`.
kmeans_model.labels_

array([0, 0, 0, ..., 0, 0, 1])

In [25]:
# Centroid coordinates: one row per cluster, one value per feature column, in
# the column order of `insurance_features`.
kmeans_model.cluster_centers_

array([[23.42250531,  0.51804671, 29.89234607,  0.87261146,  0.21231423,
         0.24203822,  0.24416136,  0.27600849,  0.23779193],
       [55.569161  ,  0.49433107, 31.70868481,  0.97959184,  0.17006803,
         0.24489796,  0.24489796,  0.26530612,  0.24489796],
       [39.72065728,  0.50234742, 30.43380282,  1.4600939 ,  0.23239437,
         0.23943662,  0.23943662,  0.27464789,  0.24647887]])

In [26]:
# Compare the 3-cluster assignments with the binned charge categories.
cluster_assignments = kmeans_model.labels_
for caption, scorer in (
    ("Homogeneity_score: ", metrics.homogeneity_score),
    ("Completeness_score: ", metrics.completeness_score),
    ("v_measure_score: ", metrics.v_measure_score),
):
    print(caption, scorer(insurance_labels, cluster_assignments))

Homogeneity_score:  0.2884428822123002
Completeness_score:  0.25042060936656246
v_measure_score:  0.26809031771446573
