In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation/future warnings raised
# by pandas or scikit-learn are hidden as well -- consider narrowing it.
warnings.filterwarnings("ignore")

In [24]:
# Load the pre-processed insurance dataset into the working frame.
source_csv = 'insurance_processed.csv'
df = pd.read_csv(source_csv)

In [56]:
# Bucket `charges` into three ordered tiers, labelled 0, 1 and 2:
#   0 -> [0, 5000], 1 -> (5000, 25000], 2 -> (25000, inf)
# `include_lowest=True` makes the first interval closed on the left.
charge_bin_edges = [0, 5000, 25000, float('inf')]
charge_tier_labels = [0, 1, 2]
df['charges_cat'] = pd.cut(
    df['charges'],
    bins=charge_bin_edges,
    labels=charge_tier_labels,
    include_lowest=True,
)

In [57]:
# Preview the first five rows, now including the derived `charges_cat` column.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest,charges_cat
0,19,0,27.9,0,1,16884.924,0,0,0,1,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0,0
2,28,1,33.0,3,0,4449.462,0,0,1,0,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0,1
4,32,1,28.88,0,0,3866.8552,0,1,0,0,0


In [58]:
# Persist the labelled frame (without the pandas index) and load it straight
# back, so the rest of the notebook works from the file on disk.
clustering_csv = 'insurance_clustering.csv'
df.to_csv(clustering_csv, index=False)
df = pd.read_csv(clustering_csv)

In [59]:
# Clustering inputs: every column except the target (`charges`) and the label
# derived from it (`charges_cat`).
insurance_features = df.drop(columns=['charges', 'charges_cat'])

# 'Unnamed: 0' appears to be a leftover row index that was saved into the CSV
# as a regular column (its values mirror the index in the preview). It carries
# no information about the policyholder, and as a raw row counter it would
# distort the distances K-means is computed on, so it must not be a feature.
# `errors='ignore'` keeps this cell working if the column is already absent.
insurance_features = insurance_features.drop(columns=['Unnamed: 0'], errors='ignore')

insurance_features.head()

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,0,0,0,1
1,18,1,33.77,1,0,0,0,1,0
2,28,1,33.0,3,0,0,0,1,0
3,33,1,22.705,0,0,0,1,0,0
4,32,1,28.88,0,0,0,1,0,0


In [60]:
# Reference labels used to score the clustering: the binned charge tier.
insurance_labels = df.loc[:, 'charges_cat']

# Spot-check five randomly chosen labels.
insurance_labels.sample(n=5)

165     1
1253    1
936     2
917     2
382     1
Name: charges_cat, dtype: int64

In [61]:
def build_model(clustering_model, data, labels):
    """Fit a clustering model and print a one-row table of quality scores.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``; must return a fitted object
        that exposes the cluster assignments as ``labels_``.
    data : array-like
        Feature matrix handed to ``clustering_model`` and to the silhouette
        score.
    labels : array-like
        Reference labels the cluster assignments are compared against.

    Prints a header, a separator and the scores (three decimals, tab
    separated) in this order: homogeneity, completeness, V-measure, adjusted
    Rand index, adjusted mutual information, silhouette. Returns ``None``.
    """
    fitted = clustering_model(data)

    print('homo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print(50 * '-')

    assigned = fitted.labels_

    # Scores that compare the assignments with the reference labels,
    # listed in the same order as the printed header.
    label_based_scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    scores = [scorer(labels, assigned) for scorer in label_based_scorers]

    # The silhouette score is computed from the data itself, not from `labels`.
    scores.append(metrics.silhouette_score(data, assigned))

    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % tuple(scores))

In [62]:
def k_means(data, n_clusters=5, max_iter=1000, n_init=10, random_state=42):
    """Fit a K-means model on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 5
        Number of clusters to form.
    max_iter : int, default 1000
        Maximum number of iterations per initialisation.
    n_init : int or 'auto', default 10
        Number of centroid initialisations to try. Set explicitly because
        scikit-learn's own default differs between releases, which would
        silently change results while warnings are suppressed.
    random_state : int or None, default 42
        Seed for centroid initialisation. A fixed seed makes the clustering,
        and every score computed from it, reproducible across runs; pass
        ``None`` to get a fresh random initialisation each time.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted model (cluster assignments are available as ``labels_``).
    """
    return KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        n_init=n_init,
        random_state=random_state,
    ).fit(data)

In [63]:
# Cluster the feature matrix with K-means and score the result against the
# binned charge tiers.
build_model(
    clustering_model=k_means,
    data=insurance_features,
    labels=insurance_labels,
)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.332	0.198	0.248	0.164	0.246	0.319
