In [None]:
# Health Insurance Clustering using K-Means and Hierarchical Clustering

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage


In [None]:
# Read the insurance records from the CSV file (path is relative to the
# notebook's working directory) into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='insurance.csv')

In [None]:
# Convert the categorical columns to integer codes with label encoding.
#
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# integer codes can be mapped back to the original labels when interpreting
# clusters, e.g. `label_encoders['smoker'].inverse_transform([0, 1])`.
# Refitting a single shared encoder would discard the mapping of every column
# except the last one fitted.
#
# NOTE(review): label codes are plain integers, so a nominal column
# (presumably 'region') acquires an ordering that distance-based clustering
# will treat as meaningful -- consider one-hot encoding if that is unintended.
categorical_columns = ['sex', 'smoker', 'region']
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Keep `le` bound to the encoder fitted on 'region', matching the state this
# cell has always left behind.
le = label_encoders['region']

In [None]:
# Columns fed to the clustering algorithms; X is the matching slice of df.
features = [
    'age',
    'sex',
    'bmi',
    'children',
    'smoker',
    'region',
    'charges',
]
X = df.loc[:, features]

In [None]:
# Standardize the selected features so they share a common scale.
# The fitted scaler stays available as `scaler`; the transformed matrix is
# `X_scaled`.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Sweep candidate cluster counts and record two diagnostics for each k:
#   inertia    -- K-Means sum of squared distances (used for the elbow plot)
#   silhouette -- silhouette score of the resulting labels (cluster validity)
K = range(2, 11)
inertia = []
silhouette = []

for k in K:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' (warning about it in the releases before the switch), so leaving
    # it unset makes the curves depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    inertia.append(kmeans.inertia_)
    silhouette.append(silhouette_score(X_scaled, labels))


In [None]:
# Draw the elbow curve and the silhouette curve side by side to help pick K.
fig, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: inertia for each candidate cluster count.
elbow_ax.plot(K, inertia, 'bo-')
elbow_ax.set(
    xlabel='Number of clusters',
    ylabel='Inertia',
    title='Elbow Method',
)

# Right panel: silhouette score for each candidate cluster count.
silhouette_ax.plot(K, silhouette, 'ro-')
silhouette_ax.set(
    xlabel='Number of clusters',
    ylabel='Silhouette Score',
    title='Silhouette Score',
)

fig.tight_layout()
plt.show()

In [None]:
# Number of clusters used by both algorithms below.
# NOTE(review): 3 is assumed from the elbow/silhouette plots rather than
# computed -- confirm it against the actual curves.
optimal_k = 3

# K-Means on the standardized features; labels are stored on df.
# n_init is pinned explicitly because scikit-learn changed its default from
# 10 to 'auto', which would otherwise make the labels version-dependent.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
df['kmeans_cluster'] = kmeans.fit_predict(X_scaled)

# Hierarchical agglomerative clustering with the same cluster count (all other
# estimator settings left at their defaults); labels are stored on df.
agglo = AgglomerativeClustering(n_clusters=optimal_k)
df['agglo_cluster'] = agglo.fit_predict(X_scaled)

# Project the standardized features onto two principal components so the
# clusters can be drawn in 2-D.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# No figure is opened here on purpose: with Jupyter's inline backend a figure
# is rendered and closed when its cell finishes, so one created in this cell
# would be displayed empty and could not be drawn on by later cells.


In [None]:
# K-Means clusters drawn in the 2-D PCA projection.
# The figure is created inside this cell: the inline backend renders and
# closes figures at the end of each cell, so a figure opened in an earlier
# cell cannot be drawn on from here.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=df['kmeans_cluster'],
    palette='Set1',
    ax=ax,
)
ax.set_title('K-Means Clustering')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')

fig.tight_layout()
plt.show()


In [None]:
# Hierarchical (agglomerative) clusters drawn in the 2-D PCA projection.
# The figure is created inside this cell: the inline backend renders and
# closes figures at the end of each cell, so a figure opened in an earlier
# cell cannot be drawn on from here.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=df['agglo_cluster'],
    palette='Set2',
    ax=ax,
)
ax.set_title('Hierarchical Clustering')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')

fig.tight_layout()
plt.show()


In [None]:
# Interpret the clusters: per-cluster mean of every clustering feature,
# computed separately for the K-Means labels and the hierarchical labels.
kmeans_summary = df.groupby('kmeans_cluster')[features].mean()
agglo_summary = df.groupby('agglo_cluster')[features].mean()

# Print each table under its heading, K-Means first.
for heading, summary in (
    ("\nK-Means Cluster Summary:", kmeans_summary),
    ("\nHierarchical Cluster Summary:", agglo_summary),
):
    print(heading)
    print(summary)