# Diabetes Risk Assessment - Patient Segmentation Analysis

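Comprehensive clustering analysis for diabetes patient risk stratification, with advanced visualizations and clinical insights. Note that the patient data used here is synthetic: it is generated inside the notebook from random distributions with a fixed seed, so the results illustrate the method rather than describe real patients.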

# Libraries and Configuration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, adjusted_rand_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage
import warnings
warnings.filterwarnings('ignore')

# Modern plotting style

In [2]:
# Apply the dark stylesheet first; the rcParams overrides below are layered on top of it.
plt.style.use('dark_background')

# Hex colour palette (seven entries); presumably reused by plots later in the notebook.
colors = [
    '#FF6B6B',
    '#4ECDC4',
    '#45B7D1',
    '#96CEB4',
    '#FFEAA7',
    '#DDA0DD',
    '#98D8C8',
]

# Notebook-wide figure defaults, set in a single update call.
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 11,
    'axes.grid': True,
    'grid.alpha': 0.3,
})

# Dataset Generation and Loading

In [3]:
# Seed NumPy's global legacy generator so the synthetic cohort is reproducible.
np.random.seed(2024)
n_patients = 850

# Baseline features, each drawn independently. The entries are evaluated top to
# bottom, so their order fixes the random stream and must not be rearranged.
patient_data = {
    'glucose_level': np.random.gamma(shape=2, scale=50, size=n_patients) + 80,
    'bmi': np.random.normal(loc=27.5, scale=6.2, size=n_patients),
    'insulin_resistance': np.random.exponential(scale=2.8, size=n_patients) + 1,
    'hba1c': np.random.normal(loc=6.2, scale=1.4, size=n_patients),
    'triglycerides': np.random.gamma(shape=3, scale=40, size=n_patients) + 80,
    'cholesterol_ratio': np.random.normal(loc=4.2, scale=1.1, size=n_patients),
    'systolic_bp': np.random.normal(loc=135, scale=18, size=n_patients),
    'family_history': np.random.binomial(n=1, p=0.35, size=n_patients),
    'physical_activity': np.random.gamma(shape=2, scale=1.5, size=n_patients),
    'stress_level': np.random.uniform(low=1, high=10, size=n_patients),
}

# Couple the features: every patient with BMI above 30 gets scaled-up glucose and
# insulin resistance plus a positive blood-pressure shift. Patients are visited in
# ascending index order and the three scalar draws per patient keep a fixed
# sequence (glucose factor, insulin factor, BP shift) so the seeded stream is stable.
for patient_idx in np.flatnonzero(patient_data['bmi'] > 30):
    glucose_factor = np.random.uniform(1.1, 1.4)
    insulin_factor = np.random.uniform(1.2, 1.6)
    bp_shift = np.random.normal(15, 5)
    patient_data['glucose_level'][patient_idx] *= glucose_factor
    patient_data['insulin_resistance'][patient_idx] *= insulin_factor
    patient_data['systolic_bp'][patient_idx] += bp_shift

diabetes_df = pd.DataFrame(patient_data)

# Weighted contributions to the composite risk score. Some features are centred
# on a reference value before weighting; physical activity enters inverted
# (10 - activity), so more activity lowers the score.
risk_terms = [
    diabetes_df['glucose_level'].sub(90).mul(0.02),
    diabetes_df['bmi'].sub(25).mul(0.05),
    diabetes_df['insulin_resistance'].mul(0.15),
    diabetes_df['hba1c'].sub(5.7).mul(0.3),
    diabetes_df['triglycerides'].mul(0.001),
    diabetes_df['cholesterol_ratio'].sub(3.5).mul(0.1),
    diabetes_df['systolic_bp'].sub(120).mul(0.01),
    diabetes_df['family_history'].mul(0.8),
    diabetes_df['physical_activity'].rsub(10).mul(0.05),
    diabetes_df['stress_level'].mul(0.08),
]

# Accumulate strictly left to right so floating-point rounding matches a plain
# `a + b + c + ...` chain over the same terms.
raw_risk = risk_terms[0]
for term in risk_terms[1:]:
    raw_risk = raw_risk + term

# Bound the score to the closed interval [0, 10].
diabetes_df['risk_score'] = raw_risk.clip(lower=0, upper=10)

# Binary label: 1 when the clipped score is strictly above 5.5, else 0.
diabetes_df['diabetes_status'] = diabetes_df['risk_score'].gt(5.5).astype(int)

print(f"Diabetes Dataset Generated Successfully")
print(f"Dataset Dimensions: {diabetes_df.shape}")
print(f"Features: {list(diabetes_df.columns)}")
print(f"Diabetes Prevalence: {diabetes_df['diabetes_status'].mean():.1%}")

Diabetes Dataset Generated Successfully
Dataset Dimensions: (850, 12)
Features: ['glucose_level', 'bmi', 'insulin_resistance', 'hba1c', 'triglycerides', 'cholesterol_ratio', 'systolic_bp', 'family_history', 'physical_activity', 'stress_level', 'risk_score', 'diabetes_status']
Diabetes Prevalence: 26.5%
