In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the dataset
def load_data(file_path):
    """Read the CSV found at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Preprocess the data
def preprocess_data(df):
    """Return the standardized clustering features taken from ``df``.

    The seven columns listed below are selected from ``df`` and passed
    through a freshly fitted ``StandardScaler``; the scaled array is returned.
    """
    # Columns used as clustering inputs
    feature_columns = [
        'Age',
        'Height (cm)',
        'Weight (kg)',
        'Systolic BP',
        'Diastolic BP',
        'Glucose Level',
        'Cholesterol Level',
    ]

    # Fit the scaler and transform in a single step
    return StandardScaler().fit_transform(df[feature_columns])

# Perform K-means clustering
def perform_kmeans(X, n_clusters=3):
    """Fit a K-means model with ``n_clusters`` clusters on ``X`` and return it.

    The random seed is fixed at 42 so repeated runs use the same seed.
    """
    # ``fit`` returns the estimator itself, so the call can be chained.
    return KMeans(n_clusters=n_clusters, random_state=42).fit(X)

# Assign labels to clusters
def assign_cluster_labels(kmeans):
    """Map each cluster id to a label by ranking the cluster centers.

    Clusters are ranked by the mean of their center coordinates. The cluster
    with the lowest mean is labeled 'Weak', the next 'Normal', and the one
    with the highest mean 'Healthy'. This is a simplistic heuristic; adjust
    it based on domain knowledge.

    Args:
        kmeans: Fitted model exposing ``cluster_centers_``.

    Returns:
        dict mapping cluster id (int) to its label, keyed in ascending
        cluster-id order.

    Raises:
        ValueError: If the number of cluster centers differs from the number
            of available labels, since the ranking would be ambiguous.
    """
    centers = np.asarray(kmeans.cluster_centers_)
    labels = ['Weak', 'Normal', 'Healthy']

    if len(centers) != len(labels):
        raise ValueError(
            f"Expected {len(labels)} clusters to match labels {labels}, "
            f"got {len(centers)}"
        )

    # ranked[r] is the id of the cluster whose center mean has rank r.
    # A stable sort keeps tie-breaking deterministic.
    ranked = np.argsort(centers.mean(axis=1), kind='stable')

    # Assign the r-th label to the cluster of rank r (not to cluster id r).
    by_cluster = {int(cluster_id): labels[rank] for rank, cluster_id in enumerate(ranked)}
    return dict(sorted(by_cluster.items()))

# Classify patients
def classify_patients(df, kmeans, cluster_labels):
    """Attach cluster ids and their labels to ``df`` and return it.

    ``df`` is modified in place: a 'Cluster' column is filled from
    ``kmeans.labels_`` and a 'Classification' column is derived from it
    through the ``cluster_labels`` mapping. The same DataFrame is returned.
    """
    cluster_ids = kmeans.labels_
    df['Cluster'] = cluster_ids

    # Translate each cluster id into its label
    classification = df['Cluster'].map(cluster_labels)
    df['Classification'] = classification
    return df

# Visualize results
def visualize_clusters(X, kmeans, df, cluster_labels):
    """Save a 2-D PCA scatter plot of the clusters to 'patient_clusters.png'.

    Args:
        X: Feature matrix to project onto two principal components.
        kmeans: Fitted model exposing ``labels_`` and ``cluster_centers_``.
        df: DataFrame with a 'Classification' column. Its rows are matched
            to the rows of ``X`` by position, so both must be in the same
            order.
        cluster_labels: Mapping of cluster id to label.
    """
    # Use PCA to reduce dimensionality for visualization
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    n_clusters = len(kmeans.cluster_centers_)

    fig = plt.figure(figsize=(12, 8))
    try:
        # Create a scatter plot colored by cluster id
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans.labels_, cmap='viridis', alpha=0.7)

        # Plot cluster centers in the same projected space
        centers_pca = pca.transform(kmeans.cluster_centers_)
        plt.scatter(centers_pca[:, 0], centers_pca[:, 1], marker='x', s=200, linewidths=3, color='r', label='Cluster Centers')

        # Add labels for each point
        for i, txt in enumerate(df['Classification']):
            plt.annotate(txt, (X_pca[i, 0], X_pca[i, 1]), xytext=(5, 2), textcoords='offset points', fontsize=8, alpha=0.5)

        # Customize the plot; the title reports the actual number of clusters
        plt.title(f'Patient Clusters using K-means (n={n_clusters})', fontsize=16)
        plt.xlabel('First Principal Component', fontsize=12)
        plt.ylabel('Second Principal Component', fontsize=12)

        # Add a color bar
        cbar = plt.colorbar(scatter)
        cbar.set_label('Cluster Label', fontsize=12)

        # Add a legend. Each swatch takes its color from the scatter's own
        # colormap and normalization, so it matches the points of that cluster.
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w', label=f'Cluster {cluster_id}: {label}',
                       markerfacecolor=scatter.to_rgba(cluster_id), markersize=10)
            for cluster_id, label in cluster_labels.items()
        ]
        legend_elements.append(plt.Line2D([0], [0], marker='x', color='r', label='Cluster Centers', markersize=10, linewidth=0))
        plt.legend(handles=legend_elements, loc='upper right', fontsize=10)

        # Add text box with additional information
        total = len(df)
        counts = df['Classification'].value_counts()  # computed once, reused per label
        info_text = f"Total Patients: {total}\n"
        for label in cluster_labels.values():
            count = counts.get(label, 0)  # a label with no rows counts as 0
            percentage = count / total * 100
            info_text += f"{label}: {count} ({percentage:.1f}%)\n"
        plt.text(0.05, 0.95, info_text, transform=plt.gca().transAxes, fontsize=10, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

        plt.tight_layout()
        plt.savefig('patient_clusters.png', dpi=300)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

# Updated main function
def main():
    """Run the pipeline: load, scale, cluster, label, report and plot."""
    patients = load_data('patients2.csv')

    # Scale the features, fit the model and derive a label per cluster
    scaled_features = preprocess_data(patients)
    model = perform_kmeans(scaled_features)
    label_map = assign_cluster_labels(model)

    # Attach the cluster id and label to every patient row
    classified_df = classify_patients(patients, model, label_map)

    # Report the first ten classifications and the per-label counts
    preview = classified_df[['Patient ID', 'Classification']].head(10)
    print(preview)
    print("\nCluster Distribution:")
    distribution = classified_df['Classification'].value_counts()
    print(distribution)

    # Write the cluster plot to disk
    visualize_clusters(scaled_features, model, classified_df, label_map)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

   Patient ID Classification
0           1         Normal
1           2           Weak
2           3         Normal
3           4           Weak
4           5         Normal
5           6        Healthy
6           7           Weak
7           8         Normal
8           9        Healthy
9          10           Weak

Cluster Distribution:
Classification
Normal     38
Weak       38
Healthy    24
Name: count, dtype: int64
