In [2]:
# Open the Colab upload dialog and keep the call's result in `uploaded`.
# The recorded output below lists the .dat/.hea files saved by this run.
from google.colab import files
uploaded = files.upload()

Saving 01000_lr.dat to 01000_lr.dat
Saving 01000_lr.hea to 01000_lr.hea
Saving 01001_lr.dat to 01001_lr.dat


In [5]:
# Open the Colab upload dialog again; the recorded output below shows
# scp_statements.csv being saved. Note that `uploaded` is overwritten here.
from google.colab import files
uploaded = files.upload()

Saving scp_statements.csv to scp_statements.csv


In [11]:
# Open the Colab upload dialog for the PTB-XL metadata table.
# NOTE(review): the recorded output shows this upload was stored as
# "ptbxl_database (1).csv" -- presumably because "ptbxl_database.csv" already
# existed in the runtime. The analysis cell reads "ptbxl_database.csv", so
# confirm that file holds the intended data.
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database (1).csv


In [26]:
#A5
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  # Import SimpleImputer
import pandas as pd
import numpy as np

# --------------------------
# Function to load and prepare ECG data
# --------------------------
def load_and_prepare_data(metadata_file, scp_file, drop_target=True):
    """
    Load ECG metadata and return a standardized numeric feature matrix.

    Only the numeric columns of the metadata table are used. Missing values
    are replaced by the column mean, then every column is standardized with
    StandardScaler.

    Args:
        metadata_file (str): Path to PTB-XL metadata CSV.
        scp_file (str): Path to SCP statements CSV. Accepted for interface
            compatibility only; this function does not read it.
        drop_target (bool): If True, removes the target column from the
            features when it appears among the numeric columns.

    Returns:
        np.ndarray: Scaled feature matrix, one row per metadata record and
        one column per retained numeric feature.

    Raises:
        ValueError: If no numeric column with at least one observed value
            remains to build features from.
    """
    ecg_df = pd.read_csv(metadata_file)

    # Mean imputation and standardization are only defined for numeric data.
    features = ecg_df.select_dtypes(include=[np.number])

    if drop_target:
        # PTB-XL stores the labels in 'scp_codes'; 'scp_code' is kept for
        # backward compatibility. The label column is normally non-numeric
        # and therefore already excluded, hence errors="ignore".
        features = features.drop(columns=["scp_code", "scp_codes"], errors="ignore")

    # SimpleImputer(strategy="mean") discards columns without any observed
    # value by default, which would silently change the column count. Remove
    # such columns explicitly so the feature set is well defined.
    features = features.dropna(axis=1, how="all")

    if features.shape[1] == 0:
        raise ValueError(
            f"No numeric feature column with observed values found in {metadata_file!r}"
        )

    # Imputation is per column, so dropping the target beforehand does not
    # change the values computed for the remaining features.
    imputed = SimpleImputer(strategy="mean").fit_transform(features)

    return StandardScaler().fit_transform(imputed)

# --------------------------
# Function to perform K-Means clustering
# --------------------------
def perform_kmeans(X, n_clusters=2, random_state=42):
    """
    Fit a K-Means model on the feature matrix and expose its main results.

    Args:
        X (np.ndarray): Feature matrix to cluster.
        n_clusters (int): Number of clusters to form.
        random_state (int): Seed passed to KMeans for reproducibility.

    Returns:
        tuple: (fitted KMeans model, per-sample cluster labels,
        cluster centers).
    """
    # n_init="auto" leaves the number of initializations to scikit-learn.
    model = KMeans(
        n_clusters=n_clusters,
        random_state=random_state,
        n_init="auto",
    ).fit(X)

    assignments, centroids = model.labels_, model.cluster_centers_
    return model, assignments, centroids

# --------------------------
# Function to calculate clustering metrics
# --------------------------
def calculate_clustering_metrics(X, labels):
    """
    Score a clustering with three internal validity indices.

    Args:
        X (np.ndarray): Feature matrix the clustering was computed on.
        labels (np.ndarray): Cluster label assigned to each row of X.

    Returns:
        dict: Metric display name mapped to its score, in the order
        Silhouette, Calinski-Harabasz, Davies-Bouldin.
    """
    # (display name, scoring function) pairs, evaluated in this order.
    scorers = (
        ("Silhouette Score", silhouette_score),
        ("Calinski-Harabasz Score", calinski_harabasz_score),
        ("Davies-Bouldin Score", davies_bouldin_score),
    )
    return {title: scorer(X, labels) for title, scorer in scorers}

# --------------------------
# Main execution
# --------------------------
if __name__ == "__main__":
    # Build the scaled feature matrix from the metadata table.
    X_train = load_and_prepare_data(
        "ptbxl_database.csv",
        "scp_statements.csv",
        drop_target=True,
    )

    # Partition the samples into two clusters with a fixed seed.
    kmeans_model, cluster_labels, cluster_centers = perform_kmeans(
        X_train,
        n_clusters=2,
        random_state=42,
    )

    # Evaluate the partition with the clustering metrics.
    clustering_results = calculate_clustering_metrics(X_train, cluster_labels)

    # Report centers and metrics. The per-sample label array is deliberately
    # not printed because it is large.
    print("=== K-Means Clustering Results ===")
    print("\nCluster Centers (scaled):")
    print(cluster_centers)
    print("\n=== Clustering Metrics ===")
    for metric_name, metric_value in clustering_results.items():
        print(f"{metric_name}: {metric_value:.4f}")

=== K-Means Clustering Results ===

Cluster Centers (scaled):
[[-0.42079756 -1.16364992  0.31825331  0.15284493 -0.03054928 -0.06387763
   1.18649009  0.31001563 -0.54054643 -0.01255779]
 [ 0.1900819   0.52564181 -0.1437608  -0.06904283  0.01379966  0.02885469
  -0.53595913 -0.14003969  0.24417464  0.00567258]]

=== Clustering Metrics ===
Silhouette Score: 0.2731
Calinski-Harabasz Score: 4033.7392
Davies-Bouldin Score: 2.0442
