1. Load and inspect the data, plot a few examples from different arrhythmia types

In [None]:
# 1. Load and inspect the data, plot a few examples from different arrhythmia types

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import glob
import warnings
# Suppress every warning category for the rest of the session; this also hides
# library warnings (e.g. undefined-metric or deprecation notices) raised below.
warnings.filterwarnings('ignore')

In [None]:

# 1. Load and inspect the heartbeat holdout-validation data
print("1.  Loading and inspecting heartbeat holdout validation data")
# glob returns an empty list for a missing file, so both files can be checked together.
train_files = glob.glob('train_beats.csv')
test_files = glob.glob('test_beats.csv')
if not train_files or not test_files:
    raise FileNotFoundError("Heartbeat training or test data files not found. Please ensure data_split_resample.ipynb has been run correctly and generated the files.")

# Load data (the CSV files are read without a header row)
train_data = pd.read_csv(train_files[0], header=None)
test_data = pd.read_csv(test_files[0], header=None)
print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Check class distribution. The counts are taken from the second-to-last column,
# i.e. the same column that is used as the label below.
print("\nTraining set class distribution:")
print(train_data.iloc[:, -2].value_counts().sort_index())
print("\nTest set class distribution:")
print(test_data.iloc[:, -2].value_counts().sort_index())

# Prepare data: every column except the last two is a feature; the
# second-to-last column is the class label. The last column is not used here
# (presumably the patient ID -- TODO confirm against the file that writes the CSVs).
X_train = train_data.iloc[:, :-2].values
y_train = train_data.iloc[:, -2].values
X_test = test_data.iloc[:, :-2].values
y_test = test_data.iloc[:, -2].values
print(f"Training features shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")

# Data standardization: the scaler is fitted on the training split only and
# then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nData loading and preprocessing completed.")

print("\n2. Plotting examples of different arrhythmia types...")
# Class ID (1-8) -> display name used as the subplot title.
class_names = dict(enumerate(
    [
        'N (Normal)',
        'L (LBBB)',
        'R (RBBB)',
        'V (Premature Ventricular Contraction)',
        'A (Atrial premature beat)',
        'F (Fusion ventricular normal beat)',
        'f (Fusion of paced and normal beat)',
        '/ (Paced beat)',
    ],
    start=1,
))
fig, axes = plt.subplots(4, 2, figsize=(15, 12))
axes = axes.ravel()
# One panel per class: draw the first training beat carrying that label.
# Panels for classes absent from the training labels are left empty.
for class_id, panel in zip(range(1, 9), axes):
    matching_rows = np.where(y_train == class_id)[0]
    if matching_rows.size == 0:
        continue
    idx = matching_rows[0]
    panel.plot(X_train[idx])
    panel.set(title=class_names[class_id], xlabel="Time", ylabel="Amplitude")

plt.tight_layout()
plt.show()

2. Classification of ECG beats based on the holdout splitting method

In [None]:
#  2.Heartbeat Holdout Validation Classification Model

# Fit an RBF-kernel SVM on the standardized training beats (fit() returns the
# estimator itself), then predict the standardized test beats.
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42).fit(X_train_scaled, y_train)
y_pred = svm_model.predict(X_test_scaled)
# Evaluation
def evaluate_model(y_true, y_pred):
    """Print and plot hold-out classification metrics.

    Computes accuracy plus weighted-average precision, recall and F1-score,
    prints them together with the confusion matrix, and shows the matrix as a
    heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, cm)`` where ``cm`` is the
        confusion matrix (rows: true labels, columns: predicted labels).
    """
    # Use the union of true and predicted labels for both the matrix and the
    # axis ticks. Taking ticks from y_true alone mislabels (or breaks) the
    # heatmap whenever the model predicts a class absent from y_true.
    labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))

    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 makes the handling of never-predicted classes explicit.
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    # Visualize the confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.title('Confusion Matrix (Heartbeat Holdout Validation)')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
    return accuracy, precision, recall, f1, cm

# Report the hold-out metrics of the SVM and keep each of them under its own name.
print("\nHeartbeat Holdout Validation SVM Model Evaluation Results")
holdout_scores = evaluate_model(y_test, y_pred)
acc, prec, rec, f1, cm = holdout_scores


In [None]:
# 3. Model Interpretability Analysis
print("\n3. Using permutation feature importance to interpret model functionality")
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# --- SVM: same configuration as the hold-out model, refitted and re-evaluated ---
print("\nSVM Classifier")
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42).fit(X_train_scaled, y_train)
y_pred_svm = svm_model.predict(X_test_scaled)
print("\nHeartbeat Holdout Validation SVM Model Evaluation Results")
svm_scores = evaluate_model(y_test, y_pred_svm)
acc_svm, prec_svm, rec_svm, f1_svm, cm_svm = svm_scores

# --- Random Forest: trained and evaluated on the same standardized split ---
print("\nRandom Forest Classifier")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)
rf_scores = evaluate_model(y_test, y_pred_rf)
acc_rf, prec_rf, rec_rf, f1_rf, cm_rf = rf_scores

# 4. Apply the same permutation feature importance analysis to different classifiers
print("\n4.Apply the same permutation feature importance analysis to different classifiers")

def calculate_permutation_importance_cv(model, X, y, cv_folds=5, n_repeats=10, random_state=42, model_type="SVM"):
    """Estimate permutation feature importance with stratified cross-validation.

    For every fold, a fresh estimator is fitted on the training part and
    ``permutation_importance`` (scoring: accuracy) is computed on the
    validation part. The per-fold mean importances are then aggregated.

    Parameters
    ----------
    model : estimator or None
        Template estimator. An unfitted copy with the same hyper-parameters is
        created for every fold. When ``None``, the estimator is chosen by
        ``model_type`` instead.
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class labels.
    cv_folds : int, default=5
        Number of stratified folds.
    n_repeats : int, default=10
        Number of permutations per feature.
    random_state : int, default=42
        Seed for the fold shuffling and for the permutations.
    model_type : str, default="SVM"
        Only used when ``model`` is ``None``: ``"SVM"`` builds an RBF-kernel
        SVC, any other value builds a 100-tree random forest.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mean_importance, std_importance)``: mean and standard deviation,
        across folds, of the per-fold mean importance of each feature.
    """
    # Local import keeps this function usable without touching the import cell.
    from sklearn.base import clone

    # Positional fancy indexing below requires arrays (a DataFrame would be
    # indexed by column label instead).
    X = np.asarray(X)
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    importances = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"   Processing fold {fold+1}/{cv_folds} ")

        # Previously the `model` argument was ignored and a hard-coded estimator
        # was rebuilt from `model_type`; honour the estimator that was passed in.
        if model is not None:
            fold_model = clone(model)
        elif model_type == "SVM":
            fold_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
        else:  # RandomForest
            fold_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

        fold_model.fit(X[train_idx], y[train_idx])
        perm_imp = permutation_importance(
            fold_model, X[val_idx], y[val_idx],
            n_repeats=n_repeats,
            random_state=random_state,
            scoring='accuracy'
        )
        importances.append(perm_imp.importances_mean)

    importances = np.array(importances)
    mean_importance = np.mean(importances, axis=0)
    std_importance = np.std(importances, axis=0)
    return mean_importance, std_importance

# visualize
def plot_feature_importance(mean_importance, std_importance, title, top_n=50):
    """Plot and print the most important features.

    Draws a bar chart (with error bars) of the ``top_n`` features ranked by
    mean importance and prints the ten highest-ranked ones.

    Parameters
    ----------
    mean_importance : array-like of shape (n_features,)
        Mean importance per feature.
    std_importance : array-like of shape (n_features,)
        Standard deviation of the importance per feature.
    title : str
        Text inserted into the figure title.
    top_n : int, default=50
        Number of features to show; capped at the number of features available.
    """
    mean_importance = np.asarray(mean_importance)
    std_importance = np.asarray(std_importance)
    # Cap top_n: asking for more bars than there are features would otherwise
    # make the x positions and the bar heights differ in length.
    top_n = min(top_n, mean_importance.shape[0])

    sorted_idx = np.argsort(mean_importance)[::-1]  # descending importance
    top_indices = sorted_idx[:top_n]

    plt.figure(figsize=(12, 8))
    plt.bar(range(top_n), mean_importance[top_indices],
            yerr=std_importance[top_indices], align='center')
    plt.xlabel('Feature Index (Sorted by Importance)')
    plt.ylabel('Permutation Importance (Drop in Accuracy)')
    plt.title(f'Top {top_n} Feature Importances ({title})')
    plt.xticks(range(top_n), [f'F{i}' for i in top_indices], rotation=90)
    plt.tight_layout()
    plt.show()

    print("\nTop 10 most important features (based on mean importance):")
    for i in range(min(10, top_n)):
        idx = top_indices[i]
        print(f" Feature {idx}: Mean importance = {mean_importance[idx]:.5f},  Std= {std_importance[idx]:.5f}")
        
print("\nSVM Classifier Feature Importance Analysis")
# Permutation importance is expensive, so it is computed on at most 5000
# training beats. The subset is drawn from a seeded generator so that the
# selected beats (and therefore the reported importances) are reproducible.
n_samples_for_perm_imp = min(5000, X_train_scaled.shape[0])
subset_rng = np.random.default_rng(42)
indices = subset_rng.choice(X_train_scaled.shape[0], n_samples_for_perm_imp, replace=False)
X_train_subset_svm = X_train_scaled[indices]
y_train_subset_svm = y_train[indices]

mean_imp_svm, std_imp_svm = calculate_permutation_importance_cv(
    svm_model, X_train_subset_svm, y_train_subset_svm, cv_folds=5, model_type="SVM"
)

plot_feature_importance(mean_imp_svm, std_imp_svm, "Heartbeat Holdout SVM", top_n=50)

print("\nRandom Forest Classifier Feature Importance Analysis")
# Both classifiers are analysed on exactly the same subset of beats.
X_train_subset_rf = X_train_subset_svm
y_train_subset_rf = y_train_subset_svm

mean_imp_rf, std_imp_rf = calculate_permutation_importance_cv(
    rf_model, X_train_subset_rf, y_train_subset_rf, cv_folds=5, model_type="RF"
)

plot_feature_importance(mean_imp_rf, std_imp_rf, "Heartbeat Holdout Random Forest", top_n=50)

print("\nTask 5: Applying the same interpretability techniques to different types of classifiers completed")
print("- SVM and Random Forest classification models have been trained and evaluated")
print("- Permutation feature importance for both classifiers calculated using 5-fold stratified cross-validation.")
print("- Feature importance results have been visualized and compared.")

print("\nClassifier Performance Comparison")
print(f"SVM - Accuracy: {acc_svm:.4f}, F1-score: {f1_svm:.4f}")
print(f"Random Forest - Accuracy: {acc_rf:.4f}, F1-score: {f1_rf:.4f}")

print("\nFeature Importance Comparison")
# Indices of the 20 features with the highest mean importance, per classifier.
top_features_svm = np.argsort(mean_imp_svm)[::-1][:20]
top_features_rf = np.argsort(mean_imp_rf)[::-1][:20]

print("SVM top 20 important feature indices:", top_features_svm)
print("Random Forest top 20 important feature indices:", top_features_rf)

overlap = len(set(top_features_svm) & set(top_features_rf))
# The separator was missing, so the count used to be glued onto "methods".
print(f"Number of overlapping features in top 20 between both methods: {overlap}/20")

# Random Forest vs. SVM for ECG Classification

## 1. Performance Overview
Random Forest demonstrated superior performance over SVM across key evaluation metrics, with notable improvements in Accuracy, Recall, and F1-score, indicating its stronger capability in identifying abnormal heartbeats.

| Metric                | SVM     | Random Forest |
|-----------------------|---------|---------------|
| **Accuracy**          | 0.9494  | 0.9667        |
| **Precision (Weighted)** | 0.9722  | 0.9741        |
| **Recall (Weighted)**    | 0.9494  | 0.9667        |
| **F1-Score (Weighted)**  | 0.9577  | 0.9690        |

## 2. Detailed Category-wise Misclassification Analysis
Category 1 (Normal): In SVM, a large number were misclassified as Category 4 (Premature Ventricular Contraction) and Category 5 (Atrial Premature Beat), whereas Random Forest correctly classified almost all of them.

Category 4 (Premature Ventricular Contraction): In SVM, approximately 323 instances were misclassified as Normal, while Random Forest misclassified only 19, showing significant improvement.

Category 5 (Atrial Premature Beat): SVM misclassified 52 instances as Normal, while Random Forest misclassified 56. Random Forest is therefore slightly worse on this class, even though its overall performance is better.

Category 6 (Fusion ventricular normal beat): SVM misclassified 179 instances as Normal, while Random Forest misclassified 169, indicating a slight improvement.

Conclusion: Random Forest demonstrates greater advantages in distinguishing between "Normal" and "Abnormal" heartbeats, particularly excelling in handling ambiguous premature beat samples.

## 3. Feature Importance Analysis

1. Top 50 Feature Importances (Heartbeat Holdout SVM) (1lqpweqj.png)

Top 10 Important Features: F133, F134, F138, F139, F89, F90, F137, F96, F155, F154  

Distribution Characteristics:  
(1) Feature importance is relatively uniform, with the maximum value being approximately 0.003.  
(2) The standard deviation is large (e.g., F133: ±0.00152), indicating poor stability.  
(3) The features are mainly concentrated in the F89-F155 interval, which may correspond to the QRS complex or T-wave region.

2. Top 50 Feature Importances (Heartbeat Holdout Random Forest) (rtzp5c9p.png)  
Top 10 Important Features: F145, F144, F155, F139, F117, F153, F128, F154, F90, F135  

Distribution Characteristics:  
(1) Feature importance is more concentrated, with F145 exceeding 0.006, significantly higher than other features.  
(2) The distribution exhibits a "long-tail" pattern, indicating that a small number of features play a dominant role.  
(3) Compared to SVM, higher-index features such as F145 and F144 are more critical.

## Conclusion:

The feature importance in SVM is more suitable for modeling linear relationships, making it effective for capturing localized signal variations.  
In contrast, the feature importance in Random Forest reflects non-linear patterns and complex feature interactions, demonstrating its advantage in handling multi-dimensional morphological characteristics.

Random Forest demonstrates superior accuracy and robustness in ECG heartbeat classification tasks, exhibiting distinct advantages particularly in identifying complex arrhythmias. Although SVM's feature importance offers higher interpretability, Random Forest delivers better overall performance, making it more suitable as the primary classification model. By integrating the interpretability insights from both approaches, we can provide clinicians with accurate and trustworthy diagnostic support, ultimately enhancing the reliability and actionable insights of automated ECG analysis.

3. Classification of ECG beats based on the leave-patients-out holdout validation 
protocol

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
import time



In [None]:
# The eight target class IDs: consecutive integers 1 through 8.
CLASS_IDS = list(range(1, 9))

# --- Required Evaluation Function ---

def evaluate_model(Y_true, Y_pred, all_classes):
    """Print overall and per-class metrics plus the confusion matrix.

    Accuracy, micro-averaged precision/recall/F1 and per-class
    precision/recall/F1/support are computed for the classes present in
    ``Y_true``; classes from ``all_classes`` that are absent from ``Y_true``
    are reported with zeros. The confusion matrix covers the union of
    ``all_classes`` and every label found in ``Y_true`` or ``Y_pred``.

    Parameters
    ----------
    Y_true : array-like
        Ground-truth labels (pandas Series, NumPy array or list).
    Y_pred : array-like
        Predicted labels, same length as ``Y_true``.
    all_classes : sequence of int
        The predefined class IDs. They are paired positionally with the eight
        display names defined inside this function.

    Returns
    -------
    None
        Results are printed only.
    """
    # Work on plain arrays so that Series, ndarray and list inputs all behave
    # the same (the previous `Y_true.unique()` call only existed on a Series).
    Y_true = np.asarray(Y_true)
    Y_pred = np.asarray(Y_pred)

    # Identify only the unique labels present in the true test data (Y_true)
    present_classes = np.unique(Y_true.astype(int))

    # 1. Overall Accuracy
    accuracy = accuracy_score(Y_true, Y_pred)

    # 2. Overall Metrics (Micro average)
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        Y_true, Y_pred, average='micro', zero_division=0, labels=present_classes)

    # 3. Individual Class Metrics (calculated only for present classes)
    metrics_per_class = precision_recall_fscore_support(
        Y_true, Y_pred, labels=present_classes, average=None, zero_division=0)

    # 4. Confusion Matrix over the union of predefined, true and predicted
    # labels, so no observed label is dropped from the matrix.
    all_relevant_labels = sorted(
        set(all_classes)
        | set(np.unique(Y_true).tolist())
        | set(np.unique(Y_pred).tolist())
    )
    cm_final = confusion_matrix(Y_true, Y_pred, labels=all_relevant_labels)

    # --- Reconstruct Output for all 8 Classes ---
    class_names = ['N (1)', 'L (2)', 'R (3)', 'V (4)', 'A (5)', 'F (6)', 'f (7)', '/ (8)']
    class_names_map = {class_id: class_name for class_id, class_name in zip(all_classes, class_names)}

    # label -> [precision, recall, f1, support] for the classes present in Y_true
    metrics_map = {label: [p, r, f, s] for label, p, r, f, s in zip(present_classes, *metrics_per_class)}

    # Build the complete per-class table, filling in zeros for missing classes
    final_metrics_data = []
    for class_id, class_name in zip(all_classes, class_names):
        if class_id in metrics_map:
            p, r, f, s = metrics_map[class_id]
        else:
            # Class was missing from Y_true, fill with zeros
            p, r, f, s = 0.0, 0.0, 0.0, 0

        final_metrics_data.append({
            'Class': class_name,
            'Precision': p,
            'Recall': r,
            'F1-Score': f,
            'Support': s
        })

    metrics_df = pd.DataFrame(final_metrics_data)

    # Label the confusion matrix with human-readable names where available,
    # otherwise with a generic "Class <label>" name.
    cm_full_df = pd.DataFrame(cm_final, index=all_relevant_labels, columns=all_relevant_labels)
    mapped_index = [class_names_map.get(label, f"Class {label}") for label in all_relevant_labels]
    mapped_columns = [class_names_map.get(label, f"Class {label}") for label in all_relevant_labels]

    cm_full_df.index = mapped_index
    cm_full_df.columns = mapped_columns

    print(f"--- Task 2: SVC Model Evaluation (Beat Holdout Protocol) ---")
    print(f"Overall Accuracy: {accuracy:.4f}")
    print(f"Overall Precision (Micro Avg): {precision_micro:.4f}")
    print(f"Overall Recall (Micro Avg):    {recall_micro:.4f}")
    print(f"Overall F1-Score (Micro Avg):  {f1_micro:.4f}")
    print("\n--- Detailed Metrics per Class ---")
    print(metrics_df.to_string(index=False))

    print("\n--- Confusion Matrix (True Label Rows vs Predicted Label Columns) ---")
    if len(all_relevant_labels) > len(all_classes):
        print("Note: Confusion matrix labels include additional classes found in the data, not just the predefined 8 CLASS_IDS.")
    print(cm_full_df)




In [None]:
# --- Data Loading and Preparation ---

TRAIN_FILE = './train_beats.csv'
TEST_FILE = './test_beats.csv'

print(f"Loading data from {TRAIN_FILE} and {TEST_FILE}...")

try:
    # Load the datasets (header=None as they are saved without headers)
    train_data = pd.read_csv(TRAIN_FILE, header=None)
    test_data = pd.read_csv(TEST_FILE, header=None)

except FileNotFoundError as e:
    print(f"\nERROR: File not found: {e.filename}")
    print("Please ensure your files are in the same directory as this script/notebook.")
    # Re-raise instead of calling exit(): exiting would terminate the
    # interpreter/kernel and discard all notebook state, whereas raising stops
    # only this cell and keeps the traceback visible.
    raise

# Separate features (X) and labels (Y). Features are all columns except the last two.
# NOTE(review): the label is read from the LAST column here, whereas the first
# data-loading cell of this notebook and the all_data.csv cell both read the
# label from the second-to-last column (iloc[:, -2] / data[:, -2]) and treat
# the last column as the patient ID. Confirm which column holds the class ID.
X_train = train_data.iloc[:, :-2]
Y_train = train_data.iloc[:, -1].astype(int) # Class ID

X_test = test_data.iloc[:, :-2]
Y_test = test_data.iloc[:, -1].astype(int)   # Class ID

# Non-fatal sanity check: surface labels outside the predefined class IDs,
# which would indicate that the wrong column was taken as the label.
unexpected_labels = sorted(
    (set(Y_train.unique().tolist()) | set(Y_test.unique().tolist())) - set(CLASS_IDS)
)
if unexpected_labels:
    print(f"WARNING: labels outside CLASS_IDS found: {unexpected_labels[:10]} "
          f"({len(unexpected_labels)} distinct). Check the label column.")

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")




In [None]:
# --- Model Training and Prediction ---

print("\n[STEP 1] Initializing and training the SVC model...")
start_time = time.time()
# Linear-kernel SVC; construction and fitting are both inside the timed span.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, Y_train)
training_time = time.time() - start_time
print(f"Training complete in {training_time:.2f} seconds.")


print("[STEP 2] Making predictions on the test data...")
start_time = time.time()
raw_predictions = svm_model.predict(X_test)
Y_pred = raw_predictions.astype(int)  # integer labels, matching Y_test
prediction_time = time.time() - start_time
print(f"Prediction complete in {prediction_time:.2f} seconds.")


# --- Evaluation ---

print("\n[STEP 3] Evaluating the model performance...")
evaluate_model(Y_test, Y_pred, CLASS_IDS)

6. Use at least one clustering technique to visualize the data and to better understand its 
structure and how well the classes are separated. Perform classification based on the cluster 
features and discuss the results.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Apply seaborn's default styling to the figures drawn in the cells below,
# then print a marker confirming that the import cell ran to completion.
sns.set()
print("Imports OK")


In [None]:
print("Loading all_data.csv ...")
data = np.genfromtxt("./all_data.csv", delimiter=",")

# Column layout: every column but the last two holds ECG features, the
# second-to-last column is the arrhythmia label (N, L, R, V, A, F, f, /) and
# the last column is the patient ID (not needed for clustering).
X, y, patient_id = (
    data[:, :-2],
    data[:, -2].astype(int),
    data[:, -1].astype(int),
)

for array_name, array in (("X", X), ("y", y)):
    print(f"{array_name} shape:", array.shape)


In [None]:
# Project the beats onto their first two principal components for a 2-D view.
# random_state pins the projection in case scikit-learn selects its randomized
# SVD solver for data of this size, keeping the figure reproducible.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], c=y, cmap='tab10', s=2)
plt.title("PCA Visualization (Colored by True Labels)")
plt.xlabel("PC1")
plt.ylabel("PC2")
# Label the colorbar so the colour scale can be read without the code.
plt.colorbar(label="Class ID")
plt.show()


In [None]:
# Cluster the beats into eight groups on the original features; the PCA
# coordinates are used only to draw the assignments in 2-D.
kmeans = KMeans(n_clusters=8, random_state=42).fit(X)
clusters = kmeans.labels_  # cluster index assigned to each beat during fit

plt.figure(figsize=(8,6))
pc1, pc2 = X_pca[:,0], X_pca[:,1]
plt.scatter(pc1, pc2, c=clusters, cmap='tab10', s=3)
plt.title("PCA + KMeans Clustering (Colored by Cluster Labels)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.colorbar()
plt.show()


In [None]:
# Cluster-derived features per beat: the assigned cluster index in the first
# column, followed by the output of kmeans.transform (one column per cluster).
distances = kmeans.transform(X)
cluster_ids = clusters[:, np.newaxis]
cluster_features = np.concatenate((cluster_ids, distances), axis=1)

print("Cluster feature shape:", cluster_features.shape)

In [None]:
# Local import keeps this cell runnable with only this section's import cell.
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Scoring a classifier on the very samples it was fitted on overstates its
# performance. Use out-of-fold predictions instead: every beat is predicted by
# a model that did not see it during fitting. y_pred keeps one prediction per
# beat, so it stays aligned with y for the cells that follow.
# NOTE: the KMeans model behind cluster_features was fitted on all beats
# (without labels), so the features themselves are not fold-specific.
cluster_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred = cross_val_predict(
    LogisticRegression(max_iter=2000), cluster_features, y, cv=cluster_cv
)

# Final model fitted on all beats, kept available under the original name.
clf = LogisticRegression(max_iter=2000)
clf.fit(cluster_features, y)

print("\n===== Cluster-Based Classification Metrics =====")
print("Accuracy:", accuracy_score(y, y_pred))
print("Precision:", precision_score(y, y_pred, average='macro', zero_division=0))
print("Recall:", recall_score(y, y_pred, average='macro', zero_division=0))
print("F1-score:", f1_score(y, y_pred, average='macro', zero_division=0))


In [None]:
# Confusion matrix of the cluster-feature classifier (rows: true labels,
# columns: predicted labels), drawn through an explicit Axes object.
cm = confusion_matrix(y, y_pred)

cm_fig, cm_ax = plt.subplots(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title("Confusion Matrix (Cluster-Based Classification)")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("True")
plt.show()
