Handle Missing Values

In [10]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# Read the feature table from the CSV file.
df = pd.read_csv("ptbxl_features.csv")

# ✅ Step 1: Keep only rows with at most 5 missing entries.
# The per-row missing count is compared against 5; .copy() gives an
# independent frame instead of a slice of `df`.
df_filtered = df.loc[df.isna().sum(axis=1).le(5)].copy()

# ✅ Step 2: Partition the columns by dtype.
# Everything NumPy regards as a number goes to the numeric group, every
# other column goes to the categorical group.
numeric_cols = df_filtered.select_dtypes(include=[np.number]).columns
categorical_cols = df_filtered.select_dtypes(exclude=[np.number]).columns

df_numeric = df_filtered.loc[:, numeric_cols]
df_categorical = df_filtered.loc[:, categorical_cols]  # left as-is, not imputed

# ✅ Step 3: Function for MICE Imputation
def mice_imputation(data, m=5):
    """Create ``m`` imputed copies of ``data`` using iterative imputation.

    Each copy comes from its own ``IterativeImputer`` configured with a
    ``BayesianRidge`` estimator, ``sample_posterior=True`` and ``max_iter=10``.
    The loop counter (0 .. m-1) is used as ``random_state``, so every copy is
    fitted with a different, fixed seed.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose missing entries should be filled in. It is passed to the
        imputer as-is, so it is expected to hold numeric columns only.
    m : int, default 5
        Number of imputed datasets to generate.

    Returns
    -------
    list of pandas.DataFrame
        ``m`` frames with the same column labels, column order and row order
        as ``data``. The frames carry a default 0..n-1 index; the index of
        ``data`` is not preserved. Columns of ``data`` that contain no
        observed value at all are returned unchanged (all NaN).

    Warns
    -----
    UserWarning
        If at least one column of ``data`` has no observed value.
    """
    import warnings  # local import: only needed for the warning below

    # IterativeImputer discards features that are entirely missing at fit
    # time, so its output can be narrower than `data`. Labelling that narrower
    # array with all of `data.columns` raises a shape-mismatch ValueError.
    # Such columns are therefore set aside here and left as NaN in the result.
    has_observed = data.notna().any(axis=0).to_numpy()
    if not has_observed.all():
        warnings.warn(
            "Columns without any observed value cannot be imputed and are "
            f"left as NaN: {list(data.columns[~has_observed])}",
            stacklevel=2,
        )
    imputable = data.loc[:, has_observed]

    imputations = []
    for seed in range(m):
        imputer = IterativeImputer(
            estimator=BayesianRidge(),
            sample_posterior=True,
            max_iter=10,
            random_state=seed,
        )
        # Start from a full-width NaN array and fill in the imputable columns,
        # so the result always lines up with `data.columns`.
        filled = np.full(data.shape, np.nan)
        filled[:, has_observed] = imputer.fit_transform(imputable)
        imputations.append(pd.DataFrame(filled, columns=data.columns))
    return imputations

# ✅ Step 4: Run the imputation on the numeric columns only (5 imputed copies)
imputed_sets = mice_imputation(df_numeric, m=5)

# ✅ Step 5: Pool the copies into one table by taking their element-wise mean
mean_imputed_numeric = sum(imputed_sets).div(len(imputed_sets))

# ✅ Step 6: Put the numeric and categorical parts back side by side.
# The categorical part gets a fresh 0..n-1 index before the column-wise
# concat, which aligns rows on their index labels.
final_df = pd.concat(
    [
        mean_imputed_numeric,
        df_categorical.reset_index(drop=True),
    ],
    axis="columns",
)

# ✅ Step 7: Write the cleaned table to disk without the index column
final_df.to_csv("ptbxl_features_cleaned.csv", index=False)

# Quick check of the result: dimensions and the first rows
print("✅ Final cleaned dataset shape:", final_df.shape)
print(final_df.head())



✅ Final cleaned dataset shape: (13328, 137)
     HR_mean     HR_std     PR_mean     PR_std   QRSd_mean    QRSd_std  \
0  62.419378   7.254019  178.625000  25.377909  108.146341   81.480228   
1  46.043361   6.425092  186.303797  24.492989   95.063291   21.392152   
2  74.587370   3.379025  179.134752  23.376796  104.408451   19.260865   
3  64.641225   8.128676  182.868852  20.198990  107.377049   51.313383   
4  79.109622  13.557656  187.898551  16.409580  121.640288  120.741010   

      QT_mean      QT_std    JTc_mean    JTc_std  ...  st80_aVF_std  \
0  389.678571  109.563114  285.452979  81.261413  ...      0.018684   
1  380.253165   40.499911  248.865670  43.646579  ...      0.012294   
2  372.253521   72.162881  298.168406  75.502867  ...      0.030279   
3  383.150000   67.983024  289.990565  74.022704  ...      0.012784   
4  405.304348  140.909383  318.483220  87.818569  ...      0.016001   

   vat_aVF_mean  vat_aVF_std  ramp_aVF_mean  ramp_aVF_std  tamp_aVF_mean  \
0     55