# ECG 3‑Lead Feature Pre‑processing
This notebook reproduces steps **7 – 13** of the assignment:

7. Missing‑data handling
8. Stratified sampling on imbalanced class
9. One‑hot encoding for categorical predictors
10. Normalization / Standardization for numeric features
11. Balancing (random oversampling or swap for SMOTE)
12. Correlation matrix & feature drop
13. Outlier removal


7. Missing‑data handling

In [2]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)


In [3]:
# 📂 Load dataset (update the path if needed)
file_path = 'After_columns_clean.csv'
df = pd.read_csv(file_path)
print(df.shape)
df.head()


(21837, 40)


Unnamed: 0,ecg_id,patient_id,age,sex,heart_axis,RR_mean,RR_std,HR,QRS_duration,l5_QRS_duration,PR_interval,l5_PR_interval,QT_interval,l5_QT_interval,T_duration,l5_T_duration,l5_R_amp,l5_S_amp,l5_R_S_ratio,l5_Q_amp,l5_QRS_area,l5_T_amp,l5_T_asymmetry,ST_elevation,l4_ST_elevation,l5_ST_elevation,VLF_power,LF_power,MF_power,HF_power,LF_HF_ratio,dominant_freq,spectral_entropy,mean,median,std,skew,kurt,zero_crossings,condition
0,1,15709.0,56.0,1,,853.818182,240.891457,70.272572,109.666667,96.727273,6.0,,403.818182,397.4,249.818182,211.0,6.405659,-1.276841,5.016801,-0.525612,79.713786,1.459379,2.468523,-0.157677,-0.364245,-0.607699,0.005366,0.319642,0.43487,0.1291,2.47592,3.417969,5.128917,-5.115908e-17,-0.349752,1.0,2.202134,5.745205,113.0,Normal
1,2,13243.0,19.0,0,,1271.333333,75.318583,47.194546,70.0,74.857143,202.571429,175.142857,385.428571,349.428571,261.714286,218.857143,8.429349,-2.171858,3.881169,-0.465232,93.217888,1.332506,3.47853,-0.52904,-0.342348,-0.162812,0.097483,0.268296,0.312508,0.206826,1.297205,0.976562,5.101367,2.2737370000000003e-17,-0.191573,1.0,2.742948,13.005555,46.0,Normal
2,3,20372.0,37.0,1,,1048.666667,289.717103,57.215512,98.0,87.818182,,,382.4,424.0,250.2,277.636364,7.873735,-1.682306,4.680323,-0.51193,87.467119,0.410758,1.5438,-0.192742,-1.295766,-1.23717,0.042528,0.298434,0.378582,0.136506,2.186239,0.976562,5.194964,5.684342e-18,-0.150124,1.0,2.564642,10.432457,96.0,Normal
3,4,17014.0,24.0,0,,808.6,36.954567,74.202325,92.727273,89.0,185.818182,183.166667,396.909091,315.333333,268.727273,192.666667,5.423686,-2.969719,1.826329,-0.489608,86.939348,1.49648,2922.884584,0.171993,-0.229057,-0.304308,0.011083,0.270045,0.39846,0.162305,1.663815,0.976562,5.260235,4.689582e-17,-0.245402,1.0,2.104261,7.027535,73.0,Normal
4,5,17448.0,19.0,1,,1026.25,345.074539,58.465286,91.111111,90.727273,181.777778,190.363636,352.444444,345.818182,242.444444,189.818182,6.785335,-1.008426,6.728642,-0.433127,81.865674,1.364782,12.058163,-0.136861,-0.41696,-0.43536,0.008304,0.250306,0.428828,0.17393,1.439118,0.976562,5.388594,-1.4210850000000002e-17,-0.262932,1.0,3.268444,13.709445,70.0,Normal


In [4]:
import pandas as pd


# Count missing (NaN) values per column
missing_counts = df.isnull().sum()

# Display only columns that have missing values
missing_counts = missing_counts[missing_counts > 0].sort_values(ascending=False)

# Show the result
print("Missing values per column:")
print(missing_counts)


Missing values per column:
heart_axis          8505
l5_PR_interval      5691
PR_interval         5385
condition           2171
l5_QT_interval      1981
l5_T_asymmetry      1925
l5_T_duration       1925
QT_interval         1220
T_duration          1112
HR                  1006
RR_std              1006
RR_mean             1006
QRS_duration         846
ST_elevation         707
l4_ST_elevation      621
l5_QRS_duration      586
l5_QRS_area          586
l5_Q_amp             500
l5_ST_elevation      499
l5_T_amp             499
age                   89
l5_S_amp              39
l5_R_S_ratio          39
l5_R_amp              39
VLF_power             38
LF_power              38
MF_power              38
HF_power              38
LF_HF_ratio           38
spectral_entropy      38
mean                  38
median                38
std                   38
skew                  38
kurt                  38
zero_crossings        38
dominant_freq         38
dtype: int64


In [3]:
# Define column mapping: Lead II -> Lead V5 fallback
lead_pairs = {
    'QRS_duration': 'l5_QRS_duration',
    'PR_interval': 'l5_PR_interval',
    'QT_interval': 'l5_QT_interval',
    'T_duration' : 'l5_T_duration'
}

# ✅ 1. Check initial missing counts
print("Missing values BEFORE imputation:")
print(df[list(lead_pairs.keys())].isna().sum())

# ✅ 2. Copy Lead V5 values into Lead II where Lead II is NaN
for lead_ii, lead_v5 in lead_pairs.items():
    mask = df[lead_ii].isna() & df[lead_v5].notna()
    df.loc[mask, lead_ii] = df.loc[mask, lead_v5]

# ✅ 3. Check missing counts after filling
print("\nMissing values AFTER imputation:")
print(df[list(lead_pairs.keys())].isna().sum())


# Define column mapping: Lead V5 -> fallback from Lead II
lead_pairs = {
    'l5_QRS_duration': 'QRS_duration',
    'l5_PR_interval': 'PR_interval',
    'l5_QT_interval': 'QT_interval',
    'l5_T_duration': 'T_duration'
}

# ✅ 1. Check initial missing counts
print("\nMissing values BEFORE imputation:")
print(df[list(lead_pairs.keys())].isna().sum())

# ✅ 2. Copy Lead II values into Lead V5 where Lead V5 is NaN
for lead_v5, lead_ii in lead_pairs.items():
    mask = df[lead_v5].isna() & df[lead_ii].notna()
    df.loc[mask, lead_v5] = df.loc[mask, lead_ii]

# ✅ 3. Check missing counts after filling
print("\nMissing values AFTER imputation:")
print(df[list(lead_pairs.keys())].isna().sum())

Missing values BEFORE imputation:
QRS_duration     846
PR_interval     5385
QT_interval     1220
T_duration      1112
dtype: int64

Missing values AFTER imputation:
QRS_duration     277
PR_interval     4329
QT_interval      521
T_duration       497
dtype: int64

Missing values BEFORE imputation:
l5_QRS_duration     586
l5_PR_interval     5691
l5_QT_interval     1981
l5_T_duration      1925
dtype: int64

Missing values AFTER imputation:
l5_QRS_duration     277
l5_PR_interval     4329
l5_QT_interval      521
l5_T_duration       497
dtype: int64


In [4]:
# ✅ Before removing
print(f"Original shape: {df.shape}")
print(f"Rows with NaN in heart_axis: {df['heart_axis'].isna().sum()}")

# ✅ Remove rows where heart_axis is NaN
df = df[df['heart_axis'].notna()].reset_index(drop=True)

# ✅ After removing
print(f"Shape after removing rows with NaN heart_axis: {df.shape}")

Original shape: (21837, 40)
Rows with NaN in heart_axis: 8505
Shape after removing rows with NaN heart_axis: (13332, 40)


In [6]:
#  Define core timing features (Lead II + fallback Lead V5)
core_timing_features = [
    "RR_mean", "RR_std", "HR",
    "QRS_duration", "l5_QRS_duration",
    "PR_interval", "l5_PR_interval",
    "QT_interval", "l5_QT_interval",
    "T_duration", "l5_T_duration"
]

#  1. Check missing values per feature
missing_counts = df[core_timing_features].isna().sum()
print("🔍 Missing values per core timing feature:\n", missing_counts)

#  2. Count rows where at least one core timing feature is missing
rows_with_nan = df[core_timing_features].isna().any(axis=1).sum()
print(f"\nRows with at least one missing core timing feature: {rows_with_nan} / {len(df)}")

#  3. Drop rows with any missing core timing feature
df= df.dropna(subset=core_timing_features).reset_index(drop=True)

#  4. Show new shape after dropping
print(f"\n Shape after removing rows with missing core timing features: {df.shape}")


🔍 Missing values per core timing feature:
 RR_mean            0
RR_std             0
HR                 0
QRS_duration       0
l5_QRS_duration    0
PR_interval        0
l5_PR_interval     0
QT_interval        0
l5_QT_interval     0
T_duration         0
l5_T_duration      0
dtype: int64

Rows with at least one missing core timing feature: 0 / 10297

 Shape after removing rows with missing core timing features: (10297, 40)


In [88]:
total_missing = df.isna().sum().sum()
print(f"Total missing values in dataset: {total_missing}")

print("\nMissing values per column:")
print(df.isna().sum())


rows_with_nan = df.isna().any(axis=1).sum()
print(f"\nRows with at least one missing value: {rows_with_nan} / {len(df)}")


Total missing values in dataset: 2381

Missing values per column:
ecg_id                0
patient_id            0
age                  19
sex                   0
heart_axis            0
RR_mean               0
RR_std                0
HR                    0
QRS_duration          0
l5_QRS_duration       0
PR_interval           0
l5_PR_interval        0
QT_interval           0
l5_QT_interval        0
T_duration            0
l5_T_duration         0
l5_R_amp              0
l5_S_amp              0
l5_R_S_ratio          0
l5_Q_amp            111
l5_QRS_area         156
l5_T_amp            111
l5_T_asymmetry      738
ST_elevation          0
l4_ST_elevation     231
l5_ST_elevation     111
VLF_power             0
LF_power              0
MF_power              0
HF_power              0
LF_HF_ratio           0
dominant_freq         0
spectral_entropy      0
mean                  0
median                0
std                   0
skew                  0
kurt                  0
zero_crossings        

In [None]:
#  Define morphology/ST features
morphology_features = [
    "l5_R_amp", "l5_S_amp", "l5_R_S_ratio", "l5_Q_amp", "l5_QRS_area",
    "l5_T_amp", "l5_T_asymmetry", "ST_elevation",
    "l4_ST_elevation", "l5_ST_elevation"
]

#  1. Check missingness percentage per feature
missing_percentage = (df[morphology_features].isna().mean() * 100).round(2)
print("🔍 Missingness (%) per morphology feature:\n", missing_percentage)

#  2. Decide strategy: impute or drop
if missing_percentage.max() < 5:
    print("\nMissingness is <5% → Imputing with median stratified by 'condition'...")
    
    # Impute missing values with median per condition group
    for feature in morphology_features:
        df[feature] = df.groupby("condition")[feature].transform(
            lambda x: x.fillna(x.median())
        )

else:
    print("\n Missingness ≥5% → Dropping rows where any morphology feature is missing...")
    df = df.dropna(subset=morphology_features).reset_index(drop=True)

#  3. Verify no remaining NaNs in these columns
print("\nRemaining NaNs in morphology features:\n", df[morphology_features].isna().sum())


🔍 Missingness (%) per morphology feature:
 l5_R_amp           0.00
l5_S_amp           0.00
l5_R_S_ratio       0.00
l5_Q_amp           1.08
l5_QRS_area        1.52
l5_T_amp           1.08
l5_T_asymmetry     7.17
ST_elevation       0.00
l4_ST_elevation    2.24
l5_ST_elevation    1.08
dtype: float64

⚠️ Missingness ≥5% → Dropping rows where any morphology feature is missing...

Remaining NaNs in morphology features:
 l5_R_amp           0
l5_S_amp           0
l5_R_S_ratio       0
l5_Q_amp           0
l5_QRS_area        0
l5_T_amp           0
l5_T_asymmetry     0
ST_elevation       0
l4_ST_elevation    0
l5_ST_elevation    0
dtype: int64


In [None]:
# Features
hrv_features = [
    "VLF_power", "LF_power", "MF_power", "HF_power",
    "LF_HF_ratio", "dominant_freq", "spectral_entropy"
]
core_rr = ["RR_mean", "RR_std", "HR"]

#  1. Drop rows where core RR features are missing
df = df.dropna(subset=core_rr).reset_index(drop=True)

#  2. Handle missing sub-band powers
for band in ["VLF_power", "LF_power", "MF_power", "HF_power"]:
    # If missing and other bands sum > 0 → set to 0
    mask_zero_fill = df[band].isna() & (df[["VLF_power","LF_power","MF_power","HF_power"]].sum(axis=1) > 0)
    df.loc[mask_zero_fill, band] = 0.0
    
    # Remaining NaN (not true zero) → drop row
    df = df.dropna(subset=[band])

#  3. Recompute LF/HF ratio and handle HF=0 cases
df["LF_HF_ratio"] = np.where(
    (df["HF_power"] > 0),
    df["LF_power"] / df["HF_power"],
    np.nan
)

#  4. Handle dominant_freq and spectral_entropy
for feature in ["dominant_freq", "spectral_entropy"]:
    missing_pct = df[feature].isna().mean() * 100
    if missing_pct < 5:
        print(f"Imputing {feature} with median (missing {missing_pct:.2f}%)")
        df[feature] = df.groupby("condition")[feature].transform(lambda x: x.fillna(x.median()))
    else:
        print(f"Dropping rows where {feature} is missing (missing {missing_pct:.2f}%)")
        df = df.dropna(subset=[feature])

#5. Final check for remaining NaNs in HRV features
print("\nRemaining NaNs in HRV features:\n", df[hrv_features].isna().sum())

Imputing dominant_freq with median (missing 0.00%)
Imputing spectral_entropy with median (missing 0.00%)

Remaining NaNs in HRV features:
 VLF_power             0
LF_power              0
MF_power              0
HF_power              0
LF_HF_ratio           0
dominant_freq       801
spectral_entropy    801
dtype: int64


In [8]:
#  Define global statistics features
global_stats = ["mean", "median", "std", "skew", "kurt", "zero_crossings"]

# 1. Check missingness percentage for global stats
missing_pct_stats = (df[global_stats].isna().mean() * 100).round(2)
print("🔍 Missingness (%) for global statistics features:\n", missing_pct_stats)

# 2. Decide strategy based on missingness rate
if missing_pct_stats.max() <= 15:
    print("\n Missingness ≤15% → Imputing with median stratified by condition...")
    for feature in global_stats:
        df[feature] = df.groupby("condition")[feature].transform(
            lambda x: x.fillna(x.median())
        )
else:
    print("\n Missingness >15% → Dropping rows where any global statistic is missing...")
    df = df.dropna(subset=global_stats).reset_index(drop=True)

#  3. Handle missing 'condition' (categorical)
missing_condition = df["condition"].isna().sum()
print(f"\nRows with missing 'condition': {missing_condition}")

if missing_condition > 0:
    print("⚠️ Dropping rows where 'condition' is missing...")
    df = df[df["condition"].notna()].reset_index(drop=True)

#  4. Verify no remaining NaNs in these columns
print("\nRemaining NaNs in global stats:\n", df[global_stats].isna().sum())
print("Remaining NaNs in condition column:", df["condition"].isna().sum())


🔍 Missingness (%) for global statistics features:
 mean              0.0
median            0.0
std               0.0
skew              0.0
kurt              0.0
zero_crossings    0.0
dtype: float64

 Missingness ≤15% → Imputing with median stratified by condition...

Rows with missing 'condition': 0

Remaining NaNs in global stats:
 mean              0
median            0
std               0
skew              0
kurt              0
zero_crossings    0
dtype: int64
Remaining NaNs in condition column: 0


In [92]:
total_missing = df.isna().sum().sum()
print(f"Total missing values in dataset: {total_missing}")

print("\nMissing values per column:")
print(df.isna().sum())


rows_with_nan = df.isna().any(axis=1).sum()
print(f"\nRows with at least one missing value: {rows_with_nan} / {len(df)}")

print(df.shape)

Total missing values in dataset: 14

Missing values per column:
ecg_id               0
patient_id           0
age                 14
sex                  0
heart_axis           0
RR_mean              0
RR_std               0
HR                   0
QRS_duration         0
l5_QRS_duration      0
PR_interval          0
l5_PR_interval       0
QT_interval          0
l5_QT_interval       0
T_duration           0
l5_T_duration        0
l5_R_amp             0
l5_S_amp             0
l5_R_S_ratio         0
l5_Q_amp             0
l5_QRS_area          0
l5_T_amp             0
l5_T_asymmetry       0
ST_elevation         0
l4_ST_elevation      0
l5_ST_elevation      0
VLF_power            0
LF_power             0
MF_power             0
HF_power             0
LF_HF_ratio          0
dominant_freq        0
spectral_entropy     0
mean                 0
median               0
std                  0
skew                 0
kurt                 0
zero_crossings       0
condition            0
dtype: int64

Ro

In [9]:
# ✅ Calculate median age and fill missing
median_age = df['age'].median()
df['age'] = df['age'].fillna(median_age)


# ✅ Verify after imputation
print("Missing values in age AFTER:", df['age'].isna().sum())
print("Median age used for imputation:", median_age)


Missing values in age AFTER: 0
Median age used for imputation: 60.0


In [10]:
total_missing = df.isna().sum().sum()
print(f"Total missing values in dataset: {total_missing}")

print("\nMissing values per column:")
print(df.isna().sum())


rows_with_nan = df.isna().any(axis=1).sum()
print(f"\nRows with at least one missing value: {rows_with_nan} / {len(df)}")

print(df.shape)

Total missing values in dataset: 1282

Missing values per column:
ecg_id                0
patient_id            0
age                   0
sex                   0
heart_axis            0
RR_mean               0
RR_std                0
HR                    0
QRS_duration          0
l5_QRS_duration       0
PR_interval           0
l5_PR_interval        0
QT_interval           0
l5_QT_interval        0
T_duration            0
l5_T_duration         0
l5_R_amp              0
l5_S_amp              0
l5_R_S_ratio          0
l5_Q_amp             95
l5_QRS_area         136
l5_T_amp             95
l5_T_asymmetry      661
ST_elevation          0
l4_ST_elevation     200
l5_ST_elevation      95
VLF_power             0
LF_power              0
MF_power              0
HF_power              0
LF_HF_ratio           0
dominant_freq         0
spectral_entropy      0
mean                  0
median                0
std                   0
skew                  0
kurt                  0
zero_crossings        

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9393 entries, 0 to 9392
Data columns (total 40 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ecg_id            9393 non-null   int64  
 1   patient_id        9393 non-null   float64
 2   age               9393 non-null   float64
 3   sex               9393 non-null   int64  
 4   heart_axis        9393 non-null   object 
 5   RR_mean           9393 non-null   float64
 6   RR_std            9393 non-null   float64
 7   HR                9393 non-null   float64
 8   QRS_duration      9393 non-null   float64
 9   l5_QRS_duration   9393 non-null   float64
 10  PR_interval       9393 non-null   float64
 11  l5_PR_interval    9393 non-null   float64
 12  QT_interval       9393 non-null   float64
 13  l5_QT_interval    9393 non-null   float64
 14  T_duration        9393 non-null   float64
 15  l5_T_duration     9393 non-null   float64
 16  l5_R_amp          9393 non-null   float64


In [13]:
df.head()

Unnamed: 0,ecg_id,patient_id,age,sex,heart_axis,RR_mean,RR_std,HR,QRS_duration,l5_QRS_duration,PR_interval,l5_PR_interval,QT_interval,l5_QT_interval,T_duration,l5_T_duration,l5_R_amp,l5_S_amp,l5_R_S_ratio,l5_Q_amp,l5_QRS_area,l5_T_amp,l5_T_asymmetry,ST_elevation,l4_ST_elevation,l5_ST_elevation,VLF_power,LF_power,MF_power,HF_power,LF_HF_ratio,dominant_freq,spectral_entropy,mean,median,std,skew,kurt,zero_crossings,condition
0,7,16193.0,54.0,0,LAD,970.0,17.029386,61.85567,97.777778,93.2,191.111111,188.4,431.333333,414.6,222.444444,229.2,7.003805,-4.265694,1.641891,-0.177384,114.008957,0.511105,2.863138,-0.366118,-0.160241,-0.298328,0.000543,0.127323,0.626647,0.329813,0.386047,2.929688,5.650555,-3.53495e-17,-0.15301,1.0,4.241455,25.360262,77.0,Normal
1,8,11275.0,48.0,0,LAD,814.363636,15.155393,73.67716,107.5,92.333333,194.333333,214.166667,392.666667,330.666667,253.0,194.5,5.301466,-3.856418,1.374713,-0.498833,91.947093,1.687939,3.043828,0.003433,-0.191397,-0.189884,0.010473,0.278021,0.437037,0.203707,1.364808,4.882812,5.293184,-1.278977e-17,-0.280783,1.0,2.106691,5.721974,78.0,MI
2,26,13619.0,56.0,0,LAD,681.538462,2.499704,88.036117,115.0,101.0,196.0,202.285714,441.428571,5166.0,300.714286,248.0,5.511187,-3.587114,1.536385,-0.314297,103.253589,-0.197594,1.135624,-0.506422,-0.314035,-0.347026,0.015165,0.235806,0.54377,0.199571,1.181564,1.464844,5.117901,-9.947598e-18,0.07195,1.0,0.24389,4.556635,75.0,Normal
3,27,10316.0,56.0,0,LAD,962.0,53.690471,62.370062,95.4,90.8,194.6,173.0,383.8,378.8,210.6,211.0,6.613336,-2.978423,2.220415,-0.499123,94.950844,1.632476,3.24765,-0.631757,-0.623628,-0.442634,0.011361,0.310699,0.399045,0.171059,1.816331,0.976562,5.273018,-4.7606360000000007e-17,-0.268608,1.0,2.175067,6.217853,78.0,Normal
4,28,13619.0,56.0,0,LAD,679.0,6.129554,88.365243,92.666667,87.466667,203.866667,192.4,467.142857,524.0,327.285714,282.0,5.834349,-3.64023,1.602742,-0.248403,92.419156,0.086599,3.405743,-0.616742,-0.645154,-0.56177,0.002944,0.137332,0.454421,0.340946,0.402798,1.464844,5.489678,-3.552714e-18,0.044945,1.0,0.863977,7.894348,93.0,Normal


In [14]:
columns_to_drop = ['ecg_id', 'patient_id']
df = df.drop(columns=columns_to_drop)

In [15]:
df.head()

Unnamed: 0,age,sex,heart_axis,RR_mean,RR_std,HR,QRS_duration,l5_QRS_duration,PR_interval,l5_PR_interval,QT_interval,l5_QT_interval,T_duration,l5_T_duration,l5_R_amp,l5_S_amp,l5_R_S_ratio,l5_Q_amp,l5_QRS_area,l5_T_amp,l5_T_asymmetry,ST_elevation,l4_ST_elevation,l5_ST_elevation,VLF_power,LF_power,MF_power,HF_power,LF_HF_ratio,dominant_freq,spectral_entropy,mean,median,std,skew,kurt,zero_crossings,condition
0,54.0,0,LAD,970.0,17.029386,61.85567,97.777778,93.2,191.111111,188.4,431.333333,414.6,222.444444,229.2,7.003805,-4.265694,1.641891,-0.177384,114.008957,0.511105,2.863138,-0.366118,-0.160241,-0.298328,0.000543,0.127323,0.626647,0.329813,0.386047,2.929688,5.650555,-3.53495e-17,-0.15301,1.0,4.241455,25.360262,77.0,Normal
1,48.0,0,LAD,814.363636,15.155393,73.67716,107.5,92.333333,194.333333,214.166667,392.666667,330.666667,253.0,194.5,5.301466,-3.856418,1.374713,-0.498833,91.947093,1.687939,3.043828,0.003433,-0.191397,-0.189884,0.010473,0.278021,0.437037,0.203707,1.364808,4.882812,5.293184,-1.278977e-17,-0.280783,1.0,2.106691,5.721974,78.0,MI
2,56.0,0,LAD,681.538462,2.499704,88.036117,115.0,101.0,196.0,202.285714,441.428571,5166.0,300.714286,248.0,5.511187,-3.587114,1.536385,-0.314297,103.253589,-0.197594,1.135624,-0.506422,-0.314035,-0.347026,0.015165,0.235806,0.54377,0.199571,1.181564,1.464844,5.117901,-9.947598e-18,0.07195,1.0,0.24389,4.556635,75.0,Normal
3,56.0,0,LAD,962.0,53.690471,62.370062,95.4,90.8,194.6,173.0,383.8,378.8,210.6,211.0,6.613336,-2.978423,2.220415,-0.499123,94.950844,1.632476,3.24765,-0.631757,-0.623628,-0.442634,0.011361,0.310699,0.399045,0.171059,1.816331,0.976562,5.273018,-4.7606360000000007e-17,-0.268608,1.0,2.175067,6.217853,78.0,Normal
4,56.0,0,LAD,679.0,6.129554,88.365243,92.666667,87.466667,203.866667,192.4,467.142857,524.0,327.285714,282.0,5.834349,-3.64023,1.602742,-0.248403,92.419156,0.086599,3.405743,-0.616742,-0.645154,-0.56177,0.002944,0.137332,0.454421,0.340946,0.402798,1.464844,5.489678,-3.552714e-18,0.044945,1.0,0.863977,7.894348,93.0,Normal


In [16]:
df.to_csv("missingvaluesdone.csv", index=False)