In [1]:
from hfpred.utils import get_project_root
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import joblib

import dataframe_image as dfi
# Remove pandas' cap on displayed columns so wide frames are shown in full.
pd.set_option("display.max_columns", None)

# Data pre-Processing \& Feature Selection

From the EDA, we highlighted that there are unwanted zero values that can either be imputed or removed from the data. The EDA showed that the affected attributes (Cholesterol \& RestingBP) correlate better with the target when the zeros are dropped rather than imputed.

In [2]:
# Read the raw heart-disease CSV from the project's data folder.
raw_csv_path = get_project_root() / "data/heart.csv"
df = pd.read_csv(raw_csv_path)

# Report the size of the raw data and the proportion of each target class.
n_raw = len(df)
raw_class_share = df["HeartDisease"].value_counts() / n_raw
print(f"Length of raw data: {n_raw} instances {df.shape}.")
print(f"Class balance, raw data:\n{raw_class_share}")

Length of raw data: 918 instances (918, 12).
Class balance, raw data:
1    0.553377
0    0.446623
Name: HeartDisease, dtype: float64


## Dropping 'Zeros'

In [3]:
# Keep only the rows where both RestingBP and Cholesterol hold a non-zero value.
has_resting_bp = df["RestingBP"] != 0
has_cholesterol = df["Cholesterol"] != 0
df = df[has_resting_bp & has_cholesterol]

# Report how many rows remain and the class proportions after filtering.
n_kept = len(df)
kept_class_share = df["HeartDisease"].value_counts() / n_kept
print(f"Lenght of data, dropped zeros: {n_kept} instances.")
print(f"Class balance, dropped zeros:\n{kept_class_share}")

Lenght of data, dropped zeros: 746 instances.
Class balance, dropped zeros:
0    0.522788
1    0.477212
Name: HeartDisease, dtype: float64


Just under 20\% of the data is lost when dropping the rows with zero values. It is not ideal to throw data away, but the 746 remaining instances should be sufficient to work with. Dropping these rows also reduces the slight class imbalance.

## Under / Over / SMOTE Sampling
* if used, only perform oversampling on the training data rather than the test data.

In [4]:
# Proportion of each target class in the zero-filtered data.
class_share = df["HeartDisease"].value_counts() / len(df)
print(f"Class balance, dropped zeros:\n{class_share}")

Class balance, dropped zeros:
0    0.522788
1    0.477212
Name: HeartDisease, dtype: float64


* As the classes are relatively balanced, there is no real need for under / over / SMOTE sampling.
* Accuracy will remain a valid metric.
* (has this dataset been preprocessed?)
* I would guess that the source of the data is not representative of the general population.
* The balance between class labels serves our purpose well in this ML context.

If there was a greater imbalance, synthetic minority oversampling technique (SMOTE) would have been used. c.f.:
* Minou, where the ratio of patients without cardiovascular disease to patients with cardiovascular disease was 85:15, so SMOTE was used to reach a 50:50 ratio (no comparative results reported).
* Khdair, where the ratio of patients without CHD to patients with CHD was 2:1 - this paper compares results and shows how SMOTE improved performance.

## Outliers

As described in the EDA, there are outliers present in the non-categorical attributes. Removing these outliers would reduce the size of the dataset, and the presence of outliers does not mean they are invalid data points, so they are retained here.

## One-Hot-Encoding
* OHE the categorical attributes.
* Where there are only two categories, there is no need to keep both encodings (Sex, ExerciseAngina) as this does not give the model any extra information.

In [5]:
# One-hot-encode the categorical columns with pandas' built-in get_dummies().
# The dtype is pinned to uint8 (the default before pandas 2.0) so the indicator
# columns are 0/1 integers on every pandas version; pandas >= 2.0 would
# otherwise emit True/False booleans, which would also end up in the saved CSVs.
df = pd.get_dummies(df, dtype=np.uint8)

# Reorder the columns so the target, HeartDisease, comes last.
cols = list(df.columns)
cols.remove("HeartDisease")
df = df[cols + ["HeartDisease"]]

print(df.shape)
df.head()

(746, 21)


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,HeartDisease
0,40,140,289,0,172,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0
1,49,160,180,0,156,1.0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,1
2,37,130,283,0,98,0.0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0
3,48,138,214,0,108,1.5,1,0,1,0,0,0,0,1,0,0,1,0,1,0,1
4,54,150,195,0,122,0.0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0


In [6]:
# Share of rows flagged in each indicator column of the two two-level attributes.
for indicator in ("Sex_F", "Sex_M", "ExerciseAngina_N", "ExerciseAngina_Y"):
    print(f"{indicator}: {round(df[indicator].sum()/len(df),2)}")

Sex_F: 0.24
Sex_M: 0.76
ExerciseAngina_N: 0.62
ExerciseAngina_Y: 0.38


Keep the majority attribute in each case (doesn't really matter which is kept):

In [7]:
# For a two-level attribute one indicator column carries all the information,
# so discard the minority-level column of Sex and of ExerciseAngina.
redundant_indicators = ["Sex_F", "ExerciseAngina_Y"]
df = df.drop(columns=redundant_indicators)

## Correlation

Removing the 3 attributes that have the least correlation with the target variable: 

In [8]:
# The three encoded columns singled out as least correlated with the target.
least_corr = ["ChestPainType_TA", "RestingECG_LVH", "RestingECG_ST"]

# Rebind `df` to the frame without those columns, then confirm the new shape.
df = df.drop(columns=least_corr)
print(df.shape)

(746, 16)



---

In [9]:
# Row count and last few rows of the pre-processed frame, as a final check before splitting.
n_instances = df.shape[0]
print(f"dataset instances: {n_instances}")
df.tail()

dataset instances: 746


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,RestingECG_Normal,ExerciseAngina_N,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,HeartDisease
913,45,110,264,0,132,1.2,1,0,0,0,1,1,0,1,0,1
914,68,144,193,1,141,3.4,1,1,0,0,1,1,0,1,0,1
915,57,130,131,0,115,1.2,1,1,0,0,1,0,0,1,0,1
916,57,130,236,0,174,0.0,0,0,1,0,0,1,0,1,0,1
917,38,138,175,0,173,0.0,1,0,0,1,1,1,0,0,1,0


---
## train-validation / test split
* for all of the pre-processing steps so far, it makes little difference if they are performed before or after the train / val / test split
* for the scaling that follows, the MinMaxScaler must be fit only on data that excludes the test set; the training data is scaled now and the test set is left unscaled for the time being
* the scaler fit to the training data will be saved to be used on the test set in the final prediction pipeline

In [10]:
# Predictors are every column except the target; the target is HeartDisease.
cols = list(df.columns.drop("HeartDisease"))
x = df[cols]
y = df["HeartDisease"]

#### 10% Test Set to Evaluate Final Model Performance

In [11]:
# Hold out 10% of the rows as the final test set; the fixed seed makes the split repeatable.
holdout_split = train_test_split(x, y, test_size=0.1, random_state=13)
x_train_val, x_test, y_train_val, y_test = holdout_split

In [12]:
# Re-attach the target to the held-out features and write them to CSV.
# The rows are written unscaled: the scaler is only fitted further down.
heart_test = x_test.join(y_test)
test_csv_path = get_project_root() / "data/heart-test-set.csv"
heart_test.to_csv(test_csv_path)

## Scaling

To get a better sense of feature importance, it is often worthwhile to scale the data between a minimum and a maximum value. As the encoded and binary attributes already take the values 0 and 1, the continuous attributes are scaled to the range 0 to 1 to match.

In [13]:
# Min-max scaler that maps each selected column onto the range [0, 1].
scaler = MinMaxScaler((0,1))

# Columns handed to the scaler.
# NOTE(review): FastingBS looks like a 0/1 flag already, so scaling it should be
# a no-op; it is kept here so the saved scaler still expects the same six columns.
scale_cols = [
    "Age",
    "RestingBP",
    "Cholesterol",
    "FastingBS",
    "MaxHR",
    "Oldpeak"
]

# Scale all training and validation data (not test).
# NOTE(review): the scaler is fitted before the train/validation split, so the
# validation rows contribute to the fitted min/max - confirm this is intended.
#
# Work on an explicit copy: the frame returned by train_test_split can be flagged
# by pandas as a slice of `x`, and assigning columns on it may raise
# SettingWithCopyWarning.
x_train_val = x_train_val.copy()

# Always fit on the raw, unscaled rows taken from `x` (matched by index label).
# Fitting on `x_train_val` itself would make a second run of this cell fit the
# scaler to already-scaled data, silently producing an identity scaler.
raw_train_val = x.loc[x_train_val.index, scale_cols]
x_train_val[scale_cols] = scaler.fit_transform(raw_train_val)

In [14]:
# Shape and summary statistics of the train/validation features after scaling.
print(x_train_val.shape)
x_train_val.describe()

(671, 15)


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,RestingECG_Normal,ExerciseAngina_N,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
count,671.0,671.0,671.0,671.0,671.0,671.0,671.0,671.0,671.0,671.0,671.0,671.0,671.0,671.0,671.0
mean,0.508197,0.381175,0.309375,0.166915,0.534765,0.161309,0.749627,0.503726,0.219076,0.219076,0.600596,0.61699,0.059613,0.47392,0.466468
std,0.194997,0.160905,0.114988,0.373178,0.182594,0.17354,0.433551,0.500359,0.413929,0.413929,0.490141,0.486483,0.236944,0.499692,0.499246
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.367347,0.259259,0.237452,0.0,0.398496,0.015873,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.530612,0.351852,0.295367,0.0,0.533835,0.095238,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,0.632653,0.449074,0.367761,0.0,0.676692,0.269841,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Save the MinMaxScaler to use on test set (otherwise would not have the same scaling)

In [15]:
# Persist the fitted scaler with joblib (a third-party package, not part of the
# standard library) so the same scaling can be applied to the test set later.
scaler_path = get_project_root() / "models/scaler.joblib"

# Create the target folder if it is missing so the dump cannot fail on a fresh
# checkout (assumes get_project_root() returns a pathlib.Path - TODO confirm).
scaler_path.parent.mkdir(parents=True, exist_ok=True)

# The trailing semicolon suppresses the cell output: joblib.dump returns the list
# of written file names, which would store an absolute local path in the notebook.
joblib.dump(scaler, scaler_path);

['/Users/djq98242/repos/heart_failure_prediction/models/scaler.joblib']

## train / validation split
#### Train 80:20 Validation from Remaining 90% of Data
* Cross Validation will be used in training the model; the validation set will be used to assess candidate model performance during development.

In [16]:
# Split the remaining data 80:20 into training and validation sets, using a fixed seed.
train_val_parts = train_test_split(
    x_train_val, y_train_val, test_size=0.2, random_state=13
)
x_train, x_val, y_train, y_val = train_val_parts

#### train / validation / test split sizes:

In [17]:
# Row counts of the three splits; the emitted text keeps the original layout
# (leading blank line and four-space indentation).
split_sizes = (
    "\n"
    f"    training split: {len(x_train)}\n"
    f"    validation split: {len(x_val)}\n"
    f"    test split: {len(x_test)}\n"
    "    "
)
print(split_sizes)


    training split: 536
    validation split: 135
    test split: 75
    


## Save Processed Data

In [18]:
# Re-attach the target to each split and tag every row with the split it belongs to.
training = x_train.join(y_train).assign(**{"train/val": "train"})
validation = x_val.join(y_val).assign(**{"train/val": "val"})

# Stack both splits into a single frame, training rows first.
processed = pd.concat([training, validation])

In [19]:
# Shape of the combined frame plus a reproducible 10-row sample (fixed seed).
print(processed.shape)
processed.sample(10, random_state=42)

(671, 17)


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,RestingECG_Normal,ExerciseAngina_N,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,HeartDisease,train/val
827,0.306122,0.277778,0.247104,0.0,0.721805,0.047619,0,0,0,1,1,1,0,1,0,0,train
601,0.673469,0.444444,0.38417,0.0,0.406015,0.222222,1,0,0,1,1,0,0,1,0,1,train
757,0.44898,0.444444,0.285714,0.0,0.706767,0.111111,1,0,0,1,1,1,0,1,0,1,train
185,0.612245,0.62963,0.243243,1.0,0.172932,0.015873,1,0,0,1,0,1,0,1,0,1,val
132,0.571429,0.722222,0.584942,0.0,0.398496,0.333333,1,1,0,0,0,0,0,1,0,1,train
710,0.387755,0.166667,0.366795,0.0,0.368421,0.174603,1,1,0,0,0,0,0,1,0,1,train
215,0.040816,0.722222,0.293436,0.0,0.759398,0.015873,0,0,0,0,0,1,0,0,1,0,train
848,0.489796,0.333333,0.328185,0.0,0.691729,0.015873,1,1,0,0,1,0,0,0,1,1,train
46,0.183673,0.259259,0.266409,0.0,0.744361,0.015873,1,1,0,0,1,1,0,0,1,0,train
806,0.857143,0.490741,0.171815,0.0,0.421053,0.428571,1,1,0,0,1,0,1,0,0,1,train


In [None]:
# Disabled cell: would export the same 10-row sample to a PNG file through `dfi`
# (dataframe_image, imported at the top of the notebook).
# NOTE(review): the target path is relative to the working directory, unlike the
# other outputs, which are rooted at get_project_root() - confirm before re-enabling.
# dfi.export(
#     processed.sample(10, random_state=42),
#     "../output/tables/processed.png"
# )

In [21]:
# Write the scaled train/validation data, with its split tag, to the data folder.
processed_csv_path = get_project_root() / "data/heart-processed.csv"
processed.to_csv(processed_csv_path)