## Heart Disease Data Preprocessing

A model to predict heart disease from the dataset downloaded from https://www.kaggle.com/datasets/mahdifaour/heart-disease-dataset/data?select=Heart_Disease+%281%29.csv.

This file preprocesses data by removing empty values, converts categorical to numeric and splits into train and test datasets in 80-20 ratio.


In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [26]:
# Load the dataset for examination

df = pd.read_csv('data/Heart_Disease.csv')

In [27]:
df.describe()

Unnamed: 0,age,education,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,totChol,sysBP,diaBP,BMI,heartRate,glucose
count,3674.0,3674.0,3674.0,3674.0,3674.0,3674.0,3674.0,3674.0,3674.0,3674.0,3674.0,3674.0
mean,49.5773,1.984213,9.09227,0.030212,0.005716,0.310016,236.76184,132.38024,82.906505,25.783038,75.719652,81.769461
std,8.546068,1.022891,11.938399,0.171194,0.075397,0.462563,44.039295,22.04683,11.948024,4.056048,11.957171,23.884454
min,32.0,1.0,0.0,0.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0
25%,42.0,1.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,71.0
50%,49.0,2.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0
75%,56.0,3.0,20.0,0.0,0.0,1.0,263.0,143.5,89.5,27.99,82.0,87.0
max,70.0,4.0,70.0,1.0,1.0,1.0,600.0,295.0,142.5,56.8,143.0,394.0


In [28]:
# The column CHDRisk is what we will predict.
df.columns

Index(['sex', 'age', 'education', 'smokingStatus', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'CHDRisk'],
      dtype='object')

In [29]:
df.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,male,39,4,no,0,0,0,0,no,195,106.0,70.0,26.97,80,77,no
1,female,46,2,no,0,0,0,0,no,250,121.0,81.0,28.73,95,76,no
2,male,48,1,yes,20,0,0,0,no,245,127.5,80.0,25.34,75,70,no
3,female,61,3,yes,30,0,0,1,no,225,150.0,95.0,28.58,65,103,yes
4,female,46,3,yes,23,0,0,0,no,285,130.0,84.0,23.1,85,85,no


In [30]:
# Check if the data has empty or missing fields.
np.where(df.isna().sum(axis=1) == 1)

(array([3544, 3551, 3566, 3614, 3619, 3626, 3627, 3630, 3631, 3633, 3636,
        3640, 3644, 3645, 3649, 3650, 3652, 3654, 3656, 3657]),)

In [31]:
# We can drop these rows for our portfolio project.
df = df.dropna(axis=0)
df = df.reset_index(drop=True)
df.describe()


Unnamed: 0,age,education,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,totChol,sysBP,diaBP,BMI,heartRate,glucose
count,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0,3652.0
mean,49.567908,1.982749,9.093373,0.03012,0.00575,0.310789,236.761227,132.380203,82.90115,25.785758,75.746166,81.731654
std,8.547131,1.022067,11.932352,0.170942,0.075623,0.46288,44.078277,22.036518,11.931499,4.051199,11.968388,23.748448
min,32.0,1.0,0.0,0.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0
25%,42.0,1.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.0875,68.0,71.0
50%,49.0,2.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.405,75.0,78.0
75%,56.0,3.0,20.0,0.0,0.0,1.0,263.0,143.5,89.625,28.0225,82.0,87.0
max,70.0,4.0,70.0,1.0,1.0,1.0,600.0,295.0,142.5,56.8,143.0,394.0


In [32]:
# Check the categories for all categorical columns.
print(df['diabetes'].unique())
print(df['smokingStatus'].unique())
print(df['CHDRisk'].unique())

['no' 'yes']
['no' 'yes']
['no' 'yes']


In [33]:
# Next we will convert categorical variables smokingStatus, diabetes, sex and CHDRisk to numeric
df['diabetes'] = df['diabetes'].apply(lambda x: 0 if x == 'no' else 1)
df['smokingStatus'] = df['smokingStatus'].apply(lambda x: 0 if x == 'no' else 1)
df['CHDRisk'] = df['CHDRisk'].apply(lambda x: 0 if x == 'no' else 1)
df['sex'] = df['sex'].apply(lambda x: 0 if x == 'male' else 1)




In [34]:
df.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,0,39,4,0,0,0,0,0,0,195,106.0,70.0,26.97,80,77,0
1,1,46,2,0,0,0,0,0,0,250,121.0,81.0,28.73,95,76,0
2,0,48,1,1,20,0,0,0,0,245,127.5,80.0,25.34,75,70,0
3,1,61,3,1,30,0,0,1,0,225,150.0,95.0,28.58,65,103,1
4,1,46,3,1,23,0,0,0,0,285,130.0,84.0,23.1,85,85,0


In [35]:
# Next we split the dataset into train, validate and test

# Split 60% in train, 20% in test
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [36]:
# Save the preprocessed dataframe into a preprocessed CSV
df_train.to_csv('data/Heart_Disease_train.csv', index=False)
df_test.to_csv('data/Heart_Disease_test.csv', index=False)