## 4.1 Create dummies for categorical features

### 4.1.1 Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

### 4.1.2. Import data from EDA step

In [2]:
# Load the data set saved at the end of the EDA step
df = pd.read_csv('heart_EDA.csv')

In [3]:
# Preview the first rows to confirm the file loaded as expected
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,MaxHRCat
0,40,M,ATA,140,289,0,Normal,N,0.0,Up,0,0
1,49,F,NAP,160,180,0,Normal,N,1.0,Flat,1,0
2,37,M,ATA,130,283,0,ST,N,0.0,Up,0,0
3,48,F,ASY,138,214,0,Normal,Y,1.5,Flat,1,0
4,54,M,NAP,150,195,0,Normal,N,0.0,Up,0,0


### 4.1.3. Identify categorical variables and create dummies

In [4]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,Oldpeak,HeartDisease,MaxHRCat
count,746.0,746.0,746.0,746.0,746.0,746.0,746.0
mean,52.882038,133.022788,244.635389,0.16756,0.901609,0.477212,0.093834
std,9.505888,17.28275,59.153524,0.373726,1.072861,0.499816,0.291793
min,28.0,92.0,85.0,0.0,-0.1,0.0,0.0
25%,46.0,120.0,207.25,0.0,0.0,0.0,0.0
50%,54.0,130.0,237.0,0.0,0.5,0.0,0.0
75%,59.0,140.0,275.0,0.0,1.5,1.0,0.0
max,77.0,200.0,603.0,1.0,6.2,1.0,1.0


In [5]:
# List every column name so the categorical ones can be picked out
df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease',
       'MaxHRCat'],
      dtype='object')

In [6]:
# Columns to be treated as categorical and one-hot encoded

cat_vars = [
    'Sex',
    'ChestPainType',
    'RestingECG',
    'ExerciseAngina',
    'ST_Slope',
    'MaxHRCat',
]

In [7]:
# One-hot encode categorical features.
# drop='first' removes the first level of every feature, so a feature with
# k levels yields k-1 indicator columns.
#
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the current name first and fall back to the old one so
# the cell runs on both newer and older installations.

try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False)

# Dense array with one row per record and one column per retained level
enc_features = ohe.fit_transform(df[cat_vars])
enc_features.shape

(746, 10)

In [8]:
# Inspect the levels the encoder found for each feature in cat_vars
# (with drop='first', the first level listed for each feature gets no column)
ohe.categories_

[array(['F', 'M'], dtype=object),
 array(['ASY', 'ATA', 'NAP', 'TA'], dtype=object),
 array(['LVH', 'Normal', 'ST'], dtype=object),
 array(['N', 'Y'], dtype=object),
 array(['Down', 'Flat', 'Up'], dtype=object),
 array([0, 1], dtype=int64)]

In [9]:
# Column names for the encoded array, in encoder output order:
# one name per retained level of each categorical feature.
enc_labels = [
    'Male',
    'ChestPain_ATA',
    'ChestPain_NAP',
    'ChestPain_TA',
    'RestingECG_Normal',
    'RestingECG_ST',
    'ExerciseAngina_Y',
    'ST_Slope_Flat',
    'ST_Slope_Up',
    'MaxHRCat_1',
]

In [10]:
# Wrap the encoded array in a DataFrame with readable column names.
# Reusing df's index keeps every encoded row tied to its source row, so the
# column-wise concat that follows lines up correctly even if df ever carries
# a non-default index (e.g. after filtering rows upstream).
df_enc = pd.DataFrame(enc_features, columns = enc_labels, index = df.index)
df_enc.head()

Unnamed: 0,Male,ChestPain_ATA,ChestPain_NAP,ChestPain_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,MaxHRCat_1
0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
# The encoded columns are placed first, followed by the remaining columns.

df_without_cats = df.drop(columns=cat_vars)
df1 = pd.concat([df_enc, df_without_cats], axis=1)
df1.head()

Unnamed: 0,Male,ChestPain_ATA,ChestPain_NAP,ChestPain_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,MaxHRCat_1,Age,RestingBP,Cholesterol,FastingBS,Oldpeak,HeartDisease
0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,40,140,289,0,0.0,0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,49,160,180,0,1.0,1
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,37,130,283,0,0.0,0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,48,138,214,0,1.5,1
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,54,150,195,0,0.0,0


## 4.2 Splitting data into training and testing sets

Before scaling the numerical features, it is important to split the data set into training and test sets. Scaling the entire data set before splitting it would let statistics from the test rows influence the transformation, which is a form of test data leakage.

In [12]:
target_col = 'HeartDisease'

# Target vector y
y = df1[target_col]

# Feature matrix X: every column except the target
X = df1.drop(columns=[target_col])

In [13]:
# Hold out 25% of the rows as the test set and train on the remaining 75%.
# The fixed random_state makes the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## 4.3 Scaling of training and test data sets

We will use scikit-learn's `StandardScaler` to scale the numerical features. Both the training and test sets will be standardized (centered and scaled) using the mean and standard deviation of the training set only. This prevents information from the test data from leaking into training.

In [14]:
# Review the feature columns before choosing which ones to scale
X_train.columns

Index(['Male', 'ChestPain_ATA', 'ChestPain_NAP', 'ChestPain_TA',
       'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_Y',
       'ST_Slope_Flat', 'ST_Slope_Up', 'MaxHRCat_1', 'Age', 'RestingBP',
       'Cholesterol', 'FastingBS', 'Oldpeak'],
      dtype='object')

The aim of the scaling operation is to bring the values of many features into a comparable range. Thus, scaling will be performed only on continuous features.

In [15]:
# Continuous features that will be standardized

num_vars = [
    'Age',
    'RestingBP',
    'Cholesterol',
    'Oldpeak',
]

In [16]:
# Learn the mean and standard deviation from the training rows only ...
scaler = StandardScaler().fit(X_train[num_vars])

# ... then apply that same transformation to both splits.
X_train_scaled = scaler.transform(X_train[num_vars])
X_test_scaled = scaler.transform(X_test[num_vars])

In [17]:
# Put the scaled training array back into a labelled DataFrame
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=num_vars)

In [18]:
# Preview the standardized training features
X_train_scaled_df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,Oldpeak
0,-1.262083,0.998971,-1.27845,0.536384
1,-0.203721,0.244735,1.640773,0.723959
2,0.431297,-0.509501,0.285419,-0.589066
3,0.64297,-0.161392,-0.983053,0.067446
4,1.066315,0.998971,-0.374881,-0.870429


In [19]:
# Swap the raw numerical columns of X_train for their standardized versions.
# The index is reset so the remaining columns line up row-for-row with the
# scaled frame, which carries a fresh 0..n-1 index.

train_other_cols = X_train.drop(columns=num_vars).reset_index(drop=True)
X_train1 = pd.concat([train_other_cols, X_train_scaled_df], axis=1)

In [20]:
# Preview the final training features (encoded + standardized)
X_train1.head()

Unnamed: 0,Male,ChestPain_ATA,ChestPain_NAP,ChestPain_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,MaxHRCat_1,FastingBS,Age,RestingBP,Cholesterol,Oldpeak
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0,-1.262083,0.998971,-1.27845,0.536384
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0,-0.203721,0.244735,1.640773,0.723959
2,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0.431297,-0.509501,0.285419,-0.589066
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0,0.64297,-0.161392,-0.983053,0.067446
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0,1.066315,0.998971,-0.374881,-0.870429


In [21]:
# Build the test-set counterpart: standardized numerical columns replace the
# raw ones, with the index reset so both pieces line up row-for-row.

X_test_scaled_df = pd.DataFrame(data=X_test_scaled, columns=num_vars)
test_other_cols = X_test.drop(columns=num_vars).reset_index(drop=True)
X_test1 = pd.concat([test_other_cols, X_test_scaled_df], axis=1)

In [22]:
# Preview the final test features (encoded + standardized)
X_test1.head()

Unnamed: 0,Male,ChestPain_ATA,ChestPain_NAP,ChestPain_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,MaxHRCat_1,FastingBS,Age,RestingBP,Cholesterol,Oldpeak
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,-2.637955,-0.161392,-1.956127,-0.870429
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0.219624,-0.625537,1.310623,-0.870429
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1,-1.473756,1.579152,-1.695482,-0.870429
3,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,-0.309557,-0.741573,-1.330579,-0.870429
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0,1.913005,-0.161392,-0.409634,-0.870429


In [23]:
# Write each split to its own '.csv' file, omitting the row index

for csv_name, split_data in (
    ('heart_X_train.csv', X_train1),
    ('heart_X_test.csv', X_test1),
    ('heart_y_train.csv', y_train),
    ('heart_y_test.csv', y_test),
):
    split_data.to_csv(csv_name, index=False)