## 4.1 Create dummies for categorical features

### 4.1.1 Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### 4.1.2. Import data from EDA step

In [2]:
df = pd.read_csv('heart_EDA.csv')

In [3]:
df.head()

Unnamed: 0,AgeCat,SexCat,RestingBP,Cholesterol,FastingBS,Oldpeak,MaxHRCat,ST_SlopeCat,ChestPainTypeCat,RestingECGCat,ExerciseAnginaCat,HeartDisease
0,3,0,140,289,0,0.0,0,0,0,0,0,0
1,5,1,160,180,0,1.0,0,1,1,0,0,1
2,2,0,130,283,0,0.0,0,0,0,1,0,0
3,4,1,138,214,0,1.5,0,1,2,0,1,1
4,6,0,150,195,0,0.0,0,0,1,0,0,0


### 4.1.3. Identify categorical variables and create dummies

In [4]:
df.describe()

Unnamed: 0,AgeCat,SexCat,RestingBP,Cholesterol,FastingBS,Oldpeak,MaxHRCat,ST_SlopeCat,ChestPainTypeCat,RestingECGCat,ExerciseAnginaCat,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,5.486928,0.21024,132.396514,198.799564,0.233115,0.887364,0.08061,0.569717,1.45207,0.603486,0.404139,0.553377
std,1.881826,0.407701,18.514154,109.384145,0.423046,1.06657,0.272384,0.495386,0.851832,0.805968,0.490992,0.497414
min,1.0,0.0,0.0,0.0,0.0,-2.6,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.0,120.0,173.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,6.0,0.0,130.0,223.0,0.0,0.6,0.0,1.0,2.0,0.0,0.0,1.0
75%,7.0,0.0,140.0,267.0,0.0,1.5,0.0,1.0,2.0,1.0,1.0,1.0
max,9.0,1.0,200.0,603.0,1.0,6.2,1.0,1.0,3.0,2.0,1.0,1.0


#### Dummies of all categorical variables have been already created in the EDA step. Thus, we can move on to the scaling of numerical features.

## 4.2 Splitting data into training and testing sets

Before scaling the numerical features, it is important to split the data set into training and test sets. Scaling the entire data set before standardization may lead to test data leakage.

In [5]:
# Group independent variables as X
X = df.drop(['HeartDisease'], axis=1)

# Isolate target feature is y
y = df['HeartDisease']

In [6]:
# Split the data 75% in training set and 25% in test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## 4.3 Scaling of training and test data sets

We will use 'StandardScaler' method to scale the numerical features. Standardization by centering and scaling will be performed on training and test data sets using the mean and standard deviation of the training data set. This step avoids test data leak in the training data.

In [7]:
X_train.columns

Index(['AgeCat', 'SexCat', 'RestingBP', 'Cholesterol', 'FastingBS', 'Oldpeak',
       'MaxHRCat', 'ST_SlopeCat', 'ChestPainTypeCat', 'RestingECGCat',
       'ExerciseAnginaCat'],
      dtype='object')

In [8]:
scaling_vars = ['AgeCat', 'RestingBP', 'Cholesterol', 'Oldpeak', 'ChestPainTypeCat', 'RestingECGCat']

The aim of scaling operation is to bring values of a given feature within 0-1 range. Thus, scaling will be performed only on those feature whose values are not in 0-1 range.

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[scaling_vars])
X_test_scaled = scaler.transform(X_test[scaling_vars])

In [10]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns = scaling_vars)

In [11]:
X_train_scaled_df.head()

Unnamed: 0,AgeCat,RestingBP,Cholesterol,Oldpeak,ChestPainTypeCat,RestingECGCat
0,0.248559,1.183802,1.307314,1.900458,0.653939,-0.750298
1,0.248559,1.183802,-1.878,-0.834739,-0.533159,0.493569
2,0.779641,0.913811,0.096522,0.624033,-0.533159,-0.750298
3,-0.282523,-0.166155,-0.210833,-0.834739,0.653939,-0.750298
4,0.248559,-0.166155,0.990645,0.076994,1.841038,-0.750298


In [12]:
# Drop unscaled features from X_train and replace them with scaled ones

X_train2 = X_train.drop(scaling_vars, axis=1).reset_index(drop = True)
X_train2[scaling_vars] = X_train_scaled_df

In [13]:
X_train2.head()

Unnamed: 0,SexCat,FastingBS,MaxHRCat,ST_SlopeCat,ExerciseAnginaCat,AgeCat,RestingBP,Cholesterol,Oldpeak,ChestPainTypeCat,RestingECGCat
0,0,1,0,1,1,0.248559,1.183802,1.307314,1.900458,0.653939,-0.750298
1,0,0,0,1,0,0.248559,1.183802,-1.878,-0.834739,-0.533159,0.493569
2,0,1,0,0,0,0.779641,0.913811,0.096522,0.624033,-0.533159,-0.750298
3,0,0,0,0,0,-0.282523,-0.166155,-0.210833,-0.834739,0.653939,-0.750298
4,1,0,0,1,0,0.248559,-0.166155,0.990645,0.076994,1.841038,-0.750298


In [14]:
# Replace unscaled features with scaled ones in X_test

X_test_scaled_df = pd.DataFrame(X_test_scaled, columns = scaling_vars)
X_test2 = X_test.drop(scaling_vars, axis=1).reset_index(drop = True)
X_test2[scaling_vars] = X_train_scaled_df

In [15]:
X_test.head()

Unnamed: 0,AgeCat,SexCat,RestingBP,Cholesterol,FastingBS,Oldpeak,MaxHRCat,ST_SlopeCat,ChestPainTypeCat,RestingECGCat,ExerciseAnginaCat
668,7,1,140,195,0,0.0,1,0,0,0,0
30,5,0,145,518,0,0.0,0,1,1,0,0
377,8,0,160,0,1,1.2,0,1,2,1,0
535,6,0,130,0,0,1.0,0,1,2,2,1
807,6,0,108,309,0,0.0,0,0,0,0,0


In [16]:
X_test2.head()

Unnamed: 0,SexCat,FastingBS,MaxHRCat,ST_SlopeCat,ExerciseAnginaCat,AgeCat,RestingBP,Cholesterol,Oldpeak,ChestPainTypeCat,RestingECGCat
0,1,0,1,0,0,0.248559,1.183802,1.307314,1.900458,0.653939,-0.750298
1,0,0,0,1,0,0.248559,1.183802,-1.878,-0.834739,-0.533159,0.493569
2,0,1,0,1,0,0.779641,0.913811,0.096522,0.624033,-0.533159,-0.750298
3,0,0,0,1,1,-0.282523,-0.166155,-0.210833,-0.834739,0.653939,-0.750298
4,0,0,0,0,0,0.248559,-0.166155,0.990645,0.076994,1.841038,-0.750298


In [17]:
# Save each data set in a new '.csv' file

X_train2.to_csv('heart_X_train.csv', index=False)
X_test2.to_csv('heart_X_test.csv', index=False)
y_train.to_csv('heart_y_train.csv', index=False)
y_test.to_csv('heart_y_test.csv', index=False)