## 4.1 Create dummies for categorical features

### 4.1.1 Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

### 4.1.2. Import data from EDA step

In [2]:
df = pd.read_csv('heart_EDA.csv')

In [3]:
df.head()

Unnamed: 0,AgeCat,SexCat,RestingBP,Cholesterol,FastingBS,Oldpeak,MaxHRCat,ST_SlopeCat,ChestPainTypeCat,RestingECGCat,ExerciseAnginaCat,HeartDisease
0,3,0,140,289,0,0.0,0,0,0,0,0,0
1,5,1,160,180,0,1.0,0,1,1,0,0,1
2,2,0,130,283,0,0.0,0,0,0,1,0,0
3,4,1,138,214,0,1.5,0,1,2,0,1,1
4,6,0,150,195,0,0.0,0,0,1,0,0,0


### 4.1.3. Identify categorical variables and create dummies

In [4]:
df.describe()

Unnamed: 0,AgeCat,SexCat,RestingBP,Cholesterol,FastingBS,Oldpeak,MaxHRCat,ST_SlopeCat,ChestPainTypeCat,RestingECGCat,ExerciseAnginaCat,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,5.486928,0.21024,132.396514,198.799564,0.233115,0.887364,0.08061,0.569717,1.45207,0.603486,0.404139,0.553377
std,1.881826,0.407701,18.514154,109.384145,0.423046,1.06657,0.272384,0.495386,0.851832,0.805968,0.490992,0.497414
min,1.0,0.0,0.0,0.0,0.0,-2.6,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.0,120.0,173.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,6.0,0.0,130.0,223.0,0.0,0.6,0.0,1.0,2.0,0.0,0.0,1.0
75%,7.0,0.0,140.0,267.0,0.0,1.5,0.0,1.0,2.0,1.0,1.0,1.0
max,9.0,1.0,200.0,603.0,1.0,6.2,1.0,1.0,3.0,2.0,1.0,1.0


In [5]:
df.columns

Index(['AgeCat', 'SexCat', 'RestingBP', 'Cholesterol', 'FastingBS', 'Oldpeak',
       'MaxHRCat', 'ST_SlopeCat', 'ChestPainTypeCat', 'RestingECGCat',
       'ExerciseAnginaCat', 'HeartDisease'],
      dtype='object')

In [6]:
# create a list of categorical features

cat_vars = ['AgeCat', 'SexCat', 'FastingBS', 'MaxHRCat', 'ST_SlopeCat', 'ChestPainTypeCat', 'RestingECGCat','ExerciseAnginaCat']

In [7]:
# One-hot encode categorical features

ohe = OneHotEncoder(drop='first', sparse=False)
enc_features = ohe.fit_transform(df[cat_vars])
enc_features.shape

(918, 18)

In [8]:
ohe.categories_

[array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64),
 array([0, 1], dtype=int64),
 array([0, 1], dtype=int64),
 array([0, 1], dtype=int64),
 array([0, 1], dtype=int64),
 array([0, 1, 2, 3], dtype=int64),
 array([0, 1, 2], dtype=int64),
 array([0, 1], dtype=int64)]

In [9]:
enc_labels = ['AgeCat_2', 'AgeCat_3', 'AgeCat_4', 'AgeCat_5', 'AgeCat_6','AgeCat_7', 'AgeCat_8', 'AgeCat_9',
              'SexCat_1', 
              'FastingBS_1', 
              'MaxHRCat_1', 
              'ST_SlopeCat_1', 
              'ChestPainTypeCat_1', 'ChestPainTypeCat_2', 'ChestPainTypeCat_3',
              'RestingECGCat_1', 'RestingECGCat_2',
              'ExerciseAnginaCat_1']

In [10]:
df_enc = pd.DataFrame(enc_features, columns=enc_labels)
df_enc.head()

Unnamed: 0,AgeCat_2,AgeCat_3,AgeCat_4,AgeCat_5,AgeCat_6,AgeCat_7,AgeCat_8,AgeCat_9,SexCat_1,FastingBS_1,MaxHRCat_1,ST_SlopeCat_1,ChestPainTypeCat_1,ChestPainTypeCat_2,ChestPainTypeCat_3,RestingECGCat_1,RestingECGCat_2,ExerciseAnginaCat_1
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Replace categorical features in the original dataframe with one-hot encoded features

df1 = df.drop(cat_vars, axis=1)
df1 = pd.concat([df1,df_enc], axis=1)
df1.head()

Unnamed: 0,RestingBP,Cholesterol,Oldpeak,HeartDisease,AgeCat_2,AgeCat_3,AgeCat_4,AgeCat_5,AgeCat_6,AgeCat_7,...,SexCat_1,FastingBS_1,MaxHRCat_1,ST_SlopeCat_1,ChestPainTypeCat_1,ChestPainTypeCat_2,ChestPainTypeCat_3,RestingECGCat_1,RestingECGCat_2,ExerciseAnginaCat_1
0,140,289,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,160,180,1.0,1,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,130,283,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,138,214,1.5,1,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,150,195,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Rearrange columns of df1 so that the target feature 'HeartDisease' is at the right end of dataframe

df1.columns

Index(['RestingBP', 'Cholesterol', 'Oldpeak', 'HeartDisease', 'AgeCat_2',
       'AgeCat_3', 'AgeCat_4', 'AgeCat_5', 'AgeCat_6', 'AgeCat_7', 'AgeCat_8',
       'AgeCat_9', 'SexCat_1', 'FastingBS_1', 'MaxHRCat_1', 'ST_SlopeCat_1',
       'ChestPainTypeCat_1', 'ChestPainTypeCat_2', 'ChestPainTypeCat_3',
       'RestingECGCat_1', 'RestingECGCat_2', 'ExerciseAnginaCat_1'],
      dtype='object')

In [13]:
cols = ['RestingBP', 'Cholesterol', 'Oldpeak', 'AgeCat_2', 'AgeCat_3', 'AgeCat_4', 'AgeCat_5', 'AgeCat_6', 'AgeCat_7', 
        'AgeCat_8', 'AgeCat_9', 'SexCat_1', 'FastingBS_1', 'MaxHRCat_1', 'ST_SlopeCat_1','ChestPainTypeCat_1', 
        'ChestPainTypeCat_2', 'ChestPainTypeCat_3', 'RestingECGCat_1', 'RestingECGCat_2', 'ExerciseAnginaCat_1', 'HeartDisease']

In [14]:
df1 = df1[cols]
df1.head()

Unnamed: 0,RestingBP,Cholesterol,Oldpeak,AgeCat_2,AgeCat_3,AgeCat_4,AgeCat_5,AgeCat_6,AgeCat_7,AgeCat_8,...,FastingBS_1,MaxHRCat_1,ST_SlopeCat_1,ChestPainTypeCat_1,ChestPainTypeCat_2,ChestPainTypeCat_3,RestingECGCat_1,RestingECGCat_2,ExerciseAnginaCat_1,HeartDisease
0,140,289,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,160,180,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1
2,130,283,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,138,214,1.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1
4,150,195,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0


## 4.2 Splitting data into training and testing sets

Before scaling the numerical features, it is important to split the data set into training and test sets. Scaling the entire data set before standardization may lead to test data leakage.

In [15]:
# Group independent features as X
X = df1.drop(['HeartDisease'], axis=1)

# Isolate target feature is y
y = df1['HeartDisease']

In [16]:
# Split the data 75% in training set and 25% in test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## 4.3 Scaling of training and test data sets

We will use 'StandardScaler' method to scale the numerical features. Standardization by centering and scaling will be performed on training and test data sets using the mean and standard deviation of the training data set. This step avoids test data leak in the training data.

In [17]:
X_train.columns

Index(['RestingBP', 'Cholesterol', 'Oldpeak', 'AgeCat_2', 'AgeCat_3',
       'AgeCat_4', 'AgeCat_5', 'AgeCat_6', 'AgeCat_7', 'AgeCat_8', 'AgeCat_9',
       'SexCat_1', 'FastingBS_1', 'MaxHRCat_1', 'ST_SlopeCat_1',
       'ChestPainTypeCat_1', 'ChestPainTypeCat_2', 'ChestPainTypeCat_3',
       'RestingECGCat_1', 'RestingECGCat_2', 'ExerciseAnginaCat_1'],
      dtype='object')

In [18]:
scaling_vars = ['RestingBP', 'Cholesterol', 'Oldpeak']

The aim of scaling operation is to bring values of a given feature within 0 to +/-1 range. Thus, scaling will be performed only on those feature whose values are not in 0 to +/-1 range.

In [19]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[scaling_vars])
X_test_scaled = scaler.transform(X_test[scaling_vars])

In [20]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns = scaling_vars)

In [21]:
X_train_scaled_df.head()

Unnamed: 0,RestingBP,Cholesterol,Oldpeak
0,1.183802,1.307314,1.900458
1,1.183802,-1.878,-0.834739
2,0.913811,0.096522,0.624033
3,-0.166155,-0.210833,-0.834739
4,-0.166155,0.990645,0.076994


In [22]:
# Drop unscaled features from X_train and replace them with scaled ones

X_train1 = X_train.drop(scaling_vars, axis=1).reset_index(drop = True)
X_train1[scaling_vars] = X_train_scaled_df

In [23]:
X_train1.head()

Unnamed: 0,AgeCat_2,AgeCat_3,AgeCat_4,AgeCat_5,AgeCat_6,AgeCat_7,AgeCat_8,AgeCat_9,SexCat_1,FastingBS_1,...,ST_SlopeCat_1,ChestPainTypeCat_1,ChestPainTypeCat_2,ChestPainTypeCat_3,RestingECGCat_1,RestingECGCat_2,ExerciseAnginaCat_1,RestingBP,Cholesterol,Oldpeak
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.183802,1.307314,1.900458
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.183802,-1.878,-0.834739
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.913811,0.096522,0.624033
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.166155,-0.210833,-0.834739
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.166155,0.990645,0.076994


In [24]:
# Replace unscaled features with scaled ones in X_test

X_test_scaled_df = pd.DataFrame(X_test_scaled, columns = scaling_vars)
X_test1 = X_test.drop(scaling_vars, axis=1).reset_index(drop = True)
X_test1[scaling_vars] = X_train_scaled_df

In [25]:
X_test.head()

Unnamed: 0,RestingBP,Cholesterol,Oldpeak,AgeCat_2,AgeCat_3,AgeCat_4,AgeCat_5,AgeCat_6,AgeCat_7,AgeCat_8,...,SexCat_1,FastingBS_1,MaxHRCat_1,ST_SlopeCat_1,ChestPainTypeCat_1,ChestPainTypeCat_2,ChestPainTypeCat_3,RestingECGCat_1,RestingECGCat_2,ExerciseAnginaCat_1
668,140,195,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,145,518,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
377,160,0,1.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
535,130,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
807,108,309,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
X_test1.head()

Unnamed: 0,AgeCat_2,AgeCat_3,AgeCat_4,AgeCat_5,AgeCat_6,AgeCat_7,AgeCat_8,AgeCat_9,SexCat_1,FastingBS_1,...,ST_SlopeCat_1,ChestPainTypeCat_1,ChestPainTypeCat_2,ChestPainTypeCat_3,RestingECGCat_1,RestingECGCat_2,ExerciseAnginaCat_1,RestingBP,Cholesterol,Oldpeak
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.183802,1.307314,1.900458
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.183802,-1.878,-0.834739
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.913811,0.096522,0.624033
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,-0.166155,-0.210833,-0.834739
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.166155,0.990645,0.076994


In [27]:
# Save each data set in a new '.csv' file

X_train1.to_csv('heart_X_train.csv', index=False)
X_test1.to_csv('heart_X_test.csv', index=False)
y_train.to_csv('heart_y_train.csv', index=False)
y_test.to_csv('heart_y_test.csv', index=False)