In [3]:
#First, we need to import all the libraries we need
import sklearn
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [4]:
# Load the heart dataset from the local Excel workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file location exists.
heart_xlsx_path = r'C:/Users/hruss/Desktop/OneDriveDocs/OneDrive/Documents/GMU/Datasets/heart.xlsx'
df = pd.read_excel(heart_xlsx_path)

In [5]:
# Preview the last five rows of the dataset.
# Swap in df.head() to see the first rows instead, or df.info() for an
# overall view of column types and missing values.
df.tail(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
913,30,F,TA,170.0,237.0,0,ST,170,N,0.0,Up,0
914,43,F,TA,100.0,223.0,0,Normal,142,N,0.0,Up,0
915,62,F,TA,160.0,193.0,0,Normal,116,N,0.0,Up,0
916,34,M,TA,118.0,182.0,0,LVH,174,N,0.0,Up,0
917,35,F,TA,120.0,160.0,0,ST,185,N,0.0,Up,0


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,914.0,746.0,918.0,918.0,918.0,918.0
mean,55.255991,132.38512,266.942359,0.233115,136.809368,0.887364,0.553377
std,32.350905,18.539099,287.206851,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,85.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,208.0,0.0,120.0,0.0,0.0
50%,54.0,130.0,237.5,0.0,138.0,0.6,1.0
75%,60.0,140.0,276.0,0.0,156.0,1.5,1.0
max,660.0,200.0,6003.0,1.0,202.0,6.2,1.0


In [7]:
# A few things are wrong with the training data (see the describe() output above).
# Let's fix those before we go further:
#   * Age values go up to 660. Based on life expectancy, we cap them at 77.
#   * Cholesterol has some seriously high values. High risk is 600, so we cap at 800.
#   * RestingBP has values below 50 (the minimum is 0); we raise those to 50.
# Series.clip is vectorized and leaves NaN untouched, which matches the previous
# row-by-row comparisons (a NaN never compared above/below the limit, so it was kept).
# NOTE(review): clamping keeps these rows as extreme-but-valid readings; a value such
# as Age=660 looks like a data-entry error - confirm capping (rather than dropping or
# re-imputing) is what is intended.
MAX_AGE = 77
MAX_CHOLESTEROL = 800
MIN_RESTING_BP = 50

df["Cholesterol"] = df["Cholesterol"].clip(upper=MAX_CHOLESTEROL)
df["Age"] = df["Age"].clip(upper=MAX_AGE)
df["RestingBP"] = df["RestingBP"].clip(lower=MIN_RESTING_BP)
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,914.0,746.0,918.0,918.0,918.0,918.0
mean,53.568627,132.439825,248.561662,0.233115,136.809368,0.887364,0.553377
std,9.517886,18.219039,76.065829,0.423046,25.460334,1.06657,0.497414
min,28.0,50.0,85.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,208.0,0.0,120.0,0.0,0.0
50%,54.0,130.0,237.5,0.0,138.0,0.6,1.0
75%,60.0,140.0,276.0,0.0,156.0,1.5,1.0
max,77.0,200.0,800.0,1.0,202.0,6.2,1.0


In [9]:
# Count the missing values in each column.
df.isna().sum()

Age                 0
Sex                 0
ChestPainType       4
RestingBP           4
Cholesterol       172
FastingBS           0
RestingECG          0
MaxHR               0
ExerciseAngina      0
Oldpeak             0
ST_Slope            0
HeartDisease        0
dtype: int64

In [10]:
# Impute missing values: the most frequent category for ChestPainType, and the
# column mean for the numeric RestingBP and Cholesterol columns.
# The filled Series is assigned back to df rather than calling
# fillna(inplace=True) on df[col]: that chained in-place form raises a
# FutureWarning in pandas 2.x and no longer updates df once Copy-on-Write is on.
most_common_chest_pain = df['ChestPainType'].value_counts().index[0]
df['ChestPainType'] = df['ChestPainType'].fillna(most_common_chest_pain)
df['RestingBP'] = df['RestingBP'].fillna(df['RestingBP'].mean())
df['Cholesterol'] = df['Cholesterol'].fillna(df['Cholesterol'].mean())
# Re-check the null counts to confirm nothing is left missing.
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [19]:
# Ordinal encoding doesn't always work. In this case we want one-hot encoding
# (1 is hot/true, 0 is not/false). The encoder returns a sparse matrix because
# most of the entries are zeros.
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode; the same list is used to drop the originals below.
# NOTE(review): RestingECG also holds string categories (e.g. LVH/Normal/ST) but is
# left un-encoded here - confirm it is handled in a later step.
onehot_cols = ['Sex', 'ChestPainType', 'ExerciseAngina', 'ST_Slope']

cat_encoder = OneHotEncoder()
df_1hot = cat_encoder.fit_transform(df[onehot_cols])
enc_names = cat_encoder.get_feature_names_out()
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
# Reusing df.index keeps the encoded rows aligned with df for the index-based merge,
# even if df's index is not the default 0..n-1 range.
df_1hotdf = pd.DataFrame(df_1hot.toarray(), columns=enc_names, index=df.index)
# Encoded columns first, then every remaining (non-encoded) column of df.
df_tr = pd.merge(df_1hotdf, df.drop(columns=onehot_cols), left_index=True, right_index=True)
df_tr.head()

Unnamed: 0,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,Oldpeak,HeartDisease
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,62,160.0,164.0,0,LVH,145,6.2,1
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,55,140.0,217.0,0,Normal,111,5.6,1
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,58,114.0,318.0,0,ST,140,4.4,1
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,56,200.0,288.0,1,LVH,133,4.0,1
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,61,120.0,282.0,0,ST,135,4.0,1
