In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder, OneHotEncoder, OrdinalEncoder, TargetEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix

In [2]:
import warnings
# Silence every warning for the rest of the session. No category or module filter
# is given, so this also hides library deprecation and convergence warnings.
warnings.filterwarnings('ignore')

In [3]:
# Read the dataset from heart.csv (path is relative to the working directory)
# and display the resulting frame.
df = pd.read_csv("heart.csv")
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


### Cleaning Null

In [4]:
# Per-column dtype and non-null count, used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


There are no columns containing null values

### Removing outliers

In [5]:
# Summary statistics of the numeric columns, used to spot suspicious extremes.
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


Some columns may have outliers: RestingBP, Cholesterol, MaxHR, Oldpeak

In [6]:
# Keep rows whose RestingBP lies strictly inside mean ± 3·std,
# with both statistics taken from the frame as it stands before this filter.
bp_mean, bp_std = df.RestingBP.mean(), df.RestingBP.std()
df = df[(df.RestingBP < (bp_mean + 3 * bp_std)) & (df.RestingBP > (bp_mean - 3 * bp_std))]

In [7]:
# Keep rows whose Cholesterol lies strictly inside mean ± 3·std,
# with both statistics taken from the frame as it stands before this filter.
# NOTE(review): with the mean/std shown by describe() the lower bound is negative,
# so the Cholesterol == 0 rows pass this filter — confirm that is intended.
chol_mean, chol_std = df.Cholesterol.mean(), df.Cholesterol.std()
df = df[(df.Cholesterol < (chol_mean + 3 * chol_std)) & (df.Cholesterol > (chol_mean - 3 * chol_std))]

In [8]:
# Keep rows whose MaxHR lies strictly inside mean ± 3·std,
# with both statistics taken from the frame as it stands before this filter.
hr_mean, hr_std = df.MaxHR.mean(), df.MaxHR.std()
df = df[(df.MaxHR < (hr_mean + 3 * hr_std)) & (df.MaxHR > (hr_mean - 3 * hr_std))]

In [9]:
# Keep rows whose Oldpeak lies strictly inside mean ± 3·std,
# with both statistics taken from the frame as it stands before this filter.
op_mean, op_std = df.Oldpeak.mean(), df.Oldpeak.std()
df = df[(df.Oldpeak < (op_mean + 3 * op_std)) & (df.Oldpeak > (op_mean - 3 * op_std))]

In [10]:
# Display the frame after the four 3-sigma filters above.
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


### Encoder

There are some columns containing discrete data: 'Sex','ChestPainType','FastingBS','RestingECG','ExerciseAngina','ST_Slope'

In [11]:
# List the distinct values of every discrete feature to decide how to encode it.
columns_for_iterations = ['Sex','ChestPainType','FastingBS','RestingECG','ExerciseAngina','ST_Slope']
for feature in columns_for_iterations:
    label = '{}:'.format(feature)
    print(label, df[feature].unique())

Sex: ['M' 'F']
ChestPainType: ['ATA' 'NAP' 'ASY' 'TA']
FastingBS: [0 1]
RestingECG: ['Normal' 'ST' 'LVH']
ExerciseAngina: ['N' 'Y']
ST_Slope: ['Up' 'Flat' 'Down']


In [12]:
# Work on an independent copy so encoding/scaling does not touch the cleaned frame
# (DataFrame.copy is deep by default).
df2 = df.copy()

In [13]:
le = LabelEncoder()

# Replace each string-valued column in df2 with integer codes. The same encoder
# object is re-fitted on every column, so after the loop it only remembers the
# classes of the last one (ST_Slope).
for text_column in ('Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'):
    df2[text_column] = le.fit_transform(df[text_column])

df2

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,0,1,132,0,1.2,1,1
914,68,1,0,144,193,1,1,141,0,3.4,1,1
915,57,1,0,130,131,0,1,115,1,1.2,1,1
916,57,0,1,130,236,0,0,174,0,0.0,1,1


### Scaler

Normalization: the Oldpeak feature is min-max normalized because it displays a right-skewed distribution.

Standardization: the Age, RestingBP, Cholesterol and MaxHR features are standardized because these features are normally distributed.

In [14]:
mms = MinMaxScaler() # Normalization
ss = StandardScaler() # Standardization

# NOTE(review): both scalers are fitted on the full frame before any
# cross-validation split, so held-out folds influence the scaling statistics —
# confirm this is acceptable or move scaling into a Pipeline.
df2['Oldpeak'] = mms.fit_transform(df2[['Oldpeak']])

# Standardize each column on its own; the scaler is re-fitted per column.
for numeric_column in ['Age', 'RestingBP', 'Cholesterol', 'MaxHR']:
    df2[numeric_column] = ss.fit_transform(df2[[numeric_column]])
df2

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,-1.428154,1,1,0.465900,0.849636,0,1,1.384320,0,0.333333,2,0
1,-0.475855,0,2,1.634714,-0.168122,0,1,0.752973,0,0.500000,1,1
2,-1.745588,1,1,-0.118507,0.793612,0,2,-1.535661,0,0.333333,2,0
3,-0.581666,0,0,0.349019,0.149344,0,1,-1.141069,1,0.583333,1,1
4,0.053200,1,2,1.050307,-0.028064,0,1,-0.588640,0,0.333333,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,-0.899099,1,3,-1.287320,0.616205,0,1,-0.194048,0,0.533333,1,1
914,1.534554,1,0,0.699663,-0.046738,1,1,0.161085,0,0.900000,1,1
915,0.370633,1,0,-0.118507,-0.625646,0,1,-0.864854,1,0.533333,1,1
916,0.370633,0,1,-0.118507,0.354763,0,0,1.463238,0,0.333333,1,1


### Train

In [15]:
# Feature matrix: every column except the label. Target: the HeartDisease column.
X = df2.drop(columns='HeartDisease')
y = df2['HeartDisease']

In [16]:
# Candidate estimators and the hyper-parameter grid searched for each one.
# Each entry maps a short label to {'model': estimator, 'param': grid}.
models_params = {
    'svc': {
        'model': SVC(),
        'param' : {'C':[1,10,20], 'kernel':['rbf','linear'], 'gamma':['auto','scale']}  
    },
    'random_forest': {
        # Fixed seed so repeated runs report the same cross-validation scores.
        'model': RandomForestClassifier(random_state=0),
        'param' : {'n_estimators': [10,20,30]}
    },
    'logistic_regression': {
        'model': LogisticRegression(),
        'param': {'C': [1,5,10]}
    },
    'decision_tree': {
        # Fixed seed: tree construction breaks ties between splits at random.
        'model': DecisionTreeClassifier(random_state=0),
        'param': {'criterion': ['gini','entropy']}
    }     
}

In [17]:
best_scores = []

# Run a 5-fold grid search per candidate and record the winning configuration.
for spec in models_params.values():
    gs = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['param'],
        cv=5,
        return_train_score=False,
    )
    gs.fit(X, y)
    record = {
        'model': spec['model'],
        'best_param': gs.best_params_,
        'best_test_score': gs.best_score_,
    }
    best_scores.append(record)

In [18]:
# Tabulate the grid-search winners under a dedicated name so the cleaned dataset
# held in `df` is not overwritten and the earlier cells remain re-runnable.
results_df = pd.DataFrame(best_scores)
results_df

Unnamed: 0,model,best_param,best_test_score
0,SVC(),"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.81414
1,RandomForestClassifier(),{'n_estimators': 30},0.816387
2,LogisticRegression(),{'C': 10},0.805258
3,DecisionTreeClassifier(),{'criterion': 'entropy'},0.755171


In [19]:
# Bagged decision trees, scored with 5-fold cross-validation on X / y.
bagging = BaggingClassifier(
    # `estimator` replaces the `base_estimator` keyword, which scikit-learn
    # deprecated in 1.2 and removed in 1.4.
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,  # each tree draws a sample sized at 80% of the training rows
    oob_score=True,   # computed on every fit; not read anywhere in this cell
    random_state=0
)

cv_scores = cross_val_score(bagging,X,y,cv=5)
cv_scores.mean()

0.8152513966480447