In [1]:
# Import libraries
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters  #Register pandas formatters and converters with matplotlib.
from pylab import rcParams      # for customizing matplotlib graphs.
sns.set_style("whitegrid")    

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline  
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler


In [3]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier
from sklearn.neighbors     import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.svm           import SVC



In [4]:
df = pd.read_csv('heart.csv')
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace = True)

In [5]:
## chainging numerical col 'chol' into categorical column:
df['chol']= pd.cut(df['chol'], bins=[0,200,239,564], labels = [0,1,2]) #0-High, 1-Borderline high, 2-High
df['trtbps']= pd.cut(df['trtbps'], bins=[90,120,139,200], labels = [0,1,2])
df['age'] = pd.cut(df['age'], bins=[25,53,80], labels = [0, 1])
df = df.astype({'trtbps':'float64','chol':'float64', 'age':'float64'})

In [6]:
df['chol_bps'] = df['chol']+df['trtbps']
df.drop(['chol','trtbps'], axis=1, inplace=True)
df

Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,chol_bps
0,1.0,1,3,1,0,150,0,2.3,0,0,1,1,3.0
1,0.0,1,2,0,1,187,0,3.5,0,0,2,1,3.0
2,0.0,0,1,0,0,172,0,1.4,2,0,2,1,2.0
3,1.0,1,1,0,1,178,0,0.8,2,0,2,1,1.0
4,1.0,0,0,0,1,163,1,0.6,2,0,2,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,1.0,0,0,0,1,123,1,0.2,1,0,3,0,4.0
298,0.0,1,3,0,1,132,0,1.2,1,0,3,0,2.0
299,1.0,1,0,1,1,141,0,3.4,1,2,3,0,2.0
300,1.0,1,0,0,1,115,1,1.2,1,1,3,0,1.0


In [7]:
# data agumentation
new_df = df.copy()
def get_data(data):
    gen_data = data
    for restecg_values in data['restecg'].unique():
        new_data = gen_data[gen_data['restecg']== restecg_values]
        thalachh_std = new_data['thalachh'].std()
        oldpeak_std = new_data['oldpeak'].std()

        for i in gen_data[gen_data['restecg']== restecg_values].index:
            if np.random.randint(2)==1:
                gen_data['thalachh'].values[i] += thalachh_std/10   
            else:
                gen_data['thalachh'].values[i] -= thalachh_std/10
            if np.random.randint(2)==1:
                gen_data['oldpeak'].values[i] += oldpeak_std/10
            else:
                gen_data['oldpeak'].values[i] -= oldpeak_std/10
    return gen_data
print(df.head())  
std_data = get_data(new_df)
std_data.head()

   age  sex  cp  fbs  restecg  thalachh  exng  oldpeak  slp  caa  thall  \
0  1.0    1   3    1        0       150     0      2.3    0    0      1   
1  0.0    1   2    0        1       187     0      3.5    0    0      2   
2  0.0    0   1    0        0       172     0      1.4    2    0      2   
3  1.0    1   1    0        1       178     0      0.8    2    0      2   
4  1.0    0   0    0        1       163     1      0.6    2    0      2   

   output  chol_bps  
0       1       3.0  
1       1       3.0  
2       1       2.0  
3       1       1.0  
4       1       2.0  


Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,chol_bps
0,1.0,1,3,1,0,152,0,2.178532,0,0,1,1,3.0
1,0.0,1,2,0,1,189,0,3.394439,0,0,2,1,3.0
2,0.0,0,1,0,0,174,0,1.521468,2,0,2,1,2.0
3,1.0,1,1,0,1,175,0,0.905561,2,0,2,1,1.0
4,1.0,0,0,0,1,165,1,0.494439,2,0,2,1,2.0


In [8]:
x= df.drop(['output'], axis=1)
y = df.output
cat_fe = [ 'sex','cp','fbs','restecg','exng','slp','caa','thall','age']
num_fe = ['thalachh','oldpeak','chol_bps']

x_train, x_test, y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=0, stratify=y)

extra_sample = std_data.sample(std_data.shape[0]//3)
x_train = pd.concat([x_train, extra_sample.drop(['output'], axis=1)])
y_train = pd.concat([y_train, extra_sample['output']])


In [9]:
s_col = ['oldpeak', 'chol_bps']
scaler = MinMaxScaler()
for d in s_col:
    train_array = x_train[d].to_numpy()
    test_array = x_test[d].to_numpy()
    train_array = train_array.reshape(-1,1)
    test_array = test_array.reshape(-1,1)

    scaler.fit(train_array)
    x_train[d] = scaler.transform(train_array)
    x_test[d] = scaler.transform(test_array)

x_train.head()

Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,chol_bps
268,1.0,1,0,1,0,103,1,0.300879,0,0,3,0.75
190,1.0,1,0,0,0,131,1,0.405747,1,3,3,0.5
15,0.0,0,2,0,1,158,0,0.300879,1,0,2,0.25
223,1.0,1,0,0,1,126,1,0.510615,1,1,3,0.25
249,0.0,1,0,0,1,122,1,0.755308,1,3,3,1.0


In [10]:
num_4_classifiers = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-9999))
])
cat_4_classifiers = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-9999))
])
classifier_prepro = ColumnTransformer(transformers=[
    ('num', num_4_classifiers, num_fe),
    ('cat',cat_4_classifiers, cat_fe)
], remainder='drop')

In [11]:
tree_classifiers = {
  "KNN": KNeighborsClassifier(),
  "svm": SVC(kernel='linear', C=2.0),
  "Logistic": LogisticRegression(C= 0.5, penalty= 'l1', solver= 'liblinear'),
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees": ExtraTreesClassifier(n_estimators=100),
  "Random Forest": RandomForestClassifier(n_estimators=100),
  "AdaBoost": AdaBoostClassifier(n_estimators=100),
  "Skl GBM": GradientBoostingClassifier(n_estimators=100),
  "Skl HistGBM": HistGradientBoostingClassifier(max_iter=100),
  "XGBoost": XGBClassifier(n_estimators=100),
  "LightGBM": LGBMClassifier(n_estimators=100),
  "CatBoost":  CatBoostClassifier(depth= 9, iterations= 80, learning_rate= 0.04),
}
tree_classifiers= {name:make_pipeline(classifier_prepro, model) for name, model in tree_classifiers.items()}
tree_classifiers["AdaBoost"]

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=-9999,
                                                                                 strategy='constant'))]),
                                                  ['thalachh', 'oldpeak',
                                                   'chol_bps']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=-9999,
                                                                                 strategy='constant'))]),
                                                  ['sex', 'cp', 'fbs',
                                                   'restecg', 'exng', 's

In [12]:
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time()-start_time
    pred = model.predict(x_test)
    results = results.append({"Model":    model_name,
                            "Accuracy": accuracy_score(y_test, pred)*100,
                            "Bal Acc.": balanced_accuracy_score(y_test, pred)*100,
                            "Time":     total_time},
                            ignore_index=True)

results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')


    



0:	learn: 0.6834848	total: 54.5ms	remaining: 4.31s
1:	learn: 0.6741182	total: 55.8ms	remaining: 2.17s
2:	learn: 0.6626619	total: 57.1ms	remaining: 1.47s
3:	learn: 0.6532238	total: 58.1ms	remaining: 1.1s
4:	learn: 0.6457756	total: 60ms	remaining: 900ms
5:	learn: 0.6378859	total: 61.3ms	remaining: 756ms
6:	learn: 0.6289270	total: 62.6ms	remaining: 653ms
7:	learn: 0.6220203	total: 64ms	remaining: 576ms
8:	learn: 0.6156977	total: 65.7ms	remaining: 519ms
9:	learn: 0.6088227	total: 67.2ms	remaining: 470ms
10:	learn: 0.6004155	total: 68.5ms	remaining: 429ms
11:	learn: 0.5922462	total: 69.7ms	remaining: 395ms
12:	learn: 0.5836898	total: 71.1ms	remaining: 366ms
13:	learn: 0.5745324	total: 71.5ms	remaining: 337ms
14:	learn: 0.5637276	total: 72.1ms	remaining: 312ms
15:	learn: 0.5543219	total: 72.5ms	remaining: 290ms
16:	learn: 0.5480913	total: 74.1ms	remaining: 275ms
17:	learn: 0.5420255	total: 75.4ms	remaining: 260ms
18:	learn: 0.5350524	total: 76.7ms	remaining: 246ms
19:	learn: 0.5295905	total:

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Extra Trees,90.163934,90.097403,0.149809
2,Random Forest,90.163934,90.097403,0.218209
3,Skl HistGBM,90.163934,90.097403,0.647371
4,LightGBM,90.163934,90.097403,0.037062
5,CatBoost,90.163934,89.82684,0.198256
6,XGBoost,88.52459,88.582251,0.342066
7,svm,86.885246,86.255411,0.239163
8,Logistic,86.885246,86.525974,0.019277
9,Skl GBM,86.885246,86.796537,0.098552
10,Decision Tree,83.606557,83.225108,0.014587
