In [145]:
# Import libraries
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters  #Register pandas formatters and converters with matplotlib.
from pylab import rcParams      # for customizing matplotlib graphs.
sns.set_style("whitegrid")    

In [146]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline  
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler


In [147]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier
from sklearn.neighbors     import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.svm           import SVC

In [148]:
df = pd.read_csv('heart.csv')
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace = True)

In [149]:
## chainging numerical col 'chol' into categorical column:
df['chol']= pd.cut(df['chol'], bins=[0,200,239,564], labels = [0,1,2]) #0-High, 1-Borderline high, 2-High
df['trtbps']= pd.cut(df['trtbps'], bins=[94,120,139,200], labels = [0,1,2])
df['age'] = pd.cut(df['age'], bins=[29,53,80], labels = [0, 1])
df = df.astype({'trtbps':'float64','chol':'float64', 'age':'float64'})

In [150]:
df['chol_bps'] = df['chol']+df['trtbps']
df.drop(['chol','trtbps'], axis=1, inplace=True)
df

Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,chol_bps
0,1.0,1,3,1,0,150,0,2.3,0,0,1,1,3.0
1,0.0,1,2,0,1,187,0,3.5,0,0,2,1,3.0
2,0.0,0,1,0,0,172,0,1.4,2,0,2,1,2.0
3,1.0,1,1,0,1,178,0,0.8,2,0,2,1,1.0
4,1.0,0,0,0,1,163,1,0.6,2,0,2,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,1.0,0,0,0,1,123,1,0.2,1,0,3,0,4.0
298,0.0,1,3,0,1,132,0,1.2,1,0,3,0,2.0
299,1.0,1,0,1,1,141,0,3.4,1,2,3,0,2.0
300,1.0,1,0,0,1,115,1,1.2,1,1,3,0,1.0


In [151]:
# data agumentation
new_df = df.copy()
def get_data(data):
    gen_data = data
    for restecg_values in data['restecg'].unique():
        new_data = gen_data[gen_data['restecg']== restecg_values]
        thalachh_std = new_data['thalachh'].std()
        oldpeak_std = new_data['oldpeak'].std()

        for i in gen_data[gen_data['restecg']== restecg_values].index:
            if np.random.randint(2)==1:
                gen_data['thalachh'].values[i] += thalachh_std/10   
            else:
                gen_data['thalachh'].values[i] -= thalachh_std/10
            if np.random.randint(2)==1:
                gen_data['oldpeak'].values[i] += oldpeak_std/10
            else:
                gen_data['oldpeak'].values[i] -= oldpeak_std/10
    return gen_data
print(df.head())  
std_data = get_data(new_df)
std_data.head()

   age  sex  cp  fbs  restecg  thalachh  exng  oldpeak  slp  caa  thall  \
0  1.0    1   3    1        0       150     0      2.3    0    0      1   
1  0.0    1   2    0        1       187     0      3.5    0    0      2   
2  0.0    0   1    0        0       172     0      1.4    2    0      2   
3  1.0    1   1    0        1       178     0      0.8    2    0      2   
4  1.0    0   0    0        1       163     1      0.6    2    0      2   

   output  chol_bps  
0       1       3.0  
1       1       3.0  
2       1       2.0  
3       1       1.0  
4       1       2.0  


Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,chol_bps
0,1.0,1,3,1,0,152,0,2.178532,0,0,1,1,3.0
1,0.0,1,2,0,1,189,0,3.394439,0,0,2,1,3.0
2,0.0,0,1,0,0,174,0,1.521468,2,0,2,1,2.0
3,1.0,1,1,0,1,175,0,0.905561,2,0,2,1,1.0
4,1.0,0,0,0,1,160,1,0.705561,2,0,2,1,2.0


In [152]:
x= df.drop(['output'], axis=1)
y = df.output
cat_fe = [ 'sex','cp','fbs','restecg','exng','slp','caa','thall','age']
num_fe = ['thalachh','oldpeak','chol_bps']

x_train, x_test, y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=0, stratify=y)

extra_sample = std_data.sample(std_data.shape[0]//3)
x_train = pd.concat([x_train, extra_sample.drop(['output'], axis=1)])
y_train = pd.concat([y_train, extra_sample['output']])


In [153]:
s_col = ['oldpeak', 'chol_bps']
scaler = MinMaxScaler()
for d in s_col:
    train_array = x_train[d].to_numpy()
    test_array = x_test[d].to_numpy()
    train_array = train_array.reshape(-1,1)
    test_array = test_array.reshape(-1,1)

    scaler.fit(train_array)
    x_train[d] = scaler.transform(train_array)
    x_test[d] = scaler.transform(test_array)

x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[d] = scaler.transform(test_array)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[d] = scaler.transform(test_array)


Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,chol_bps
268,1.0,1,0,1,0,103,1,0.267187,0,0,3,0.75
190,1.0,1,0,0,0,131,1,0.360312,1,3,3,0.5
15,0.0,0,2,0,1,158,0,0.267187,1,0,2,0.25
223,1.0,1,0,0,1,126,1,0.453437,1,1,3,0.25
249,0.0,1,0,0,1,122,1,0.67073,1,3,3,1.0


In [154]:
num_4_classifiers = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-9999))
])
cat_4_classifiers = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-9999))
])
classifier_prepro = ColumnTransformer(transformers=[
    ('num', num_4_classifiers, num_fe),
    ('cat',cat_4_classifiers, cat_fe)
], remainder='drop')

In [155]:
tree_classifiers = {
  "KNN": KNeighborsClassifier(),
  "svm": SVC(),
  "Logistic": LogisticRegression(),
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees": ExtraTreesClassifier(n_estimators=100),
  "Random Forest": RandomForestClassifier(n_estimators=100),
  "AdaBoost": AdaBoostClassifier(n_estimators=100),
  "Skl GBM": GradientBoostingClassifier(n_estimators=100),
  "Skl HistGBM": HistGradientBoostingClassifier(max_iter=100),
  "XGBoost": XGBClassifier(n_estimators=100),
  "LightGBM": LGBMClassifier(n_estimators=100),
  "CatBoost":  CatBoostClassifier(n_estimators=100),
}
tree_classifiers= {name:make_pipeline(classifier_prepro, model) for name, model in tree_classifiers.items()}
tree_classifiers["AdaBoost"]

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=-9999,
                                                                                 strategy='constant'))]),
                                                  ['thalachh', 'oldpeak',
                                                   'chol_bps']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=-9999,
                                                                                 strategy='constant'))]),
                                                  ['sex', 'cp', 'fbs',
                                                   'restecg', 'exng', 's

In [156]:
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time()-start_time
    pred = model.predict(x_test)
    results = results.append({"Model":    model_name,
                            "Accuracy": accuracy_score(y_test, pred)*100,
                            "Bal Acc.": balanced_accuracy_score(y_test, pred)*100,
                            "Time":     total_time},
                            ignore_index=True)

results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')


    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Learning rate set to 0.053755
0:	learn: 0.6784573	total: 832us	remaining: 82.4ms
1:	learn: 0.6635479	total: 1.65ms	remaining: 81ms
2:	learn: 0.6497028	total: 2.42ms	remaining: 78.3ms
3:	learn: 0.6360039	total: 3.17ms	remaining: 76.1ms
4:	learn: 0.6212420	total: 3.9ms	remaining: 74.2ms
5:	learn: 0.6075403	total: 4.7ms	remaining: 73.6ms
6:	learn: 0.5953964	total: 5.83ms	remaining: 77.5ms
7:	learn: 0.5835276	total: 7.11ms	remaining: 81.8ms
8:	learn: 0.5728605	total: 7.9ms	remaining: 79.9ms
9:	learn: 0.5629665	total: 8.69ms	remaining: 78.2ms
10:	learn: 0.5523456	total: 9.53ms	remaining: 77.1ms
11:	learn: 0.5427822	total: 10.4ms	remaining: 76.1ms
12:	learn: 0.5331475	total: 11.2ms	remaining: 75ms
13:	learn: 0.5241529	total: 12.1ms	remaining: 74.1ms
14:	learn: 0.5174658	total: 12.7ms	remaining: 72.1ms
15:	learn: 0.5101811	total: 13.6ms	remaining: 71.3ms
16:	learn: 0.5019394	total: 14.3ms	remaining: 70ms
17:	learn: 0.4938620	total: 15.1ms	remaining: 68.8ms
18:	learn: 0.4853106	total: 15.9ms	r

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Logistic,90.163934,89.82684,0.048839
2,Extra Trees,90.163934,90.097403,13.903031
3,LightGBM,90.163934,90.097403,0.062832
4,Skl HistGBM,88.52459,88.582251,0.428853
5,CatBoost,88.52459,88.311688,0.210131
6,XGBoost,86.885246,87.0671,0.148603
7,Skl GBM,85.245902,85.551948,0.072806
8,Random Forest,83.606557,83.495671,0.163003
9,AdaBoost,80.327869,80.194805,0.138629
10,KNN,78.688525,78.138528,0.012001
