In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
x = df.drop(['output'], axis=1)
y = df.output
y.head()

0    1
1    1
2    1
3    1
4    1
Name: output, dtype: int64

In [4]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier
from sklearn.neighbors     import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.svm           import SVC

In [5]:
tree_classifiers = {
  "KNN": KNeighborsClassifier(),
  "svm": SVC(kernel='linear'),
  "Logistic": LogisticRegression(),
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees": ExtraTreesClassifier(n_estimators=100),
  "Random Forest": RandomForestClassifier(n_estimators=100),
  "AdaBoost": AdaBoostClassifier(n_estimators=100),
  "Skl GBM": GradientBoostingClassifier(n_estimators=100),
  "Skl HistGBM": HistGradientBoostingClassifier(max_iter=100),
  "XGBoost": XGBClassifier(n_estimators=100),
  "LightGBM": LGBMClassifier(n_estimators=100),
  "CatBoost":  CatBoostClassifier(n_estimators=100),
}

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict

In [7]:
x_train, x_val, y_train, y_val = train_test_split(x,y, test_size=0.2, random_state=0, stratify=y)

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    model.fit(x_train, y_train)
    
    # GET PREDICTIONS USING x_val
    pred = model.predict(x_val)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": accuracy_score(y_val, pred)*100,
                              "Bal Acc.": balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Learning rate set to 0.046433
0:	learn: 0.6790821	total: 158ms	remaining: 15.7s
1:	learn: 0.6657594	total: 162ms	remaining: 7.93s
2:	learn: 0.6543576	total: 165ms	remaining: 5.33s
3:	learn: 0.6444219	total: 169ms	remaining: 4.06s
4:	learn: 0.6329162	total: 173ms	remaining: 3.29s
5:	learn: 0.6221862	total: 177ms	remaining: 2.77s
6:	learn: 0.6107582	total: 181ms	remaining: 2.4s
7:	learn: 0.6009203	total: 185ms	remaining: 2.13s
8:	learn: 0.5915528	total: 189ms	remaining: 1.91s
9:	learn: 0.5844890	total: 193ms	remaining: 1.73s
10:	learn: 0.5774758	total: 196ms	remaining: 1.58s
11:	learn: 0.5699803	total: 202ms	remaining: 1.48s
12:	learn: 0.5634174	total: 204ms	remaining: 1.36s
13:	learn: 0.5535252	total: 205ms	remaining: 1.26s
14:	learn: 0.5457229	total: 207ms	remaining: 1.17s
15:	learn: 0.5388175	total: 209ms	remaining: 1.09s
16:	learn: 0.5319404	total: 210ms	remaining: 1.03s
17:	learn: 0.5259541	total: 212ms	remaining: 967ms
18:	learn: 0.5191738	total: 214ms	remaining: 911ms
19:	learn: 0

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Logistic,88.52459,88.311688,0.027494
2,CatBoost,88.52459,88.041126,0.587404
3,svm,86.885246,86.255411,0.29002
4,XGBoost,86.885246,86.525974,0.111017
5,Skl GBM,85.245902,85.010823,0.090207
6,Random Forest,83.606557,83.495671,0.129311
7,Extra Trees,81.967213,81.709957,0.103065
8,Skl HistGBM,80.327869,79.924242,0.24756
9,LightGBM,80.327869,79.924242,0.115545
10,AdaBoost,78.688525,78.138528,0.138048


In [8]:
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

# for model_name, model in tree_classifiers.items():
#     start_time = time.time()
        
#     # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
#     pred = cross_val_predict(model, x,y, cv=skf)

#     total_time = time.time() - start_time

#     results = results.append({"Model":    model_name,
#                               "Accuracy": accuracy_score(y, pred)*100,
#                               "Bal Acc.": balanced_accuracy_score(y, pred)*100,
#                               "Time":     total_time},
#                               ignore_index=True)
                              
                              
# results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
# results_ord.index += 1 
# results_ord.style.bar(subset=['Accuracy','Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')