In [15]:
import pandas as pd
import pycaret.classification as caret

In [16]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [17]:
from collections import Counter

counter = Counter(data['stroke'])
counter, counter[0]/counter[1], counter[0]/(counter[0] + counter[1]), counter[1]/(counter[0] + counter[1])

(Counter({1: 249, 0: 4861}),
 19.522088353413654,
 0.9512720156555773,
 0.0487279843444227)

In [18]:
95/5

19.0

In [19]:
from sklearn.model_selection import train_test_split

x = data.drop('stroke', axis=1)
y = data['stroke']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [20]:
train = pd.concat([x_train, y_train], axis=1)
test = pd.concat([x_test, y_test], axis=1)
train.shape, test.shape

((3577, 12), (1533, 12))

In [21]:
from imblearn.under_sampling import RandomUnderSampler

sampler = RandomUnderSampler()
session = caret.setup(data=train, target='stroke', session_id=42, silent=True,
                      normalize=True, 
                    #   pca=True,)
                      remove_multicollinearity=True, )
                      # remove_perfect_collinearity=True,
                      # fix_imbalance=True, fix_imbalance_method=sampler)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,stroke
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(3577, 12)"
5,Missing Values,True
6,Numeric Features,4
7,Categorical Features,7
8,Ordinal Features,False
9,High Cardinality Features,False


In [22]:
# topk = caret.compare_models(n_select=3, sort='F1')

In [54]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

weights = compute_class_weight(class_weight='balanced',
                               classes=np.unique(y),
                               y=y)
weights, weights[1]/weights[0]

(array([ 0.52561201, 10.26104418]), 19.522088353413658)

In [44]:
lr = caret.create_model('lr', class_weight='balanced', cross_validation=False)
rf = caret.create_model('rf', class_weight='balanced', cross_validation=False)
et = caret.create_model('et', class_weight='balanced', cross_validation=False)
lightgbm = caret.create_model('lightgbm', class_weight='balanced', cross_validation=False)
xgboost = caret.create_model('xgboost', scale_pos_weight=0.95/0.05, cross_validation=False)
catboost = caret.create_model('catboost', class_weights=[0.05, 0.95], cross_validation=False)

models = [lr, rf, et, lightgbm, xgboost, catboost]
topk = caret.compare_models(n_select=5, sort='F1', include=models)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Logistic Regression,0.7479,0.8315,0.7773,0.1302,0.2227,0.1557,0.2462,0.01
5,CatBoost Classifier,0.9209,0.8137,0.1652,0.1484,0.1527,0.1126,0.1141,2.62
3,Light Gradient Boosting Machine,0.9277,0.817,0.1053,0.1314,0.113,0.0772,0.0789,0.018
4,Extreme Gradient Boosting,0.9365,0.7974,0.0773,0.1732,0.102,0.0731,0.082,0.056
2,Extra Trees Classifier,0.9513,0.7489,0.0356,0.1233,0.0536,0.0458,0.0546,0.074
1,Random Forest Classifier,0.9533,0.794,0.0,0.0,0.0,-0.0007,-0.0014,0.079


In [45]:
topk_tuned = [caret.tune_model(model, optimize='F1', 
                               choose_better=True) for model in topk]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7331,0.8598,0.8333,0.1333,0.2299,0.1607,0.2617
1,0.7251,0.7993,0.6667,0.1096,0.1882,0.1156,0.1854
2,0.7131,0.8007,0.75,0.1154,0.2,0.1277,0.2127
3,0.736,0.841,0.8182,0.1233,0.2143,0.1492,0.2483
4,0.704,0.8328,0.9091,0.1205,0.2128,0.1464,0.2629
5,0.724,0.8503,0.9091,0.1282,0.2247,0.1599,0.2765
6,0.708,0.8667,0.8182,0.1125,0.1978,0.1305,0.2291
7,0.72,0.8759,0.9167,0.1375,0.2391,0.1698,0.2872
8,0.712,0.8906,0.8333,0.125,0.2174,0.1461,0.2471
9,0.772,0.757,0.5833,0.1186,0.1972,0.1276,0.1837


In [46]:
blender = caret.blend_models(topk_tuned, optimize='F1', choose_better=True)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8167,0.8811,0.6667,0.16,0.2581,0.1961,0.2623
1,0.8367,0.8389,0.5833,0.1628,0.2545,0.1943,0.245
2,0.7968,0.8431,0.6667,0.1455,0.2388,0.174,0.2424
3,0.848,0.8281,0.7273,0.186,0.2963,0.2433,0.3157
4,0.816,0.8692,0.7273,0.1569,0.2581,0.2002,0.2786
5,0.796,0.8631,0.8182,0.1552,0.2609,0.2018,0.2979
6,0.876,0.8631,0.5455,0.1875,0.2791,0.2285,0.2681
7,0.812,0.8627,0.8333,0.1818,0.2985,0.2385,0.3325
8,0.848,0.909,0.8333,0.2174,0.3448,0.2908,0.3763
9,0.828,0.7374,0.5,0.1395,0.2182,0.1547,0.1952


In [47]:
caret.predict_model(blender, data=test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8382,0.8387,0.6517,0.2109,0.3187,0.2532,0.3056


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Label,Score
4688,40041,Male,31.0,0,0,No,Self-employed,Rural,64.85,23.0,Unknown,0,0,0.8108
4478,55244,Male,40.0,0,0,Yes,Self-employed,Rural,65.29,28.3,never smoked,0,0,0.7856
3849,70992,Female,8.0,0,0,No,children,Urban,74.42,22.5,Unknown,0,0,0.8212
4355,38207,Female,79.0,1,0,Yes,Self-employed,Rural,76.64,19.5,never smoked,0,1,0.6160
3826,8541,Female,75.0,0,0,Yes,Govt_job,Rural,94.77,27.2,never smoked,0,1,0.5556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818,7663,Male,20.0,0,0,No,Govt_job,Rural,106.97,27.9,formerly smoked,0,0,0.8252
4829,66067,Male,66.0,0,0,Yes,Private,Rural,67.92,31.1,formerly smoked,0,0,0.5909
611,30753,Male,42.0,0,0,Yes,Govt_job,Urban,93.79,27.2,never smoked,0,0,0.7844
3082,66270,Female,57.0,0,0,Yes,Private,Rural,69.40,24.0,Unknown,0,0,0.6068


In [48]:
best_model = caret.finalize_model(blender)
caret.predict_model(best_model, data=test)

bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8311,0.8482,0.6404,0.2007,0.3056,0.2383,0.2909


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Label,Score
4688,40041,Male,31.0,0,0,No,Self-employed,Rural,64.85,23.0,Unknown,0,0,0.8079
4478,55244,Male,40.0,0,0,Yes,Self-employed,Rural,65.29,28.3,never smoked,0,0,0.7833
3849,70992,Female,8.0,0,0,No,children,Urban,74.42,22.5,Unknown,0,0,0.8202
4355,38207,Female,79.0,1,0,Yes,Self-employed,Rural,76.64,19.5,never smoked,0,1,0.6826
3826,8541,Female,75.0,0,0,Yes,Govt_job,Rural,94.77,27.2,never smoked,0,1,0.5444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818,7663,Male,20.0,0,0,No,Govt_job,Rural,106.97,27.9,formerly smoked,0,0,0.8230
4829,66067,Male,66.0,0,0,Yes,Private,Rural,67.92,31.1,formerly smoked,0,0,0.5773
611,30753,Male,42.0,0,0,Yes,Govt_job,Urban,93.79,27.2,never smoked,0,0,0.7849
3082,66270,Female,57.0,0,0,Yes,Private,Rural,69.40,24.0,Unknown,0,0,0.6257


In [51]:
xgboost.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': 0,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.300000012,
 'max_bin': 256,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 6,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 100,
 'n_jobs': -1,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 42,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 18.999999999999996,
 'subsample': 1,
 'tree_method': 'auto',
 'validate_parameters': 1,
 'verbosity': 0}

In [56]:
best_model.estimators

[('lr',
  LogisticRegression(C=2.214, class_weight='balanced', dual=False,
                     fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                     max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('catboost', <catboost.core.CatBoostClassifier at 0x7f76c8bbb3d0>),
 ('lightgbm',
  LGBMClassifier(bagging_fraction=0.8, bagging_freq=7, boosting_type='gbdt',
                 class_weight='balanced', colsample_bytree=1.0,
                 feature_fraction=0.8, importance_type='split', learning_rate=0.4,
                 max_depth=-1, min_child_samples=16, min_child_weight=0.001,
                 min_split_gain=0.7, n_estimators=100, n_jobs=-1, num_leaves=90,
                 objective=None, random_state=42, reg_alpha=1, reg_lambda=3,
                 silent='warn', subsample=1.0, subsample_for_bin=200000,
                 subsample_freq=