In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import scale 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
df=pd.read_excel('heart_failure_dataset.xlsx')

In [3]:
categorical_data = df[['anaemia','diabetes','high_blood_pressure','sex','smoking']]
categorical_data.head()

Unnamed: 0,anaemia,diabetes,high_blood_pressure,sex,smoking
0,0,0,1,1,0
1,0,0,0,1,0
2,0,0,0,1,1
3,1,0,0,1,0
4,1,1,0,0,0


In [4]:
y = df['DEATH_EVENT']
X = df.drop(['time','DEATH_EVENT','anaemia','diabetes','high_blood_pressure','sex','smoking'], axis = 1)
cols = X.columns
index = X.index

In [5]:
scaler = StandardScaler()

In [6]:
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = cols, index = index)
X_scaled.head()

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium
0,1.192945,0.000166,-1.53056,0.01681648,0.490057,-1.504036
1,-0.491279,7.51464,-0.007077,7.53566e-09,-0.284552,-0.141976
2,0.350833,-0.449939,-1.53056,-1.038073,-0.0909,-1.731046
3,-0.912335,-0.486071,-1.53056,-0.5464741,0.490057,0.085034
4,0.350833,-0.435486,-1.53056,0.6517986,1.264666,-4.682176


In [7]:
X = pd.concat([X_scaled, categorical_data], axis = 1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.70, 
                                                    random_state = 1234)

In [9]:
X_train.head()

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium,anaemia,diabetes,high_blood_pressure,sex,smoking
90,0.771889,-0.532527,0.585389,-0.14705,-0.575031,-0.141976,0,1,0,1,1
98,-0.070223,-0.439615,-1.10737,0.559624,-0.187726,0.085034,1,1,1,0,0
121,0.435044,-0.530462,-0.007077,-1.038073,-0.381379,-0.141976,1,1,1,0,0
0,1.192945,0.000166,-1.53056,0.016816,0.490057,-1.504036,0,0,1,1,0
105,0.940312,-0.262051,-0.68418,3.662843,0.296405,0.312044,1,0,1,0,1


## Grid Search

In [10]:
params_grid_search = {'max_depth': [1,3,5],
                      'criterion' :['gini', 'entropy'],
                      'max_features': ['sqrt', 'log2', 'auto'],
                      'n_estimators': [100, 300, 500],
                      'min_samples_split': [0.01, 0.05, 0.1]}

In [11]:
model_grid_search = RandomForestClassifier(random_state=1234)

cv_model_grid_search = GridSearchCV(model_grid_search,
                                    params_grid_search,
                                    cv = 10,
                                    n_jobs = -1, 
                                    verbose = 2) 

In [12]:
cv_model_grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 162 candidates, totalling 1620 fits


GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=1234),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 3, 5],
                         'max_features': ['sqrt', 'log2', 'auto'],
                         'min_samples_split': [0.01, 0.05, 0.1],
                         'n_estimators': [100, 300, 500]},
             verbose=2)

In [13]:
print("Best parameters: " + str(cv_model_grid_search.best_params_))

Best parameters: {'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_split': 0.05, 'n_estimators': 100}


In [14]:
tuned_model_grid_search = RandomForestClassifier(max_depth = 3,
                                                 criterion = 'gini',
                                                 max_features = 'sqrt', 
                                                 min_samples_split = 0.05,
                                                 n_estimators = 100,
                                                 random_state = 1234)

tuned_model_grid_search.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, max_features='sqrt', min_samples_split=0.05,
                       random_state=1234)

In [15]:
y_prob_grid_search = tuned_model_grid_search.predict_proba(X_test)[:, 1]
roc_grid_search = roc_auc_score(y_test, y_prob_grid_search)
roc_grid_search

0.8063633600162123

## Random Search

In [16]:
params_random_search = {'max_depth': [1,2,3,4,5],
                        'criterion' :['gini', 'entropy'],
                        'max_features': ['sqrt', 'log2', 'auto'],
                        'n_estimators': [100, 200, 300, 400, 500],
                        'min_samples_split': [0.01, 0.03, 0.05, 0.07, 0.1]}

In [17]:
model_random_search = RandomForestClassifier(random_state = 1234)
cv_model_random_search = RandomizedSearchCV(model_random_search,
                                            params_random_search,
                                            n_iter = 50,
                                            cv = 10,
                                            n_jobs = -1,
                                            verbose = 2,
                                            random_state = 1234)

In [18]:
cv_model_random_search.fit(X_train,y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(random_state=1234),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [1, 2, 3, 4, 5],
                                        'max_features': ['sqrt', 'log2',
                                                         'auto'],
                                        'min_samples_split': [0.01, 0.03, 0.05,
                                                              0.07, 0.1],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500]},
                   random_state=1234, verbose=2)

In [19]:
print("Best parameters: " + str(cv_model_random_search.best_params_))

Best parameters: {'n_estimators': 100, 'min_samples_split': 0.05, 'max_features': 'log2', 'max_depth': 3, 'criterion': 'entropy'}


In [20]:
tuned_model_random_search = RandomForestClassifier(max_depth = 3,
                                                   criterion = 'entropy',
                                                   max_features = 'log2',
                                                   min_samples_split = 0.05,
                                                   n_estimators = 100,
                                                   random_state = 1234)

tuned_model_random_search.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=3, max_features='log2',
                       min_samples_split=0.05, random_state=1234)

In [21]:
y_prob_random_search = tuned_model_random_search.predict_proba(X_test)[:, 1]
roc_random_search = roc_auc_score(y_test, y_prob_random_search)
roc_random_search

0.7999797345222414

## Bayesian Hyperparameter Optimization with Hyperopt

In [22]:
params_hyperopt = {'max_depth' : hp.quniform('max_depth', 1, 5, 1),
                   'criterion': hp.choice('criterion', ['gini', 'entropy']),
                   'max_features': hp.choice('max_features', ['sqrt', 'log2', 'auto']),
                   'n_estimators': hp.quniform('n_estimators', 100, 500, 100),
                   'min_samples_split' : hp.quniform('min_samples_split', 0.01, 0.1, 0.01)}

In [23]:
def objective(params_hyperopt):
    
    model_hyperopt = RandomForestClassifier(max_depth = params_hyperopt['max_depth'],
                                            criterion = params_hyperopt['criterion'],
                                            max_features = params_hyperopt['max_features'],
                                            n_estimators = int(params_hyperopt['n_estimators']),
                                            min_samples_split = params_hyperopt['min_samples_split'],
                                            random_state = 1234)

    cv_model_random_search = cross_val_score(model_hyperopt,
                                             X = X_train,
                                             y = y_train,
                                             scoring = 'roc_auc',
                                             cv = 10).mean()

    return {'loss': -cv_model_random_search, 'status': STATUS_OK }

In [24]:
trials = Trials()
best_hyperopt = fmin(fn = objective,
                     space = params_hyperopt,
                     algo = tpe.suggest,
                     max_evals = 50,
                     trials = trials,
                     rstate = np.random.RandomState(1234))

100%|██████████| 50/50 [02:03<00:00,  2.48s/trial, best loss: -0.8305555555555555]


In [25]:
print("Best parameters: " + str(best_hyperopt))

Best parameters: {'criterion': 1, 'max_depth': 2.0, 'max_features': 2, 'min_samples_split': 0.01, 'n_estimators': 100.0}


In [26]:
tuned_model_hyperopt = RandomForestClassifier(max_depth = 2,
                                              criterion = 'entropy',
                                              max_features = 'auto',
                                              min_samples_split = 0.01,
                                              n_estimators = 100,
                                              random_state = 1234)

tuned_model_hyperopt.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=2, min_samples_split=0.01,
                       random_state=1234)

In [27]:
y_prob_hyperopt = tuned_model_hyperopt.predict_proba(X_test)[:, 1]
roc_hyperopt = roc_auc_score(y_test, y_prob_hyperopt)
print(roc_hyperopt)

0.7976492045799979
