In [8]:
# imports
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', 40)
import numpy as np
import hyperopt
from hyperopt import hp, tpe, STATUS_OK, Trials
from sklearn.metrics import roc_auc_score,auc,roc_curve
import xgboost as xgb
from sklearn.model_selection import KFold,StratifiedKFold
from collections import Counter
import pickle

In [9]:
# Load the training table from the CSV in the working directory.
train = pd.read_csv('prepped_train.csv')

In [10]:
def get_class_weights(y):
    """Return inverse-frequency class weights relative to the most common class.

    Parameters
    ----------
    y : iterable of hashable labels
        One label per sample.

    Returns
    -------
    dict
        Maps each distinct label to ``majority_count / label_count`` rounded to
        two decimals, so the most frequent label gets a weight of 1.0.

    Raises
    ------
    ValueError
        If ``y`` is empty (there is no majority count to take the max of).
    """
    frequencies = Counter(y)
    largest = float(max(frequencies.values()))
    weights = {}
    for label, n_seen in frequencies.items():
        weights[label] = round(largest / float(n_seen), 2)
    return weights

# Per-label weights for the 'stroke' column (the most frequent label maps to 1.0).
stroke_labels = train['stroke'].values
class_weights = get_class_weights(stroke_labels)
print(class_weights)

{0.0: 1.0, 1.0: 54.43}


In [11]:
# Target column, and the feature matrix with the target and the 'id' column removed.
Y = train['stroke']
X = train.drop(['stroke', 'id'], axis=1)

In [17]:
# Hyperparameter tuning
# Filled in by objective(): mean cross-validated AUC -> parameter dict of that trial.
param_dict = dict()

# Search space handed to hyperopt. Integer-valued settings are picked from an
# explicit grid with hp.choice; the remaining settings are drawn with hp.uniform.
space = dict(
    max_depth=hp.choice('max_depth', np.arange(1, 10, dtype=int)),
    min_child_weight=hp.choice('min_child_weight', np.arange(1, 10, dtype=int)),
    subsample=hp.uniform('subsample', 0.5, 1),
    n_estimators=hp.choice('n_estimators', np.arange(100, 600, 10, dtype=int)),
    eta=hp.uniform('eta', 0.01, 0.1),
    gamma=hp.uniform('gamma', 0, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)


def objective(space):
    """Score one hyperparameter sample with stratified 5-fold CV and return its loss.

    For every fold an XGBoost booster is trained on the training part and its
    ROC AUC is measured on the held-out part; the fold AUCs are averaged.
    The parameter dict and the mean AUC are printed, appended to
    ``output_xg.txt`` and stored in the module-level ``param_dict`` (keyed by
    the mean AUC). Reads the module-level ``X`` and ``Y``.

    Parameters
    ----------
    space : dict
        One sample of the search space. Must contain 'min_child_weight', 'eta',
        'colsample_bytree', 'max_depth', 'subsample', 'gamma' and 'n_estimators'.

    Returns
    -------
    float
        The NEGATED mean AUC. ``hyperopt.fmin`` minimises the objective, so
        returning the raw AUC would steer the search toward the worst model.
    """
    n_splits = 5
    params = {
        'min_child_weight': space['min_child_weight'],
        'eta': space['eta'],
        'colsample_bytree': space['colsample_bytree'],
        'max_depth': space['max_depth'],
        'subsample': space['subsample'],
        'gamma': space['gamma'],
        'n_estimators': space['n_estimators'],
        'silent': 1,
        'verbose_eval': True,
        'objective': 'binary:logistic',
        'nthread': 8,
    }
    # Use a context manager so the log file handle is closed after each write.
    with open("output_xg.txt", "a") as log_file:
        print(params, file=log_file)
    print(params)

    skf = StratifiedKFold(n_splits=n_splits)
    auc_total = 0.0
    for train_index, test_index in skf.split(X, Y):
        # split() yields positional indices, so select rows positionally;
        # this stays correct even if X / Y do not carry a default 0..n-1 index.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        model = xgb.train(params, xgtrain, num_boost_round=int(space['n_estimators']))
        auc_total += roc_auc_score(y_test, model.predict(xgb.DMatrix(X_test)))

    mean_auc = auc_total / n_splits
    print('auc='+str(mean_auc))
    with open("output_xg.txt", "a") as log_file:
        print(mean_auc, file=log_file)
    # Keep the positive AUC as the lookup key so the best trial is max(param_dict).
    param_dict[mean_auc] = params
    return -mean_auc
# Run the TPE search. The run is long (1000 evaluations), so it is kept
# best-effort: a manual interrupt or a failure ends the search without raising,
# but the reason is reported instead of being silently discarded. Evaluations
# finished before the stop remain available in `trials`.
trials = Trials()
best = None
try:
    best = hyperopt.fmin(fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=1000,
        trials=trials
        )
except KeyboardInterrupt:
    print('Search interrupted after ' + str(len(trials.trials)) + ' evaluations.')
except Exception as exc:
    # Surface the failure (e.g. a bug inside objective) rather than hiding it.
    print('Search stopped after ' + str(len(trials.trials)) + ' evaluations: ' + repr(exc))

{'min_child_weight': 1, 'learning_rate': 0.03783628629713371, 'colsample_bytree': 0.9826164591821482, 'max_depth': 4, 'subsample': 0.8344193040933546, 'gamma': 6.166887642129774, 'n_estimators': 440, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8606853018353006
{'min_child_weight': 4, 'learning_rate': 0.059069485786736375, 'colsample_bytree': 0.9096119325555434, 'max_depth': 9, 'subsample': 0.5786309418194371, 'gamma': 5.911501431274515, 'n_estimators': 300, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8520374304713687
{'min_child_weight': 2, 'learning_rate': 0.04344582897871535, 'colsample_bytree': 0.9309461485785986, 'max_depth': 9, 'subsample': 0.9978489488883706, 'gamma': 9.851507125454738, 'n_estimators': 220, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8622101314519274
{'min_child_weight': 4, 'learning_rate': 0.05813871366236947, 'colsample_bytree': 0.941

In [None]:
# {'min_child_weight': 4, 'eta': 0.08561247330656825, 'colsample_bytree': 0.6531780240267306, 'max_depth': 3, 'subsample': 0.9154275812011686, 'gamma': 3.636512715240638, 'n_estimators': 110, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
# auc=0.8625443411982147