In [7]:
# imports
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', 40)
import numpy as np
import hyperopt
from hyperopt import hp, tpe, STATUS_OK, Trials
from sklearn.metrics import roc_auc_score,auc,roc_curve
import xgboost as xgb
from sklearn.model_selection import KFold,StratifiedKFold
from collections import Counter
import pickle

In [8]:
train=pd.read_csv('prepped_train.csv')

In [9]:
def get_class_weights(y):
    counter = Counter(y)
    majority = max(counter.values())
    return  {cls: round(float(majority)/float(count), 2) for cls, count in counter.items()}

class_weights = get_class_weights(train.Approved.values)
print(class_weights)

{0.0: 1.0, 1.0: 67.35}


In [10]:
X=train.drop(['Approved','ID'],axis=1)
Y=train['Approved']

In [12]:
# Hyperparameter tuning
param_dict={}
space = {
        'max_depth': hp.choice('max_depth', np.arange(1, 10, dtype=int)),
        'min_child_weight': hp.choice('min_child_weight', np.arange(1, 10, dtype=int)),
        'subsample': hp.uniform('subsample', 0.5, 1),
        'n_estimators': hp.choice('n_estimators', np.arange(100, 600, 10, dtype=int)),
        'eta': hp.uniform('eta', 0.01, 0.1),
        'gamma': hp.uniform('gamma', 0, 10),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'scale_pos_weight':hp.choice('scale_pos_weight', np.arange(10, 100,10, dtype=int))
    }


def objective(space):
    params = {
            'min_child_weight': space['min_child_weight'],
            'eta': space['eta'],
            'colsample_bytree': space['colsample_bytree'],
            'max_depth': space['max_depth'],
            'subsample': space['subsample'],
            'gamma': space['gamma'],
            'n_estimators': space['n_estimators'],
            'silent': 1,
            'verbose_eval': True,
    'objective':'binary:logistic',
    'nthread':8}
    print(params, file=open("output_xg.txt", "a"))
    print(params)
    skf = StratifiedKFold(n_splits=5)
    count=0
    for train_index, test_index in skf.split(X[0:X.shape[0]], Y):
        X_train, X_test = pd.DataFrame(X.values[train_index],columns=X.columns), pd.DataFrame(X.values[test_index],columns=X.columns)
        y_train, y_test = Y[train_index], Y[test_index]    
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        model = xgb.train(params,xgtrain,space['n_estimators'])
#         print(model.predict(xgb.DMatrix(X_test)))
        fp_rate, tp_rate, thresholds = roc_curve(y_test, model.predict(xgb.DMatrix(X_test)))
#         print(auc(fp_rate, tp_rate))
        count=count+auc(fp_rate, tp_rate)
    print('auc='+str(count/5))    
    print((count/5), file=open("output_xg.txt", "a"))
    param_dict[count/5]=params
    return count/5
try:
    
    trials = Trials()
    best = hyperopt.fmin(fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=1000,
        trials=trials
        )
except:
    pass

{'min_child_weight': 1, 'eta': 0.08912368539544886, 'colsample_bytree': 0.869421826512258, 'max_depth': 8, 'subsample': 0.8508558256698064, 'gamma': 1.9608643582925034, 'n_estimators': 540, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8321716035647263
{'min_child_weight': 9, 'eta': 0.05474873206174422, 'colsample_bytree': 0.7935898211862362, 'max_depth': 2, 'subsample': 0.7306648453610738, 'gamma': 3.2856515838487987, 'n_estimators': 430, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8479262623252121
{'min_child_weight': 7, 'eta': 0.06943664376715199, 'colsample_bytree': 0.6286453113885396, 'max_depth': 2, 'subsample': 0.8116624251147926, 'gamma': 6.461442340662041, 'n_estimators': 130, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8366179948400646
{'min_child_weight': 3, 'eta': 0.04422509193909629, 'colsample_bytree': 0.8394797941106702, 'max_depth': 8, 'subsampl

auc=0.7950998134530455
{'min_child_weight': 4, 'eta': 0.09104584576519123, 'colsample_bytree': 0.8765166827250157, 'max_depth': 1, 'subsample': 0.9197392397810105, 'gamma': 9.05220979331414, 'n_estimators': 100, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8242471482935004
{'min_child_weight': 6, 'eta': 0.03187526006663395, 'colsample_bytree': 0.6647438899906168, 'max_depth': 1, 'subsample': 0.7082420475681006, 'gamma': 7.649328041521769, 'n_estimators': 220, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8190897158520908
{'min_child_weight': 5, 'eta': 0.053216884476430776, 'colsample_bytree': 0.5914625667480629, 'max_depth': 2, 'subsample': 0.9978705839482935, 'gamma': 9.224768770863527, 'n_estimators': 290, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8359291071203208
{'min_child_weight': 3, 'eta': 0.08383236783452308, 'colsample_bytree': 0.6345962912799586, 'ma

auc=0.8426940990551636
{'min_child_weight': 3, 'eta': 0.09991455058215293, 'colsample_bytree': 0.9796407741059913, 'max_depth': 1, 'subsample': 0.5592684652804416, 'gamma': 9.55028179710727, 'n_estimators': 540, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8372921575197321
{'min_child_weight': 9, 'eta': 0.04301546542438467, 'colsample_bytree': 0.6401013033364213, 'max_depth': 5, 'subsample': 0.645033115902169, 'gamma': 4.388979356516864, 'n_estimators': 130, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8411389196983441
{'min_child_weight': 6, 'eta': 0.035518795870938924, 'colsample_bytree': 0.567994273027044, 'max_depth': 2, 'subsample': 0.7025390368879917, 'gamma': 8.298207730179815, 'n_estimators': 560, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8414178330981148
{'min_child_weight': 4, 'eta': 0.06603096235944729, 'colsample_bytree': 0.7940003651473289, 'max_

auc=0.7976235653245061
{'min_child_weight': 3, 'eta': 0.010172818039825861, 'colsample_bytree': 0.9048512616687113, 'max_depth': 6, 'subsample': 0.9229243390517078, 'gamma': 9.63172508330024, 'n_estimators': 590, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8415575744366206
{'min_child_weight': 3, 'eta': 0.07857835379271903, 'colsample_bytree': 0.9399725761360915, 'max_depth': 2, 'subsample': 0.828086449903077, 'gamma': 8.574204497494774, 'n_estimators': 450, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8435330314663446
{'min_child_weight': 7, 'eta': 0.046554850525037335, 'colsample_bytree': 0.9798109834297691, 'max_depth': 4, 'subsample': 0.8040224307570898, 'gamma': 5.9729935797322575, 'n_estimators': 290, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8505251097537672
{'min_child_weight': 6, 'eta': 0.02210300238828567, 'colsample_bytree': 0.8033563723200102, 'm

auc=0.8428970509443496
{'min_child_weight': 3, 'eta': 0.09396769664765986, 'colsample_bytree': 0.881617081409428, 'max_depth': 9, 'subsample': 0.8705808117202379, 'gamma': 8.426103556925215, 'n_estimators': 330, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8509380776202784
{'min_child_weight': 1, 'eta': 0.022354237288670704, 'colsample_bytree': 0.822431522481959, 'max_depth': 7, 'subsample': 0.8974879599887348, 'gamma': 8.147685952518575, 'n_estimators': 500, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8505588933360219
{'min_child_weight': 7, 'eta': 0.012979568729899171, 'colsample_bytree': 0.8986768617476633, 'max_depth': 5, 'subsample': 0.9732633499985697, 'gamma': 4.536213649565186, 'n_estimators': 130, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8196739108937219
{'min_child_weight': 6, 'eta': 0.031573823794430564, 'colsample_bytree': 0.9285782754948407, 'm

auc=0.827941140365494
{'min_child_weight': 5, 'eta': 0.02102755265484254, 'colsample_bytree': 0.9837498153507623, 'max_depth': 7, 'subsample': 0.8691977573440466, 'gamma': 3.9393364774940673, 'n_estimators': 290, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8489483059777715
{'min_child_weight': 4, 'eta': 0.04325956582198819, 'colsample_bytree': 0.9719311635102594, 'max_depth': 8, 'subsample': 0.9833382532640782, 'gamma': 6.005321513317476, 'n_estimators': 170, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8499397387332255
{'min_child_weight': 3, 'eta': 0.015816708367066612, 'colsample_bytree': 0.9579097032837143, 'max_depth': 1, 'subsample': 0.6404569080962089, 'gamma': 7.63103382578805, 'n_estimators': 370, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.814721265063071
{'min_child_weight': 1, 'eta': 0.012775285312002114, 'colsample_bytree': 0.9394946056156681, 'ma

auc=0.8491485532442798
{'min_child_weight': 3, 'eta': 0.011579074199534772, 'colsample_bytree': 0.8528389029806249, 'max_depth': 2, 'subsample': 0.9028321185270357, 'gamma': 7.446160795335561, 'n_estimators': 160, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8138797630168666
{'min_child_weight': 2, 'eta': 0.035868824761215515, 'colsample_bytree': 0.7646123096937119, 'max_depth': 1, 'subsample': 0.8561619485870553, 'gamma': 9.284987204133015, 'n_estimators': 540, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8347866045219783
{'min_child_weight': 1, 'eta': 0.015634297114535418, 'colsample_bytree': 0.9589333125457787, 'max_depth': 9, 'subsample': 0.9841072506727733, 'gamma': 9.036166570082885, 'n_estimators': 460, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8478722865574415
{'min_child_weight': 7, 'eta': 0.06804155154485032, 'colsample_bytree': 0.9168508213623402, 

auc=0.7807998855144292
{'min_child_weight': 1, 'eta': 0.04535289535119497, 'colsample_bytree': 0.6574212751401697, 'max_depth': 9, 'subsample': 0.9998129122186308, 'gamma': 8.740731013213924, 'n_estimators': 250, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8487023237091668
{'min_child_weight': 7, 'eta': 0.04152360080302765, 'colsample_bytree': 0.7479807522973553, 'max_depth': 1, 'subsample': 0.9464908617470222, 'gamma': 7.190791022753, 'n_estimators': 470, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8357258016572882
{'min_child_weight': 3, 'eta': 0.021074753614862912, 'colsample_bytree': 0.8319106101141597, 'max_depth': 5, 'subsample': 0.8759621524666451, 'gamma': 8.456675089144305, 'n_estimators': 590, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8476946402773284
{'min_child_weight': 6, 'eta': 0.029602336325343738, 'colsample_bytree': 0.9747403825101404, 'max

auc=0.781564292848221
{'min_child_weight': 6, 'eta': 0.08043560101012953, 'colsample_bytree': 0.8543209229110194, 'max_depth': 5, 'subsample': 0.8833124589598271, 'gamma': 1.019313360349805, 'n_estimators': 500, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8404401582534341
{'min_child_weight': 3, 'eta': 0.011218317997206847, 'colsample_bytree': 0.5508047230414996, 'max_depth': 7, 'subsample': 0.9283559720202222, 'gamma': 9.690112407796084, 'n_estimators': 110, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.7987723031936467
{'min_child_weight': 5, 'eta': 0.013545958184296462, 'colsample_bytree': 0.8959130529029535, 'max_depth': 1, 'subsample': 0.8551844178234758, 'gamma': 3.637874182848089, 'n_estimators': 530, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8190103234615623
{'min_child_weight': 1, 'eta': 0.010024130245084084, 'colsample_bytree': 0.9773187425717114, '

auc=0.8481423430545494
{'min_child_weight': 1, 'eta': 0.010025858139987413, 'colsample_bytree': 0.9116371782197259, 'max_depth': 1, 'subsample': 0.9089928380023746, 'gamma': 4.9762692466511895, 'n_estimators': 250, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.7928468875779252
{'min_child_weight': 5, 'eta': 0.0691388469505433, 'colsample_bytree': 0.97113000604045, 'max_depth': 1, 'subsample': 0.8638642278834687, 'gamma': 2.215992443772299, 'n_estimators': 110, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
auc=0.8198251359090433
{'min_child_weight': 4, 'eta': 0.06086206403419492, 'colsample_bytree': 0.8729464306521184, 'max_depth': 8, 'subsample': 0.9678003806134201, 'gamma': 9.28936266799306, 'n_estimators': 320, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}


In [6]:
# Best Single Model
# xg_params={'min_child_weight': 3, 'eta': 0.030786571985543867, 'colsample_bytree': 0.9982435533479505, 'max_depth': 7, 'subsample': 0.8386043415246714, 'gamma': 5.200936573465457, 'n_estimators': 440, 'silent': 1, 'verbose_eval': True, 'objective': 'binary:logistic', 'nthread': 8}
#auc=0.8533679286761162

In [16]:
# Train on full data
xgtrain = xgb.DMatrix(X, label=Y)
model = xgb.train(xg_params,xgtrain,xg_params['n_estimators'])

In [22]:
# Get test predictions
test=pd.read_csv('prepped_test.csv')
X_test=test.drop(['ID','Approved'],axis=1)
xgtest=xgb.DMatrix(X_test,feature_names=X_test.columns)
predictions=model.predict(xgtest)