In [1]:
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import time
from sklearn.model_selection import KFold
import warnings
from sklearn.model_selection import cross_val_score
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

### Loading Data


In [2]:
# Read the raw sample (';'-separated, decimal comma) and report how long the read took.
start_time = time.time()
df = pd.read_csv(
    './data/cc_sample.txt',
    sep=';',
    index_col=False,
    decimal=',',
)
print(time.time() - start_time, " sec")

2.7515923976898193  sec


### Functions with xgboost

In [2]:
#Get Dataframe as input, return DataFrame with filled nulls
def fill_null(x, s):
    """Replace missing values in every column of ``x`` with ``s``.

    The frame is modified in place, column by column, and the same object
    is returned so the call can be used in an assignment.
    """
    for name in x.columns:
        filled = x[name].fillna(s)
        x[name] = filled
    return x

#Check type of variable and return numeric or cat
def check_type(x):
    """Classify a column as ``'numeric'`` or ``'cat'``.

    Missing values are ignored. A column with at most 4 distinct non-null
    values is always treated as categorical, whatever its dtype; otherwise
    it is ``'numeric'`` when its dtype is numeric and ``'cat'`` when it is not.
    """
    from pandas.api.types import is_numeric_dtype

    # Drop missing values before counting distinct levels.
    present = x[x.notnull()]
    has_many_levels = present.nunique() > 4
    if has_many_levels and is_numeric_dtype(present):
        return 'numeric'
    return 'cat'
    
#Input: DataFrame
#Output: DataFrame with one-hot variables
def cat_to_one_hot(x):
    """Replace every categorical column of ``x`` with one-hot indicator columns.

    A column counts as categorical when ``check_type`` returns ``'cat'``.
    Indicators are built with ``drop_first=True`` (the first level is
    dropped) and named ``<column>_<level>``. They are added to ``x`` and
    the source column is removed. ``x`` is modified in place and returned.
    """
    for name in x.columns:
        column = x[name]
        if check_type(column) != 'cat':
            continue
        dummies = pd.get_dummies(column, prefix=column.name, drop_first=True)
        for dummy_name in dummies.columns:
            x[dummy_name] = dummies[dummy_name]
        del x[name]
    return x

def gini(model, x, y):
    """Return the Gini coefficient (2 * ROC AUC - 1) of ``model`` on ``x`` / ``y``.

    The score is computed from the second column of ``model.predict_proba(x)``.
    """
    positive_proba = model.predict_proba(x)[:, 1]
    auc = roc_auc_score(y, positive_proba)
    return 2 * auc - 1

#Searching for optimal hyperparams using gridsearch
def gridcv_xgboost(params_grid,x,y):
    """Grid-search XGBoost hyperparameters with 3-fold CV and report Gini per candidate.

    Parameters
    ----------
    params_grid : dict
        Parameter grid handed to ``GridSearchCV``; its entries override the
        base estimator settings below.
    x, y
        Training features and target, passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination with the columns ``params``,
        ``dev_gini`` (2 * mean train AUC - 1) and ``val_gini``
        (2 * mean validation AUC - 1).
    """
    # Time the search locally instead of relying on a notebook-level
    # ``start_time`` that may be missing or stale.
    search_started = time.time()
    xgb_model = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
                               gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
                               min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
                               objective='binary:logistic', reg_alpha=30, reg_lambda=1,
                               scale_pos_weight=1, seed=42, silent=True, subsample=1,tree_method='gpu_hist')
    # Seed the shuffle so the folds, and therefore the scores, are reproducible.
    kfold = KFold(n_splits=3, shuffle=True, random_state=42)
    clf = GridSearchCV(xgb_model,params_grid,
                            verbose=2,
                            scoring='roc_auc',
                            cv=kfold,
                            n_jobs=-1,
                            # mean_train_score is read below; it is only
                            # present in cv_results_ when requested explicitly.
                            return_train_score=True)
    clf.fit(x,y)
    print(-search_started + time.time()," sec")
    # Kept from the original: silences FutureWarning for the rest of the session.
    warnings.simplefilter('ignore', FutureWarning)
    res = pd.DataFrame(clf.cv_results_)
    res["dev_gini"] = 2*res["mean_train_score"]-1
    res["val_gini"] = 2*res["mean_test_score"]-1
    return res[["params","dev_gini","val_gini"]]

#Build a single xgboost model using params
def xgb_build(params):
    """Hyperopt objective: train one XGBoost model with ``params`` and score it.

    Parameters
    ----------
    params : dict
        Hyperparameters sampled by hyperopt. They are laid over the default
        model settings below. ``n_estimators`` and ``max_depth`` are cast to
        ``int`` because ``hp.quniform`` produces floats. The dict itself is
        not modified.

    Returns
    -------
    dict
        ``loss`` (1 - AUC, the value hyperopt minimises), ``gini``
        (2 * AUC - 1) and ``status``.

    Notes
    -----
    Reads ``x_train``, ``y_train``, ``x_valid`` and ``y_valid`` from the
    notebook namespace.
    NOTE(review): the train/test split cell in this notebook creates
    ``x_test`` / ``y_test``, not ``x_valid`` / ``y_valid`` - confirm the
    validation set is defined before running the search.
    """
    print("Training with params: ")
    print(params)
    # Work on a copy so the dict owned by hyperopt stays untouched.
    overrides = dict(params)
    overrides['n_estimators'] = int(overrides.pop('n_estimators', 500))
    if 'max_depth' in overrides:
        overrides['max_depth'] = int(overrides['max_depth'])
    model_kwargs = dict(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
                        gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
                        min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
                        objective='binary:logistic', reg_alpha=30, reg_lambda=1,
                        scale_pos_weight=1, seed=42, silent=True, subsample=1)
    # Sampled values take precedence over the defaults; previously they were
    # ignored and every trial trained the same model.
    # NOTE(review): the search space in optimize() supplies both 'eta' and
    # 'learning_rate', which XGBoost treats as the same setting - confirm
    # which one is intended.
    model_kwargs.update(overrides)
    gbm_model = xgb.XGBClassifier(**model_kwargs)
    gbm_model.fit(x_train,y_train)
    # AUC needs a ranking score, so use the positive-class probability
    # rather than hard 0/1 predictions.
    predictions = gbm_model.predict_proba(x_valid)[:,1]
    score = roc_auc_score(y_valid, predictions)
    # TODO: Add the importance for the selected features
    print("\tScore {0}\n\n".format(score))
    gini = 2*score-1
    # hyperopt minimises 'loss', so report 1 - AUC; 'gini' is kept for reporting.
    return {'loss': 1-score, 'gini': gini, 'status': STATUS_OK}

#Find optimal params using hyperopt
def optimize(random_state=1, max_evals=250, trials=None):
    """Search the XGBoost hyperparameter space with hyperopt's TPE algorithm.

    Parameters
    ----------
    random_state : int
        Value placed in the search space under ``'seed'`` and therefore
        passed to every trained model.
    max_evals : int
        Number of parameter combinations to evaluate (default 250).
    trials : hyperopt.Trials or None
        Optional object that records every evaluation; ``None`` lets
        ``fmin`` manage its own.

    Returns
    -------
    dict
        The result of ``fmin``: the best value found for each sampled
        hyperparameter. NOTE(review): for ``hp.choice`` entries such as
        ``max_depth``, hyperopt reports the index of the chosen option
        rather than the option itself - confirm before reusing the value.
    """
    # To learn more about XGBoost parameters, head to this page:
    # https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
    space = {
        'n_estimators': hp.quniform('n_estimators', 50,100, 1),
        'learning_rate': hp.quniform('learning_rate', 0.08,0.12, 0.01),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        # hp.quniform would yield a float for max_depth, so hp.choice over
        # integers is used instead. np.arange(3, 4) contains only the value 3.
        'max_depth':  hp.choice('max_depth', np.arange(3, 4, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
        # Increase this number if you have more cores. Otherwise, remove it and it will default
        # to the maximum number.
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
        'seed': random_state
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(xgb_build, space, algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)
    return best


### Loading data and applying xgboost

In [109]:
#XGBoost with GridSearch and CV
start_time = time.time()
df = pd.read_csv('./data/cc_sample.txt', sep=';',index_col=False, decimal=',')
print(-start_time + time.time()," sec")

# Rows without a target cannot be used for training. They have to be removed
# BEFORE fill_null: once every null has been replaced by -1, a notnull()
# filter on the target no longer removes anything. Filtering the whole frame
# also keeps x and y the same length.
df = df[df['BAD_12_FLAG90_1'].notnull()].copy()
df = fill_null(df,-1)
df = cat_to_one_hot(df)
start_time = time.time()
# drop() already returns a new frame, so no explicit copy is needed.
x = df.drop(['CONTRACT_SRC_CODE','SCORE_FINAL','BAD_12_FLAG90_1'], axis=1)
y = df["BAD_12_FLAG90_1"]

params_grid = {'n_estimators': [500],
              'max_depth':[3],
              'learning_rate':[0.1],
              'reg_alpha':[30]}

#gridcv_xgboost(params_grid,x_sample,y)
# Fixed seed so the split is the same on every run.
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=42)
# Deliberately tiny model (2 trees of depth 2) so the dumped trees can be read
# by hand. It is fitted on the full x / y, not on the training split.
gbm_model = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
                               gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=2,
                               min_child_weight=1, missing=None, n_estimators=2, nthread=-1,
                               objective='binary:logistic', reg_alpha=30, reg_lambda=1,
                               scale_pos_weight=1, seed=42, silent=True, subsample=1)
gbm_model.fit(x,y)

2.7091481685638428  sec


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=2, min_child_weight=1, missing=None, n_estimators=2,
       n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
       reg_alpha=30, reg_lambda=1, scale_pos_weight=1, seed=42,
       silent=True, subsample=1)

### Save model to file and view model

In [110]:
import pickle
import os
import matplotlib
import matplotlib.pyplot as plt
# Make the Graphviz binaries visible on PATH. Add the entry only once so that
# re-running the cell does not keep growing PATH.
graphviz_bin = 'C:/Program Files (x86)/Graphviz2.38/bin/'
path_entries = os.environ.get("PATH", "").split(os.pathsep)
if graphviz_bin not in path_entries:
    os.environ["PATH"] = os.pathsep.join(path_entries + [graphviz_bin])
# save model to file
#pickle.dump(gbm_model, open("./models/pima.pickle.dat", "wb"))

# load model from file
#loaded_model = pickle.load(open("./models/pima.pickle.dat", "rb"))

# dump it to a text file (create the target directory first if it is missing)
os.makedirs('./models', exist_ok=True)
gbm_model.get_booster().dump_model('./models/xgb_model.txt', with_stats=True)
# read the contents of the file
with open('./models/xgb_model.txt', 'r') as f:
    txt_model = f.read()
print(txt_model)



booster[0]:
0:[LBT_ACCT_TOT_BAL_PREV_RUB_AMT<30.5649986] yes=1,no=2,missing=1,gain=1094.875,cover=157760.25
	1:[MIN_APP_DAYS<0.5] yes=3,no=4,missing=3,gain=161.640625,cover=14666.75
		3:leaf=-0.145004302,cover=8156
		4:leaf=-0.174092993,cover=6510.75
	2:[CNT_TR_CASH_1M<16.5] yes=5,no=6,missing=5,gain=38.21875,cover=143093.5
		5:leaf=-0.18950969,cover=140195
		6:leaf=-0.162200376,cover=2898.5
booster[1]:
0:[LBT_ACCT_TOT_BAL_PREV_RUB_AMT<64.0350037] yes=1,no=2,missing=1,gain=925.15625,cover=156398.203
	1:[CNT_AGR_WO_ARREAR_TO_CNT<0.516649961] yes=3,no=4,missing=3,gain=193.21875,cover=19860.5254
		3:leaf=-0.136471063,cover=11818.1885
		4:leaf=-0.162643105,cover=8042.33691
	2:[RATE_TR_ALL_L3_6M<0.711987019] yes=5,no=6,missing=5,gain=16.40625,cover=136537.688
		5:leaf=-0.173366696,cover=122632.648
		6:leaf=-0.161552235,cover=13905.0361



In [135]:
# Show the text dump of the first boosted tree (index 0); get_dump() is called
# without with_stats, so the dump carries no gain/cover figures.
gbm_model.get_booster().get_dump()[0]

'0:[LBT_ACCT_TOT_BAL_PREV_RUB_AMT<30.5649986] yes=1,no=2,missing=1\n\t1:[MIN_APP_DAYS<0.5] yes=3,no=4,missing=3\n\t\t3:leaf=-0.145004302\n\t\t4:leaf=-0.174092993\n\t2:[CNT_TR_CASH_1M<16.5] yes=5,no=6,missing=5\n\t\t5:leaf=-0.18950969\n\t\t6:leaf=-0.162200376\n'

In [116]:
# Rows that take the "no" branch at both splits of a tree, i.e. end up in leaf 6.
# The thresholds are copied by hand from the model dump printed earlier.
# b0: first tree (booster[0]); b1: second tree (booster[1]).
b0 = df.loc[
    df['LBT_ACCT_TOT_BAL_PREV_RUB_AMT'].ge(30.5649986)
    & df['CNT_TR_CASH_1M'].ge(16.5)
]
b1 = df.loc[
    df['LBT_ACCT_TOT_BAL_PREV_RUB_AMT'].ge(64.0350037)
    & df['RATE_TR_ALL_L3_6M'].ge(0.711987019)
]

In [114]:
# Leaf 6 values of the two trees, copied from the model dump. Their sum is the
# raw margin of a row that lands in both leaves; the logistic function turns
# that margin into a probability.
a0, a1 = -0.162200376, -0.161552235
1.0 / (1.0 + np.exp(-(a0 + a1)))

0.4197614810197186

In [118]:
# Inner join on the row index: keeps only rows present in both b0 and b1.
# Columns shared by the two frames get pandas' default _x / _y suffixes.
b1b0 = b0.merge(b1, left_index=True, right_index=True)

In [120]:
# Store the model's positive-class probability (second predict_proba column)
# for every row of x as a new df column.
df['y_pred'] = gbm_model.predict_proba(x)[:,1]

In [121]:
# Inner join on the row index: restricts df (now carrying y_pred) to the rows
# that are also in b1b0.
df_merge = df.merge(b1b0, left_index=True, right_index=True)

In [122]:
# Display the model's predictions for the merged rows, to compare with the
# probability computed by hand from the two leaf values.
df_merge['y_pred']

88        0.419762
313       0.419762
356       0.419762
649       0.419762
1087      0.419762
1190      0.419762
1253      0.419762
1314      0.419762
1475      0.419762
1583      0.419762
1607      0.419762
1785      0.419762
1862      0.419762
2176      0.419762
2278      0.419762
2481      0.419762
2767      0.419762
3392      0.419762
3480      0.419762
3512      0.419762
3617      0.419762
3635      0.419762
3654      0.419762
3736      0.419762
4102      0.419762
4252      0.419762
4827      0.419762
4998      0.419762
5227      0.419762
5305      0.419762
            ...   
591582    0.419762
594650    0.419762
594683    0.419762
596003    0.419762
596479    0.419762
597516    0.419762
600013    0.419762
600123    0.419762
600731    0.419762
600736    0.419762
603400    0.419762
604557    0.419762
605261    0.419762
605581    0.419762
607465    0.419762
608818    0.419762
609087    0.419762
610967    0.419762
611425    0.419762
616083    0.419762
617879    0.419762
617973    0.

In [119]:
# Mean of the target column within b1b0 (the _y suffix marks the copy that
# came from b1 in the merge).
b1b0['BAD_12_FLAG90_1_y'].mean()

0.17921830314585319

### Applying Random forest

In [None]:
#Random forest classifier
start_time = time.time()
from sklearn.ensemble import RandomForestClassifier
# Only the non-default settings are spelled out. The original also passed
# min_impurity_split=None and max_features='auto', which newer scikit-learn
# releases no longer accept; 'sqrt' is what 'auto' meant for classifiers.
rf_model = RandomForestClassifier(criterion='entropy', max_depth=2, max_features='sqrt',
                                  n_estimators=10, n_jobs=-1, random_state=0)

# Seed the shuffle so the folds, and therefore the scores, are reproducible.
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
clf = GridSearchCV(rf_model,{'max_depth':[10],
                            'n_estimators':[100]},
                            scoring='roc_auc',
                            cv=kfold,
                            n_jobs=-1,
                            # mean_train_score is read below; it is only
                            # present in cv_results_ when requested explicitly.
                            return_train_score=True)
# NOTE(review): the original fitted on x_sample, which no earlier cell
# defines. x is the feature frame built alongside y, so it is used here -
# confirm this is the intended input.
clf.fit(x,y)

print(-start_time + time.time()," sec")
warnings.simplefilter('ignore', FutureWarning)
res = pd.DataFrame(clf.cv_results_)
res["dev_gini"] = 2*res["mean_train_score"]-1
res["val_gini"] = 2*res["mean_test_score"]-1
res[["params","dev_gini","val_gini"]]

### Applying LGBM