In [20]:
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import time
from sklearn.model_selection import KFold
import warnings
from sklearn.model_selection import cross_val_score
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [21]:
start_time = time.time()
df = pd.read_csv('C:\\users\\booba\\cc_sample.txt', sep=';',index_col=False, decimal=',') 
print(-start_time + time.time()," sec")

5.405526876449585  sec


In [23]:
#Get Dataframe as input, return DataFrame with filled nulls
def fill_null(x,s):
    for i in x.columns:
        x[i] = x[i].fillna(s)
    return x

#Check type of variable and return numeric or cat
def check_type(x):
    from pandas.api.types import is_string_dtype
    from pandas.api.types import is_numeric_dtype   
    #Удаляем пустые значения
    x = x[x.notnull()]
    #Если число различных значений меньше 4, то тип-категориальный
    if x.nunique()<=4: return 'cat'
    elif is_numeric_dtype(x): return 'numeric'
    else: return 'cat'
    
#Input: DataFrame
#Output: DataFrame with one-hot variables
def cat_to_one_hot(x):
    for col in x.columns:
        if check_type(x[col])=='cat':
            tmp = pd.get_dummies(x[col],prefix=x[col].name,drop_first=True)
            for i in tmp.columns:
                x[i] = tmp[i]
            del x[col]
    return x

def gini(model,x,y):
    gini =  2*roc_auc_score(y,model.predict_proba(x)[:,1])-1
   # print('GINI = ',2*roc_auc_score(y,model.predict_proba(x)[:,1])-1)   
    return gini

#Searching for optimal hyperparams using gridsearch
def gridcv_xgboost(params_grid,x,y):
    xgb_model = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
                               gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
                               min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
                               objective='binary:logistic', reg_alpha=30, reg_lambda=1,
                               scale_pos_weight=1, seed=42, silent=True, subsample=1,tree_method='gpu_hist')
    kfold = KFold(n_splits=3, shuffle=True)
    clf = GridSearchCV(xgb_model,params_grid, 
                            verbose=2, 
                            scoring='roc_auc',
                            cv=kfold,
                            n_jobs=-1)
    clf.fit(x,y)
    print(-start_time + time.time()," sec")
    warnings.simplefilter('ignore', FutureWarning)
    res = pd.DataFrame(clf.cv_results_)
    res["dev_gini"] = 2*res["mean_train_score"]-1
    res["val_gini"] = 2*res["mean_test_score"]-1
    return res[["params","dev_gini","val_gini"]]

#Build single xgoost model using params
def xgb_build(params):
    print("Training with params: ")
    print(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    gbm_model = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
                               gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
                               min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
                               objective='binary:logistic', reg_alpha=30, reg_lambda=1,
                               scale_pos_weight=1, seed=42, silent=True, subsample=1)
    gbm_model.fit(x_train,y_train)
    predictions = gbm_model.predict(x_valid)
    score = roc_auc_score(y_valid, predictions)
    # TODO: Add the importance for the selected features
    print("\tScore {0}\n\n".format(score))
    # The score function should return the loss (1-score)
    # since the optimize function looks for the minimum
    gini = 2*score-1
    return {'gini': gini, 'status': STATUS_OK}

#Find optimal params using hyperopt
def optimize(
             #trials, 
             random_state=1):
    """
    This is the optimization function that given a space (space here) of 
    hyperparameters and a scoring function (score here), finds the best hyperparameters.
    """
    # To learn more about XGBoost parameters, head to this page: 
    # https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
    space = {
        'n_estimators': hp.quniform('n_estimators', 50,100, 1),
        'learning_rate': hp.quniform('learning_rate', 0.08,0.12, 0.01),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        # A problem with max_depth casted to float instead of int with
        # the hp.quniform method.
        'max_depth':  hp.choice('max_depth', np.arange(3, 4, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
        # Increase this number if you have more cores. Otherwise, remove it and it will default 
        # to the maxium number. 
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
        'seed': random_state
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(xgb_build, space, algo=tpe.suggest, 
                # trials=trials, 
                max_evals=250)
    return best


In [18]:
#XGBoost with GridSearch and CV
df = fill_null(df,-1)
df = cat_to_one_hot(df)
start_time = time.time()
x_sample=df.copy()
x_sample = x_sample.drop(['CONTRACT_SRC_CODE','SCORE_FINAL','BAD_12_FLAG90_1'], axis=1)
y = df["BAD_12_FLAG90_1"][df['BAD_12_FLAG90_1'].notnull()] 

#Find optimal parameters with hyperopt. Before aplly need to initiate x, y
x_train, x_valid, y_train, y_valid = train_test_split(x_sample,y)
optimize()


Training with params: 
{'booster': 'gbtree', 'colsample_bytree': 0.9, 'eta': 0.375, 'eval_metric': 'auc', 'gamma': 0.55, 'learning_rate': 0.09, 'max_depth': 3, 'min_child_weight': 3.0, 'n_estimators': 73.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 1, 'silent': 1, 'subsample': 0.9500000000000001, 'tree_method': 'exact'}


KeyboardInterrupt: 

In [8]:
#Random forest classifier
start_time = time.time()
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

kfold = KFold(n_splits=3, shuffle=True)
clf = GridSearchCV(rf_model,{'max_depth':[10],
                            'n_estimators':[100]},                             
                            scoring='roc_auc',
                            cv=kfold,
                            n_jobs=-1)
clf.fit(x_sample,y)

print(-start_time + time.time()," sec")
warnings.simplefilter('ignore', FutureWarning)
res = pd.DataFrame(clf.cv_results_)
res["dev_gini"] = 2*res["mean_train_score"]-1
res["val_gini"] = 2*res["mean_test_score"]-1
res[["params","dev_gini","val_gini"]]

TypeError: __init__() got an unexpected keyword argument 'min_impurity_decrease'