In [1]:
'''
import modules
'''
import pandas as pd
import numpy as np
import lightgbm as lgb
from datetime import datetime
import warnings
import gc
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
'''
import data
'''
df_raw = pd.read_csv('./Data_Processed_All_Contractors_Characteristics - Data.csv')

# Keep only the rows whose 'registed 2016' flag equals 1.
df_2016 = df_raw[df_raw['registed 2016'].eq(1)]

# Restrict to the first 56 columns, then remove the ID column, the raw
# employee count, the filter flag itself and the listed '17'/'18'/'19'
# columns.  The result holds the model features together with the target.
df_2016_xy = df_2016.iloc[:, :56].drop(columns=[
    'Dealer ID',
    'No. of Employees',
    'registed 2016',
    'JS 17.Column3',
    'JS 18.Column3',
    'JS 19.Column3',
    'NON USER 17.Column3',
    'NON USER 18.Column3',
    'Velocity 17.Column3',
    'centurty 18.Column3',
    'Gibson 18.Column3',
])

In [3]:
# Show the column labels of the modelling frame (features plus the target column).
df_2016_xy.columns

Index(['Converted State-AL', 'Converted State-AR', 'Converted State-CA',
       'Converted State-FL', 'Converted State-GA', 'Converted State-IL',
       'Converted State-IN', 'Converted State-KY', 'Converted State-LA',
       'Converted State-MI', 'Converted State-MO', 'Converted State-NC',
       'Converted State-NJ', 'Converted State-OH', 'Converted State-Others',
       'Converted State-PA', 'Converted State-SC', 'Converted State-TN',
       'Converted State-TX', 'employeebucket-NA', 'employeebucket-4~8',
       'employeebucket-<4', 'employeebucket->8', 'Sales between 0-99,999',
       'Sales between 100,000-499,999', 'Other Sales', 'Sales N/A',
       'Currently offers Consumer Financing?_No',
       'Currently offers Consumer Financing?_Yes',
       'Currently offers Consumer Financing?_N/A', 'Over 10', 'Below 10',
       'No year info', 'Hitting Potential 16.Column3', 'JS 16.Column3',
       'NON USER 16.Column3', 'brand_1', 'brand_2', 'brand_3', 'brand_4',
       'brand_5', 'bra

In [4]:
'''
clarify target and x variables (features and categorical features)
'''
# Target column, and the predictor frame with that column removed.
y = df_2016_xy['applied 2016']
x = df_2016_xy.drop(columns=['applied 2016'])

# Every non-target column is used as a model feature.
features = x.columns.tolist()

# NOTE(review): the categorical list is the full feature list, so every
# feature is declared categorical to LightGBM -- confirm that this is
# intended for columns such as 'Hitting Potential 16.Column3'.
categorical_feats = features.copy()

In [5]:
'''
split training data and testing data
'''
# Hold out 20% of the rows for testing.  The shuffle seed is fixed so that
# the split -- and therefore every score computed further down -- is the
# same each time the notebook is re-run from a fresh kernel.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=15
)

In [6]:
'''
set one function with different parameters to do parameters optimization
'''
def lgb_cv(num_leaves,
           min_data_in_leaf,
           max_depth,
           feature_fraction,
           bagging_fraction,
           lambda_l1,
           threshold):
    """Score one LightGBM setting by 5-fold out-of-fold accuracy.

    Objective function for the Bayesian optimiser.  It reads the notebook
    globals ``x_train``, ``y_train``, ``features`` and ``categorical_feats``,
    trains one booster per fold with early stopping, predicts the held-out
    fold, and finally turns the out-of-fold probabilities into 0/1 labels
    using ``threshold``.

    Parameters
    ----------
    num_leaves, min_data_in_leaf, max_depth : float
        Truncated with ``int()`` before being passed to LightGBM.
    feature_fraction, bagging_fraction, lambda_l1 : float
        Passed to LightGBM unchanged.
    threshold : float
        Out-of-fold probabilities ``>= threshold`` are labelled 1, the
        rest 0.

    Returns
    -------
    float
        Accuracy of the thresholded out-of-fold predictions against
        ``y_train``.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(x_train.shape[0])

    # The parameter dict does not depend on the fold, so build it once.
    param = {
        'num_leaves': int(num_leaves),
        'min_data_in_leaf': int(min_data_in_leaf),
        'objective': 'binary',
        'max_depth': int(max_depth),
        'learning_rate': 0.05,
        "boosting": "gbdt",
        "feature_fraction": feature_fraction,
        "bagging_freq": 1,
        "bagging_fraction": bagging_fraction,
        "bagging_seed": 11,
        "lambda_l1": lambda_l1,
        "verbosity": -1
    }

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(x_train.iloc[trn_idx][features],
                               label=y_train.iloc[trn_idx],
                               categorical_feature=categorical_feats
                              )
        val_data = lgb.Dataset(x_train.iloc[val_idx][features],
                               label=y_train.iloc[val_idx],
                               categorical_feature=categorical_feats
                              )

        # Up to 10000 rounds; stop once the validation score has not
        # improved for 200 rounds.
        clf = lgb.train(param,
                        trn_data,
                        10000,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=200)

        oof[val_idx] = clf.predict(x_train.iloc[val_idx][features],
                                   num_iteration=clf.best_iteration)

        # Release the booster before the next fold is trained.
        del clf
        gc.collect()

    # Vectorised thresholding: probabilities -> 0.0 / 1.0 labels.
    oof_labels = (oof >= threshold).astype(float)

    # accuracy_score is documented as (y_true, y_pred).
    return accuracy_score(y_train.values, oof_labels)

In [8]:
'''
use bayesian optimization to find the optimal parameters in ranges
'''
LGB_BO = BayesianOptimization(
    f=lgb_cv,
    pbounds={
        'num_leaves': (5, 130),
        'min_data_in_leaf': (10, 150),
        'max_depth': (4, 10),
        # Lower bound == upper bound: these two are held fixed at 1
        # rather than searched.
        'feature_fraction': (1, 1),
        'bagging_fraction': (1, 1),
        'lambda_l1': (0, 6),
        'threshold': (0.4, 0.6),
    },
    # Seed the optimiser so that the sequence of probed points is the same
    # on every run of the notebook.
    random_state=15,
)

In [9]:
# Warnings are silenced only inside this block; the previous filters are
# restored when the context manager exits.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # 2 initial points, then 20 optimisation steps using the 'ei'
    # acquisition function with xi=0.0.
    LGB_BO.maximize(init_points=2, n_iter=20, acq='ei', xi=0.0)

|   iter    |  target   | baggin... | featur... | lambda_l1 | max_depth | min_da... | num_le... | threshold |
-------------------------------------------------------------------------------------------------------------
fold n°0
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.634733	valid_1's binary_logloss: 0.669459
fold n°1
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[125]	training's binary_logloss: 0.630043	valid_1's binary_logloss: 0.658083
fold n°2
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[114]	training's binary_logloss: 0.617798	valid_1's binary_logloss: 0.671702
fold n°3
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[26]	training's binary_logloss: 0.646248	valid_1's binary_logloss: 0.688062
fold n°4
Training until validation scores don'

Early stopping, best iteration is:
[198]	training's binary_logloss: 0.652272	valid_1's binary_logloss: 0.663717
fold n°2
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[296]	training's binary_logloss: 0.644204	valid_1's binary_logloss: 0.674507
fold n°3
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[5]	training's binary_logloss: 0.686343	valid_1's binary_logloss: 0.695179
fold n°4
Training until validation scores don't improve for 200 rounds.
[500]	training's binary_logloss: 0.646859	valid_1's binary_logloss: 0.666385
Early stopping, best iteration is:
[304]	training's binary_logloss: 0.647037	valid_1's binary_logloss: 0.666124
| [0m 8       [0m | [0m 0.5526  [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 4.377   [0m | [0m 4.086   [0m | [0m 148.9   [0m | [0m 5.752   [0m | [0m 0.5593  [0m |
fold n°0
Training until validation scores don't improve for 200 rounds.
Early s

Early stopping, best iteration is:
[23]	training's binary_logloss: 0.642806	valid_1's binary_logloss: 0.682555
fold n°4
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[80]	training's binary_logloss: 0.620742	valid_1's binary_logloss: 0.662005
| [0m 15      [0m | [0m 0.5747  [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 0.04817 [0m | [0m 4.055   [0m | [0m 41.56   [0m | [0m 97.9    [0m | [0m 0.4701  [0m |
fold n°0
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[31]	training's binary_logloss: 0.660452	valid_1's binary_logloss: 0.677294
fold n°1
Training until validation scores don't improve for 200 rounds.
[500]	training's binary_logloss: 0.609881	valid_1's binary_logloss: 0.650609
Early stopping, best iteration is:
[548]	training's binary_logloss: 0.607177	valid_1's binary_logloss: 0.649472
fold n°2
Training until validation scores don't improve for 200 rounds.
Early st

Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[131]	training's binary_logloss: 0.633757	valid_1's binary_logloss: 0.661204
| [0m 22      [0m | [0m 0.5739  [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 5.935   [0m | [0m 4.144   [0m | [0m 31.75   [0m | [0m 78.24   [0m | [0m 0.42    [0m |


In [11]:
'''
set parameters
'''
# Fixed LightGBM settings used for the final cross-validated run below.
param = dict(
    # tree shape
    num_leaves=81,
    min_data_in_leaf=38,
    objective='binary',
    max_depth=4,
    # boosting
    learning_rate=0.05,
    boosting="gbdt",
    # row / column sampling (a fraction of 1 uses everything)
    feature_fraction=1,
    bagging_freq=1,
    bagging_fraction=1,
    bagging_seed=11,
    # regularisation and logging
    lambda_l1=0.09078,
    verbosity=-1,
)

In [None]:
'''
run boosted tree
'''
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(x_train))          # out-of-fold probabilities for the training rows
predictions = np.zeros(len(x_test))   # test probabilities, averaged over the folds
num_round = 10000                     # upper bound; early stopping normally ends sooner

# One importance frame per fold; they are concatenated once after the loop
# instead of growing a DataFrame from an empty one on every iteration.
fold_importances = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(x_train.iloc[trn_idx][features],
                           label=y_train.iloc[trn_idx],
                           categorical_feature=categorical_feats
                          )
    val_data = lgb.Dataset(x_train.iloc[val_idx][features],
                           label=y_train.iloc[val_idx],
                           categorical_feature=categorical_feats
                          )

    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds=200)

    oof[val_idx] = clf.predict(x_train.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    # Each fold contributes an equal share of the final test probability.
    predictions += clf.predict(x_test[features], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

# Turn probabilities into 0.0 / 1.0 labels at a 0.5 cut-off.  The arrays
# stay float so that later cells displaying them look the same as before.
oof = (oof >= 0.5).astype(float)
predictions = (predictions >= 0.5).astype(float)

# accuracy_score is documented as (y_true, y_pred).
print("CV score: {:<8.5f}".format(accuracy_score(y_train.values, oof)))
print("Test Accuracy:{:<8.5f}".format(accuracy_score(y_test.values, predictions)))

fold n°0
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[45]	training's binary_logloss: 0.630448	valid_1's binary_logloss: 0.655923
fold n°1




Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[93]	training's binary_logloss: 0.619629	valid_1's binary_logloss: 0.63895
fold n°2
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[43]	training's binary_logloss: 0.6291	valid_1's binary_logloss: 0.667348
fold n°3
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[24]	training's binary_logloss: 0.640324	valid_1's binary_logloss: 0.682191


In [95]:
# Inspect the test-set predictions; at this point they hold 0/1 labels, not probabilities.
predictions

array([1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
       1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1.,
       1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1.,
       0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 1.,
       1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1.,
       1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1.,
       1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1.,
       1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 1.

In [96]:
# True labels of the held-out test split, for comparison with `predictions`.
y_test.values

array([1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0], dtype=int64)

In [97]:
# Cross-tabulate the true test labels against the predicted labels;
# `labels` and `sample_weight` are left at their defaults (None).
confusion_matrix(y_true=y_test.values, y_pred=predictions)

array([[77, 62],
       [62, 94]], dtype=int64)