In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [53]:
# Read the preprocessed dataset from disk; the separate test file is not loaded.
#test = pd.read_csv('./data/test_pre.csv')
data_raw = pd.read_csv(filepath_or_buffer='./data/data_preprocessed.csv')

In [76]:
# Columns taken from the raw frame, in order: the features first, then the
# CancelFlag column last.
# Candidate columns currently left out: DeliveryType, interval_low,
# interval_avg, morning, day, evening, night, is_order_weekend, is_weekend.
cols = [
    'ChannelID', 'Cluster',
    'prepay', 'count_edit', 'interval_time',
    'order_weekday', 'weekday',
    'interval_high',
    'CancelFlag',
]

In [77]:
# Take an independent copy of the selected columns. A later cell assigns new
# column values on `data`; without .copy() pandas treats that as writing to a
# slice of data_raw and emits SettingWithCopyWarning (hidden in this notebook
# by the global warnings filter).
data = data_raw[cols].copy()

In [78]:
# Preview the first five rows of the selected columns.
data.head(n=5)

Unnamed: 0,ChannelID,Cluster,prepay,count_edit,interval_time,order_weekday,weekday,interval_high,CancelFlag
0,2,23,0,0,2,1,2,16,0
1,2,23,0,0,2,4,6,14,0
2,2,23,0,0,2,4,4,21,0
3,2,23,0,0,2,4,4,24,0
4,2,23,0,0,8,0,2,18,1


In [80]:
# Feature positions are looked up by name from `cols` (minus the CancelFlag
# target), so they stay correct when columns are commented in or out of that
# list. The previous hard-coded positions had drifted: the boolean list pointed
# at indices that do not exist in the current 8-feature selection.
feature_names = [name for name in cols if name != 'CancelFlag']

# Columns treated as categorical. All of them must be present in `cols`;
# a missing name raises ValueError here instead of silently shifting indices.
categorical_names = ['ChannelID', 'Cluster', 'order_weekday', 'weekday', 'interval_high']
label_cols = [feature_names.index(name) for name in categorical_names]

# Columns the notebook labels as boolean; only those currently selected in
# `cols` receive an index.
boolean_names = ['DeliveryType', 'prepay', 'morning', 'day', 'evening', 'night',
                 'is_order_weekend', 'is_weekend']
boolean_cols = [feature_names.index(name) for name in boolean_names if name in feature_names]

# Only the label-style columns are passed on as categorical features.
cat_cols = label_cols  # + boolean_cols

In [81]:
# Cast each categorical feature to pandas' 'category' dtype, one column at a time.
categorical_features = ('ChannelID', 'Cluster', 'order_weekday', 'weekday', 'interval_high')
for name in categorical_features:
    data[name] = data[name].astype('category')

In [82]:
# Separate the CancelFlag target from the remaining feature columns.
y = data['CancelFlag']
X = data.drop(columns='CancelFlag')

In [83]:
# Replace the feature names with their integer positions (0..n-1); the
# categorical columns are handed to LightGBM by position through cat_cols.
X.columns = range(X.shape[1])
# The former `y.columns = [0]` is dropped: y is a single column selected from
# `data` (a Series), which has no column axis, so that assignment only attached
# an unused attribute to the object.

In [84]:
# Hold out a stratified 33% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

# Settings shared by every LightGBM dataset built in this cell.
shared_opts = {'categorical_feature': cat_cols, 'free_raw_data': False}

# Full data (consumed by the cross-validation cell), then the train / hold-out
# pair; the hold-out set references the training set.
dataset = lgb.Dataset(X, label=y, **shared_opts)
train_data = lgb.Dataset(X_train, label=y_train, **shared_opts)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, **shared_opts)

# Cache both splits as binary files. The test_data call stays last so its
# return value remains the cell's displayed output.
train_data.save_binary('./data/train.bin')
test_data.save_binary('./data/test.bin')

<lightgbm.basic.Dataset at 0x7fe9fb7278d0>

In [15]:
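# Alternative (disabled): reload the datasets from the binary files written by
# save_binary in the previous cell instead of rebuilding them.
#train_data = lgb.Dataset('./data/train.bin')
#test_data = lgb.Dataset('./data/test.bin')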

In [87]:
# LightGBM settings: binary objective, evaluated with AUC.
param = dict(
    num_leaves=40,
    objective='binary',
    max_depth=-1,
    learning_rate=0.1,
    tree_learner='data',
    metric='auc',
)

# Settings left disabled in this cell (kept for reference):
#   'scale_pos_weight': '1.5'
#   'unbalance': 'true'
#   'min_sum_hessian_in_leaf': 1e-3
#   'pos_bagging_fraction': 0.8
#   'neg_bagging_fraction': 0.8
#   'bagging_fraction': 0.8
#   'bagging_freq': 5
#   'max_bin': 120
#   'min_gain_to_split': (no value was given)
#   'min_data_in_leaf': 50
#   'boosting': 'dart'
#   'lambda_l2': 1.0
#   'cat_l2': 1.0
#   'cat_smooth': 50
#   'top_k': 100
#   'max_cat_group': 50

In [68]:
# 5-fold cross-validation on the full dataset, stopping early after 5 rounds
# without improvement; the cell displays the mean AUC of the last recorded round.
num_round = 100
cv_history = lgb.cv(
    params=param,
    train_set=dataset,
    num_boost_round=num_round,
    nfold=5,
    early_stopping_rounds=5,
)
cv_history['auc-mean'][-1]

0.6554403297193773

In [88]:
# Fit on the training split with test_data as the validation set; training
# stops once the validation score has not improved for 5 consecutive rounds.
# NOTE(review): the same hold-out split is scored again further down, so that
# score is presumably optimistic — confirm whether a separate validation set
# is wanted.
num_round = 100
bst = lgb.train(
    params=param,
    train_set=train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
    early_stopping_rounds=5,
)

[1]	valid_0's auc: 0.633436
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.635575
[3]	valid_0's auc: 0.64081
[4]	valid_0's auc: 0.641749
[5]	valid_0's auc: 0.643292
[6]	valid_0's auc: 0.643977
[7]	valid_0's auc: 0.646095
[8]	valid_0's auc: 0.646546
[9]	valid_0's auc: 0.6469
[10]	valid_0's auc: 0.646872
[11]	valid_0's auc: 0.646982
[12]	valid_0's auc: 0.647121
[13]	valid_0's auc: 0.647691
[14]	valid_0's auc: 0.647661
[15]	valid_0's auc: 0.648198
[16]	valid_0's auc: 0.648344
[17]	valid_0's auc: 0.649158
[18]	valid_0's auc: 0.649477
[19]	valid_0's auc: 0.64969
[20]	valid_0's auc: 0.649763
[21]	valid_0's auc: 0.649842
[22]	valid_0's auc: 0.650389
[23]	valid_0's auc: 0.650528
[24]	valid_0's auc: 0.650918
[25]	valid_0's auc: 0.650964
[26]	valid_0's auc: 0.651008
[27]	valid_0's auc: 0.651302
[28]	valid_0's auc: 0.651336
[29]	valid_0's auc: 0.651429
[30]	valid_0's auc: 0.652055
[31]	valid_0's auc: 0.651959
[32]	valid_0's auc: 0.651938
[33]	valid_0's auc: 0.651

In [14]:
# Write the booster to model.txt, keeping iterations up to the best one found
# during early stopping.
best_round = bst.best_iteration
bst.save_model('model.txt', num_iteration=best_round)

<lightgbm.basic.Booster at 0x7f31c534da90>

In [55]:
# Score the hold-out features with the trained booster.
y_pred = bst.predict(data=X_test)

In [56]:
# Display the predicted values (NumPy prints a truncated view of the array).
y_pred

array([0.02759844, 0.14145102, 0.1226732 , ..., 0.07579301, 0.03964714,
       0.05939585])

In [57]:
# AUC on the hold-out split after flooring small scores: every prediction
# below the cutoff is replaced by 0 and the rest are kept unchanged.
# np.where applies the rule to the whole array in one vectorised step instead
# of calling a Python lambda per element and rebuilding the array from a list;
# the resulting values are the same.
low_score_cutoff = 0.1
roc_auc_score(y_test, np.where(y_pred < low_score_cutoff, 0, y_pred))

0.6653257274922726

In [75]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier

# Fit an extra-trees ensemble on all rows to rank the features.
# random_state pins the randomised tree construction so the importances (and
# the ordering derived from them) are the same on every run; 42 matches the
# seed already used for the train/test split.
forest = ExtraTreesClassifier(n_estimators=250, random_state=42)
forest.fit(X, y)

# One importance value per feature, as reported by the fitted ensemble.
importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

In [74]:
X_cols = X.columns

# X's column labels are overwritten with integer positions earlier in the
# notebook, so X_cols no longer carries names on a top-to-bottom run. Take the
# names from `data` instead (X is `data` without CancelFlag, in the same column
# order) so the ranking prints feature names rather than positions.
feature_labels = data.columns.drop('CancelFlag')

# One line per feature: rank, name, importance — most important first.
for rank, feature_idx in enumerate(indices, start=1):
    print(f"{rank}. {feature_labels[feature_idx]} {importances[feature_idx]}")

1. count_edit 0.2712835608434637
2. Cluster 0.22327182161658468
3. ChannelID 0.13622609394384383
4. order_weekday 0.09214468258700419
5. weekday 0.09178368285104366
6. prepay 0.08007389236028722
7. interval_high 0.021189992201283432
8. interval_avg 0.02040818698809016
9. interval_low 0.01923923914072298
10. interval_time 0.015462990283141097
11. is_weekend 0.009969177323018589
12. is_order_weekend 0.00805418198745193
13. day 0.0043437663578553476
14. evening 0.0031040616152836166
15. morning 0.002489267992824822
16. DeliveryType 0.000955401908100635
17. night 0.0


In [None]:
# Bar chart of the importances, most important first, with the per-tree
# standard deviation as error bars; ticks are labelled with feature positions.
n_features = X.shape[1]
bar_positions = range(n_features)

fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(bar_positions, importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(indices)
ax.set_xlim([-1, n_features])
plt.show()