In [36]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [37]:
data = pd.read_csv('./data/data_pre.csv')

In [38]:
test = pd.read_csv('./data/test_pre.csv')

In [39]:
data.head()

Unnamed: 0,Interval,ChannelID,CancelFlag,Cluster,DeliveryType,prepay,count_edit,OrderCnt,MaterialID_-1.0,MaterialID_2002127.0,...,GroupID_41.0,GroupID_42.0,GroupID_46.0,GroupID_55.0,GroupID_59.0,GroupID_61.0,GroupID_63.0,GroupID_66.0,OrderDate_weekday,Date_weekday
0,7,0,0,0,1,0,1,6.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1,2
1,5,0,0,0,1,0,1,28.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4,6
2,14,0,0,0,0,0,1,16.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4
3,20,0,0,0,1,0,1,14.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4,4
4,4,0,1,0,1,0,1,29.0,25.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,2


In [40]:
t = list(data.columns[data.columns.isin(test.columns)])
t.append('CancelFlag')
cols = pd.Index(t)

In [42]:
data = data[cols]

In [43]:
#categorical_cols = ['Interval', 'ChannelID', 'Cluster', 'DeliveryType', 'OrderDate_weekday', 'Date_weekday']
categorical_cols = [0, 1, 2, 3, -1, -2]
#categorical_cols = [1, 2, 3]
#categorical_cols = None

In [44]:
X = data.drop('CancelFlag', axis=1)
y = data['CancelFlag']

In [45]:
X.columns = range(len(X.columns))
y.columns = [0]

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1, stratify=y)

data = lgb.Dataset(X, label=y, categorical_feature=categorical_cols, free_raw_data=False)
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols, free_raw_data=False)
train_data.save_binary('./data/train.bin')
test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=categorical_cols, free_raw_data=False, reference=train_data)
test_data.save_binary('./data/test.bin')

<lightgbm.basic.Dataset at 0x7f6e32decfd0>

In [15]:
train_data = lgb.Dataset('./data/train.bin')
test_data = lgb.Dataset('./data/test.bin')

In [52]:
param = {
    'num_leaves': 76,
    'objective': 'binary',
    'max_depth': 7,
    'scale_pos_weight': '1.5',
    #'min_sum_hessian_in_leaf': 1e-3,
    #'pos_bagging_fraction': 0.5,
    #'neg_bagging_fraction': 0.5,
    #'bagging_fraction': 0.5,
    #'bagging_freq': 5,
    'max_bin': 50,
    #'min_gain_to_split': 
    'min_data_in_leaf': 50,
    'learning_rate': 0.1,
    #'boosting': 'dart',
    'lambda_l2': 1.0,
    #'cat_l2': 1.0,
    #'cat_smooth': 50,
    #'top_k': 50,
    #'tree_learner': 'data',
    #'max_cat_group': 50
    #'boosting': 'dart'
}
param['metric'] = 'auc'

In [336]:
num_round = 100
lgb.cv(param, data, num_round, nfold=5, early_stopping_rounds=5)['auc-mean'][-1]

0.6870754646139112

In [53]:
num_round = 5000
bst = lgb.train(param, train_data, num_round, valid_sets=[test_data], early_stopping_rounds=50)

[1]	valid_0's auc: 0.669833
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.674371
[3]	valid_0's auc: 0.677775
[4]	valid_0's auc: 0.678664
[5]	valid_0's auc: 0.680152
[6]	valid_0's auc: 0.682005
[7]	valid_0's auc: 0.682206
[8]	valid_0's auc: 0.682408
[9]	valid_0's auc: 0.682692
[10]	valid_0's auc: 0.683367
[11]	valid_0's auc: 0.683687
[12]	valid_0's auc: 0.684422
[13]	valid_0's auc: 0.68504
[14]	valid_0's auc: 0.685573
[15]	valid_0's auc: 0.686177
[16]	valid_0's auc: 0.687016
[17]	valid_0's auc: 0.687323
[18]	valid_0's auc: 0.687752
[19]	valid_0's auc: 0.688384
[20]	valid_0's auc: 0.688619
[21]	valid_0's auc: 0.688717
[22]	valid_0's auc: 0.68889
[23]	valid_0's auc: 0.689295
[24]	valid_0's auc: 0.689382
[25]	valid_0's auc: 0.689768
[26]	valid_0's auc: 0.690006
[27]	valid_0's auc: 0.69023
[28]	valid_0's auc: 0.690384
[29]	valid_0's auc: 0.690678
[30]	valid_0's auc: 0.691049
[31]	valid_0's auc: 0.691094
[32]	valid_0's auc: 0.691415
[33]	valid_0's auc: 0.6

In [54]:
bst.save_model('model.txt', num_iteration=bst.best_iteration)

<lightgbm.basic.Booster at 0x7f6e32e12090>

In [55]:
y_pred = bst.predict(X_test)

In [56]:
y_pred

array([0.02759844, 0.14145102, 0.1226732 , ..., 0.07579301, 0.03964714,
       0.05939585])

In [57]:
roc_auc_score(y_test, np.array(list(map(lambda x: 0 if x<0.1 else x, y_pred))))

0.6653257274922726