In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('./data/data_pre.csv')

In [38]:
test = pd.read_csv('./data/test_pre.csv')

In [4]:
def_cols = ['Interval', 'ChannelID', 'CancelFlag', 'Cluster', 'DeliveryType', 'prepay', 'count_edit', 'OrderCnt', 'OrderDate_weekday', 'Date_weekday']

In [6]:
data.head()

Unnamed: 0,Interval,ChannelID,CancelFlag,Cluster,DeliveryType,prepay,count_edit,OrderCnt,OrderDate_weekday,Date_weekday
0,7,0,0,0,1,0,1,6.0,1,2
1,5,0,0,0,1,0,1,28.0,4,6
2,14,0,0,0,0,0,1,16.0,4,4
3,20,0,0,0,1,0,1,14.0,4,4
4,4,0,1,0,1,0,1,29.0,0,2


In [40]:
t = list(data.columns[data.columns.isin(test.columns)])
t.append('CancelFlag')
cols = pd.Index(t)

In [5]:
data = data[def_cols]

In [10]:
#categorical_cols = ['Interval', 'ChannelID', 'Cluster', 'DeliveryType', 'OrderDate_weekday', 'Date_weekday']
categorical_cols = [0, 1, 2, 3, -1, -2]
#categorical_cols = [1, 2, 3]
#categorical_cols = None

In [7]:
X = data.drop('CancelFlag', axis=1)
y = data['CancelFlag']

In [8]:
X.columns = range(len(X.columns))
y.columns = [0]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1, stratify=y)

data = lgb.Dataset(X, label=y, categorical_feature=categorical_cols, free_raw_data=False)
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols, free_raw_data=False)
train_data.save_binary('./data/train.bin')
test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=categorical_cols, free_raw_data=False, reference=train_data)
test_data.save_binary('./data/test.bin')

<lightgbm.basic.Dataset at 0x7f31f049c790>

In [15]:
train_data = lgb.Dataset('./data/train.bin')
test_data = lgb.Dataset('./data/test.bin')

In [30]:
param = {
    'num_leaves': 22,
    'objective': 'binary',
    'max_depth': 7,
    #'scale_pos_weight': '1.5',
    'unbalance': 'true',
    #'min_sum_hessian_in_leaf': 1e-3,
    #'pos_bagging_fraction': 0.5,
    #'neg_bagging_fraction': 0.5,
    #'bagging_fraction': 0.5,
    #'bagging_freq': 5,
    #'max_bin': 50,
    #'min_gain_to_split': 
    #'min_data_in_leaf': 50,
    'learning_rate': 0.1,
    #'boosting': 'dart',
    #'lambda_l2': 1.0,
    #'cat_l2': 1.0,
    #'cat_smooth': 50,
    #'top_k': 50,
    #'tree_learner': 'data',
    #'max_cat_group': 50
    #'boosting': 'dart'
}
param['metric'] = 'auc'

In [31]:
num_round = 100
lgb.cv(param, data, num_round, nfold=5, early_stopping_rounds=5)['auc-mean'][-1]

0.6878976882873913

In [28]:
num_round = 5000
bst = lgb.train(param, train_data, num_round, valid_sets=[test_data], early_stopping_rounds=50)

[1]	valid_0's auc: 0.664703
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.670058
[3]	valid_0's auc: 0.671531
[4]	valid_0's auc: 0.672279
[5]	valid_0's auc: 0.673173
[6]	valid_0's auc: 0.673662
[7]	valid_0's auc: 0.674463
[8]	valid_0's auc: 0.674532
[9]	valid_0's auc: 0.674614
[10]	valid_0's auc: 0.67479
[11]	valid_0's auc: 0.675099
[12]	valid_0's auc: 0.675304
[13]	valid_0's auc: 0.675431
[14]	valid_0's auc: 0.675311
[15]	valid_0's auc: 0.675409
[16]	valid_0's auc: 0.675602
[17]	valid_0's auc: 0.675881
[18]	valid_0's auc: 0.675814
[19]	valid_0's auc: 0.675954
[20]	valid_0's auc: 0.676077
[21]	valid_0's auc: 0.676175
[22]	valid_0's auc: 0.676248
[23]	valid_0's auc: 0.676327
[24]	valid_0's auc: 0.676404
[25]	valid_0's auc: 0.676532
[26]	valid_0's auc: 0.676437
[27]	valid_0's auc: 0.676563
[28]	valid_0's auc: 0.676734
[29]	valid_0's auc: 0.676769
[30]	valid_0's auc: 0.676862
[31]	valid_0's auc: 0.676897
[32]	valid_0's auc: 0.676966
[33]	valid_0's auc: 0

[279]	valid_0's auc: 0.689645
[280]	valid_0's auc: 0.689635
[281]	valid_0's auc: 0.689635
[282]	valid_0's auc: 0.68966
[283]	valid_0's auc: 0.689634
[284]	valid_0's auc: 0.689597
[285]	valid_0's auc: 0.68957
[286]	valid_0's auc: 0.68959
[287]	valid_0's auc: 0.689537
[288]	valid_0's auc: 0.689575
[289]	valid_0's auc: 0.689572
[290]	valid_0's auc: 0.689553
[291]	valid_0's auc: 0.689537
[292]	valid_0's auc: 0.689537
[293]	valid_0's auc: 0.689547
[294]	valid_0's auc: 0.68955
[295]	valid_0's auc: 0.689551
[296]	valid_0's auc: 0.689572
[297]	valid_0's auc: 0.689566
[298]	valid_0's auc: 0.689556
[299]	valid_0's auc: 0.689566
[300]	valid_0's auc: 0.689554
[301]	valid_0's auc: 0.689545
[302]	valid_0's auc: 0.689501
[303]	valid_0's auc: 0.689489
[304]	valid_0's auc: 0.689487
[305]	valid_0's auc: 0.68946
[306]	valid_0's auc: 0.689403
[307]	valid_0's auc: 0.689388
[308]	valid_0's auc: 0.689423
[309]	valid_0's auc: 0.689439
[310]	valid_0's auc: 0.689446
[311]	valid_0's auc: 0.689447
[312]	valid_0's

In [14]:
bst.save_model('model.txt', num_iteration=bst.best_iteration)

<lightgbm.basic.Booster at 0x7f31c534da90>

In [55]:
y_pred = bst.predict(X_test)

In [56]:
y_pred

array([0.02759844, 0.14145102, 0.1226732 , ..., 0.07579301, 0.03964714,
       0.05939585])

In [57]:
roc_auc_score(y_test, np.array(list(map(lambda x: 0 if x<0.1 else x, y_pred))))

0.6653257274922726