In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split, StratifiedKFold
from scipy import stats
from sklearn import metrics
from bayes_opt import BayesianOptimization
import lightgbm as lgb
import warnings
import time
import gc

In [2]:
train_df = pd.read_csv('train_reprocess_knn.csv',encoding='big5')
test_df = pd.read_csv('test_reprocess_knn.csv',encoding='big5')

In [60]:
train_df.shape

(100000, 132)

In [61]:
#目標變量的分布
predictors = train_df.drop(['Y1'],axis=1).columns.values.tolist()
train_df['Y1'].value_counts()

0.0    98000
1.0     2000
Name: Y1, dtype: int64

### RandomForest 篩選特徵

### 尋找最佳參數

In [30]:
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(['Y1','CUS_ID'],axis=1), train_df['Y1'], test_size=0.2, random_state=1)

In [31]:
#分層採樣，確保訓練集與測試集中各類樣本的比例與原始數據集中相同

# split all data
'''
X,y = train_df.drop(['Y1','CUS_ID'],axis=1), train_df['Y1']
bayesian_train_index, bayesian_val_index = list(StratifiedKFold(n_splits=2, random_state=1).split(X, y))[0]
print('number of 1 in train: %s | number of 1 in val: %s' % (train_df.iloc[bayesian_train_index,:]["Y1"].sum(), 
                                                             train_df.iloc[bayesian_val_index,:]["Y1"].sum()))
'''

# split X_train

bayesian_train_index, bayesian_val_index = list(StratifiedKFold(n_splits=2, random_state=1).split(X_train, y_train))[0]
print('number of 1 in train: %s | number of 1 in val: %s' % (y_train[bayesian_train_index].sum(), 
                                                             y_train[bayesian_val_index].sum()))


# train_df_select
'''
X, y = train_df_select.drop(['Y1'],axis=1), train_df_select['Y1']
bayesian_train_index, bayesian_val_index = list(StratifiedKFold(n_splits=2, random_state=1).split(X, y))[0]
print('number of 1 in train: %s | number of 1 in val: %s' % (y[bayesian_train_index].sum(), 
                                                             y[bayesian_val_index].sum()))
'''
print('train_size: %s | val_size: %s' % (bayesian_train_index.shape, bayesian_val_index.shape))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


number of 1 in train: 687.0 | number of 1 in val: 636.0
train_size: (40000,) | val_size: (40000,)


In [32]:
def LGB_bayesian(max_depth, num_leaves, learning_rate,
                 reg_alpha, reg_lambda, bagging_fraction, bagging_freq, feature_fraction):

    
    parameters = {'boosting_type': 'gbdt',          #設置提升類型
                 'objective': 'binary',             #目標函數
                 'importance_type': 'split', 
                 'learning_rate':learning_rate,
                 'max_depth': int(max_depth),       #if -1 means no limit
                 'n_estimators': 200, 
                 'n_jobs': -1,
                 'num_leaves': int(num_leaves),     #葉子節點數 (we should let it be smaller than 2^(max_depth))
                 'objective': 'binary', 
                 'random_state': 1, 
                 'reg_alpha': reg_alpha, 
                 'reg_lambda': reg_lambda, 
                 'silent': True, 
                 'is_unbalance': True,
                 'metric': ('auc','binary_logloss'),
                 'verbose': 0,
                 'feature_fraction': feature_fraction, #特徵採樣
                 'bagging_fraction': bagging_fraction, #數據採樣
                 'bagging_freq': int(bagging_freq),    #每K輪迭代執行一次bagging
                 'cat_smooth': 1}
    
    
    lgb_train = lgb.Dataset(X_train.iloc[bayesian_train_index], y_train.iloc[bayesian_train_index])
    lgb_valid = lgb.Dataset(X_train.iloc[bayesian_val_index], y_train.iloc[bayesian_val_index])

    
    '''    
    lgb_train = lgb.Dataset(X.iloc[bayesian_train_index], y.iloc[bayesian_train_index])
    lgb_valid = lgb.Dataset(X.iloc[bayesian_val_index], y.iloc[bayesian_val_index])
    
    '''
    clf = gb.cv(param, train_data, num_round, nfold=5)
    clf = lgb.train(parameters, lgb_train, valid_sets = lgb_valid, early_stopping_rounds = 50)
                    ##categorical_feature = indexes_of_categories)
    predictions = clf.predict(X_train.iloc[bayesian_val_index], num_iteration =clf.best_iteration) 
    score = metrics.roc_auc_score(y_train.iloc[bayesian_val_index], predictions) 
    
    '''
    clf = lgb.train(parameters, lgb_train, valid_sets = lgb_valid, early_stopping_rounds = 10)
    predictions = clf.predict(X.iloc[bayesian_val_index], num_iteration =clf.best_iteration) 
    score = metrics.roc_auc_score(y.iloc[bayesian_val_index], predictions) 
    '''
    return score 

In [33]:
#設定參數邊界，使貝葉斯優化在邊界內搜索最佳值
bounds_LGB = {
    'num_leaves': (20, 100), 
    'learning_rate': (0.001, 0.2),
    'feature_fraction': (0.6, 1.0),
    'bagging_fraction': (0.6, 1.0),
    'bagging_freq': (5, 20),
    'reg_alpha': (0, 1),
    'reg_lambda': (0, 50),
    'max_depth':(10, 30)
}

In [None]:


print('数据...')
x_train = np.random.random((1000,10))
y_train = np.random.rand(1000)>0.5
x_test = np.random.random((100,10))
y_test = np.random.randn(100)>0.5

# 导入到lightgbm矩阵
lgb_train = lgb.Dataset(x_train, y_train)
lgb_test = lgb.Dataset(x_test, y_test, reference=lgb_train)

# 设置参数
params = {
    'num_leaves': 5,
    'metric': ('auc', 'logloss'),#可以设置多个评价指标
    'verbose': 0
}
# if (evals_result and gbm) not in locbals():
	# global evals_result,gbm 
#如果是局部变量的话，推荐把他们变成全局变量，这样plot的代码位置不受限制


In [34]:
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=1)
#print(LGB_BO.space.keys)

In [35]:
#創建BayesianOptimization对象(LGB_BO)後，在調用maxime前不會運作。
#init_points：How many steps of random exploration you want to perform.(help by diversifying the exploration space.)
#n_iter：How many steps of bayesian optimization you want to perform.(The more steps the more likely to find a good maximum)
init_points = 10
n_iter = 50

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points =init_points, n_iter =n_iter)

|   iter    |  target   | baggin... | baggin... | featur... | learni... | max_depth | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------
[1]	valid_0's auc: 0.718876
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.735676
[3]	valid_0's auc: 0.74847
[4]	valid_0's auc: 0.751631
[5]	valid_0's auc: 0.754928
[6]	valid_0's auc: 0.760006
[7]	valid_0's auc: 0.764613
[8]	valid_0's auc: 0.767149
[9]	valid_0's auc: 0.769583
[10]	valid_0's auc: 0.771606
[11]	valid_0's auc: 0.774624
[12]	valid_0's auc: 0.776079
[13]	valid_0's auc: 0.776434
[14]	valid_0's auc: 0.777929
[15]	valid_0's auc: 0.778626
[16]	valid_0's auc: 0.780529
[17]	valid_0's auc: 0.781388
[18]	valid_0's auc: 0.782765
[19]	valid_0's auc: 0.783642
[20]	valid_0's auc: 0.783937
[21]	valid_0's auc: 0.7849
[22]	valid_0's auc: 0.78593
[23]	valid_0's auc: 0.787318
[24]	valid_0's auc: 0.788052
[25]	valid_

[31]	valid_0's auc: 0.791494
[32]	valid_0's auc: 0.791826
[33]	valid_0's auc: 0.791982
[34]	valid_0's auc: 0.79238
[35]	valid_0's auc: 0.792812
[36]	valid_0's auc: 0.792976
[37]	valid_0's auc: 0.793423
[38]	valid_0's auc: 0.793643
[39]	valid_0's auc: 0.793753
[40]	valid_0's auc: 0.794263
[41]	valid_0's auc: 0.794713
[42]	valid_0's auc: 0.79496
[43]	valid_0's auc: 0.795263
[44]	valid_0's auc: 0.795334
[45]	valid_0's auc: 0.795829
[46]	valid_0's auc: 0.796281
[47]	valid_0's auc: 0.79662
[48]	valid_0's auc: 0.79718
[49]	valid_0's auc: 0.797607
[50]	valid_0's auc: 0.797874
[51]	valid_0's auc: 0.798124
[52]	valid_0's auc: 0.79849
[53]	valid_0's auc: 0.798808
[54]	valid_0's auc: 0.799182
[55]	valid_0's auc: 0.799336
[56]	valid_0's auc: 0.799632
[57]	valid_0's auc: 0.799916
[58]	valid_0's auc: 0.800113
[59]	valid_0's auc: 0.800358
[60]	valid_0's auc: 0.80061
[61]	valid_0's auc: 0.800708
[62]	valid_0's auc: 0.801096
[63]	valid_0's auc: 0.801138
[64]	valid_0's auc: 0.801376
[65]	valid_0's auc: 

[26]	valid_0's auc: 0.797609
[27]	valid_0's auc: 0.798193
[28]	valid_0's auc: 0.79861
[29]	valid_0's auc: 0.799079
[30]	valid_0's auc: 0.800305
[31]	valid_0's auc: 0.801943
[32]	valid_0's auc: 0.802142
[33]	valid_0's auc: 0.80261
[34]	valid_0's auc: 0.802577
[35]	valid_0's auc: 0.802567
[36]	valid_0's auc: 0.803238
[37]	valid_0's auc: 0.803329
[38]	valid_0's auc: 0.803334
[39]	valid_0's auc: 0.802991
[40]	valid_0's auc: 0.803356
[41]	valid_0's auc: 0.803127
[42]	valid_0's auc: 0.802986
[43]	valid_0's auc: 0.803477
[44]	valid_0's auc: 0.802971
[45]	valid_0's auc: 0.803174
[46]	valid_0's auc: 0.803174
[47]	valid_0's auc: 0.803008
[48]	valid_0's auc: 0.802775
[49]	valid_0's auc: 0.802248
[50]	valid_0's auc: 0.802445
[51]	valid_0's auc: 0.803016
[52]	valid_0's auc: 0.80365
[53]	valid_0's auc: 0.803932
[54]	valid_0's auc: 0.803945
[55]	valid_0's auc: 0.804317
[56]	valid_0's auc: 0.804575
[57]	valid_0's auc: 0.804484
[58]	valid_0's auc: 0.804035
[59]	valid_0's auc: 0.803657
[60]	valid_0's au

[102]	valid_0's auc: 0.812092
[103]	valid_0's auc: 0.811998
[104]	valid_0's auc: 0.81193
[105]	valid_0's auc: 0.81211
[106]	valid_0's auc: 0.812462
[107]	valid_0's auc: 0.812304
[108]	valid_0's auc: 0.81228
[109]	valid_0's auc: 0.812338
[110]	valid_0's auc: 0.81229
[111]	valid_0's auc: 0.812193
[112]	valid_0's auc: 0.8122
[113]	valid_0's auc: 0.812149
[114]	valid_0's auc: 0.811973
[115]	valid_0's auc: 0.811831
[116]	valid_0's auc: 0.812108
[117]	valid_0's auc: 0.81195
[118]	valid_0's auc: 0.811845
[119]	valid_0's auc: 0.811961
[120]	valid_0's auc: 0.811937
[121]	valid_0's auc: 0.812087
[122]	valid_0's auc: 0.812041
[123]	valid_0's auc: 0.812064
[124]	valid_0's auc: 0.812426
[125]	valid_0's auc: 0.812513
[126]	valid_0's auc: 0.812747
[127]	valid_0's auc: 0.813031
[128]	valid_0's auc: 0.813061
[129]	valid_0's auc: 0.812984
[130]	valid_0's auc: 0.813214
[131]	valid_0's auc: 0.813242
[132]	valid_0's auc: 0.813115
[133]	valid_0's auc: 0.813135
[134]	valid_0's auc: 0.812843
[135]	valid_0's a

[52]	valid_0's auc: 0.769621
[53]	valid_0's auc: 0.769373
[54]	valid_0's auc: 0.769022
[55]	valid_0's auc: 0.768715
[56]	valid_0's auc: 0.768738
[57]	valid_0's auc: 0.768073
[58]	valid_0's auc: 0.767806
[59]	valid_0's auc: 0.768095
[60]	valid_0's auc: 0.768569
[61]	valid_0's auc: 0.769
[62]	valid_0's auc: 0.769219
[63]	valid_0's auc: 0.768915
[64]	valid_0's auc: 0.76849
[65]	valid_0's auc: 0.767757
[66]	valid_0's auc: 0.767772
[67]	valid_0's auc: 0.767547
[68]	valid_0's auc: 0.766926
[69]	valid_0's auc: 0.765291
[70]	valid_0's auc: 0.765324
[71]	valid_0's auc: 0.766174
[72]	valid_0's auc: 0.766593
[73]	valid_0's auc: 0.766453
[74]	valid_0's auc: 0.765905
[75]	valid_0's auc: 0.767061
[76]	valid_0's auc: 0.767761
[77]	valid_0's auc: 0.767025
[78]	valid_0's auc: 0.767458
[79]	valid_0's auc: 0.766158
[80]	valid_0's auc: 0.766005
[81]	valid_0's auc: 0.767042
[82]	valid_0's auc: 0.766509
[83]	valid_0's auc: 0.766819
[84]	valid_0's auc: 0.766665
Early stopping, best iteration is:
[34]	valid_0

[45]	valid_0's auc: 0.801465
[46]	valid_0's auc: 0.801745
[47]	valid_0's auc: 0.801751
[48]	valid_0's auc: 0.801658
[49]	valid_0's auc: 0.801629
[50]	valid_0's auc: 0.801579
[51]	valid_0's auc: 0.801515
[52]	valid_0's auc: 0.801478
[53]	valid_0's auc: 0.801404
[54]	valid_0's auc: 0.801284
[55]	valid_0's auc: 0.801215
[56]	valid_0's auc: 0.801109
[57]	valid_0's auc: 0.801174
[58]	valid_0's auc: 0.801189
[59]	valid_0's auc: 0.801174
[60]	valid_0's auc: 0.801049
[61]	valid_0's auc: 0.801304
[62]	valid_0's auc: 0.801334
[63]	valid_0's auc: 0.80126
[64]	valid_0's auc: 0.801305
[65]	valid_0's auc: 0.801516
[66]	valid_0's auc: 0.801674
[67]	valid_0's auc: 0.801652
[68]	valid_0's auc: 0.801789
[69]	valid_0's auc: 0.801978
[70]	valid_0's auc: 0.802145
[71]	valid_0's auc: 0.802289
[72]	valid_0's auc: 0.802387
[73]	valid_0's auc: 0.802507
[74]	valid_0's auc: 0.802648
[75]	valid_0's auc: 0.802743
[76]	valid_0's auc: 0.802839
[77]	valid_0's auc: 0.802915
[78]	valid_0's auc: 0.803005
[79]	valid_0's 

[183]	valid_0's auc: 0.815198
[184]	valid_0's auc: 0.815358
[185]	valid_0's auc: 0.815218
[186]	valid_0's auc: 0.815253
[187]	valid_0's auc: 0.815124
[188]	valid_0's auc: 0.814952
[189]	valid_0's auc: 0.814953
[190]	valid_0's auc: 0.81492
[191]	valid_0's auc: 0.814962
[192]	valid_0's auc: 0.815
[193]	valid_0's auc: 0.81518
[194]	valid_0's auc: 0.815249
[195]	valid_0's auc: 0.815262
[196]	valid_0's auc: 0.81503
[197]	valid_0's auc: 0.814981
[198]	valid_0's auc: 0.815
[199]	valid_0's auc: 0.815002
[200]	valid_0's auc: 0.814925
Did not meet early stopping. Best iteration is:
[172]	valid_0's auc: 0.815492
|  20       |  0.8155   |  0.6912   |  5.488    |  0.8908   |  0.06277  |  29.49    |  56.77    |  49.59    |  49.64    |
[1]	valid_0's auc: 0.722954
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.747017
[3]	valid_0's auc: 0.760183
[4]	valid_0's auc: 0.762942
[5]	valid_0's auc: 0.764782
[6]	valid_0's auc: 0.767305
[7]	valid_0's auc: 0.771852
[8]	valid_0

[69]	valid_0's auc: 0.815717
[70]	valid_0's auc: 0.815727
[71]	valid_0's auc: 0.815722
[72]	valid_0's auc: 0.815843
[73]	valid_0's auc: 0.815663
[74]	valid_0's auc: 0.815876
[75]	valid_0's auc: 0.816005
[76]	valid_0's auc: 0.816419
[77]	valid_0's auc: 0.816667
[78]	valid_0's auc: 0.816561
[79]	valid_0's auc: 0.816793
[80]	valid_0's auc: 0.816979
[81]	valid_0's auc: 0.817029
[82]	valid_0's auc: 0.817206
[83]	valid_0's auc: 0.817265
[84]	valid_0's auc: 0.817428
[85]	valid_0's auc: 0.81758
[86]	valid_0's auc: 0.817647
[87]	valid_0's auc: 0.817613
[88]	valid_0's auc: 0.81753
[89]	valid_0's auc: 0.817452
[90]	valid_0's auc: 0.817453
[91]	valid_0's auc: 0.817448
[92]	valid_0's auc: 0.817811
[93]	valid_0's auc: 0.818093
[94]	valid_0's auc: 0.818345
[95]	valid_0's auc: 0.81837
[96]	valid_0's auc: 0.818405
[97]	valid_0's auc: 0.81848
[98]	valid_0's auc: 0.818373
[99]	valid_0's auc: 0.818475
[100]	valid_0's auc: 0.818438
[101]	valid_0's auc: 0.81827
[102]	valid_0's auc: 0.818285
[103]	valid_0's 

|  26       |  0.8217   |  0.962    |  5.423    |  0.6776   |  0.01556  |  18.15    |  32.77    |  0.04272  |  23.75    |
[1]	valid_0's auc: 0.727211
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.750475
[3]	valid_0's auc: 0.755607
[4]	valid_0's auc: 0.755498
[5]	valid_0's auc: 0.760783
[6]	valid_0's auc: 0.764723
[7]	valid_0's auc: 0.768651
[8]	valid_0's auc: 0.770591
[9]	valid_0's auc: 0.771864
[10]	valid_0's auc: 0.773726
[11]	valid_0's auc: 0.776749
[12]	valid_0's auc: 0.779065
[13]	valid_0's auc: 0.781492
[14]	valid_0's auc: 0.783353
[15]	valid_0's auc: 0.78399
[16]	valid_0's auc: 0.785214
[17]	valid_0's auc: 0.786606
[18]	valid_0's auc: 0.78812
[19]	valid_0's auc: 0.788912
[20]	valid_0's auc: 0.78905
[21]	valid_0's auc: 0.789577
[22]	valid_0's auc: 0.790364
[23]	valid_0's auc: 0.791856
[24]	valid_0's auc: 0.791997
[25]	valid_0's auc: 0.792912
[26]	valid_0's auc: 0.793782
[27]	valid_0's auc: 0.79419
[28]	valid_0's auc: 0.794108
[29]	valid_0's au

[157]	valid_0's auc: 0.793445
[158]	valid_0's auc: 0.793656
[159]	valid_0's auc: 0.793812
[160]	valid_0's auc: 0.79398
[161]	valid_0's auc: 0.79414
[162]	valid_0's auc: 0.794261
[163]	valid_0's auc: 0.794439
[164]	valid_0's auc: 0.794614
[165]	valid_0's auc: 0.794778
[166]	valid_0's auc: 0.794949
[167]	valid_0's auc: 0.7951
[168]	valid_0's auc: 0.795256
[169]	valid_0's auc: 0.79524
[170]	valid_0's auc: 0.795222
[171]	valid_0's auc: 0.795207
[172]	valid_0's auc: 0.795192
[173]	valid_0's auc: 0.795171
[174]	valid_0's auc: 0.795134
[175]	valid_0's auc: 0.795141
[176]	valid_0's auc: 0.795236
[177]	valid_0's auc: 0.795352
[178]	valid_0's auc: 0.795331
[179]	valid_0's auc: 0.795401
[180]	valid_0's auc: 0.795379
[181]	valid_0's auc: 0.795541
[182]	valid_0's auc: 0.795643
[183]	valid_0's auc: 0.795749
[184]	valid_0's auc: 0.795868
[185]	valid_0's auc: 0.795961
[186]	valid_0's auc: 0.796048
[187]	valid_0's auc: 0.796134
[188]	valid_0's auc: 0.796195
[189]	valid_0's auc: 0.796258
[190]	valid_0's

[149]	valid_0's auc: 0.793612
[150]	valid_0's auc: 0.79368
[151]	valid_0's auc: 0.793785
[152]	valid_0's auc: 0.793924
[153]	valid_0's auc: 0.794032
[154]	valid_0's auc: 0.79417
[155]	valid_0's auc: 0.794269
[156]	valid_0's auc: 0.794355
[157]	valid_0's auc: 0.794437
[158]	valid_0's auc: 0.794503
[159]	valid_0's auc: 0.794612
[160]	valid_0's auc: 0.794669
[161]	valid_0's auc: 0.79477
[162]	valid_0's auc: 0.794874
[163]	valid_0's auc: 0.794973
[164]	valid_0's auc: 0.795072
[165]	valid_0's auc: 0.795111
[166]	valid_0's auc: 0.795108
[167]	valid_0's auc: 0.795098
[168]	valid_0's auc: 0.795146
[169]	valid_0's auc: 0.795203
[170]	valid_0's auc: 0.795256
[171]	valid_0's auc: 0.79535
[172]	valid_0's auc: 0.795457
[173]	valid_0's auc: 0.795565
[174]	valid_0's auc: 0.795612
[175]	valid_0's auc: 0.795717
[176]	valid_0's auc: 0.79579
[177]	valid_0's auc: 0.795829
[178]	valid_0's auc: 0.7959
[179]	valid_0's auc: 0.795918
[180]	valid_0's auc: 0.795991
[181]	valid_0's auc: 0.796091
[182]	valid_0's a

[4]	valid_0's auc: 0.777619
[5]	valid_0's auc: 0.780883
[6]	valid_0's auc: 0.782083
[7]	valid_0's auc: 0.782727
[8]	valid_0's auc: 0.783528
[9]	valid_0's auc: 0.784064
[10]	valid_0's auc: 0.784838
[11]	valid_0's auc: 0.785785
[12]	valid_0's auc: 0.787701
[13]	valid_0's auc: 0.788367
[14]	valid_0's auc: 0.789755
[15]	valid_0's auc: 0.791382
[16]	valid_0's auc: 0.791156
[17]	valid_0's auc: 0.792346
[18]	valid_0's auc: 0.79287
[19]	valid_0's auc: 0.793412
[20]	valid_0's auc: 0.793297
[21]	valid_0's auc: 0.793801
[22]	valid_0's auc: 0.794194
[23]	valid_0's auc: 0.793923
[24]	valid_0's auc: 0.794204
[25]	valid_0's auc: 0.794282
[26]	valid_0's auc: 0.794005
[27]	valid_0's auc: 0.793616
[28]	valid_0's auc: 0.793762
[29]	valid_0's auc: 0.793642
[30]	valid_0's auc: 0.794399
[31]	valid_0's auc: 0.794512
[32]	valid_0's auc: 0.79462
[33]	valid_0's auc: 0.794594
[34]	valid_0's auc: 0.794812
[35]	valid_0's auc: 0.795062
[36]	valid_0's auc: 0.79576
[37]	valid_0's auc: 0.796008
[38]	valid_0's auc: 0.7

[183]	valid_0's auc: 0.814479
[184]	valid_0's auc: 0.814528
[185]	valid_0's auc: 0.814661
[186]	valid_0's auc: 0.814777
[187]	valid_0's auc: 0.814876
[188]	valid_0's auc: 0.814954
[189]	valid_0's auc: 0.814959
[190]	valid_0's auc: 0.815015
[191]	valid_0's auc: 0.81506
[192]	valid_0's auc: 0.815088
[193]	valid_0's auc: 0.815142
[194]	valid_0's auc: 0.815185
[195]	valid_0's auc: 0.81527
[196]	valid_0's auc: 0.815309
[197]	valid_0's auc: 0.81534
[198]	valid_0's auc: 0.815379
[199]	valid_0's auc: 0.815448
[200]	valid_0's auc: 0.815479
Did not meet early stopping. Best iteration is:
[200]	valid_0's auc: 0.815479
|  40       |  0.8155   |  0.975    |  6.915    |  0.6514   |  0.01006  |  5.524    |  36.26    |  3.622    |  49.71    |
[1]	valid_0's auc: 0.745522
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.774168
[3]	valid_0's auc: 0.783877
[4]	valid_0's auc: 0.787861
[5]	valid_0's auc: 0.793304
[6]	valid_0's auc: 0.797317
[7]	valid_0's auc: 0.798348
[8]	v

[26]	valid_0's auc: 0.786617
[27]	valid_0's auc: 0.786596
[28]	valid_0's auc: 0.787784
[29]	valid_0's auc: 0.788071
[30]	valid_0's auc: 0.789227
[31]	valid_0's auc: 0.790265
[32]	valid_0's auc: 0.791151
[33]	valid_0's auc: 0.791637
[34]	valid_0's auc: 0.792128
[35]	valid_0's auc: 0.792908
[36]	valid_0's auc: 0.793407
[37]	valid_0's auc: 0.79409
[38]	valid_0's auc: 0.79434
[39]	valid_0's auc: 0.794558
[40]	valid_0's auc: 0.795257
[41]	valid_0's auc: 0.795695
[42]	valid_0's auc: 0.795974
[43]	valid_0's auc: 0.796713
[44]	valid_0's auc: 0.797081
[45]	valid_0's auc: 0.797735
[46]	valid_0's auc: 0.798293
[47]	valid_0's auc: 0.798836
[48]	valid_0's auc: 0.799686
[49]	valid_0's auc: 0.799852
[50]	valid_0's auc: 0.800608
[51]	valid_0's auc: 0.8008
[52]	valid_0's auc: 0.801196
[53]	valid_0's auc: 0.801456
[54]	valid_0's auc: 0.801908
[55]	valid_0's auc: 0.8023
[56]	valid_0's auc: 0.802999
[57]	valid_0's auc: 0.803638
[58]	valid_0's auc: 0.803937
[59]	valid_0's auc: 0.804252
[60]	valid_0's auc: 

[166]	valid_0's auc: 0.807715
[167]	valid_0's auc: 0.807899
[168]	valid_0's auc: 0.808116
[169]	valid_0's auc: 0.808252
[170]	valid_0's auc: 0.808348
[171]	valid_0's auc: 0.808288
[172]	valid_0's auc: 0.80834
[173]	valid_0's auc: 0.808494
[174]	valid_0's auc: 0.808617
[175]	valid_0's auc: 0.808611
[176]	valid_0's auc: 0.808682
[177]	valid_0's auc: 0.808731
[178]	valid_0's auc: 0.808697
[179]	valid_0's auc: 0.808674
[180]	valid_0's auc: 0.808539
[181]	valid_0's auc: 0.808519
[182]	valid_0's auc: 0.808739
[183]	valid_0's auc: 0.808672
[184]	valid_0's auc: 0.808582
[185]	valid_0's auc: 0.808716
[186]	valid_0's auc: 0.808566
[187]	valid_0's auc: 0.808657
[188]	valid_0's auc: 0.808571
[189]	valid_0's auc: 0.808482
[190]	valid_0's auc: 0.808347
[191]	valid_0's auc: 0.808524
[192]	valid_0's auc: 0.80857
[193]	valid_0's auc: 0.808466
[194]	valid_0's auc: 0.808369
[195]	valid_0's auc: 0.808389
[196]	valid_0's auc: 0.808387
[197]	valid_0's auc: 0.808326
[198]	valid_0's auc: 0.808114
[199]	valid_

[18]	valid_0's auc: 0.786281
[19]	valid_0's auc: 0.787489
[20]	valid_0's auc: 0.787702
[21]	valid_0's auc: 0.788631
[22]	valid_0's auc: 0.789711
[23]	valid_0's auc: 0.78991
[24]	valid_0's auc: 0.789952
[25]	valid_0's auc: 0.790653
[26]	valid_0's auc: 0.791154
[27]	valid_0's auc: 0.790969
[28]	valid_0's auc: 0.791196
[29]	valid_0's auc: 0.791126
[30]	valid_0's auc: 0.791887
[31]	valid_0's auc: 0.79224
[32]	valid_0's auc: 0.792929
[33]	valid_0's auc: 0.79362
[34]	valid_0's auc: 0.794154
[35]	valid_0's auc: 0.794612
[36]	valid_0's auc: 0.794543
[37]	valid_0's auc: 0.79454
[38]	valid_0's auc: 0.794595
[39]	valid_0's auc: 0.794605
[40]	valid_0's auc: 0.794719
[41]	valid_0's auc: 0.795349
[42]	valid_0's auc: 0.795683
[43]	valid_0's auc: 0.796173
[44]	valid_0's auc: 0.796149
[45]	valid_0's auc: 0.79625
[46]	valid_0's auc: 0.796695
[47]	valid_0's auc: 0.797149
[48]	valid_0's auc: 0.797417
[49]	valid_0's auc: 0.797742
[50]	valid_0's auc: 0.798086
[51]	valid_0's auc: 0.798167
[52]	valid_0's auc:

[23]	valid_0's auc: 0.797142
[24]	valid_0's auc: 0.797968
[25]	valid_0's auc: 0.799223
[26]	valid_0's auc: 0.799717
[27]	valid_0's auc: 0.800248
[28]	valid_0's auc: 0.799995
[29]	valid_0's auc: 0.800214
[30]	valid_0's auc: 0.801319
[31]	valid_0's auc: 0.801506
[32]	valid_0's auc: 0.801966
[33]	valid_0's auc: 0.802478
[34]	valid_0's auc: 0.80301
[35]	valid_0's auc: 0.802863
[36]	valid_0's auc: 0.803038
[37]	valid_0's auc: 0.803045
[38]	valid_0's auc: 0.803216
[39]	valid_0's auc: 0.803284
[40]	valid_0's auc: 0.803061
[41]	valid_0's auc: 0.803729
[42]	valid_0's auc: 0.803447
[43]	valid_0's auc: 0.803102
[44]	valid_0's auc: 0.803308
[45]	valid_0's auc: 0.803657
[46]	valid_0's auc: 0.803653
[47]	valid_0's auc: 0.804048
[48]	valid_0's auc: 0.803942
[49]	valid_0's auc: 0.804231
[50]	valid_0's auc: 0.804581
[51]	valid_0's auc: 0.804665
[52]	valid_0's auc: 0.804562
[53]	valid_0's auc: 0.80424
[54]	valid_0's auc: 0.804507
[55]	valid_0's auc: 0.804628
[56]	valid_0's auc: 0.804497
[57]	valid_0's a

[28]	valid_0's auc: 0.800944
[29]	valid_0's auc: 0.801519
[30]	valid_0's auc: 0.801835
[31]	valid_0's auc: 0.802301
[32]	valid_0's auc: 0.802489
[33]	valid_0's auc: 0.802953
[34]	valid_0's auc: 0.803143
[35]	valid_0's auc: 0.803367
[36]	valid_0's auc: 0.803706
[37]	valid_0's auc: 0.803829
[38]	valid_0's auc: 0.80418
[39]	valid_0's auc: 0.804581
[40]	valid_0's auc: 0.805023
[41]	valid_0's auc: 0.805324
[42]	valid_0's auc: 0.805842
[43]	valid_0's auc: 0.806166
[44]	valid_0's auc: 0.806481
[45]	valid_0's auc: 0.80681
[46]	valid_0's auc: 0.806903
[47]	valid_0's auc: 0.807181
[48]	valid_0's auc: 0.807457
[49]	valid_0's auc: 0.80757
[50]	valid_0's auc: 0.80794
[51]	valid_0's auc: 0.808175
[52]	valid_0's auc: 0.808414
[53]	valid_0's auc: 0.808599
[54]	valid_0's auc: 0.809027
[55]	valid_0's auc: 0.80922
[56]	valid_0's auc: 0.809349
[57]	valid_0's auc: 0.809453
[58]	valid_0's auc: 0.809914
[59]	valid_0's auc: 0.810305
[60]	valid_0's auc: 0.810593
[61]	valid_0's auc: 0.810847
[62]	valid_0's auc:

[31]	valid_0's auc: 0.777795
[32]	valid_0's auc: 0.778425
[33]	valid_0's auc: 0.778606
[34]	valid_0's auc: 0.778576
[35]	valid_0's auc: 0.779005
[36]	valid_0's auc: 0.778873
[37]	valid_0's auc: 0.779091
[38]	valid_0's auc: 0.779393
[39]	valid_0's auc: 0.779822
[40]	valid_0's auc: 0.779876
[41]	valid_0's auc: 0.780238
[42]	valid_0's auc: 0.780563
[43]	valid_0's auc: 0.780742
[44]	valid_0's auc: 0.780913
[45]	valid_0's auc: 0.780868
[46]	valid_0's auc: 0.780964
[47]	valid_0's auc: 0.780963
[48]	valid_0's auc: 0.781201
[49]	valid_0's auc: 0.781294
[50]	valid_0's auc: 0.78139
[51]	valid_0's auc: 0.781482
[52]	valid_0's auc: 0.781581
[53]	valid_0's auc: 0.781748
[54]	valid_0's auc: 0.78196
[55]	valid_0's auc: 0.78216
[56]	valid_0's auc: 0.782283
[57]	valid_0's auc: 0.782434
[58]	valid_0's auc: 0.782645
[59]	valid_0's auc: 0.783351
[60]	valid_0's auc: 0.783941
[61]	valid_0's auc: 0.783961
[62]	valid_0's auc: 0.784332
[63]	valid_0's auc: 0.784349
[64]	valid_0's auc: 0.784386
[65]	valid_0's au

In [36]:
LGB_BO.max['target']

0.8235570039908574

In [37]:
LGB_BO.max['params']

{'bagging_fraction': 0.8679900331594039,
 'bagging_freq': 19.461295671833362,
 'feature_fraction': 0.6521303847699594,
 'learning_rate': 0.04001280271606868,
 'max_depth': 27.331557512738563,
 'num_leaves': 36.11800346099659,
 'reg_alpha': 0.18342287394159662,
 'reg_lambda': 6.304147153507928}

In [39]:
#保存最佳參數
best_parameters = { 'boosting_type': 'gbdt',              
                    'objective': 'binary',               
                    'class_weight': None, 
                    'importance_type': 'split', 
                    #'learning_rate': LGB_BO.max['params']['learning_rate'],
                    'learning_rate':0.1,
                    'max_depth': int(LGB_BO.max['params']['max_depth']),          
                    #'min_split_gain':  LGB_BO.max['params']['min_split_gain'],
                    'n_estimators': 200, 
                    'n_jobs': -1,
                    'num_leaves': int(LGB_BO.max['params']['num_leaves']),
                    #'min_data_in_leaf': int(LGB_BO.max['params']['min_data_in_leaf']),
                    #'min_sum_hessian_in_leaf': LGB_BO.max['params']['min_sum_hessian_in_leaf'],
                    'objective': 'binary', 
                    'random_state': 1, 
                    'reg_alpha':  LGB_BO.max['params']['reg_alpha'], 
                    'reg_lambda': int(LGB_BO.max['params']['reg_lambda']), 
                    'silent': True, 
                    'is_unbalance': True,
                    'metric': 'auc', 
                    'verbose': 0,
                    'feature_fraction': LGB_BO.max['params']['feature_fraction'],
                    'bagging_fraction': LGB_BO.max['params']['bagging_fraction'],  
                    'bagging_freq': int(LGB_BO.max['params']['bagging_freq']), 
                    'cat_smooth': 1}



In [71]:
best_parameters = { 'boosting_type': 'gbdt',              
                    'objective': 'binary',               
                    'class_weight': None, 
                    'importance_type': 'split', 
                    'learning_rate': 0.04,
                    'learning_rate':0.1,
                    'max_depth': 23,          
                    'min_split_gain':  0.9,
                    'n_estimators': 200, 
                    'n_jobs': -1,
                    'num_leaves': 27,
                    #'min_data_in_leaf': int(LGB_BO.max['params']['min_data_in_leaf']),
                    'min_sum_hessian_in_leaf': 0.02,
                    'objective': 'binary', 
                    'random_state': 1, 
                    'reg_alpha':  0.02, 
                    'reg_lambda': 35, 
                    'silent': True, 
                    'is_unbalance': True,
                    'metric': 'auc', 
                    'verbose': 0,
                    'feature_fraction':0.8,
                    'bagging_fraction': 0.7,  
                    'bagging_freq': 10, 
                    'cat_smooth': 1}

In [54]:
best_parameters ={'boosting_type': 'gbdt', 'importance_type': 'split', 'learning_rate': 0.1,
                  'max_depth': 20, 'n_estimators': 200, 'n_jobs': -1,'num_leaves': 35, 
                  'objective': 'binary', 'random_state': 1, 'silent': True, 'metric': 'auc', 
                  'verbose': 0, 'feature_fraction': 0.6, 'bagging_fraction': 0.7, 'bagging_freq': 8, 
                  'lambda_l1': 0.1, 'lambda_l2': 35, 'cat_smooth': 1}

### 訓練LightGBM模型

In [21]:
train_df_2.shape #train_reprocess_new_columns.csv #0.8403033316326531

(100000, 132)

In [32]:
train_df_2 = pd.read_csv('train_reprocess_knn_v2.csv',encoding='big5')
test_df_2 = pd.read_csv('test_reprocess_knn_v2.csv',encoding='big5')

#new_col = feat_labels[indices[range(X_selected.shape[1])]]
#train_df_2 = pd.concat([train_df[new_col],train_df['Y1']],axis=1)
#test_df_2 = test_df[new_col]

In [33]:
train_df_2.shape #train_new_columns_OHE.csv #0.8419634336734694

(100000, 154)

In [57]:
nfold = 10
skf = StratifiedKFold(n_splits = nfold, shuffle=True, random_state=1)

tStart = time.time()#計時開始

val_predict = np.zeros(len(train_df_2))
predictions = np.zeros((len(test_df_2), nfold))

predictors = train_df_2.drop(['Y1','CUS_ID'],axis=1).columns
target = 'Y1'

i = 0
for train_index, valid_index in skf.split(train_df_2, train_df_2[target]):
    print("\nfold {}".format(i+1))
    lgb_train = lgb.Dataset(train_df_2.iloc[train_index][predictors],
                            train_df_2.iloc[train_index][target])
    
    lgb_valid = lgb.Dataset(train_df_2.iloc[valid_index][predictors],
                            train_df_2.iloc[valid_index][target])
    
    evals_result = {} #紀錄訓練結果
    clf = lgb.train(best_parameters, lgb_train, valid_sets = lgb_valid, early_stopping_rounds = 100, evals_result=evals_result)
    
    val_predict[valid_index] = clf.predict(train_df_2.iloc[valid_index][predictors], num_iteration = clf.best_iteration)
    predictions[:,i] += clf.predict(test_df_2[predictors], num_iteration = clf.best_iteration)
    i += 1

    #print('Training Results...') #訓練過程
    #ax = lgb.plot_metric(evals_result, metric='auc')
    #plt.show()

    #print('Features Importance...') #重要特徵
    #ax = lgb.plot_importance(clf, max_num_features=10)
    #plt.show()

    #print('Plot Tree...')  # 畫出決策樹
    #ax = lgb.plot_tree(clf, figsize=(20, 8), show_info=['split_gain'])
    #plt.show()

    #print('導出最後一輪决策樹的圖到本地') #需安装graphviz
    #graph = lgb.create_tree_digraph(clf, name='Tree_MT_fold' +str(i))
    #graph.render(view=True)

tEnd = time.time()#計時結束
print("\n\n AUC in train data: %.4f" % (metrics.roc_auc_score(train_df_2[target], val_predict)))
print ("It cost %f sec" % (tEnd - tStart))#會自動做進位


fold 1


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


[1]	valid_0's auc: 0.776239
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.794213
[3]	valid_0's auc: 0.796766
[4]	valid_0's auc: 0.803663
[5]	valid_0's auc: 0.80564
[6]	valid_0's auc: 0.806644
[7]	valid_0's auc: 0.811501
[8]	valid_0's auc: 0.813866
[9]	valid_0's auc: 0.81563
[10]	valid_0's auc: 0.814623
[11]	valid_0's auc: 0.8164
[12]	valid_0's auc: 0.817165
[13]	valid_0's auc: 0.818406
[14]	valid_0's auc: 0.819415
[15]	valid_0's auc: 0.820952
[16]	valid_0's auc: 0.82134
[17]	valid_0's auc: 0.822231
[18]	valid_0's auc: 0.822328
[19]	valid_0's auc: 0.823854
[20]	valid_0's auc: 0.823071
[21]	valid_0's auc: 0.823133
[22]	valid_0's auc: 0.823618
[23]	valid_0's auc: 0.824298
[24]	valid_0's auc: 0.825713
[25]	valid_0's auc: 0.825978
[26]	valid_0's auc: 0.826792
[27]	valid_0's auc: 0.826518
[28]	valid_0's auc: 0.826907
[29]	valid_0's auc: 0.827466
[30]	valid_0's auc: 0.827462
[31]	valid_0's auc: 0.827728
[32]	valid_0's auc: 0.82817
[33]	valid_0's auc: 0.82

[136]	valid_0's auc: 0.848532
[137]	valid_0's auc: 0.848242
[138]	valid_0's auc: 0.84833
[139]	valid_0's auc: 0.848473
[140]	valid_0's auc: 0.848831
[141]	valid_0's auc: 0.848928
[142]	valid_0's auc: 0.849231
[143]	valid_0's auc: 0.849397
[144]	valid_0's auc: 0.849422
[145]	valid_0's auc: 0.849642
[146]	valid_0's auc: 0.849433
[147]	valid_0's auc: 0.849689
[148]	valid_0's auc: 0.849577
[149]	valid_0's auc: 0.849842
[150]	valid_0's auc: 0.849526
[151]	valid_0's auc: 0.849684
[152]	valid_0's auc: 0.84983
[153]	valid_0's auc: 0.849996
[154]	valid_0's auc: 0.850299
[155]	valid_0's auc: 0.850343
[156]	valid_0's auc: 0.850288
[157]	valid_0's auc: 0.850303
[158]	valid_0's auc: 0.850884
[159]	valid_0's auc: 0.850596
[160]	valid_0's auc: 0.850853
[161]	valid_0's auc: 0.850633
[162]	valid_0's auc: 0.850317
[163]	valid_0's auc: 0.850348
[164]	valid_0's auc: 0.850125
[165]	valid_0's auc: 0.849882
[166]	valid_0's auc: 0.850294
[167]	valid_0's auc: 0.850131
[168]	valid_0's auc: 0.849837
[169]	valid_

[13]	valid_0's auc: 0.826107
[14]	valid_0's auc: 0.827953
[15]	valid_0's auc: 0.82887
[16]	valid_0's auc: 0.828817
[17]	valid_0's auc: 0.829859
[18]	valid_0's auc: 0.830428
[19]	valid_0's auc: 0.831323
[20]	valid_0's auc: 0.831807
[21]	valid_0's auc: 0.83236
[22]	valid_0's auc: 0.83323
[23]	valid_0's auc: 0.833617
[24]	valid_0's auc: 0.833466
[25]	valid_0's auc: 0.833743
[26]	valid_0's auc: 0.832864
[27]	valid_0's auc: 0.832715
[28]	valid_0's auc: 0.833038
[29]	valid_0's auc: 0.83312
[30]	valid_0's auc: 0.833585
[31]	valid_0's auc: 0.833813
[32]	valid_0's auc: 0.833836
[33]	valid_0's auc: 0.833701
[34]	valid_0's auc: 0.833783
[35]	valid_0's auc: 0.833678
[36]	valid_0's auc: 0.83395
[37]	valid_0's auc: 0.834128
[38]	valid_0's auc: 0.834469
[39]	valid_0's auc: 0.834503
[40]	valid_0's auc: 0.83413
[41]	valid_0's auc: 0.834401
[42]	valid_0's auc: 0.834476
[43]	valid_0's auc: 0.8342
[44]	valid_0's auc: 0.833491
[45]	valid_0's auc: 0.833814
[46]	valid_0's auc: 0.834123
[47]	valid_0's auc: 0.

[134]	valid_0's auc: 0.828154
[135]	valid_0's auc: 0.828263
[136]	valid_0's auc: 0.828446
[137]	valid_0's auc: 0.82853
[138]	valid_0's auc: 0.82845
[139]	valid_0's auc: 0.828337
[140]	valid_0's auc: 0.828132
[141]	valid_0's auc: 0.827884
[142]	valid_0's auc: 0.827923
[143]	valid_0's auc: 0.827874
[144]	valid_0's auc: 0.827669
[145]	valid_0's auc: 0.827306
[146]	valid_0's auc: 0.827012
[147]	valid_0's auc: 0.827048
[148]	valid_0's auc: 0.826965
[149]	valid_0's auc: 0.82672
[150]	valid_0's auc: 0.826589
[151]	valid_0's auc: 0.826377
[152]	valid_0's auc: 0.826072
[153]	valid_0's auc: 0.826121
[154]	valid_0's auc: 0.826057
[155]	valid_0's auc: 0.826453
[156]	valid_0's auc: 0.826406
[157]	valid_0's auc: 0.826541
[158]	valid_0's auc: 0.826895
[159]	valid_0's auc: 0.826658
[160]	valid_0's auc: 0.826754
[161]	valid_0's auc: 0.826973
[162]	valid_0's auc: 0.827212
[163]	valid_0's auc: 0.82771
[164]	valid_0's auc: 0.828116
[165]	valid_0's auc: 0.827839
[166]	valid_0's auc: 0.827858
[167]	valid_0'

[43]	valid_0's auc: 0.845194
[44]	valid_0's auc: 0.844843
[45]	valid_0's auc: 0.844219
[46]	valid_0's auc: 0.844594
[47]	valid_0's auc: 0.844244
[48]	valid_0's auc: 0.844296
[49]	valid_0's auc: 0.845055
[50]	valid_0's auc: 0.845809
[51]	valid_0's auc: 0.845364
[52]	valid_0's auc: 0.845646
[53]	valid_0's auc: 0.845723
[54]	valid_0's auc: 0.846289
[55]	valid_0's auc: 0.84647
[56]	valid_0's auc: 0.84662
[57]	valid_0's auc: 0.846475
[58]	valid_0's auc: 0.846968
[59]	valid_0's auc: 0.846944
[60]	valid_0's auc: 0.847563
[61]	valid_0's auc: 0.848118
[62]	valid_0's auc: 0.84783
[63]	valid_0's auc: 0.848359
[64]	valid_0's auc: 0.848263
[65]	valid_0's auc: 0.848368
[66]	valid_0's auc: 0.848878
[67]	valid_0's auc: 0.848582
[68]	valid_0's auc: 0.848419
[69]	valid_0's auc: 0.848532
[70]	valid_0's auc: 0.848459
[71]	valid_0's auc: 0.848527
[72]	valid_0's auc: 0.848616
[73]	valid_0's auc: 0.848596
[74]	valid_0's auc: 0.848506
[75]	valid_0's auc: 0.848372
[76]	valid_0's auc: 0.848704
[77]	valid_0's au

[122]	valid_0's auc: 0.853103
[123]	valid_0's auc: 0.85343
[124]	valid_0's auc: 0.853573
[125]	valid_0's auc: 0.853127
[126]	valid_0's auc: 0.852981
[127]	valid_0's auc: 0.85304
[128]	valid_0's auc: 0.85298
[129]	valid_0's auc: 0.85308
[130]	valid_0's auc: 0.853013
[131]	valid_0's auc: 0.853008
[132]	valid_0's auc: 0.85259
[133]	valid_0's auc: 0.852534
[134]	valid_0's auc: 0.852477
[135]	valid_0's auc: 0.852092
[136]	valid_0's auc: 0.851882
[137]	valid_0's auc: 0.85189
[138]	valid_0's auc: 0.851982
[139]	valid_0's auc: 0.851953
[140]	valid_0's auc: 0.851898
[141]	valid_0's auc: 0.851796
[142]	valid_0's auc: 0.851643
[143]	valid_0's auc: 0.851758
[144]	valid_0's auc: 0.852136
[145]	valid_0's auc: 0.852093
[146]	valid_0's auc: 0.852179
[147]	valid_0's auc: 0.85212
[148]	valid_0's auc: 0.852338
[149]	valid_0's auc: 0.852563
[150]	valid_0's auc: 0.852863
[151]	valid_0's auc: 0.853135
[152]	valid_0's auc: 0.853091
[153]	valid_0's auc: 0.853083
[154]	valid_0's auc: 0.853023
[155]	valid_0's a


fold 10
[1]	valid_0's auc: 0.766277
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.80796
[3]	valid_0's auc: 0.80452
[4]	valid_0's auc: 0.809259
[5]	valid_0's auc: 0.815189
[6]	valid_0's auc: 0.82113
[7]	valid_0's auc: 0.823174
[8]	valid_0's auc: 0.823445
[9]	valid_0's auc: 0.825395
[10]	valid_0's auc: 0.826179
[11]	valid_0's auc: 0.826677
[12]	valid_0's auc: 0.828966
[13]	valid_0's auc: 0.829331
[14]	valid_0's auc: 0.830505
[15]	valid_0's auc: 0.832155
[16]	valid_0's auc: 0.832341
[17]	valid_0's auc: 0.832986
[18]	valid_0's auc: 0.83284
[19]	valid_0's auc: 0.833351
[20]	valid_0's auc: 0.833833
[21]	valid_0's auc: 0.83443
[22]	valid_0's auc: 0.835003
[23]	valid_0's auc: 0.835881
[24]	valid_0's auc: 0.836106
[25]	valid_0's auc: 0.836272
[26]	valid_0's auc: 0.837095
[27]	valid_0's auc: 0.838818
[28]	valid_0's auc: 0.839362
[29]	valid_0's auc: 0.839459
[30]	valid_0's auc: 0.840864
[31]	valid_0's auc: 0.841544
[32]	valid_0's auc: 0.841771
[33]	valid_0's

In [58]:
print("\n\n AUC in train data: %.4f | It costs %f sec" % (metrics.roc_auc_score(train_df_2[target], val_predict), (tEnd - tStart)))



 AUC in train data: 0.8411 | It costs 51.524124 sec


In [80]:
metrics.roc_auc_score(train_df_2[target], val_predict) #train_reprocess #0.8422430076530614

0.8422430076530614

### 平均folds預測結果

In [60]:
predictions

array([[0.04297558, 0.0225673 , 0.03223007, ..., 0.03526879, 0.02907358,
        0.04309781],
       [0.00218763, 0.0017327 , 0.00192614, ..., 0.00164087, 0.00139406,
        0.00239386],
       [0.00485049, 0.00253477, 0.00458762, ..., 0.00381704, 0.0033667 ,
        0.00414319],
       ...,
       [0.12882683, 0.12850523, 0.20422815, ..., 0.15807645, 0.1992501 ,
        0.05835648],
       [0.06187036, 0.10217092, 0.09807307, ..., 0.10286012, 0.12936324,
        0.02151471],
       [0.03975468, 0.08504033, 0.062961  , ..., 0.12260089, 0.07900538,
        0.01733423]])

In [41]:
prediction2 = predictions

In [44]:
predictions[:,[1,5,6,7]].shape

(150000, 4)

In [61]:
predictions_transform = sum(predictions[:,[1,6,7]].T) /3

In [63]:
predictions_transform

array([0.03297947, 0.00174498, 0.00348902, ..., 0.13432976, 0.09708225,
       0.09433066])

In [62]:
submit = pd.DataFrame({'CUS_ID': test_df_2['CUS_ID'].astype(np.int64),'Ypred': predictions_transform})#rank_predictions.reshape(-1,)})
submit.to_csv('Lightgbm_reprocess_prob_26_knn.csv',index=False)

In [None]:
#清除函数占用的内存
'''
for x in list(locals().keys()):
    del locals()[x]
'''
gc.collect()

## ----------------------------------------------------------------------------------------------------------------------------------

In [None]:
def roc_curve_plot(df_true, df_predict):
    fpr,tpr,th = metrics.roc_curve(df_true, df_predict)

    plt.figure(figsize=(14,6))
    plt.plot(fpr,tpr,color='r')
    plt.plot([0,1],[0,1],color='b')
    plt.title('Reciever operating Charactaristics')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.show()
    
def roc_thresholds(df_true, df_predict, min_tpr, max_fpr):
    fpr,tpr,th = metrics.roc_curve(df_true, df_predict)
    thresholds_df = pd.DataFrame({'fpr':fpr,'tpr':tpr,'threshold':th}).sort_values(by="tpr" , ascending=False)
    a = thresholds['tpr'] > min_tpr
    b = thresholds['fpr'] < max_fpr
    return thresholds_df[(a & b)]


def conf_matrix(df_true, df_predict, threshold):
    y_pred_turn = pd.Series(df_predict).apply(lambda x: 1 if x > threshold else 0)
    matrix = metrics.confusion_matrix(y_true = df_true, 
                                      y_pred = y_pred_turn)
    score = metrics.roc_auc_score(df_true, y_pred_turn)

    fig, ax = plt.subplots(figsize= (5,5))
    ax.matshow(matrix, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            ax.text(x=j, y=i, s=matrix[i,j], va='center', ha='center')  
    plt.xlabel('Predict')
    plt.ylabel('True')
    plt.show()

    print('\n\nAUC after transformation：',score)

In [None]:
roc_curve_plot(train_df[target], val_predict)

### test records

In [None]:
'''
Lightgbm_reprocess_1.csv  sum_of_1 = 44796
threshold：0.02
AUC after transformation：0.7581462585034013, TBrain_AUC：0.762815

{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.01,
 'max_depth': 15, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1,
 'num_leaves': 35, 'objective': 'binary', 'random_state': None, silent': True,
 'subsample_for_bin': 200000, 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.6,
 'bagging_fraction': 0.8, 'bagging_freq': 2, 'lambda_l1': 0.1, 'lambda_l2': 10, 'cat_smooth': 1}
'''

In [None]:
''' 
Lightgbm_reprocess_2.csv[Lightgbm_reprocess_prob_3.csv]  sum_of_1 = 37818  
threshold：0.021 
AUC in training: 0.8401, AUC after transformation：0.7616156462585034, TBrain_AUC：0.7746192893,  TBrain_AUC[new]：0.8485426599

{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1,
 'max_depth': 20, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1,
 'num_leaves': 35, 'objective': 'binary', 'random_state': None, 'silent': True,
 'subsample_for_bin': 200000, 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.6,
 'bagging_fraction': 0.7, 'bagging_freq': 8, 'lambda_l1': 0.1, 'lambda_l2': 35, 'cat_smooth': 1}
'''

In [None]:
''' BEST
Lightgbm_reprocess_prob_11.csv 
AUC in training:0.840415714285, TBrain_AUC：0.8492443858

{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1,
 'max_depth': 20, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1,
 'num_leaves': 37, 'objective': 'binary', 'random_state': None, 'silent': True,'subsample_for_bin': 200000, 
 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.6, 'bagging_fraction': 0.7, 'bagging_freq': 8, 
 'lambda_l1': 0.1, 'lambda_l2': 35, 'cat_smooth': 1}
 
Lightgbm_reprocess_prob_13.csv 
AUC in training:0.8405826556122449, TBrain_AUC：XXX
early_stopping_rounds = 10 -> 15

Lightgbm_reprocess_prob_14.csv 
AUC in training:0.8409510178571429, TBrain_AUC：XXX
early_stopping_rounds = 10 -> 15 -> 20

Lightgbm_reprocess_prob_15.csv 
AUC in training:0.841293762755102, TBrain_AUC：0.8494192487
early_stopping_rounds = 10 -> 15 -> 20 -> 30

Lightgbm_reprocess_prob_16.csv 
AUC in training:0.8422430076530614, TBrain_AUC：0.8498711743
early_stopping_rounds = 10 -> 15 -> 20 -> 30
n_estimators = 100 -> 200

Lightgbm_reprocess_prob_24.csv
取Lightgbm_reprocess_prob_16中的 predictions[:,[1,2,3,5,6,7,9]]做平均  TBrain_AUC：0.8503344027

Lightgbm_reprocess_prob_21_knn.csv
AUC in training:0.8412987576530613, TBrain_AUC：0.8498704027

Lightgbm_reprocess_prob_22_knn.csv
AUC in training:0.842063642857143, TBrain_AUC： 0.8484891777
best_parameters ={'boosting_type': 'gbdt', 'class_weight': None, 'importance_type': 'split', 'learning_rate': 0.05,
 'max_depth': 20, 'n_estimators': 200, 'n_jobs': -1,
 'num_leaves': 35, 'objective': 'binary', 'random_state': None, 'silent': True, 
 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5, 
 'lambda_l1': 0.1, 'lambda_l2': 35, 'cat_smooth': 1}

Lightgbm_reprocess_prob_20.csv 
AUC in training:0.8429781198979592, TBrain_AUC：XXX
early_stopping_rounds = 10 -> 15 -> 20 -> 30
n_estimators = 100 -> 200
lambda_l2 = 35 -> 40
 

Lightgbm_reprocess_prob_17.csv 
AUC in training:0.8422174158163266, TBrain_AUC：0.8474860643
early_stopping_rounds = 10 -> 15 -> 20 -> 30 -> 50
n_estimators = 100 -> 200 -> 300
boosting_type = 'goss'

Lightgbm_reprocess_prob_19.csv 
AUC in training:0.8423614770408163, TBrain_AUC：0.8484761421
early_stopping_rounds = 10 -> 15 -> 20 -> 30 -> 35
n_estimators = 100 -> 200
bagging_freq = 8 -> 9
'''

In [None]:
'''
Lightgbm_reprocess_prob_18.csv (using [train_new_columns_OHE.csv] data)
AUC in training:0.8431564438775511, TBrain_AUC：0.8480138071

{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1,
 'max_depth': 20, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 300, 'n_jobs': -1,
 'num_leaves': 37, 'objective': 'binary', 'random_state': None, 'silent': True,'subsample_for_bin': 200000, 
 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.6, 'bagging_fraction': 0.7, 'bagging_freq': 8, 
 'lambda_l1': 0.1, 'lambda_l2': 35, 'cat_smooth': 1}
 
 '''

In [None]:
'''
Lightgbm_reprocess_prob_12.csv
AUC in training: 0.8410963954 , TBrain_AUC：0.8487294078

{best_parameters ={'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1,
 'max_depth': 20, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1,
 'num_leaves': 38, 'objective': 'binary', 'random_state': 1, 'silent': True, 'subsample': 1.0,
 'subsample_for_bin': 200000, 'subsample_freq': 0, 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.6,
 'bagging_fraction': 0.7, 'bagging_freq': 8, 'lambda_l1': 0.1, 'lambda_l2': 35, 'cat_smooth': 1}
'''

In [None]:
'''
Lightgbm_reprocess_3.csv(without doing OHE)[Lightgbm_reprocess_prob_4.csv]  sum_of_1 = 28523
threshold：0.02111, 
AUC in training: 0.8424, AUC after transformation：0.7648979591836734, TBrain_AUC：0.6717529611, TBrain_AUC[new]：0.8470215228

{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1,
 'max_depth': 20, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1,
 'num_leaves': 35, 'objective': 'binary', 'random_state': None, 'silent': True,
 'subsample_for_bin': 200000, 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.6,
 'bagging_fraction': 0.7, 'bagging_freq': 5, 'lambda_l1': 0.1, 'lambda_l2': 35, 'cat_smooth': 1}
'''

In [None]:
'''
Lightgbm_reprocess_4.csv  sum_of_1 = 76059
threshold = 0.5
AUC in training: 0.8319, AUC after transformation：0.7504285714285714, Tbrain：0.7183654822 

{'boosting_type': 'gbdt', 'objective': 'binary', 'class_weight': 'balanced', 'importance_type': 'split',
 'learning_rate': 0.048845927448988316, 'max_depth': 47, 'min_child_samples': 15, 'min_split_gain': 0.6704675101784022,
 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 16, 'min_data_in_leaf': 5, 'min_sum_hessian_in_leaf': 0.020923509638119637,
 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'is_unbalance': True, 'subsample_for_bin': 200000,
 'subsample_freq': 0, 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.6616450202010071, 'bagging_fraction': 0.6,
 'bagging_freq': 8, 'lambda_l1': 11.28791771604942, 'lambda_l2': 37.40878001587038, 'cat_smooth': 1}
'''

In [None]:
'''
Lightgbm_reprocess_5.csv[Lightgbm_reprocess_prob_2.csv]  sum_of_1 = 38022
threshold = {'fold1': 0.4800000000000001, 'fold2': 0.4800000000000001, 'fold3': 0.44000000000000006, 'fold4': 0.4800000000000001,
             'fold5': 0.4600000000000001, 'fold6': 0.5600000000000002, 'fold7': 0.5200000000000001,  'fold8': 0.5400000000000001,
             'fold9': 0.4800000000000001, 'fold10': 0.42000000000000004}
             
AUC in training: 0.8355, AUC after transformation：XXX , Tbrain：0.8443494958

{'boosting_type': 'gbdt', 'objective': 'binary', 'class_weight': None, 'importance_type': 'split',
 'learning_rate': 0.054526359848345994, 'max_depth': 22, 'min_child_samples': 20, 'min_split_gain': 0.9620310653179346,
 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 24, 'min_data_in_leaf': 19, 'min_sum_hessian_in_leaf': 0.04858829899583882,
 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'is_unbalance': True, 'subsample_for_bin': 2000,
 'subsample_freq': 4, 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.7829084970031792, 'bagging_fraction': 0.7,
 'bagging_freq': 8, 'lambda_l1': 49.83708262075042, 'lambda_l2': 49.59441610333089, 'cat_smooth': 1}
'''

In [None]:
'''
Lightgbm_reprocess_prob.csv  
AUC in training: 0.8349, AUC after transformation：XXX , Tbrain：0.842534

{'boosting_type': 'gbdt', 'objective': 'binary', 'class_weight': None, 'importance_type': 'split', 
 'learning_rate': 0.047364271859971274, 'max_depth': 10, 'min_split_gain': 0.293237287812807, 
 'n_estimators': 200, 'n_jobs': -1, 'num_leaves': 401, 'min_data_in_leaf': 46, 'min_sum_hessian_in_leaf': 0.03313119848815982,
 'random_state': 1, 'reg_alpha': 48.77369736176732, 'reg_lambda': 41.514456471954624, 'silent': True, 'is_unbalance': True,
 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.7381674547563201, 'bagging_fraction': 0.5018758497121245, 'bagging_freq': 10,
 'cat_smooth': 1}
'''

In [None]:
'''
Lightgbm_reprocess_prob_5.csv
LGB_BO.max['target']: 0.8339245026344717, AUC in training: 0.8362, , Tbrain：XXX

{'boosting_type': 'gbdt', 'objective': 'binary', 'class_weight': None, 'importance_type': 'split',
 'learning_rate': 0.036462505977644125, 'max_depth': 34, 'min_split_gain': 0.16535419711693278,
 'n_estimators': 200, 'n_jobs': -1, 'num_leaves': 508, 'min_data_in_leaf': 31, 'min_sum_hessian_in_leaf': 0.0927581071815638,
 'random_state': 1, 'reg_alpha': 37.54060515680778, 'reg_lambda': 36.29989926752258, 'silent': True, 'is_unbalance': True,
 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.7412424112438712, 'bagging_fraction': 0.7710205745863651, 'bagging_freq': 10,
 'cat_smooth': 1}
'''

In [None]:
'''
Lightgbm_reprocess_prob_6.csv
LGB_BO.max['target']: 0.838803567546262, AUC in training: 0.8403, , Tbrain：0.8473665651

{'boosting_type': 'dart', 'objective': 'binary', 'class_weight': None, 'importance_type': 'split', 'learning_rate': 0.06611771474105235,
 'max_depth': 40, 'min_split_gain': 0.8946066635038473, 'n_estimators': 200, 'n_jobs': -1, 'num_leaves': 36, 'min_data_in_leaf': 55,
 'min_sum_hessian_in_leaf': 0.008595916715840812, 'random_state': 1, 'reg_alpha': 8.491520978228445, 'reg_lambda': 43.90712517147065,
 'silent': True, 'is_unbalance': True, 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.9841307878596988,
 'bagging_fraction': 0.9003722843377684, 'bagging_freq': 10, 'cat_smooth': 1}
'''

In [None]:
'''
Lightgbm_reprocess_prob_7.csv  #using [train_reprocess_2.csv] data
LGB_BO.max['target']: 0.8409108462598369, AUC in training: 0.8403, , Tbrain：0.8467020914

{'boosting_type': 'dart', 'objective': 'binary', 'class_weight': None, 'importance_type': 'split', 'learning_rate': 0.06611771474105235,
 'max_depth': 40, 'min_split_gain': 0.8946066635038473, 'n_estimators': 200, 'n_jobs': -1, 'num_leaves': 36,
 'min_data_in_leaf': 55, 'min_sum_hessian_in_leaf': 0.008595916715840812, 'random_state': 1,
 'reg_alpha': 8.491520978228445, 'reg_lambda': 43.90712517147065, 'silent': True, 'is_unbalance': True, 'metric': 'auc',
 'verbose': 0, 'feature_fraction': 0.9841307878596988, 'bagging_fraction': 0.9003722843377684, 'bagging_freq': 10, 'cat_smooth': 1}
'''

In [None]:
'''
Lightgbm_reprocess_prob_8.csv  #using [train_reprocess_2.csv] data
LGB_BO.max['target']: 0.8400115824555283, AUC in training: 0.8416, , Tbrain：0.8451220575

{'boosting_type': 'gbdt', 'objective': 'binary', 'class_weight': None, 'importance_type': 'split','learning_rate': 0.03215888771389792,
 'max_depth': 41, 'min_split_gain': 0.16535419711693278, 'n_estimators': 200, 'n_jobs': -1,'num_leaves': 76,
 'min_data_in_leaf': 29, 'min_sum_hessian_in_leaf': 0.0927581071815638, 'random_state': 1,
 'reg_alpha': 37.54060515680778, 'reg_lambda': 36.29989926752258, 'silent': True, 'is_unbalance': True, 'metric': 'auc',
 'verbose': 0, 'feature_fraction': 0.5687373520731187, 'bagging_fraction': 0.9517009576439417, 'bagging_freq': 10, 'cat_smooth': 1}
'''

In [None]:
'''
Lightgbm_reprocess_prob_9.csv  #using [train_new_columns_OHE.csv] data
AUC in training: 0.8411265739795919, , Tbrain：0.8471926768

{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1,
 'max_depth': 20, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1,
 'num_leaves': 35, 'objective': 'binary', 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 1.0,
 'subsample_for_bin': 200000, 'subsample_freq': 0, 'metric': 'auc', 'verbose': 0, 'feature_fraction': 0.6,
 'bagging_fraction': 0.7, 'bagging_freq': 8, 'lambda_l1': 0.1, 'lambda_l2': 35, 'cat_smooth': 1, 'learn_rate': 0.02}
'''

In [None]:
'''
Lightgbm_reprocess_prob_10.csv  #using [train_reprocess_nan.csv] data
AUC in training: 0.9505286122448979, , Tbrain：0.7894146058

{'boosting_type': 'gbdt', 'objective': 'binary', 'class_weight': None, 'importance_type': 'split',
 'learning_rate': 0.09746013255697664, 'max_depth': 22, 'min_split_gain': 0.9051459971534626, 'n_estimators': 100,
 'n_jobs': -1, 'num_leaves': 30, 'min_data_in_leaf': 91, 'min_sum_hessian_in_leaf': 0.008595916715840812, 'random_state': 1,
 'reg_alpha': 3.396608391291378, 'reg_lambda': 17.56285006858826, 'silent': True, 'is_unbalance': True,
 'metric': 'auc', 'verbose': 0, 'bagging_fraction': 0.940223370602661, 'bagging_freq': 9, 'cat_smooth': 1}
'''

In [None]:
best_parameters

### 參考網站(貝葉斯全局優化)：https://blog.csdn.net/qq_42283960/article/details/88317003

### 參考網站(特徵工程)：https://www.zhihu.com/question/28641663