### 导包

In [None]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from hyperopt import hp
from toad.metrics import KS

import sys
sys.path.insert(0, r'D:\projects\auto_ml')

from auto_ml import *
from features_filtering import *

### 读取数据

In [None]:
# Load the raw modelling sample. Later cells rely on the identifier/date
# columns used for indexing (uuid, payout_date, ...) and on the label
# column `target`.
result = pd.read_csv('data.csv')

### 拆分dev与oot

In [None]:
# Split the sample by payout month: rows whose payout_date starts with one of
# the months below form the development window (dev); every other row --
# including rows with a missing payout_date, for which isin() yields False --
# goes to the out-of-time sample (oot).
dev_months = [
    '2021-12', '2022-01', '2022-02', '2022-03', '2022-04', '2022-05', '2022-06']
# Identifier / date columns that are moved into the index so they are not
# treated as model features downstream.
key_cols = ['uuid', 'certno', 'custname', 'phone', 'channel_flag', 'his_overdue_date', 'occur_date', 'payout_date']

# Build the month mask once so dev and oot are complementary by construction.
in_dev_window = result['payout_date'].str.slice(0, 7).isin(dev_months)
# set_index returns a new frame, so the filtered selections of `result` are
# never mutated in place.
dev = result[in_dev_window].set_index(key_cols)
oot = result[~in_dev_window].set_index(key_cols)
# Sanity check: no row lost or duplicated by the split.
assert dev.shape[0] + oot.shape[0] == result.shape[0]

### 字段类别

In [1]:
# 定义查看变量类型的方法


#### 查看每种类型所包含的变量数量

### 剔除缺失率95%、同值率95%的变量

In [None]:
# Profile every variable in dev against the label `target`.
# NOTE(review): variable_quality comes from the project's star imports; it
# presumably writes the '2.变量EDA.xlsx' workbook that is read back below --
# confirm.
variable_quality(dev, y_label='target', check=False, iv_limit=0)

# Load the per-variable report and show how many variables fall under each
# removal reason (NaN counts are included via dropna=False).
eda_info = pd.read_excel('2.变量EDA.xlsx', sheet_name='变量IV、同值率、缺失率')
eda_info['rm_reason'].value_counts(dropna=False)

In [None]:
# Variables without a removal reason survive the EDA filter; keep them plus
# the label column.
kept_mask = eda_info['rm_reason'].isnull()
kept_vars = eda_info.loc[kept_mask, 'variable'].astype(str).tolist()
dev_filtered1 = dev[kept_vars + ['target']]
# oot must carry exactly the same columns, in the same order, as dev.
oot_filtered1 = oot[dev_filtered1.columns]

# Shapes after the filter (rows, columns)
dev_filtered1.shape, oot_filtered1.shape

#### 查看每种类型所包含的变量数量

### 剔除方差为0的变量

In [None]:
# Drop features whose variance does not exceed the threshold (0 here).
# NOTE(review): VarThreshold is a project helper; presumably the selection is
# derived from dev and the same columns are kept in oot -- confirm.
dev_filtered2, oot_filtered2 = VarThreshold(dev_filtered1, oot_filtered1, threshold=0)

# Shapes after the filter (rows, columns)
dev_filtered2.shape, oot_filtered2.shape

((65160, 1699), (27339, 1699))

#### 查看每种类型所包含的变量数量

### 计算psi值

In [None]:
# Compute PSI between dev and oot for every remaining variable.
psi_val, psi_frame = get_psi_info(dev_filtered2, oot_filtered2)

# Persist both PSI tables into one workbook. The context manager saves and
# closes the file on exit; ExcelWriter.save() is deprecated (removed in
# pandas 2.0) and calling it before close() was redundant anyway.
with pd.ExcelWriter('变量psi.xlsx') as writer:
    psi_val.to_excel(writer, sheet_name='psi val')
    psi_frame.to_excel(writer, sheet_name='psi frame')
# Read the summary sheet back; the exported index arrives as 'Unnamed: 0'.
psi_val = pd.read_excel('变量psi.xlsx', sheet_name='psi val').set_index('Unnamed: 0')
# Variables above the PSI cut-off are flagged as unstable (0.1 or 0.2 are the
# usual cut-offs); they are only reported here, not dropped.
del_cols = psi_val[psi_val['psi_all'] > 0.1].index.tolist()
cols = psi_val[psi_val['psi_all'] <= 0.1].index.tolist()

print(f"有 {len(del_cols)} 个变量不稳定，可能会被剔除，建议后续复查变量，重点检查入模变量的稳定性")

### 拆分train、test、oot

In [None]:
# Separate features from the label; missing feature values are replaced by
# the sentinel -99999 in both samples.
y_dev = dev_filtered2['target']
X_dev = dev_filtered2.drop(columns='target').fillna(-99999)

y_oot = oot_filtered2['target']
X_oot = oot_filtered2.drop(columns='target').fillna(-99999)

# Stratified 70/30 split of dev into train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X_dev,
    y_dev,
    test_size=0.3,
    stratify=y_dev,
    random_state=68,
)

### 设置初始参数空间

In [None]:
# Explore baseline model performance for seeds 0..99 and export the results.
# NOTE(review): lgb_model_exploration is a project helper; whether each seed
# drives the train/test split, the model's random_state, or both is not
# visible here -- confirm.
res1 = lgb_model_exploration(X_dev, y_dev, X_oot, y_oot, seeds=range(0, 100, 1))
res1.to_excel('1.1 model exploration.xlsx')

In [None]:
# 设置随机种子
seed = 7

In [None]:
# Re-split dev into train/test with the chosen seed (70/30, stratified)
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, random_state=seed, stratify=y_dev, test_size=0.3)

# Positive-class weight: when the positive rate is below 5%, up-weight the
# positives so that (weight * number of positives) equals 5% of the training
# rows; otherwise leave the classes unweighted.
if y_train.mean() < 0.05:
    # Series.sum() is vectorised; the builtin sum() iterates element by element.
    n_pos = y_train.sum()
    if n_pos == 0:
        # Without this guard the division below silently yields an infinite weight.
        raise ValueError("y_train contains no positive samples; cannot derive scale_pos_weight")
    scale_pos_weight = round(0.05 * len(y_train) / n_pos, 4)
else:
    scale_pos_weight = 1

# Initial parameter set built from the class weight and the seed
init_param = get_init_lgb_param(scale_pos_weight, seed)

# Model and performance summary under the initial parameters
clf, res = get_init_lgb_model(X_train, y_train, X_test, y_test, X_oot, y_oot, scale_pos_weight=scale_pos_weight, seed=seed)
res

### 模型变量初筛

In [None]:
# Refit the initial classifier on the full dev sample to rank the features.
clf.fit(X_dev, y_dev)

# Keep only the features with importance > 0 in the fitted model
rough_feat = X_dev.columns[clf.feature_importances_ > 0]
print(f"有 {len(rough_feat)} 个变量特征重要性 > 0")
# Model performance on the reduced feature set.
# NOTE(review): model_eva is a project helper; whether it refits `clf` on the
# train split before scoring is not visible here -- confirm, because `clf` was
# just fitted on all columns of X_dev (which contains both train and test rows).
model_eva(clf, X_train[rough_feat], y_train, X_test[rough_feat], y_test, X_oot[rough_feat], y_oot, X_dev[rough_feat], y_dev, if_dev=True)

In [None]:
# Restrict every feature matrix to the coarse-selected columns so that all
# later tuning steps see the same feature set.
X_dev, X_train, X_test, X_oot = (
    frame[rough_feat] for frame in (X_dev, X_train, X_test, X_oot)
)

#### 查看每种类型所包含的变量数量

### 决策树的属性
- clf.booster_.current_iteration()  # Get the index of the current iteration
- clf.booster_.feature_importance(importance_type='split')
- clf.booster_.feature_importance(importance_type='gain')
- clf.booster_.feature_name()  # 与feature_importance一一对应
- lgb.plot_importance(clf, importance_type='split', max_num_features=10)
- clf.booster_.get_leaf_output(tree_id=0, leaf_id=0)  # 获取指定树-叶子节点的输出
- clf.booster_.get_split_value_histogram(feature='ds84')  # 获取指定变量的分割点，设置split时，返回元组(分割次数, 分割的区间)
- lgb.plot_split_value_histogram(clf, feature='ds84')  # Feature split value histogram 
- clf.booster_.trees_to_dataframe()  # 可查看各节点的分割特征、分割收益、分割点、缺失值在左/右子节点、输出值value、二阶海塞矩阵值weight、样本数count

In [None]:
# Draw the first tree (tree_index=0) of the fitted classifier, top-down, and
# annotate the nodes with the fields listed in show_info.
lgb.plot_tree(clf, tree_index=0, figsize=(20, 8), dpi=300, show_info=['split_gain', 'internal_count', 'leaf_count', 'leaf_weight', 'data_percentage'], orientation='vertical')

In [None]:
lgb_booster_leaf_attribute(init_param, X_train, y_train)

### 第一次调参

In [None]:
#? Define the hyperparameter search space for the first Bayesian search.
# hp.quniform draws round(uniform(low, high) / q) * q and returns floats, so
# the 1.01 upper bounds keep 1.0 reachable for the sampling ratios.
# NOTE(review): integer-valued parameters (n_estimators, max_depth, ...) are
# presumably cast to int inside the project helpers -- confirm.
param_space = {
    'n_estimators': hp.quniform('n_estimators', 100, 200, 10),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.05)),
    'max_depth':hp.quniform('max_depth', 2, 4, 1),
    'num_leaves':hp.quniform('num_leaves', 4, 15, 1),
    # 'min_split_gain': hp.uniform('min_split_gain', 0.01, 12),
    'reg_lambda':hp.uniform('reg_lambda', 1, 15),
    'reg_alpha':hp.uniform('reg_alpha', 1, 15),
    'subsample':hp.quniform('subsample', 0.6, 1.01, 0.1),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 1.01, 0.1),
    'min_child_samples': hp.quniform('min_child_samples', 400, 1800, 100),  # reference sizes: X_train.shape[0] * 0.8 * 0.01, X_train.shape[0] * 0.8 * 0.05, X_train.shape[0] / 8 * 0.25
    # 'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 3)
}
# Run the search on train/test, starting from the initial parameter set.
params_best, trials = lgb_bayes_hyperopt_process(init_param, param_space, X_train, y_train, X_test, y_test)

# Keep a plain-text dump of every trial so the search can be inspected later.
with open('1.1 trials_result_lgb.txt', mode='w', encoding='utf8') as f:
    f.write(str(trials.trials))

# Build a per-trial results table (oot data is passed in as well) and export it.
bayes_df = lgb_model_bayes_opt(trials.trials, init_param, X_train, y_train, X_test, y_test, X_oot, y_oot)
bayes_df.to_excel('1.2 第1轮贝叶斯调参.xlsx', index=False)

In [None]:
# Use trial #82 of the first Bayesian search as the starting point for the
# manual, parameter-by-parameter tuning below.
# NOTE(review): the index is hard-coded, presumably chosen by inspecting
# '1.2 第1轮贝叶斯调参.xlsx'; it must be re-picked whenever the search is re-run.
i = 82
fixed_params_bayes = get_bayes_param_from_trail(i, trials, init_param)
fixed_params_bayes

#### 第1轮调参--n_estimators、learning_rate

In [None]:
lgb_tune_df1 = lgb_tune2(fixed_params_bayes, 'n_estimators', 'learning_rate', range(150, 180, 3), [i/1000 for i in range(20, 30, 1)], X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df1.sort_values('ks_oot', ascending=False)

Unnamed: 0,id1,id2,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
70,171.0,0.02,0.226111,0.184433,0.185094,0.661859,0.622015,0.614863,0.041678,0.041016
7,150.0,0.027,0.237409,0.187296,0.184996,0.668813,0.625023,0.615866,0.050113,0.052412
37,159.0,0.027,0.241037,0.18922,0.184647,0.671343,0.625668,0.616528,0.051817,0.056389
17,153.0,0.027,0.238082,0.1875,0.184586,0.669624,0.625154,0.615703,0.050582,0.053495
90,177.0,0.02,0.229477,0.185244,0.184536,0.663126,0.622355,0.614596,0.044233,0.044941
...,...,...,...,...,...,...,...,...,...,...
33,159.0,0.023,0.229832,0.188946,0.177413,0.663855,0.623576,0.615293,0.040886,0.052419
13,153.0,0.023,0.228569,0.188086,0.177304,0.662571,0.622903,0.614843,0.040483,0.051265
23,156.0,0.023,0.229177,0.188344,0.177229,0.663171,0.623003,0.614439,0.040833,0.051948
3,150.0,0.023,0.228056,0.186948,0.176848,0.661795,0.622711,0.614953,0.041108,0.051208


In [None]:
fixed_params_bayes['n_estimators'] = 170
fixed_params_bayes['learning_rate'] = 0.02

In [None]:
lgb_booster_leaf_attribute(fixed_params_bayes, X_train, y_train)

Unnamed: 0,count,weight,split_gain
count,1313.0,1313.0,633.0
mean,5905.590251,382.722515,11.664695
std,6612.966959,430.679714,5.106249
min,471.0,25.717631,0.47091
10%,724.0,45.632287,5.367258
20%,1054.8,68.795333,6.844834
30%,1633.4,105.851275,8.610878
40%,2308.0,137.980712,10.17444
50%,3004.0,195.633322,11.2755
60%,4520.0,294.255539,12.34258


#### 第2轮调参--max_depth、num_leaves

In [None]:
lgb_tune_df2 = lgb_tune2(fixed_params_bayes, 'max_depth', 'num_leaves', range(2, 5), range(4, 16), X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df2.sort_values('ks_oot', ascending=False)

Unnamed: 0,id1,id2,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
18,3.0,10.0,0.226547,0.18319,0.185602,0.661682,0.621844,0.6148,0.043357,0.040945
23,3.0,15.0,0.226547,0.18319,0.185602,0.661682,0.621844,0.6148,0.043357,0.040945
22,3.0,14.0,0.226547,0.18319,0.185602,0.661682,0.621844,0.6148,0.043357,0.040945
21,3.0,13.0,0.226547,0.18319,0.185602,0.661682,0.621844,0.6148,0.043357,0.040945
20,3.0,12.0,0.226547,0.18319,0.185602,0.661682,0.621844,0.6148,0.043357,0.040945
19,3.0,11.0,0.226547,0.18319,0.185602,0.661682,0.621844,0.6148,0.043357,0.040945
17,3.0,9.0,0.226547,0.18319,0.185602,0.661682,0.621844,0.6148,0.043357,0.040945
16,3.0,8.0,0.226547,0.18319,0.185602,0.661682,0.621844,0.6148,0.043357,0.040945
31,4.0,11.0,0.267002,0.179935,0.179783,0.68621,0.625992,0.616231,0.087067,0.087218
33,4.0,13.0,0.282807,0.181708,0.17953,0.696668,0.628317,0.613311,0.101099,0.103277


In [None]:
fixed_params_bayes['max_depth'] = 3
fixed_params_bayes['num_leaves'] = 8

In [None]:
lgb_booster_leaf_attribute(fixed_params_bayes, X_train, y_train)

Unnamed: 0,count,weight,split_gain
count,1313.0,1313.0,633.0
mean,5905.590251,382.722515,11.664695
std,6612.966959,430.679714,5.106249
min,471.0,25.717631,0.47091
10%,724.0,45.632287,5.367258
20%,1054.8,68.795333,6.844834
30%,1633.4,105.851275,8.610878
40%,2308.0,137.980712,10.17444
50%,3004.0,195.633322,11.2755
60%,4520.0,294.255539,12.34258


#### 第3轮调参--min_child_samples

In [None]:
X_train.shape[0] * 0.01, X_train.shape[0] * 0.05, X_train.shape[0] / 8 * 0.25

(456.12, 2280.6, 1425.375)

In [None]:
# One-dimensional grid over min_child_samples (400..980, step 20), ranked by
# out-of-time KS.
# NOTE: this reuses the name lgb_tune_df2, so the max_depth/num_leaves grid
# results from the previous round are overwritten.
lgb_tune_df2 = lgb_tune1(fixed_params_bayes, 'min_child_samples', range(400, 1000, 20), X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df2.sort_values('ks_oot', ascending=False)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
5,500.0,0.226547,0.18319,0.185602,0.661682,0.621844,0.6148,0.043357,0.040945
2,440.0,0.23144,0.182672,0.181252,0.662675,0.620584,0.612685,0.048768,0.050188
4,480.0,0.227065,0.187391,0.181049,0.662374,0.622216,0.611758,0.039674,0.046016
0,400.0,0.231546,0.185465,0.180444,0.662704,0.62196,0.61372,0.046082,0.051102
3,460.0,0.225233,0.188766,0.180165,0.661647,0.621779,0.612985,0.036467,0.045068
6,520.0,0.228507,0.186787,0.180159,0.661448,0.622881,0.617094,0.04172,0.048348
7,540.0,0.228471,0.189555,0.178786,0.661477,0.622906,0.616146,0.038916,0.049685
1,420.0,0.22901,0.183193,0.17752,0.662379,0.622274,0.611338,0.045817,0.05149
9,580.0,0.229473,0.191974,0.177337,0.662185,0.622647,0.613706,0.037499,0.052136
20,800.0,0.229306,0.183866,0.176791,0.660843,0.621789,0.613317,0.045439,0.052515


In [None]:
fixed_params_bayes['min_child_samples'] = 500

In [None]:
lgb_booster_leaf_attribute(fixed_params_bayes, X_train, y_train)

Unnamed: 0,count,weight,split_gain
count,1313.0,1313.0,633.0
mean,5905.590251,382.722515,11.664695
std,6612.966959,430.679714,5.106249
min,471.0,25.717631,0.47091
10%,724.0,45.632287,5.367258
20%,1054.8,68.795333,6.844834
30%,1633.4,105.851275,8.610878
40%,2308.0,137.980712,10.17444
50%,3004.0,195.633322,11.2755
60%,4520.0,294.255539,12.34258


#### 第4轮调参--reg_lambda、reg_alpha

In [None]:
lgb_tune_df3 = lgb_tune2(fixed_params_bayes, 'reg_lambda', 'reg_alpha', range(0, 10, 1), range(0, 10, 1), X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df3.sort_values('ks_oot', ascending=False)

Unnamed: 0,id1,id2,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
73,7.0,3.0,0.227849,0.184313,0.186599,0.663497,0.622186,0.618334,0.043536,0.041249
33,3.0,3.0,0.229761,0.185963,0.186574,0.664696,0.623813,0.618122,0.043797,0.043186
93,9.0,3.0,0.229612,0.185338,0.186103,0.662929,0.622642,0.618275,0.044274,0.043509
53,5.0,3.0,0.230843,0.182822,0.185544,0.66325,0.622258,0.616478,0.048021,0.045299
15,1.0,5.0,0.22845,0.182699,0.184902,0.6631,0.62377,0.620062,0.045751,0.043549
...,...,...,...,...,...,...,...,...,...,...
8,0.0,8.0,0.227273,0.189244,0.169794,0.661898,0.621022,0.611486,0.038029,0.057479
96,9.0,6.0,0.227361,0.184844,0.168945,0.661168,0.621475,0.610587,0.042517,0.058417
1,0.0,1.0,0.232224,0.184842,0.168754,0.665367,0.622148,0.609079,0.047382,0.06347
80,8.0,0.0,0.228418,0.182678,0.167845,0.663716,0.624055,0.611428,0.04574,0.060574


In [None]:
lgb_tune_df3 = lgb_tune2(fixed_params_bayes, 'reg_lambda', 'reg_alpha', np.arange(5, 9, 0.5), np.arange(1, 5, 0.5), X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df3.sort_values('ks_oot', ascending=False)

Unnamed: 0,id1,id2,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
30,6.5,4.0,0.230019,0.182648,0.186743,0.662661,0.62271,0.618164,0.04737,0.043275
36,7.0,3.0,0.227849,0.184313,0.186599,0.663497,0.622186,0.618334,0.043536,0.041249
35,7.0,2.5,0.231271,0.182311,0.186069,0.663549,0.622769,0.617784,0.048959,0.045202
4,5.0,3.0,0.230843,0.182822,0.185544,0.66325,0.622258,0.616478,0.048021,0.045299
33,7.0,1.5,0.229393,0.186405,0.185319,0.663975,0.623813,0.617597,0.042988,0.044073
...,...,...,...,...,...,...,...,...,...,...
32,7.0,1.0,0.229318,0.180547,0.175532,0.664369,0.622677,0.613643,0.04877,0.053786
5,5.0,3.5,0.229946,0.184642,0.175113,0.66308,0.622491,0.613763,0.045304,0.054833
54,8.0,4.0,0.226968,0.185549,0.175083,0.663296,0.62258,0.613219,0.041419,0.051885
17,6.0,1.5,0.231528,0.18288,0.174457,0.663923,0.622714,0.614498,0.048648,0.057071


In [None]:
fixed_params_bayes['reg_lambda'] = 7
fixed_params_bayes['reg_alpha'] = 3

#### 第5轮调参--subsample、colsample_bytree

In [None]:
lgb_tune_df4 = lgb_tune2(fixed_params_bayes, 'subsample', 'colsample_bytree', [(i / 20) for i in range(10, 20, 1)], [(i / 20) for i in range(10, 20, 1)], X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df4.sort_values('ks_oot', ascending=False)

Unnamed: 0,id1,id2,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
38,0.65,0.9,0.227849,0.184313,0.186599,0.663497,0.622186,0.618334,0.043536,0.041249
58,0.75,0.9,0.227849,0.184313,0.186599,0.663497,0.622186,0.618334,0.043536,0.041249
98,0.95,0.9,0.227849,0.184313,0.186599,0.663497,0.622186,0.618334,0.043536,0.041249
68,0.8,0.9,0.227849,0.184313,0.186599,0.663497,0.622186,0.618334,0.043536,0.041249
88,0.9,0.9,0.227849,0.184313,0.186599,0.663497,0.622186,0.618334,0.043536,0.041249
...,...,...,...,...,...,...,...,...,...,...
41,0.7,0.55,0.227308,0.188619,0.151211,0.662071,0.622557,0.599606,0.038689,0.076097
11,0.55,0.55,0.227308,0.188619,0.151211,0.662071,0.622557,0.599606,0.038689,0.076097
1,0.5,0.55,0.227308,0.188619,0.151211,0.662071,0.622557,0.599606,0.038689,0.076097
81,0.9,0.55,0.227308,0.188619,0.151211,0.662071,0.622557,0.599606,0.038689,0.076097


In [None]:
fixed_params_bayes['colsample_bytree'] = 0.9
fixed_params_bayes['subsample'] = 0.8

#### 第6轮调参--scale_pos_weight

In [None]:
lgb_tune_df5 = lgb_tune1(fixed_params_bayes, 'scale_pos_weight', [i / 20 for i in range(20, 60, 1)], X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df5.sort_values('ks_oot',ascending=False)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
0,1.0,0.227849,0.184313,0.186599,0.663497,0.622186,0.618334,0.043536,0.041249
30,2.5,0.245361,0.187536,0.186141,0.674687,0.624802,0.619499,0.057825,0.05922
2,1.1,0.234262,0.18263,0.184199,0.664017,0.622132,0.617278,0.051633,0.050063
24,2.2,0.244254,0.192527,0.183756,0.673007,0.625985,0.616762,0.051728,0.060498
38,2.9,0.249796,0.185353,0.183331,0.676704,0.626524,0.619283,0.064443,0.066465
39,2.95,0.250681,0.186232,0.182864,0.676529,0.625309,0.618282,0.064449,0.067817
7,1.35,0.231732,0.185482,0.182581,0.666566,0.623892,0.616354,0.04625,0.049151
15,1.75,0.242723,0.185445,0.182075,0.670222,0.623683,0.616276,0.057279,0.060648
4,1.2,0.230098,0.183328,0.181512,0.665555,0.624079,0.616724,0.046771,0.048586
21,2.05,0.242306,0.185176,0.180606,0.67189,0.62393,0.612912,0.05713,0.0617


In [None]:
fixed_params_bayes['scale_pos_weight'] = 1

#### 第7轮调参--random_state

In [None]:
lgb_tune_df6 = lgb_tune1(fixed_params_bayes, 'random_state', range(1, 100), X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df6.sort_values('ks_oot',ascending=False)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
66,67.0,0.230285,0.182515,0.190362,0.662522,0.622234,0.618705,0.04777,0.039923
90,91.0,0.230134,0.18574,0.189232,0.662317,0.623059,0.619475,0.044394,0.040902
95,96.0,0.226122,0.18131,0.188983,0.662379,0.621457,0.620346,0.044812,0.037139
75,76.0,0.226761,0.185393,0.18869,0.662241,0.622412,0.619326,0.041369,0.038072
69,70.0,0.22855,0.186478,0.187143,0.662982,0.622599,0.618038,0.042072,0.041407
...,...,...,...,...,...,...,...,...,...
8,9.0,0.228706,0.182924,0.174563,0.663132,0.6224,0.61157,0.045782,0.054143
1,2.0,0.227635,0.184078,0.174557,0.663613,0.621517,0.615503,0.043557,0.053078
26,27.0,0.226116,0.180991,0.173338,0.661986,0.622078,0.616373,0.045125,0.052778
22,23.0,0.22842,0.179822,0.172584,0.663069,0.622339,0.613237,0.048599,0.055836


In [None]:
lgb_tune_df6.sort_values('ks_oot',ascending=False).head(20)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
66,67.0,0.230285,0.182515,0.190362,0.662522,0.622234,0.618705,0.04777,0.039923
90,91.0,0.230134,0.18574,0.189232,0.662317,0.623059,0.619475,0.044394,0.040902
95,96.0,0.226122,0.18131,0.188983,0.662379,0.621457,0.620346,0.044812,0.037139
75,76.0,0.226761,0.185393,0.18869,0.662241,0.622412,0.619326,0.041369,0.038072
69,70.0,0.22855,0.186478,0.187143,0.662982,0.622599,0.618038,0.042072,0.041407
67,68.0,0.227849,0.184313,0.186599,0.663497,0.622186,0.618334,0.043536,0.041249
27,28.0,0.232172,0.184071,0.186397,0.662819,0.621752,0.618111,0.0481,0.045775
74,75.0,0.229457,0.178078,0.185939,0.662579,0.622218,0.618297,0.051379,0.043519
20,21.0,0.228129,0.184569,0.185927,0.66284,0.622549,0.618648,0.04356,0.042202
78,79.0,0.229901,0.187961,0.185417,0.6627,0.621889,0.619308,0.04194,0.044484


In [None]:
# Fix the model seed at 76.
# NOTE(review): hand-picked from the table above, where 76 is not the top row
# by ks_oot; presumably chosen for its small train/oot KS gap -- confirm.
# Selecting a seed on oot metrics also makes the oot score optimistic.
fixed_params_bayes['random_state'] = 76

### 第二轮调参

In [None]:
# Build an (unfitted) classifier from the manually tuned parameter set; the
# bare expression displays it in the notebook.
model = LGBMClassifier(**fixed_params_bayes)
model

In [None]:
# Scan 200 train/test split seeds with the tuned model and record, for each
# seed, the label mean of every sample plus the KS/AUC metrics returned by
# model_eva.
# Rows are collected in a list and the frame is built once: calling pd.concat
# inside the loop copies the growing frame on every iteration. The rows are
# also no longer stored in a variable named `result`, which used to overwrite
# the raw `result` DataFrame loaded at the top of the notebook.
# Note: the `id` column now keeps the integer seed instead of being upcast to
# float.
col = ['id', 'train', 'test', 'oot', 'ks_train', 'ks_test', 'ks_oot', 'auc_train', 'auc_test', 'auc_oot']
split_rows = []
for i in range(200):
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, random_state=i, stratify=y_dev, test_size=0.3)
    res = model_eva(model, X_train, y_train, X_test, y_test, X_oot, y_oot)
    split_rows.append({
        'id': i,
        'train': y_train.mean(),
        'test': y_test.mean(),
        'oot': y_oot.mean(),
        'ks_train': res['ks_train'],
        'ks_test': res['ks_test'],
        'ks_oot': res['ks_oot'],
        'auc_train': res['auc_train'],
        'auc_test': res['auc_test'],
        'auc_oot': res['auc_oot'],
    })
bayes_df = pd.DataFrame(split_rows, columns=col)
# KS drop from train to test / train to oot; smaller means less overfitting.
bayes_df['ks_dec1'] = bayes_df['ks_train'] - bayes_df['ks_test']
bayes_df['ks_dec2'] = bayes_df['ks_train'] - bayes_df['ks_oot']

In [None]:
bayes_df.sort_values('ks_oot', ascending=False).head(20)

Unnamed: 0,id,train,test,oot,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
68,68.0,0.069806,0.069777,0.055196,0.226761,0.185393,0.18869,0.662241,0.622412,0.619326,0.041369,0.038072
4,4.0,0.069806,0.069777,0.055196,0.232768,0.185572,0.184712,0.664581,0.621191,0.611675,0.047196,0.048056
42,42.0,0.069806,0.069777,0.055196,0.242272,0.170067,0.183103,0.668164,0.616256,0.619964,0.072205,0.059169
73,73.0,0.069806,0.069777,0.055196,0.243384,0.156558,0.181822,0.670333,0.602388,0.615441,0.086826,0.061562
53,53.0,0.069806,0.069777,0.055196,0.235786,0.159053,0.181791,0.666688,0.611343,0.615254,0.076732,0.053995
34,34.0,0.069806,0.069777,0.055196,0.246899,0.149932,0.180915,0.669304,0.600026,0.612349,0.096968,0.065985
48,48.0,0.069806,0.069777,0.055196,0.230809,0.166118,0.180136,0.664657,0.605065,0.611129,0.064691,0.050673
38,38.0,0.069806,0.069777,0.055196,0.246502,0.162651,0.178615,0.668156,0.615061,0.612064,0.083852,0.067887
44,44.0,0.069806,0.069777,0.055196,0.231478,0.16967,0.17856,0.667304,0.617381,0.61511,0.061808,0.052918
162,162.0,0.069806,0.069777,0.055196,0.244731,0.163371,0.178244,0.671927,0.608592,0.612483,0.08136,0.066487


In [None]:
# Re-create the train/test split with seed 68 (hard-coded after inspecting
# the split-seed scan above); all cells below tune on this split.
i = 68
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, random_state=i, stratify=y_dev, test_size=0.3)

In [None]:
lgb_booster_leaf_attribute(fixed_params_bayes, X_train, y_train)

In [None]:
#? Define the (narrower) hyperparameter search space for the second Bayesian
#? search; max_depth, num_leaves, subsample and scale_pos_weight are left out
#? of the search.
# NOTE(review): the base parameters passed below are `init_param`, not the
# manually tuned `fixed_params_bayes` from round one -- confirm this is
# intended.
param_space = {
    'n_estimators': hp.quniform('n_estimators', 160, 180, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.05)),
    # 'max_depth':hp.quniform('max_depth', 2, 4, 1),
    # 'num_leaves':hp.quniform('num_leaves', 4, 15, 1),
    'min_split_gain': hp.uniform('min_split_gain', 0.01, 10),
    'reg_lambda':hp.uniform('reg_lambda', 0, 10),
    'reg_alpha':hp.uniform('reg_alpha', 0, 10),
    # 'subsample':hp.quniform('subsample', 0.6, 1.01, 0.1),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 1.01, 0.1),
    'min_child_samples': hp.quniform('min_child_samples', 400, 800, 10),  # reference sizes: X_train.shape[0] * 0.8 * 0.01, X_train.shape[0] * 0.8 * 0.05, X_train.shape[0] / 8 * 0.25
    # 'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 3)
}
# Run the search on the seed-68 train/test split.
params_best, trials = lgb_bayes_hyperopt_process(init_param, param_space, X_train, y_train, X_test, y_test)

# Keep a plain-text dump of every trial so the search can be inspected later.
with open('2.1 trials_result_lgb.txt', mode='w', encoding='utf8') as f:
    f.write(str(trials.trials))

# Build a per-trial results table and export it. This rebinds `bayes_df`,
# replacing the split-seed scan table built earlier.
bayes_df = lgb_model_bayes_opt(trials.trials, init_param, X_train, y_train, X_test, y_test, X_oot, y_oot)
bayes_df.to_excel('2.2 第2轮贝叶斯调参.xlsx', index=False)

In [None]:
# Replace the round-one parameter set with the parameters of trial #122 from
# the second Bayesian search.
# NOTE(review): the trial index is hard-coded, presumably chosen from
# '2.2 第2轮贝叶斯调参.xlsx'; re-pick it whenever the search is re-run.
fixed_params_bayes = get_bayes_param_from_trail(122, trials, init_param)
fixed_params_bayes

{'boosting_type': 'gbdt',
 'max_depth': 3,
 'learning_rate': 0.0104,
 'objective': 'binary',
 'scale_pos_weight': 1,
 'min_split_gain': 2.8542,
 'min_child_samples': 470,
 'subsample': 0.8,
 'colsample_bytree': 0.9,
 'reg_lambda': 4.8839,
 'reg_alpha': 2.1956,
 'random_state': 68,
 'force_col_wise': True,
 'verbose': -1,
 'n_estimators': 172}

#### 第1轮调参--n_estimators、learning_rate

In [None]:
lgb_tune_df1 = lgb_tune2(fixed_params_bayes, 'n_estimators', 'learning_rate', range(170, 180, 1), [i/1000 for i in range(10, 25, 1)], X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df1.sort_values('ks_oot', ascending=False)

Unnamed: 0,id1,id2,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
77,177.0,0.017,0.223383,0.185338,0.180024,0.658323,0.621789,0.615407,0.038044,0.043359
57,175.0,0.017,0.222966,0.184184,0.179669,0.658,0.621397,0.614845,0.038781,0.043297
67,176.0,0.017,0.222678,0.183983,0.179282,0.65824,0.621655,0.615495,0.038696,0.043397
87,178.0,0.017,0.223874,0.185558,0.179136,0.658543,0.621874,0.615403,0.038316,0.044739
38,173.0,0.018,0.229795,0.182294,0.179032,0.660514,0.622239,0.612432,0.047501,0.050764
...,...,...,...,...,...,...,...,...,...,...
40,174.0,0.01,0.189322,0.167754,0.1637,0.638064,0.611395,0.604448,0.021567,0.025621
0,170.0,0.01,0.188486,0.16793,0.163214,0.637089,0.610915,0.603564,0.020555,0.025271
50,175.0,0.01,0.189889,0.167501,0.163128,0.638327,0.611491,0.60442,0.022388,0.02676
10,171.0,0.01,0.18802,0.168132,0.163049,0.637257,0.611095,0.604246,0.019888,0.024971


In [None]:
lgb_tune_df1.sort_values('ks_oot', ascending=False).head(50)

Unnamed: 0,id1,id2,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
77,177.0,0.017,0.223383,0.185338,0.180024,0.658323,0.621789,0.615407,0.038044,0.043359
57,175.0,0.017,0.222966,0.184184,0.179669,0.658,0.621397,0.614845,0.038781,0.043297
67,176.0,0.017,0.222678,0.183983,0.179282,0.65824,0.621655,0.615495,0.038696,0.043397
87,178.0,0.017,0.223874,0.185558,0.179136,0.658543,0.621874,0.615403,0.038316,0.044739
38,173.0,0.018,0.229795,0.182294,0.179032,0.660514,0.622239,0.612432,0.047501,0.050764
37,173.0,0.017,0.222308,0.182406,0.178835,0.657508,0.621158,0.614792,0.039902,0.043472
97,179.0,0.017,0.224682,0.185394,0.178795,0.658865,0.621904,0.615354,0.039288,0.045887
48,174.0,0.018,0.229748,0.182898,0.178264,0.660767,0.62238,0.613188,0.046849,0.051484
28,172.0,0.018,0.229579,0.182844,0.178112,0.660339,0.622165,0.612423,0.046735,0.051468
47,174.0,0.017,0.222543,0.183799,0.178095,0.657673,0.621403,0.614794,0.038745,0.044448


In [None]:
fixed_params_bayes['n_estimators'] = 177
fixed_params_bayes['learning_rate'] = 0.017

#### 第2轮调参--max_depth、num_leaves

In [None]:
# lgb_tune_df2 = lgb_tune2(fixed_params_bayes, 'max_depth', 'num_leaves', range(2, 5), range(4, 16), X_train, y_train, X_test, y_test, X_oot, y_oot)
# lgb_tune_df2.sort_values('ks_oot', ascending=False)

In [None]:
# fixed_params_bayes['max_depth'] = 3
# fixed_params_bayes['num_leaves'] = 8

In [None]:
lgb_booster_leaf_attribute(fixed_params_bayes, X_train, y_train)

Unnamed: 0,count,weight,split_gain
count,1334.0,1334.0,640.0
mean,6051.967016,392.233948,13.440471
std,6872.178043,447.45405,5.504871
min,413.0,22.976123,2.92213
10%,733.3,48.496744,6.570744
20%,1074.2,68.795115,8.427786
30%,1568.0,105.301066,10.22619
40%,2309.4,138.629433,11.76612
50%,3343.0,211.299244,12.90635
60%,4569.4,296.502771,14.28286


#### 第3轮调参--min_child_samples

In [None]:
X_train.shape[0] * 0.01, X_train.shape[0] * 0.05, X_train.shape[0] / 8 * 0.25

(456.12, 2280.6, 1425.375)

In [None]:
fixed_params_bayes

{'boosting_type': 'gbdt',
 'max_depth': 3,
 'learning_rate': 0.017,
 'objective': 'binary',
 'scale_pos_weight': 1,
 'min_split_gain': 2.8542,
 'min_child_samples': 470,
 'subsample': 0.8,
 'colsample_bytree': 0.9,
 'reg_lambda': 4.8839,
 'reg_alpha': 2.1956,
 'random_state': 68,
 'force_col_wise': True,
 'verbose': -1,
 'n_estimators': 177}

In [None]:
lgb_tune_df2 = lgb_tune1(fixed_params_bayes, 'min_child_samples', range(400, 1000, 20), X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df2.sort_values('ks_oot', ascending=False)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
21,820.0,0.222589,0.187299,0.179345,0.65637,0.621209,0.61391,0.03529,0.043244
5,500.0,0.223066,0.181383,0.177432,0.65799,0.621666,0.614394,0.041683,0.045634
3,460.0,0.223046,0.182646,0.177431,0.658622,0.621428,0.616374,0.0404,0.045615
18,760.0,0.224416,0.18583,0.177296,0.657129,0.621842,0.614577,0.038586,0.04712
16,720.0,0.222036,0.18442,0.177294,0.657405,0.621912,0.614646,0.037616,0.044741
14,680.0,0.223195,0.18059,0.176996,0.657389,0.621668,0.61432,0.042605,0.046198
1,420.0,0.224419,0.179814,0.176148,0.658799,0.620914,0.614695,0.044605,0.048271
4,480.0,0.224313,0.180869,0.175344,0.658105,0.621138,0.616265,0.043445,0.048969
17,740.0,0.227083,0.18283,0.174901,0.657286,0.621684,0.613515,0.044254,0.052183
0,400.0,0.226777,0.182153,0.173689,0.659634,0.621128,0.614163,0.044624,0.053088


In [None]:
lgb_tune_df2 = lgb_tune1(fixed_params_bayes, 'min_child_samples', range(500, 800, 10), X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df2.sort_values('ks_oot', ascending=False)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
3,530.0,0.227001,0.185027,0.179648,0.658723,0.621431,0.612204,0.041974,0.047353
29,790.0,0.225072,0.186896,0.178924,0.657409,0.622045,0.614183,0.038176,0.046148
1,510.0,0.223365,0.183231,0.177817,0.657871,0.621606,0.61325,0.040134,0.045548
0,500.0,0.223066,0.181383,0.177432,0.65799,0.621666,0.614394,0.041683,0.045634
26,760.0,0.224416,0.18583,0.177296,0.657129,0.621842,0.614577,0.038586,0.04712
22,720.0,0.222036,0.18442,0.177294,0.657405,0.621912,0.614646,0.037616,0.044741
18,680.0,0.223195,0.18059,0.176996,0.657389,0.621668,0.61432,0.042605,0.046198
23,730.0,0.223201,0.183179,0.176048,0.657148,0.621397,0.61276,0.040023,0.047154
24,740.0,0.227083,0.18283,0.174901,0.657286,0.621684,0.613515,0.044254,0.052183
27,770.0,0.225062,0.186329,0.174005,0.656957,0.620652,0.611753,0.038733,0.051057


In [None]:
fixed_params_bayes['min_child_samples'] = 790

In [None]:
lgb_booster_leaf_attribute(fixed_params_bayes, X_train, y_train)

Unnamed: 0,count,weight,split_gain
count,1310.0,1310.0,618.0
mean,6162.842748,399.474516,13.413876
std,6889.476898,449.078694,5.717817
min,746.0,42.465444,3.00282
10%,926.9,60.520387,5.837099
20%,1185.0,80.691552,8.009412
30%,1770.0,111.441027,10.31176
40%,2389.0,142.530723,12.05514
50%,3284.0,211.115203,13.098
60%,4518.8,292.92132,14.48944


#### 第4轮调参--reg_lambda、reg_alpha

In [None]:
lgb_tune_df3 = lgb_tune2(fixed_params_bayes, 'reg_lambda', 'reg_alpha', range(0, 10, 1), range(0, 10, 1), X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df3.sort_values('ks_oot', ascending=False)

Unnamed: 0,id1,id2,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
30,3.0,0.0,0.222737,0.189885,0.184935,0.657573,0.622292,0.616865,0.032851,0.037801
40,4.0,0.0,0.224445,0.183853,0.183606,0.657523,0.622281,0.615934,0.040592,0.040839
70,7.0,0.0,0.222133,0.183982,0.183043,0.657024,0.622151,0.615328,0.038151,0.03909
50,5.0,0.0,0.223613,0.184842,0.182741,0.656184,0.621308,0.61723,0.038771,0.040873
51,5.0,1.0,0.223043,0.185097,0.181326,0.656617,0.622163,0.613721,0.037945,0.041717
...,...,...,...,...,...,...,...,...,...,...
57,5.0,7.0,0.219642,0.179176,0.152808,0.653379,0.618924,0.600482,0.040466,0.066835
59,5.0,9.0,0.215185,0.173549,0.152318,0.65194,0.617499,0.602041,0.041635,0.062867
99,9.0,9.0,0.217644,0.176392,0.150449,0.650779,0.61714,0.601146,0.041253,0.067195
39,3.0,9.0,0.216206,0.178165,0.150204,0.652148,0.618634,0.600478,0.038041,0.066003


In [None]:
fixed_params_bayes['reg_alpha'] = 0

In [None]:
lgb_tune_df3 = lgb_tune1(fixed_params_bayes, 'reg_lambda', np.arange(2, 7, 0.1), X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df3.sort_values('ks_oot', ascending=False)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
9,2.9,0.223537,0.185082,0.185824,0.657384,0.622773,0.617019,0.038454,0.037713
36,5.6,0.224536,0.186032,0.185755,0.656344,0.622265,0.617873,0.038504,0.03878
45,6.5,0.223255,0.186784,0.1855,0.656544,0.622204,0.617003,0.03647,0.037754
10,3.0,0.222737,0.189885,0.184935,0.657573,0.622292,0.616865,0.032851,0.037801
20,4.0,0.224445,0.183853,0.183606,0.657523,0.622281,0.615934,0.040592,0.040839
8,2.8,0.2257,0.181054,0.183542,0.657173,0.622116,0.615348,0.044646,0.042158
42,6.2,0.220574,0.186068,0.183212,0.656358,0.62222,0.616669,0.034506,0.037362
18,3.8,0.222433,0.186236,0.182836,0.6567,0.621862,0.615716,0.036197,0.039597
30,5.0,0.223613,0.184842,0.182741,0.656184,0.621308,0.61723,0.038771,0.040873
4,2.4,0.225104,0.180574,0.182515,0.656755,0.621651,0.615769,0.04453,0.042589


In [None]:
fixed_params_bayes['reg_lambda'] = 3
fixed_params_bayes['reg_alpha'] = 0

#### 第5轮调参--subsample、colsample_bytree

In [None]:
lgb_tune_df4 = lgb_tune1(fixed_params_bayes, 'colsample_bytree', [(i / 20) for i in range(10, 20, 1)], X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df4.sort_values('ks_oot', ascending=False)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
8,0.9,0.222737,0.189885,0.184935,0.657573,0.622292,0.616865,0.032851,0.037801
7,0.85,0.224179,0.183087,0.184683,0.656086,0.622769,0.618625,0.041092,0.039496
9,0.95,0.224833,0.181962,0.178561,0.65693,0.621992,0.615835,0.042871,0.046271
2,0.6,0.222584,0.186838,0.165817,0.655981,0.621793,0.603757,0.035745,0.056767
6,0.8,0.224983,0.186093,0.164777,0.657226,0.6209,0.607068,0.03889,0.060205
5,0.75,0.224059,0.18235,0.160245,0.657589,0.621445,0.604013,0.041709,0.063814
3,0.65,0.223721,0.185777,0.159418,0.655962,0.621734,0.60524,0.037944,0.064303
4,0.7,0.219784,0.182751,0.159315,0.655985,0.621499,0.603495,0.037033,0.060469
1,0.55,0.219286,0.179729,0.152899,0.656176,0.62142,0.597448,0.039558,0.066388
0,0.5,0.218786,0.175424,0.142597,0.654622,0.620769,0.592531,0.043362,0.076188


In [None]:
fixed_params_bayes['colsample_bytree'] = 0.9
fixed_params_bayes['subsample'] = 0.8

#### 第6轮调参--min_split_gain

In [None]:
# One-dimensional grid over min_split_gain (integers 0..9), ranked by
# out-of-time KS.
# NOTE: this reuses the name lgb_tune_df4, so the colsample_bytree grid
# results from the previous round are overwritten.
lgb_tune_df4 = lgb_tune1(fixed_params_bayes, 'min_split_gain', range(0, 10), X_train, y_train, X_test, y_test, X_oot, y_oot)
lgb_tune_df4.sort_values('ks_oot', ascending=False)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
0,0.0,0.225401,0.182586,0.18549,0.657624,0.621394,0.618405,0.042815,0.039911
1,1.0,0.222558,0.183671,0.185064,0.657345,0.621741,0.618577,0.038887,0.037494
2,2.0,0.225126,0.187008,0.179501,0.657816,0.622931,0.616941,0.038118,0.045625
4,4.0,0.221959,0.185031,0.176245,0.656249,0.62124,0.616182,0.036928,0.045714
6,6.0,0.221363,0.178837,0.17425,0.652395,0.621297,0.614416,0.042526,0.047113
3,3.0,0.22292,0.190201,0.172896,0.656337,0.621206,0.611002,0.03272,0.050025
5,5.0,0.219057,0.181823,0.172071,0.654712,0.621103,0.614208,0.037234,0.046986
7,7.0,0.215299,0.181162,0.165606,0.649638,0.620804,0.610107,0.034138,0.049693
8,8.0,0.207145,0.178247,0.155845,0.6467,0.619492,0.604885,0.028899,0.0513
9,9.0,0.207929,0.178436,0.152254,0.644541,0.618666,0.598564,0.029493,0.055675


In [None]:
# min_split_gain = 0 ranked first on ks_oot in the sweep; fix it for later rounds.
fixed_params_bayes['min_split_gain'] = 0

#### 第6轮调参(续)--scale_pos_weight

In [None]:
# Sweep scale_pos_weight over 1.00, 1.05, ..., 2.95 and show the runs ordered
# by OOT KS, best first.
lgb_tune_df5 = lgb_tune1(
    fixed_params_bayes,
    'scale_pos_weight',
    [step / 20 for step in range(20, 60)],
    X_train, y_train,
    X_test, y_test,
    X_oot, y_oot,
)
lgb_tune_df5.sort_values(by='ks_oot', ascending=False)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
1,1.05,0.224471,0.186603,0.192752,0.657812,0.621494,0.620082,0.037868,0.03172
34,2.7,0.237555,0.184273,0.191184,0.667056,0.623928,0.616534,0.053283,0.046372
18,1.9,0.234693,0.187681,0.191005,0.663786,0.622671,0.617386,0.047012,0.043688
22,2.1,0.233599,0.183174,0.190926,0.664936,0.623512,0.618471,0.050425,0.042673
15,1.75,0.234294,0.18292,0.189174,0.663288,0.622862,0.618608,0.051374,0.04512
31,2.55,0.23719,0.184715,0.188366,0.667373,0.623884,0.616865,0.052475,0.048824
36,2.8,0.239354,0.184021,0.188099,0.667124,0.624271,0.616931,0.055333,0.051255
3,1.15,0.228168,0.18365,0.187933,0.659012,0.622216,0.615807,0.044518,0.040235
30,2.5,0.238189,0.182829,0.187879,0.666297,0.62294,0.6181,0.05536,0.05031
29,2.45,0.236298,0.184897,0.187549,0.665834,0.622954,0.6169,0.051401,0.048749


In [None]:
# Fix scale_pos_weight at 1.
# NOTE(review): among the sweep rows displayed, 1.05 ranks first on ks_oot and
# the 1.0 candidate is not shown -- confirm that choosing 1 is intentional.
fixed_params_bayes['scale_pos_weight'] = 1

#### 第7轮调参--random_state

In [None]:
# Try every seed from 1 to 99 with the otherwise fixed parameters and show the
# runs ordered by OOT KS, best first.
lgb_tune_df6 = lgb_tune1(
    fixed_params_bayes,
    'random_state',
    range(1, 100),
    X_train, y_train,
    X_test, y_test,
    X_oot, y_oot,
)
lgb_tune_df6.sort_values(by='ks_oot', ascending=False)

Unnamed: 0,id,ks_train,ks_test,ks_oot,auc_train,auc_test,auc_oot,ks_dec1,ks_dec2
90,91.0,0.222456,0.186145,0.190092,0.657831,0.620903,0.618439,0.036311,0.032364
37,38.0,0.223802,0.185287,0.188431,0.657983,0.621992,0.618615,0.038515,0.035371
32,33.0,0.222534,0.182701,0.187837,0.657697,0.620361,0.618366,0.039833,0.034697
92,93.0,0.223227,0.183151,0.187814,0.657741,0.620792,0.617154,0.040076,0.035413
41,42.0,0.224747,0.185359,0.187618,0.657831,0.621987,0.618276,0.039388,0.037129
...,...,...,...,...,...,...,...,...,...
95,96.0,0.224361,0.18646,0.177699,0.657112,0.621346,0.616118,0.037901,0.046662
68,69.0,0.223706,0.18506,0.177462,0.657919,0.621751,0.615257,0.038646,0.046244
45,46.0,0.223654,0.185414,0.177447,0.657267,0.621031,0.615301,0.038239,0.046207
46,47.0,0.224973,0.18177,0.177321,0.658335,0.620983,0.614943,0.043203,0.047652


In [None]:
# Pin the seed that ranked first on ks_oot in the random_state sweep.
# NOTE(review): picking a seed by OOT performance risks fitting to the OOT sample.
fixed_params_bayes['random_state'] = 91

In [None]:
# Display the parameter set after all tuning rounds.
fixed_params_bayes

{'boosting_type': 'gbdt',
 'max_depth': 3,
 'learning_rate': 0.017,
 'objective': 'binary',
 'scale_pos_weight': 1,
 'min_split_gain': 0,
 'min_child_samples': 790,
 'subsample': 0.8,
 'colsample_bytree': 0.9,
 'reg_lambda': 3,
 'reg_alpha': 0,
 'random_state': 91,
 'force_col_wise': True,
 'verbose': -1,
 'n_estimators': 177}

### RFE筛选变量

In [None]:
# Build an estimator from the tuned parameters and evaluate it with the
# project helper model_eva; with if_dev=True the returned metrics also include
# auc_dev / ks_dev. model_eva presumably fits the model itself -- TODO confirm.
model = LGBMClassifier(**fixed_params_bayes)
model_eva(model, X_train, y_train, X_test, y_test, X_oot, y_oot, X_dev, y_dev, if_dev=True)

{'auc_train': 0.6449038162138588,
 'auc_test': 0.6661840700083733,
 'auc_oot': 0.6167619653097032,
 'ks_train': 0.20762208800199738,
 'ks_test': 0.24051000976656994,
 'ks_oot': 0.1789301357938317,
 'auc_dev': 0.6512914287170306,
 'ks_dev': 0.2153762057967396}

In [None]:
# Rebuild the estimator and evaluate it on train / test / OOT only.
# The model is presumably fitted inside model_eva (its feature_importances_
# are read in the following cells) -- TODO confirm.
model = LGBMClassifier(**fixed_params_bayes)
model_eva(model, X_train, y_train, X_test, y_test, X_oot, y_oot)

{'auc_train': 0.657831373978879,
 'auc_test': 0.6209031730708443,
 'auc_oot': 0.6184389340816632,
 'ks_train': 0.2224559975800564,
 'ks_test': 0.18614492067403526,
 'ks_oot': 0.19009189154657807}

In [None]:
# Number of features with a non-zero importance in the fitted model.
(model.feature_importances_ > 0).sum()

139

In [None]:
# Names of the features with non-zero importance.
# Assumes X_dev has the same column order as the data the model was fitted on
# -- TODO confirm.
rfe_feat = X_dev.columns[model.feature_importances_ > 0]

In [None]:
# Re-run the train / test / OOT evaluation on the same estimator object.
model_eva(model, X_train, y_train, X_test, y_test, X_oot, y_oot)

{'auc_train': 0.657831373978879,
 'auc_test': 0.6209031730708443,
 'auc_oot': 0.6184389340816632,
 'ks_train': 0.2224559975800564,
 'ks_test': 0.18614492067403526,
 'ks_oot': 0.19009189154657807}

In [None]:
# Run the project's RFE helper with a fresh, unfitted estimator and persist
# both returned tables to Excel. Presumably `feature_ranking` has one row per
# feature and `scores` holds the metrics per elimination step -- TODO confirm.
model = LGBMClassifier(**fixed_params_bayes)
feature_ranking, scores = filtering_feat_RFE(model, X_dev, y_dev, X_train, y_train, X_test, y_test, X_oot, y_oot)
feature_ranking.to_excel('feature_ranking_dev.xlsx', index=False)  # ranking starts at 1
scores.to_excel('rfe_scores_dev.xlsx')

#### 选择k个特征

In [None]:
# Keep the k best-ranked features (ranking is 1-based) and evaluate the model
# on train / test / OOT restricted to those columns.
k = 72
final_cols = feature_ranking.loc[feature_ranking['ranking'] <= k, 'feature'].values
model_eva(
    model,
    X_train[final_cols], y_train,
    X_test[final_cols], y_test,
    X_oot[final_cols], y_oot,
)

In [None]:
# Evaluate on the selected features again, this time also passing the dev set
# (if_dev=True) so dev metrics are reported.
model_eva(model, X_train[final_cols], y_train, X_test[final_cols], y_test, X_oot[final_cols], y_oot, X_dev[final_cols], y_dev, if_dev=True)

#### 查看每种类型所包含的变量数量

### 保存模型

In [None]:
# Fit the final model on the whole dev set restricted to the selected
# features, then display the per-feature importances of that fit.
model.fit(X_dev[final_cols], y_dev)
model.feature_importances_

array([19, 29, 77, 23, 41, 27, 37, 21, 22, 26, 10, 22, 35, 16, 17,  8, 17,
       22, 21, 29, 28, 26, 18, 17,  7, 18, 18, 12, 13, 15, 16, 16, 13, 16,
       15, 17, 13,  8, 11,  7, 16, 11, 11, 14, 18, 13, 11, 11, 11, 14, 11,
       11, 13, 11, 10,  8,  7, 13,  8,  9, 12, 10, 10,  6,  8,  7, 12,  5,
        4,  6,  5,  5])

In [None]:
# Build the importance table with the project helper lgb_shap_importance; the
# result has SHAP-based importance columns for dev and OOT plus split / gain
# importance per feature.
feature_df = lgb_shap_importance(model, X_dev[final_cols], X_oot[final_cols])
feature_df

Unnamed: 0,feature_name,dev_importance_shap,oot_importance_shap,importance_split,importance_gain
2,X67,0.099883,0.090816,77,5527.426035
6,ds16,0.039437,0.030132,37,932.805699
4,ds61,0.031415,0.031145,41,993.966118
12,k318,0.028120,0.090113,35,850.662497
1,X93,0.018346,0.018853,29,714.880214
...,...,...,...,...,...
57,311,0.001728,0.004647,13,115.599818
55,2017,0.001498,0.001500,8,111.411691
65,183,0.001449,0.000981,7,135.674501
68,435,0.001103,0.001001,4,92.208698


In [None]:
# Sanity check: how many selected features have zero split importance.
(feature_df['importance_split'] == 0).sum()

0

In [None]:
# Persist the final importance table (index included) to Excel.
feature_df.to_excel('feature_importance_最终.xlsx')

In [None]:
import pickle

# Model persistence is currently commented out. If it is re-enabled, prefer a
# context manager so the file handle is closed deterministically:
#     with open('ym202301.pkl', 'wb') as f:
#         pickle.dump(model, f)
# pickle.dump(model, open('ym202301.pkl', 'wb'))

### 打分

In [None]:
# KS of the 'score' column produced by get_score, on dev and on OOT,
# displayed as a (dev, oot) tuple.
KS(get_score(X_dev[final_cols], model)['score'], y_dev), KS(get_score(X_oot[final_cols], model)['score'], y_oot)

(0.2072856860027717, 0.18642023199556051)

In [None]:
# Feature matrix for the full population. Only the index columns and the
# selected features are parsed (usecols), instead of loading every column of
# the CSV a second time just to discard most of them.
score_index_cols = ['uuid', 'certno', 'custname', 'phone', 'channel_flag',
                    'his_overdue_date', 'occur_date', 'payout_date']
X_all = pd.read_csv(
    'data.csv',
    usecols=score_index_cols + list(final_cols),
).set_index(score_index_cols)[final_cols]

In [None]:
# Score every row of X_all with the final model and display the result.
# NOTE: this rebinds `scores`, which previously held the RFE score table
# returned by filtering_feat_RFE.
scores = get_score(X_all, model)
scores

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,好样本p,坏样本p,odds,score
uuid,certno,custname,phone,channel_flag,his_overdue_date,occur_date,payout_date,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2204220000001954,430821198209255138,柴龙军,13327248768,升级贷-360,0,2022-04-22,2022-06-10,0.915713,0.084287,0.092045,565.0
2204240000002438,445323199103180912,张荣浩,13729781617,升级贷-360,0,2022-04-24,2022-04-27,0.934079,0.065921,0.070573,580.0
2204240000002870,420325198807111923,邢欢,18716135235,升级贷-360,0,2022-04-24,2022-04-24,0.956261,0.043739,0.045740,605.0
2204240000002885,34122519870702633X,徐洪春,13057958729,升级贷-360,0,2022-04-24,2022-04-24,0.931540,0.068460,0.073491,578.0
2204240000002939,410381198307016574,石自强,15139939738,升级贷-360,0,2022-04-24,2022-05-12,0.936659,0.063341,0.067625,583.0
...,...,...,...,...,...,...,...,...,...,...,...
650598090861707264,140481199105036416,张强,13623551709,升级贷-携程,0,2022-07-28,2022-07-28,0.945315,0.054685,0.057848,592.0
651017425794891776,610322199505103611,麻鹏玉,18292878448,升级贷-携程,0,2022-07-29,2022-07-29,0.922166,0.077834,0.084403,570.0
651461584930603008,440923198912132949,李晓瑜,13422666391,升级贷-携程,0,2022-07-30,2022-07-30,0.933039,0.066961,0.071766,579.0
651702203389313024,371523199005050056,戚迪,18365870387,升级贷-携程,0,2022-07-31,2022-07-31,0.949521,0.050479,0.053162,596.0


In [None]:
# Export the scores. The row count in the file name is taken from the data
# rather than hard-coded, so the name stays accurate if the input changes
# (for the current data it is 92499).
scores.to_excel(f'升级贷X_scores_{len(scores)}.xlsx')

In [None]:
# Display the final estimator.
model