# Section 3.2 LightGBM with Bayesian Optimisation

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    # google.colab could not be imported (presumably a local, non-Colab run),
    # so there is nothing to mount and the cell silently continues.
    pass

Mounted at /content/drive


In [None]:
# 修改当前文件夹位置 假定notebook文件就在项目文件夹根目录
import os
def get_root_dir():
    """
    Pick the directory the notebook should run from.

    :return: the project's notebooks folder on Google Drive when the Colab
        Drive folder is present, otherwise './' (local run)
    """
    # The presence of the mounted Drive folder is used as the "running in Colab" signal
    on_colab = os.path.exists('/content/drive/MyDrive/Colab/')
    return '/content/drive/MyDrive/Colab/4-AMEX/AMEX Project/notebooks' if on_colab else './'

# Change the kernel's working directory (the equivalent of `cd`); a plain `!cd` does not work here
os.chdir(get_root_dir())

In [None]:
!pip install lightgbm==3.1.1
import lightgbm as lgb
lgb.__version__

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightgbm==3.1.1
  Downloading lightgbm-3.1.1-py2.py3-none-manylinux1_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 4.3 MB/s 
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 2.2.3
    Uninstalling lightgbm-2.2.3:
      Successfully uninstalled lightgbm-2.2.3
Successfully installed lightgbm-3.1.1


'3.1.1'

In [None]:
!pip install hyperopt==0.2.7
import hyperopt
hyperopt.__version__

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hyperopt==0.2.7
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 4.3 MB/s 
[?25hCollecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 73.0 MB/s 
Installing collected packages: py4j, hyperopt
  Attempting uninstall: hyperopt
    Found existing installation: hyperopt 0.1.2
    Uninstalling hyperopt-0.1.2:
      Successfully uninstalled hyperopt-0.1.2
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7


'0.2.7'

In [None]:
import time
import numpy as np
import pandas as pd

In [None]:
from numpy.random import RandomState

In [None]:
import lightgbm as lgb
from hyperopt import hp, fmin, tpe
from sklearn.model_selection import StratifiedKFold

In [None]:
from hyperopt import hp, fmin, tpe
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

In [None]:
import hyperopt
hyperopt.__version__

'0.2.7'

其中hp是参数空间创建函数，fmin是参数搜索函数，tpe（Tree-structured Parzen Estimator）则是一种基于贝叶斯优化的搜索策略。

## 导入数据

我们使用的数据是压缩之后的最终数据：Part I + Part II + Part III

保证一行对应一个唯一客户。

In [None]:
# Load the processed train and test tables from parquet.
# The paths are relative to the current working directory.
train_LGBM = pd.read_parquet("../data/2-processed-demo/train_fe.parquet")
test_LGBM = pd.read_parquet("../data/2-processed-demo/test_fe.parquet")

In [None]:
# Sanity check: (rows, columns) of the training frame
train_LGBM.shape

(458913, 920)

In [None]:
# Replace every missing value with 0 in both frames.
# The same names are rebound, so the unfilled frames are no longer reachable after this cell.
train_LGBM = train_LGBM.fillna(0)
test_LGBM = test_LGBM.fillna(0)

## Metric

In [None]:
def amex_metric(y_true, y_pred):
    """
    Competition metric: mean of the normalised weighted Gini and the
    top-4% capture rate.

    Rows whose label is 0 carry weight 20, all other rows weight 1.

    :param y_true: 1-D array-like of labels
    :param y_pred: 1-D array-like of predicted scores, same length as y_true
    :return: 0.5 * (gini_of_predictions / gini_of_perfect_ordering + top_four)
    """
    def _pairs_sorted_by(column):
        # Stack (label, score) as rows, then order rows by `column`, descending.
        pairs = np.transpose(np.array([y_true, y_pred]))
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weighted_gini(column):
        ordered = _pairs_sorted_by(column)
        row_weight = np.where(ordered[:, 0] == 0, 20, 1)
        # Cumulative share of total weight: the "random model" baseline curve
        random_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = ordered[:, 0] * row_weight
        # Cumulative share of weighted positives found so far
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_share) * row_weight)

    # Capture rate: share of all positives that sit inside the highest-scored
    # rows whose cumulative weight stays within 4% of the total weight.
    ranked = _pairs_sorted_by(1)
    ranked_weight = np.where(ranked[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(ranked_weight))
    captured = ranked[np.cumsum(ranked_weight) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(ranked[:, 0])

    # Gini when ordering by score (column 1), normalised by the Gini obtained
    # when ordering by the labels themselves (column 0).
    model_gini = _weighted_gini(1)
    ideal_gini = _weighted_gini(0)
    return 0.5 * (model_gini / ideal_gini + top_four)

In [None]:
def lgb_amex_metric(y_pred, y_true):
    """
    Adapter that exposes amex_metric in the (preds, dataset) shape LightGBM
    expects from a custom `feval`.

    :param y_pred: predicted scores
    :param y_true: object providing the labels through `get_label()`
    :return: tuple (metric name, metric value, True); the trailing True is
        the "higher is better" flag of LightGBM's feval convention
    """
    true_labels = y_true.get_label()
    score = amex_metric(true_labels, y_pred)
    return 'AMEX_metric', score, True

## 贝叶斯优化

### 参数回调函数

对于lgb模型来说，并不是所有的超参数都需要进行搜索。

为了防止多次实例化模型过程中部分超参数被设置成默认参数，我们首先需要创建一个参数回调函数，用于在后续多次实例化模型过程中反复声明这部分参数的固定取值：

In [None]:
def params_append(params):
    """
    Force the non-searched LightGBM settings onto a parameter dictionary.

    The dictionary is modified in place and the same object is returned.

    :param params: LightGBM parameter dictionary
    :return params: the same dictionary with the fixed settings applied
    """
    fixed_settings = {
        'feature_pre_filter': False,
        'objective': "binary",
        'metric': "None",       # the custom metric is supplied through feval instead
        "is_unbalance": True,   # imbalanced data
        "boosting": "dart",     # one of gbdt, rf, dart
        "verbose": 1,
    }
    for key, value in fixed_settings.items():
        params[key] = value
    return params

### 寻找最优超参数

In [None]:
def param_hyperopt(train):
    """
    Search LightGBM hyper-parameters with hyperopt's TPE algorithm.

    Every candidate is scored with 5-fold stratified `lgb.cv` (3000 boosting
    rounds) using the custom AMEX metric; the search minimises the negated
    best mean validation score.

    :param train: training DataFrame holding a "customer_ID" column, a
        "target" label column and the feature columns
    :return best_params: dict of the best hyper-parameter VALUES found.
        `hp.choice` entries are decoded with `space_eval`, so the dict holds
        real values rather than indices into the option lists.
    """

    # Part 1. Feature names: every column except the ID and the label
    label = "target"
    features = train.columns.tolist()
    features.remove("customer_ID")
    features.remove("target")

    # Part 2. Wrap the training data for LightGBM
    train_data = lgb.Dataset(train[features], train[label])

    # Part 3. Objective evaluated by hyperopt for each candidate
    def hyperopt_objective(params):
        """
        Score one hyper-parameter candidate.

        :param params: candidate sampled from the search space
        :return: negated maximum of the mean validation AMEX metric
        """
        # Add the fixed (non-searched) settings to the candidate
        params = params_append(params)
        print(f"检查使用的LGBM参数：{params}")

        t = time.time()  # start of this evaluation

        res = lgb.cv(params, train_data,
                     num_boost_round=3000,  # maximum number of boosting rounds
                     nfold=5,               # number of CV folds
                     stratified=True,       # keep the class ratio per fold (imbalanced data)
                     shuffle=True,
                     metrics="None",        # only the custom feval is evaluated
                     # early_stopping_rounds=500,  # early stopping is not needed in dart mode
                     show_stdv=False,
                     seed=2022,
                     verbose_eval=500,
                     feval=lgb_amex_metric,
                     eval_train_metric=True,
                     )

        # Report how long this evaluation took
        dur = round((time.time() - t) / 60, 2)
        print(f"本次贝叶斯优化evaluation的消耗时间 {dur} mins")

        # fmin minimises, while the metric should be maximised, hence the sign flip
        return -max(res["valid AMEX_metric-mean"])

    # Part 4. Search space
    params_space = {
        'learning_rate': hp.uniform('learning_rate', 5e-3, 5e-1),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1),
        'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
        'num_leaves': hp.choice('num_leaves', np.arange(30, 200, 10, dtype=int)),
        # 'reg_alpha': hp.randint('reg_alpha', 0, 10),
        # 'reg_lambda': hp.uniform('reg_lambda', 0, 10),
        'bagging_freq': hp.randint('bagging_freq', 3, 13),
        'min_child_samples': hp.choice('min_child_samples', list(range(10, 50, 5))),
        'max_depth': hp.choice('max_depth', np.arange(5, 30, 3, dtype=int)),
    }

    # Part 5. TPE search. For hp.choice dimensions fmin reports the INDEX of
    # the chosen option, not the option's value.
    best_raw = fmin(
        hyperopt_objective,
        space=params_space,
        algo=tpe.suggest,
        max_evals=30,
        rstate=np.random.default_rng(2022)
    )

    # Decode the raw result (choice indices) into actual parameter values
    best_params = space_eval(params_space, best_raw)

    # Return the decoded values; returning the raw fmin result here would hand
    # choice indices (e.g. num_leaves=1) to the training step.
    return best_params

开始进行贝叶斯优化，求取最佳的超参数

In [None]:
# Run the hyper-parameter search on the training frame.
# This is slow: the recorded log below shows ~187 minutes for the first evaluation.
best_clf = param_hyperopt(train_LGBM)

检查使用的LGBM参数：{'bagging_fraction': 0.7919796956560885, 'bagging_freq': 7, 'feature_fraction': 0.5728155231585855, 'learning_rate': 0.11142176050178165, 'max_depth': 5, 'min_child_samples': 35, 'num_leaves': 70, 'feature_pre_filter': False, 'objective': 'binary', 'metric': 'None', 'is_unbalance': True, 'boosting': 'dart', 'verbose': -1}
[500]	cv_agg's train AMEX_metric: 0.815719	cv_agg's valid AMEX_metric: 0.789643
[1000]	cv_agg's train AMEX_metric: 0.835931	cv_agg's valid AMEX_metric: 0.791827
[1500]	cv_agg's train AMEX_metric: 0.853065	cv_agg's valid AMEX_metric: 0.791435
[2000]	cv_agg's train AMEX_metric: 0.870885	cv_agg's valid AMEX_metric: 0.790698
[2500]	cv_agg's train AMEX_metric: 0.889634	cv_agg's valid AMEX_metric: 0.790933
[3000]	cv_agg's train AMEX_metric: 0.904314	cv_agg's valid AMEX_metric: 0.790236
本次贝叶斯优化evaluation的消耗时间 186.85 mins
检查使用的LGBM参数：{'bagging_fraction': 0.8450629040529597, 'bagging_freq': 10, 'feature_fraction': 0.8949698698614581, 'learning_rate': 0.385669958384

### 输出最优参数

In [None]:
# Display the search result held in best_clf
best_clf

**请注意这里有一个天坑！！！** 我们必须在倒数第二行使用`space_eval(params_space, params_best)`

如果不写，则直接输出`best_clf`是一个字典，这个东西看似是最优参数，其实不全是。如果你在搜索空间里设置的是**枚举类型(hp.choice)**，那么返回的是**索引**而不是真实值！

有的时候会看见`num_leaves`为1，简直是匪夷所思。因为我在参数空间中根本没有设置过这样的数字。其实它是指你在参数空间中声明的可能值的那个数组的下标1对应的数字。

因此我必须要利用`space_eval(参数空间, best_clf)`这个函数将返回的字典结果转化为装有最优参数的结果。

In [None]:
# Persist the search result to best_parameters.pkl in the current working directory
import pickle
with open("best_parameters.pkl", "wb") as tf:
    pickle.dump(best_clf, tf)

这样我们就得到了通过贝叶斯优化得到的最优超参数。

通过这个最优超参数，我们可以在下面重新训练，并且结合交叉验证进行模型预测。

## 训练模型

### 利用交叉验证进行模型预测

<center><img src="https://s2.loli.net/2021/12/08/ALF3cfuSwmB7b8z.png" alt="image-20211208192640281" style="zoom:33%;" /></center>

In [None]:
def train_predict(train, test, params):
    """
    Train LightGBM with stratified 5-fold cross-validation and save predictions.

    For every fold a model is fitted on the training part and evaluated on the
    held-out part with the custom AMEX metric. Test predictions are averaged
    over the folds.

    Side effects:
      * writes "train_lightgbm.csv" (out-of-fold predictions in row order),
        "test_lightgbm.csv" (fold-averaged test predictions) and
        "submission_lightgbm.csv" (customer_ID + target) to the working directory
      * adds/overwrites a "target" column on `test`
      * `params` is updated in place by params_append

    :param train: training DataFrame with "customer_ID", "target" and feature columns
    :param test: test DataFrame with "customer_ID" and the same feature columns
    :param params: LightGBM parameter dictionary (tuned values)
    :return: None
    """
    print("*"*50)
    print("LGBM 开始正式训练！")
    print("*"*50)

    # Part 1. Feature names: every column except the ID and the label
    label = "target"
    features = train.columns.tolist()
    features.remove("customer_ID")
    features.remove("target")

    print(f"将要使用的LGBM的最优参数：{params}")

    # Part 2. Fixed parameters and iteration controls
    params = params_append(params)
    ESR = 500       # early-stopping patience
    NBR = 2000      # boosting rounds; can be raised for a final model
    VBE = 1000      # logging period
    N_SPLITS = 5    # number of CV folds
    # NOTE(review): params_append sets boosting="dart"; LightGBM is documented
    # to disable early stopping in dart mode, so ESR may have no effect - confirm.

    # Part 3. Result containers
    prediction_test = 0     # running sum of test predictions over folds
    cv_score = []           # validation score of each fold
    oof_parts = []          # out-of-fold predictions, one Series per fold

    # Part 4. Cross-validation
    kf = StratifiedKFold(n_splits=N_SPLITS, random_state=2022, shuffle=True)
    fold_indices = kf.split(train[features], train[label])
    for iteration, (train_part_index, eval_index) in enumerate(fold_indices, start=1):
        print("*"*30)
        print(f"开始第{iteration}折的交叉验证！")
        print("*"*30)

        t = time.time()  # start of this fold

        # kf.split yields POSITIONAL indices, so rows are selected with .iloc
        # (.loc would only be right when the frame has a default RangeIndex).
        train_part = lgb.Dataset(
            train[features].iloc[train_part_index],
            train[label].iloc[train_part_index]
        )
        eval_features = train[features].iloc[eval_index]
        eval_part = lgb.Dataset(
            eval_features,
            train[label].iloc[eval_index]
        )

        # Fit on the training part and monitor both parts with the custom metric
        bst = lgb.train(
            params,
            train_part,
            num_boost_round=NBR,
            valid_sets=[train_part, eval_part],
            valid_names=['train', 'valid'],
            early_stopping_rounds=ESR,
            verbose_eval=VBE,
            feval=lgb_amex_metric,  # custom metric (params["metric"] is "None")
        )

        # Accumulate test predictions; they are averaged after the loop
        prediction_test += bst.predict(test[features])

        # Predict the held-out fold once and reuse it for storage and scoring
        eval_pre = bst.predict(eval_features)
        oof_parts.append(pd.Series(eval_pre, index=eval_index))

        score = amex_metric(train[label].iloc[eval_index].values, eval_pre)
        cv_score.append(score)

        # Report how long this fold took
        dur = round((time.time() - t) / 60, 2)
        print(f"第{iteration}个iteration训练所消耗的时间 {dur} mins")

    print("*"*50)
    print("LGBM 训练结束！开始保存结果！")
    print("*"*50)

    # Part 5. Report and save results
    print(f"验证集得分：{cv_score}, 验证集平均分：{sum(cv_score)/len(cv_score)}")

    # Out-of-fold predictions restored to the original row order
    prediction_train = pd.concat(oof_parts)
    pd.Series(prediction_train.sort_index().values).to_csv("train_lightgbm.csv", index=False)

    # Fold-averaged test predictions
    pd.Series(prediction_test/N_SPLITS).to_csv("test_lightgbm.csv", index=False)

    # The fold average is the final prediction (written onto the caller's frame)
    test['target'] = prediction_test/N_SPLITS

    # Submission file in the competition format
    test[["customer_ID", 'target']].to_csv("submission_lightgbm.csv", index=False)

    return

In [None]:
# Uncomment to rerun the hyper-parameter search if best_clf is not already defined:
# best_clf = param_hyperopt(train_LGBM)
train_predict(train_LGBM, test_LGBM, best_clf)