# Section 3.2.2 LightGBM Training and Inference

In [1]:
import joblib
import gc

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState

In [3]:
!pip install lightgbm==3.1.1
import lightgbm as lgb
lgb.__version__

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

'3.1.1'

In [4]:
!pip install hyperopt==0.2.7
import hyperopt
hyperopt.__version__

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

'0.2.7'

In [5]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

## Metric

In [6]:
def amex_metric(y_true, y_pred):
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose label is 0 carry weight 20, all other rows weight 1.

    :param y_true: 1-D sequence of ground-truth labels
    :param y_pred: 1-D sequence of predicted scores, same length as y_true
    :return: 0.5 * (gini_of_predictions / gini_of_labels + top_four_capture)
    """
    # Column 0 holds the labels, column 1 the predicted scores.
    pairs = np.array([y_true, y_pred]).T

    # Capture rate: share of all positives found within the first 4% of total weight
    # when rows are ranked by descending score.
    by_score = pairs[pairs[:, 1].argsort()[::-1]]
    row_weights = np.where(by_score[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(row_weights))
    captured = by_score[np.cumsum(row_weights) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    def _weighted_gini(sort_col):
        # Weighted Gini of the labels after ranking rows by `sort_col` (descending).
        ordered = pairs[pairs[:, sort_col].argsort()[::-1]]
        w = np.where(ordered[:, 0] == 0, 20, 1)
        cum_weight_share = np.cumsum(w / np.sum(w))
        weighted_pos = ordered[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - cum_weight_share) * w)

    gini_by_pred = _weighted_gini(1)   # ranking induced by the predictions
    gini_by_true = _weighted_gini(0)   # ranking induced by the labels themselves (normalizer)
    return 0.5 * (gini_by_pred / gini_by_true + top_four)

In [7]:
def lgb_amex_metric(y_pred, y_true):
    """Adapter that exposes amex_metric as a custom evaluation function (passed as feval).

    :param y_pred: predicted scores
    :param y_true: object exposing get_label(), which supplies the ground-truth labels
    :return: tuple ('AMEX_metric', score, True); the trailing True marks the
             metric as higher-is-better in LightGBM's feval convention
    """
    true_labels = y_true.get_label()
    score = amex_metric(true_labels, y_pred)
    return 'AMEX_metric', score, True

## 训练模型

### 最优参数

In [8]:
def params_append(params):
    """
    Write the fixed LightGBM settings into a parameter dict, in place.

    Existing entries under the same keys are overwritten.
    :param params: dict of LightGBM parameters
    :return params: the same dict object, with the fixed settings applied
    """
    fixed_settings = (
        ('feature_pre_filter', False),
        ('objective', "binary"),
        ('metric', "None"),        # a custom metric is used instead (declared via feval)
        ("is_unbalance", True),    # imbalanced data
        ("boosting", "dart"),      # options: gbdt, rf, dart
        ("verbose", -1),
    )
    for key, value in fixed_settings:
        params[key] = value

    # Optional overrides, currently disabled:
    # params["n_estimators"] = 200
    # params["device"] = "gpu"
    return params

best_clf 为 LightGBM 模型的最优参数组，这里手动填入；随后调用 params_append 写入固定参数（注意：boosting 会被覆盖为 dart）。

In [9]:
# Manually entered best hyper-parameter set for the LightGBM model.
best_clf = {
    'bagging_fraction': 0.9145573834812892,
    'bagging_freq': 10,
    'feature_fraction': 0.8371096595408576,
    'learning_rate': 0.0584746743556553,
    'max_depth': 23,
    'min_child_samples': 25,
    'num_leaves': 150,
    'feature_pre_filter': False,
    'objective': 'binary',
    'metric': 'None',
    'is_unbalance': True,
    'boosting': 'gbdt',  # replaced with "dart" by params_append below
    'verbose': -1,
}
# Re-apply the fixed settings; params_append mutates and returns the same dict.
best_clf = params_append(best_clf)

### 导入数据

In [10]:
# Load the combined training table from parquet
train_LGBM = pd.read_parquet(path="./input/8-CombinedData/train_lgbm_500.parquet")

In [11]:
# Data preparation: every column except the customer ID and the label is a feature
label = 'target'
features = train_LGBM.columns.tolist()
features.remove("customer_ID")
features.remove(label)

In [12]:
# Wrap the feature columns and the label column in a LightGBM Dataset
lgb_train = lgb.Dataset(data=train_LGBM[features], label=train_LGBM[label])

In [13]:
# Drop the notebook's reference to the raw training frame, then run the garbage collector.
# The value returned by gc.collect() is displayed as the cell output.
del train_LGBM
gc.collect()

0

In [14]:
# Train the final model on the full training set for a fixed number of rounds.
# early_stopping_rounds was removed: no valid_sets are passed, and LightGBM's early
# stopping requires at least one validation set and metric. It is also unavailable
# for DART boosting, which params_append selects, so it could never trigger here
# and would raise if boosting were switched back to gbdt.
# NOTE(review): feval / verbose_eval only produce evaluation output once valid_sets
# are supplied; they are kept so that adding a validation set is a one-line change.
bst = lgb.train(best_clf,
                lgb_train,
                num_boost_round=5000,
                verbose_eval=300,
                feval=lgb_amex_metric
                )



In [15]:
# Drop the notebook's reference to the training Dataset, then run the garbage collector.
# The value returned by gc.collect() is displayed as the cell output.
del lgb_train
gc.collect()

25

### 保存模型在本地

In [16]:
# Persist the trained booster to a local pickle file
joblib.dump(value=bst, filename='LightGBM-training-model.pkl')

['LightGBM-training-model.pkl']

## Inference

接下来，对测试集进行预测，并将结果写入本地文件

### 导入数据

In [17]:
# Load the combined test table from parquet
test_LGBM = pd.read_parquet(path="./input/8-CombinedData/test_lgbm_500.parquet")

In [18]:
# Score the test rows with the trained booster and store the result next to the IDs
test_LGBM['prediction'] = bst.predict(data=test_LGBM[features])
# Export only the ID and prediction columns as the submission file
test_LGBM.loc[:, ['customer_ID', 'prediction']].to_csv(
    path_or_buf="result/submission_LGBM.csv",
    index=False,
)

In [None]:
# Export the submission: customer_ID plus the 'prediction' column filled by bst.predict above.
# The previous selection of 'target' referenced a column this notebook never creates on
# test_LGBM (presumably absent from the test parquet — TODO confirm) and would have
# replaced the submission file with a wrongly named column.
test_LGBM[['customer_ID', 'prediction']].to_csv("result/submission_LGBM.csv", index=False)