# Section 3.2.2 LightGBM Training and Inference

In [None]:
import joblib
import gc

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState

In [None]:
!pip install lightgbm==3.1.1
import lightgbm as lgb
lgb.__version__

In [None]:
!pip install hyperopt==0.2.7
import hyperopt
hyperopt.__version__

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

## Metric

In [None]:
def amex_metric(y_true, y_pred):
    """
    Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose true label equals 0 carry weight 20; every other row carries weight 1.

    :param y_true: sequence of true labels
    :param y_pred: sequence of predicted scores, aligned with ``y_true``
    :return: 0.5 * (gini_of_predictions / gini_of_true_ordering + top_four_capture)
    """
    # Column 0 holds the true label, column 1 the predicted score.
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _weighted_gini(sort_col):
        # Order rows by the requested column (descending) and accumulate the
        # weighted gap between the Lorentz curve and the random baseline.
        ordered = pairs[pairs[:, sort_col].argsort()[::-1]]
        row_weight = np.where(ordered[:, 0] == 0, 20, 1)
        random_curve = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = ordered[:, 0] * row_weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_curve) * row_weight)

    # Capture rate: share of all positives found in the highest-scored rows whose
    # cumulative weight stays within 4% of the total weight.
    ranked = pairs[pairs[:, 1].argsort()[::-1]]
    ranked_weight = np.where(ranked[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(ranked_weight))
    captured = ranked[np.cumsum(ranked_weight) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(ranked[:, 0])

    gini_by_prediction = _weighted_gini(1)
    gini_by_truth = _weighted_gini(0)
    return 0.5 * (gini_by_prediction / gini_by_truth + top_four)

In [None]:
def lgb_amex_metric(y_pred, y_true):
    """
    Adapter exposing ``amex_metric`` in the shape of a LightGBM custom eval function.

    :param y_pred: predicted scores
    :param y_true: object providing ``get_label()`` that returns the true labels
    :return: tuple ``('AMEX_metric', score, True)``; the trailing ``True`` is the
             flag LightGBM reads as "higher is better"
    """
    score = amex_metric(y_true.get_label(), y_pred)
    return 'AMEX_metric', score, True

## 训练模型

### 最优参数

In [None]:
def params_append(params):
    """
    Overlay the fixed LightGBM settings onto a parameter dict.

    The dict is updated in place and the same object is returned.

    :param params: dict of LightGBM parameters
    :return params: the same dict with the fixed settings applied
    """
    fixed_settings = {
        'feature_pre_filter': False,
        'objective': "binary",
        'metric': "None",      # a custom metric is declared through feval instead
        "is_unbalance": True,  # imbalanced data
        "boosting": "gbdt",    # options: gbdt, rf, dart
        "verbose": -1,
    }
    # Overrides left disabled in this notebook:
    #   params["n_estimators"] = 200
    #   params["device"] = "gpu"
    params.update(fixed_settings)
    return params

此时 `best_clf` 即为 LightGBM 模型的最优参数组，下面手动输入 `best_clf`。

In [None]:
# Hand-entered best parameter set; params_append re-applies the fixed settings
# and hands back the same dict.
best_clf = params_append({
    'bagging_fraction': 0.9145573834812892,
    'bagging_freq': 10,
    'feature_fraction': 0.8371096595408576,
    'learning_rate': 0.0584746743556553,
    'max_depth': 23,
    'min_child_samples': 25,
    'num_leaves': 150,
    'feature_pre_filter': False,
    'objective': 'binary',
    'metric': 'None',
    'is_unbalance': True,
    'boosting': 'gbdt',
    'verbose': -1,
})

{'bagging_fraction': 0.9022336069269954,
 'bagging_freq': 2,
 'feature_fraction': 0.9373662317255621,
 'learning_rate': 0.014947332175194025,
 'min_child_samples': 5,
 'num_leaves': 7,
 'reg_alpha': 2,
 'reg_lambda': 3.5907566887206896}

### 导入数据

In [None]:
# Load the combined training table from parquet.
train_LGBM = pd.read_parquet("../data/8-CombinedData/combined-data-train.parquet")

In [None]:
# Data preparation: every column except customer_ID and the target is used as a feature.
label = 'target'
features = train_LGBM.columns.tolist()
features.remove("customer_ID")  # ID column is excluded from the model inputs
features.remove('target')  # the label column must not appear among the features

In [None]:

# Wrap the feature columns and the label column in a LightGBM Dataset.
lgb_train = lgb.Dataset(train_LGBM[features], train_LGBM[label])

In [None]:
# Train the model on the full training set with the best parameter set.
# NOTE(review): no num_boost_round is passed, so the library default number of
# boosting rounds applies — confirm this is intended.
bst = lgb.train(best_clf, lgb_train)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65975
[LightGBM] [Info] Number of data points in the train set: 201917, number of used features: 300
[LightGBM] [Info] Start training from score -0.393636


In [None]:
# Drop the training Dataset and trigger a garbage-collection pass.
del lgb_train # TODO: decide what else should be freed here
gc.collect()

### 保存模型在本地

In [None]:
# Save the trained booster locally.
# `bst` is the model trained above on the full training set; the notebook as shown
# defines no `model`, `CFG` or `fold`, so the file name is built from the boosting
# type in `best_clf` with a "full" suffix instead of fold / seed placeholders.
joblib.dump(bst, f'../models/baseline/lgbm_{best_clf["boosting"]}_full.pkl')

## Inference

接下来，对测试集进行预测，并将结果写入本地文件

### 导入数据

In [None]:
# Load the combined test table from parquet.
test_LGBM = pd.read_parquet("../data/8-CombinedData/combined-data-test.parquet")

In [None]:
# Score the test set with the trained booster and store the result in a 'target' column.
test_LGBM['target'] = bst.predict(test_LGBM[features])
# Write the customer_ID / target pairs to the submission CSV without the index column.
# NOTE(review): this path is relative to the working directory ("result/"), unlike the
# "../data" and "../models" paths used elsewhere — confirm the directory exists.
test_LGBM[['customer_ID', 'target']].to_csv("result/submission_LGBM.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_LGBM['target'] = bst.predict(test_LGBM[features])


In [None]:
# Preview the first rows of the submission columns instead of rendering the whole frame.
test_LGBM[['customer_ID', 'target']].head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.418856
1,C_ID_130fd0cbdd,-0.752626
2,C_ID_b709037bc5,-0.030933
3,C_ID_d27d835a9f,-0.245119
4,C_ID_2b5e3df5c2,-0.36699
