In [1]:
!pip install hyperopt==0.2.7
import hyperopt
hyperopt.__version__

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


'0.2.7'

In [2]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import time
from hyperopt import hp, fmin, tpe, space_eval
import joblib
import gc

In [3]:
# Mount Google Drive at /content/drive so the project files are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Switch the working directory; the notebook is assumed to sit in the project's notebooks folder.
import os
def get_root_dir():
    """Return the notebook folder: the Drive path when running in Colab, './' otherwise."""
    running_in_colab = os.path.exists('/content/drive/MyDrive/')
    if not running_in_colab:
        return './'  # running locally
    return '/content/drive/MyDrive/Colab/4-AMEX/AMEX Project/notebooks'  # running in Colab

# os.chdir is the working equivalent of `cd`; a bare `!cd` does not change the kernel's directory.
os.chdir(get_root_dir())
print(get_root_dir())

/content/drive/MyDrive/Colab/4-AMEX/AMEX Project/notebooks


Load data

In [5]:
# Read the training labels and report the table's shape.
targets = pd.read_csv('../data/1-original-data/train_labels.csv')
print(targets.shape)

(458913, 2)


In [6]:
# Class balance of the label column.
targets['target'].value_counts()

0    340085
1    118828
Name: target, dtype: int64

In [7]:
# Class-imbalance weight: number of negative samples divided by number of positive samples.
class_counts = targets['target'].value_counts()
num_pos = class_counts.loc[1]
num_neg = class_counts.loc[0]
pos_weight = num_neg / num_pos
print(pos_weight)

2.861993806173629


In [8]:
# Load the feature-selected training set from parquet and report its shape.
train_df = pd.read_parquet("../data/8-CombinedData/FeatureSelection/train_lgbm.parquet", engine='pyarrow') #1000 features
print(train_df.shape)
# Alternative input kept for reference: the full combined training set.
# train_df = pd.read_parquet("../data/8-CombinedData/combined-data-train.parquet", engine='pyarrow') #2000 features
# print(train_df.shape)

(458913, 1002)


In [9]:
# The label table is no longer needed; train_df carries its own 'target' column.
del targets

In [10]:
# Class balance of the target column inside the training frame.
train_df['target'].value_counts()

0    340085
1    118828
Name: target, dtype: int64

Calculate metrics

In [11]:
def amex_metric(y_true, y_pred):
    """
    Compute the AMEX competition score: the mean of a normalized weighted Gini
    and the share of positives captured in the top 4% (by weight) of predictions.

    Rows whose label equals 0 carry weight 20; all other rows carry weight 1.

    :param y_true: ground-truth labels (presumably 0/1 - verify against caller)
    :param y_pred: predicted scores, higher meaning more likely positive
    :return: 0.5 * (gini_of_predictions / gini_of_perfect_ranking + top_four_capture)
    """
    pairs = np.transpose(np.array([y_true, y_pred]))

    # Capture rate: positives found within the first 4% of cumulative weight
    # when rows are ranked by descending prediction.
    by_pred = pairs[pairs[:, 1].argsort()[::-1]]
    row_weight = np.where(by_pred[:, 0] == 0, 20, 1)
    inside_cutoff = np.cumsum(row_weight) <= int(0.04 * np.sum(row_weight))
    top_four = np.sum(by_pred[inside_cutoff][:, 0]) / np.sum(by_pred[:, 0])

    def _weighted_gini(sort_col):
        # Weighted Gini of the ranking induced by column `sort_col` (0 = label, 1 = prediction).
        ranked = pairs[pairs[:, sort_col].argsort()[::-1]]
        w = np.where(ranked[:, 0] == 0, 20, 1)
        random_share = np.cumsum(w / np.sum(w))
        weighted_pos = ranked[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_share) * w)

    gini_model = _weighted_gini(1)    # ranking by prediction
    gini_perfect = _weighted_gini(0)  # ranking by the true label (normalizer)
    return 0.5 * (gini_model / gini_perfect + top_four)

In [12]:
def xgb_amex_metric(y_pred: np.ndarray, dtrain: xgb.DMatrix):
    """
    Adapter exposing amex_metric as an XGBoost custom evaluation function.

    :param y_pred: predictions produced by the booster
    :param dtrain: DMatrix whose labels are used as ground truth
    :return: tuple of (metric name, metric value)
    """
    score = amex_metric(dtrain.get_label(), y_pred)
    return 'AMEX_metric', score

In [13]:
def params_append(params, pos_weight=5):
    """
    Return a copy of ``params`` extended with the fixed XGBoost settings.

    The input dictionary is NOT modified; callers must use the returned dict.

    :param params: dict of XGBoost parameters (e.g. sampled hyperparameters)
    :param pos_weight: value stored under ``scale_pos_weight`` to counter class imbalance
    :return: new dict holding the original entries plus the fixed settings
    """
    merged = dict(params)  # copy so the caller's dict is left untouched
    merged['objective'] = "binary:logistic"
    merged['disable_default_eval_metric'] = 1  # a custom metric is supplied through feval
    merged["scale_pos_weight"] = pos_weight  # imbalanced data
    merged["booster"] = "gbtree"  # gbtree, gblinear, dart
    merged["verbosity"] = 1
    # Optional GPU settings, left disabled:
    # merged["gpu_id"] = 0
    # merged["tree_method"] = "gpu_hist"

    return merged

In [14]:
def param_hyperopt(train, pos_weight):
    """
    Search XGBoost hyperparameters with hyperopt's TPE algorithm.

    :param train: training DataFrame containing "customer_ID", "target" and the feature columns
    :param pos_weight: value forwarded to params_append as ``scale_pos_weight``
    :return: dict with the best hyperparameters, decoded to their real values
             (e.g. ``max_depth`` is the tree depth, not the index of the hp.choice option)
    """

    # Part 1. Feature names: every column except the ID and the label.
    label = "target"
    features = train.columns.tolist()
    features.remove("customer_ID")
    features.remove("target")

    # Part 2. Wrap the training data once; it is reused by every evaluation.
    train_data = xgb.DMatrix(train[features], label=train[label])

    # Part 3. Objective: hyperparameters in, loss out.
    def hyperopt_objective(params):
        """
        Cross-validate one hyperparameter set.

        :param params: hyperparameters sampled from the search space
        :return: negated best mean test AMEX metric (hyperopt minimizes)
        """
        # Add the fixed settings on a copy so the sampled dict is not altered.
        params = params_append(dict(params), pos_weight)
        print(f"XGBoost hyperparameters：{params}")

        t = time.time()  # start of this evaluation

        # 5-fold stratified CV with the custom metric.
        res = xgb.cv(params, train_data,
                     num_boost_round = 300,  # maximum boosting rounds (5000 for a full run)
                     nfold=5,  # number of CV folds
                     stratified=True,  # imbalanced data
                     shuffle=True,
                     # early_stopping_rounds=500,  # not needed in dart mode
                     show_stdv=False,
                     seed=2022,
                     verbose_eval=50,  # 1000
                     feval = xgb_amex_metric,
                     # eval_train_metric=True,
                    )

        # Report how long this evaluation took.
        dur = round((time.time() - t) / 60, 2)
        print(f"本次贝叶斯优化evaluation的消耗时间 {dur} mins")

        # The metric is higher-is-better while fmin minimizes, hence the negation.
        return -max(res["test-AMEX_metric-mean"])

    # Part 4. XGBoost hyperparameter search space.
    params_space = {
        'learning_rate': hp.uniform('learning_rate', 5e-3, 5e-2),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_lambda' : hp.uniform('reg_lambda', 0, 1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        # .tolist() makes the options plain Python ints so the decoded result is
        # a regular int rather than a numpy scalar.
        'max_depth': hp.choice('max_depth', np.arange(5, 30, 3, dtype=int).tolist()),
        'subsample': hp.uniform('subsample', 0, 1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    }

    # Part 5. TPE search (fmin minimizes the objective).
    params_best = fmin(
        hyperopt_objective,
        space=params_space,
        algo=tpe.suggest,
        max_evals=10,  # 50
        rstate=np.random.default_rng(2022)
    )

    # Pitfall: fmin reports hp.choice parameters as option indices, so the raw
    # result must be decoded with space_eval before it is used as model parameters.
    best_params = space_eval(params_space, params_best)

    # Return the decoded parameters (previously the raw fmin result was returned).
    return best_params

In [15]:
# Run the hyperparameter search on the training frame with the class-imbalance weight.
best_params = param_hyperopt(train_df, pos_weight)

XGBoost hyperparameters：{'colsample_bytree': 0.9290029116234898, 'gamma': 4.627192778950635, 'learning_rate': 0.03127817260904797, 'max_depth': 8, 'min_child_weight': 2.0, 'reg_lambda': 0.44594212646423004, 'subsample': 0.6348844043084347, 'objective': 'binary:logistic', 'disable_default_eval_metric': 1, 'scale_pos_weight': 2.861993806173629, 'booster': 'gbtree', 'verbosity': 1}
[0]	train-AMEX_metric:0.777407	test-AMEX_metric:0.769957

[50]	train-AMEX_metric:0.801588	test-AMEX_metric:0.785533

  0%|          | 0/10 [47:16<?, ?trial/s, best loss=?]


KeyboardInterrupt: ignored

In [None]:
import pickle

# Persist the tuned hyperparameters next to the notebook.
with open("xgb_best_parameters.pickle", "wb") as params_file:
    pickle.dump(best_params, params_file)

In [None]:
# Show the hyperparameters returned by the search.
print(best_params)

In [None]:
def xgb_train(train, params, pos_weight):
    """
    Train the final XGBoost model on the full training frame.

    :param train: training DataFrame containing "customer_ID", "target" and the feature columns
    :param params: tuned hyperparameters (the dict is not modified)
    :param pos_weight: value forwarded to params_append as ``scale_pos_weight``
    :return: the trained xgboost Booster
    """
    print("*"*50)
    print("XGBoost 开始正式训练！")
    print("*"*50)

    # Part 1. Feature names: every column except the ID and the label.
    label = "target"
    features = train.columns.tolist()
    features.remove("customer_ID")
    features.remove("target")

    print(f"将要使用的XGBoost的最优参数：{params}")

    # Part 2. Fixed parameters and iteration controls.
    params = params_append(dict(params), pos_weight)  # work on a copy of the caller's dict
    ESR = 500   # early-stopping patience
    NBR = 20    # boosting rounds; raise (e.g. 10000) for a real training run
    VBE = 1000  # logging period

    # Wrap the training data; xgb.train expects a DMatrix, not a DataFrame.
    dtrain = xgb.DMatrix(train[features], label=train[label])

    # Only the training set is monitored here, as in the original set-up.
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=NBR,
        evals=[(dtrain, 'train')],
        early_stopping_rounds=ESR,
        maximize=True,  # the AMEX metric is higher-is-better
        verbose_eval=VBE,
        feval = xgb_amex_metric,  # custom metric
    )
    return bst

In [None]:
# Train the final model with the tuned hyperparameters.
bst=xgb_train(train_df, best_params, pos_weight)

In [None]:
# Feature columns used for prediction: everything except the ID and the label.
# The training frame is named train_df in this notebook (`train` only exists as
# a function parameter), so build the list from train_df.
label = "target"
features = train_df.columns.tolist()
features.remove("customer_ID")
features.remove("target")

In [None]:
# Release the training frame and the tuned parameters before loading the test set.
# The frame is bound to train_df; there is no global named `train`.
del train_df, best_params

In [None]:
# Load the feature-selected test set from parquet and report its shape.
test_df = pd.read_parquet("../data/8-CombinedData/FeatureSelection/test_lgbm.parquet", engine='pyarrow')
print(test_df.shape)

In [None]:
# Predict on the test set and write the submission file.
# Booster.predict expects a DMatrix, so wrap the feature columns first
# (same column order as used for training).
test_df['target'] = bst.predict(xgb.DMatrix(test_df[features]))
os.makedirs("result", exist_ok=True)  # make sure the output folder exists
test_df[['customer_ID', 'target']].to_csv("result/submission_xgboost.csv", index=False)

In [None]:
# Preview the submission columns.
test_df[['customer_ID', 'target']]