In [None]:
!pip install hyperopt==0.2.7
import hyperopt
hyperopt.__version__

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hyperopt==0.2.7
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 5.0 MB/s 
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 72.0 MB/s 
Installing collected packages: py4j, hyperopt
  Attempting uninstall: hyperopt
    Found existing installation: hyperopt 0.1.2
    Uninstalling hyperopt-0.1.2:
      Successfully uninstalled hyperopt-0.1.2
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7


'0.2.7'

In [None]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import time
from hyperopt import hp, fmin, tpe, space_eval

In [None]:
# 把Google Drive挂载到Colab里
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    pass

Mounted at /content/drive


In [None]:
# 修改当前文件夹位置 假定notebook文件就在项目文件夹根目录
import os
def get_root_dir():
    if os.path.exists('/content/drive/MyDrive/'):
        return '/content/drive/MyDrive/AMEX Project/notebooks' #在Colab里
    else:
        return './' #在本地

#调用系统命令，相当于cd，但是直接!cd是不行的
os.chdir(get_root_dir())

Load data

In [None]:
train_df = pd.read_parquet("../data/8-CombinedData/FeatureSelection/train_lgbm.parquet", engine='pyarrow')
train_df = train_df.sample(n=10000, random_state=2022)
print(train_df.shape)

(10000, 1002)


In [None]:
# calculate weight scaling
num_pos = train_df['target'].value_counts().loc[1]
num_neg = train_df['target'].value_counts().loc[0]
pos_weight = num_neg / num_pos
print(pos_weight)

2.852080123266564


In [None]:
# test_df = pd.read_parquet("../data/2-processed-demo/test_fe.parquet", engine='pyarrow').iloc[:1000, :]
# print(test_df.shape)

Calculate metrics

In [None]:
def amex_metric(y_true, y_pred):
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1]/gini[0] + top_four)

In [None]:
def xgb_amex_metric(y_pred: np.ndarray, dtrain: xgb.DMatrix):
    y_true = dtrain.get_label()
    return 'AMEX_metric', amex_metric(y_true, y_pred)

In [None]:
def params_append(params, pos_weight):
    """
    动态回调参数函数，params视作字典
    :param params:lgb参数字典
    :return params:修正后的lgb参数字典
    """
    params['objective'] = "binary:logistic"
    params['disable_default_eval_metric'] = 1 # 将用自定义metric（feval中声明）
    params["scale_pos_weight"] = pos_weight # 不平衡数据
    params["booster"] = "gbtree" # gbtree, gblinear, dart
    params["verbosity"] = 1
    # params["gpu_id"] = 0
    # params["tree_method"] = "gpu_hist"


    return params

In [None]:
def param_hyperopt(train, pos_weight):
    """
    模型参数搜索与优化函数
    :param train:训练数据集
    :return params_best:lgb最优参数
    """
    
    # Part 1.划分特征名称，删除ID列和标签列
    label = "target"
    features = train.columns.tolist()
    features.remove("customer_ID")
    features.remove("target")
    
    # Part 2.封装训练数据
    train_data = xgb.DMatrix(train[features], label=train[label])
    
    # Part 3.内部函数，输入模型超参数损失值输出函数
    def hyperopt_objective(params):
        """
        输入超参数，输出对应损失值
        :param params:
        :return:最大自定义metric
        """
        # 创建参数集
        params = params_append(params, pos_weight)
        print(f"XGBoost hyperparameters：{params}")

        t = time.time() # 记录时间
        
        # 借助lgb的cv过程，输出某一组超参数下损失值的最小值
        res = xgb.cv(params, train_data, 
                     num_boost_round = 300,# 最大迭代次数 5000
                     nfold=5, # 交叉验证的次数（n折交叉验证）
                     stratified=True, # 不平衡数据
                     shuffle=True,
                     # early_stopping_rounds=500, #dart模式下面不需要early stopping
                     show_stdv=False,
                     seed=2022,
                     verbose_eval=50, # 1000
                     feval = xgb_amex_metric,
                     # eval_train_metric=True,
                    )
        
        # 追踪记录
        dur = round((time.time() - t) / 60, 2)
        print(f"本次贝叶斯优化evaluation的消耗时间 {dur} mins")
        
        # 打印训练后的字典
        # print(f"交叉验证后的结果：{res}", type(res))

        
        return -max(res["test-AMEX_metric-mean"]) # 最大化自定义metric，但请注意我们在贝叶斯优化中的目标函数是最小化，所以要在前面加上负号
    

    # Part 4.xgb超参数空间
    params_space = {
        'learning_rate': hp.uniform('learning_rate', 5e-3, 5e-2),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_lambda' : hp.uniform('reg_lambda', 0, 1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'max_depth': hp.choice('max_depth', np.arange(5, 30, 3, dtype=int)),
        'subsample': hp.uniform('subsample', 0, 1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    }
    
    # Part 5.TPE超参数搜索
    params_best = fmin( # 注意是最小化这个最优参数
        hyperopt_objective,
        space=params_space,
        algo=tpe.suggest,
        max_evals=10, # 50
        rstate=np.random.default_rng(2022)
    )
    
    # 天坑！必须要用space_eval处理！
    best_params=space_eval(params_space, params_best)
    
    # 返回最佳参数
    return params_best

In [None]:
best_params = param_hyperopt(train_df, pos_weight)

XGBoost hyperparameters：{'colsample_bytree': 0.9290029116234898, 'gamma': 4.627192778950635, 'learning_rate': 0.03127817260904797, 'max_depth': 8, 'min_child_weight': 2.0, 'reg_lambda': 0.44594212646423004, 'subsample': 0.6348844043084347, 'objective': 'binary:logistic', 'disable_default_eval_metric': 1, 'scale_pos_weight': 2.852080123266564, 'booster': 'gbtree', 'verbosity': 1}
[0]	train-AMEX_metric:0.773967	test-AMEX_metric:0.720308

[50]	train-AMEX_metric:0.920603	test-AMEX_metric:0.765059

[100]	train-AMEX_metric:0.970284	test-AMEX_metric:0.768389

[150]	train-AMEX_metric:0.991202	test-AMEX_metric:0.772483

[200]	train-AMEX_metric:0.998258	test-AMEX_metric:0.770833

[250]	train-AMEX_metric:0.999686	test-AMEX_metric:0.769987

[299]	train-AMEX_metric:0.999947	test-AMEX_metric:0.768815

本次贝叶斯优化evaluation的消耗时间 7.56 mins
XGBoost hyperparameters：{'colsample_bytree': 0.8064002562284813, 'gamma': 6.3416045911551615, 'learning_rate': 0.03605566136476637, 'max_depth': 29, 'min_child_weight':

In [None]:
import pickle
with open("xgb_best_parameters.pickle", "wb") as tf:
    pickle.dump(best_params, tf)

In [None]:
print(best_params)

{'colsample_bytree': 0.8064002562284813, 'gamma': 6.3416045911551615, 'learning_rate': 0.03605566136476637, 'max_depth': 8, 'min_child_weight': 8.0, 'reg_lambda': 0.7472997965324492, 'subsample': 0.5714266399284903}


In [None]:
# load data with 500 records
train_df = pd.read_parquet("../data/8-CombinedData/FeatureSelection/train_lgbm_500.parquet", engine='pyarrow')
# train_df = train_df.sample(n=10000, random_state=2022)
print(train_df.shape)

(458913, 502)


In [None]:
def xgb_train_predict(train, params, pos_weight):
    """
    :param train:
    :param test:
    :param params:
    :return:
    """
    print("*"*50)
    print("XGBoost 开始正式训练！")
    print("*"*50)

    # Part 1.选择特征
    label = "target"
    features = train.columns.tolist()
    features.remove("customer_ID")
    features.remove("target")

    print(f"将要使用的XGBoost的最优参数：{params}")

    # Part 2.申明固定参数与控制迭代参数
    params = params_append(params, pos_weight)
    ESR = 500
    NBR = 20 # 10000训练模型可以调高
    VBE = 1000

    # Part 3.创建结果存储容器
    # 测试集预测结果存储器，后保存至本地文件
    # prediction_test = 0
    # 验证集的模型表现，作为展示用
    cv_score = []
    # 验证集的预测结果存储器，后保存至本地文件
    prediction_train = pd.Series()

    # Part 3.交叉验证
    iteration = 1
    kf = StratifiedKFold(n_splits=5, random_state=2022, shuffle=True)
    for train_part_index, eval_index in kf.split(train[features], train[label]):
        print("*"*30)
        print(f"开始第{iteration}折的交叉验证！")
        print("*"*30)
        iteration += 1

        t = time.time() # 记录时间

        
        # 训练数据封装
        train_part = xgb.DMatrix(
            train[features].loc[train_part_index],
            train[label].loc[train_part_index]
        )
        # 测试数据封装
        eval = xgb.DMatrix(
            train[features].loc[eval_index],
            train[label].loc[eval_index]
        )
        # 依据验证集训练模型
        bst = xgb.train(
            params,
            train_part,
            num_boost_round=NBR,
            evals=[(train_part, 'train'), (eval, 'eval')],
            early_stopping_rounds=ESR,
            verbose_eval=VBE,   
            feval = xgb_amex_metric, # 自定义metric
        )

        # 测试集预测结果并纳入prediction_test容器
        # prediction_test += bst.predict(xgb.DMatrix(test[features]))
        
        # 验证集预测结果
        eval_pre = bst.predict(xgb.DMatrix(train[features].loc[eval_index]))
        
        # 验证集预测结果并纳入prediction_train容器
        prediction_train = prediction_train.append(
            pd.Series(eval_pre,index=eval_index))

        # 计算验证集上得分metric
        score = amex_metric(train[label].loc[eval_index].values, eval_pre)

        # 纳入cv_score容器
        cv_score.append(score)

        # 追踪记录
        dur = round((time.time() - t) / 60, 2)
        print(f"第{iteration}个iteration训练所消耗的时间 {dur} mins")


    print("*"*50)
    print("XGBoost 训练结束！开始保存结果！")
    print("*"*50)
        
    # Part 4.打印/输出结果
    # 打印验证集得分与平均得分
    print(f"验证集得分：{cv_score}, 验证集平均分：{sum(cv_score)/5}")
    
    # 将验证集上预测结果写入本地文件
    # print(len(prediction_train.sort_index().values))
    pd.Series(prediction_train.sort_index().values).to_csv("../models/XGBoost/train_xgboost1.csv", index=False)
    
    # save model
    bst.save_model("../models/XGBoost/xgboost1.json") #xgboost format
    with open("xgboost1.pickle", "wb") as f: # pickle file
        pickle.dump(bst, f)
    return

In [None]:
xgb_train_predict(train_df, best_params, pos_weight)

**************************************************
XGBoost 开始正式训练！
**************************************************
将要使用的XGBoost的最优参数：{'colsample_bytree': 0.8064002562284813, 'gamma': 6.3416045911551615, 'learning_rate': 0.03605566136476637, 'max_depth': 8, 'min_child_weight': 8.0, 'reg_lambda': 0.7472997965324492, 'subsample': 0.5714266399284903}




******************************
开始第1折的交叉验证！
******************************
[0]	train-AMEX_metric:0.776732	eval-AMEX_metric:0.765522
Multiple eval metrics have been passed: 'eval-AMEX_metric' will be used for early stopping.

Will train until eval-AMEX_metric hasn't improved in 500 rounds.
[19]	train-AMEX_metric:0.794749	eval-AMEX_metric:0.779567
第2个iteration训练所消耗的时间 3.62 mins
******************************
开始第2折的交叉验证！
******************************
[0]	train-AMEX_metric:0.774768	eval-AMEX_metric:0.770478
Multiple eval metrics have been passed: 'eval-AMEX_metric' will be used for early stopping.

Will train until eval-AMEX_metric hasn't improved in 500 rounds.
[19]	train-AMEX_metric:0.793066	eval-AMEX_metric:0.780327
第3个iteration训练所消耗的时间 3.47 mins
******************************
开始第3折的交叉验证！
******************************
[0]	train-AMEX_metric:0.776361	eval-AMEX_metric:0.774446
Multiple eval metrics have been passed: 'eval-AMEX_metric' will be used for early stopping.

Will train until eva

In [None]:
def xgb_predict(test_path, model_path):
    # load data
    test = pd.read_parquet(test_path, engine='pyarrow')
    print(test.shape)
    # load model
    model = xgb.Booster()
    model.load_model(model_path)
    # extract features
    features = test.columns.tolist()
    features.remove("customer_ID")
    # predict
    print("predicting...")
    dtest = xgb.DMatrix(test[features])
    test_pred = model.predict(dtest)
    test['target'] = test_pred
    # save test result
    print("saving...")
    test[['customer_ID', 'target']].to_csv("../models/XGBoost/submission_xgboost1.csv", index=False)
    return

In [None]:
test_path = "../data/8-CombinedData/FeatureSelection/test_lgbm_500.parquet"
model_path = "../models/XGBoost/xgboost1.json"
xgb_predict(test_path, model_path)

(924621, 501)
predicting...
saving...


In [33]:
# compare with other submission
sub1 = pd.read_csv('../results/1-tuffline-plotly-amex-submission-799.csv')
sub3 = pd.read_csv("../results/2-blend-boosting-study-submission-799.csv")

In [39]:
sub4 = pd.read_csv("../results/9.5-ensemble5-submission.csv")

In [35]:
 customers = sub1['customer_ID'].sample(n=10000, random_state=2022)
 print(customers.shape)

(10000,)


In [36]:
sub1_n = sub1[sub1['customer_ID'].isin(customers)]
sub2_n = sub2[sub2['customer_ID'].isin(customers)]
sub3_n = sub3[sub3['customer_ID'].isin(customers)]

In [40]:
sub4_n = sub4[sub4['customer_ID'].isin(customers)]

In [None]:
import plotly.graph_objects as go

In [41]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=sub1_n['prediction']))
fig.add_trace(go.Histogram(x=sub4_n['prediction']))
fig.add_trace(go.Histogram(x=sub3_n['prediction']))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()