# Section 2.5 Wrapper Method

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

In [None]:
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN

In [None]:
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.metrics import mean_squared_error

本节将使用多种集成模型（LightGBM、随机森林）的特征重要性（feature importance）来筛选最重要的特征。

## 导入数据

In [None]:
# Load the feature-engineered train and test tables in one pass.
train, test = (
    pd.read_parquet(path)
    for path in (
        "../data/baseline_train_fe1.parquet",
        "../data/baseline_test_fe1.parquet",
    )
)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
train.head()

Unnamed: 0,customer_ID,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,...,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.933824,0.024194,0.86858,0.960384,0.934745,0.230769,0.83205,0,3,...,13,0,1,13,-1,1,13,6,1,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.89982,0.022119,0.861109,0.929122,0.880519,7.153846,6.743468,0,19,...,13,0,1,13,-1,1,13,6,1,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.878454,0.028911,0.79767,0.904482,0.880875,0.0,0.0,0,0,...,13,2,1,13,-1,1,13,6,1,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.598969,0.020107,0.567442,0.623392,0.621776,1.538462,3.017046,0,9,...,13,0,1,13,-1,1,13,3,3,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.891679,0.042325,0.805045,0.940382,0.8719,0.0,0.0,0,0,...,13,0,1,13,1,1,13,6,1,0


In [None]:
# (rows, columns) of the training table; the columns include customer_ID and target.
train.shape

(458913, 920)

## Wrapper 特征筛选

### Metric

In [None]:
def amex_metric(y_true, y_pred):
    """
    Competition metric: mean of the normalized weighted Gini and the
    positive capture rate inside the top 4% (by weight) of predictions.

    Rows whose label is 0 carry weight 20, all other rows weight 1.

    :param y_true: sequence of ground-truth labels
    :param y_pred: sequence of predicted scores, same length as y_true
    :return: 0.5 * (gini_of_predictions / gini_of_perfect_ranking + top-4% capture rate)
    """
    # Column 0 holds the label, column 1 the prediction.
    pairs = np.array([y_true, y_pred]).T

    def _weighted_gini(sort_col):
        # Rank rows by the requested column, highest value first.
        ranked = pairs[pairs[:, sort_col].argsort()[::-1]]
        row_weight = np.where(ranked[:, 0] == 0, 20, 1)
        random_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = ranked[:, 0] * row_weight
        total_pos = np.sum(weighted_pos)
        lorentz = np.cumsum(weighted_pos) / total_pos
        return np.sum((lorentz - random_share) * row_weight)

    # Share of all positives found within the first 4% of cumulative weight.
    by_pred = pairs[pairs[:, 1].argsort()[::-1]]
    pred_weights = np.where(by_pred[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(pred_weights))
    captured = by_pred[np.cumsum(pred_weights) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    # Gini of the predicted ranking, normalized by the Gini of ranking by the labels themselves.
    gini_pred = _weighted_gini(1)
    gini_best = _weighted_gini(0)
    return 0.5 * (gini_pred / gini_best + top_four)

In [None]:
def lgb_amex_metric(y_pred, y_true):
    """
    Adapter that exposes amex_metric as a LightGBM `feval` callback.

    :param y_pred: predictions handed over by LightGBM
    :param y_true: object exposing `get_label()` that returns the true labels
    :return: tuple (metric name, metric value, is_higher_better)
    """
    score = amex_metric(y_true.get_label(), y_pred)
    return 'AMEX_metric', score, True

### LightGBM

In [None]:
def feature_selection_wrapper_lgbm(train, test, feature_num=300):
    """
    Select features by LightGBM feature importance accumulated over 5-fold
    stratified cross-validation.

    :param train: training DataFrame containing "customer_ID", "target" and the candidate features
    :param test: test DataFrame containing "customer_ID" and the candidate features
    :param feature_num: number of top-ranked features to keep (default 300)
    :return: (train, test) restricted to "customer_ID" plus the selected features;
             the returned training frame additionally keeps "target"
    """

    print("Start wrapper feature selection: LGBM")
    label = "target"
    # Candidate features: every column except the ID and the label.
    features = [col for col in train.columns if col not in ("customer_ID", label)]

    # LightGBM parameters
    params_initial = {
        'objective': "binary",
        "is_unbalance": True,  # imbalanced data (try False to compare)
        'num_leaves': 100,
        'learning_rate': 0.01,
        'boosting': 'gbdt',
        'min_data_in_leaf': 45,
        'bagging_seed': 2022,
        'bagging_fraction': 0.7,
        'bagging_freq': 10,
        'feature_fraction': 0.7,
        'max_depth': -1,
        'metric': "None",  # the custom metric is supplied through feval
        'reg_alpha': 0,
        'reg_lambda': 1,
    }

    # Training controls
    ESR = 300    # early-stopping patience, in rounds
    NBR = 1000   # boosting rounds (use 10000 for the real training run)
    VBE = 50     # logging interval, in rounds

    # Slice once instead of re-copying the wide feature frame for every fold.
    X = train[features]
    y = train[label]

    kf = StratifiedKFold(n_splits=5, random_state=2022, shuffle=True)
    fse = pd.Series(0, index=features)  # running total of per-fold importances

    for train_part_index, eval_index in kf.split(X, y):
        # kf.split yields positional indices, so .iloc (not .loc) is required
        # for frames whose index is not a default RangeIndex.
        train_part = lgb.Dataset(X.iloc[train_part_index], y.iloc[train_part_index])
        eval_part = lgb.Dataset(X.iloc[eval_index], y.iloc[eval_index])

        # Train on the fold while monitoring both the training and validation parts.
        # Early stopping and logging go through callbacks: the equivalent
        # early_stopping_rounds / verbose_eval keyword arguments were removed in LightGBM 4.0.
        bst = lgb.train(
            params_initial,
            train_part,
            num_boost_round=NBR,
            valid_sets=[train_part, eval_part],
            valid_names=['train', 'valid'],
            feval=lgb_amex_metric,
            callbacks=[
                lgb.early_stopping(ESR),
                lgb.log_evaluation(VBE),
            ],
        )

        # Accumulate this fold's importances; the Series index aligns them by feature name.
        fse += pd.Series(bst.feature_importance(), index=features)

    # Keep the feature_num highest-ranked features.
    top_features = fse.sort_values(ascending=False).index.tolist()[:feature_num]
    feature_select = ["customer_ID"] + top_features
    print("wrapper feature selection: LGBM Done!")

    return train[feature_select + [label]], test[feature_select]

In [None]:
# Keep the 500 features ranked highest by the LightGBM wrapper.
train_lgbm, test_lgbm = feature_selection_wrapper_lgbm(
    train,
    test,
    feature_num=500,
)

Start wrapper feature selection: LGBM
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149174
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 910
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
Training until validation scores don't improve for 300 rounds
[50]	train's AMEX_metric: 0.753134	valid's AMEX_metric: 0.741936
[100]	train's AMEX_metric: 0.760245	valid's AMEX_metric: 0.749485
[150]	train's AMEX_metric: 0.76711	valid's AMEX_metric: 0.754621
[200]	train's AMEX_metric: 0.773231	valid's AMEX_metric: 0.759161
[250]	train's AMEX_metric: 0.779006	valid's AMEX_metric: 0.763827
[300]	train's AMEX_metric: 0.784021	valid's AMEX_metric: 0.767223
[350]	train's AMEX_metric: 0.789208	valid's AMEX_metric: 0.770682
[400]	train's AMEX_metric: 0.79382	valid's AM

KeyboardInterrupt: 

可以运行，但耗时较长，可以适当减少训练轮次；对于特征筛选而言，继续增加训练轮次对特征重要性排序的影响应该不大。

加上了`"is_unbalance": True`好像效果反而没那么好？

反正过拟合有点严重

### Random Forest

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def feature_selection_wrapper_RF(train, test, feature_num=300):
    """
    Select features by random-forest feature importance accumulated over
    2-fold stratified cross-validation.

    :param train: training DataFrame containing "customer_ID", "target" and the candidate features
    :param test: test DataFrame containing "customer_ID" and the candidate features
    :param feature_num: number of top-ranked features to keep (default 300)

    :return: (train, test) restricted to "customer_ID" plus the selected features;
             the returned training frame additionally keeps "target"
    """

    print("Start wrapper feature selection: Random Forest")
    label = "target"
    # Candidate features: every column except the ID and the label.
    features = [col for col in train.columns if col not in ("customer_ID", label)]

    # Random forests cannot handle missing values. Filling with a constant does
    # not depend on the fold, so do it once instead of inside the loop.
    X = train[features].fillna(0)
    y = train[label]

    # Target minority/majority ratio for SMOTE over-sampling.
    smote_ratio = 0.1

    kf = StratifiedKFold(n_splits=2, random_state=2022, shuffle=True)
    fse = pd.Series(0.0, index=features)  # running total of per-fold importances

    # Only the training part of each split is used; importances need no hold-out part.
    for train_index, _ in kf.split(X, y):
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]

        # Rebalance the training part only: SMOTE over-sampling followed by ENN cleaning.
        enn = EditedNearestNeighbours(n_neighbors=5)
        class_counts = y_train.value_counts()
        n_to_generate = int(class_counts.max() * smote_ratio - class_counts.min())
        if n_to_generate > 0:
            smote = SMOTE(random_state=1, sampling_strategy=smote_ratio)
            resampler = SMOTEENN(smote=smote, enn=enn)
        else:
            # SMOTE raises ValueError when the requested ratio would not add any
            # minority samples, so in that case only the ENN cleaning is applied.
            print("Minority ratio already at or above %s; skipping SMOTE, applying ENN only" % smote_ratio)
            resampler = enn

        X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)

        rfc = RandomForestClassifier(
            random_state=2022,
            min_samples_split=100,
            min_samples_leaf=20,
            max_depth=8,
            max_features='sqrt',
            criterion="gini",  # split criterion; unrelated to the evaluation metric
            class_weight={0: 1, 1: 40}  # extra weight on the positive class
        )
        rfc.fit(X_resampled, y_resampled)

        # Accumulate this fold's importances; the Series index aligns them by feature name.
        fse += pd.Series(rfc.feature_importances_, index=features)

    # Keep the feature_num highest-ranked features.
    top_features = fse.sort_values(ascending=False).index.tolist()[:feature_num]
    feature_select = ["customer_ID"] + top_features
    print("wrapper feature selection: Random Forest Done!")

    return train[feature_select + [label]], test[feature_select]

In [None]:
# Keep the 500 features ranked highest by the random-forest wrapper.
train_rf, test_rf = feature_selection_wrapper_RF(
    train,
    test,
    feature_num=500,
)

In [None]:
# The selected frame holds customer_ID, the top-ranked features and target.
train_rf.shape

(458913, 302)

可以运行，通过返回的数据集我们可以看到最终选取了哪些特征。

如果想知道每个特征的重要性水平，那么就需要让函数额外返回 `fse` 这个 Series（各折累加后的特征重要性）。