# Section 2.5 Wrapper Method

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

In [2]:
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN

In [3]:
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.metrics import mean_squared_error

本节将会使用多种集成模型的feature importance来筛选最重要的特征们。

## 导入数据 (Import Data)


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# 修改当前文件夹位置 假定notebook文件就在项目文件夹根目录
import os
def get_root_dir():
    if os.path.exists('/content/drive/MyDrive'):
        return '/content/drive/MyDrive/AMEX Project/notebooks' #在Colab里
    else:
        return './' #在本地

#调用系统命令，相当于cd，但是直接!cd是不行的
os.chdir(get_root_dir())

In [None]:
print(os.getcwd())

/content/drive/.shortcut-targets-by-id/1-6G5JjzxCCkstqGX9WjENeODam918ybc/4-AMEX/AMEX Project/notebooks


In [15]:
test = pd.read_parquet("../data/8-CombinedData/FeatureSelection/test_lgbm.parquet")
train = pd.read_parquet("../data/8-CombinedData/FeatureSelection/train_lgbm_500.parquet")

In [18]:
tmp = list(train.columns)
tmp

['customer_ID',
 'D_114&0&S_9',
 'D_39_last',
 'D_46_last',
 'P_2_last',
 'B_5_last',
 'B_11_last',
 'B_2_last',
 'P_3_last',
 'B_1_last',
 'R_1_last',
 'LSTM-Embedding98',
 'D_120&1&S_9',
 'B_37_last',
 'B_9_last',
 'LSTM-Embedding74',
 'S_27_avg',
 'D_48_last',
 'B_13_avg',
 'S_3_last',
 'S_5_last',
 'S_7_last',
 'B_4_last',
 'B_14_last',
 'S_23_last',
 'B_30&1&S_9',
 'B_3_last',
 'LSTM-Embedding19',
 'S_9_min',
 'S_7_min',
 'B_14_avg',
 'D_112_last',
 'D_43_last',
 'D_114&0&S_3',
 'LSTM-Embedding3',
 'S_9_last',
 'S_12_min',
 'LSTM-Embedding28',
 'P_3_max',
 'B_13_max',
 'S_24_avg',
 'S_3_min',
 'S_26_max',
 'D_41_last',
 'S_7_avg',
 'D_46_min',
 'S_26_avg',
 'D_114&0&S_7',
 'B_37_avg',
 'B_18_last',
 'S_25_last',
 'D_50_last',
 'B_11_avg',
 'B_9_avg',
 'S_3_max',
 'S_12_max',
 'LSTM-Embedding92',
 'B_28_min',
 'R_3_last',
 'B_14_min',
 'D_44_last',
 'LSTM-Embedding31',
 'LSTM-Embedding37',
 'B_40_last',
 'S_22_avg',
 'D_71_avg',
 'B_4_max',
 'R_5_last',
 'LSTM-Embedding75',
 'D_114

In [19]:
tmp.remove('target')

In [21]:
test = test[tmp]

In [None]:
train.head()

Unnamed: 0,customer_ID,B_38&5&S_9,D_114&1&S_13,D_114&0&D_79,D_64&-1&D_91,D_117&4&R_4,D_64&0&R_4,B_30&0&R_2,D_114&1&D_111,D_114&1&D_74,...,D_64_last_0,D_64_last_2,D_64_last_3,D_68_last_1,D_68_last_2,D_68_last_3,D_68_last_4,D_68_last_5,D_68_last_6,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,-0.219021,2.18961,-0.190002,0.309325,-0.140324,-0.179559,-0.270268,-0.943571,-0.269386,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,-0.219021,-0.024353,-0.190002,0.309325,-0.140324,-0.179559,-0.270268,-0.943571,-0.468583,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,-0.219021,-0.63333,-0.190002,0.309325,-0.140324,-0.179559,-0.270268,-0.461414,-0.468583,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,-0.219021,1.193602,-0.190002,0.309325,-0.140324,-0.179559,-0.270268,-0.943571,-0.368984,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,-0.219021,-0.63333,-0.190002,0.309325,-0.140324,-0.179559,-0.270268,-0.943571,0.328206,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [7]:
train.shape

(458913, 1002)

## Wrapper 特征筛选

### Metric

In [8]:
def amex_metric(y_true, y_pred):
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1]/gini[0] + top_four)

In [9]:
def lgb_amex_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'AMEX_metric', amex_metric(y_true, y_pred), True

### LightGBM

In [11]:
def feature_selection_wrapper_lgbm(train, feature_num=500):
    """
    lgm特征重要性筛选函数
    
    :param train:训练数据集
    :param test:测试数据集
    :return:特征筛选后的训练集和测试集
    """
    
    # 提取候选特征：删除ID列和标签列
    print("Start wrapper feature selection: LGBM")
    label = "target"
    features = train.columns.tolist()
    print(features)
    features.remove("customer_ID")
    features.remove("target")

    # 配置lgb参数
    params_initial = {
        'objective': "binary",
        "is_unbalance": True, # 不平衡数据（尝试为false的效果）
        'num_leaves': 100,
        'learning_rate': 0.01,
        'boosting': 'gbdt',
        'min_data_in_leaf': 45,
        'bagging_seed': 2022,
        'bagging_fraction': 0.7,
        'bagging_freq': 10,
        'feature_fraction': 0.7,
        'max_depth': -1,
        'metric': "None", # 我将要使用自定义metric，在feval中声明
        'reg_alpha': 0,
        'reg_lambda': 1,
    }
    
    # 控制参数
    ESR = 80     # 提前验证迭代效果或停止
    NBR = 400  # 迭代次数（正式训练变成10000）
    VBE = 50     # 打印间隔
    
    # 开始交叉验证
    kf = StratifiedKFold(n_splits=5, random_state=2022, shuffle=True) #分层抽样
    fse = pd.Series(0, index=features) # 创建空容器存储重要性水平结果
    
    for train_part_index, eval_index in kf.split(train[features], train[label]):
        # 封装训练数据集
        train_part = lgb.Dataset(
            train[features].loc[train_part_index],
            train[label].loc[train_part_index])
        # 封装验证数据集
        eval = lgb.Dataset(
            train[features].loc[eval_index],
            train[label].loc[eval_index])
        
        # 在训练集上进行训练，并同时进行验证
        bst = lgb.train(
            params_initial, 
            train_part,    
            num_boost_round=NBR, # 迭代次数
            valid_sets=[train_part, eval], # 验证集包括train和val
            valid_names=['train', 'valid'],
            early_stopping_rounds=ESR, 
            verbose_eval=VBE,
            feval = lgb_amex_metric, # 自定义metric
            
        )
        
        # 输出特征重要性计算结果，并进行累加
        fse += pd.Series(bst.feature_importance(), features)
    
    # 选择最重要的前k个特征
    feature_select = ["customer_ID"] + fse.sort_values(ascending=False).index.tolist()[:feature_num]
    print("wrapper feature selection: LGBM Done!")
    
    return train[feature_select + ["target"]]

In [12]:
train_lgbm = feature_selection_wrapper_lgbm(train, 500)

Start wrapper feature selection: LGBM
['customer_ID', 'D_114&0&S_9', 'D_39_last', 'D_46_last', 'P_2_last', 'B_11_last', 'B_5_last', 'B_2_last', 'P_3_last', 'R_1_last', 'B_1_last', 'LSTM-Embedding98', 'D_120&1&S_9', 'B_9_last', 'LSTM-Embedding74', 'B_37_last', 'S_27_avg', 'D_48_last', 'B_13_avg', 'S_3_last', 'S_5_last', 'S_23_last', 'S_7_last', 'B_4_last', 'B_14_last', 'B_3_last', 'B_30&1&S_9', 'LSTM-Embedding19', 'D_114&0&S_3', 'S_9_min', 'S_7_min', 'B_14_avg', 'D_43_last', 'S_9_last', 'LSTM-Embedding28', 'D_112_last', 'P_3_max', 'S_24_avg', 'S_12_min', 'LSTM-Embedding3', 'B_13_max', 'D_41_last', 'S_7_avg', 'S_3_min', 'S_26_max', 'D_114&0&S_7', 'D_50_last', 'S_26_avg', 'S_3_max', 'LSTM-Embedding75', 'LSTM-Embedding92', 'B_37_avg', 'D_46_min', 'B_18_last', 'B_11_avg', 'B_28_min', 'R_3_last', 'D_71_avg', 'B_5_avg', 'S_25_last', 'B_9_avg', 'B_40_last', 'B_14_min', 'D_44_last', 'S_12_max', 'B_13_min', 'B_4_max', 'LSTM-Embedding31', 'LSTM-Embedding37', 'LSTM-Embedding60', 'R_4_last', 'S_22_

In [13]:
train_lgbm.to_parquet(path = '../data/8-CombinedData/FeatureSelection/train_lgbm_500.parquet', engine = 'pyarrow')

In [None]:
del train_lgbm

In [None]:
# test_lgbm.to_parquet(path = '../data/7-FeatureSelection /Part2/test_lgbm.parquet', engine = 'pyarrow')

可以运行，但时间有点长。可以适当降低训练轮次。对于筛选特征，过多的轮次应该差别不大。

加上了`"is_unbalance": True`好像效果反而没那么好？

反正过拟合有点严重

### Random Forest

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def feature_selection_wrapper_RF(train, feature_num=1000):
    """
    RF特征重要性筛选函数

    :param train:训练集
    :param test:测试集
    :param best_clf:最优的分类器模型

    :return: 特征筛选后的训练集和测试集
    """

    # 提取候选特征：删除ID列和标签列
    print("Start wrapper feature selection: Random Forest")
    label = "target"
    features = train.columns.tolist()
    features.remove("customer_ID")
    features.remove("target")
    
    # 开始交叉验证
    kf = StratifiedKFold(n_splits=2, random_state=2022, shuffle=True)
    fse = pd.Series(0, index=features) # 创建空容器存储重要性水平结果
    for train_index, test_index in kf.split(train[features], train[label]):

        # 得到训练集和测试集
        X_train, X_test = train[features].iloc[train_index], train[features].iloc[test_index]
        y_train, y_test = train[label].iloc[train_index], train[label].iloc[test_index]
        
        # 处理缺失值：随机森林无法处理缺失值，必须要保证数据集没有缺失值
        X_train = X_train.fillna(0)
        print(X_train.isna().sum().sum(), y_train.isna().sum().sum())
        
        # 处理不平衡数据
        # SMOTE + ENN 注意只针对训练集调整
        smote = SMOTE(random_state = 1, sampling_strategy=0.4)
        enn = EditedNearestNeighbours(n_neighbors=5, sampling_strategy = 'majority')
        smote_enn = SMOTEENN(smote=smote, enn=enn)
    
        X_SMOTEENN, y_SMOTEENN = smote_enn.fit_resample(X_train, y_train) # 模型变成smote_enn
    
        # 训练模型
        rfc = RandomForestClassifier(
            random_state=2022,
            min_samples_split=100,
            min_samples_leaf=20,
            max_depth=8,
            max_features='sqrt',
            criterion="gini",  # 注意此处和metric不同
            class_weight={0: 1, 1: 40}  # 处理不平衡问题
        )
        
        rfc.fit(X_SMOTEENN, y_SMOTEENN)
        
        # 对重要性水平进行累加
        fse += pd.Series(rfc.feature_importances_, features)


    # 选择最重要的前k个特征
    feature_select = ["customer_ID"] + fse.sort_values(ascending=False).index.tolist()[:feature_num]
    print("wrapper feature selection: Random Forest Done!")

    return train[feature_select + ["target"]]

In [None]:
train_rf = feature_selection_wrapper_RF(tmp, test, 1000)

NameError: ignored

In [None]:
list(set(train.columns).difference(set(train_rf)))

In [None]:
train_rf.to_parquet(path = '../data/8-CombinedData/FeatureSelection/train_rf.parquet', engine = 'pyarrow')

In [23]:
test.to_parquet(path = '../data/8-CombinedData/FeatureSelection/test_lgbm_500.parquet', engine = 'pyarrow')

In [24]:
test.shape

(924621, 501)

In [25]:
train.shape

(458913, 502)

In [None]:
# train_rf.shape

可以运行，通过返回的数据集我们可以看大最终选取了哪些特征。

如果想知道每个特征的重要性水平，那么我们就需要专门返回fse这个数组