知识补充：
* 学习模型融合的几种方式：
1. 模型结果层面的融合技术：回归任务中加权融合；分类任务中的voting
2. 从样本集的角度考虑把多个弱模型集成起来：Boosting; Bagging （一般用于集成学习）；Boosting和Bagging的区别
3. 构建多层模型：stacking；blending

* 学习使用heamy模块进行模型在线融合

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import chi2
#from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
warnings.filterwarnings('ignore')

In [4]:
#reduce_mem_usage 函数通过调整数据类型，帮助我们减少数据在内存中占用的空间
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [5]:
#训练数据/测试数据准备

data_train = pd.read_csv('data_train_for_model-09-27.csv')
data_train = reduce_mem_usage(data_train)

data_test = pd.read_csv('data_test_for_model-09-27.csv')
data_test = reduce_mem_usage(data_test)

Memory usage of dataframe is 723200128.00 MB
Memory usage after optimization is: 128800128.00 MB
Decreased by 82.2%
Memory usage of dataframe is 179200128.00 MB
Memory usage after optimization is: 32000128.00 MB
Decreased by 82.1%


In [11]:
features = [f for f in data_train.columns if f not in ['id','isDefault']]

X_train = data_train[features]
X_test = data_test[features]

y_train = data_train['isDefault']

In [9]:
#建立模型

def xgb_model(X_train, y_train, X_test, y_test=None):
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    train_matrix = xgb.DMatrix(X_train_split , label=y_train_split)
    valid_matrix = xgb.DMatrix(X_val , label=y_val)
    test_matrix = xgb.DMatrix(X_test)

    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'exact',
        'seed': 2020,
        'n_jobs': -1,
        "silent": True,
        
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.04,
        
        'scale_pos_weight': 1,
    }
    watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
    
    model = xgb.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
    """计算在验证集上的得分"""
    val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后xgboost单模型在验证集上的AUC：{}'.format(roc_auc))
    """对测试集进行预测"""
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    
    return test_pred
    

In [10]:
def lgb_model(X_train, y_train, X_test, y_test=None):
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)
    
    # 调参后的最优参数
    params = {'objective' : 'binary',
         'metric' : 'auc',
         'max_depth' : 5,
         'num_leaves' : 31,
          
         'learning_rate' : 0.01,    #0.005
          
         'feature_fraction' : 0.8,
         'bagging_fraction' : 0.8,
         'bagging_freq': 2, 
          
         'min_child_samples' : 23,     
         'min_child_weight': 0.001,    
         
         'reg_alpha': 0.5,      
         'reg_lambda' : 0.3,     
         'min_split_gain' : 0.0,
         'n_jobs' : -1
         }
    
    model = lgb.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500, early_stopping_rounds=500)
    """计算在验证集上的得分"""
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后lightgbm单模型在验证集上的AUC：{}'.format(roc_auc))
    """对测试集进行预测"""
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
    return test_pred

基于模型层面的模型

In [15]:
from heamy.dataset import Dataset
from heamy.estimator import Classifier

model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test)
model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False)
model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False)

使用stacking方法进行模型融合

In [16]:
from heamy.pipeline import ModelsPipeline

pipeline = ModelsPipeline(model_xgb, model_lgb)
pipeline

<heamy.pipeline.ModelsPipeline at 0x23da5a698c8>

构建第一层新特征，其中k默认是5， 表示5折交叉验证，full_test=True，对全部训练集进行训练得到基学习器，然后用基学习器对测试集预测得到新特征。

In [19]:
stack_ds = pipeline.stack(k=5, seed=2020, full_test=True)

[0]	train-auc:0.696657	eval-auc:0.696166
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.733379	eval-auc:0.728716
[400]	train-auc:0.74129	eval-auc:0.7324
[600]	train-auc:0.747017	eval-auc:0.73413
[800]	train-auc:0.751733	eval-auc:0.735151
[1000]	train-auc:0.75597	eval-auc:0.735989
[1200]	train-auc:0.759709	eval-auc:0.736376
[1400]	train-auc:0.763337	eval-auc:0.736583
[1600]	train-auc:0.766822	eval-auc:0.736894
[1800]	train-auc:0.770133	eval-auc:0.736917
[2000]	train-auc:0.773363	eval-auc:0.736932
[2200]	train-auc:0.776598	eval-auc:0.736961
Stopping. Best iteration:
[2106]	train-auc:0.775138	eval-auc:0.737024

调参后xgboost单模型在验证集上的AUC：0.7370245110462428
[0]	train-auc:0.696861	eval-auc:0.698827
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.733417	eval-auc:0.732033
[40

In [21]:
#第二层使用逻辑回归进行stack
from sklearn.linear_model import LogisticRegression as LR
LR(solver='lbfgs')
stacker = Classifier(dataset=stack_ds, estimator=LR, parameters={'solver': 'lbfgs'})

In [22]:
#测试集的预测结果
test_pred = stacker.predict()
test_pred

array([0.08887541, 0.27427777, 0.70083222, ..., 0.11649688, 0.25233517,
       0.07192994])

In [23]:
test_pred.shape

(200000,)

In [24]:
data_test['isDefault'] = test_pred
data_test[['id','isDefault']].to_csv('test_sub-09-27-stack.csv', index=False)

提交到线上，分数反而降了，emmmm。

使用blending方法进行模型融合

In [25]:
# 构建第一层新特征，将训练集切分成8:2，其中80%用于训练基学习器，20%用于构建新特征
blend_ds = pipeline.blend(proportion=0.2,seed=111)

In [26]:
# 第二层使用逻辑回归进行blend
blender = Classifier(dataset=blend_ds, estimator=LR, parameters={'solver': 'lbfgs'})
# 测试集的预测结果
test_pred_1 = blender.predict()
test_pred_1

array([0.09120743, 0.28107113, 0.67114587, ..., 0.11950613, 0.26956718,
       0.068617  ])

In [27]:
data_test['isDefault'] = test_pred
data_test[['id','isDefault']].to_csv('test_sub-09-27-blending.csv', index=False)