# Section 3.6.1 MLP Bagging: Training 

In [None]:
!nvidia-smi

Mon Aug 15 19:03:31 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    25W / 300W |      2MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive into the Colab runtime.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    # `google.colab` could not be imported (presumably we are not on Colab): skip mounting.
    pass

Mounted at /content/drive


In [None]:
# Switch the working directory; this assumes the notebook file sits in the project's root folder.
import os


def get_root_dir():
    """Return the directory the notebook should run from.

    Returns:
        The Colab project path when '/content/drive/MyDrive/Colab/' exists,
        otherwise './'.
    """
    colab_marker = '/content/drive/MyDrive/Colab/'
    if not os.path.exists(colab_marker):
        return './'  # running locally
    return '/content/drive/MyDrive/Colab/4-AMEX/AMEX Project/notebooks'  # running in Colab


# Equivalent to `cd`; a plain `!cd` does not work for this.
os.chdir(get_root_dir())

In [None]:
import numpy as np
import pandas as pd

In [None]:
import gc
import os
import glob
import copy
import pdb

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR

In [None]:
from sklearn.model_selection import StratifiedKFold

## 导入数据

**请注意：** 我们这里使用的数据集都是压缩后的数据集（也即一行对应一个顾客）

In [None]:
# Read the training table from the `train_fe.parquet` file into a DataFrame.
train = pd.read_parquet("../data/2-processed-demo/train_fe.parquet")

In [None]:
gc.collect() # clean up promptly

15

In [None]:
# Preview the first two rows of the loaded table.
train.head(2)

Unnamed: 0,customer_ID,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,...,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.933824,0.024194,0.86858,0.960384,0.934745,0.230769,0.83205,0,3,...,13,0,1,13,-1,1,13,6,1,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.89982,0.022119,0.861109,0.929122,0.880519,7.153846,6.743468,0,19,...,13,0,1,13,-1,1,13,6,1,0


In [None]:
# Number of model input features: the column count minus two non-feature columns
# (the markdown below names them: `customer_ID` and `target`).
feature_dim = train.shape[1] - 2

在训练数据集中，当前一共有920列，其中两列是`customer_ID`和`target`，它们不属于特征。因此我们拥有的特征数是`920-2=918`，存储在`feature_dim`中。注意`feature_dim`的取值很重要：它决定了后面神经网络输入层的维度。

## Metric

In [None]:
def amex_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the AMEX metric ``0.5 * (G + D)`` for binary labels and scores.

    ``D`` is the share of all positives found in the top 4% of cumulative row
    weight when rows are ranked by descending prediction. ``G`` is the weighted
    Gini coefficient divided by its maximum value. Negative rows carry weight
    20 and positive rows weight 1.

    Args:
        y_true: Binary labels (1 = positive). Any array-like; flattened to 1-D.
        y_pred: Prediction scores with the same number of elements as
            ``y_true``; a higher score means "more likely positive".

    Returns:
        The metric value. It is not defined when ``y_true`` contains only one
        class, because the computation then divides by zero (``n_pos`` or
        ``gini_max``); numpy yields ``nan`` rather than raising.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # Coerce to flat ndarrays so that lists, column vectors and pandas Series
    # are all indexed positionally below (a Series would otherwise be indexed by label).
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # labels reordered by descending prediction value
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

## Multilayer Perceptron 多层感知机

多层感知机其实就是简单的多层神经网络。因为这种神经网络很小，我们可以作为bagging算法的基分类器。

我们下面开始设计我们的基分类器。

### 基分类器架构

In [None]:
class DenseModel(nn.Module):
    """Small densely-connected MLP used as the base classifier.

    Every layer receives the batch-normalised input concatenated with the
    outputs of all previous hidden layers:

    * ``l1``: ``in_feats`` -> 500
    * ``l2``: ``in_feats + 500`` -> 200
    * ``l3``: ``in_feats + 700`` -> 1

    ``forward`` returns the raw output of ``l3`` with no sigmoid applied.

    Args:
        in_feats: Number of input features per sample.
        repeat: Accepted but not used anywhere in this class.
    """

    def __init__(self, in_feats, repeat=1):
        super().__init__()
        first_width, second_width = 500, 200
        self.l1 = nn.Linear(in_feats, first_width)  # produces a 500-dimensional vector
        self.l2 = nn.Linear(in_feats + first_width, second_width)
        self.l3 = nn.Linear(in_feats + first_width + second_width, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.6)
        self.bn1 = nn.BatchNorm1d(in_feats)
        # bn2 is registered (so it appears in the state_dict) but forward() never calls it.
        self.bn2 = nn.BatchNorm1d(second_width)

    def forward(self, x):
        """Map a ``(batch, in_feats)`` float tensor to ``(batch, 1)`` raw outputs."""
        normed = self.bn1(x)

        # Hidden block 1: linear -> dropout -> ReLU.
        hidden1 = self.relu(self.dropout(self.l1(normed)))

        # Hidden block 2 sees the normalised input plus hidden block 1.
        hidden2 = self.relu(self.dropout(self.l2(torch.cat([normed, hidden1], 1))))

        # Output layer sees the normalised input plus both hidden blocks.
        return self.l3(torch.cat([normed, hidden1, hidden2], 1))

### 将原始数据转化为张量

一般在深度学习中，我们都要用`Dataset`和`DataLoader`对数据进行加载，把数据转换成深度学习模型能够读取的形式。

这里由于我们的数据结构非常简单，直接用`torch.from_numpy`将其转化为张量即可

In [None]:
# Feature tensor: every column except the first and the last, missing values replaced by 0, cast to float16.
# NOTE(review): float16 cannot represent magnitudes above 65504 — confirm no feature exceeds that,
# otherwise the value becomes inf at this cast.
X = torch.from_numpy(train.iloc[:, 1:-1].fillna(0).values.astype(np.float16))
# Label tensor taken from the last column.
Y = torch.from_numpy(train.iloc[:, -1].values)

## 训练与验证

### 训练与验证函数

In [None]:
def train_and_val(model, train_loader, val_loader, loss_fn, epoch, path, optimizer=None):
    """Train ``model`` and checkpoint the weights with the best validation AMEX metric.

    After every epoch the model is scored on ``val_loader`` with ``amex_metric``.
    Whenever the score beats the best one seen so far, the model's
    ``state_dict`` is written to ``path`` (overwriting the previous checkpoint).

    Args:
        model: Network that maps a float batch to a ``(batch, 1)`` tensor of raw outputs.
        train_loader: Iterable of ``(features, labels)`` training batches.
        val_loader: Iterable of ``(features, labels)`` validation batches.
        loss_fn: Loss called as ``loss_fn(outputs[:, 0], labels.float())``.
        epoch: Number of passes over ``train_loader``.
        path: File path the best ``state_dict`` is saved to; its parent
            directory is created if missing.
        optimizer: Optimizer bound to ``model``'s parameters. When omitted, a
            fresh ``Adam(lr=0.001)`` is created, so the function no longer
            depends on a notebook-global variable named ``optimizer``.

    Returns:
        The best validation metric observed (``-inf`` if nothing was ever
        saved, e.g. ``epoch == 0`` or the metric was always ``nan``).
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Run every batch on whatever device the model lives on.
    device = next(model.parameters()).device

    # torch.save does not create missing directories.
    save_dir = os.path.dirname(path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Start below any attainable score so the first finite result is always saved
    # (the metric can be negative, so 0 is not a safe floor).
    best_metric = float("-inf")

    for _ in range(epoch):

        # --- training pass ---
        # The training-set metric was previously computed every epoch but never
        # used, so the per-batch prediction bookkeeping has been dropped.
        model.train()
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device).float()
            target = batch_y.to(device).float()

            optimizer.zero_grad()  # reset accumulated gradients
            pred = model(batch_x)  # raw model outputs
            loss = loss_fn(pred[:, 0], target)
            loss.backward()  # back-propagate
            optimizer.step()  # apply the parameter update

        # --- validation pass (used for the metric score) ---
        val_pred = []
        val_label = []
        model.eval()
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                pred = torch.sigmoid(model(batch_x.to(device).float()))
                val_pred.append(pred.cpu().numpy().reshape(-1))
                val_label.append(batch_y.cpu().numpy())

        val_metric = amex_metric(np.hstack(val_label), np.hstack(val_pred))

        # Keep only the best checkpoint.
        if best_metric < val_metric:
            best_metric = val_metric
            torch.save(model.state_dict(), path)  # persist the model weights
            print(f"当前最高的validation分数：{best_metric}")
            print(f"这个模型保存在：{path}")

    return best_metric

### 训练参数

In [None]:
epoch = 10 # number of training epochs passed to train_and_val for each base classifier
bagging = 5 # each fold produces 5 base classifiers

### 损失函数

In [None]:
loss_fn = torch.nn.BCEWithLogitsLoss() # loss function: binary cross-entropy applied to raw (pre-sigmoid) outputs

### 训练过程

在这里我们要进行分层抽样K折交叉验证。

由于我们这里使用的是Bagging袋装法，因此在每一折里面，都会有多个基分类器（每个基分类器都是相对独立的）。

假设我们要进行6折交叉验证，我规定每一折中产生5个基分类器进行bagging，那么最终一共会产生30个基分类器。我们把这些训练好的基分类器存储在本地，之后在inference的notebook中一起使用。

In [None]:
# Stratified K-fold cross-validation with bagging: inside every fold, `bagging`
# base classifiers are trained on different bootstrap samples and saved to disk.

SEED = 42                          # fixes bootstrap sampling, weight init and batch shuffling
N_SPLITS = 6                       # number of cross-validation folds
BAG_FRACTION = 0.9                 # bootstrap sample size relative to the fold's training set
TRAIN_BATCH_SIZE = 1024 * 4
VAL_BATCH_SIZE = 1024
MODEL_DIR = "../models/MLP Bagging/"  # where the trained base classifiers are stored

torch.manual_seed(SEED)
bag_rng = np.random.default_rng(SEED)

# torch.save does not create missing directories, so make sure the target exists.
os.makedirs(MODEL_DIR, exist_ok=True)

skf = StratifiedKFold(n_splits=N_SPLITS)
for fold_idx, (tr_idx, val_idx) in enumerate(skf.split(X, Y)):
    print("*"*30)
    print(f"当前是第{fold_idx}折正进行训练")
    print("*"*30)
    print(f"训练集的index: {tr_idx}")
    print(f"测试集的index: {val_idx}")

    # Wrap this fold's tensors as datasets.
    fold_tr_dataset = torch.utils.data.TensorDataset(X[tr_idx], Y[tr_idx])
    fold_val_dataset = torch.utils.data.TensorDataset(X[val_idx], Y[val_idx])

    # Validation loader: no shuffling, order is irrelevant for evaluation.
    val_loader = torch.utils.data.DataLoader(
        fold_val_dataset,
        batch_size=VAL_BATCH_SIZE,
        shuffle=False
    )

    ###############################################
    # Hook: try adding SMOTE and ENN here         #
    ###############################################
    # Note: SMOTE and ENN must never alter the sample distribution of the validation set!

    # Bagging
    n_fold_train = len(fold_tr_dataset)
    for bag_idx in range(bagging):
        # Bootstrap sample: draw 90% of the fold's training size WITH replacement
        # (np.random.choice's default), so every base classifier sees different data.
        bagging_idx = bag_rng.choice(
            n_fold_train,
            size=int(n_fold_train * BAG_FRACTION),
            replace=True
        )
        fold_tr_bagging_dataset = torch.utils.data.Subset(fold_tr_dataset, bagging_idx)

        # Training loader for this bag: shuffled every epoch.
        train_loader = torch.utils.data.DataLoader(
            fold_tr_bagging_dataset,
            batch_size=TRAIN_BATCH_SIZE,
            shuffle=True
        )

        model = DenseModel(feature_dim)  # fresh base classifier number `bag_idx` of this fold
        # The optimizer is bound to this model's parameters, so it has to be rebuilt
        # for every new model and cannot be hoisted out of the loop next to the loss function.
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Train and validate; the best checkpoint is written to the given path.
        train_and_val(model, train_loader, val_loader, loss_fn, epoch,
                      MODEL_DIR + f'dense_{fold_idx}_{bag_idx}.pt'  # model save path
                     )

        del fold_tr_bagging_dataset, train_loader, model, optimizer

******************************
当前是第0折正进行训练
******************************
训练集的index: [ 75929  75934  75941 ... 458910 458911 458912]
测试集的index: [    0     1     2 ... 76669 76671 76672]
当前最高的validation分数：0.7757768100279117
这个模型保存在：../models/MLP Bagging/dense_0_0.pt
当前最高的validation分数：0.7801554238779469
这个模型保存在：../models/MLP Bagging/dense_0_0.pt
当前最高的validation分数：0.7816355451180471
这个模型保存在：../models/MLP Bagging/dense_0_0.pt
当前最高的validation分数：0.782644721571593
这个模型保存在：../models/MLP Bagging/dense_0_0.pt
当前最高的validation分数：0.7835116165378098
这个模型保存在：../models/MLP Bagging/dense_0_0.pt
当前最高的validation分数：0.7835623752549576
这个模型保存在：../models/MLP Bagging/dense_0_0.pt
当前最高的validation分数：0.7747949250155286
这个模型保存在：../models/MLP Bagging/dense_0_1.pt
当前最高的validation分数：0.7790239024204417
这个模型保存在：../models/MLP Bagging/dense_0_1.pt
当前最高的validation分数：0.7812827356075449
这个模型保存在：../models/MLP Bagging/dense_0_1.pt
当前最高的validation分数：0.7824218830761709
这个模型保存在：../models/MLP Bagging/dense_0_1.pt
当前最高的validation

In [None]:
# Drop the full feature/label tensors now that training is done, then run the garbage collector.
del X, Y;
gc.collect()

22