# Section 3.6.2 MLP Bagging: Inference

In [2]:
!nvidia-smi

Tue Aug 16 01:39:31 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    43W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Mount Google Drive when running on Colab; anywhere else this cell is a no-op.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, i.e. we are not on Colab: nothing to mount.
    pass
else:
    # Deliberately outside the `try`: an ImportError raised while mounting
    # should surface instead of being mistaken for "not running on Colab".
    drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Set the working directory. Assumes the notebook file lives in the project's
# notebooks folder, so the relative paths used below resolve from there.
import os


def get_root_dir():
    """Return the directory the notebook should run from.

    When the Colab Drive folder ``/content/drive/MyDrive/Colab/`` exists, the
    project's notebooks folder on Drive is returned; otherwise the current
    directory (``'./'``) is returned for local runs.
    """
    running_on_colab = os.path.exists('/content/drive/MyDrive/Colab/')
    if not running_on_colab:
        return './'  # local run
    return '/content/drive/MyDrive/Colab/4-AMEX/AMEX Project/notebooks'  # on Colab


# Python-level equivalent of `cd` (per the original note, a bare `!cd` does not work here).
os.chdir(get_root_dir())

In [5]:
import numpy as np
import pandas as pd

In [6]:
import os
import copy
import glob, gc

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## 导入数据

In [8]:
# Load the processed test-set features from parquet. Later cells treat the
# first column as `customer_ID` and every remaining column as a model input.
test = pd.read_parquet("../data/2-processed-demo/test_fe.parquet")

In [9]:
# Preview the first three rows of the test features.
test.head(3)

Unnamed: 0,customer_ID,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.601387,0.02019,0.56893,0.631315,0.56893,2.222222,3.527668,0,8,...,1,9,3,2,9,-1,1,9,6,2
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.862166,0.031436,0.794469,0.913501,0.841177,5.076923,6.034091,0,17,...,1,13,0,1,13,-1,1,13,6,1
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.748955,0.061456,0.673112,0.835114,0.697522,6.0,9.0,0,23,...,1,13,3,2,13,1,1,13,4,2


In [10]:
# (number of rows, number of columns) of the test frame.
test.shape

(924621, 919)

**强调：** 即使是压缩后的数据，我们的样本数量也是惊人地多。测试集中的数据量有92万多条。

## 多层感知机模型

将我们训练过程中的多层感知机模型复制过来。这是我们的基分类器模型。

In [11]:
class DenseModel(nn.Module):
    """Three-layer MLP in which every layer also sees all earlier activations.

    The batch-normalised input is concatenated with the outputs of the
    preceding hidden layers before each linear layer:

        in_feats                -> l1 -> 500
        in_feats + 500          -> l2 -> 200
        in_feats + 500 + 200    -> l3 -> 1 (raw logit, no sigmoid applied)

    Args:
        in_feats: number of input features per sample.
        repeat: accepted but not used anywhere in this class.

    Notes:
        ``bn2`` is registered but never called in ``forward``. It is kept
        (presumably so that checkpoints saved from the training notebook keep
        matching this module's ``state_dict`` keys -- TODO confirm).
    """

    def __init__(self, in_feats, repeat=1):
        super().__init__()
        hidden1, hidden2 = 500, 200
        self.l1 = nn.Linear(in_feats, hidden1)                      # -> 500-dim
        self.l2 = nn.Linear(in_feats + hidden1, hidden2)            # -> 200-dim
        self.l3 = nn.Linear(in_feats + hidden1 + hidden2, 1)        # -> single logit
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.6)
        self.bn1 = nn.BatchNorm1d(in_feats)
        self.bn2 = nn.BatchNorm1d(hidden2)  # unused in forward, see class docstring

    def forward(self, x):
        """Map a ``(batch, in_feats)`` float tensor to ``(batch, 1)`` logits."""
        normed = self.bn1(x)

        # Dropout is applied before the ReLU, exactly as in the trained models.
        h1 = self.relu(self.dropout(self.l1(normed)))
        h2 = self.relu(self.dropout(self.l2(torch.cat((normed, h1), dim=1))))

        return self.l3(torch.cat((normed, h1, h2), dim=1))

## Inference

利用我们放在 `models/MLP Bagging` 目录下训练好的30个基分类器，对测试集数据进行预测。

### 获取训练好的模型

In [16]:
# Collect every trained base-model checkpoint. glob returns matches in
# arbitrary order, so sort them to get the same model order on every run.
paths = sorted(glob.glob('../models/MLP Bagging/*.pt'))

### 推理过程

In [17]:
# Score the whole test set with every base model.
# Result: `tta_test_pred` is a list with one 1-D array per checkpoint in
# `paths`, each holding the sigmoid probability for every test row.
if not paths:
    # Fail fast: with no models the loop below would do nothing and the
    # averaging step afterwards would have nothing to average.
    raise FileNotFoundError("no model checkpoints found in `paths`")

BATCH_SIZE = 40960              # rows per forward pass; the test set is too large for one pass
n_features = test.shape[1] - 1  # every column except customer_ID

# Load all base models up front so each batch of test data only has to be
# converted to a tensor once, instead of once per model.
models = []
for path in paths:
    print(f"当前使用的模型：{path}")
    model = DenseModel(n_features)
    # map_location="cpu" lets checkpoints saved from a GPU load on a CPU-only
    # machine. NOTE: torch.load unpickles the file, so only load trusted checkpoints.
    model.load_state_dict(torch.load(path, map_location="cpu"))
    model.eval()  # inference mode: dropout off, batch norm uses running statistics
    models.append(model)

batch_preds = [[] for _ in models]  # batch_preds[i] collects the outputs of models[i]
with torch.no_grad():  # no autograd graph is needed for inference
    for start in range(0, len(test), BATCH_SIZE):
        # Convert this slice of rows to a float32 tensor (missing values -> 0).
        # iloc clamps the end of the slice, so the last batch may be shorter.
        batch_data = test.iloc[start:start + BATCH_SIZE, 1:].fillna(0).values.astype(np.float32)
        batch_data = torch.from_numpy(batch_data)

        # Run every model on the same batch.
        for model, preds in zip(models, batch_preds):
            preds.append(torch.sigmoid(model(batch_data)).numpy())

# Stitch the (batch, 1) outputs of each model into one flat vector per model.
tta_test_pred = [np.vstack(preds).reshape(-1) for preds in batch_preds]
del model, models, batch_preds

当前使用的模型：../models/MLP Bagging/dense_0_0.pt
当前使用的模型：../models/MLP Bagging/dense_0_1.pt
当前使用的模型：../models/MLP Bagging/dense_0_2.pt
当前使用的模型：../models/MLP Bagging/dense_0_3.pt
当前使用的模型：../models/MLP Bagging/dense_0_4.pt
当前使用的模型：../models/MLP Bagging/dense_1_0.pt
当前使用的模型：../models/MLP Bagging/dense_1_1.pt
当前使用的模型：../models/MLP Bagging/dense_1_2.pt
当前使用的模型：../models/MLP Bagging/dense_1_3.pt
当前使用的模型：../models/MLP Bagging/dense_1_4.pt
当前使用的模型：../models/MLP Bagging/dense_2_0.pt
当前使用的模型：../models/MLP Bagging/dense_2_1.pt
当前使用的模型：../models/MLP Bagging/dense_2_2.pt
当前使用的模型：../models/MLP Bagging/dense_2_3.pt
当前使用的模型：../models/MLP Bagging/dense_2_4.pt
当前使用的模型：../models/MLP Bagging/dense_3_0.pt
当前使用的模型：../models/MLP Bagging/dense_3_1.pt
当前使用的模型：../models/MLP Bagging/dense_3_2.pt
当前使用的模型：../models/MLP Bagging/dense_3_3.pt
当前使用的模型：../models/MLP Bagging/dense_3_4.pt
当前使用的模型：../models/MLP Bagging/dense_4_0.pt
当前使用的模型：../models/MLP Bagging/dense_4_1.pt
当前使用的模型：../models/MLP Bagging/dense_4_2.pt
当前使用的模型：../

In [18]:
# Bagging step: average the per-model probabilities row by row and write the
# submission file.
if len(tta_test_pred) == 0:
    # np.mean over an empty list yields NaN with only a warning, and pandas
    # would broadcast that scalar into an all-NaN `prediction` column.
    raise ValueError("tta_test_pred is empty; run the inference cell with at least one model")

sub = pd.DataFrame({'customer_ID': test['customer_ID'].values,
                    'prediction': np.mean(tta_test_pred, axis=0)})  # mean over all models
os.makedirs('../results', exist_ok=True)  # to_csv does not create a missing folder
sub.to_csv('../results/3.6-MLP-Bagging-submission.csv', index=False)

In [19]:
# Display the submission frame (customer_ID and averaged prediction).
sub

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.016745
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.000894
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.045843
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.261950
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.855230
...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,0.007820
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,0.789221
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,0.357902
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,0.333148
