In [38]:
# !pip install pandas
# !pip install d2lzh

%matplotlib inline
import d2lzh as d2l 
from mxnet import autograd, gluon, init, nd 
from mxnet.gluon import data as gdata, loss as gloss, nn 
import numpy as np
import pandas as pd

import pickle

# Load X_2020
with open('X_2020.pkl', 'rb') as file:
    X_2020 = pickle.load(file)
    print("success")

# Load y_2020
with open('y_2020.pkl', 'rb') as file:
    y_2020 = pickle.load(file)
    print("success")


success
success


In [83]:
print(X_2020.shape)
print(y_2020.shape)

(584844, 50)
(584844,)


In [84]:
# 导入所需库
import mxnet as mx
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import nn, Trainer
from mxnet.gluon.data import ArrayDataset, DataLoader
import numpy as np
import pandas as pd

# 定义多层感知机模型
class MLP(nn.Block):
    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        with self.name_scope():
            self.dense1 = nn.Dense(64, activation='relu')
            self.dense2 = nn.Dense(32, activation='relu')
            self.dense3 = nn.Dense(1, activation='sigmoid')  # 输出层使用sigmoid激活函数，输出概率值

    def forward(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        return self.dense3(x)

# 定义RMSE作为损失函数
def rmse_loss(output, label):
    return nd.sqrt(nd.mean((output - label)**2))

# 定义交叉验证函数
def k_fold_cross_validation(k, X, y, num_epochs, learning_rate, weight_decay, batch_size):
    assert k > 1
    fold_size = len(X) // k
    train_losses, val_losses = [], []
    accuracies = []

    for i in range(k):
        # 准备数据
        X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
        train_dataset = ArrayDataset(X_train, y_train)
        train_loader = DataLoader(train_dataset, batch_size, shuffle=True)

        val_dataset = ArrayDataset(X_valid, y_valid)
        val_loader = DataLoader(val_dataset, batch_size, shuffle=False)

        # 初始化模型
        net = MLP()
        net.initialize(init.Xavier(), ctx=mx.cpu())

        # 定义损失函数和优化器
        criterion = rmse_loss
        trainer = Trainer(net.collect_params(), 'adam', {'learning_rate': learning_rate, 'wd': weight_decay})

        # 训练模型
        for epoch in range(num_epochs):
            train_loss = 0.
            for data, label in train_loader:
                with autograd.record():
                    output = net(data)
                    loss = criterion(output, label)
                loss.backward()
                trainer.step(batch_size)
                train_loss += loss.mean().asscalar()
            train_loss /= len(train_loader)
            train_losses.append(train_loss)

            # 在验证集上计算损失
            val_loss = 0.
            for data, label in val_loader:
                output = net(data)
                loss = criterion(output, label)
                val_loss += loss.mean().asscalar()
            val_loss /= len(val_loader)
            val_losses.append(val_loss)

            # 计算验证集准确率
            accuracy = calculate_accuracy(net, X_valid, y_valid)
            accuracies.append(accuracy)

            print(f"Fold [{i+1}/{k}], Epoch [{epoch+1}/{num_epochs}], Train RMSE: {train_loss:.4f}, Val RMSE: {val_loss:.4f}, Val Accuracy: {accuracy:.4f}")

    return train_losses, val_losses, accuracies

# 计算准确率
def calculate_accuracy(net, features, labels):
    preds = net(features)
    preds = preds > 0.5
    accuracy = (preds == labels.reshape(preds.shape)).mean().asscalar()
    return accuracy

# 获取K折交叉验证数据
def get_k_fold_data(k, i, X, y):
    fold_size = len(X) // k
    X_train = nd.concat(X[0:i * fold_size], X[(i + 1) * fold_size:], dim=0)
    y_train = nd.concat(y[0:i * fold_size], y[(i + 1) * fold_size:], dim=0)
    return X_train, y_train, X[i * fold_size: (i + 1) * fold_size], y[i * fold_size: (i + 1) * fold_size]


# 调用交叉验证函数进行训练
k = 5
num_epochs = 50
learning_rate = 0.001
weight_decay = 0.001
batch_size = 128

train_losses, val_losses, accuracies = k_fold_cross_validation(k, X_2020, y_2020, num_epochs, learning_rate, weight_decay, batch_size)

# 打印最终结果
print(f"Average Train RMSE: {np.mean(train_losses):.4f}, Average Val RMSE: {np.mean(val_losses):.4f}, Average Val Accuracy: {np.mean(accuracies):.4f}")


AssertionError: Positional arguments must have NDArray type, but got Empty DataFrame
Columns: [BMI, PhysicalHealth, MentalHealth, SleepTime, Smoking_No, Smoking_Yes, AlcoholDrinking_No, AlcoholDrinking_Yes, Stroke_No, Stroke_Yes, DiffWalking_No, DiffWalking_Yes, Sex_Female, Sex_Male, AgeCategory_18-24, AgeCategory_25-29, AgeCategory_30-34, AgeCategory_35-39, AgeCategory_40-44, AgeCategory_45-49, AgeCategory_50-54, AgeCategory_55-59, AgeCategory_60-64, AgeCategory_65-69, AgeCategory_70-74, AgeCategory_75-79, AgeCategory_80 or older, Race_American Indian/Alaskan Native, Race_Asian, Race_Black, Race_Hispanic, Race_Other, Race_White, Diabetic_No, Diabetic_No, borderline diabetes, Diabetic_Yes, Diabetic_Yes (during pregnancy), PhysicalActivity_No, PhysicalActivity_Yes, GenHealth_Excellent, GenHealth_Fair, GenHealth_Good, GenHealth_Poor, GenHealth_Very good, Asthma_No, Asthma_Yes, KidneyDisease_No, KidneyDisease_Yes, SkinCancer_No, SkinCancer_Yes]
Index: []

[0 rows x 50 columns]

In [None]:
import matplotlib.pyplot as plt

# 假设 train_losses 和 val_losses 已经在你的代码中记录好了
# 训练过程中的 epoch 数量
epochs = range(1, num_epochs + 1)

# 创建绘图
plt.figure(figsize=(10, 6))

# 绘制训练集 RMSE 曲线（使用蓝色实线）
plt.plot(epochs, train_losses, 'b-o', label='Train RMSE')

# 绘制验证集 RMSE 曲线（使用橙色实线）
plt.plot(epochs, val_losses, 'r-o', label='Val RMSE')

# 设置图表标题和标签
plt.xlabel('Epoch')
plt.ylabel('RMSE')
plt.title('Training and Validation RMSE')

# 添加图例
plt.legend()

# 添加网格线
plt.grid(True)

# 自动调整布局
plt.tight_layout()

# 显示图表
plt.show()


In [78]:

loss = gloss.L2Loss()

def get_net():
    net = nn.Sequential()
    
    # 第一层
    net.add(nn.Dense(14, activation='relu'))  
    # 第二层 
    net.add(nn.Dense(4, activation='relu'))  
    # 输出层
    net.add(nn.Dense(1))  # 输出层, 1个神经元
    net.initialize()
    return net

import mxnet.ndarray as nd

def binary_cross_entropy(preds, labels):
    # 将预测值进行限制，避免出现 log(0)
    preds = nd.clip(preds, 1e-10, 1 - 1e-10)
    return -nd.mean(labels * nd.log(preds) + (1 - labels) * nd.log(1 - preds))

def log_bce(net, features, labels):
   
    preds = net(features)  
    
    # 计算损失
    bce_loss = binary_cross_entropy(preds, labels)
    return bce_loss.asscalar()  

def calculate_accuracy(net, features, labels):
    # 将预测值二值化，使用0.5作为阈值
    preds = net(features)
    predicted_labels = (preds > 0.5).astype(labels.dtype)
    # 计算准确率
    return nd.mean(predicted_labels == labels).asscalar()


In [79]:
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls, accura_ls= [], [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(
        train_features, train_labels), batch_size, shuffle=True)
    # print("train_iter",train_iter)
    # 这里使用了Adam优化算法
    trainer = gluon.Trainer(net.collect_params(), 'adam', {
        'learning_rate': learning_rate, 'wd': weight_decay})
    # print("trainer:",trainer)
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_bce(net, train_features, train_labels))
        accura_ls.append(calculate_accuracy(net, train_features, train_labels))
        # print("train_ls in loop:",train_ls)
        if test_labels is not None:
            test_ls.append(log_bce(net, test_features, test_labels))
        # print("train_ls1:",train_ls)
    return train_ls, test_ls,accura_ls


In [80]:
import mxnet.ndarray as nd
import numpy as np

def get_k_fold_data(k, i, X, y):
    assert k > 1
    X = nd.array(X)
    y = nd.array(y)

    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx], y[idx]
        
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            print("X_train type",type(X_train))
            print("X_part type",type(X_part))
            X_train = nd.concat([X_train, X_part], dim=0)
            y_train = nd.concat([y_train, y_part], dim=0)
    
    return X_train, y_train, X_valid, y_valid


In [81]:
import mxnet.ndarray as nd
import numpy as np
import pandas as pd

def get_k_fold_data(k, i, X, y):
    assert k > 1
    X = nd.array(X.values)  # 将DataFrame转换为ndarray
    y = nd.array(y.values)  # 将DataFrame转换为ndarray

    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx], y[idx]
        
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = nd.concat([X_train, X_part], dim=0)
            y_train = nd.concat([y_train, y_part], dim=0)
    
    return X_train, y_train, X_valid, y_valid


In [82]:

k, num_epochs, lr, weight_decay, batch_size = 5, 220, 0.00005, 0, 32
train_l, valid_l, accuracy_l = k_fold(k, X_2020, y_2020, num_epochs, lr,
                          weight_decay, batch_size)
print('%d-fold validation: avg train loss %f, avg valid loss %f, accuracy %f'
      % (k, train_l, valid_l, accuracy_l))
      

AssertionError: Positional arguments must have NDArray type, but got [
[[-0.7686169  -0.4240698  -0.49003857 ...  0.          1.
   0.        ]
 [-1.1493541  -0.4240698  -0.49003857 ...  0.          1.
   0.        ]
 [-0.5058768  -0.4240698  -0.49003857 ...  0.          0.
   1.        ]
 ...
 [ 0.47113892 -0.4240698   3.2810688  ...  0.          1.
   0.        ]
 [ 1.3081315  -0.4240698  -0.49003857 ...  0.          0.
   1.        ]
 [-0.00714252  0.3305677  -0.49003857 ...  0.          1.
   0.        ]]
<NDArray 116968x50 @cpu(0)>, 
[[-0.40203938 -0.4240698  -0.49003857 ...  0.          1.
   0.        ]
 [-0.03074194 -0.4240698  -0.49003857 ...  0.          1.
   0.        ]
 [ 0.7716381  -0.4240698  -0.49003857 ...  0.          1.
   0.        ]
 ...
 [ 1.1382157  -0.4240698  -0.49003857 ...  0.          1.
   0.        ]
 [ 0.12501419 -0.4240698   0.5155901  ...  0.          1.
   0.        ]
 [ 1.1193361   2.0913885   1.3955151  ...  0.          1.
   0.        ]]
<NDArray 116968x50 @cpu(0)>]

# MLP finding
感觉准确性不是很高，也有可能是因为网络设计或者手动调参数的问题，这个太容易梯度消失了，lr大了容易梯度消失，小了容易loss变大，试一下Random forest + GridSearchCV


In [None]:
import pandas as pd
import numpy as np
from mxnet import nd, autograd, gluon

def get_k_fold_data(k, i, X, y):
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part = nd.array(X.iloc[idx].values)  # Convert DataFrame to NDArray
        y_part = nd.array(y.iloc[idx].values)  # Convert DataFrame to NDArray
        
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = nd.concat(X_train, X_part, dim=0)  # Concatenate NDArray
            y_train = nd.concat(y_train, y_part, dim=0)  # Concatenate NDArray
            
    return X_train, y_train, X_valid, y_valid

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls, accura_ls = [], [], []
    train_iter = gluon.data.DataLoader(gluon.data.ArrayDataset(train_features, train_labels), 
                                       batch_size, shuffle=True)
    
    trainer = gluon.Trainer(net.collect_params(), 'adam', 
                            {'learning_rate': learning_rate, 'wd': weight_decay})
    
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = gloss.L2Loss()(net(X), y)
            l.backward()
            trainer.step(batch_size)
    
        train_ls.append(log_rmse(net, train_features, train_labels))
        accura_ls.append(calculate_accuracy(net, train_features, train_labels))
        
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels)) # type: ignore
    
    return train_ls, test_ls, accura_ls

def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum, accura_l_sum = 0, 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls, accura_ls = train(net, *data, num_epochs, learning_rate, weight_decay, batch_size)
        
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        accura_l_sum += accura_ls[-1]
        
        if i == 0:
            d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                         range(1, num_epochs + 1), valid_ls,
                         ['train', 'valid'])
        
        print('fold %d, train loss %f, valid loss %f, accuracy %f'
              % (i, train_ls[-1], valid_ls[-1], accura_ls[-1]))
    
    return train_l_sum / k, valid_l_sum / k, accura_l_sum / k

# 之后调用 k_fold 函数
k, num_epochs, lr, weight_decay, batch_size = 5, 220, 0.05, 0, 160
X_2020 = X_2020.astype(np.float32)
y_2020 = y_2020.astype(np.float32)
train_l, valid_l, accuracy_l = k_fold(k, X_2020, y_2020, num_epochs, lr, 
                                      weight_decay, batch_size)
print('%d-fold validation: avg train loss %f, avg valid loss %f, accuracy %f'
      % (k, train_l, valid_l, accuracy_l))

MXNetError: Traceback (most recent call last):
  File "../src/c_api/c_api_ndarray.cc", line 59
MXNetError: Check failed: inp->shape().Size() < (int64_t{1} << 31) - 1 (218904208384 vs. 2147483647) : [SetNDInputsOutputs] Size of tensor you are trying to allocate is larger than 2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1