In [47]:
import pandas as pd
import numpy as np
from torch import nn
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim


In [76]:
data = pd.read_csv('../data/train.csv')

In [34]:
data.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [22]:
result = data.isna().sum()
result[result > 0]


Series([], dtype: int64)

In [77]:
#过滤无用列
data = data.drop(['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [78]:
numeric_cols = data.select_dtypes(include=np.number).columns.tolist()
non_numeric_cols = data.select_dtypes(exclude=np.number).columns.tolist()

fill_values = {}
for col in numeric_cols:
    if data[col].isnull().any(): # 仅对有空值的数字列计算均值
        fill_values[col] = data[col].mean()
data.fillna(fill_values, inplace=True)

for col in non_numeric_cols:
    if data[col].isnull().any(): # 仅对有空值的非数字列进行操作
        data[col].fillna(data[col].mode()[0], inplace=True)
    data = pd.get_dummies(data, columns=[col], prefix=[col])

bool_cols = data.select_dtypes(include='bool').columns.tolist()

for col in bool_cols:
    data[col] = data[col].astype(int)



In [79]:
x = data.drop(['SalePrice'], axis=1)
y = data['SalePrice']

X_numpy = x.values
y_numpy = y.values

X_tensor = torch.tensor(X_numpy, dtype=torch.float32)
y_tensor = torch.tensor(y_numpy, dtype=torch.float32)

epsilon = 1e-8 # 一个非常小的正数
X_std = X_std + epsilon # 这样可以确保分母不为0

X_mean, X_std = X_tensor.mean(dim=0), X_tensor.std(dim=0)
y_mean, y_std = y_tensor.mean(), y_tensor.std()
X_tensor = (X_tensor - X_mean) / X_std
y_tensor = (y_tensor - y_mean) / y_std  

In [80]:
print("X_std最小值:", X_std.min())  # 如果有0会导致除零错误
print("y_std:", y_std)             # 应为非零


X_std最小值: tensor(0.0262)
y_std: tensor(79442.5000)


In [81]:
batch_size = 32 # 常用批次大小
output_dim = 1


class MLPRegressor(nn.Module):
    def __init__(self, input_dim):
        super(MLPRegressor, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # 输入层到第一个隐藏层
        self.relu1 = nn.ReLU()                # 激活函数
        self.dropout1 = nn.Dropout(0.2)       # Dropout 层，用于防止过拟合

        self.fc2 = nn.Linear(128, 64)         # 第一个隐藏层到第二个隐藏层
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)

        self.fc3 = nn.Linear(64, output_dim)  # 第二个隐藏层到输出层

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)

        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)

        x = self.fc3(x)
        return x

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# 创建 Dataset 实例
train_dataset = CustomDataset(X_tensor, y_tensor)

# 创建 DataLoader 实例
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [82]:
model = MLPRegressor(X_tensor.shape[1])

criterion = nn.MSELoss() # 均方误差，回归任务常用
optimizer = optim.Adam(model.parameters(), lr=0.001) # Adam 优化器，学习率 0.001
num_epochs = 100 # 训练轮次

In [83]:
for epoch in range(num_epochs):
    model.train() # 设置模型为训练模式
    running_loss = 0.0
    for features, labels in train_loader:
        # 前向传播
        outputs = model(features)
        loss = criterion(outputs, labels.unsqueeze(1)) 

        # 反向传播和优化
        optimizer.zero_grad() # 清空梯度
        loss.backward()       # 计算梯度
        optimizer.step()      # 更新权重

        running_loss += loss.item() * features.size(0) # 累加损失

    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

Epoch [1/100], Loss: 0.4299
Epoch [2/100], Loss: 0.1613
Epoch [3/100], Loss: 0.1211
Epoch [4/100], Loss: 0.1063
Epoch [5/100], Loss: 0.0843
Epoch [6/100], Loss: 0.0830
Epoch [7/100], Loss: 0.0740
Epoch [8/100], Loss: 0.0844
Epoch [9/100], Loss: 0.0730
Epoch [10/100], Loss: 0.0671
Epoch [11/100], Loss: 0.0761
Epoch [12/100], Loss: 0.0517
Epoch [13/100], Loss: 0.0591
Epoch [14/100], Loss: 0.0546
Epoch [15/100], Loss: 0.0527
Epoch [16/100], Loss: 0.0502
Epoch [17/100], Loss: 0.0502
Epoch [18/100], Loss: 0.0468
Epoch [19/100], Loss: 0.0497
Epoch [20/100], Loss: 0.0488
Epoch [21/100], Loss: 0.0510
Epoch [22/100], Loss: 0.0503
Epoch [23/100], Loss: 0.0515
Epoch [24/100], Loss: 0.0529
Epoch [25/100], Loss: 0.0466
Epoch [26/100], Loss: 0.0396
Epoch [27/100], Loss: 0.0438
Epoch [28/100], Loss: 0.0447
Epoch [29/100], Loss: 0.0447
Epoch [30/100], Loss: 0.0402
Epoch [31/100], Loss: 0.0411
Epoch [32/100], Loss: 0.0379
Epoch [33/100], Loss: 0.0435
Epoch [34/100], Loss: 0.0409
Epoch [35/100], Loss: 0

In [84]:
test_data = pd.read_csv('../data/test.csv')
test_data = test_data.drop(['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
numeric_cols = test_data.select_dtypes(include=np.number).columns.tolist()
non_numeric_cols = test_data.select_dtypes(exclude=np.number).columns.tolist()

fill_values = {}
for col in numeric_cols:
    if test_data[col].isnull().any(): # 仅对有空值的数字列计算均值
        fill_values[col] = test_data[col].mean()
test_data.fillna(fill_values, inplace=True)

for col in non_numeric_cols:
    if test_data[col].isnull().any(): # 仅对有空值的非数字列进行操作
        test_data[col].fillna(test_data[col].mode()[0], inplace=True)
    test_data = pd.get_dummies(test_data, columns=[col], prefix=[col])

bool_cols = test_data.select_dtypes(include='bool').columns.tolist()

for col in bool_cols:
    test_data[col] = test_data[col].astype(int)


In [85]:
# 获取训练时的特征列名
train_columns = data.drop(['SalePrice'], axis=1).columns  

# 对测试集进行特征对齐
missing_cols = set(train_columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0  # 缺失的特征列补0

# 确保列顺序一致
test_data = test_data[train_columns]

  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0
  test_data[col] = 0  # 缺失的特征列补0


In [86]:
x = test_data

X_numpy = x.values
X_tensor = torch.tensor(X_numpy, dtype=torch.float32)
X_mean, X_std = X_tensor.mean(dim=0), X_tensor.std(dim=0)
epsilon = 1e-8 # 一个非常小的正数
X_std = X_std + epsilon # 这样可以确保分母不为0
X_tensor = (X_tensor - X_mean) / X_std

class TestDataset(Dataset):
    def __init__(self, features_tensor):
        self.features = features_tensor
        
    def __len__(self):
        return len(self.features)
        
    def __getitem__(self, idx):
        return self.features[idx]

test_dataset = TestDataset(X_tensor)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,  # 与训练时相同
    shuffle=False   # 测试集不需要打乱
)


In [87]:
model.eval()
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        print(batch)
        outputs = model(batch)
        # 反标准化预测结果
        preds = outputs.squeeze() * y_std + y_mean  
        all_preds.extend(preds.numpy())
        print(outputs)

# 转换为Kaggle提交格式
submission = pd.DataFrame({
    'Id': pd.read_csv('../data/test.csv')['Id'],
    'SalePrice': np.clip(all_preds, 0, None)  # 确保价格非负
})
submission.to_csv('submission.csv', index=False)

all_preds


tensor([[-0.8744,  0.5554,  0.3638,  ..., -0.1347,  0.4601, -0.2993],
        [-0.8744,  0.6040,  0.8976,  ..., -0.1347,  0.4601, -0.2993],
        [ 0.0613,  0.2636,  0.8094,  ..., -0.1347,  0.4601, -0.2993],
        ...,
        [-0.8744,  0.7499,  0.1253,  ..., -0.1347,  0.4601, -0.2993],
        [ 0.0613,  0.0690,  0.4239,  ..., -0.1347,  0.4601, -0.2993],
        [-0.6405,  0.0690, -0.0039,  ..., -0.1347,  0.4601, -0.2993]])
tensor([[-0.6547],
        [-0.2723],
        [ 0.0457],
        [ 0.3129],
        [ 0.0133],
        [ 0.0107],
        [-0.0102],
        [-0.0580],
        [ 0.0172],
        [-0.7665],
        [-0.0699],
        [-1.0192],
        [-1.0642],
        [-0.4077],
        [-0.7731],
        [ 2.7742],
        [ 1.0227],
        [ 1.0693],
        [ 1.2248],
        [ 3.8767],
        [ 2.0643],
        [ 0.2951],
        [ 0.1217],
        [-0.1802],
        [ 0.2054],
        [ 0.3934],
        [ 2.0753],
        [ 0.4187],
        [ 0.1179],
        [ 1.098

[128906.66,
 159291.17,
 184548.98,
 205775.83,
 181977.53,
 181775.02,
 180108.84,
 176311.42,
 182287.58,
 120026.414,
 175366.03,
 99953.72,
 96379.32,
 148531.3,
 119502.34,
 401309.94,
 262167.8,
 265865.12,
 278225.88,
 488897.38,
 344911.94,
 204366.1,
 190587.53,
 166608.97,
 197235.66,
 212176.34,
 345789.56,
 214184.62,
 190289.11,
 268211.38,
 194596.55,
 88953.18,
 194824.06,
 259592.14,
 267415.75,
 233564.53,
 177232.58,
 172833.52,
 167551.25,
 166131.72,
 180301.95,
 159371.0,
 321825.5,
 243923.66,
 243516.3,
 196081.81,
 254032.31,
 196526.6,
 169073.5,
 151249.39,
 153197.03,
 171228.75,
 160087.06,
 142124.08,
 183442.6,
 158267.25,
 159054.44,
 132054.17,
 216420.62,
 132512.95,
 141235.23,
 171585.72,
 121802.33,
 120286.55,
 118949.34,
 102914.6,
 113439.016,
 133719.16,
 153890.94,
 178615.34,
 126183.11,
 92184.01,
 144620.17,
 128365.28,
 153310.67,
 99982.266,
 44611.92,
 162362.1,
 212694.08,
 67267.836,
 139187.77,
 145367.36,
 200432.02,
 85103.37,
 118466