In [222]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split  # 用于切分数据
from sklearn.preprocessing import StandardScaler     # 用于数据标准化
import torch.optim as optim
from sklearn.impute import SimpleImputer   # 引入缺失值填充器

# Report the installed PyTorch version and whether a CUDA device is visible.
print(f"torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

torch version: 2.5.1
CUDA available: True


In [223]:
# 1) Load the training table.
train = pd.read_csv("data/train.csv")

# 2) Quick look at its size; the recorded run shows (1460, 81).
print(f"train shape: {train.shape}")
print(f"columns count: {train.shape[1]}")

# 3) The target column must exist before anything else is done; then summarise it.
assert "SalePrice" in train.columns, "找不到 SalePrice 列，请确认路径与文件。"
print("SalePrice describe:", train["SalePrice"].describe(), sep="\n")

train shape: (1460, 81)
columns count: 81
SalePrice describe:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


In [224]:
# Regression target: log(1 + SalePrice) as a NumPy array, one entry per training row.
y = np.log1p(train["SalePrice"].to_numpy())
print(y.shape)

(1460,)


数值特征挑选

In [225]:
# Keep every numeric column except the target ("SalePrice") and the row
# identifier (any column whose lower-cased name is "id").
num_cols = [
    col
    for col in train.select_dtypes(include="number").columns
    if col != "SalePrice" and col.lower() != "id"
]

# Feature table limited to those columns; .copy() keeps it independent of `train`.
X_df = train.loc[:, num_cols].copy()

print(f"数值特征数量: {len(num_cols)}")
print(f"前 5 个数值特征: {num_cols[:5]}")
print(f"X_df shape: {X_df.shape}")

数值特征数量: 36
前 5 个数值特征: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond']
X_df shape: (1460, 36)


切分数据

In [226]:

# Hold out 20% of the rows for validation; random_state is fixed at 42.
# The feature parts stay as DataFrames at this stage.
X_tr_df, X_val_df, y_tr, y_val = train_test_split(
    X_df,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"训练集形状: {X_tr_df.shape}")
print(f"验证集形状: {X_val_df.shape}")
print(f"训练集目标 shape: {y_tr.shape}")
print(f"验证集目标 shape: {y_val.shape}")

训练集形状: (1168, 36)
验证集形状: (292, 36)
训练集目标 shape: (1168,)
验证集目标 shape: (292,)


缺失值填充

In [227]:
# 1) Column-wise median imputation.
#    The median is preferred over the mean because it is less sensitive to outliers.
#    After fitting, the learned per-column fill values live in imputer.statistics_.
imputer = SimpleImputer(strategy="median")

# Learn the medians from the training split only (so nothing leaks from the
# validation rows), then fill NaNs in both splits with those same medians.
# transform() returns NumPy arrays: column names are gone, shapes are unchanged.
imputer.fit(X_tr_df)
X_tr_imp = imputer.transform(X_tr_df)
X_val_imp = imputer.transform(X_val_df)

# Row and column counts should match the inputs.
print(f"填充后形状（训练/验证）: {X_tr_imp.shape} {X_val_imp.shape}")

# 2) Sanity check: count any NaNs that survived imputation.
n_tr_nan = np.sum(np.isnan(X_tr_imp))
n_val_nan = np.sum(np.isnan(X_val_imp))

print(f"训练集 NaN 数: {n_tr_nan}")
print(f"验证集 NaN 数: {n_val_nan}")

填充后形状（训练/验证）: (1168, 36) (292, 36)
训练集 NaN 数: 0
验证集 NaN 数: 0


标准化

In [228]:
# 1) Learn per-feature mean and standard deviation from the training split only,
#    then apply that same scaling to both splits.
scaler = StandardScaler().fit(X_tr_imp)
X_tr_scaled = scaler.transform(X_tr_imp)
X_val_scaled = scaler.transform(X_val_imp)

# 2) Sanity check: the scaled training features should have mean ~0 and std ~1.
print("训练集均值（前 5 个特征）:", np.mean(X_tr_scaled, axis=0)[:5])
print("训练集标准差（前 5 个特征）:", np.std(X_tr_scaled, axis=0)[:5])

训练集均值（前 5 个特征）: [ 6.99592591e-17 -2.00752657e-16  2.28128019e-17 -5.17090176e-17
 -2.28128019e-16]
训练集标准差（前 5 个特征）: [1. 1. 1. 1. 1.]


将数据转为张量

In [229]:
# 1) Convert the scaled feature matrices to float32 tensors.
X_tr = torch.tensor(X_tr_scaled, dtype=torch.float32)
X_val = torch.tensor(X_val_scaled, dtype=torch.float32)

# The targets are the log-price arrays produced by the split; add a trailing
# axis so each target is a column vector matching the network's single output.
y_tr_t = torch.tensor(y_tr, dtype=torch.float32).unsqueeze(1)
y_val_t = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)

# 2) Print shapes as a cross-check.
print(f"X_tr: {X_tr.shape}   y_tr: {y_tr_t.shape}")
print(f"X_val: {X_val.shape}   y_val: {y_val_t.shape}")

X_tr: torch.Size([1168, 36])   y_tr: torch.Size([1168, 1])
X_val: torch.Size([292, 36])   y_val: torch.Size([292, 1])


定义网络

In [230]:
# Seed PyTorch's global RNG right before the layers are created so the random
# weight initialisation (and therefore the training curve and validation score)
# is the same on every Restart & Run All. 42 matches the random_state used for
# the train/validation split.
torch.manual_seed(42)

# Input width = number of feature columns in the training tensor (36 in the recorded run).
input_dim = X_tr.shape[1]

# Small fully-connected regressor: input_dim -> 128 -> 64 -> 1.
# The single output is the predicted log1p(SalePrice).
model = nn.Sequential(
    nn.Linear(input_dim, 128),
    nn.Sigmoid(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 1)
)

print(model)


Sequential(
  (0): Linear(in_features=36, out_features=128, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=1, bias=True)
)


选损失 & 优化器，并先训练 1000 轮试水

In [231]:
# 1) Loss function and optimiser.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 2) Full-batch training loop: every epoch is one forward/backward pass over
#    the whole training tensor. The loop runs num_epochs (1000) times.
num_epochs = 1000
model.train()  # nothing inside the loop leaves training mode, so set it once
for epoch in range(num_epochs):
    optimizer.zero_grad()               # clear gradients left from the previous step
    y_pred = model(X_tr)                # predicted log1p(SalePrice)
    loss = criterion(y_pred, y_tr_t)
    loss.backward()
    optimizer.step()

    # Progress line every 20th epoch (epochs are reported 1-based).
    if epoch % 20 == 19:
        print(f"Epoch [{epoch+1}/{num_epochs}]  Train MSE(log): {loss.item():.4f}")

# 3) Validation: RMSE in log space, computed without tracking gradients.
model.eval()
with torch.no_grad():
    val_pred_log = model(X_val)
    val_mse_log = criterion(val_pred_log, y_val_t).item()
val_rmse_log = val_mse_log ** 0.5

print(f"Validation RMSE (log space): {val_rmse_log:.4f}")

Epoch [20/1000]  Train MSE(log): 79.1418
Epoch [40/1000]  Train MSE(log): 18.8444
Epoch [60/1000]  Train MSE(log): 0.0493
Epoch [80/1000]  Train MSE(log): 0.2807
Epoch [100/1000]  Train MSE(log): 0.0348
Epoch [120/1000]  Train MSE(log): 0.0186
Epoch [140/1000]  Train MSE(log): 0.0174
Epoch [160/1000]  Train MSE(log): 0.0163
Epoch [180/1000]  Train MSE(log): 0.0158
Epoch [200/1000]  Train MSE(log): 0.0154
Epoch [220/1000]  Train MSE(log): 0.0152
Epoch [240/1000]  Train MSE(log): 0.0149
Epoch [260/1000]  Train MSE(log): 0.0147
Epoch [280/1000]  Train MSE(log): 0.0145
Epoch [300/1000]  Train MSE(log): 0.0143
Epoch [320/1000]  Train MSE(log): 0.0141
Epoch [340/1000]  Train MSE(log): 0.0139
Epoch [360/1000]  Train MSE(log): 0.0137
Epoch [380/1000]  Train MSE(log): 0.0136
Epoch [400/1000]  Train MSE(log): 0.0134
Epoch [420/1000]  Train MSE(log): 0.0133
Epoch [440/1000]  Train MSE(log): 0.0131
Epoch [460/1000]  Train MSE(log): 0.0130
Epoch [480/1000]  Train MSE(log): 0.0129
Epoch [500/1000]  

用全量训练集重拟合并训练最终模型

In [232]:
# Rebuild the feature table and log1p target from ALL labelled rows
# (no validation hold-out this time).
y_all = np.log1p(train['SalePrice'].values)
num_cols = train.select_dtypes(include='number').columns.drop(['SalePrice'], errors='ignore')
num_cols = [c for c in num_cols if c.lower() != 'id']
X_all_df = train[num_cols].copy()

# 1) Refit the imputer and scaler on the full training table (test data is not
#    involved here). NOTE: this rebinds `imputer` and `scaler`, replacing the
#    objects that were fitted on the 80% training split earlier in the notebook.
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()
X_all_imp = imputer.fit_transform(X_all_df)
X_all_scaled = scaler.fit_transform(X_all_imp)

# 2) Convert to float32 tensors; the target becomes a column vector.
X_all = torch.tensor(X_all_scaled, dtype=torch.float32)
y_all_t = torch.tensor(y_all, dtype=torch.float32).reshape(-1, 1)

# 3) Build a fresh network with the same architecture as the validated one.
#    Seed the RNG first so the weight initialisation, and therefore the
#    submission produced from this model, is reproducible across re-runs.
#    NOTE: this rebinds `model`, `criterion` and `optimizer`.
torch.manual_seed(42)
model = nn.Sequential(
    nn.Linear(X_all.shape[1], 128),
    nn.Sigmoid(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 1)
)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 4) Full-batch training. Deliberately fewer epochs than the validation run,
#    just to get the pipeline working end to end.
# NOTE(review): the validation score above was measured after 1000 epochs, so it
# does not describe this 400-epoch model exactly; align the two before relying on it.
model.train()
for epoch in range(400):
    pred = model(X_all)
    loss = criterion(pred, y_all_t)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


生成提交文件submission.csv


In [233]:
# 1) Load test.csv and run it through the same columns and the already-fitted
#    imputer and scaler. Indexing with num_cols keeps the training column order.
test = pd.read_csv("data/test.csv")
X_test_df = test.loc[:, num_cols].copy()
X_test_imp = imputer.transform(X_test_df)
X_test_scaled = scaler.transform(X_test_imp)
X_test = torch.tensor(X_test_scaled, dtype=torch.float32)

# 2) Predict in log space, then undo the log1p transform to get prices back.
model.eval()
with torch.no_grad():
    test_pred_log = model(X_test).numpy().ravel()
test_pred_price = np.expm1(test_pred_log)   # expm1 is the inverse of log1p

# 3) Write the submission file: two columns (Id, SalePrice), no index column.
submission = test[["Id"]].assign(SalePrice=test_pred_price)
submission.to_csv("submission.csv", index=False)
print(submission.head())


     Id      SalePrice
0  1461  120867.835938
1  1462  168977.875000
2  1463  177859.125000
3  1464  197292.109375
4  1465  175688.781250
