In [127]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split  # 用于切分数据
from sklearn.preprocessing import StandardScaler     # 用于数据标准化
import torch.optim as optim

print("torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


torch version: 2.5.1
CUDA available: True


极简读数据

In [128]:

# 1. 读数据
train = pd.read_csv("data/train.csv")

# 2. 只挑几个数值型特征（先简单）
features = ["GrLivArea", "OverallQual", "GarageCars"]  # 居住面积、房屋质量、车库容量
X = train[features].values
y = train["SalePrice"].values  
# .values取出表格中的纯数据，变成一个NumPy数组
print(X)
print(y)

# 3. 转换为 PyTorch 张量
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)  # 调整成 (N,1) 形状
                                                             # 也可以用.reshape(-1,1) 或 .unsqueeze(1)

print("X_tensor shape:", X_tensor.shape)
print("y_tensor shape:", y_tensor.shape)

[[1710    7    2]
 [1262    6    2]
 [1786    7    2]
 ...
 [2340    7    1]
 [1078    5    1]
 [1256    5    1]]
[208500 181500 223500 ... 266500 142125 147500]
X_tensor shape: torch.Size([1460, 3])
y_tensor shape: torch.Size([1460, 1])


建立一个简单模型

In [129]:
# 输入特征数（我们之前选了3个：GrLivArea, OverallQual, GarageCars）
input_dim = X_tensor.shape[1]
output_dim = 1  # 预测 SalePrice 是单个值

# 定义线性回归模型
model = nn.Linear(input_dim, output_dim)

print(model)

Linear(in_features=3, out_features=1, bias=True)


切分训练集和验证集，标准化特征

In [None]:
# 1) 取出原始 X / y（沿用你前面挑的三个特征）
features = ["GrLivArea", "OverallQual", "GarageCars"]
X = train[features].values
y = train["SalePrice"].values

# 对数变换（log1p 避免 log(0)）
y_log = np.log1p(y)  # log1p就是log(1+x)
print("原始 SalePrice 前 5 个:", y[:5])
print("取对数后的前 5 个:", y_log[:5])

# 2) 切分训练/验证集（例如 80%/20%）
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# random_state=42 固定随机种子，保证每次运行切分结果一致（可复现）


# 3) 只用训练集拟合 StandardScaler，然后分别变换训练集与验证集
scaler = StandardScaler()  # 创建一个标准化器对象。默认 with_mean=True, with_std=True，按列（特征维）做标准化
X_tr = scaler.fit_transform(X_tr)
# fit：只在训练集 X_tr 上计算每一列的 mean_ 与 scale_（等于标准差，sqrt(var_)）
# 并存到对象里（属性：scaler.mean_, scaler.scale_）。
# transform：用刚学到的 mean_ 和 scale_ 立刻把 X_tr 变换为标准化后的数据。
# 合起来的 fit_transform：一步做完，返回的是已标准化的训练集（还是 NumPy 数组）

X_val = scaler.transform(X_val)
# 用训练集的均值与标准差把验证集做同样的线性变换，模拟真实部署场景。

# 4) 转成张量
X_tr = torch.tensor(X_tr, dtype=torch.float32)
y_tr = torch.tensor(y_tr, dtype=torch.float32).reshape(-1, 1)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1)

print("X_tr:", X_tr.shape, "y_tr:", y_tr.shape)
print("X_val:", X_val.shape, "y_val:", y_val.shape)

原始 SalePrice 前 5 个: [208500 181500 223500 140000 250000]
取对数后的前 5 个: [12.24769912 12.10901644 12.31717117 11.84940484 12.4292202 ]
X_tr: torch.Size([1168, 3]) y_tr: torch.Size([1168, 1])
X_val: torch.Size([292, 3]) y_val: torch.Size([292, 1])


定义损失函数与优化器

In [131]:
# 损失函数：均方误差
loss_func = nn.MSELoss()

# 优化器：Adam
optimizer = optim.Adam(model.parameters(), lr=0.01)

print(loss_func)
print(optimizer)

MSELoss()
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)


训练循环

In [132]:
num_epochs = 100
for epoch in range(num_epochs):
    # 前向传播
    y_pred = model(X_tr)
    loss = loss_func(y_pred, y_tr)

    # 反向传播 + 参数更新
    optimizer.zero_grad()   # 梯度清零（否则会累积）
    loss.backward()         # 反向传播
    optimizer.step()        # 更新参数

    # 每 10 轮打印一次训练损失
    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [10/100], Loss: 38885494784.0000
Epoch [20/100], Loss: 38885425152.0000
Epoch [30/100], Loss: 38885359616.0000
Epoch [40/100], Loss: 38885289984.0000
Epoch [50/100], Loss: 38885216256.0000
Epoch [60/100], Loss: 38885150720.0000
Epoch [70/100], Loss: 38885081088.0000
Epoch [80/100], Loss: 38885015552.0000
Epoch [90/100], Loss: 38884941824.0000
Epoch [100/100], Loss: 38884876288.0000


误差太大，重新定义模型和优化器

In [133]:
# 1) 定义一个最小 MLP：输入 -> 隐藏层(32) -> ReLU -> 输出(1)
input_dim = X_tr.shape[1]  # 目前是 3
model = nn.Sequential(
    nn.Linear(input_dim, 32),
    nn.ReLU(),
    nn.Linear(32, 1)
)

# 2) 损失函数 & 优化器（学习率调小一些，更稳）
loss_func = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.15)

# 3) 训练循环（打印训练集 loss）
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    y_pred = model(X_tr)
    loss = loss_func(y_pred, y_tr)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 20 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}]  Train MSE: {loss.item():.2f}")

# 4) 简单验证：计算验证集的 RMSE（均方根误差）
model.eval()
with torch.no_grad():
    y_val_pred = model(X_val)
    val_mse = loss_func(y_val_pred, y_val).item()
    val_rmse = val_mse ** 0.5
print(f"Validation RMSE: {val_rmse:.2f}")

Epoch [20/100]  Train MSE: 38748897280.00
Epoch [40/100]  Train MSE: 38096265216.00
Epoch [60/100]  Train MSE: 36687028224.00
Epoch [80/100]  Train MSE: 34448416768.00
Epoch [100/100]  Train MSE: 31444934656.00
Validation RMSE: 178298.41
