In [None]:
import pandas as pd
import numpy as np
from torch import nn
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim


In [None]:
# Read ../data/train.csv (path relative to the working directory) into a DataFrame.
data = pd.read_csv('../data/train.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Count the missing entries in every column, then show only the columns
# that contain at least one missing value.
result = data.isnull().sum()
result.loc[result.gt(0)]


In [None]:
# Filter out the columns that are not used as features.
data = data.drop(columns=['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature'])
data.head()

In [None]:
# Impute missing values and one-hot encode the non-numeric columns of `data`.
# Numeric columns are filled with their mean; non-numeric columns are filled
# with their most frequent value and then expanded into indicator columns.
numeric_cols = data.select_dtypes(include=np.number).columns.tolist()
non_numeric_cols = data.select_dtypes(exclude=np.number).columns.tolist()

# Mean of each numeric column that actually contains missing values.
fill_values = {
    col: data[col].mean()
    for col in numeric_cols
    if data[col].isnull().any()
}

# Most frequent value of each non-numeric column that contains missing values.
mode_values = {
    col: data[col].mode()[0]
    for col in non_numeric_cols
    if data[col].isnull().any()
}

# Fill through the frame itself instead of `data[col].fillna(..., inplace=True)`:
# that chained in-place call works on a temporary Series, raises a FutureWarning
# on pandas 2.x and leaves `data` unchanged when Copy-on-Write is enabled.
data = data.fillna({**fill_values, **mode_values})

# Encode every non-numeric column in a single call. The resulting column order
# matches encoding them one at a time: untouched columns first, then the
# indicator columns in the order of `non_numeric_cols`.
if non_numeric_cols:
    data = pd.get_dummies(data, columns=non_numeric_cols)

# get_dummies may return boolean indicators; store them as 0/1 integers.
bool_cols = data.select_dtypes(include='bool').columns.tolist()
if bool_cols:
    data = data.astype({col: int for col in bool_cols})



In [None]:
# Split the frame into features and target, convert both to float32 tensors
# and standardise them to zero mean / unit variance.
x = data.drop(['SalePrice'], axis=1)
y = data['SalePrice']

X_numpy = x.values
y_numpy = y.values

X_tensor = torch.tensor(X_numpy, dtype=torch.float32)
y_tensor = torch.tensor(y_numpy, dtype=torch.float32)

epsilon = 1e-8  # tiny positive constant that keeps the denominator non-zero

# The statistics have to exist before epsilon can be added to them; adding
# epsilon first raised a NameError on a fresh kernel and, with a stale value in
# memory, the adjustment was overwritten by the assignment that followed.
X_mean, X_std = X_tensor.mean(dim=0), X_tensor.std(dim=0)
X_std = X_std + epsilon
y_mean, y_std = y_tensor.mean(), y_tensor.std()

X_tensor = (X_tensor - X_mean) / X_std
y_tensor = (y_tensor - y_mean) / y_std

In [None]:
# Sanity-check the scaling statistics before they are relied on further.
print("X_std最小值:", X_std.min())  # a zero here would cause a division-by-zero error
print("y_std:", y_std)             # should be non-zero


In [None]:
batch_size = 32 # commonly used mini-batch size
output_dim = 1  # width of the network's final linear layer


class MLPRegressor(nn.Module):
    """Fully connected regressor with two hidden ReLU layers and dropout.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of values produced per sample. Defaults to 1. It is
            an explicit argument so the class no longer relies on a
            module-level ``output_dim`` variable being defined first.
        hidden_dims: Widths of the first and second hidden layers.
        dropout: Dropout probability applied after each hidden activation.
    """

    def __init__(self, input_dim, output_dim=1, hidden_dims=(128, 64), dropout=0.2):
        super().__init__()
        first_width, second_width = hidden_dims

        # Input layer -> first hidden layer.
        self.fc1 = nn.Linear(input_dim, first_width)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)  # regularisation against overfitting

        # First hidden layer -> second hidden layer.
        self.fc2 = nn.Linear(first_width, second_width)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        # Second hidden layer -> output layer.
        self.fc3 = nn.Linear(second_width, output_dim)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, output_dim)`` predictions."""
        x = self.dropout1(self.relu1(self.fc1(x)))
        x = self.dropout2(self.relu2(self.fc2(x)))
        return self.fc3(x)

class CustomDataset(Dataset):
    """Dataset that pairs every feature row with its label.

    Args:
        features: Indexable collection of per-sample features.
        labels: Indexable collection of per-sample labels, same length as
            ``features``.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        # Fail at construction time; otherwise a mismatch only shows up as an
        # IndexError in the middle of an epoch, or as silently unused labels.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]

# Wrap the standardised training tensors in a Dataset ...
train_dataset = CustomDataset(features=X_tensor, labels=y_tensor)

# ... and serve them as shuffled mini-batches.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Seed PyTorch's global generator before anything random happens, so weight
# initialisation, DataLoader shuffling and dropout repeat from run to run.
torch.manual_seed(42)

model = MLPRegressor(X_tensor.shape[1])

criterion = nn.MSELoss() # mean squared error, the usual loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.001) # Adam optimizer, learning rate 0.001
num_epochs = 100 # number of passes over the training set

In [None]:
for epoch in range(num_epochs):
    model.train()  # switch the model to training mode
    running_loss = 0.0

    for features, labels in train_loader:
        # Start every step from clean gradients.
        optimizer.zero_grad()

        # Forward pass; the labels gain a trailing dimension before the loss.
        outputs = model(features)
        loss = criterion(outputs, labels.unsqueeze(1))

        # Backward pass followed by the weight update.
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss weighted by the number of samples in it.
        running_loss += features.size(0) * loss.item()

    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

In [None]:
# Load ../data/test.csv and apply the same cleaning steps used for the training
# frame: drop the unused columns, impute missing values, one-hot encode.
test_data = pd.read_csv('../data/test.csv')
test_data = test_data.drop(['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
numeric_cols = test_data.select_dtypes(include=np.number).columns.tolist()
non_numeric_cols = test_data.select_dtypes(exclude=np.number).columns.tolist()

# NOTE(review): the fill values below are computed from the test set itself,
# not from the training data; confirm that this is intended.

# Mean of each numeric column that actually contains missing values.
fill_values = {
    col: test_data[col].mean()
    for col in numeric_cols
    if test_data[col].isnull().any()
}

# Most frequent value of each non-numeric column that contains missing values.
mode_values = {
    col: test_data[col].mode()[0]
    for col in non_numeric_cols
    if test_data[col].isnull().any()
}

# Fill through the frame itself instead of
# `test_data[col].fillna(..., inplace=True)`: that chained in-place call works
# on a temporary Series, raises a FutureWarning on pandas 2.x and leaves
# `test_data` unchanged when Copy-on-Write is enabled.
test_data = test_data.fillna({**fill_values, **mode_values})

# Encode every non-numeric column in a single call; the column order matches
# encoding them one at a time.
if non_numeric_cols:
    test_data = pd.get_dummies(test_data, columns=non_numeric_cols)

# get_dummies may return boolean indicators; store them as 0/1 integers.
bool_cols = test_data.select_dtypes(include='bool').columns.tolist()
if bool_cols:
    test_data = test_data.astype({col: int for col in bool_cols})


In [None]:
# Feature columns the model was trained on (everything except the target).
train_columns = data.drop(['SalePrice'], axis=1).columns

# Training columns absent from the test frame; kept so they can be inspected.
missing_cols = set(train_columns) - set(test_data.columns)

# Align the test frame with the training layout in one step: absent columns are
# created and filled with 0, columns unknown to the training frame are dropped,
# and the column order follows `train_columns`. A single reindex avoids the
# fragmentation (and PerformanceWarning) caused by inserting columns one by one.
test_data = test_data.reindex(columns=train_columns, fill_value=0)

In [None]:
# Convert the aligned test frame into a float32 tensor.
x = test_data

X_numpy = x.values
X_tensor = torch.tensor(X_numpy, dtype=torch.float32)

epsilon = 1e-8 # tiny positive constant that keeps the denominator non-zero

# Standardise with the statistics computed on the TRAINING features (`X_mean`,
# `X_std`). Recomputing them on the test set would feed the model inputs on a
# different scale than it was trained on, and would also overwrite the
# training statistics. clamp() only guards against a zero standard deviation;
# values that are already >= epsilon are left unchanged.
X_tensor = (X_tensor - X_mean) / X_std.clamp(min=epsilon)

class TestDataset(Dataset):
    """Label-free dataset that yields one feature row per index."""

    def __init__(self, features_tensor):
        """Keep a reference to the features to serve."""
        self.features = features_tensor

    def __len__(self):
        """Return how many samples are stored."""
        sample_count = len(self.features)
        return sample_count

    def __getitem__(self, idx):
        """Return the features stored at position ``idx``."""
        sample = self.features[idx]
        return sample

# Serve the test features in their original order.
test_dataset = TestDataset(features_tensor=X_tensor)

# batch_size matches the value used for training; no shuffling for the test set.
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [None]:
model.eval()  # evaluation mode: dropout is disabled
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        outputs = model(batch)
        # Undo the target standardisation. squeeze(1) removes only the output
        # dimension; a bare squeeze() would also collapse a final batch of
        # size 1 into a 0-d tensor, which extend() cannot iterate over.
        preds = outputs.squeeze(1) * y_std + y_mean
        all_preds.extend(preds.numpy())

# Build the Kaggle submission file: one row per test Id.
submission = pd.DataFrame({
    'Id': pd.read_csv('../data/test.csv', usecols=['Id'])['Id'],
    'SalePrice': np.clip(all_preds, 0, None)  # keep predicted prices non-negative
})
submission.to_csv('submission.csv', index=False)

# Show a short preview instead of dumping every prediction.
submission.head()
