In [6]:
import torch
import pandas as pd
import torch.nn as nn
from torch.nn import functional as F
from d2l import torch as d2l
from tqdm import tqdm
import numpy as np
from torch.utils import data

# Checkpoint interval in epochs: train() saves the model whenever epoch % NUM_SAVE == 0
# (skipping epoch 0) and once more on the final epoch.
NUM_SAVE = 50
# Text description of the layer widths; not read by any code visible in this notebook
# (presumably a label for the MLP architecture defined below -- TODO confirm).
net_list = "in->256->64"

class MLP(nn.Module):
    """Fully connected regressor: in_features -> 256 -> 64 -> 1.

    Both hidden layers are followed by ReLU; the output layer is linear.
    The attribute names (layer1, layer2, out) are the state_dict keys, so
    they must stay stable for saved checkpoints to load.
    """

    def __init__(self, in_features):
        """Build the three linear layers; `in_features` is the input width."""
        super().__init__()
        first_width, second_width = 256, 64
        self.layer1 = nn.Linear(in_features, first_width)
        self.layer2 = nn.Linear(first_width, second_width)
        self.out = nn.Linear(second_width, 1)

    def forward(self, X):
        """Run X through both hidden layers and return the single-unit output."""
        activations = X
        for hidden_layer in (self.layer1, self.layer2):
            activations = F.relu(hidden_layer(activations))
        return self.out(activations)

In [7]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
print("train_data and test_data shape",train_data.shape,test_data.shape)

# Drop the redundant columns from both frames.
redundant_cols = ['Address', 'Summary', 'City', 'State']
train_data = train_data.drop(columns=redundant_cols)
test_data = test_data.drop(columns=redundant_cols)

# Preprocessing: compress the wide-ranged columns with log(1 + x).
# The label 'Sold Price' is not in this list, so it stays on its raw scale and
# every listed column is transformed in both frames (the old per-column
# `!= 'Sold Price'` guard was always true).
large_vel_cols = ['Lot', 'Total interior livable area', 'Tax assessed value', 'Annual tax amount', 'Listed Price', 'Last Sold Price']
for c in large_vel_cols:
    train_data[c] = np.log1p(train_data[c])
    test_data[c] = np.log1p(test_data[c])

# Stack train and test features after removing the id; train also loses its label.
# The slices are positional: they skip the first two columns of train and the
# first column of test.
all_features = pd.concat((train_data.iloc[:,2:],test_data.iloc[:,1:]))

# Parse the date columns into datetime values.
all_features['Listed On'] = pd.to_datetime(all_features['Listed On'], format="%Y-%m-%d")
all_features['Last Sold On'] = pd.to_datetime(all_features['Last Sold On'], format="%Y-%m-%d")

train_data and test_data shape (47439, 41) (31626, 40)


In [8]:
# Print, for every object-typed column, how many distinct values it holds
# (missing values count as one distinct value).
object_columns = all_features.dtypes[all_features.dtypes=='object'].index
for column_name in object_columns:
    distinct_count = len(all_features[column_name].unique())
    print(column_name.ljust(20), distinct_count)

Type                 174
Heating              2659
Cooling              910
Parking              9912
Bedrooms             278
Region               1259
Elementary School    3568
Middle School        809
High School          922
Flooring             1739
Heating features     1762
Cooling features     595
Appliances included  11289
Laundry features     3030
Parking features     9694


In [14]:
# Step 1: backward-fill missing values along the rows, then set anything still missing to 0.
all_features = all_features.bfill(axis=0).fillna(0)

# Columns with dtype float64 are treated as the numeric features.
numeric_features = all_features.dtypes[all_features.dtypes == 'float64'].index

def _zscore(col):
    """Standardize one column to zero mean and unit standard deviation.

    A column whose standard deviation is zero (or undefined) is only centred:
    dividing by it would fill the column with NaN and poison the model inputs.
    """
    spread = col.std()
    return (col - col.mean()) / (spread if spread > 0 else 1.0)

# Step 2: standardize the numeric features column by column.
all_features[numeric_features] = all_features[numeric_features].apply(_zscore)

In [15]:
# Keep every numeric column plus the categorical columns 'Type' and 'Bedrooms',
# which have comparatively few categories ('Cooling features' was considered too).
features = [*numeric_features, 'Type', 'Bedrooms']
all_features = all_features[features]

In [16]:
# One-hot encode the remaining categorical columns (with an extra indicator
# column for missing values) and report the frame shape before and after.
shape_before = all_features.shape
print('before one hot code', shape_before)
all_features = pd.get_dummies(data=all_features, dummy_na=True)
print('after one hot code', all_features.shape)

before one hot code (79065, 19)
after one hot code (79065, 470)


In [17]:
# Cast every column to float32 so the frame converts directly into float32 tensors.
all_features = all_features.astype(np.float32)

In [18]:
# The first n_train rows of all_features came from train_data, the rest from test_data.
n_train = train_data.shape[0]
feature_matrix = all_features.values

# All tensors are created as float32.
train_features = torch.tensor(feature_matrix[:n_train], dtype=torch.float32)
print('train feature shape:', train_features.shape)

test_features = torch.tensor(feature_matrix[n_train:], dtype=torch.float32)
print('test feature shape:', test_features.shape)

# Labels are reshaped into a single column to match the network's output.
label_column = train_data['Sold Price'].values.reshape(-1, 1)
train_labels = torch.tensor(label_column, dtype=torch.float32)
print('train label shape:', train_labels.shape)

train feature shape: torch.Size([47439, 470])
test feature shape: torch.Size([31626, 470])
train label shape: torch.Size([47439, 1])


In [22]:
# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# repeat across "Restart Kernel -> Run All". Some GPU kernels may still be
# non-deterministic.
SEED = 42
torch.manual_seed(SEED)

# Mean squared error is the training objective and is also reused inside log_rmse.
criterion = nn.MSELoss()
# Input width of the network = number of feature columns.
in_features = train_features.shape[1]
net = MLP(in_features).to(device)

def load_array(data_arrays, batch_size, is_train=True):  #@save
    """Wrap aligned tensors in a mini-batch DataLoader.

    Args:
        data_arrays: sequence of tensors sharing the same first dimension.
        batch_size: number of samples per mini-batch.
        is_train: when True the samples are reshuffled on every pass.

    Returns:
        A DataLoader that yields one tuple of tensors per batch.
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(tensor_dataset, batch_size=batch_size, shuffle=is_train)
    return loader

def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Uses the module-level `device` and `criterion`. Predictions below 1 are
    raised to 1 before the logarithm to keep it numerically stable.

    Returns:
        The metric as a Python float.
    """
    # Inputs and labels must live on the same device as the computation.
    features = features.to(device)
    labels = labels.to(device)
    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        raw_preds = net(features)
        clipped_preds = torch.clamp(raw_preds, min=1, max=float('inf'))
        mse_of_logs = criterion(torch.log(clipped_preds), torch.log(labels))
        rmse = torch.sqrt(mse_of_logs)
    return rmse.item()

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train `net` with Adam on the module-level `criterion` and track log-RMSE.

    Args:
        net: model to optimise; it is moved to the module-level `device`.
        train_features, train_labels: full training tensors.
        test_features, test_labels: optional held-out tensors. Pass None for
            both to skip held-out evaluation.
        num_epochs: number of passes over the training data.
        learning_rate, weight_decay: Adam hyper-parameters.
        batch_size: mini-batch size for the training loader.

    Returns:
        (train_ls, test_ls): per-epoch log-RMSE on the training set and on the
        held-out set. test_ls is empty when test_labels is None.

    Side effects:
        Saves net.state_dict() to 'checkpoint_<epoch>' every NUM_SAVE epochs
        (except epoch 0) and on the last epoch, printing the training log-RMSE.
    """
    # Keep the model on the same device the batches are moved to. The previous
    # hard-coded 'cuda:0' crashed on CPU-only machines.
    net.to(device)
    train_ls, test_ls = [], []
    train_iter = load_array((train_features, train_labels), batch_size)
    # Adam optimiser
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    for epoch in tqdm(range(num_epochs)):
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = net(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
        record_loss = log_rmse(net, train_features, train_labels)
        train_ls.append(record_loss)
        # Record the held-out metric only when held-out labels were supplied.
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
        if (epoch%NUM_SAVE==0 and epoch!=0) or (epoch==num_epochs-1):
            torch.save(net.state_dict(),'checkpoint_'+str(epoch))
            print('save checkpoints on:', epoch, 'rmse loss value is:', record_loss)
    return train_ls, test_ls

# Training hyper-parameters, in order: epochs, Adam learning rate, Adam weight decay, mini-batch size.
num_epochs, lr, weight_decay, batch_size = 500, 0.005, 0.05, 256
# Show the layer structure of the model that is about to be trained.
print("network:",net)


network: MLP(
  (layer1): Linear(in_features=470, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=1, bias=True)
)


In [23]:
# Train on the full training set; None is passed for both held-out arguments,
# so no validation data is supplied to train().
train_ls, valid_ls = train(net, train_features,train_labels,None,None, num_epochs, lr, weight_decay, batch_size)


 10%|█         | 51/500 [00:32<04:47,  1.56it/s]

save checkpoints on: 50 rmse loss value is: 0.418620765209198


 20%|██        | 101/500 [01:05<04:15,  1.56it/s]

save checkpoints on: 100 rmse loss value is: 0.33292049169540405


 30%|███       | 151/500 [01:37<03:43,  1.56it/s]

save checkpoints on: 150 rmse loss value is: 0.3121003806591034


 40%|████      | 201/500 [02:09<03:02,  1.63it/s]

save checkpoints on: 200 rmse loss value is: 0.30113697052001953


 50%|█████     | 251/500 [02:40<02:39,  1.56it/s]

save checkpoints on: 250 rmse loss value is: 0.2916157841682434


 60%|██████    | 301/500 [03:12<02:07,  1.56it/s]

save checkpoints on: 300 rmse loss value is: 0.2532415986061096


 70%|███████   | 351/500 [03:45<01:36,  1.55it/s]

save checkpoints on: 350 rmse loss value is: 0.23507462441921234


 80%|████████  | 401/500 [04:17<01:04,  1.54it/s]

save checkpoints on: 400 rmse loss value is: 0.23701271414756775


 90%|█████████ | 451/500 [04:50<00:31,  1.54it/s]

save checkpoints on: 450 rmse loss value is: 0.2359466254711151


100%|██████████| 500/500 [05:21<00:00,  1.55it/s]

save checkpoints on: 499 rmse loss value is: 0.23833008110523224





In [26]:
# Make sure the model is on the configured device (GPU when available, else CPU).
net.to(device)

# The test features must live on the same device as the model.
test_features = test_features.to(device)

# Predict without tracking gradients: inference needs no autograd graph.
with torch.no_grad():
    preds = net(test_features)

# Move the predictions back to the CPU and convert them to a NumPy array.
preds = preds.cpu().numpy()


In [27]:
# Reformat the predictions for export to Kaggle: one 'Sold Price' per test 'Id'.
flat_preds = preds.reshape(-1)
test_data['Sold Price'] = pd.Series(flat_preds)
submission = test_data[['Id', 'Sold Price']]
submission.to_csv('submission.csv', index=False)