In [None]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
from torch.nn import functional as F
from tqdm import tqdm
from torch.utils import data
import wandb

# Checkpoint interval in epochs (read by the checkpoint condition inside train()).
NUM_SAVE = 50
# Short text summary of the layer widths; stored as the "network" entry of the wandb config.
net_list = "in->256->64"

class MLP(nn.Module):
    """Fully connected regressor: in_features -> 256 -> 64 -> 1.

    A ReLU follows each of the two hidden layers; the single output unit is
    left linear.
    """

    def __init__(self, in_features):
        """Create the three linear layers.

        Args:
            in_features: width of the input feature vector.
        """
        super().__init__()
        self.layer1 = nn.Linear(in_features, 256)
        self.layer2 = nn.Linear(256, 64)
        self.out = nn.Linear(64, 1)

    def forward(self, X):
        """Map a batch of feature rows ``X`` to one prediction per row."""
        activations = X
        # Both hidden layers share the same "linear then ReLU" pattern.
        for hidden_layer in (self.layer1, self.layer2):
            activations = F.relu(hidden_layer(activations))
        return self.out(activations)

In [None]:
# Use the first CUDA GPU when one is usable, otherwise fall back to the CPU.
# torch.cuda.is_available has to be *called*: the bare function object is always
# truthy, which would select "cuda:0" even on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Read both CSV splits from the local ./house_price directory.
test_data = pd.read_csv('./house_price/test.csv')
train_data = pd.read_csv('./house_price/train.csv')
print("train_data and test_data shape", train_data.shape, test_data.shape)
# Recorded result: train_data and test_data shape (1460, 81) (1459, 80)
type(train_data)
# Recorded result: pandas.core.frame.DataFrame

# Look at the first four features, the last two features and the label of the first four rows
print(train_data.iloc[:4, [0, 1, 2, 3, -3, -2, -1]])
# Drop the leading id column (it carries no predictive information) and, for the
# training set, the trailing label column; then stack training and test features together
all_features = pd.concat([train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]])
print(all_features.shape)


## 数据预处理

首先将所有缺失的值替换为相应特征的平均值；
为了将所有特征放在一个共同的尺度上，我们通过将特征重新缩放到零均值和单位方差来标准化数据；
标准化数据的两个原因：
1. 方便优化
2. 因为不知道哪些特征是相关的，所以不想让惩罚分配给一个特征的系数比分配给其他任何特征的系数更大。



In [None]:
# If the test data were not available, the mean and standard deviation could be
# computed from the training data alone.
# Select the numeric columns by what they are instead of by "dtype != object":
# the negative test would also pick up any other non-numeric dtype (category,
# datetime, dedicated string dtypes) and then fail or mis-scale it below.
numeric_features = all_features.select_dtypes(include='number').columns
print(numeric_features)
# Rescale every numeric column to zero mean and unit variance.
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# After standardization every column mean is zero, so missing values can be set to 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)


### 接下来处理离散值
用独热编码来替换它们，例如，“MSZoning”包含值“RL”和“RM”。 我们将创建两个新的指示器特征“MSZoning_RL”和“MSZoning_RM”，其值为0或1。 根据独热编码，如果“MSZoning”的原始值为“RL”， 则：“MSZoning_RL”为1，“MSZoning_RM”为0。 pandas软件包会自动为我们实现这一点。

In [None]:
# dummy_na=True treats "na" (a missing value) as a valid category and creates an
# indicator column for it.
# dtype=float keeps the indicator columns floating point. Without it, recent pandas
# releases produce bool indicators; mixed with the float columns, `.values` then
# yields an object array that torch.tensor() cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
all_features.shape  # the feature count grows from 79 to 331


In [None]:
# Finally, pull the NumPy arrays out of the pandas objects and convert them to
# float32 tensors for training. The first n_train rows of all_features are the
# training rows; the remainder are the test rows.
n_train = len(train_data)
train_features = torch.tensor(all_features.iloc[:n_train].to_numpy(), dtype=torch.float32)
test_features = torch.tensor(all_features.iloc[n_train:].to_numpy(), dtype=torch.float32)
train_labels = torch.tensor(train_data['SalePrice'].to_numpy().reshape(-1, 1), dtype=torch.float32)

## 训练


In [33]:
# Mean-squared-error loss; used for the gradient updates and inside log_mse.
criterion = nn.MSELoss()
# The input width equals the number of columns of the prepared feature matrix.
in_features = train_features.size(1)
net = MLP(in_features)
net = net.to(device)

def load_array(data_arrays,batch_size,is_train=True): #@save
    """Build a PyTorch ``DataLoader`` over one or more aligned tensors.

    Args:
        data_arrays: iterable of tensors (e.g. features and labels) that are
            zipped together sample by sample.
        batch_size: number of samples per mini-batch.
        is_train: when true, the samples are reshuffled on every pass.

    Returns:
        A ``DataLoader`` yielding one tuple of batched tensors per step.
    """
    # Pair up the i-th entry of every tensor so they are batched together.
    paired_samples = data.TensorDataset(*data_arrays)
    return data.DataLoader(paired_samples, batch_size=batch_size, shuffle=is_train)


# Measure the error on the logarithm of the predicted price
def log_mse(net,features,labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Predictions below 1 are raised to 1 before the logarithm is taken so the
    value stays finite. The computation runs with autograd disabled because
    the result is only reported, never back-propagated, and it always uses the
    mean squared error regardless of which training loss is configured.

    Args:
        net: callable model mapping ``features`` to predictions.
        features: input tensor passed to ``net``.
        labels: target tensor; its logarithm is compared with the
            log-predictions.

    Returns:
        The metric as a Python float.
    """
    with torch.no_grad():
        # To keep the logarithm stable, values smaller than 1 are set to 1
        clipped_preds = torch.clamp(net(features), min=1.0)
        rmse = torch.sqrt(F.mse_loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` with Adam on ``criterion`` and record the log-RMSE per epoch.

    The model stays on the module-level ``device`` for the whole run; the
    evaluation tensors are copied there once instead of moving the model to
    the CPU and back after every epoch.

    Args:
        net: model to optimize (moved to ``device``).
        train_features, train_labels: training tensors.
        test_features, test_labels: optional held-out tensors; when either is
            ``None`` no held-out evaluation is done.
        num_epochs: number of passes over the training set.
        learning_rate, weight_decay: Adam hyper-parameters.
        batch_size: mini-batch size passed to ``load_array``.

    Returns:
        ``(train_ls, test_ls)``: per-epoch ``log_mse`` values on the training
        data and on the held-out data (``test_ls`` is empty when no held-out
        data was supplied).

    Side effects:
        Logs to the active wandb run and finishes it; writes
        ``checkpoint_<epoch>`` files (CPU tensors) to the working directory
        every ``NUM_SAVE`` epochs and after the final epoch.
    """
    wandb.watch(net)
    train_ls, test_ls = [], []
    train_iter = load_array((train_features, train_labels), batch_size)

    net.to(device)
    # Adam is the optimization algorithm used here
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # One-time copies used for the per-epoch metric.
    eval_train = (train_features.to(device), train_labels.to(device))
    eval_test = None
    if test_features is not None and test_labels is not None:
        eval_test = (test_features.to(device), test_labels.to(device))

    for epoch in tqdm(range(num_epochs)):  # tqdm progress bar
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = net(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

        # The metric is only reported, so no autograd graph is needed.
        with torch.no_grad():
            record_loss = log_mse(net, *eval_train)
            if eval_test is not None:
                test_ls.append(log_mse(net, *eval_test))
        wandb.log({'loss': record_loss, 'epoch': epoch})
        train_ls.append(record_loss)

        if (epoch % NUM_SAVE == 0 and epoch != 0) or (epoch == num_epochs - 1):
            # Store CPU copies so the checkpoint can be loaded without a GPU.
            checkpoint = net.state_dict()
            for key, tensor in list(checkpoint.items()):
                checkpoint[key] = tensor.cpu()
            torch.save(checkpoint, 'checkpoint_' + str(epoch))
            print('save checkpoints on:', epoch, 'rmse loss value is:', record_loss)
    wandb.finish()
    return train_ls, test_ls

        
# Hyper-parameters for this run (k is assigned here but not read by the visible cells).
k = 5
num_epochs = 2000
lr = 0.005
weight_decay = 0.05
batch_size = 256

# Open the wandb run and record the hyper-parameters next to the architecture summary.
wandb.init(
    project="kaggle_predict",
    config=dict(
        learning_rate=lr,
        weight_decay=weight_decay,
        batch_size=batch_size,
        total_run=num_epochs,
        network=net_list,
    ),
)
print("network:", net)
    

network: MLP(
  (layer1): Linear(in_features=331, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=1, bias=True)
)


In [34]:
# Train on the complete training set; no held-out tensors are passed (None, None).
train_ls, valid_ls = train(net, train_features,train_labels,None,None, num_epochs, lr, weight_decay, batch_size)

# Reuse the freshly trained net on the CPU, where test_features was created
net.to('cpu')
# Apply the network to the test set. This is inference only, so autograd is
# switched off instead of recording a graph and detaching it afterwards.
with torch.no_grad():
    preds = net(test_features).numpy()

# Reformat the predictions for export to Kaggle
test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
submission.to_csv('submission.csv', index=False)

  3%|██▏                                                                             | 54/2000 [00:02<01:23, 23.27it/s]

save checkpoints on: 50 rmse loss value is: 0.14847400784492493


  5%|████                                                                           | 102/2000 [00:04<01:28, 21.43it/s]

save checkpoints on: 100 rmse loss value is: 0.1325608193874359


  8%|██████                                                                         | 155/2000 [00:07<01:22, 22.46it/s]

save checkpoints on: 150 rmse loss value is: 0.12705157697200775


 10%|████████                                                                       | 203/2000 [00:09<01:26, 20.88it/s]

save checkpoints on: 200 rmse loss value is: 0.12509186565876007


 13%|█████████▉                                                                     | 253/2000 [00:12<01:44, 16.78it/s]

save checkpoints on: 250 rmse loss value is: 0.12375984340906143


 15%|████████████                                                                   | 304/2000 [00:14<01:19, 21.37it/s]

save checkpoints on: 300 rmse loss value is: 0.12193087488412857


 18%|██████████████                                                                 | 355/2000 [00:17<01:17, 21.14it/s]

save checkpoints on: 350 rmse loss value is: 0.11951719224452972


 20%|███████████████▉                                                               | 404/2000 [00:19<01:39, 16.11it/s]

save checkpoints on: 400 rmse loss value is: 0.11718633025884628


 23%|█████████████████▉                                                             | 453/2000 [00:22<01:21, 19.04it/s]

save checkpoints on: 450 rmse loss value is: 0.1138957291841507


 25%|███████████████████▊                                                           | 503/2000 [00:25<01:29, 16.66it/s]

save checkpoints on: 500 rmse loss value is: 0.11092941462993622


 28%|█████████████████████▊                                                         | 553/2000 [00:28<01:21, 17.83it/s]

save checkpoints on: 550 rmse loss value is: 0.1081109419465065


 30%|███████████████████████▊                                                       | 602/2000 [00:31<01:35, 14.67it/s]

save checkpoints on: 600 rmse loss value is: 0.10472119599580765


 33%|█████████████████████████▊                                                     | 654/2000 [00:34<01:25, 15.67it/s]

save checkpoints on: 650 rmse loss value is: 0.10182551294565201


 35%|███████████████████████████▋                                                   | 702/2000 [00:37<01:37, 13.31it/s]

save checkpoints on: 700 rmse loss value is: 0.0989571362733841


 38%|█████████████████████████████▊                                                 | 754/2000 [00:41<01:20, 15.53it/s]

save checkpoints on: 750 rmse loss value is: 0.09605111181735992


 40%|███████████████████████████████▊                                               | 805/2000 [00:44<00:57, 20.76it/s]

save checkpoints on: 800 rmse loss value is: 0.09410884231328964


 43%|█████████████████████████████████▋                                             | 854/2000 [00:47<01:09, 16.38it/s]

save checkpoints on: 850 rmse loss value is: 0.09128489345312119


 45%|███████████████████████████████████▋                                           | 904/2000 [00:50<01:02, 17.45it/s]

save checkpoints on: 900 rmse loss value is: 0.08982072025537491


 48%|█████████████████████████████████████▋                                         | 954/2000 [00:52<00:59, 17.65it/s]

save checkpoints on: 950 rmse loss value is: 0.08777248114347458


 50%|███████████████████████████████████████                                       | 1002/2000 [00:56<01:09, 14.29it/s]

save checkpoints on: 1000 rmse loss value is: 0.083417609333992


 53%|█████████████████████████████████████████                                     | 1053/2000 [00:58<00:45, 20.80it/s]

save checkpoints on: 1050 rmse loss value is: 0.07953808456659317


 55%|██████████████████████████████████████████▉                                   | 1102/2000 [01:01<01:01, 14.60it/s]

save checkpoints on: 1100 rmse loss value is: 0.07709156721830368


 58%|████████████████████████████████████████████▉                                 | 1152/2000 [01:05<00:57, 14.82it/s]

save checkpoints on: 1150 rmse loss value is: 0.07500649243593216


 60%|██████████████████████████████████████████████▉                               | 1204/2000 [01:08<00:50, 15.85it/s]

save checkpoints on: 1200 rmse loss value is: 0.0733107328414917


 63%|████████████████████████████████████████████████▉                             | 1254/2000 [01:12<00:41, 17.92it/s]

save checkpoints on: 1250 rmse loss value is: 0.07180160284042358


 65%|██████████████████████████████████████████████████▊                           | 1303/2000 [01:14<00:45, 15.43it/s]

save checkpoints on: 1300 rmse loss value is: 0.07072243094444275


 68%|████████████████████████████████████████████████████▊                         | 1354/2000 [01:18<00:34, 18.87it/s]

save checkpoints on: 1350 rmse loss value is: 0.06869059801101685


 70%|██████████████████████████████████████████████████████▋                       | 1403/2000 [01:20<00:31, 19.11it/s]

save checkpoints on: 1400 rmse loss value is: 0.06737542897462845


 73%|████████████████████████████████████████████████████████▋                     | 1455/2000 [01:23<00:29, 18.38it/s]

save checkpoints on: 1450 rmse loss value is: 0.06537870317697525


 75%|██████████████████████████████████████████████████████████▋                   | 1504/2000 [01:26<00:28, 17.55it/s]

save checkpoints on: 1500 rmse loss value is: 0.06398992240428925


 78%|████████████████████████████████████████████████████████████▌                 | 1554/2000 [01:30<00:24, 18.38it/s]

save checkpoints on: 1550 rmse loss value is: 0.06211885064840317


 80%|██████████████████████████████████████████████████████████████▌               | 1604/2000 [01:32<00:26, 15.20it/s]

save checkpoints on: 1600 rmse loss value is: 0.061033450067043304


 83%|████████████████████████████████████████████████████████████████▌             | 1654/2000 [01:36<00:22, 15.40it/s]

save checkpoints on: 1650 rmse loss value is: 0.05992797017097473


 85%|██████████████████████████████████████████████████████████████████▍           | 1704/2000 [01:39<00:18, 16.01it/s]

save checkpoints on: 1700 rmse loss value is: 0.056599777191877365


 88%|████████████████████████████████████████████████████████████████████▍         | 1754/2000 [01:43<00:17, 13.85it/s]

save checkpoints on: 1750 rmse loss value is: 0.052669208496809006


 90%|██████████████████████████████████████████████████████████████████████▎       | 1803/2000 [01:47<00:20,  9.80it/s]

save checkpoints on: 1800 rmse loss value is: 0.04874201491475105


 93%|████████████████████████████████████████████████████████████████████████▎     | 1853/2000 [01:51<00:10, 14.50it/s]

save checkpoints on: 1850 rmse loss value is: 0.046065233647823334


 95%|██████████████████████████████████████████████████████████████████████████▏   | 1902/2000 [01:55<00:07, 13.50it/s]

save checkpoints on: 1900 rmse loss value is: 0.04385744780302048


 98%|████████████████████████████████████████████████████████████████████████████▏ | 1954/2000 [01:58<00:02, 16.92it/s]

save checkpoints on: 1950 rmse loss value is: 0.04175592213869095


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:01<00:00, 16.41it/s]


save checkpoints on: 1999 rmse loss value is: 0.039992429316043854


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,1999.0
loss,0.03999


KeyError: 'SalePrice'

In [36]:
# Reformat the predictions for export to Kaggle: flatten them into one column,
# attach it to the test frame, and write the Id/SalePrice pair to disk.
test_data['SalePrice'] = pd.Series(preds.reshape(-1))
submission = test_data[['Id', 'SalePrice']]
submission.to_csv('submission.csv', index=False)