In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
import matplotlib.pyplot as plt
%matplotlib inline
%matplotlib widget

## 1 读取数据

In [2]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
train_data.dtypes[train_data.dtypes == 'object'], test_data.dtypes[test_data.dtypes == 'object']

(Type    object
 City    object
 dtype: object,
 Type    object
 City    object
 dtype: object)

In [3]:
train_data.shape, test_data.shape

((47434, 21), (31623, 20))

In [4]:
print(train_data.iloc[0:4, [0, 1, 2, 3, 6, -2, -1]])
print(test_data.iloc[0:4, [0, 1, 2, 3, 6, -2, -1]])

   Id  Sold Price          Type  Build year  Full bathrooms  Last Sold Price  \
0   0     3825000  SingleFamily        55.0             NaN              NaN   
1   1      505000  SingleFamily        98.0             2.0         328000.0   
2   2      140000  SingleFamily        66.0             1.0              NaN   
3   3     1775000  SingleFamily        77.0             3.0        1500000.0   

          City  
0    Los Altos  
1  Los Angeles  
2   Strawberry  
3  Culver City  
      Id          Type  Build year      Lot  Total interior livable area  \
0  47439  SingleFamily         4.0    940.0                       1677.0   
1  47440  SingleFamily       100.0  10018.8                       1729.0   
2  47441  SingleFamily         4.0    940.0                       1677.0   
3  47442  SingleFamily         4.0    940.0                       1609.0   

   Last Sold Price         City  
0         819000.0   Dodgertown  
1          15000.0  San Leandro  
2              NaN  Los Angeles

## 2 数据处理

#### 将id去掉，并且将train的sold price放入train_y中

In [5]:
train_x_with_nan = train_data.iloc[:, 2:]
train_y = train_data.iloc[:, 1]
test_x_with_nan = test_data.iloc[:, 1:]
print(train_x_with_nan.iloc[0:6, [0, 1, 2, -2, -1]])
print(train_y.iloc[0:4])
print(test_x_with_nan.iloc[0:18, [0, 1, 2, -2, -1]])
train_x_with_nan.shape, test_x_with_nan.shape

           Type  Build year     Lot  Last Sold Price         City
0  SingleFamily        55.0     1.0              NaN    Los Altos
1  SingleFamily        98.0   447.0         328000.0  Los Angeles
2  SingleFamily        66.0  9147.0              NaN   Strawberry
3  SingleFamily        77.0     NaN        1500000.0  Culver City
4    VacantLand         NaN     NaN         900000.0      Creston
5  SingleFamily       119.0  3576.0         200000.0     Stockton
0    3825000
1     505000
2     140000
3    1775000
Name: Sold Price, dtype: int64
            Type  Build year       Lot  Last Sold Price           City
0   SingleFamily         4.0     940.0         819000.0     Dodgertown
1   SingleFamily       100.0   10018.8          15000.0    San Leandro
2   SingleFamily         4.0     940.0              NaN    Los Angeles
3   SingleFamily         4.0     940.0         810000.0     Dodgertown
4   SingleFamily         7.0    2613.6        1041000.0        Hayward
5   SingleFamily        62.0 

((47434, 19), (31623, 19))

#### 使用每个特征的平均值来填充nan(not a number)

In [6]:
a = train_x_with_nan.dtypes[train_x_with_nan.dtypes != 'object'].index
b = test_x_with_nan.dtypes[test_x_with_nan.dtypes != 'object'].index
train_x_mean = train_x_with_nan[a].mean()
test_x_mean = test_x_with_nan[b].mean()
train_x_fill = train_x_with_nan[a].fillna(train_x_mean)
test_x_fill = test_x_with_nan[b].fillna(test_x_mean)

#### 使用z-score标准化将数据范围重新划分

In [7]:
train_x_std = train_x_fill[a].std()
test_x_std = test_x_fill[b].std()
train_x_z_score = (train_x_fill - train_x_mean) / train_x_std
test_x_z_score = (test_x_fill - test_x_mean) / test_x_std
train_x_conbin = pd.concat([train_x_with_nan.iloc[:, [0]], train_x_z_score, train_x_with_nan.iloc[:, [-1]]], axis = 1)
test_x_conbin = pd.concat([test_x_with_nan.iloc[:, [0]], test_x_z_score, test_x_with_nan.iloc[:, [-1]]], axis = 1)
train_x_conbin.shape, test_x_conbin.shape, train_x_conbin.iloc[0:6, [0, 1, 2, -2, -1]], test_x_conbin.iloc[0:6, [0, 1, 2, -2, -1]]

((47434, 19),
 (31623, 19),
            Type  Build year       Lot  Last Sold Price         City
 0  SingleFamily   -0.056214 -0.078544         0.000000    Los Altos
 1  SingleFamily    0.825179 -0.077088        -0.515180  Los Angeles
 2  SingleFamily    0.169259 -0.048694         0.000000   Strawberry
 3  SingleFamily    0.394731  0.000000         0.742862  Culver City
 4    VacantLand    0.000000  0.000000         0.098813      Creston
 5  SingleFamily    1.255626 -0.066876        -0.652577     Stockton,
            Type  Build year       Lot  Last Sold Price          City
 0  SingleFamily   -0.321192 -0.023875         0.559376    Dodgertown
 1  SingleFamily    0.385274 -0.023490        -0.999101   San Leandro
 2  SingleFamily   -0.321192 -0.023875         0.000000   Los Angeles
 3  SingleFamily   -0.321192 -0.023875         0.541930    Dodgertown
 4  SingleFamily   -0.299115 -0.023804         0.989701       Hayward
 5  SingleFamily    0.105631 -0.023660        -0.037652  Garden Grov

#### 将训练集与测试集合到一块，因为训练集与测试集one-hot后的特征数量不一样

In [8]:
all_feature = pd.concat([train_x_conbin, test_x_conbin])
all_feature_one_hot = pd.get_dummies(all_feature, dummy_na=True)
all_feature_one_hot = all_feature_one_hot.astype(float)

In [9]:
all_feature_one_hot.shape, train_x_conbin.shape[0]

((79057, 1349), 47434)

#### 因为type和city的特征数量不一样，所以将这两个特征去掉。只用剩下来的17个特征。

In [10]:
train_x_one_hot = pd.get_dummies(train_x_conbin, dummy_na=True)
test_x_one_hot = pd.get_dummies(test_x_conbin, dummy_na=True)
train_x_one_hot.shape, test_x_one_hot.shape

((47434, 1124), (31623, 1031))

#### 将数据变为tensor格式

In [11]:
train_x_1 = torch.tensor(train_x_z_score.values, dtype = torch.float32)
train_y_1 = torch.tensor(train_y, dtype = torch.float32)
train_y_1 = train_y_1.reshape(-1, 1)
test_x_1 = torch.tensor(test_x_z_score.values, dtype = torch.float32)
train_x_1, test_x_1, train_x_1.shape, train_y_1.shape, test_x_1.shape

(tensor([[-0.0562, -0.0785,  0.0000,  ...,  0.1570,  1.0957,  0.0000],
         [ 0.8252, -0.0771, -0.0051,  ..., -0.2338, -0.3019, -0.5152],
         [ 0.1693, -0.0487, -0.0049,  ..., -0.5911, -0.4331,  0.0000],
         ...,
         [ 0.9687, -0.0565, -0.0051,  ..., -0.1746,  0.3716,  1.4406],
         [-1.0401, -0.0591, -0.0049,  ..., -0.1489, -0.3114, -0.3306],
         [ 0.3742, -0.0759, -0.0053,  ..., -0.5527, -0.2170,  0.0000]]),
 tensor([[-0.3212, -0.0239, -0.0050,  ...,  0.0000, -0.0637,  0.5594],
         [ 0.3853, -0.0235, -0.0068,  ...,  0.0549, -0.3408, -0.9991],
         [-0.3212, -0.0239, -0.0050,  ...,  0.0000, -0.0291,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.3728,  0.0000],
         [-0.0563,  0.0000, -0.0059,  ..., -0.2204, -0.3711, -0.1171],
         [-0.1225,  0.0000, -0.0068,  ...,  0.0380, -0.2724,  0.0040]]),
 torch.Size([47434, 17]),
 torch.Size([47434, 1]),
 torch.Size([31623, 17]))

In [12]:
n  = train_x_conbin.shape[0]
train_x_2 = torch.tensor(all_feature_one_hot[:n].values, dtype = torch.float32)
train_y_2 = torch.tensor(train_y, dtype = torch.float32)
train_y_2 = train_y_2.reshape(-1, 1)
test_x_2 = torch.tensor(all_feature_one_hot[n:].values, dtype = torch.float32)
train_x_2, test_x_2, train_x_2.shape, train_y_2.shape, test_x_2.shape

(tensor([[-0.0562, -0.0785,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.8252, -0.0771, -0.0051,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.1693, -0.0487, -0.0049,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.9687, -0.0565, -0.0051,  ...,  0.0000,  0.0000,  0.0000],
         [-1.0401, -0.0591, -0.0049,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.3742, -0.0759, -0.0053,  ...,  0.0000,  0.0000,  0.0000]]),
 tensor([[-0.3212, -0.0239, -0.0050,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.3853, -0.0235, -0.0068,  ...,  0.0000,  0.0000,  0.0000],
         [-0.3212, -0.0239, -0.0050,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [-0.0563,  0.0000, -0.0059,  ...,  0.0000,  0.0000,  0.0000],
         [-0.1225,  0.0000, -0.0068,  ...,  0.0000,  0.0000,  0.0000]]),
 torch.Size([47434, 1349]),
 torch.Size([47434, 1]),
 torch.Size([31623, 1349]))

## 3 训练模型

### 对数据分为k份

In [13]:
'''
    k:共分几折
    i:第i折
    X:为输入样本
    y:为输入标签
'''
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid

### 3.1 使用线性回归，即单个神经元

In [14]:
loss = nn.MSELoss(reduction='mean')
def get_net():
    net = nn.Sequential(nn.Linear(train_x_1.shape[1], 32),
                        nn.ReLU(),
                        nn.Linear(32, 16),
                        nn.ReLU(),
                        nn.Linear(16, 1),
                        nn.ReLU())
    return net

#### 对成本进行相对误差

In [15]:
def log_rmse(net, features, labels):
    # 为了在取对数时进一步稳定该值，将小于1的值设置为1
    clipped_preds = torch.clamp(net(features), min = 1)
    rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()

#### 训练函数

In [16]:
def train(net, train_features, train_labels, valid_features, valid_labels,
          num_epochs, learning_rate, weight_decay):
    train_loss_his, valid_loss_his = [], []
    # 这里使用的是Adam优化算法
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr = learning_rate,
                                 weight_decay = weight_decay)
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        l = loss(net(train_features), train_labels)
        l.backward()
        optimizer.step()
        train_loss_his.append(log_rmse(net, train_features, train_labels))
        if valid_labels is not None:
            valid_loss_his.append(log_rmse(net, valid_features, valid_labels))
    #print(1, torch.clamp(net(train_features), min = 1))
    #print(2, torch.log(torch.clamp(net(train_features), min = 1)))
    #print(3, torch.log(train_labels))
    #a = torch.log(torch.clamp(net(train_features), min = 1)) - torch.log(train_labels)
    #print(sum(a), sum(a) / len(train_labels), torch.sqrt(sum(a) / len(train_labels)))
    #print(4, loss(torch.log(torch.clamp(net(train_features), min = 1)), torch.log(train_labels)))
    return train_loss_his, valid_loss_his

In [17]:
k, num_epochs, learning_rate, weight_decay = 5, 100, 0.8, 0.9
train_loss_sum, valid_loss_sum = 0., 0.
for i in range(k):
    X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, train_x_1, train_y_1)
    net = get_net()
    train_loss, valid_loss = train(net, X_train, y_train, X_valid, y_valid, num_epochs, learning_rate, weight_decay)
    
    train_loss_sum = train_loss[-1] + train_loss_sum
    valid_loss_sum = valid_loss[-1] + valid_loss_sum
    
    print(f'折{i + 1}，训练log rmse:{float(train_loss[-1]):f}, '
              f'验证log rmse:{float(valid_loss[-1]):f}')
print(f'{k}-折验证: 平均训练log rmse: {float(train_loss_sum / k):f}, '
      f'平均验证log rmse: {float(valid_loss_sum / k):f}')

折1，训练log rmse:inf, 验证log rmse:0.489675
折2，训练log rmse:inf, 验证log rmse:0.496444
折3，训练log rmse:inf, 验证log rmse:0.490551
折4，训练log rmse:inf, 验证log rmse:0.349677
折5，训练log rmse:0.476244, 验证log rmse:inf
5-折验证: 平均训练log rmse: inf, 平均验证log rmse: inf


## 4 找到合适的超参数后将所有的训练集训练模型

In [18]:
net = get_net()
train_loss = train(net, train_x_1, train_y_1, None, None, num_epochs * 5, learning_rate, weight_decay)

In [55]:
preds = net(test_x_1).detach().numpy()
sub_txt = pd.read_csv("data/sample_submission.csv")
sub_txt.dtypes[sub_txt.dtypes != 'object']

Id            int64
Sold Price    int64
dtype: object

In [57]:
sub_txt["Sold Price"] = pd.Series(preds.reshape(1, -1)[0])

In [60]:
sub_txt.to_csv('submission.csv', index=False)