In [17]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset

In [18]:
# Read the training and test splits (in that order) from the local ./data directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('./data/train.csv', './data/test.csv')
)

In [19]:
# (rows, columns) of the training split.
train_data.shape

(1460, 81)

In [20]:
# (rows, columns) of the test split. It has one column fewer than the training
# split -- presumably the SalePrice label is absent here (TODO confirm).
test_data.shape

(1459, 80)

In [21]:
# Preview the first four rows, restricted to the first four and the last three columns.
# In the rendered output the first column is Id and the last one is SalePrice.
train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,WD,Normal,208500
1,2,20,RL,80.0,WD,Normal,181500
2,3,60,RL,68.0,WD,Normal,223500
3,4,70,RL,60.0,WD,Abnorml,140000


In [22]:
# Stack the feature columns of both splits so that preprocessing is applied to them jointly.
# The leading column (Id in train_data; assumed to be Id in test_data as well) is dropped
# from both frames. Only train_data ends with the SalePrice label (81 columns vs. 80 in
# test_data), so only its last column is removed; slicing test_data with 1:-1 too would
# silently discard its last feature column.
all_features = pd.concat([train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]])

In [23]:
def _standardize(column):
    """Rescale one column to zero mean and unit standard deviation."""
    return (column - column.mean()) / (column.std())


# Every column whose dtype is not 'object' is treated as numeric.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# After standardization each feature has mean 0, so a missing value can simply be replaced by 0.
all_features[numeric_features] = all_features[numeric_features].apply(_standardize).fillna(0)

In [24]:
# One-hot encode the categorical columns. dummy_na=True additionally emits an
# indicator column for missing values, so NaN is treated as a category of its own.
all_features = pd.get_dummies(data=all_features, dummy_na=True)
all_features.shape

(2919, 331)

In [25]:
# The first n_train rows of all_features come from train_data, the rest from test_data.
n_train = train_data.shape[0]
# Cast to an explicit float32 array before building the tensors: newer pandas versions
# emit bool dummy columns, and `.values` of a mixed float/bool frame is an object array,
# which torch.tensor cannot convert.
train_features = torch.tensor(all_features[:n_train].values.astype('float32'), dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values.astype('float32'), dtype=torch.float)
# Pass the underlying NumPy array rather than the pandas Series itself, then reshape the
# labels into a column vector of shape (n_train, 1).
train_labels = torch.tensor(train_data.SalePrice.values, dtype=torch.float).view(-1, 1)

In [26]:
# Mean squared error criterion.
loss = torch.nn.MSELoss()


def get_net(feature_num):
    """Build a single-output linear regression model.

    Args:
        feature_num: Number of input features.

    Returns:
        A torch.nn.Linear(feature_num, 1) whose parameters have all been
        re-initialized from a normal distribution with mean 0 and std 0.01.
    """
    linear = torch.nn.Linear(in_features=feature_num, out_features=1)
    for parameter in linear.parameters():
        torch.nn.init.normal_(parameter, mean=0, std=0.01)
    return linear

In [27]:
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Callable model mapping `features` to predictions.
        features: Input tensor passed to `net`.
        labels: Target tensor; its logarithm is taken, so values are expected
            to be strictly positive.

    Returns:
        The metric as a Python float.
    """
    with torch.no_grad():
        # Raise predictions below 1 up to 1 so that taking the logarithm stays numerically stable.
        clipped_preds = torch.clamp(net(features), min=1.0)
        # Always use MSE here, independent of whichever criterion is used for training.
        mse = torch.nn.functional.mse_loss(clipped_preds.log(), labels.log())
    return torch.sqrt(mse).item()

In [28]:
def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learning_rate, weight_decay,
          batch_size):
    """Fit `net` with mini-batch Adam and record the log-RMSE after every epoch.

    Args:
        net: Model to optimize; it is converted to float precision.
        train_features: Training inputs.
        train_labels: Training targets.
        test_features: Held-out inputs; only used when `test_labels` is given.
        test_labels: Held-out targets, or None to skip held-out evaluation.
        num_epochs: Number of passes over the training data.
        learning_rate: Learning rate handed to Adam.
        weight_decay: Weight-decay coefficient handed to Adam.
        batch_size: Mini-batch size of the shuffled data loader.

    Returns:
        A tuple (train_loss, test_loss) of per-epoch log_rmse values;
        test_loss stays empty when `test_labels` is None.
    """
    train_loss = []
    test_loss = []
    batches = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(train_features, train_labels),
        batch_size,
        shuffle=True,
    )
    # The Adam optimizer is used here.
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    net = net.float()
    evaluate_on_test = test_labels is not None
    for _ in range(num_epochs):
        for batch_inputs, batch_targets in batches:
            batch_loss = loss(net(batch_inputs.float()), batch_targets.float())
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
        # Evaluate once per epoch, on the full sets rather than on a mini-batch.
        train_loss.append(log_rmse(net, train_features, train_labels))
        if evaluate_on_test:
            test_loss.append(log_rmse(net, test_features, test_labels))
    return train_loss, test_loss

In [ ]:
def get_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        
    