Kaggle实战房价预测

下载DataSet

In [None]:
import hashlib
import tarfile
import zipfile
import requests
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
import os

# Registry of downloadable datasets: name -> (url, expected SHA-1 hex digest).
DATA_HUB = {}
# Base URL that dataset file names are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

# NOTE(review): presumably set so that duplicate OpenMP runtimes loaded by
# different libraries do not abort the process — confirm against the
# Intel OpenMP runtime documentation.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'


# Download one file registered in DATA_HUB and return its local file name.
def download(name, cache_dir=os.path.join('.', '01_data/02_DataSet_Kaggle_House')):
    """Fetch the file registered under ``name`` into ``cache_dir``.

    If a file with the expected name already exists and its SHA-1 digest
    matches the digest stored in ``DATA_HUB``, the cached copy is reused and
    nothing is downloaded.

    Args:
        name: Key into ``DATA_HUB``; the entry is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory the file is stored in (created if missing).

    Returns:
        Path of the local file.

    Raises:
        AssertionError: If ``name`` is not registered in ``DATA_HUB``.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}."
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        # Hash the cached file in 1 MiB chunks so it never has to fit in memory.
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname
    print(f'正在从{url}下载{fname}...')
    with requests.get(url, stream=True, verify=True) as r:
        # Fail before touching the local file so that an HTTP error page is
        # never saved (and later parsed) as if it were the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # stream=True only helps if the body is consumed incrementally.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname


# Flatten an iterable of rows into one list.
def extractall(data):
    """Concatenate the items of every row in ``data`` into a single flat list.

    Args:
        data: Iterable whose elements are themselves iterable (each row is
            assumed to be a list or vector of feature values).

    Returns:
        A new list holding the items of all rows, in order.
    """
    return [value for row in data for value in row]


# Download a zip/tar archive and unpack it next to the downloaded file.
def download_extract(name, folder=None):
    """Download the archive registered under ``name`` and extract it.

    The archive is unpacked into the directory that contains the downloaded
    file.

    Args:
        name: Key into ``DATA_HUB`` identifying the archive.
        folder: Optional sub-directory name; when given, the returned path is
            ``<download dir>/<folder>`` instead of the archive path without
            its extension.

    Returns:
        Path of the directory holding the extracted data.

    Raises:
        AssertionError: If the file extension is not ``.zip``, ``.tar`` or
            ``.gz``.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        open_archive = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        open_archive = tarfile.open
    else:
        # Raised explicitly (rather than `assert False`) so the check still
        # fires when Python runs with -O; the exception type is unchanged.
        raise AssertionError('只有zip/tar文件可以被解压缩')
    # Call the archive's own extractall method and close the handle afterwards.
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only use this with archives from a trusted source.
    with open_archive(fname, 'r') as fp:
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir


# Download every file registered in DATA_HUB.
def download_all():
    """Call ``download`` once for each name currently in ``DATA_HUB``."""
    for dataset_name in DATA_HUB:
        download(dataset_name)


# Each entry is (url, expected SHA-1 hex digest). A SHA-1 hex digest is exactly
# 40 hexadecimal characters; download() re-fetches a cached file whenever its
# digest differs from the value stored here, so these must be exact.
DATA_HUB['kaggle_house_train'] = (
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')
DATA_HUB['kaggle_house_test'] = (
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))
print(train_data.shape)  # 1460 samples: 80 features plus 1 label column
print(test_data.shape)  # the test samples carry no label column

房价预测

In [None]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Read from the same directory (same letter case) that download() caches into
# by default, so the paths also resolve on case-sensitive filesystems.
train_data = pd.read_csv('01_data/02_DataSet_Kaggle_House/kaggle_house_pred_train.csv')
test_data = pd.read_csv('01_data/02_DataSet_Kaggle_House/kaggle_house_pred_test.csv')

# Stack the feature columns of both sets: drop the first column of each and,
# for the training set, also the last column (the SalePrice label).
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Columns whose dtype is not 'object' are treated as numeric.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardize every numeric column to zero mean and unit standard deviation.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# After standardization the column mean is 0, so filling missing values with 0
# is the same as filling them with the mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# One-hot encode the remaining columns (a missing value gets its own indicator
# column). dtype=float keeps the whole frame numeric: newer pandas versions
# emit bool indicator columns by default, which turns `.values` into an object
# array that torch.tensor cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
all_features.shape

n_train = train_data.shape[0]  # number of training samples
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
# The SalePrice column of train_data is the label, reshaped to one column.
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)

# Mean squared error, shared by train() and log_rmse().
loss = nn.MSELoss()

# Number of input features after one-hot encoding.
in_features = train_features.shape[1]


def get_net():
    """Build a new model: one linear layer mapping ``in_features`` inputs to 1 output."""
    return nn.Sequential(nn.Linear(in_features, 1))


# Root-mean-squared error between the logarithms of predictions and labels.
def log_rmse(net, features, labels):
    """Return the log-RMSE of ``net`` on ``(features, labels)`` as a Python float.

    Predictions are clamped to ``[1, inf)`` before the logarithm is taken, so
    the log of a prediction is never negative or undefined.

    Args:
        net: Callable model applied to ``features``.
        features: Input tensor passed to ``net``.
        labels: Target tensor (assumed strictly positive, since its logarithm
            is taken — confirm against the data).

    Returns:
        ``sqrt(MSE(log(clamped predictions), log(labels)))`` as a float.
    """
    predictions = net(features)
    clipped_preds = torch.clamp(predictions, 1, float('inf'))
    log_mse = loss(torch.log(clipped_preds), torch.log(labels))
    return torch.sqrt(log_mse).item()


# Training loop using the Adam optimizer.
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train ``net`` with Adam on the MSE loss and record log-RMSE per epoch.

    Args:
        net: Model to optimize in place.
        train_features, train_labels: Training tensors.
        test_features, test_labels: Validation tensors; pass ``None`` for
            ``test_labels`` to skip validation tracking.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        batch_size: Mini-batch size handed to ``d2l.load_array``.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE lists; ``test_ls`` stays
        empty when ``test_labels`` is ``None``.
    """
    train_history, test_history = [], []
    # Wrap the training tensors in a mini-batch iterator.
    batches = d2l.load_array((train_features, train_labels), batch_size)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    track_validation = test_labels is not None
    for _ in range(num_epochs):
        for X, y in batches:
            optimizer.zero_grad()  # clear gradients left over from the previous step
            batch_loss = loss(net(X), y)
            batch_loss.backward()  # back-propagate to compute gradients
            optimizer.step()  # apply the parameter update
        # Evaluate on the full sets once per epoch.
        train_history.append(log_rmse(net, train_features, train_labels))
        if track_validation:
            test_history.append(log_rmse(net, test_features, test_labels))
    return train_history, test_history


# K-fold cross-validation split.
def get_k_fold_data(k, i, X, y):
    """Return the training and validation tensors for fold ``i`` of ``k``.

    The first ``k * (X.shape[0] // k)`` rows are cut into ``k`` equally sized
    consecutive folds; any remaining rows are not used. Fold ``i`` becomes the
    validation set and the other folds, in order, are concatenated into the
    training set.

    Args:
        k: Number of folds (must be greater than 1).
        i: Index of the fold used for validation.
        X: Feature tensor, indexed as ``X[rows, :]``.
        y: Label tensor, indexed as ``y[rows]``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.
    """
    assert k > 1
    fold_size = X.shape[0] // k
    train_X_parts, train_y_parts = [], []
    for fold in range(k):
        rows = slice(fold * fold_size, (fold + 1) * fold_size)
        if fold == i:
            X_valid, y_valid = X[rows, :], y[rows]
            continue
        train_X_parts.append(X[rows, :])
        train_y_parts.append(y[rows])
    X_train = torch.cat(train_X_parts, 0)
    y_train = torch.cat(train_y_parts, 0)
    return X_train, y_train, X_valid, y_valid


# Average the final training and validation errors over k folds.
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    """Run k-fold cross-validation and return the mean final log-RMSE values.

    A new model from ``get_net`` is trained for every fold. The learning
    curves of the first fold are plotted, and the final errors of every fold
    are printed.

    Args:
        k: Number of folds.
        X_train, y_train: Full training tensors to split.
        num_epochs, learning_rate, weight_decay, batch_size: Forwarded to
            ``train``.

    Returns:
        ``(mean final training log-RMSE, mean final validation log-RMSE)``.
    """
    train_total, valid_total = 0, 0
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        model = get_net()
        train_ls, valid_ls = train(model, *fold_data, num_epochs, learning_rate, weight_decay, batch_size)
        final_train, final_valid = train_ls[-1], valid_ls[-1]
        train_total += final_train
        valid_total += final_valid

        if fold == 0:
            # Only the first fold's curves are drawn.
            epochs = list(range(1, num_epochs + 1))
            d2l.plot(epochs, [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'fold{fold + 1},train log rmse {float(final_train):f},valid log rmse {float(final_valid):f}')
    return train_total / k, valid_total / k


# Hyper-parameters for the cross-validation run.
k = 5
num_epochs = 100
lr = 5
weight_decay = 0
batch_size = 64

train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print(f'{k}-折验证：平均训练log rmse：{float(train_l):f},平均验证log rmse：{float(valid_l):f}')

# Display the learning-curve figure produced inside k_fold.
d2l.plt.show()

In [None]:
def train_and_pred(train_features, test_feature, train_labels, test_data, num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set, predict the test set, write a submission.

    Args:
        train_features: Training feature tensor.
        test_feature: Test feature tensor the predictions are computed for.
        train_labels: Training label tensor.
        test_data: Test DataFrame with an ``Id`` column; a ``SalePrice``
            column holding the predictions is added to it in place.
        num_epochs, lr, weight_decay, batch_size: Forwarded to ``train``.

    Side effects:
        Plots the training curve, prints the final training log-RMSE and
        writes ``submission.csv`` (columns ``Id`` and ``SalePrice``) to the
        current working directory.
    """
    net = get_net()
    # No validation data: every training sample is used for fitting.
    train_ls, _ = train(net, train_features, train_labels, None, None, num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'train log rmse {float(train_ls[-1]):f}')
    # Predict from the argument that was passed in, not from a module-level
    # variable that merely has a similar name.
    preds = net(test_feature).detach().numpy()
    # Flatten the (n, 1) predictions into a single column of values.
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)

# Fit on all training data with the hyper-parameters chosen above and write
# the submission file.
train_and_pred(
    train_features,
    test_features,
    train_labels,
    test_data,
    num_epochs,
    lr,
    weight_decay,
    batch_size,
)

# Display the training-curve figure.
d2l.plt.show()