In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## 模型训练

### 在 Training Set 上训练和验证模型

#### 使用线性回归模型

书中使用 sklearn 的 Pipeline 来组织数据预处理和模型训练流程。我目前还不熟悉这种做法，暂时先放着，之后再回来细看。

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# SimpleImputer and make_pipeline are imported here because this cell is the
# first one to use them; without these imports the cell raises NameError on a
# fresh kernel (Restart & Run All).

# Numeric feature columns of the housing table. Not referenced by the cells
# below; kept as a readable inventory of the numeric inputs.
num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms",
               "total_bedrooms", "population", "households", "median_income"]
# The single categorical feature column.
cat_attribs = ["ocean_proximity"]

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes the encoder tolerate
# categories at transform time that were not seen during fit.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

In [19]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer


def column_ratio(X):
    """Divide the first column of a 2-D array by its second column.

    Both columns are selected with list indices, so each keeps its second
    axis and the result is a single-column 2-D array.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names for the ratio transformer.

    Both arguments are ignored: the ratio step always yields exactly one
    feature, which is named "ratio".
    """
    names_out = ["ratio"]
    return names_out

def ratio_pipeline():
    """Build a new, unfitted pipeline that turns two columns into one ratio.

    Steps, in order: median imputation, first-column / second-column ratio
    (output feature named by ratio_name), then standardization. A fresh
    pipeline object is returned on every call.
    """
    steps = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)

# Names needed by this cell and by ratio_pipeline(), which is first called
# below. None of them is imported or defined by any earlier visible cell, so
# without this block the cell raises NameError on a fresh kernel.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import FunctionTransformer, StandardScaler


class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Map samples to their RBF similarity to each of `n_clusters` k-means centers.

    NOTE(review): no visible cell defines ClusterSimilarity. This definition
    is reconstructed from the call below (n_clusters, gamma, random_state)
    and from the KMeans `n_init` warnings in the captured outputs -- confirm
    it matches the transformer originally used.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on X; `y` is accepted for API compatibility and ignored."""
        # n_init=10 is the default the captured warnings report; passing it
        # explicitly pins that behavior and silences the FutureWarning.
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, n_init=10,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return an (n_samples, n_clusters) array of RBF similarities."""
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output feature name per cluster center."""
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]


# Log branch: median imputation, natural log, then standardization.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler())
# Geographic branch: similarity to 10 cluster centers.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
# Fallback for numeric columns not claimed by any named branch.
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                     StandardScaler())
# Full preprocessing: three ratio features, log features, geographic cluster
# similarities, one-hot categoricals, and the default numeric pipeline for
# whatever is left over.
preprocessing = ColumnTransformer([
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline)  # one column remaining: housing_median_age

In [20]:
# Load the raw housing table from the local CSV file.
housing = pd.read_csv(Path("datasets/housing/housing.csv"))

# Labels: an independent copy of the target column.
housing_labels = housing["median_house_value"].copy()
# Features: every column except the target.
train_data = housing.drop(columns="median_house_value")

In [21]:
from sklearn.linear_model import LinearRegression

# Chain the shared preprocessing with a linear regressor, then fit the whole
# pipeline on the feature table and its labels.
linear_model = LinearRegression()
lin_reg = make_pipeline(preprocessing, linear_model)
lin_reg.fit(train_data, housing_labels)

  super()._check_params_vs_input(X, default_n_init=10)


In [22]:
# Predict on the same table the model was fitted on and show the first five
# predictions rounded to the nearest hundred (decimals=-2).
housing_predictions = lin_reg.predict(train_data)
np.round(housing_predictions[:5], decimals=-2)

array([366700., 375400., 358200., 315700., 290800.])

In [23]:
# First five true labels as a plain array, for side-by-side comparison with
# the predictions.
housing_labels.head(5).values

array([452600., 358500., 352100., 341300., 342200.])

In [24]:
# Relative error of the first five (rounded) predictions against their labels.
first_predictions = housing_predictions[:5].round(-2)
first_labels = housing_labels.iloc[:5].values
error_ratios = first_predictions / first_labels - 1
print(", ".join(f"{100 * ratio:.1f}%" for ratio in error_ratios))

-19.0%, 4.7%, 1.7%, -7.5%, -15.0%


In [25]:
from sklearn.metrics import mean_squared_error

# Training-set RMSE of the linear model. The `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and later removed, so
# the root is taken explicitly; this works on old and new releases alike.
lin_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))
lin_rmse

69002.48054626475

#### 使用决策树模型

In [27]:
from sklearn.tree import DecisionTreeRegressor

# Same preprocessing, followed by a decision tree with a fixed seed; fit on
# the feature table and its labels.
tree_model = DecisionTreeRegressor(random_state=42)
tree_reg = make_pipeline(preprocessing, tree_model)
tree_reg.fit(train_data, housing_labels)

  super()._check_params_vs_input(X, default_n_init=10)


In [28]:
# Training-set RMSE of the decision tree: predictions are made on the same
# rows the tree was fitted on, so this is not a generalization estimate.
housing_predictions = tree_reg.predict(train_data)
# `squared=False` is deprecated in scikit-learn 1.4 and later removed; take
# the square root of the MSE explicitly instead.
tree_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))
tree_rmse

0.0

### 进行交叉验证

In [29]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tree pipeline. The scorer returns negated
# RMSE values, so the sign is flipped to get positive errors.
tree_neg_rmses = cross_val_score(
    tree_reg, train_data, housing_labels,
    scoring="neg_root_mean_squared_error", cv=10)
tree_rmses = -tree_neg_rmses

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
# Summary statistics of the per-fold RMSEs of the decision tree.
tree_rmse_series = pd.Series(tree_rmses)
tree_rmse_series.describe()

count       10.000000
mean     65395.761748
std       2301.122928
min      62887.800954
25%      63769.440461
50%      64952.325580
75%      66505.718859
max      70706.100094
dtype: float64

In [32]:
# Cross-validate the linear regression pipeline the same way, for comparison
# with the decision tree. The scorer returns negated RMSE, hence the sign flip.
lin_neg_rmses = cross_val_score(
    lin_reg, train_data, housing_labels,
    scoring="neg_root_mean_squared_error", cv=10)
lin_rmses = -lin_neg_rmses
pd.Series(lin_rmses).describe()

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


count       10.000000
mean     69932.203309
std       2908.601111
min      66241.288187
25%      68807.033461
50%      69211.663585
75%      69925.431885
max      76006.114620
dtype: float64

#### 使用随机森林模型

In [33]:
from sklearn.ensemble import RandomForestRegressor

# Same preprocessing, followed by a random forest with a fixed seed, scored
# with 10-fold cross-validation (negated RMSE flipped to positive).
forest_model = RandomForestRegressor(random_state=42)
forest_reg = make_pipeline(preprocessing, forest_model)
forest_neg_rmses = cross_val_score(
    forest_reg, train_data, housing_labels,
    scoring="neg_root_mean_squared_error", cv=10)
forest_rmses = -forest_neg_rmses

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [34]:
# Summary statistics of the per-fold RMSEs of the random forest.
forest_rmse_series = pd.Series(forest_rmses)
forest_rmse_series.describe()

count       10.000000
mean     46476.473467
std       1714.738385
min      43415.831537
25%      45754.558449
50%      46800.850121
75%      47583.003485
max      49194.996926
dtype: float64

In [35]:
# Fit the forest on the full feature table and measure its error on that same
# table. This is a training-set RMSE, to be compared against the
# cross-validated RMSEs above.
forest_reg.fit(train_data, housing_labels)
# Predict on `train_data`, not `housing`: `housing` still contains the target
# column `median_house_value`, and every other cell feeds the models the
# label-free feature table.
housing_predictions = forest_reg.predict(train_data)
# `squared=False` is deprecated in scikit-learn 1.4 and later removed; take
# the square root of the MSE explicitly instead.
forest_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))
forest_rmse

  super()._check_params_vs_input(X, default_n_init=10)


17395.401711755636