In [1]:
# 从 sklearn.datasets 导入波士顿房价数据读取器
from sklearn.datasets import load_boston

# 从读取房价数据存储在变量 boston 中
boston = load_boston()
# 输出数据描述
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [3]:
from sklearn.model_selection import train_test_split

X = boston.data
y = boston.target

# 随机采样 25% 的数据构建测试样本，其余作为训练样本
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=33, test_size=0.25)

In [4]:
# 分析回归目标值的差异
print("The max target value is", np.max(boston.target))
print("The min target value is", np.min(boston.target))
print("The average target value is", np.mean(boston.target))

The max target value is 50.0
The min target value is 5.0
The average target value is 22.532806324110677


In [12]:
from sklearn.preprocessing import StandardScaler

ss_X = StandardScaler()
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)

In [17]:
X_train.shape

(379, 13)

In [16]:
X_train.mean(axis=0)

array([-2.65032792e-16, -1.25376109e-16,  3.96926701e-15, -1.62578833e-16,
       -4.73942931e-15, -3.03714837e-15,  2.98793532e-16,  9.65936599e-16,
        2.69500048e-17, -1.09557628e-16,  2.60477655e-14,  9.12733682e-15,
       -1.57598941e-16])

In [18]:
X_train.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

### 为什么对 y 也要标准化？

In [14]:
# 分别初始化对特征和目标值的标准化器

ss_y = StandardScaler()

# 分别对训练和测试数据的特征以及目标值进行标准化处理
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

In [19]:
from sklearn.linear_model import LinearRegression

# 使用默认配置初始化线性回归器 LinearRegression
lr = LinearRegression()
# 使用训练数据进行参数估计
lr.fit(X_train, y_train)
# 对测试数据进行回归预测
lr_y_predict = lr.predict(X_test)



In [21]:
# 使用 LinearRegression 模型自带的评估模块，并输出评估结果
print('The value of default measurement of LinearRegression is', lr.score(X_test, y_test))

The value of default measurement of LinearRegression is 0.6757955014529482


In [22]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

r2_score(y_test, lr_y_predict)

0.6757955014529482

In [23]:
mean_squared_error(
    ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict))

25.139236520353446

In [24]:
mean_absolute_error(
    ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict))

3.532532543705397

## 使用梯度下降法做线性回归

In [25]:
from sklearn.linear_model import SGDRegressor

sgdr = SGDRegressor()
sgdr.fit(X_train, y_train)
sgdr_y_predict = sgdr.predict(X_test)

  y = column_or_1d(y, warn=True)


In [26]:
sgdr.score(X_test, y_test)

0.6595037965297591

In [27]:
r2_score(y_test, sgdr_y_predict)

0.6595037965297591

In [28]:
mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict))

26.402516410729223

In [29]:
mean_absolute_error(
    ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict))

3.504461786213403

## 用 SVM 做线性回归

In [30]:
from sklearn.svm import SVR

# 使用线性核函数配置的支持向量机进行回归训练，并且对测试样本进行预测
linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train, y_train)
linear_svr_y_predict = linear_svr.predict(X_test)

  y = column_or_1d(y, warn=True)


In [31]:
def print_pred_result(y_true, y_pred):
    print('r2_score', r2_score(y_true, y_pred))
    mse = mean_squared_error(
        ss_y.inverse_transform(y_true), ss_y.inverse_transform(y_pred))
    mae = mean_absolute_error(
        ss_y.inverse_transform(y_true), ss_y.inverse_transform(y_pred))
    print('mse', mse)
    print('mae', mae)

In [33]:
print_pred_result(y_test, linear_svr_y_predict)

r2_score 0.650659546421538
mse 27.088311013556027
mae 3.4328013877599624


In [34]:
# 使用多项式核函数配置的支持向量机进行回归训练，并且对测试样本进行预测
poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, y_train)
poly_svr_y_predict = poly_svr.predict(X_test)

  y = column_or_1d(y, warn=True)


In [35]:
print_pred_result(y_test, poly_svr_y_predict)

r2_score 0.40365065102550846
mse 46.24170053103929
mae 3.73840737104651


越来越差了。

In [36]:
# 使用径向基核函数配置的支持向量机进行回归训练，并且对测试样本进行预测
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(X_test)

  y = column_or_1d(y, warn=True)


In [38]:
print_pred_result(y_test, rbf_svr_y_predict)

r2_score 0.7559887416340944
mse 18.920948861538733
mae 2.6067819999501114


这个效果最好。

In [39]:
from sklearn.neighbors import KNeighborsRegressor

# 预测的方式为平均回归：weights='uniform'
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

In [40]:
print_pred_result(y_test, uni_knr_y_predict)

r2_score 0.6907212176346006
mse 23.981877165354337
mae 2.9650393700787396


In [41]:
# 预测的方式为根据距离加权回归：weights='distance'
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

In [42]:
print_pred_result(y_test, dis_knr_y_predict)

r2_score 0.7201094821421603
mse 21.703073090490353
mae 2.801125502210876


In [43]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_y_predict = dtr.predict(X_test)

In [44]:
print_pred_result(y_test, dtr_y_predict)

r2_score 0.7030528130691328
mse 23.025669291338588
mae 3.0976377952755905


目前来看随机森林的表现效果最好。

In [46]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_y_predict = rfr.predict(X_test)
print_pred_result(y_test, rfr_y_predict)

r2_score 0.8216110687931304
mse 13.832508661417318
mae 2.3853543307086618


  after removing the cwd from sys.path.


In [47]:
etr = ExtraTreesRegressor()
etr.fit(X_train, y_train)
etr_y_predict = etr.predict(X_test)
print_pred_result(y_test, etr_y_predict)

r2_score 0.7321741517306886
mse 20.76756299212598
mae 2.5688976377952755


  


梯度提升树又进了一步。

In [48]:
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y_predict = gbr.predict(X_test)
print_pred_result(y_test, gbr_y_predict)

r2_score 0.8341708717185488
mse 12.858605283129389
mae 2.3005895744992477


  y = column_or_1d(y, warn=True)


In [53]:
feature_importances = pd.DataFrame({
    'feature_names': boston.feature_names,
    'feature_importances_': etr.feature_importances_,
})
feature_importances

Unnamed: 0,feature_names,feature_importances_
0,CRIM,0.045423
1,ZN,0.004376
2,INDUS,0.033611
3,CHAS,0.017287
4,NOX,0.029299
5,RM,0.335898
6,AGE,0.020582
7,DIS,0.018369
8,RAD,0.018559
9,TAX,0.033606


In [55]:
feature_importances.sort_values(by='feature_importances_', ascending=True)

Unnamed: 0,feature_names,feature_importances_
1,ZN,0.004376
3,CHAS,0.017287
7,DIS,0.018369
8,RAD,0.018559
11,B,0.019335
6,AGE,0.020582
4,NOX,0.029299
9,TAX,0.033606
2,INDUS,0.033611
0,CRIM,0.045423


In [58]:
sorted(zip(etr.feature_importances_, boston.feature_names),key=lambda d:d[0])

[(0.00437598442416888, 'ZN'),
 (0.017287269094109586, 'CHAS'),
 (0.018368785908987027, 'DIS'),
 (0.018558717093917477, 'RAD'),
 (0.019335111796520553, 'B'),
 (0.02058175471158189, 'AGE'),
 (0.02929948392098649, 'NOX'),
 (0.03360623529886188, 'TAX'),
 (0.03361144517566782, 'INDUS'),
 (0.04542341527149256, 'CRIM'),
 (0.07265201677880531, 'PTRATIO'),
 (0.33589761946593977, 'RM'),
 (0.35100216105896076, 'LSTAT')]

XGBRegressor 又提升了一些

In [60]:
from xgboost import XGBRegressor

xgbr = XGBRegressor()
xgbr.fit(X_train, y_train)
xgbr_y_predict = xgbr.predict(X_test)
print_pred_result(y_test, xgbr_y_predict)

r2_score 0.8418855436202317
mse 12.260400481400309
mae 2.305236428178201
