### 数据导入

本代码使用sklearn中的波士顿房价数据

In [3]:
# Load the Boston house-prices dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases -- confirm the pinned version.
from sklearn.datasets import load_boston
boston = load_boston()

In [4]:
# Print the dataset's bundled description (instance count, attribute meanings).
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [5]:
# Feature matrix; per the description above: 506 instances x 13 predictive attributes.
boston.data

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [6]:
# Regression target; the description above calls it the median value (attribute 14).
boston.target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

### 切分数据集

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [8]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data,
    boston.target,
    test_size=0.2,
    random_state=2,
)

### 简单线性回归模型

In [8]:
# Baseline: ordinary least-squares linear regression with default settings.
model = LinearRegression()

In [9]:
# Fit the baseline on the training split only.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
# Coefficient of determination (R^2) of the baseline on the held-out test split.
model.score(X_test, y_test)

0.77872098747725604

### 加归一化

In [11]:
# Same estimator, but asking LinearRegression to normalize the regressors itself.
# NOTE(review): the `normalize` parameter was deprecated in scikit-learn 1.0 and
# removed in 1.2; newer releases need an explicit scaler step in a Pipeline.
model2 = LinearRegression(normalize=True)

In [12]:
# Train the normalized variant on the same training split as the baseline.
model2.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [14]:
# Test-set R^2. It equals the baseline's score up to floating-point rounding,
# which is expected: rescaling the features does not change the predictions of
# an ordinary least-squares fit, so normalization alone cannot improve it.
model2.score(X_test, y_test)

0.77872098747725804

### 加多项式

简单线性回归模型容易导致欠拟合，可以增加多项式来让线性回归模型更好地拟合数据

degree：多项式特征的最高次数（阶数），而不是特征的个数，默认为2

include_bias：是否包含偏置列（取值全为 1 的常数列，相当于截距项），默认为True；本例设为False，因为 LinearRegression 默认（fit_intercept=True）会自行拟合截距

In [10]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the raw features with all degree-2 terms (squares and pairwise
# products). The constant bias column is left out because LinearRegression
# fits its own intercept.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Fit the transformer on the training split only ...
X_train_poly = poly.fit_transform(X_train)

# ... and reuse that fitted transformer for the test split. Calling
# fit_transform here would refit on test data; PolynomialFeatures learns
# nothing from the data values, so the numbers are unchanged, but
# transform-only is the correct pattern and avoids leakage if this step is
# ever swapped for a stateful transformer (e.g. a scaler).
X_test_poly = poly.transform(X_test)

In [11]:
# Linear regression to be trained on the polynomial-expanded features.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2; on newer releases replace it with an explicit scaler step.
model3 = LinearRegression(normalize=True)

In [12]:
# Fit on the polynomial-expanded training features.
model3.fit(X_train_poly, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [13]:
# Test-set R^2 with polynomial features; compare with the ~0.779 of the plain
# linear baseline earlier in the notebook.
model3.score(X_test_poly, y_test)

0.8963635527280811

In [14]:
# Predicted targets for the test split (input must be the polynomial-expanded features).
model3.predict(X_test_poly)

array([22.66995447, 11.98571418, 33.46579216, 31.3901785 , 11.48148761,
       17.06840817, 26.49513931, 26.36747636, 20.38229474, 22.02103995,
       33.42005707, 23.13661476, 18.17279295,  9.4173739 , 12.61895257,
       23.19023472, 19.35284987, 11.03840549,  8.49110001, 13.85800193,
       24.74475012, 19.56912543, 34.71724827, 19.546126  , 17.1864305 ,
       12.09211863, 46.34079361, 33.66209227, 31.02889203, 17.7532168 ,
       22.38903585, 22.26079883, 31.59756823, 27.44452096, 10.55664571,
       14.54123646, 12.4535949 , 14.96719414, 25.79893419, 20.91686069,
       26.07370271, 13.46191179, 31.70967582,  8.40072893, 22.68961252,
       19.59982745, 33.95428374, 15.63574644, 30.94478601, 12.15465243,
       32.47736221, 30.32275141,  3.83572609, 35.64858249, 26.43519448,
       17.77370269, 20.4748173 , 17.64287237, 15.26460571, 23.743317  ,
       17.48966323, 20.14098479, 18.09807856, 33.04764671, 36.30322793,
       24.24972876, 46.78320007, 27.99981426, 15.05427539, 22.35

In [15]:
# Ground-truth targets of the test split, shown for comparison with the predictions above.
y_test

array([20.2, 15.3, 37.3, 32.5,  8.8, 14.4, 22. , 26.6, 15. , 21.5, 29.4,
       24.8, 22. , 16.1, 13.9, 21.6, 21.7, 12.8,  7.2, 12.6, 20.7, 19.3,
       36.5, 17.7, 16.7, 20.2, 50. , 34.6, 35.4, 19.4, 20.8, 21.1, 31.1,
       23.5,  8.3, 15.6, 11.3, 21.7, 23.2, 20.8, 22. , 13.6, 28.7, 10.5,
       23. , 13.8, 36.4, 18.4, 30.1, 17.9, 29.9, 30.7,  5. , 35.4, 27.9,
       18.4, 18.5, 17.5, 15.6, 22.4, 20.3, 20.6, 19.8, 28.2, 35.1, 27.5,
       48.5, 27.5, 11.5, 22. , 13.2,  7.4, 20.6, 20.1, 25.1, 22. , 19.1,
       24.3, 19.4, 23.9, 34.9, 19.4, 21.4, 26.6, 37.6, 36. , 21.4, 23.6,
       24.8, 19.9, 20.9, 18.2, 10.9, 44. , 43.5,  8.3, 46.7, 32.9, 21.7,
       14.3, 29.1, 23.8])

### Pipeline

pipeline 实现了对全部步骤的流式化封装和管理，可以很方便地使参数集在新数据集上被重复使用。

pipeline 可以用于下面几处：
- 模块化 Feature Transform，只需写很少的代码就能将新的 Feature 更新到训练集中
- 自动化 Grid Search，只要预先设定好使用的 Model 和参数的候选，就能自动搜索并记录最佳的 Model
- 自动化 Ensemble Generation，每隔一段时间将现有最好的 K 个 Model 拿来做 Ensemble

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

# Three chained stages: normalize -> degree-2 polynomial expansion -> linear
# regression. The individual estimators stay bound to names so they can be
# inspected after fitting.
# NOTE(review): Normalizer rescales each sample (row) to unit L2 norm, not each
# feature column. If per-feature scaling was the intent, StandardScaler is the
# usual choice -- confirm before relying on this step.
norm = Normalizer()
poly = PolynomialFeatures(degree=2, include_bias=False)
lr = LinearRegression()

pipeline = Pipeline(
    steps=[
        ('norm', norm),
        ('poly', poly),
        ('lr', lr),
    ]
)

# Fitting the pipeline fits every stage in order on the training split only.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('norm', Normalizer(copy=True, norm='l2')), ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('lr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

In [17]:
# Predict from the raw test features; the pipeline applies its fitted
# transform stages before the final regressor.
pipeline.predict(X_test)

array([21.09375, 10.90625, 33.375  , 30.6875 , 11.25   , 11.375  ,
       23.53125, 24.3125 , 11.71875, 22.09375, 32.3125 , 23.28125,
       19.03125, 23.15625, 13.375  , 23.09375, 14.21875, 11.90625,
        7.78125, 14.34375, 24.59375, 20.15625, 33.03125, 19.28125,
       17.75   , 22.40625, 45.40625, 33.34375, 28.28125, 24.40625,
       23.1875 , 21.21875, 35.21875, 29.78125,  9.28125, 13.84375,
       14.125  , 17.03125, 25.5625 , 22.75   , 23.84375, 13.71875,
       30.78125,  6.96875, 23.59375, 19.78125, 32.40625, 17.5625 ,
       30.75   , 10.375  , 29.84375, 28.71875,  0.9375 , 38.3125 ,
       26.0625 , 17.625  , 20.25   , 18.21875, 14.9375 , 23.09375,
       16.28125, 22.46875, 18.59375, 31.3125 , 39.25   , 25.09375,
       47.     , 27.34375, 16.53125, 24.96875, 16.     ,  5.78125,
       15.09375, 17.25   , 26.90625, 19.53125, 19.34375, 22.15625,
       23.09375, 24.40625, 34.71875, 19.03125, 17.84375, 31.78125,
       41.71875, 33.03125, 21.9375 , 25.4375 , 27.375  , 18.28

In [18]:
# Test-set R^2 of the whole pipeline, evaluated directly on the raw test features.
pipeline.score(X_test, y_test)

0.8902595155740898