In [1]:
# Load the Boston housing dataset bundled with scikit-learn. The returned object
# exposes .DESCR, .data and .target, which the following cells read.
# NOTE(review): load_boston was deprecated and later removed from newer
# scikit-learn releases — confirm the installed version before re-running.
from sklearn.datasets import load_boston
boston = load_boston()

In [3]:
# Print the description text shipped with the dataset (instance count, attribute list).
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [4]:
# Feature matrix; it is passed to train_test_split as X further down
# (presumably 506 rows x 13 columns, going by the description above — TODO confirm).
# NOTE(review): this echoes the array repr; `boston.data.shape` or a small slice
# would be a lighter sanity check.
boston.data

array([[  6.32000000e-03,   1.80000000e+01,   2.31000000e+00, ...,
          1.53000000e+01,   3.96900000e+02,   4.98000000e+00],
       [  2.73100000e-02,   0.00000000e+00,   7.07000000e+00, ...,
          1.78000000e+01,   3.96900000e+02,   9.14000000e+00],
       [  2.72900000e-02,   0.00000000e+00,   7.07000000e+00, ...,
          1.78000000e+01,   3.92830000e+02,   4.03000000e+00],
       ..., 
       [  6.07600000e-02,   0.00000000e+00,   1.19300000e+01, ...,
          2.10000000e+01,   3.96900000e+02,   5.64000000e+00],
       [  1.09590000e-01,   0.00000000e+00,   1.19300000e+01, ...,
          2.10000000e+01,   3.93450000e+02,   6.48000000e+00],
       [  4.74100000e-02,   0.00000000e+00,   1.19300000e+01, ...,
          2.10000000e+01,   3.96900000e+02,   7.88000000e+00]])

In [5]:
# Regression target; it is passed to train_test_split as y further down.
# The dataset description calls the median value the usual target.
boston.target

array([ 24. ,  21.6,  34.7,  33.4,  36.2,  28.7,  22.9,  27.1,  16.5,
        18.9,  15. ,  18.9,  21.7,  20.4,  18.2,  19.9,  23.1,  17.5,
        20.2,  18.2,  13.6,  19.6,  15.2,  14.5,  15.6,  13.9,  16.6,
        14.8,  18.4,  21. ,  12.7,  14.5,  13.2,  13.1,  13.5,  18.9,
        20. ,  21. ,  24.7,  30.8,  34.9,  26.6,  25.3,  24.7,  21.2,
        19.3,  20. ,  16.6,  14.4,  19.4,  19.7,  20.5,  25. ,  23.4,
        18.9,  35.4,  24.7,  31.6,  23.3,  19.6,  18.7,  16. ,  22.2,
        25. ,  33. ,  23.5,  19.4,  22. ,  17.4,  20.9,  24.2,  21.7,
        22.8,  23.4,  24.1,  21.4,  20. ,  20.8,  21.2,  20.3,  28. ,
        23.9,  24.8,  22.9,  23.9,  26.6,  22.5,  22.2,  23.6,  28.7,
        22.6,  22. ,  22.9,  25. ,  20.6,  28.4,  21.4,  38.7,  43.8,
        33.2,  27.5,  26.5,  18.6,  19.3,  20.1,  19.5,  19.5,  20.4,
        19.8,  19.4,  21.7,  22.8,  18.8,  18.7,  18.5,  18.3,  21.2,
        19.2,  20.4,  19.3,  22. ,  20.3,  20.5,  17.3,  18.8,  21.4,
        15.7,  16.2,

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split (and therefore every score below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data,
    boston.target,
    test_size=0.2,
    random_state=2,
)

In [8]:
# Baseline: plain linear regression with default settings.
model = LinearRegression()

In [9]:
# Fit the baseline on the training split only.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
# Evaluate on the held-out test split. For scikit-learn regressors `score`
# is the coefficient of determination (R^2).
model.score(X_test, y_test)

0.77872098747725604

### 加归一化

In [11]:
# Same estimator with normalize=True, to compare against the baseline above.
# NOTE(review): the `normalize` argument was deprecated and later removed in
# newer scikit-learn releases — confirm the installed version before re-running.
model2 = LinearRegression(normalize=True)

In [12]:
# Fit the normalized variant on the same training split as the baseline.
model2.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [14]:
# Test-set score of the normalized variant. The recorded value agrees with the
# baseline score to about 15 significant digits, i.e. normalize=True made no
# practical difference for this model.
model2.score(X_test, y_test)

0.77872098747725804

### 加多项式

In [15]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the inputs with all degree-2 polynomial and interaction terms.
# include_bias=False leaves out the constant column; the regressors used here
# are created with fit_intercept=True and supply their own intercept.
poly = PolynomialFeatures(degree=2, include_bias=False)
# Fit the transformer on the training split only ...
X_train_poly = poly.fit_transform(X_train)
# ... and apply that already-fitted transformer to the test split. Calling
# fit_transform here would re-fit on test data, which is the pattern that
# leaks test information for any transformer with learned state.
X_test_poly = poly.transform(X_test)

In [16]:
# Linear regression for the polynomial-expanded features.
# NOTE(review): the `normalize` argument was deprecated and later removed in
# newer scikit-learn releases — confirm the installed version before re-running.
model3 = LinearRegression(normalize=True)

In [17]:
# Fit on the polynomial-expanded training features.
model3.fit(X_train_poly, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [18]:
# Test-set score with degree-2 features. The recorded value (~0.896) is higher
# than the ~0.779 recorded for the purely linear models above.
model3.score(X_test_poly, y_test)

0.895848854203947

In [19]:
# Predictions for the test rows, for eyeballing against y_test in the next cell.
model3.predict(X_test_poly)

array([ 22.69315395,  12.05316222,  33.45746911,  31.40322437,
        11.45943167,  17.1115066 ,  26.50886073,  26.32952869,
        20.97102916,  22.0126233 ,  33.41178319,  23.15153042,
        18.18980991,   9.44730055,  12.6040278 ,  23.18526185,
        19.38844774,  11.0372229 ,   8.27540467,  13.80799823,
        24.72435334,  19.56993779,  34.74440305,  19.61728349,
        17.11276174,  12.08260264,  46.37072396,  33.66053914,
        31.00765701,  17.77027627,  22.38140407,  22.24908691,
        31.59548745,  27.43100605,  10.52151736,  14.52958168,
        12.46671464,  14.98178436,  25.7838815 ,  20.88071718,
        26.07338594,  13.46826838,  31.70888865,   8.38304952,
        22.67834795,  19.61727441,  33.9722124 ,  15.68124546,
        30.95396322,  12.14518427,  32.48006639,  30.35667411,
         3.73102674,  35.63463125,  26.42368095,  17.75861171,
        20.47507635,  17.64098518,  15.260486  ,  23.72419513,
        17.45607845,  20.10773957,  18.09023325,  33.06

In [20]:
# True target values for the test split, in the same row order as the predictions.
y_test

array([ 20.2,  15.3,  37.3,  32.5,   8.8,  14.4,  22. ,  26.6,  15. ,
        21.5,  29.4,  24.8,  22. ,  16.1,  13.9,  21.6,  21.7,  12.8,
         7.2,  12.6,  20.7,  19.3,  36.5,  17.7,  16.7,  20.2,  50. ,
        34.6,  35.4,  19.4,  20.8,  21.1,  31.1,  23.5,   8.3,  15.6,
        11.3,  21.7,  23.2,  20.8,  22. ,  13.6,  28.7,  10.5,  23. ,
        13.8,  36.4,  18.4,  30.1,  17.9,  29.9,  30.7,   5. ,  35.4,
        27.9,  18.4,  18.5,  17.5,  15.6,  22.4,  20.3,  20.6,  19.8,
        28.2,  35.1,  27.5,  48.5,  27.5,  11.5,  22. ,  13.2,   7.4,
        20.6,  20.1,  25.1,  22. ,  19.1,  24.3,  19.4,  23.9,  34.9,
        19.4,  21.4,  26.6,  37.6,  36. ,  21.4,  23.6,  24.8,  19.9,
        20.9,  18.2,  10.9,  44. ,  43.5,   8.3,  46.7,  32.9,  21.7,
        14.3,  29.1,  23.8])

### Pipeline

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

# Chain preprocessing and regression into one estimator so that fit/predict/score
# apply every step consistently to whatever data they are given.
# NOTE(review): Normalizer rescales each sample (row) to unit L2 norm rather than
# standardizing each feature column — confirm that this is the intended
# "normalization" and not StandardScaler.
norm = Normalizer()
# Note: this rebinds `poly`, replacing the transformer created in the earlier
# polynomial-features cell.
poly = PolynomialFeatures(degree=2, include_bias=False)
lr = LinearRegression()
pipeline = Pipeline(
    steps=[
        ('norm', norm),
        ('poly', poly),
        ('lr', lr),
    ]
)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('norm', Normalizer(copy=True, norm='l2')), ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('lr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [27]:
# The pipeline takes the raw test features; its preprocessing steps run
# internally before the final regressor predicts.
pipeline.predict(X_test)

array([ 21.15625,  10.875  ,  33.59375,  30.65625,  11.1875 ,  11.28125,
        23.53125,  24.28125,  11.84375,  21.96875,  32.40625,  23.28125,
        18.96875,  23.34375,  13.5    ,  23.03125,  14.1875 ,  11.9375 ,
         7.71875,  14.46875,  24.5625 ,  20.03125,  33.03125,  19.28125,
        17.84375,  22.46875,  45.34375,  33.34375,  28.34375,  24.4375 ,
        23.1875 ,  21.21875,  35.34375,  29.75   ,   9.21875,  13.78125,
        14.09375,  17.0625 ,  25.59375,  22.65625,  23.84375,  13.71875,
        30.78125,   6.9375 ,  23.65625,  19.8125 ,  32.40625,  17.5625 ,
        30.78125,   9.9375 ,  29.84375,  28.71875,   1.     ,  38.46875,
        26.09375,  17.53125,  20.15625,  18.1875 ,  14.84375,  23.     ,
        16.28125,  22.53125,  18.59375,  31.40625,  39.5    ,  25.     ,
        47.15625,  27.375  ,  16.53125,  25.03125,  15.96875,   5.78125,
        15.15625,  17.21875,  26.84375,  19.59375,  19.28125,  22.21875,
        22.96875,  24.46875,  34.84375,  18.90625, 

In [28]:
# Test-set score of the full pipeline. The recorded value (~0.889) is slightly
# below the ~0.896 recorded for the manual polynomial model above.
pipeline.score(X_test, y_test)

0.88901890070242562