# 09 scikit-learn中的回归问题

使用 kNN 相关的分类或回归算法时, 一定要先对数据做预处理 (如标准化): kNN 依赖样本之间的距离, 量纲较大的特征会主导距离计算<br>
使用线性回归相关的算法时最好也做预处理, 防止训练得到的某些参数过大或过小

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the Boston housing data set shipped with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — this cell needs an older scikit-learn or a different data source.
boston = datasets.load_boston()

# Build the row mask once so features and targets are filtered identically.
# Only samples whose target is strictly below 50.0 are kept
# (presumably to drop values clipped at the upper bound — TODO confirm).
keep = boston.target < 50.0
X = boston.data[keep]
y = boston.target[keep]

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

#### scikit-learn中的线性回归

In [4]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the raw (unscaled) training features;
# fit() returns the estimator itself, so it can be chained.
reg = LinearRegression().fit(X_train, y_train)

# Show the learned per-feature coefficients and the intercept.
print(reg.coef_)
print(reg.intercept_)

# Last expression of the cell: the model's score (R^2) on the test split.
reg.score(X_test, y_test)

[ -1.18919477e-01   3.63991462e-02  -3.56494193e-02   5.66737830e-02
  -1.16195486e+01   3.42022185e+00  -2.31470282e-02  -1.19509560e+00
   2.59339091e-01  -1.40112724e-02  -8.36521175e-01   7.92283639e-03
  -3.81966137e-01]
34.1614354962


0.81298026026584935

#### kNN Regressor

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so that no statistics from the
# test split leak into preprocessing.
std = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_standard, X_test_standard = map(std.transform, (X_train, X_test))

In [8]:
from sklearn.neighbors import KNeighborsRegressor

# kNN regressor with default hyper-parameters, trained on the standardized
# features. Note: this rebinds `reg`, replacing the linear model from above.
reg = KNeighborsRegressor().fit(X_train_standard, y_train)

# Score on the standardized test split; the bare name displays it.
score = reg.score(X_test_standard, y_test)
score

0.84664511530389497

#### grid search

In [10]:
# Hyper-parameter grid for KNeighborsRegressor, given as two sub-grids:
#   1. uniform weighting: only the neighbour count (1..10) is varied;
#   2. distance weighting: neighbour count (1..10) combined with the
#      Minkowski power parameter p (1..5).
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

In [11]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters will be searched.
knn_reg = KNeighborsRegressor()

# Exhaustive cross-validated search over param_grid; every other
# GridSearchCV option is left at its default.
grid_search = GridSearchCV(estimator=knn_reg, param_grid=param_grid)

In [12]:
%%time
# Run the grid search on the standardized training data; the %%time cell
# magic reports how long the whole cell takes to execute.
grid_search.fit(X_train_standard, y_train)

CPU times: user 3.8 s, sys: 4.16 ms, total: 3.81 s
Wall time: 3.81 s


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
# Estimator found by the search with the best parameter combination.
best_knn = grid_search.best_estimator_

# Print, one per line: the winning parameters, the cross-validation score the
# search recorded for them, and the score on the held-out test split.
# The last two are computed on different data, so they need not agree.
print(
    grid_search.best_params_,
    grid_search.best_score_,
    best_knn.score(X_test_standard, y_test),
    sep="\n",
)

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.79917999891
0.880996650994
