In [45]:
import numpy as np
import pandas as pd 
import os
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR,LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
import warnings

# Install a global "ignore" filter: every warning raised after this line is
# suppressed for the rest of the session.
# NOTE(review): this also hides library deprecation warnings, so outdated
# constructor arguments used further down will not be reported — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [46]:
data = pd.read_csv('./housing.csv')

# Bucket median income into five strata so the train/test split preserves the
# income distribution. The helper column stays on `data` but is dropped from
# both splits below.
data['data_income'] = pd.cut(data['median_income'],
                             bins=[0, 1.5, 3, 4.5, 6, np.inf],
                             labels=[1, 2, 3, 4, 5])

s = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train, test = next(s.split(data, data['data_income']))

# split() returns *positional* indices, so select rows with .iloc; .loc would
# silently pick the wrong rows if `data` ever had a non-default index.
# drop() without inplace returns new frames instead of mutating slices of `data`.
train_set = data.iloc[train].drop(columns='data_income')
test_set = data.iloc[test].drop(columns='data_income')

# Training features / target.
train_x = train_set.drop('median_house_value', axis=1)
y_train = train_set['median_house_value'].copy()
housing_labels = y_train.copy()

# Held-out features / target.
x_test = test_set.drop('median_house_value', axis=1)
test_x = x_test.copy()
test_y = test_set["median_house_value"].copy()

In [47]:
# add_extra_features() indexes into the array produced by the numeric
# pipeline, which contains every column of train_x except 'ocean_proximity'.
# Look the positions up in that numeric column order (not in train_x itself)
# so they stay correct regardless of where the categorical column sits.
numeric_columns = [col for col in train_x.columns if col != "ocean_proximity"]
rooms_ix, bedrooms_ix, population_ix, household_ix = [
    numeric_columns.index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

In [48]:
def add_extra_features(X, add_bedrooms_per_room=True):
    """Append ratio columns to a 2-D array and return the widened array.

    Column positions are taken from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix``.

    Args:
        X: 2-D array indexed as ``X[:, column]``.
        add_bedrooms_per_room: when true, a third ratio column
            (bedrooms / rooms) is appended after the other two.

    Returns:
        ``X`` followed by rooms-per-household, population-per-household and,
        optionally, bedrooms-per-room, stacked column-wise.
    """
    rooms = X[:, rooms_ix]
    households = X[:, household_ix]

    derived = [rooms / households, X[:, population_ix] / households]
    if add_bedrooms_per_room:
        derived.append(X[:, bedrooms_ix] / rooms)

    # np.c_ receives the same tuple it would get from np.c_[X, a, b, ...].
    return np.c_[(X, *derived)]

In [49]:
class OldDataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks named columns out of a DataFrame.

    ``transform`` returns the ``.values`` of the selected columns, so the
    steps that follow receive a plain array rather than a DataFrame.
    """

    def __init__(self, attribute_names):
        # Stored under the same name as the constructor argument.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, X):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

In [50]:
# Numeric attributes are every feature column except the categorical one.
train_num_x = train_x.drop('ocean_proximity', axis=1)
num_attribs = list(train_num_x)
cat_attribs = ["ocean_proximity"]

# Numeric branch: select columns -> fill missing values with the median ->
# append the ratio features -> standardise.
num_pipeline = Pipeline([
    ('selector', OldDataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
    ('std_scaler', StandardScaler()),
])

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4. Try the current
# name first and fall back to the legacy one so the cell runs on either.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

# Categorical branch: select the column -> dense one-hot encoding.
cat_pipeline = Pipeline([
    ('selector', OldDataFrameSelector(cat_attribs)),
    ('cat_encoder', cat_encoder),
])

# Run both branches and concatenate their outputs side by side.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

# Fit the preprocessing on the training features only.
housing_prepared = full_pipeline.fit_transform(train_x)

In [51]:
# Push the held-out features through the already-fitted preprocessing
# pipeline; x_prepared is reused by the model cells that follow.
x_prepared = full_pipeline.transform(test_x)

# Baseline: ordinary least squares, scored on the held-out set.
lin_reg = LinearRegression().fit(housing_prepared, housing_labels)
score = lin_reg.score(x_prepared, test_y)
print("线性回归：", score)

线性回归： 0.6616450065920737


In [52]:
# Single decision tree with a fixed seed, scored on the held-out set.
tree_reg = DecisionTreeRegressor(random_state=42).fit(housing_prepared,
                                                      housing_labels)
score = tree_reg.score(x_prepared, test_y)
print("决策树：", score)

决策树： 0.632637545638831


In [53]:
# RBF-kernel support vector regressor with C=100000, scored on the
# held-out set.
svr = SVR(C=100000, kernel="rbf").fit(housing_prepared, housing_labels)
score = svr.score(x_prepared, test_y)
print("SVM:", score)

SVM: 0.7661342560803278


In [54]:
# Random forest limited to depth 50, scored on the held-out set.
# The criterion is left at its default (squared error): the explicit "mse"
# spelling was renamed to "squared_error" in scikit-learn 1.0 and removed in
# 1.2, so omitting it keeps the same behaviour on old and new releases.
# random_state is fixed so a re-run reproduces the same score.
rf = RandomForestRegressor(max_depth=50, random_state=42)
rf.fit(housing_prepared, housing_labels)
score = rf.score(x_prepared, test_y)
print("随机森林：", score)

随机森林： 0.8006842751975323


In [55]:
# AdaBoost over decision trees, scored on the held-out set.
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4), while the first positional slot is the base learner in both.
# Both the trees and the booster are seeded so a re-run reproduces the score.
boost = AdaBoostRegressor(DecisionTreeRegressor(random_state=42),
                          random_state=42)
boost.fit(housing_prepared, housing_labels)
score = boost.score(x_prepared, test_y)
print("boost:", score)

boost: 0.8248789722895387


## 网格搜索

In [56]:
from sklearn.model_selection import GridSearchCV

In [57]:
# Two families of random-forest candidates are searched:
#   * 3 x 4 combinations with `bootstrap` left at the estimator's default
#   * 2 x 3 combinations with bootstrapping switched off
param_grid = [
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

forest_reg = RandomForestRegressor(random_state=42)

# 5-fold cross-validation; candidates are ranked by negated MSE.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)
print("best_params", grid_search.best_params_)
# For a per-candidate RMSE, pair grid_search.cv_results_["mean_test_score"]
# with cv_results_["params"] and take np.sqrt(-mean_score) of each entry.

best_params {'max_features': 8, 'n_estimators': 30}
