In [32]:
import os

In [33]:
import tarfile

In [34]:
import urllib

In [35]:
# Base URL under which the dataset archive is hosted.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory (relative to the working directory) used for the housing files.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed housing archive.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [36]:
import pandas as pd

In [37]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and the extracted files;
        it is created if it does not exist yet.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this with archives from a trusted source.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [38]:
import pandas as pd

In [39]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [40]:
housing= load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [41]:
import sys
assert sys.version_info >= (3, 5)

In [42]:
import sklearn
# Compare the version numerically: a plain string comparison is lexicographic,
# so e.g. "0.9" >= "0.20" would wrongly pass.
import re
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())) >= (0, 20)


In [43]:
import numpy as np
import os

In [44]:
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [45]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [46]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [64]:
import numpy as np

# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; rows are selected positionally with ``.iloc``.
    test_ratio : float
        Fraction of rows assigned to the test set. The test set holds
        ``int(len(data) * test_ratio)`` rows.
    random_state : int or None, optional
        Seed for a private random generator, making the split reproducible.
        When ``None`` (the default) the global NumPy random state is used,
        which is the original behavior.

    Returns
    -------
    tuple
        ``(train_set, test_set)`` as two DataFrames.
    """
    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        # A dedicated generator keeps the split independent of global state.
        shuffled_indices = np.random.RandomState(random_state).permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [65]:
train_set, test_set = split_train_test(housing, 0.2)
len(train_set)

16512

In [66]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [67]:
test_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.8,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.725,278000.0,NEAR OCEAN


In [69]:
# Bin median_income into five categories labelled 1-5; the result is used as
# the stratification key for the train/test split below.
# NOTE: this adds the column to `housing` in place, and no later cell drops it,
# so `income_cat` stays in the frames derived from `housing`.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [70]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split keyed on the income category.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so select rows with .iloc; the
# label-based .loc only gives the same rows while the index is 0..n-1.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [71]:
# From here on `housing` is rebound to the stratified training features only;
# the target column is split off into `housing_labels`.
housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()

In [72]:
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat
4629,-118.3,34.07,18.0,3759.0,,3296.0,1462.0,2.2708,<1H OCEAN,2
6068,-117.86,34.01,16.0,4632.0,,3038.0,727.0,5.1762,<1H OCEAN,4
17923,-121.97,37.35,30.0,1955.0,,999.0,386.0,4.6328,<1H OCEAN,4
13656,-117.3,34.05,6.0,2155.0,,1039.0,391.0,1.6675,INLAND,2
19252,-122.79,38.48,7.0,6837.0,,3468.0,1405.0,3.1662,<1H OCEAN,3


In [73]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

In [74]:
# Everything except the text column; this is the frame the median imputer is fitted on.
housing_num = housing.drop("ocean_proximity", axis=1)
# alternatively: housing_num = housing.select_dtypes(include=[np.number])
# NOTE(review): `income_cat` (built with pd.cut) is still present in `housing`;
# the select_dtypes alternative would presumably exclude it because it is
# categorical, so the two forms are likely not equivalent here -- confirm.

In [75]:
imputer.fit(housing_num)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [76]:
X = imputer.transform(housing_num)

In [77]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing.index)

In [78]:
housing_tr.loc[sample_incomplete_rows.index.values]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat
4629,-118.3,34.07,18.0,3759.0,433.0,3296.0,1462.0,2.2708,2.0
6068,-117.86,34.01,16.0,4632.0,433.0,3038.0,727.0,5.1762,4.0
17923,-121.97,37.35,30.0,1955.0,433.0,999.0,386.0,4.6328,4.0
13656,-117.3,34.05,6.0,2155.0,433.0,1039.0,391.0,1.6675,2.0
19252,-122.79,38.48,7.0,6837.0,433.0,3468.0,1405.0,3.1662,3.0


In [79]:
# Rebuilds `housing_tr` from the imputed array `X`. `housing_num` is `housing`
# minus one column, so its index is the same one used when `housing_tr` was
# first built above.
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)

In [80]:
housing_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,2.0
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,5.0
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,2.0
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,2.0
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,3.0


In [81]:
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

Unnamed: 0,ocean_proximity
17606,<1H OCEAN
18632,<1H OCEAN
14650,NEAR OCEAN
3230,INLAND
3555,<1H OCEAN
19480,INLAND
8879,<1H OCEAN
13685,INLAND
4937,<1H OCEAN
4861,<1H OCEAN


In [82]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

array([[0.],
       [0.],
       [4.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [83]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [84]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional column indices read by CombinedAttributesAdder.transform.
# Hard-coded here; a later cell recomputes them from the column names.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns of a 2-D array.

    Rooms-per-household and population-per-household are always appended;
    bedrooms-per-room is appended as well when ``add_bedrooms_per_room`` is
    true. Column positions are read from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix``.
    """

    def __init__(self, add_bedrooms_per_room=True):  # keep explicit keyword args only
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # Indexing np.c_ with a tuple is the same as writing np.c_[X, a, b, ...].
        return np.c_[tuple([X] + extra_columns)]

# transform() is called directly without fit(); fit() on this class does nothing.
# `housing.values` carries every column of `housing`, including the text column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [87]:
# Look the column positions up by name instead of hard-coding them.
# NOTE: this rebinds the globals read by CombinedAttributesAdder.transform, but
# it runs after the transformer has already been applied in the cell above.
col_names = "total_rooms", "total_bedrooms", "population", "households"
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    housing.columns.get_loc(c) for c in col_names] # get the column indices

In [88]:
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns)+["rooms_per_household", "population_per_household"],
    index=housing.index)
housing_extra_attribs.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat,rooms_per_household,population_per_household
17606,-121.89,37.29,38,1568,351,710,339,2.7042,<1H OCEAN,2,4.62537,2.0944
18632,-121.93,37.05,14,679,108,306,113,6.4214,<1H OCEAN,5,6.00885,2.70796
14650,-117.2,32.77,31,1952,471,936,462,2.8621,NEAR OCEAN,2,4.22511,2.02597
3230,-119.61,36.31,25,1847,371,1460,353,1.8839,INLAND,2,5.23229,4.13598
3555,-118.59,34.23,17,6592,1525,4459,1463,3.0347,<1H OCEAN,3,4.50581,3.04785


In [89]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing steps, applied in order: median imputation, the
# ratio-feature adder defined above, then StandardScaler.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [90]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the ColumnTransformer.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# `num_pipeline` handles the numeric columns, a one-hot encoder handles the
# text column; ColumnTransformer joins the two outputs side by side.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

In [91]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
class OldDataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed set of DataFrame columns and pass them on via ``.values``."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to fit; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` as ``X[...].values``."""
        selected = X[self.attribute_names]
        return selected.values

In [92]:

# Same column lists as used for the ColumnTransformer above.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Older approach: each branch first selects its own columns from the DataFrame.
old_num_pipeline = Pipeline([
        ('selector', OldDataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

# NOTE(review): newer scikit-learn releases appear to rename the `sparse`
# keyword to `sparse_output` -- confirm against the installed version.
old_cat_pipeline = Pipeline([
        ('selector', OldDataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

In [93]:
from sklearn.pipeline import FeatureUnion

old_full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", old_num_pipeline),
        ("cat_pipeline", old_cat_pipeline),
    ])

In [94]:
old_housing_prepared = old_full_pipeline.fit_transform(housing)
old_housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

In [98]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

In [102]:
# 1. Try a Support Vector Machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Don’t worry about what these hyperparameters mean for now. How does the best SVR predictor perform?

from sklearn.model_selection import GridSearchCV

# 7 linear candidates + 6 x 5 = 30 RBF candidates = 37 candidates;
# with cv=5 that is 185 fits (about 49 minutes in the recorded run).
param_grid = [
        {'kernel': ['linear'], 'C': [5., 10., 50., 100., 1000., 5000., 10000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 5.0, 10., 50., 500., 1000.0],
         'gamma': [0.01, 0.05, 0.1, 0.5, 1.0]},
    ]

svm_reg = SVR()
# Scored with negative MSE; verbose=2 prints one line per fit.
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 37 candidates, totalling 185 fits
[CV] C=5.0, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................. C=5.0, kernel=linear, total=  12.6s
[CV] C=5.0, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.5s remaining:    0.0s


[CV] ............................. C=5.0, kernel=linear, total=  12.3s
[CV] C=5.0, kernel=linear ............................................
[CV] ............................. C=5.0, kernel=linear, total=  11.1s
[CV] C=5.0, kernel=linear ............................................
[CV] ............................. C=5.0, kernel=linear, total=  10.7s
[CV] C=5.0, kernel=linear ............................................
[CV] ............................. C=5.0, kernel=linear, total=   9.4s
[CV] C=10.0, kernel=linear ...........................................
[CV] ............................ C=10.0, kernel=linear, total=   9.0s
[CV] C=10.0, kernel=linear ...........................................
[CV] ............................ C=10.0, kernel=linear, total=   9.5s
[CV] C=10.0, kernel=linear ...........................................
[CV] ............................ C=10.0, kernel=linear, total=   9.2s
[CV] C=10.0, kernel=linear ...........................................
[CV] .

[CV] ..................... C=1.0, gamma=1.0, kernel=rbf, total=  15.9s
[CV] C=5.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=5.0, gamma=0.01, kernel=rbf, total=  17.5s
[CV] C=5.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=5.0, gamma=0.01, kernel=rbf, total=  17.8s
[CV] C=5.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=5.0, gamma=0.01, kernel=rbf, total=  15.9s
[CV] C=5.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=5.0, gamma=0.01, kernel=rbf, total=  16.5s
[CV] C=5.0, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=5.0, gamma=0.01, kernel=rbf, total=  15.3s
[CV] C=5.0, gamma=0.05, kernel=rbf ...................................
[CV] .................... C=5.0, gamma=0.05, kernel=rbf, total=  16.6s
[CV] C=5.0, gamma=0.05, kernel=rbf ...................................
[CV] .

[CV] ................... C=50.0, gamma=0.05, kernel=rbf, total=  16.4s
[CV] C=50.0, gamma=0.05, kernel=rbf ..................................
[CV] ................... C=50.0, gamma=0.05, kernel=rbf, total=  16.3s
[CV] C=50.0, gamma=0.05, kernel=rbf ..................................
[CV] ................... C=50.0, gamma=0.05, kernel=rbf, total=  17.7s
[CV] C=50.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=50.0, gamma=0.1, kernel=rbf, total=  16.1s
[CV] C=50.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=50.0, gamma=0.1, kernel=rbf, total=  16.6s
[CV] C=50.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=50.0, gamma=0.1, kernel=rbf, total=  15.5s
[CV] C=50.0, gamma=0.1, kernel=rbf ...................................
[CV] .................... C=50.0, gamma=0.1, kernel=rbf, total=  16.0s
[CV] C=50.0, gamma=0.1, kernel=rbf ...................................
[CV] .

[CV] .................. C=1000.0, gamma=0.5, kernel=rbf, total=  17.8s
[CV] C=1000.0, gamma=0.5, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.5, kernel=rbf, total=  17.4s
[CV] C=1000.0, gamma=0.5, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.5, kernel=rbf, total=  17.6s
[CV] C=1000.0, gamma=0.5, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.5, kernel=rbf, total=  17.5s
[CV] C=1000.0, gamma=0.5, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=0.5, kernel=rbf, total=  16.6s
[CV] C=1000.0, gamma=1.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total=  16.7s
[CV] C=1000.0, gamma=1.0, kernel=rbf .................................
[CV] .................. C=1000.0, gamma=1.0, kernel=rbf, total=  16.7s
[CV] C=1000.0, gamma=1.0, kernel=rbf .................................
[CV] .

[Parallel(n_jobs=1)]: Done 185 out of 185 | elapsed: 49.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [5.0, 10.0, 50.0, 100.0, 1000.0, 5000.0,
                                10000.0],
                          'kernel': ['linear']},
                         {'C': [1.0, 5.0, 10.0, 50.0, 500.0, 1000.0],
                          'gamma': [0.01, 0.05, 0.1, 0.5, 1.0],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=2)

In [103]:
grid_search.best_params_

{'C': 1000.0, 'kernel': 'linear'}

In [104]:
# 2. Try replacing GridSearchCV with RandomizedSearchCV.

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal
# Sampling distributions: C from scipy's `reciprocal` over [10, 100000],
# gamma from an exponential distribution with scale 1.0.
# gamma is drawn for every candidate, including linear-kernel ones (see the log).
param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(10, 100000),
        'gamma': expon(scale=1.0),
    }

svm_reg = SVR()
# 50 sampled candidates x 5 folds = 250 fits (about 127 minutes in the recorded
# run); random_state fixes the sampled candidates.
rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                verbose=2, random_state=42)
rnd_search.fit(housing_prepared, housing_labels)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=314.891164795686, gamma=3.010121430917521, kernel=linear ......


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=314.891164795686, gamma=3.010121430917521, kernel=linear, total=   9.8s
[CV] C=314.891164795686, gamma=3.010121430917521, kernel=linear ......


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.7s remaining:    0.0s


[CV]  C=314.891164795686, gamma=3.010121430917521, kernel=linear, total=   8.5s
[CV] C=314.891164795686, gamma=3.010121430917521, kernel=linear ......
[CV]  C=314.891164795686, gamma=3.010121430917521, kernel=linear, total=   8.7s
[CV] C=314.891164795686, gamma=3.010121430917521, kernel=linear ......
[CV]  C=314.891164795686, gamma=3.010121430917521, kernel=linear, total=   8.1s
[CV] C=314.891164795686, gamma=3.010121430917521, kernel=linear ......
[CV]  C=314.891164795686, gamma=3.010121430917521, kernel=linear, total=   8.1s
[CV] C=13145.103232150108, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=13145.103232150108, gamma=0.9084469696321253, kernel=rbf, total=  13.1s
[CV] C=13145.103232150108, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=13145.103232150108, gamma=0.9084469696321253, kernel=rbf, total=  13.1s
[CV] C=13145.103232150108, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=13145.103232150108, gamma=0.9084469696321253, kernel=rbf, total=  12.7s
[CV] C=13145.1

[CV]  C=85688.69785189001, gamma=0.628789100540856, kernel=linear, total=  34.9s
[CV] C=85688.69785189001, gamma=0.628789100540856, kernel=linear .....
[CV]  C=85688.69785189001, gamma=0.628789100540856, kernel=linear, total=  41.6s
[CV] C=85688.69785189001, gamma=0.628789100540856, kernel=linear .....
[CV]  C=85688.69785189001, gamma=0.628789100540856, kernel=linear, total=  39.7s
[CV] C=85688.69785189001, gamma=0.628789100540856, kernel=linear .....
[CV]  C=85688.69785189001, gamma=0.628789100540856, kernel=linear, total=  39.5s
[CV] C=85688.69785189001, gamma=0.628789100540856, kernel=linear .....
[CV]  C=85688.69785189001, gamma=0.628789100540856, kernel=linear, total=  30.1s
[CV] C=2692.6469100861773, gamma=0.18696125197741642, kernel=linear ..
[CV]  C=2692.6469100861773, gamma=0.18696125197741642, kernel=linear, total=   9.3s
[CV] C=2692.6469100861773, gamma=0.18696125197741642, kernel=linear ..
[CV]  C=2692.6469100861773, gamma=0.18696125197741642, kernel=linear, total=  10.1s
[

[CV]  C=380.71583792493914, gamma=2.6126336514161914, kernel=linear, total=  10.8s
[CV] C=48696.409415208975, gamma=0.09265545895311562, kernel=linear ..
[CV]  C=48696.409415208975, gamma=0.09265545895311562, kernel=linear, total=  23.0s
[CV] C=48696.409415208975, gamma=0.09265545895311562, kernel=linear ..
[CV]  C=48696.409415208975, gamma=0.09265545895311562, kernel=linear, total=  23.9s
[CV] C=48696.409415208975, gamma=0.09265545895311562, kernel=linear ..
[CV]  C=48696.409415208975, gamma=0.09265545895311562, kernel=linear, total=  24.8s
[CV] C=48696.409415208975, gamma=0.09265545895311562, kernel=linear ..
[CV]  C=48696.409415208975, gamma=0.09265545895311562, kernel=linear, total=  24.3s
[CV] C=48696.409415208975, gamma=0.09265545895311562, kernel=linear ..
[CV]  C=48696.409415208975, gamma=0.09265545895311562, kernel=linear, total=  20.2s
[CV] C=1211.5379992469582, gamma=3.248614270240346, kernel=linear ....
[CV]  C=1211.5379992469582, gamma=3.248614270240346, kernel=linear, tot

[CV]  C=28340.904295147724, gamma=0.9763011917123741, kernel=rbf, total=  15.7s
[CV] C=28340.904295147724, gamma=0.9763011917123741, kernel=rbf ......
[CV]  C=28340.904295147724, gamma=0.9763011917123741, kernel=rbf, total=  15.6s
[CV] C=24.07911195464457, gamma=0.4633351167983427, kernel=rbf .......
[CV]  C=24.07911195464457, gamma=0.4633351167983427, kernel=rbf, total=  11.9s
[CV] C=24.07911195464457, gamma=0.4633351167983427, kernel=rbf .......
[CV]  C=24.07911195464457, gamma=0.4633351167983427, kernel=rbf, total=  11.8s
[CV] C=24.07911195464457, gamma=0.4633351167983427, kernel=rbf .......
[CV]  C=24.07911195464457, gamma=0.4633351167983427, kernel=rbf, total=  11.8s
[CV] C=24.07911195464457, gamma=0.4633351167983427, kernel=rbf .......
[CV]  C=24.07911195464457, gamma=0.4633351167983427, kernel=rbf, total=  11.5s
[CV] C=24.07911195464457, gamma=0.4633351167983427, kernel=rbf .......
[CV]  C=24.07911195464457, gamma=0.4633351167983427, kernel=rbf, total=  11.9s
[CV] C=199.86340778

[CV]  C=64.90003020716534, gamma=2.8621383676481322, kernel=linear, total=   7.3s
[CV] C=64.90003020716534, gamma=2.8621383676481322, kernel=linear ....
[CV]  C=64.90003020716534, gamma=2.8621383676481322, kernel=linear, total=   7.7s
[CV] C=64.90003020716534, gamma=2.8621383676481322, kernel=linear ....
[CV]  C=64.90003020716534, gamma=2.8621383676481322, kernel=linear, total=   7.7s
[CV] C=144.21346497969486, gamma=0.17580835850006285, kernel=rbf .....
[CV]  C=144.21346497969486, gamma=0.17580835850006285, kernel=rbf, total=  11.7s
[CV] C=144.21346497969486, gamma=0.17580835850006285, kernel=rbf .....
[CV]  C=144.21346497969486, gamma=0.17580835850006285, kernel=rbf, total=  11.4s
[CV] C=144.21346497969486, gamma=0.17580835850006285, kernel=rbf .....
[CV]  C=144.21346497969486, gamma=0.17580835850006285, kernel=rbf, total=  12.7s
[CV] C=144.21346497969486, gamma=0.17580835850006285, kernel=rbf .....
[CV]  C=144.21346497969486, gamma=0.17580835850006285, kernel=rbf, total=  11.9s
[CV]

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 127.3min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, shrinking=True,
                                 tol=0.001, verbose=False),
                   iid='warn', n_iter=50, n_jobs=None,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000291784DB9C8>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000291784DBF48>,
                                        'kernel': ['linear', 'rbf']},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring='neg_mean_squared_error',
                   verbose=2)

In [105]:
# The search was scored with 'neg_mean_squared_error', so negate the best score
# before taking the square root to get the RMSE.
negative_mse = rnd_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

55926.074354996425

In [106]:
rnd_search.best_params_

{'C': 78527.55494724249, 'gamma': 0.26497040005002437, 'kernel': 'rbf'}

In [107]:
# Randomized search found better hyperparameters than grid search, although it took more time to run.


In [117]:
# 3. Try adding a transformer in the preparation pipeline to select only the most important attributes.

from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    """Return the indices of the ``k`` largest values in ``arr``, sorted ascending.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of scores.
    k : int
        Number of top entries to select.

    Returns
    -------
    numpy.ndarray
        Sorted integer indices of the ``k`` largest entries; empty when ``k`` is 0.

    Raises
    ------
    ValueError
        If ``k`` is negative, or (raised by ``np.argpartition``) if ``k``
        exceeds the length of ``arr``.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got %r" % (k,))
    if k == 0:
        # The slice [-0:] is the whole array, so without this guard k=0 would
        # return every index instead of none.
        return np.array([], dtype=np.intp)
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the ``k`` columns with the highest supplied importance scores."""

    def __init__(self, feature_importances, k):
        self.k = k
        self.feature_importances = feature_importances

    def fit(self, X, y=None):
        """Compute and store the column indices to keep; ``X`` and ``y`` are not read."""
        top_indices = indices_of_top_k(self.feature_importances, self.k)
        self.feature_indices_ = top_indices
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected during ``fit``."""
        keep = self.feature_indices_
        return X[:, keep]

In [129]:
k=4

In [121]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [124]:
from sklearn.model_selection import GridSearchCV

# Tune a random forest by grid search; the best estimator found here later
# supplies the feature importances used for top-k feature selection.
param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

# Fresh, unfitted estimator: this rebinding replaces any earlier `forest_reg`.
forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 6, 8],
  

In [126]:
# Display the forest configuration that the grid search ranked best.
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=6, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=30,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [127]:
# Per-column importance scores from the best forest; these are what
# TopFeatureSelector ranks when choosing which columns to keep.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([6.79326113e-02, 6.18280724e-02, 4.33395023e-02, 1.81017027e-02,
       1.83291556e-02, 1.93269892e-02, 1.78369580e-02, 2.41360490e-01,
       1.61976585e-01, 5.35982558e-02, 1.06273526e-01, 6.14045141e-02,
       1.22353255e-02, 1.08821239e-01, 2.76143239e-05, 2.59938294e-03,
       5.00807682e-03])

In [130]:
# Sanity check: the column indices of the k highest-scoring features.
top_k_feature_indices = indices_of_top_k(feature_importances, k)
top_k_feature_indices

array([ 7,  8, 10, 13], dtype=int64)

In [143]:
# Build the preparation + feature-selection pipeline in this cell so it runs on
# a fresh kernel: no visible cell defines `preparation_and_feature_selection_pipeline`
# (presumably its defining cell was deleted — execution counts jump from 130 to 143).
# `Pipeline` and `full_pipeline` are the same names the prediction pipeline below uses.
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

# Prepare the raw housing frame, then keep only the k most important columns.
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)

In [144]:
# Peek at the first three rows of the reduced feature matrix.
housing_prepared_top_k_features[:3]

array([[-0.61493744, -0.95445595, -0.08649871,  0.        ],
       [ 1.33645936,  1.89030518, -0.03353391,  0.        ],
       [-0.5320456 , -0.95445595, -0.09240499,  0.        ]])

In [145]:
# 4. Try creating a single pipeline that does the full data preparation plus the final prediction.

# Single end-to-end estimator: data preparation, top-k feature selection, then
# an SVR configured with the best hyperparameters from the randomized search.
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(**rnd_search.best_params_))
])

In [146]:
# Fit every step in sequence, starting from the unprepared `housing` frame.
prepare_select_and_predict_pipeline.fit(housing, housing_labels)

Pipeline(memory=None,
         steps=[('preparation',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                              

In [147]:
# Spot-check the fitted pipeline on the first four rows.
# NOTE: these rows come from `housing`, the same data the pipeline was fit on,
# so this is a smoke test rather than a held-out evaluation.
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

print("Predictions:\t", prepare_select_and_predict_pipeline.predict(some_data))
print("Labels:\t\t", list(some_labels))

Predictions:	 [195726.35121208 337823.88103247 205620.88698963  60536.94595708]
Labels:		 [286600.0, 340600.0, 196900.0, 46300.0]


In [148]:
# 5. Automatically explore some preparation options using GridSearchCV.

# Search over preparation options: the numeric imputer strategy (3 choices)
# crossed with the number of selected features k (1 .. len(feature_importances)).
# The `step__param` keys address nested steps of the pipeline.
# Note: this rebinds `param_grid`, replacing the random-forest grid defined earlier.
param_grid = [{
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1))
}]

# NOTE(review): the captured log reports 255 fits taking ~68 min on a single
# worker; consider parallelising (n_jobs) or a coarser k grid — confirm first.
grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.fit(housing, housing_labels)

Fitting 5 folds for each of 51 candidates, totalling 255 fits
[CV] feature_selection__k=1, preparation__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  feature_selection__k=1, preparation__num__imputer__strategy=mean, total=   8.2s
[CV] feature_selection__k=1, preparation__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.1s remaining:    0.0s


[CV]  feature_selection__k=1, preparation__num__imputer__strategy=mean, total=   8.5s
[CV] feature_selection__k=1, preparation__num__imputer__strategy=mean 
[CV]  feature_selection__k=1, preparation__num__imputer__strategy=mean, total=   8.5s
[CV] feature_selection__k=1, preparation__num__imputer__strategy=mean 
[CV]  feature_selection__k=1, preparation__num__imputer__strategy=mean, total=   9.5s
[CV] feature_selection__k=1, preparation__num__imputer__strategy=mean 
[CV]  feature_selection__k=1, preparation__num__imputer__strategy=mean, total=  10.0s
[CV] feature_selection__k=1, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=1, preparation__num__imputer__strategy=median, total=   8.9s
[CV] feature_selection__k=1, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=1, preparation__num__imputer__strategy=median, total=   9.1s
[CV] feature_selection__k=1, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=1, preparation__num__

[CV]  feature_selection__k=4, preparation__num__imputer__strategy=median, total=  10.8s
[CV] feature_selection__k=4, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=4, preparation__num__imputer__strategy=median, total=  11.2s
[CV] feature_selection__k=4, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=4, preparation__num__imputer__strategy=median, total=  11.0s
[CV] feature_selection__k=4, preparation__num__imputer__strategy=most_frequent 
[CV]  feature_selection__k=4, preparation__num__imputer__strategy=most_frequent, total=   9.8s
[CV] feature_selection__k=4, preparation__num__imputer__strategy=most_frequent 
[CV]  feature_selection__k=4, preparation__num__imputer__strategy=most_frequent, total=  12.5s
[CV] feature_selection__k=4, preparation__num__imputer__strategy=most_frequent 
[CV]  feature_selection__k=4, preparation__num__imputer__strategy=most_frequent, total=  11.3s
[CV] feature_selection__k=4, preparation__num__imputer__strategy=

[CV]  feature_selection__k=7, preparation__num__imputer__strategy=most_frequent, total=  14.2s
[CV] feature_selection__k=7, preparation__num__imputer__strategy=most_frequent 
[CV]  feature_selection__k=7, preparation__num__imputer__strategy=most_frequent, total=  12.7s
[CV] feature_selection__k=7, preparation__num__imputer__strategy=most_frequent 
[CV]  feature_selection__k=7, preparation__num__imputer__strategy=most_frequent, total=  12.4s
[CV] feature_selection__k=8, preparation__num__imputer__strategy=mean 
[CV]  feature_selection__k=8, preparation__num__imputer__strategy=mean, total=  12.2s
[CV] feature_selection__k=8, preparation__num__imputer__strategy=mean 
[CV]  feature_selection__k=8, preparation__num__imputer__strategy=mean, total=  11.0s
[CV] feature_selection__k=8, preparation__num__imputer__strategy=mean 
[CV]  feature_selection__k=8, preparation__num__imputer__strategy=mean, total=  11.7s
[CV] feature_selection__k=8, preparation__num__imputer__strategy=mean 
[CV]  feature

[CV]  feature_selection__k=11, preparation__num__imputer__strategy=mean, total=  19.0s
[CV] feature_selection__k=11, preparation__num__imputer__strategy=mean 
[CV]  feature_selection__k=11, preparation__num__imputer__strategy=mean, total=  18.1s
[CV] feature_selection__k=11, preparation__num__imputer__strategy=mean 
[CV]  feature_selection__k=11, preparation__num__imputer__strategy=mean, total=  17.0s
[CV] feature_selection__k=11, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=11, preparation__num__imputer__strategy=median, total=  16.8s
[CV] feature_selection__k=11, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=11, preparation__num__imputer__strategy=median, total=  19.9s
[CV] feature_selection__k=11, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=11, preparation__num__imputer__strategy=median, total=  19.9s
[CV] feature_selection__k=11, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=11, p

[CV]  feature_selection__k=14, preparation__num__imputer__strategy=median, total=  23.9s
[CV] feature_selection__k=14, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=14, preparation__num__imputer__strategy=median, total=  21.8s
[CV] feature_selection__k=14, preparation__num__imputer__strategy=median 
[CV]  feature_selection__k=14, preparation__num__imputer__strategy=median, total=  20.9s
[CV] feature_selection__k=14, preparation__num__imputer__strategy=most_frequent 
[CV]  feature_selection__k=14, preparation__num__imputer__strategy=most_frequent, total=  21.3s
[CV] feature_selection__k=14, preparation__num__imputer__strategy=most_frequent 
[CV]  feature_selection__k=14, preparation__num__imputer__strategy=most_frequent, total=  20.0s
[CV] feature_selection__k=14, preparation__num__imputer__strategy=most_frequent 
[CV]  feature_selection__k=14, preparation__num__imputer__strategy=most_frequent, total=  25.1s
[CV] feature_selection__k=14, preparation__num__impute

[CV]  feature_selection__k=17, preparation__num__imputer__strategy=most_frequent, total=  21.4s
[CV] feature_selection__k=17, preparation__num__imputer__strategy=most_frequent 
[CV]  feature_selection__k=17, preparation__num__imputer__strategy=most_frequent, total=  20.4s
[CV] feature_selection__k=17, preparation__num__imputer__strategy=most_frequent 
[CV]  feature_selection__k=17, preparation__num__imputer__strategy=most_frequent, total=  22.7s


[Parallel(n_jobs=1)]: Done 255 out of 255 | elapsed: 68.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preparation',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                     

In [149]:
# Display the k / imputer-strategy combination the preparation search ranked best.
grid_search_prep.best_params_

{'feature_selection__k': 11,
 'preparation__num__imputer__strategy': 'most_frequent'}

In [None]:
# The best imputer strategy is most_frequent, and apparently most of the features are useful (11 out of 17).