# Exercises: Chapter 2

# 0. Load and process data

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import RobustScaler, OneHotEncoder

In [25]:
def load_housing_data(housing_path='datasets/housing/housing.csv'):
    """Read the housing dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    housing_path : str or path-like, optional
        Location of the CSV file. Defaults to the path this notebook has
        always used, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(housing_path)

In [17]:
housing = load_housing_data()

# Bucket median income into five categories; the buckets are only used to
# stratify the train/test split below.
income_cat = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

# NOTE(review): the regression target here is median_income, and
# median_house_value stays in X as a feature. Confirm this is intended --
# if the exercise is meant to predict house values, the target column
# should be 'median_house_value' instead.
y = housing["median_income"].values
X = housing.drop(columns='median_income')

# 80/20 split with a fixed seed, stratified on the income buckets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=income_cat,
)

In [12]:
# Summarise the training features: per-column dtype and non-null count.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 9 columns):
longitude             16512 non-null float64
latitude              16512 non-null float64
housing_median_age    16512 non-null float64
total_rooms           16512 non-null float64
total_bedrooms        16354 non-null float64
population            16512 non-null float64
households            16512 non-null float64
median_house_value    16512 non-null float64
ocean_proximity       16512 non-null object
dtypes: float64(8), object(1)
memory usage: 1.3+ MB


In [22]:
class TypeSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the DataFrame columns of a given dtype.

    Parameters
    ----------
    dtype : object
        Forwarded unchanged as the ``include`` argument of
        ``DataFrame.select_dtypes``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer has no fitted state."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose dtype matches ``self.dtype``.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        # Explicit check instead of `assert`: assertions are removed when
        # Python runs with -O, which would let a non-DataFrame slip through
        # and fail later with a less helpful error.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "TypeSelector expects a pandas DataFrame, got %s" % type(X).__name__
            )
        return X.select_dtypes(include=self.dtype)

In [None]:
# Positional indices of the columns that CombinedAttributesAdder divides
# to build its ratio features.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns.

    The columns are located by the module-level positions ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a third column (bedrooms / rooms) is appended in addition
        to rooms-per-household and population-per-household.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return ``X`` as an array with the ratio columns appended on the right.

        ``X`` may be a 2-D array or a DataFrame; either way the result is a
        NumPy array.
        """
        # The positional slicing below (X[:, i]) is ndarray syntax. In
        # pipeline_num this step receives the DataFrame returned by
        # TypeSelector, where that syntax fails, so convert first. For input
        # that is already an ndarray this is a no-op.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [28]:
# Numeric branch: keep the float columns, append the ratio features, fill
# missing values with the column median, then scale.
pipeline_num = Pipeline(steps=[
    ('selector', TypeSelector('float')),
    ('attr_adder', CombinedAttributesAdder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Categorical branch: keep the object columns, fill missing values with the
# most frequent category, then one-hot encode.
pipeline_cat = Pipeline(steps=[
    ('selector', TypeSelector('object')),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

# Apply both branches to the same input and concatenate their outputs.
pipeline_X = FeatureUnion(transformer_list=[
    ('numeric', pipeline_num),
    ('categorical', pipeline_cat),
])

In [29]:
# Fit the preprocessing pipeline on the training features only, then reuse
# the fitted pipeline to transform the test features.
X_train_processed = pipeline_X.fit_transform(X_train)
X_test_processed = pipeline_X.transform(X_test)

## Exercise 1

Try a Support Vector Machine regressor (`sklearn.svm.SVR`) with various hyperparameters, such as `kernel="linear"` (with various values for the `C` hyperparameter) or `kernel="rbf"` (with various values for the `C` and `gamma` hyperparameters). Don’t worry about what these hyperparameters mean for now. How does the best SVR predictor perform?

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

In [36]:
# Two sub-grids: a linear kernel searched over C only, and an RBF kernel
# searched over every (C, gamma) combination.
param_grid = [
    dict(
        kernel=['linear'],
        C=[0.1, 1.0, 10., 50.],
    ),
    dict(
        kernel=['rbf'],
        C=[0.1, 1.0, 10., 50.],
        gamma=[0.01, 0.1, 1.0, 10],
    ),
]

In [None]:
from multiprocessing import cpu_count

# Use all cores but one. Clamp to at least one worker: on a single-core
# machine cpu_count() - 1 is 0, which is not a valid n_jobs value.
n_jobs = max(1, cpu_count() - 1)

In [None]:
# 5-fold cross-validated search over param_grid, scored by negative mean
# squared error; verbose=2 prints a line per fit.
sv_reg = SVR()
grid_search = GridSearchCV(
    estimator=sv_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=n_jobs,
    verbose=2,
)
grid_search.fit(X_train_processed, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................. C=0.1, kernel=linear, total=   7.0s
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s remaining:    0.0s


[CV] ............................. C=0.1, kernel=linear, total=   6.9s
[CV] C=0.1, kernel=linear ............................................
[CV] ............................. C=0.1, kernel=linear, total=   6.9s
[CV] C=0.1, kernel=linear ............................................
[CV] ............................. C=0.1, kernel=linear, total=   6.7s
[CV] C=0.1, kernel=linear ............................................
[CV] ............................. C=0.1, kernel=linear, total=   6.6s
[CV] C=1.0, kernel=linear ............................................
[CV] ............................. C=1.0, kernel=linear, total=  12.3s
[CV] C=1.0, kernel=linear ............................................
[CV] ............................. C=1.0, kernel=linear, total=  12.1s
[CV] C=1.0, kernel=linear ............................................
[CV] ............................. C=1.0, kernel=linear, total=  13.6s
[CV] C=1.0, kernel=linear ............................................
[CV] .