# Exercises of chapter 2 


## 1. Try a regressor of SVM (Support Vector Machine) with a grid search to find the best hyperparameters.

In [5]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from pandas import DataFrame

In [3]:
ROOT: Path = Path.cwd()
DATA: Path = ROOT.parent.parent / 'datasets' / 'housing'

In [6]:
df: DataFrame = pd.read_csv(DATA / 'housing.csv')

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

X: DataFrame = df.drop('median_house_value', axis=1)
y: DataFrame = df['median_house_value']

In [10]:
from sklearn.preprocessing import OneHotEncoder

scaler = OneHotEncoder()
X = scaler.fit_transform(X)

In [11]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid = [
    {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
    {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
     'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]}
]

svr = SVR()

grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)

grid_search.fit(X, y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END ..............................C=10.0, kernel=linear; total time=  37.6s
[CV] END ..............................C=10.0, kernel=linear; total time=  37.7s
[CV] END ..............................C=10.0, kernel=linear; total time=  37.5s
[CV] END ..............................C=10.0, kernel=linear; total time=  38.0s
[CV] END ..............................C=10.0, kernel=linear; total time=  39.8s
[CV] END ..............................C=30.0, kernel=linear; total time=  39.7s
[CV] END ..............................C=30.0, kernel=linear; total time=  40.7s
[CV] END ..............................C=30.0, kernel=linear; total time=  40.9s
[CV] END ..............................C=30.0, kernel=linear; total time=  47.1s
[CV] END ..............................C=30.0, kernel=linear; total time=  47.3s
[CV] END .............................C=100.0, kernel=linear; total time=  46.7s
[CV] END .............................C=100.0, 

## 2. Try to replace `GridSearchCV` with `RandomizedSearchCV`.

- I've done this during the **Chapter 2**, which can be found [here](https://github.com/pcmoraesmenezes/Inteligencia-Artificial/blob/main/Livros/Hands-On%20Machine%20Learning%20with%20Scikit-Learn%2C%20Keras%2C%20and%20TensorFlow/codes/house_pricing/housing_price.ipynb).

## 3. Try to add a transformer in the preparation pipeline to select only the most important attributes.

In [11]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [12]:
housing_num = df.drop('ocean_proximity', axis=1)


In [13]:
rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(df.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

from sklearn.preprocessing import FunctionTransformer

def add_extra_features(X, add_bedrooms_per_room=True):
    rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
    population_per_household = X[:, population_ix] / X[:, household_ix]
    if add_bedrooms_per_room:
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]
    else:
        return np.c_[X, rooms_per_household, population_per_household]

attr_adder = FunctionTransformer(add_extra_features, validate=False,
                                 kw_args={"add_bedrooms_per_room": False})

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
        ('std_scaler', StandardScaler()),
    ])

In [15]:
from sklearn.compose import ColumnTransformer


num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])


In [16]:
housing_prepared = full_pipeline.fit_transform(df)

housing_labels = df['median_house_value'].copy()

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]

In [19]:
k = 5

In [21]:
feature_importances = grid_search.best_estimator_.feature_importances_

In [22]:
top_k_feature_indices = indices_of_top_k(feature_importances, k)
top_k_feature_indices

array([ 7,  8, 10, 11, 13])

In [24]:
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]

In [26]:
sorted(zip(feature_importances, attributes), reverse=True)[:k]


[(np.float64(0.00831913958240636), 'median_income'),
 (np.float64(0.0029322859021517576), 'median_house_value'),
 (np.float64(0.0017771995435055569), 'total_rooms'),
 (np.float64(0.0010406180697137978), 'housing_median_age')]

In [27]:
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

In [29]:
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(df)


In [30]:
housing_prepared_top_k_features[0:3]


array([[ 2.34476576,  2.12963148, -0.04959654, -1.02998783,  0.        ],
       [ 2.33223796,  1.31415614, -0.09251223, -0.8888972 ,  0.        ],
       [ 1.7826994 ,  1.25869341, -0.02584253, -1.29168566,  0.        ]])

In [31]:
housing_prepared[0:3, top_k_feature_indices]


array([[ 2.34476576,  2.12963148, -0.04959654, -1.02998783,  0.        ],
       [ 2.33223796,  1.31415614, -0.09251223, -0.8888972 ,  0.        ],
       [ 1.7826994 ,  1.25869341, -0.02584253, -1.29168566,  0.        ]])