In [90]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [91]:
import os
import tarfile
import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [92]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [93]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Separando Independent e Dependent Variables

In [94]:
y = housing['median_house_value']
X = housing.drop('median_house_value',axis=1)

In [95]:
display(X.head())
display(y.head())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY


0    452600.0
1    358500.0
2    352100.0
3    341300.0
4    342200.0
Name: median_house_value, dtype: float64

### Separando variaveis numéricas e categóricas

In [96]:
numeric_cols = list(X.columns[X.dtypes != object])
cat_cols = list(X.columns[X.dtypes == object])

In [97]:
X.loc[:5, numeric_cols]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368


#### Checando NULLS

In [98]:
X.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [99]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.4+ MB


#### Apesar de só existir um campo NULL é uma boa estratégia se preparar para futuros casos. Vamos adotar mediana para valores númericos e moda para categóricos

In [100]:
imputer = SimpleImputer(strategy='most_frequent') 
imputer.fit_transform(X[cat_cols])

array([['NEAR BAY'],
       ['NEAR BAY'],
       ['NEAR BAY'],
       ...,
       ['INLAND'],
       ['INLAND'],
       ['INLAND']], dtype=object)

In [101]:
imputer = SimpleImputer(strategy='median') 
imputer.fit_transform(X[numeric_cols])

array([[-1.2223e+02,  3.7880e+01,  4.1000e+01, ...,  3.2200e+02,
         1.2600e+02,  8.3252e+00],
       [-1.2222e+02,  3.7860e+01,  2.1000e+01, ...,  2.4010e+03,
         1.1380e+03,  8.3014e+00],
       [-1.2224e+02,  3.7850e+01,  5.2000e+01, ...,  4.9600e+02,
         1.7700e+02,  7.2574e+00],
       ...,
       [-1.2122e+02,  3.9430e+01,  1.7000e+01, ...,  1.0070e+03,
         4.3300e+02,  1.7000e+00],
       [-1.2132e+02,  3.9430e+01,  1.8000e+01, ...,  7.4100e+02,
         3.4900e+02,  1.8672e+00],
       [-1.2124e+02,  3.9370e+01,  1.6000e+01, ...,  1.3870e+03,
         5.3000e+02,  2.3886e+00]])

### Transformando variáveis categóricas em dummy

In [102]:
one = OneHotEncoder(sparse=False,drop='first')
one.fit_transform(housing[['ocean_proximity']])

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]])

In [103]:
one.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [104]:
from sklearn.base import BaseEstimator, TransformerMixin 

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6 

class CombinedAttributesAdder( BaseEstimator, TransformerMixin): 
    
    def __init__( self, add_bedrooms_per_room = True): # no *args or ** kargs 
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y = None): 
        return self # nothing else to do 
        
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        rooms_per_house = X[:,rooms_ix]/X[:,households_ix]
        pop_per_house = X[:,population_ix]/X[:,households_ix]
        
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:,bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X,rooms_per_house, pop_per_house, bedrooms_per_room]
        
        else:
            return np.c_[X, rooms_per_house, pop_per_house]
        


In [105]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room = True)

In [106]:
transf = attr_adder.transform(X)

In [107]:
pd.DataFrame(transf,columns = list(X.columns)+['rooms_per_house','pop_per_house', 'bedrooms_per_room'])

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_house,pop_per_house,bedrooms_per_room
0,-122.23,37.88,41,880,129,322,126,8.3252,NEAR BAY,6.98413,2.55556,0.146591
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,NEAR BAY,6.23814,2.10984,0.155797
2,-122.24,37.85,52,1467,190,496,177,7.2574,NEAR BAY,8.28814,2.80226,0.129516
3,-122.25,37.85,52,1274,235,558,219,5.6431,NEAR BAY,5.81735,2.54795,0.184458
4,-122.25,37.85,52,1627,280,565,259,3.8462,NEAR BAY,6.28185,2.18147,0.172096
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374,845,330,1.5603,INLAND,5.04545,2.56061,0.224625
20636,-121.21,39.49,18,697,150,356,114,2.5568,INLAND,6.11404,3.12281,0.215208
20637,-121.22,39.43,17,2254,485,1007,433,1.7,INLAND,5.20554,2.32564,0.215173
20638,-121.32,39.43,18,1860,409,741,349,1.8672,INLAND,5.32951,2.12321,0.219892


## Pipelines

In [108]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [109]:
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                        ('atribs_adder', CombinedAttributesAdder(True)),
                        ('scaler',StandardScaler())]
                        )

In [110]:
housing_num_data = num_pipeline.fit_transform(X[numeric_cols])


In [238]:
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                        ('onehot', OneHotEncoder(sparse=False#,drop='first'
                                                ))
               ])

In [239]:
housing_cols_data = cat_pipeline.fit_transform(X[cat_cols]) 

In [240]:
from sklearn.compose import ColumnTransformer

In [241]:
full_pipeline = ColumnTransformer([('num_pipeline', num_pipeline, numeric_cols),
                                    ('cat_pipeline', cat_pipeline, list(cat_cols))])

In [242]:
housing_transformed = full_pipeline.fit_transform(X)
pd.DataFrame(housing_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,0.628559,-0.049597,-1.029988,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.045890,1.357143,0.861439,1.669961,2.332238,0.327041,-0.092512,-0.888897,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.155620,-0.025843,-1.291686,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,0.156966,-0.050329,-0.449613,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,0.344711,-0.085616,-0.639087,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758826,1.801647,-0.289187,-0.444985,-0.388283,-0.512592,-0.443449,-1.216128,-0.155023,-0.049110,0.165994,0.0,1.0,0.0,0.0,0.0
20636,-0.818722,1.806329,-0.845393,-0.888704,-0.922403,-0.944405,-1.008420,-0.691593,0.276881,0.005021,0.021671,0.0,1.0,0.0,0.0,0.0
20637,-0.823713,1.778237,-0.924851,-0.174995,-0.123608,-0.369537,-0.174042,-1.142593,-0.090318,-0.071735,0.021134,0.0,1.0,0.0,0.0,0.0
20638,-0.873626,1.778237,-0.845393,-0.355600,-0.304827,-0.604429,-0.393753,-1.054583,-0.040211,-0.091225,0.093467,0.0,1.0,0.0,0.0,0.0


In [243]:
full_pipeline.named_transformers_

{'num_pipeline': Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                 ('atribs_adder', CombinedAttributesAdder()),
                 ('scaler', StandardScaler())]),
 'cat_pipeline': Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                 ('onehot', OneHotEncoder(sparse=False))])}

#### É possível navegar por todos os transformers usados, no exemplo abaixo quero saber os nomes das categorias do onehot

In [244]:
one_hot_cats = full_pipeline.transformers[1][1].steps[1][1].categories_
one_hot_cats

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [245]:
colunas = numeric_cols + ['rooms_per_house','pop_per_house', 'bedrooms_per_room'] + list(one_hot_cats[0])#[1:]
colunas

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_house',
 'pop_per_house',
 'bedrooms_per_room',
 '<1H OCEAN',
 'INLAND',
 'ISLAND',
 'NEAR BAY',
 'NEAR OCEAN']

In [246]:
housing_transformed = pd.DataFrame(data=housing_transformed, columns=colunas)
housing_transformed

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_house,pop_per_house,bedrooms_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,0.628559,-0.049597,-1.029988,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.045890,1.357143,0.861439,1.669961,2.332238,0.327041,-0.092512,-0.888897,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.155620,-0.025843,-1.291686,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,0.156966,-0.050329,-0.449613,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,0.344711,-0.085616,-0.639087,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758826,1.801647,-0.289187,-0.444985,-0.388283,-0.512592,-0.443449,-1.216128,-0.155023,-0.049110,0.165994,0.0,1.0,0.0,0.0,0.0
20636,-0.818722,1.806329,-0.845393,-0.888704,-0.922403,-0.944405,-1.008420,-0.691593,0.276881,0.005021,0.021671,0.0,1.0,0.0,0.0,0.0
20637,-0.823713,1.778237,-0.924851,-0.174995,-0.123608,-0.369537,-0.174042,-1.142593,-0.090318,-0.071735,0.021134,0.0,1.0,0.0,0.0,0.0
20638,-0.873626,1.778237,-0.845393,-0.355600,-0.304827,-0.604429,-0.393753,-1.054583,-0.040211,-0.091225,0.093467,0.0,1.0,0.0,0.0,0.0


#### Testando alguns modelos

In [247]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score, cross_validate

In [248]:
lin_reg = LinearRegression()

In [249]:
-cross_val_score(lin_reg, housing_transformed, y, scoring='neg_root_mean_squared_error')

array([73391.42036892, 74809.28332317, 75429.91837496, 76604.35506436,
       66196.72436926])

In [250]:
cross = cross_validate(lin_reg, housing_transformed, y,return_train_score=True, scoring='neg_root_mean_squared_error')

In [251]:
pd.DataFrame(cross).mean()[2:]

test_score    -73286.340300
train_score   -67823.668781
dtype: float64

In [252]:
lasso = Lasso(alpha=10)
cross = cross_validate(lasso, housing_transformed, y,return_train_score=True, scoring='neg_root_mean_squared_error')
pd.DataFrame(cross).mean()[2:].map(lambda x: np.sqrt(-x))

test_score     270.682376
train_score    260.433709
dtype: float64

In [253]:
from sklearn.ensemble import RandomForestRegressor

In [254]:
model = RandomForestRegressor(max_features=8, n_estimators=30)
cross = cross_validate(model, housing_transformed, y,return_train_score=True, scoring='neg_root_mean_squared_error')
pd.DataFrame(cross).mean()[2:].map(lambda x: np.sqrt(-x))

test_score     263.057197
train_score    136.443998
dtype: float64

In [255]:
model = RandomForestRegressor()
cross = cross_validate(model, housing_transformed, y,return_train_score=True, scoring='neg_root_mean_squared_error')
pd.DataFrame(cross).mean()[2:].map(lambda x: np.sqrt(-x))

test_score     262.605314
train_score    134.146206
dtype: float64

In [256]:
param_grid = [{'max_features':[2,4,6,8,10], 'n_estimators':[2,4,8,16,32]}
            ]

In [257]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import time

In [258]:
start = time.time()
grid = GridSearchCV(model,param_grid,scoring='neg_mean_squared_error',n_jobs=-1)
grid.fit(housing_transformed,y)
finish = time.time() - start
print(finish)

20.108659982681274


In [259]:
grid.best_params_

{'max_features': 4, 'n_estimators': 32}

In [260]:
np.sqrt(-grid.best_score_)

67359.18341885989

In [261]:
from scipy.stats import randint
param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }
grid = RandomizedSearchCV(model,param_distribs, n_jobs=-1, n_iter= 10, cv=5,scoring='neg_mean_squared_error', random_state=42)

In [262]:
start = time.time()
grid.fit(housing_transformed,y)
finish = time.time() - start
print(finish)

39.386104106903076


In [263]:
cvres = grid.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

67371.24761520077 {'max_features': 7, 'n_estimators': 180}
68742.4251329618 {'max_features': 5, 'n_estimators': 15}
68448.59120158403 {'max_features': 3, 'n_estimators': 72}
69740.53067174082 {'max_features': 5, 'n_estimators': 21}
68384.0753758452 {'max_features': 7, 'n_estimators': 122}
67756.91030034398 {'max_features': 3, 'n_estimators': 75}
68285.22590235058 {'max_features': 3, 'n_estimators': 88}
67309.19200137773 {'max_features': 5, 'n_estimators': 100}
67917.75963312679 {'max_features': 3, 'n_estimators': 150}
85378.7663464683 {'max_features': 5, 'n_estimators': 2}


In [264]:
grid2 = RandomizedSearchCV(model,param_distribs, n_jobs=-1, n_iter= 20, cv=5,scoring='neg_mean_squared_error', random_state=42)
start = time.time()
grid2.fit(housing_transformed,y)
finish = time.time() - start
print(finish)

90.7731204032898


In [265]:
cvres2 = grid2.cv_results_
for mean_score, params in zip(cvres2["mean_test_score"], cvres2["params"]):
    print(np.sqrt(-mean_score), params)

67966.79332683596 {'max_features': 7, 'n_estimators': 180}
69297.13881664663 {'max_features': 5, 'n_estimators': 15}
68199.46015032052 {'max_features': 3, 'n_estimators': 72}
68855.45698453875 {'max_features': 5, 'n_estimators': 21}
67567.36787903108 {'max_features': 7, 'n_estimators': 122}
68215.00818651945 {'max_features': 3, 'n_estimators': 75}
67858.05036793921 {'max_features': 3, 'n_estimators': 88}
67661.4579534278 {'max_features': 5, 'n_estimators': 100}
67128.48168536005 {'max_features': 3, 'n_estimators': 150}
82769.8323545056 {'max_features': 5, 'n_estimators': 2}
67339.91691928438 {'max_features': 4, 'n_estimators': 158}
67682.65950779714 {'max_features': 6, 'n_estimators': 130}
69454.95918914062 {'max_features': 4, 'n_estimators': 21}
70902.56346315535 {'max_features': 1, 'n_estimators': 58}
67602.67912193522 {'max_features': 6, 'n_estimators': 89}
71043.86866458226 {'max_features': 1, 'n_estimators': 59}
67434.93186994798 {'max_features': 7, 'n_estimators': 170}
67389.8736

In [266]:
grid2.best_params_

{'max_features': 3, 'n_estimators': 150}

In [267]:
np.sqrt(-grid2.best_score_)

67128.48168536005

#### Parece que o randomized encontrou os melhores parametros

In [268]:
importances = pd.DataFrame(grid2.best_estimator_.feature_importances_, index=colunas)
importances.sor

AttributeError: 'DataFrame' object has no attribute 'sor'

In [None]:
importances.sort_values(by=0, ascending=False)