In [1]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, cross_val_score
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# from get_data import *

In [2]:
import os
import tarfile
import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path="./"):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [3]:
fetch_housing_data()
housing = pd.read_csv("./housing.csv")

## Data Enrichment

In [5]:
def full_pipeline(housing):
    # data enhancing
    housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
    housing["bedrooms_to_rooms_ratio"] = housing["total_bedrooms"] / housing["total_rooms"]
    housing["population_per_household"] = housing["population"] / housing["households"]
    housing["income_category"] = pd.cut(housing["median_income"],
                                        bins=[0., 1.5, 3., 4.5, 6., np.inf],
                                        labels=[1,2,3,4,5])

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing["income_category"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]

    train_data, train_labels = create_data_and_labels(strat_train_set)
    test_data, test_labels = create_data_and_labels(strat_test_set)
        
    return train_data, train_labels, test_data, test_labels

def create_data_and_labels(data_set):
    housing_train = data_set.drop("median_house_value", axis=1)
    housing_labels = data_set["median_house_value"].copy()

    imputer = SimpleImputer(strategy="median")
    housing_num = housing_train.drop("ocean_proximity", axis=1)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
     ])

    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    housing_prepared = full_pipeline.fit_transform(housing_train)
    return housing_prepared, housing_labels

In [6]:
train_data, train_labels, test_data, test_labels = full_pipeline(housing)

# Models

## Parametres and models

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold

In [11]:
CV = 10
N_REPEATS = 3

RIDGE_ALPHAS = (list(np.linspace(0.1, 1.0, num=5)) + [10] + list(np.linspace(2.0, 100.0, num=5)))
RIDGE_ALPHAS.sort()

MAX_ITER = 10000

L1_RATIO = np.arange(0.1, 1.01, 0.02)

## RidgeCV

In [12]:
ridge_regressor = Ridge()
ridge_cv = RepeatedKFold(n_splits=CV, n_repeats=N_REPEATS, random_state=42)
# ridgecv_regressor.fit(housing_prepared, housing_labels)
ridge_grid = dict()
ridge_grid['alpha'] = RIDGE_ALPHAS
ridge_search = GridSearchCV(ridge_regressor, ridge_grid, scoring='neg_root_mean_squared_error', cv=ridge_cv, n_jobs=-1)
ridge_results = ridge_search.fit(train_data, train_labels)

In [13]:
ridge_model = ridge_results.best_estimator_

## LassoCV

In [14]:
lasso_regressor = Lasso(max_iter=MAX_ITER)
lasso_cv = RepeatedKFold(n_splits=CV, n_repeats=N_REPEATS, random_state=42)
# ridgecv_regressor.fit(housing_prepared, housing_labels)
lasso_grid = dict()
lasso_grid['alpha'] = RIDGE_ALPHAS
lasso_search = GridSearchCV(lasso_regressor, lasso_grid, scoring='neg_root_mean_squared_error', cv=lasso_cv, n_jobs=-1, verbose=3)
lasso_results = lasso_search.fit(train_data, train_labels)

Fitting 30 folds for each of 11 candidates, totalling 330 fits


In [15]:
lasso_model = lasso_results.best_estimator_

## ElasticNetCV

In [None]:
en_regressor = ElasticNet(max_iter=MAX_ITER)
en_cv = RepeatedKFold(n_splits=CV, n_repeats=N_REPEATS, random_state=42)
# ridgecv_regressor.fit(housing_prepared, housing_labels)
en_grid = dict()
en_grid['alpha'] = RIDGE_ALPHAS
en_grid['l1_ratio'] = L1_RATIO
en_search = GridSearchCV(en_regressor, en_grid, scoring='neg_root_mean_squared_error', cv=en_cv, n_jobs=-1, verbose=3)
en_results = en_search.fit(train_data, train_labels)

In [17]:
enet_model = en_results.best_estimator_

---
# Testing on test set

In [18]:
ridge_train_var = ridge_model.predict(test_data)
print("RMSE:", np.sqrt(((ridge_train_var - test_labels)**2).mean()))

RMSE: 66798.37872583515


### Lasso

In [19]:
lasso_train_var = lasso_model.predict(test_data)
print("RMSE:", np.sqrt(((lasso_train_var - test_labels)**2).mean()))

RMSE: 66781.0087126069


### ElasticNet

In [20]:
enet_train_var = enet_model.predict(test_data)
print("RMSE:", np.sqrt(((enet_train_var - test_labels)**2).mean()))

RMSE: 66807.11030167514
