In [10]:
import pandas as pd

import numpy as np

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline 

In [12]:
# Load the raw housing table from the repository-relative CSV file.
housing = pd.read_csv(filepath_or_buffer='datasets/housing/housing.csv')

__Creating New Features__

In [13]:
# Derive per-household and per-room ratio columns directly on the full table.
housing["rooms_per_household"] = housing["total_rooms"].div(housing["households"])
housing["bedrooms_per_room"] = housing["total_bedrooms"].div(housing["total_rooms"])
housing["population_per_household"] = housing["population"].div(housing["households"])

In [30]:
# Bucket median income into bands of width 1.5, rounding each ratio up.
housing["income_cat"] = np.ceil(housing["median_income"].div(1.5))


__Creating a Train-Test Split__

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(
    housing,
    random_state=42,
    test_size=0.2,
)

In [52]:
## separate labels and data

# The target is copied out first; `housing` is then rebound to the training
# predictors (every training column except the target).
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns=["median_house_value"])

__Cleaning Data__

In [53]:
## note that there are some missing values

In [54]:
from sklearn.impute import SimpleImputer

In [55]:
# Imputer configured with the "median" strategy for filling missing values.
imputer = SimpleImputer(strategy="median")

In [56]:
# Keep only the numeric columns for the median imputer. Selecting by numeric
# dtype (instead of "dtype is not object") also leaves out text columns that
# pandas stores with a dedicated string dtype, which would otherwise slip
# through and make the median computation fail.
housing_num = housing.select_dtypes(include=[np.number])

In [63]:
# Fit the imputer on the numeric columns and fill their missing values.
temp_num_data = imputer.fit_transform(housing_num)

# Wrap the resulting array back into a DataFrame. The original row index is
# passed explicitly: without it the frame would get a fresh 0..n-1 index and
# no longer line up with `housing` / `housing_labels`, which keep the row
# labels of the training split.
housing_tr = pd.DataFrame(temp_num_data,
                          columns=housing_num.columns,
                          index=housing_num.index)

__Handling Categorical Variables__

In [71]:
from sklearn.preprocessing import LabelBinarizer

# The categorical column to encode. It has to be defined in this cell so the
# notebook runs top to bottom on a fresh kernel.
housing_cat = housing["ocean_proximity"]

# One indicator column per distinct category value.
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       ...,
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

__Custom Transformers__

In [74]:
from sklearn.base import BaseEstimator, TransformerMixin

# Look the column positions up by name rather than hard-coding 3, 4, 5, 6.
_column_order = housing.columns.tolist()
rooms_ix = _column_order.index("total_rooms")
bedrooms_ix = _column_order.index("total_bedrooms")
population_ix = _column_order.index("population")
household_ix = _column_order.index("households")

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns to a 2-D array of housing attributes.

    Column positions come from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix`` indices.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a bedrooms-per-room column is appended after the
        rooms-per-household and population-per-household columns.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # Indexing np.c_ with a tuple is the same as writing the items inline.
        return np.c_[(X, *extra_columns)]

# Try the transformer on the raw values, leaving out the bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(X=housing.values)

__Scaling Data__

Using pipelines

In [75]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numeric preprocessing chain: impute, append ratio columns, then standardise.
# NOTE(review): `housing_num` already contains the ratio columns added to
# `housing` earlier in the notebook, so `attribs_adder` appends them a second
# time. Confirm the duplication is intended.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)


In [77]:
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20

In [78]:
# OneHotEncoder is required by the "cat" step below; import it here so the
# cell does not fail with a NameError on a fresh kernel.
from sklearn.preprocessing import OneHotEncoder

# Column names routed to each branch of the transformer.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric columns go through `num_pipeline`; the text column is one-hot encoded.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

__Modeling__




In [85]:
from sklearn.linear_model import LinearRegression

# Baseline model: a linear regression fitted on the prepared training matrix.
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)

LinearRegression()

In [92]:
np.set_printoptions(precision = 2)

# Sanity check: push the first five training rows through the fitted
# preprocessing pipeline and compare predictions with the true labels.
some_data = housing.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

print("Labels:", some_labels.tolist())

Predictions: [180167.5  290865.76 246213.67 145502.72 162680.87]
Labels: [103000.0, 382100.0, 172600.0, 93400.0, 96500.0]


In [93]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model measured on the training data itself.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

67578.39766252581

A typical prediction error is around $67,578.

Note that most of the median house prices are between 120K and 265K, so a typical error of this size is not satisfactory. As a baseline, however, it is not too bad.

In [94]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the same training-set predictions.
lin_mae = mean_absolute_error(y_true=housing_labels, y_pred=housing_predictions)
lin_mae

48778.691517267216

__Comparing different models and selecting the best model__

In [97]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with a fixed seed, fitted on the prepared training matrix.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X=housing_prepared, y=housing_labels)

DecisionTreeRegressor(random_state=42)

In [98]:
# Training-set RMSE of the tree (`housing_predictions` is overwritten here).
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

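Looks good, doesn't it? But can we rely on this? An error of exactly 0.0 on the training data suggests the tree has memorised it, so the model is evaluated with cross-validation below.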

__Comparing Models__

__Decision Trees__

In [115]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tree. The scorer yields negated MSE values,
# so the sign is flipped before taking the square root.
scores = cross_val_score(
    estimator=tree_reg,
    X=housing_prepared,
    y=housing_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
tree_rmse_scores = np.sqrt(-scores)



def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

# Summarise the cross-validated RMSE values of the decision tree.
display_scores(scores=tree_rmse_scores)



Scores: [65649.26 69577.45 68327.26 ... 68529.64 66207.99 70058.29]
Mean: 68698.46566854705
Standard deviation: 2428.357705019381


__Linear model__

In [114]:
# Same 10-fold evaluation for the linear model, for a like-for-like comparison.
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=housing_prepared,
    y=housing_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(scores=lin_rmse_scores)

Scores: [64961.75 70909.1  67135.46 ... 68513.71 72715.32 68940.29]
Mean: 67815.99873221978
Standard deviation: 2458.535123036681


__Random Forests__


In [110]:
from sklearn.ensemble import RandomForestRegressor

# Small random forest (10 trees) with a fixed seed.
forest_reg = RandomForestRegressor(random_state=42, n_estimators=10)
forest_reg.fit(X=housing_prepared, y=housing_labels)

RandomForestRegressor(n_estimators=10, random_state=42)

In [113]:
# Training-set RMSE of the forest (`housing_predictions` is overwritten again).
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse



116.65908456639868

In [116]:
# 10-fold cross-validated RMSE for the random forest.
forest_scores = cross_val_score(
    estimator=forest_reg,
    X=housing_prepared,
    y=housing_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(scores=forest_rmse_scores)

Scores: [50669.68 54736.76 52673.92 ... 52971.9  52651.74 52345.72]
Mean: 52457.49864226979
Standard deviation: 2059.503440199788


__Fine Tuning the Parameters__

In [117]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids are searched:
#   * 12 (3×4) combinations of n_estimators and max_features, and
#   * 6 (2×3) combinations with bootstrap set to False.
param_grid = [
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

forest_reg = RandomForestRegressor(random_state=42)
# With 5 folds this amounts to (12+6)*5=90 rounds of training.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [119]:


# Print the RMSE (root of the negated mean test score) next to each
# hyperparameter combination that was evaluated.
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(np.sqrt(-mean_score), params)


64645.30112167564 {'max_features': 2, 'n_estimators': 3}
56571.03378876097 {'max_features': 2, 'n_estimators': 10}
53961.26240413359 {'max_features': 2, 'n_estimators': 30}
61407.243628813136 {'max_features': 4, 'n_estimators': 3}
54152.35878744126 {'max_features': 4, 'n_estimators': 10}
52029.67227186067 {'max_features': 4, 'n_estimators': 30}
60666.28232528508 {'max_features': 6, 'n_estimators': 3}
53633.81990330956 {'max_features': 6, 'n_estimators': 10}
52020.20786273818 {'max_features': 6, 'n_estimators': 30}
60736.117400961964 {'max_features': 8, 'n_estimators': 3}
53916.23627970669 {'max_features': 8, 'n_estimators': 10}
51900.83550768545 {'max_features': 8, 'n_estimators': 30}
63955.19909415244 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
55913.75351805432 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60992.699256848486 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
53743.088723763634 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}

__Final Model__

In [122]:
# Evaluate the best estimator found by the grid search on the held-out test set.
final_model = grid_search.best_estimator_

y_test = test_set["median_house_value"].copy()
X_test = test_set.drop(columns=["median_house_value"])

# Only `transform` is called here: the pipeline was already fitted on the
# training data and is reused as-is.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

In [123]:
# Display the test-set RMSE computed in the previous cell.
final_rmse

51363.09361430886

__Residual Plots__