In [1]:
import os
import urllib
import tarfile

download_root = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
housing_path = "datasets/housing"
housing_url = download_root + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=housing_url, housing_path=housing_path):
    if not os.path.exists(housing_path):
        os.makedirs(housing_path)
    tgz_path = housing_path + "/housing.tgz"
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

import pandas as pd

def load_housing_data(housing_path=housing_path):
    csv_path = (housing_path + "/housing.csv")
    return pd.read_csv(csv_path)

fetch_housing_data()
housing = load_housing_data()

In [2]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

def add_features(housing):
    housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
    housing["bedrooms_per_household"] = housing["total_bedrooms"] / housing["households"]
    housing["population_per_household"] = housing["population"] / housing["households"]
    housing["rooms_per_population"] = housing["total_rooms"] / housing["population"]
    housing["bedrooms_per_population"] = housing["total_bedrooms"] /  housing["population"]
    imputer = SimpleImputer(strategy="median")
    housing_values = imputer.fit_transform(housing)
    scaler = StandardScaler()
    housing_values = scaler.fit_transform(housing_values)
    return housing_values, list(housing)

from sklearn.preprocessing import OneHotEncoder

labels = housing["median_house_value"].copy()
housing_x = housing.drop("median_house_value", axis=1)

encoder = OneHotEncoder()
ocean = encoder.fit_transform(housing_x["ocean_proximity"].values.reshape(-1, 1)).toarray()

housing_add, housing_labels = add_features(housing_x.drop("ocean_proximity", axis=1))
housing_add = np.concatenate([housing_add, ocean], axis=1)
housing_labels = housing_labels + list(encoder.categories_[0])
housing_df = pd.DataFrame(housing_add, columns=housing_labels)
housing_labels = list(housing_df)

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(housing_df.values, labels.values, random_state=42)

lr = LinearRegression()
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

0.6583335977694541

In [5]:
sorted( zip(lr.coef_, housing_labels), reverse=True)

[(112653.24509406579, 'ISLAND'),
 (75387.6384727927, 'median_income'),
 (28587.313789385087, 'bedrooms_per_population'),
 (27752.618945581547, 'total_bedrooms'),
 (13690.303298378862, 'housing_median_age'),
 (7956.829778418706, 'households'),
 (6676.458763430659, 'rooms_per_population'),
 (110.4286767786485, 'population_per_household'),
 (-8135.917235171384, 'rooms_per_household'),
 (-9840.690156764816, 'total_rooms'),
 (-16742.2432337044, '<1H OCEAN'),
 (-16818.122480701433, 'NEAR OCEAN'),
 (-17552.841833781833, 'bedrooms_per_household'),
 (-20349.31172493932, 'population'),
 (-25585.987698252546, 'NEAR BAY'),
 (-53506.89168140737, 'INLAND'),
 (-58120.87208603855, 'longitude'),
 (-59528.08372465711, 'latitude')]

In [6]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor() 
dtr.fit(x_train, y_train)
y_pred = dtr.predict(x_test)

from sklearn.metrics import mean_squared_error
dtr_mse = mean_squared_error(y_pred, y_test)
dtr_rmse = np.sqrt(dtr_mse)
dtr_rmse

70960.50962611982

In [119]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
    {"n_estimators" : [3, 10, 30], "max_features": [2, 4, 6, 8]}
]
rfr = RandomForestRegressor()
grid_search = GridSearchCV(rfr, param_grid, cv=5, scoring="neg_mean_squared_error")
grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]}],
             scoring='neg_mean_squared_error')

In [156]:
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30)

In [155]:
feature_importances = grid_search.best_estimator_.feature_importances_
sorted(zip(feature_importances, housing_labels), reverse=True)

[(0.328701377454053, 'median_income'),
 (0.15094264868804713, 'INLAND'),
 (0.10086877606696641, 'rooms_per_population'),
 (0.06928727329327233, 'latitude'),
 (0.0665494282545806, 'longitude'),
 (0.05563218248746427, 'population_per_household'),
 (0.05075177423326383, 'rooms_per_household'),
 (0.04438184434088377, 'bedrooms_per_population'),
 (0.03603390004977899, 'housing_median_age'),
 (0.018958285948472996, 'bedrooms_per_household'),
 (0.015436284619434207, '<1H OCEAN'),
 (0.014424621944959078, 'total_rooms'),
 (0.013994682185648616, 'total_bedrooms'),
 (0.013581846172000766, 'households'),
 (0.01343532155826218, 'population'),
 (0.0049092774474614465, 'NEAR OCEAN'),
 (0.0019788905829323275, 'NEAR BAY'),
 (0.0001315846725182168, 'ISLAND')]

In [157]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

best_mse = mean_squared_error(y_pred, y_test)
best_rmse = np.sqrt(best_mse)
best_rmse

50040.06321211937

In [None]:
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV

# svm = SVC()
# param_grid = [
#     {"kernel" : "linear", "C" : [0.01, 0.1, 1.0, 10.0, 100.0]},
#     {"kernel" : "rbf", "C" : [0.01, 0.1, 1.0, 10.0, 100.0], "gamma" : [0.01, 0.1, 1.0, 10.0, 100.0]},
# ]

# grid_search = GridSearchCV(svm, param_grid, cv=5, scoring="neg_mean_squared_error")
# grid_search.fit(x_train, y_train)