In [36]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing();

X = housing.data
y = housing.target


The data is already cleaned and ready for analysis, so let's split it into a train and test set.

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)


In [38]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ("scaler", StandardScaler())
])

X_train = pipeline.fit_transform(X_train)


In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

lin_pred = lin_reg.predict(X_test)

lin_rmse = mean_squared_error(y_test, lin_pred, squared=False)
print(lin_rmse)
print(r2_score(y_test, lin_pred))


73.9074970979364
-4176.427296584963


In this dataset, the targets represent hundreds of thousands of dollars, so this model gives us errors close to $73,900, which is pretty bad. Let's see if we can do better with other regression models.

In [68]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(random_state=17)

ridge_reg.fit(X_train, y_train)
print(ridge_reg.get_params())

ridge_pred = ridge_reg.predict(X_test)

ridge_rmse = mean_squared_error(y_test, ridge_pred, squared=False)
print(ridge_rmse)
print(r2_score(y_test, ridge_pred))


{'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 17, 'solver': 'auto', 'tol': 0.0001}
73.88327977044385
-4173.690103355649


In [69]:
from sklearn.model_selection import GridSearchCV

params = {
    "alpha": [0, 100],
    "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
}

ridge_grid_search = GridSearchCV(ridge_reg, params, cv=10)
ridge_grid_search.fit(X_train, y_train)

ridge_grid_search_pred = ridge_grid_search.best_estimator_.predict(X_test)
print(ridge_grid_search.best_params_)

ridge_opt_rmse = mean_squared_error(y_test, ridge_pred, squared=False)
print(ridge_opt_rmse)
print(r2_score(y_test, ridge_grid_search_pred))


{'alpha': 0, 'solver': 'svd'}
73.88327977044385
-4176.4272965849


Using grid search doesn't seem to lead to any improvement in RMSE and R2 score of the model. 

Let's try using Random Forest Regression.

In [75]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(random_state=17)

rf_reg.fit(X_train, y_train)

model_params = rf_reg.get_params()
print(model_params)

rf_pred = rf_reg.predict(X_test)

rf_rmse = mean_squared_error(y_test, rf_pred, squared=False)
print(rf_rmse)
print(r2_score(y_test, rf_pred))


{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 17, 'verbose': 0, 'warm_start': False}
1.729568972940536
-1.2877453527086837


Random Forest Regression seems to be doing a lot better, even without tuning the hyperparameters. Using grid search to tune the params is going to be pretty slow so we'll try randomized search.

In [76]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

params = {
    "n_estimators": np.arange(100, 400),
    "criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"]
}

rf_rndm_search = RandomizedSearchCV(
    rf_reg, params, n_iter=100, cv=3, random_state=17)
rf_rndm_search.fit(X_train[:1000], y_train[:1000])

rf_rndm_search.best_params_


{'n_estimators': 261, 'criterion': 'poisson'}

In [78]:
rf_grid_search_pred = rf_rndm_search.best_estimator_.predict(X_test)

rf_grid_search_rmse = mean_squared_error(
    y_test, rf_grid_search_pred, squared=False)
print(rf_grid_search_rmse)
print(r2_score(y_test, rf_grid_search_pred))


1.9362738737556968
-1.8672494711209158


Weird, but with the best params from randomized search, the model seems to be doing worse. TODO investigate this. 

In [79]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(
    criterion="poisson", n_estimators=261, random_state=17)

rf_reg.fit(X_train, y_train)

model_params = rf_reg.get_params()
print(model_params)

rf_pred = rf_reg.predict(X_test)

rf_rmse = mean_squared_error(y_test, rf_pred, squared=False)
print(rf_rmse)
print(r2_score(y_test, rf_pred))


{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'poisson', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 261, 'n_jobs': None, 'oob_score': False, 'random_state': 17, 'verbose': 0, 'warm_start': False}
1.6193545149778867
-1.005468215170659


Initializing the model with the best params from randomized search seems to lead to better results, possibly because randomized search was fitted on only 1000 instances (it's too slow to fit on all). A RMSE of 1.619 means that the model gives us an error of about $1,600, which is pretty decent.