Training and Evaluating on the Training Set

In [69]:
import numpy as np
import pandas as pd
from Housing_preprocessor import HousingPreprocessor , DataFrameSelector , CategoricalEncoder

In [70]:
housing_prepared = pd.read_csv('datasets/housing/housing_prepared.csv')
housing_labels = pd.read_csv('datasets/housing/housing_labels.csv')
housing  = pd.read_csv('datasets/housing/housing.csv')
start_test_set = pd.read_csv('datasets/housing/Housing_test_set.csv')

In [3]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared , housing_labels)

In [4]:
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels , housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mse

2398505953.6355186

In [5]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared , housing_labels)

In [13]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels , housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [14]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg , housing_prepared , housing_labels , scoring='neg_mean_squared_error', cv=10)
tree_rmse_scores  = np.sqrt(-scores)

In [15]:
def display_scores(scores):
 print("Scores:", scores)
 print("Mean:", scores.mean())
 print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [55467.96950575 49306.75017929 47687.27237389 46989.74540635
 50516.12941571 49756.55171536 50104.17263943 49492.95906913
 49109.14803119 53566.82529601]
Mean: 50199.752363211875
Standard deviation: 2418.6789072309803


In [16]:
lin_scores = cross_val_score(lin_reg , housing_prepared , housing_labels , scoring='neg_mean_squared_error' , cv = 10)
lin_rsme_scores = np.sqrt(-lin_scores)
display_scores(lin_rsme_scores)

Scores: [51211.94608604 46337.64760494 48043.17958949 49199.05716922
 47373.70951612 51766.42308772 50016.06782463 49248.75960872
 47472.88487006 50179.60179508]
Mean: 49084.92771520256
Standard deviation: 1672.709217705957


In [103]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared , housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

12983.563333192704

In [19]:
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [37581.44244113 34733.15835549 32589.27904782 35961.99835695
 33795.77718147 34954.38226578 36653.065404   34669.65949693
 33928.63536875 38722.02334529]
Mean: 35358.94212636232
Standard deviation: 1774.23039641408


In [20]:
import joblib
joblib.dump(lin_reg , 'pickled_models/linear_regression.pkl')
joblib.dump(tree_reg , 'pickled_models/tree_regression.pkl')
joblib.dump(forest_reg , 'pickled_models/forest_regression.pkl')

['pickled_models/forest_regression.pkl']

In [104]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30, 40, 50], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg , param_grid , cv=5 , scoring='neg_mean_squared_error' ,return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)

In [46]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 50}

In [47]:
grid_search.best_estimator_

In [48]:
cvres = grid_search.cv_results_
for mean_socre , params in zip(cvres['mean_test_score'] , cvres['params']):
    print(np.sqrt(-mean_socre) , params)

63211.263443375625 {'max_features': 2, 'n_estimators': 3}
55560.25525575558 {'max_features': 2, 'n_estimators': 10}
52386.601362879905 {'max_features': 2, 'n_estimators': 30}
52346.89788313447 {'max_features': 2, 'n_estimators': 40}
51945.03336820599 {'max_features': 2, 'n_estimators': 50}
59722.23599037403 {'max_features': 4, 'n_estimators': 3}
52270.37191427766 {'max_features': 4, 'n_estimators': 10}
50222.070476955894 {'max_features': 4, 'n_estimators': 30}
49938.36258909078 {'max_features': 4, 'n_estimators': 40}
49589.010324958974 {'max_features': 4, 'n_estimators': 50}
58750.38768696532 {'max_features': 6, 'n_estimators': 3}
52015.6897654888 {'max_features': 6, 'n_estimators': 10}
49942.69558263393 {'max_features': 6, 'n_estimators': 30}
49463.729230588535 {'max_features': 6, 'n_estimators': 40}
49416.995173403375 {'max_features': 6, 'n_estimators': 50}
58572.206237982704 {'max_features': 8, 'n_estimators': 3}
52072.95780348153 {'max_features': 8, 'n_estimators': 10}
50093.006785

In [49]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([7.20037954e-02, 6.51051721e-02, 4.27783441e-02, 1.84660437e-02,
       1.63030144e-02, 1.70276963e-02, 1.65053717e-02, 3.50845474e-01,
       6.74414699e-02, 1.06576762e-01, 6.46402616e-02, 1.59523361e-02,
       1.36030253e-01, 2.02337942e-04, 4.20881538e-03, 5.91285215e-03])

In [93]:
housing_prep = HousingPreprocessor()
preprocessor = housing_prep.fit_transform(housing)
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = housing_prep.full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = cat_encoder.get_feature_names_out(housing_prep.cat_attribs)
attributes = housing_prep.num_attribs + extra_attribs + list(cat_one_hot_attribs)
sorted(zip(feature_importances, attributes), reverse=True)

[(0.35084547435974833, 'median_income'),
 (0.13603025278695122, 'ocean_proximity_INLAND'),
 (0.10657676203841022, 'pop_per_hhold'),
 (0.07200379542738336, 'longitude'),
 (0.06744146994419593, 'rooms_per_hhold'),
 (0.0651051721390262, 'latitude'),
 (0.0646402615899256, 'bedrooms_per_room'),
 (0.042778344090041276, 'housing_median_age'),
 (0.018466043670316762, 'total_rooms'),
 (0.017027696313435557, 'population'),
 (0.01650537166376295, 'households'),
 (0.016303014385167763, 'total_bedrooms'),
 (0.015952336116464766, 'ocean_proximity_<1H OCEAN'),
 (0.00591285215422616, 'ocean_proximity_NEAR OCEAN'),
 (0.0042088153789590415, 'ocean_proximity_NEAR BAY'),
 (0.00020233794198492978, 'ocean_proximity_ISLAND')]

In [149]:
final_model = grid_search.best_estimator_
X_test = start_test_set.drop("median_house_value", axis=1)
y_test = start_test_set["median_house_value"].copy()
housing_prep = HousingPreprocessor()
X_test_prepared = housing_prep.fit_transform(X_test)
X_test_prepared.to_csv('x_test')
final_predictions = final_model.predict(X_test_prepared)[:,1]
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

51335.2287212027

In [145]:
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,loc=squared_errors.mean(), scale=stats.sem(squared_errors)))

array([49457.44495159, 53146.70784326])

In [53]:
HousingPreprocessor().full_pipeline