### Select and Train a Model

#### 1. Load Preprocessed Data

In [1]:
import pickle
train_pickle_path = 'datasets/train.pickle'

with open(train_pickle_path, 'rb') as f:
    data = pickle.load(f)

In [9]:
housing_labels = data[0]
housing_prepared = data[1]

In [12]:
housing_labels

17606    286600.0
18632    340600.0
14650    196900.0
3230      46300.0
3555     254500.0
           ...   
6563     240200.0
12053    113000.0
13908     97800.0
11159    225900.0
15775    500001.0
Name: median_house_value, Length: 16512, dtype: float64

In [13]:
len(housing_prepared)

16512

#### 2. Train a Linear Regression Model

In [14]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression()

In [16]:
# measure with RMSE on training set
from sklearn.metrics import mean_squared_error
import numpy as np 

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse   # $68,628 --> too large compared to the range of median_housing_values 
# --> underfitting --> need more powerful models /more features /less constraints

68628.19819848923

#### 3. Train a Decision Tree Regressor (can find some non-linear relationships, more in Chap6)

In [17]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor()

In [18]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse  # 0.0 --> overfitting

0.0

Better Evaluation Using __Cross-Validation__ --> avoid touching test set before final model is trained

In [28]:
from sklearn.model_selection import cross_val_score
tree_reg = DecisionTreeRegressor()
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, 
                         scoring="neg_root_mean_squared_error", cv=10)   # 10 folds: 10 scores; greater score is better
tree_rmse_scores = np.sqrt(-scores)  # lower MSE is better, so put negative before score

In [29]:
print("Scores: ", scores)
print("Mean: ", scores.mean())  # estimate the performance of the model
print("Standard deviation: ", scores.std())  # how precise the evaluation is

Scores:  [-69405.79076044 -66814.30033838 -70816.52122417 -67936.18742258
 -71134.75937062 -74888.14137685 -71049.49797833 -71053.00704558
 -76604.89415278 -68758.55189923]
Mean:  -70846.16515689557
Standard deviation:  2842.9832296622803


#### 4. Train a Random Forest Regressor (train many decision trees, more in Chap7)

Building a model on top of many other models is called __Ensemble Learning__. From sklearn.ensemble we import the RandomForestRegressor and the codes are essentially the same as for the other models.

In [33]:
# save the models (and relevant numbers for later comparison)
import joblib

joblib.dump(tree_reg, "models/tree_reg.pkl")
# tree_reg_loaded = joblib.load("models/tree_reg.pkl")   # for later use

['models/tree_reg.pkl']