In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

from data_utils import train_test_split, build_pipeline

In [2]:
# `train_test_split` here is the project helper imported from `data_utils`
# (called with no arguments), not scikit-learn's function of the same name.
# It yields two (features, target) pairs: the training split, then the test split.
(df_train_X, train_y), (df_test_X, test_y) = train_test_split()

In [3]:
# Build the preprocessing pipeline from the training frame and fit it on the
# training frame only; the test frame is never passed to `fit`.
pipeline = build_pipeline(df_train_X)
pipeline.fit(df_train_X)

# Apply the same fitted pipeline to both splits (training first, then test).
train_X, test_X = (
    pipeline.transform(frame) for frame in (df_train_X, df_test_X)
)

# Sanity check: both splits should end up with the same number of columns.
print(
    f"train_X.shape = {train_X.shape}",
    f"test_X.shape = {test_X.shape}",
    sep="\n",
)

num_attribs = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity']
train_X.shape = (16512, 16)
test_X.shape = (4128, 16)


In [4]:
# Baseline model: ordinary least squares on the transformed training features.
# scikit-learn's `fit` returns the estimator itself, so construction and
# fitting can be chained while `lr_model` still names the fitted model.
lr_model = LinearRegression().fit(train_X, train_y)
lr_model

LinearRegression()

In [5]:
# Spot-check the linear model on the first five training rows.
# `df_sample_y` holds the matching targets; it is not used in the cells
# visible here and is presumably kept for comparison with the predictions.
df_sample_X, df_sample_y = df_train_X.head(5), train_y.head(5)

# The raw rows must go through the fitted pipeline before prediction.
sample_X = pipeline.transform(df_sample_X)

lr_model.predict(sample_X)

array([210644.60459286, 317768.80697211, 210956.43331178,  59218.98886849,
       189747.55849879])

In [6]:
def evaluate(model, X, y):
    """Print the RMSE and MAE of ``model``'s predictions on ``X`` against ``y``.

    Args:
        model: Object exposing ``predict(X)``.
        X: Features handed to ``model.predict``.
        y: Ground-truth targets the predictions are compared with.

    Returns:
        None. Both metrics are written to stdout on a single line.
    """
    predictions = model.predict(X)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y, predictions))
    mae = mean_absolute_error(y, predictions)
    print(f"rmse = {rmse}, mae = {mae}")

In [7]:
# Training-set error of the linear baseline.
evaluate(model=lr_model, X=train_X, y=train_y)

rmse = 68628.19819848922, mae = 49439.89599001897


In [8]:
# Held-out error of the linear baseline. In the recorded run it is close to
# the training error.
evaluate(model=lr_model, X=test_X, y=test_y)

rmse = 66911.98070857547, mae = 49228.6156649542


In [9]:
# Second candidate: a decision tree with default hyperparameters.
# `random_state=42` is fixed so repeated runs use the same seed.
# `fit` returns the estimator itself, so `tree_model` names the fitted tree.
tree_model = DecisionTreeRegressor(random_state=42).fit(train_X, train_y)
tree_model

DecisionTreeRegressor(random_state=42)

In [10]:
# Training-set error of the tree. In the recorded run both metrics are exactly
# 0.0, i.e. the tree reproduces every training target; read this together with
# the test-set error before trusting it.
evaluate(model=tree_model, X=train_X, y=train_y)

rmse = 0.0, mae = 0.0


In [11]:
# Held-out error of the tree. In the recorded run its test RMSE is higher than
# the linear model's test RMSE (its MAE is lower), despite the zero training
# error, which points to overfitting.
evaluate(model=tree_model, X=test_X, y=test_y)

rmse = 70388.94215505105, mae = 45762.66351744186
