In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Relative location of the housing CSV; consumed by pd.read_csv in the load cell.
DATA_PATH = './data/housing.csv'

In [3]:
# Load the raw dataset into a DataFrame; every later split derives from df_data.
df_data = pd.read_csv(DATA_PATH)

In [4]:
from splitter import split_train_test
from pipeline import build_preprocess_pipeline

# Split the raw frame into train/test feature and target pairs using the
# project-local helper (its split strategy is not visible in this notebook).
(x_train, y_train), (x_test, y_test) = split_train_test(df_data)

# list(x_train) iterates the training features; presumably this yields the
# column labels of a DataFrame -- confirm against split_train_test.
columns = list(x_train)
preprocess_pipeline = build_preprocess_pipeline(columns)

# Fit the preprocessing on the training split only; the test split is merely
# transformed with the already-fitted pipeline.
x_train_processed = preprocess_pipeline.fit_transform(x_train)
x_test_processed = preprocess_pipeline.transform(x_test)
print(x_train_processed.shape)

(16512, 16)


## Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression

# Baseline model: LinearRegression with default arguments, fitted on the
# preprocessed training features.
lr_model = LinearRegression()
lr_model.fit(x_train_processed, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [6]:
# NOTE(review): despite the "test" in these names, the five rows are sliced
# from the TRAINING split (x_train / y_train), so this is an in-sample sanity
# check rather than a held-out evaluation.
x_test_sample = x_train.iloc[:5]
y_test_sample = y_train.iloc[:5]

# Reuse the pipeline fitted earlier (transform only, no refit), then print the
# predictions followed by the true targets for a side-by-side comparison.
x_test_sample_processed = preprocess_pipeline.transform(x_test_sample)
y_hat_sample = lr_model.predict(x_test_sample_processed)
print(y_hat_sample)
print(y_test_sample.values)

[210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]
[286600. 340600. 196900.  46300. 254500.]


In [7]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

def evaluate(model, x_test, y_test):
    """Score ``model`` on a single feature/target pair.

    Args:
        model: Object exposing ``predict``; called once on ``x_test``.
        x_test: Features handed to ``model.predict``.
        y_test: True target values the predictions are compared against.

    Returns:
        Tuple ``(mse, rmse, mae)`` where ``rmse`` is the square root of ``mse``.
    """
    predictions = model.predict(x_test)
    squared_error = mean_squared_error(y_test, predictions)
    absolute_error = mean_absolute_error(y_test, predictions)
    return squared_error, np.sqrt(squared_error), absolute_error
    
def evaluate_model(model, testing=False):
    """Print MSE / RMSE / MAE for ``model`` on the notebook's data splits.

    Reads the notebook-level ``x_train_processed`` / ``y_train`` and, when
    requested, ``x_test_processed`` / ``y_test``.

    Args:
        model: Object exposing ``predict``; forwarded to ``evaluate``.
        testing: When truthy, also report metrics on the test split.

    Returns:
        None; the metrics are written to stdout.
    """
    train_metrics = evaluate(model, x_train_processed, y_train)
    print("Training: MSE = {}, RMSE = {}, MAE = {}".format(*train_metrics))
    if not testing:
        return
    # Test metrics are opt-in and only computed when explicitly requested.
    test_metrics = evaluate(model, x_test_processed, y_test)
    print("Testing:  MSE = {}, RMSE = {}, MAE = {}".format(*test_metrics))
    
def cross_validate_model(model):
    """Print per-fold RMSE, its mean and its standard deviation.

    Runs ``cross_val_score`` with ``cv=10`` on the notebook-level
    ``x_train_processed`` / ``y_train``.

    Args:
        model: Estimator passed to ``cross_val_score``.

    Returns:
        None; the summary is written to stdout.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        x_train_processed,
        y_train,
        scoring='neg_mean_squared_error',
        cv=10,
    )
    # The scorer reports negated MSE, so flip the sign before taking the root.
    fold_rmse = np.sqrt(-neg_mse_per_fold)
    summary = "RMSE scores = {}\nMean = {}\nSTD = {}"
    print(summary.format(fold_rmse, fold_rmse.mean(), fold_rmse.std()))

In [8]:
# Training-split metrics only: `testing` is left at its default of False.
evaluate_model(lr_model)

Training: MSE = 4709829587.971121, RMSE = 68628.19819848923, MAE = 49439.89599001897


In [9]:
# 10-fold cross-validated RMSE on the training split for the linear baseline.
cross_validate_model(lr_model)

RMSE scores = [66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067]
Mean = 69052.46136345083
STD = 2731.674001798349


## Decision Tree Regression

In [10]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with a fixed random_state; all other hyper-parameters are left
# at their defaults. Fitted on the preprocessed training features.
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(x_train_processed, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [11]:
# Training-split metrics only. NOTE(review): the recorded output shows zero
# training error, so this number says nothing about generalisation; use the
# cross-validated score instead.
evaluate_model(tree_model)

Training: MSE = 0.0, RMSE = 0.0, MAE = 0.0


In [12]:
# 10-fold cross-validated RMSE on the training split for the decision tree.
cross_validate_model(tree_model)

RMSE scores = [70194.33680785 66855.16363941 72432.58244769 70758.73896782
 71115.88230639 75585.14172901 70262.86139133 70273.6325285
 75366.87952553 71231.65726027]
Mean = 71407.68766037929
STD = 2439.4345041191004


## Random Forest Regression 

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 10 trees with a fixed random_state, fitted on the
# preprocessed training features.
forest_model = RandomForestRegressor(n_estimators=10, random_state=42)
forest_model.fit(x_train_processed, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [14]:
# Training-split metrics only: `testing` is left at its default of False.
evaluate_model(forest_model)

Training: MSE = 481070269.5059823, RMSE = 21933.31414779769, MAE = 13520.692369186045


In [15]:
# 10-fold cross-validated RMSE on the training split for the random forest.
cross_validate_model(forest_model)

RMSE scores = [51646.44545909 48940.60114882 53050.86323649 54408.98730149
 50922.14870785 56482.50703987 51864.52025526 49760.85037653
 55434.21627933 53326.10093303]
Mean = 52583.72407377466
STD = 2298.353351147122


## Support Vector Regression

In [16]:
from sklearn.svm import SVR

# Support vector regressor with a linear kernel; every other hyper-parameter is
# left at its default (the recorded repr shows C=1.0, epsilon=0.1).
# NOTE(review): the recorded RMSE for this model is far higher than for the
# other models in this notebook -- the defaults look untuned for this target;
# confirm before drawing conclusions about SVR.
svr_model = SVR(kernel='linear')
svr_model.fit(x_train_processed, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
# Training-split metrics only: `testing` is left at its default of False.
evaluate_model(svr_model)

Training: MSE = 12342017004.586128, RMSE = 111094.6308539982, MAE = 81841.46999231384


In [18]:
# 10-fold cross-validated RMSE on the training split for the linear-kernel SVR.
cross_validate_model(svr_model)

RMSE scores = [105342.09141998 112489.24624123 110092.35042753 113403.22892482
 110638.90119657 115675.8320024  110703.56887243 114476.89008206
 113756.17971227 111520.1120808 ]
Mean = 111809.84009600841
STD = 2762.393664321567
