In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

from data_utils import train_test_split, build_pipeline

In [2]:
# Single seed reused for the data split and for every seeded estimator in this notebook.
SEED = 42

In [3]:
# NOTE: this is the project-local train_test_split from data_utils, not sklearn's;
# it is unpacked here as ((train features, train target), (test features, test target)).
(df_train_X, train_y), (df_test_X, test_y) = train_test_split(seed=SEED)

In [4]:
# Build and fit the preprocessing pipeline using the training frame only.
pipeline = build_pipeline(df_train_X)
pipeline.fit(df_train_X)

# Apply the already-fitted pipeline to both splits; the test frame is only
# transformed, never fitted on.
train_X = pipeline.transform(df_train_X)
test_X = pipeline.transform(df_test_X)
print(f"train_X.shape = {train_X.shape}")
print(f"test_X.shape = {test_X.shape}")

train_X.shape = (16512, 16)
test_X.shape = (4128, 16)


In [5]:
# Baseline: ordinary linear regression with default settings, fitted on the
# transformed training features.
lr_model = LinearRegression()
lr_model.fit(train_X, train_y)

LinearRegression()

In [6]:
# Take the first five training rows (and their targets) as a small sample and
# run them through the fitted preprocessing pipeline.
df_sample_X = df_train_X.iloc[:5]
df_sample_y = train_y.iloc[:5]
sample_X = pipeline.transform(df_sample_X)

In [7]:
# Linear-regression predictions for the five sample rows; df_sample_y holds the
# corresponding true targets.
lr_model.predict(sample_X)

array([210644.60459286, 317768.80697211, 210956.43331178,  59218.98886849,
       189747.55849879])

In [8]:
def evaluate(model, X, y):
    """Score ``model`` on ``(X, y)`` and return ``(rmse, mae)``.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X : features handed to ``model.predict``
    y : true targets compared against the predictions

    Returns
    -------
    tuple
        ``(rmse, mae)``: the square root of the mean squared error and the
        mean absolute error of the predictions, in that order.
    """
    predictions = model.predict(X)
    root_mean_squared = np.sqrt(mean_squared_error(y, predictions))
    mean_absolute = mean_absolute_error(y, predictions)
    return root_mean_squared, mean_absolute
    
def evaluate_model(model):
    """Print ``model``'s RMSE and MAE on the training set.

    Scores are computed by ``evaluate`` against the notebook-level ``train_X``
    and ``train_y``. Nothing is returned.
    """
    rmse, mae = evaluate(model, train_X, train_y)
    print(f"train: rmse = {rmse}, mae = {mae}")
    
def cross_validate_model(model, cv=10):
    """Print the mean and standard deviation of ``model``'s cross-validated RMSE.

    Runs ``cross_val_score`` on the notebook-level ``train_X`` / ``train_y``
    with the ``"neg_mean_squared_error"`` scorer and converts each fold's score
    into an RMSE. Nothing is returned.

    Parameters
    ----------
    model : estimator
        Passed straight to ``cross_val_score``.
    cv : int or cross-validation splitter, default=10
        Forwarded to ``cross_val_score``'s ``cv`` argument. The default keeps
        the previously hard-coded 10 folds, so existing calls are unaffected.
    """
    # The scorer reports negated MSE, so flip the sign before taking the
    # square root to get a per-fold RMSE.
    neg_mse_scores = cross_val_score(
        model, train_X, train_y, scoring="neg_mean_squared_error", cv=cv
    )
    rmse_scores = np.sqrt(-neg_mse_scores)
    print(f"cv rmse mean  = {rmse_scores.mean()}")
    print(f"cv rmse std = {rmse_scores.std()}")

In [9]:
# Training-set error of the linear baseline (scored on the same data it was fitted on).
evaluate_model(lr_model)

train: rmse = 68628.19819848922, mae = 49439.89599001897


In [10]:
# 10-fold cross-validated RMSE of the linear baseline, for comparison with the
# training-set score.
cross_validate_model(lr_model)

cv rmse mean  = 69052.46136345083
cv rmse std = 2731.6740017983484


In [11]:
# Decision tree with default hyperparameters apart from the shared seed.
tree_model = DecisionTreeRegressor(random_state=SEED)
tree_model.fit(train_X, train_y)

DecisionTreeRegressor(random_state=42)

In [12]:
# Training-set error of the tree. In the recorded run this is exactly 0.0, i.e. the
# tree reproduces every training target, so this number says nothing about
# generalisation; rely on the cross-validated score instead.
evaluate_model(tree_model)

train: rmse = 0.0, mae = 0.0


In [13]:
# 10-fold cross-validated RMSE of the tree. In the recorded run it is slightly worse
# than the linear baseline despite the perfect training-set score.
cross_validate_model(tree_model)

cv rmse mean  = 71407.68766037929
cv rmse std = 2439.4345041191004


In [14]:
# Random forest of 100 trees, seeded with the shared seed.
forest_model = RandomForestRegressor(n_estimators=100, random_state=SEED)
forest_model.fit(train_X, train_y)

RandomForestRegressor(random_state=42)

In [15]:
# Training-set error of the forest (scored on the same data it was fitted on).
evaluate_model(forest_model)

train: rmse = 18603.515021376355, mae = 12062.631660004847


In [16]:
# 10-fold cross-validated RMSE of the forest. In the recorded run this is the best
# cross-validated score in the notebook, but still far above the forest's
# training-set RMSE.
cross_validate_model(forest_model)

cv rmse mean  = 50182.303100336096
cv rmse std = 2097.0810550985693


In [17]:
# Support vector regressor with a linear kernel; every other hyperparameter is
# left at its default.
svm_model = SVR(kernel='linear')
svm_model.fit(train_X, train_y)

SVR(kernel='linear')

In [18]:
# Training-set error of the SVR. In the recorded run it is much higher than the
# linear baseline's.
# NOTE(review): possibly the default C / epsilon do not suit the scale of the
# target — TODO confirm with a hyperparameter search.
evaluate_model(svm_model)

train: rmse = 111094.6308539982, mae = 81841.46999231384


In [19]:
# 10-fold cross-validated RMSE of the SVR. In the recorded run it is close to the
# SVR's training-set RMSE and the worst cross-validated score in the notebook.
cross_validate_model(svm_model)

cv rmse mean  = 111809.84009600841
cv rmse std = 2762.393664321567
