In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

Load the houseprices data from Thinkful's database. Reimplement your model from the previous checkpoint.

In [27]:
import os

# Connection settings for the houseprices database. Each value can be
# overridden through an environment variable so the notebook does not have to
# be edited to change credentials; the defaults are the values that were
# previously hard-coded in this cell.
postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
try:
    # Read the whole houseprices table into a DataFrame.
    df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    # Dispose of the engine even when the query raises, so no connection
    # is left open after a failed load.
    engine.dispose()

In [28]:
## Fill continuous variable null values with zero
for column in ['masvnrarea', 'lotfrontage', 'garagecars']:
    df[column] = df[column].fillna(0)

## Replace missing garage years with a placeholder year.
## A missing value may arrive either as a real null or as the string 'None'
## (the latter is what this cell itself writes in its last step, so it shows up
## on a re-run). Both forms are handled, which keeps the column numeric and
## makes the cell give the same result however many times it is executed.
GARAGE_YEAR_FILL = 1980  # placeholder year used by this analysis; rationale not recorded
garage_year_missing = df['garageyrblt'].isna() | (df['garageyrblt'] == 'None')
indexes = df[garage_year_missing].index
df.loc[indexes, ['garageyrblt']] = GARAGE_YEAR_FILL
# Still raises on any other non-numeric value instead of silently coercing it.
df['garageyrblt'] = pd.to_numeric(df['garageyrblt'])

## Fill all null values with 'none'
df = df.fillna('None')

In [29]:
# Engineered features: combined basement + first-floor + second-floor square
# footage, and its interaction with the overall quality rating.
df['ttl_sq_ft'] = df['totalbsmtsf'] + df['firstflrsf'] + df['secondflrsf']
df['sqfXqual'] = df['ttl_sq_ft'] * df['overallqual']

# One-hot encode the zoning class (first level dropped) and append the
# indicator columns to the frame.
zoning_dummies = pd.get_dummies(df['mszoning'], drop_first=True)
df = pd.concat([df, zoning_dummies], axis=1)

# Columns used for modelling. 'saleprice' is the target and is split off
# below; 'FV', 'RH', 'RL', 'RM' are presumably the zoning indicators created
# above -- verify against the mszoning levels in the data.
model_columns = [
    'yearbuilt', 'yearremodadd', 'bsmtfinsf1', 'fireplaces', 'garagecars',
    'wooddecksf', 'secondflrsf', 'FV', 'RH', 'RL', 'RM',
    'saleprice', 'ttl_sq_ft', 'sqfXqual',
]
df2 = df[model_columns]

# Target is log1p(saleprice); every other selected column is a predictor.
Y = np.log1p(df2['saleprice'])
X = df2.drop(columns=['saleprice'])

# Seeded train/test split.
# NOTE(review): test_size=.02 holds out only 2% of the rows -- confirm that
# .02 (rather than .2) is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.02, random_state=390)

Try OLS, Lasso, Ridge, and ElasticNet regression using the same model specification. This time, you need to do k-fold cross-validation to choose the best hyperparameter values for your models.

In [32]:
# Baseline: ordinary least squares fitted on the training split.
lrm = LinearRegression()
lrm.fit(X_train, Y_train)

Y_pred_train = lrm.predict(X_train)
Y_pred_test = lrm.predict(X_test)

# Fit quality on both splits, then error metrics on the held-out rows.
# Y is log1p(saleprice), so every error below is on the log scale.
print('R-squared train: {} \nR-squared test: {}'.format(
    lrm.score(X_train, Y_train), lrm.score(X_test, Y_test)))
print('MAE: {}'.format(mean_absolute_error(Y_test, Y_pred_test)))
print('MSE: {}'.format(mse(Y_test, Y_pred_test)))
print('RMSE: {}'.format(rmse(Y_test, Y_pred_test)))
print('MAPE: {}'.format(((Y_test - Y_pred_test) / Y_test).abs().mean()))

# Seed the shuffled folds (same seed as the train/test split) so the
# cross-validation scores are identical on every run and every model in the
# notebook can be scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=390)
# 10-fold scores on the full data set, using the estimator's default scorer.
cross_val_score(lrm, X, Y, cv=kfold)

R-squared train: 0.8254385059049244 
R-squared test: 0.9082138652014795
MAE: 0.09267718641438687
MSE: 0.018675924992709845
RMSE: 0.1366598880166007
MAPE: 0.007897929205480998


array([0.69054489, 0.84496639, 0.82863481, 0.84793952, 0.86994585,
       0.88487373, 0.8812502 , 0.87723982, 0.88272707, 0.41654209])

In [33]:
# Ridge regression with the penalty strength chosen by RidgeCV's built-in
# cross-validation (default settings).
ridge = RidgeCV()
ridge.fit(X_train, Y_train)

Y_pred_train = ridge.predict(X_train)
Y_pred_test = ridge.predict(X_test)

# Fit quality on both splits, then error metrics on the held-out rows.
# Y is log1p(saleprice), so every error below is on the log scale.
print('R-squared train: {} \nR-squared test: {}'.format(
    ridge.score(X_train, Y_train), ridge.score(X_test, Y_test)))
print('MAE: {}'.format(mean_absolute_error(Y_test, Y_pred_test)))
print('MSE: {}'.format(mse(Y_test, Y_pred_test)))
print('RMSE: {}'.format(rmse(Y_test, Y_pred_test)))
print('MAPE: {}'.format(((Y_test - Y_pred_test) / Y_test).abs().mean()))

# Seed the shuffled folds (same seed as the train/test split) so the
# cross-validation scores are identical on every run and every model in the
# notebook can be scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=390)
# 10-fold scores on the full data set, using the estimator's default scorer.
cross_val_score(ridge, X, Y, cv=kfold)

R-squared train: 0.8254249416427648 
R-squared test: 0.9051487992748176
MAE: 0.09325776452177632
MSE: 0.019299580640370604
RMSE: 0.13892293057796687
MAPE: 0.007952986801642265


array([0.83358314, 0.86717454, 0.86185936, 0.85526778, 0.8672741 ,
       0.77778482, 0.4510395 , 0.85436503, 0.86327565, 0.90156865])

In [19]:
# Ridge with an extreme, hand-picked penalty (alpha = 10**37). The recorded
# run reports a train R-squared of 0.0 for this setting, so it serves as a
# "fully shrunk" reference point rather than a candidate model.
ridge = Ridge(alpha=10**37)
ridge.fit(X_train, Y_train)

Y_pred_train = ridge.predict(X_train)
Y_pred_test = ridge.predict(X_test)

# Fit quality on both splits, then error metrics on the held-out rows.
# Y is log1p(saleprice), so every error below is on the log scale.
print('R-squared train: {} \nR-squared test: {}'.format(
    ridge.score(X_train, Y_train), ridge.score(X_test, Y_test)))
print('MAE: {}'.format(mean_absolute_error(Y_test, Y_pred_test)))
print('MSE: {}'.format(mse(Y_test, Y_pred_test)))
print('RMSE: {}'.format(rmse(Y_test, Y_pred_test)))
print('MAPE: {}'.format(((Y_test - Y_pred_test) / Y_test).abs().mean()))

# Seed the shuffled folds (same seed as the train/test split) so the
# cross-validation scores are identical on every run and every model in the
# notebook can be scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=390)
# 10-fold scores on the full data set, using the estimator's default scorer.
cross_val_score(ridge, X, Y, cv=kfold)

R-squared train: 0.0 
R-squared test: -0.002230960694857176
MAE: 0.33582277665744464
MSE: 0.20392611899820828
RMSE: 0.45158179657533615
MAPE: 0.028320178258867325


array([-6.09811744e-03, -1.70782383e-02, -1.31032033e-04, -8.25179844e-05,
       -3.95332762e-05, -5.51308997e-03, -9.99038707e-03, -1.17068558e-02,
       -1.76135105e-02, -5.85467508e-03])

In [35]:
# Ridge with a fixed, hand-picked penalty (alpha = 100) for comparison with
# the cross-validated fit.
ridge = Ridge(alpha=100)
ridge.fit(X_train, Y_train)

Y_pred_train = ridge.predict(X_train)
Y_pred_test = ridge.predict(X_test)

# Fit quality on both splits, then error metrics on the held-out rows.
# Y is log1p(saleprice), so every error below is on the log scale.
print('R-squared train: {} \nR-squared test: {}'.format(
    ridge.score(X_train, Y_train), ridge.score(X_test, Y_test)))
print('MAE: {}'.format(mean_absolute_error(Y_test, Y_pred_test)))
print('MSE: {}'.format(mse(Y_test, Y_pred_test)))
print('RMSE: {}'.format(rmse(Y_test, Y_pred_test)))
print('MAPE: {}'.format(((Y_test - Y_pred_test) / Y_test).abs().mean()))

# Seed the shuffled folds (same seed as the train/test split) so the
# cross-validation scores are identical on every run and every model in the
# notebook can be scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=390)
# 10-fold scores on the full data set, using the estimator's default scorer.
cross_val_score(ridge, X, Y, cv=kfold)

R-squared train: 0.8168708067656612 
R-squared test: 0.8207287896637013
MAE: 0.10324346398343008
MSE: 0.0364767040789149
RMSE: 0.19098875380219354
MAPE: 0.008912966083369611


array([0.78711265, 0.41958728, 0.84156201, 0.86687997, 0.69623519,
       0.89620986, 0.83992893, 0.88541582, 0.85028592, 0.87096234])

In [21]:
# Lasso regression with the penalty strength chosen by LassoCV's built-in
# cross-validation (default settings).
lasso = LassoCV()
lasso.fit(X_train, Y_train)

Y_pred_train = lasso.predict(X_train)
Y_pred_test = lasso.predict(X_test)

# Fit quality on both splits, then error metrics on the held-out rows.
# Y is log1p(saleprice), so every error below is on the log scale.
print('R-squared train: {} \nR-squared test: {}'.format(
    lasso.score(X_train, Y_train), lasso.score(X_test, Y_test)))
print('MAE: {}'.format(mean_absolute_error(Y_test, Y_pred_test)))
print('MSE: {}'.format(mse(Y_test, Y_pred_test)))
print('RMSE: {}'.format(rmse(Y_test, Y_pred_test)))
print('MAPE: {}'.format(((Y_test - Y_pred_test) / Y_test).abs().mean()))

# Seed the shuffled folds (same seed as the train/test split) so the
# cross-validation scores are identical on every run and every model in the
# notebook can be scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=390)
# 10-fold scores on the full data set, using the estimator's default scorer.
cross_val_score(lasso, X, Y, cv=kfold)

R-squared train: 0.6784632764489293 
R-squared test: 0.6875700266928421
MAE: 0.1604438120928058
MSE: 0.06357080794138499
RMSE: 0.25213252059459723
MAPE: 0.013746616278129955


array([ 0.73989203,  0.7835821 ,  0.77702338,  0.57596442,  0.77187035,
        0.77099236,  0.75946966,  0.7534005 , -0.22804763,  0.75599613])

In [22]:
# Lasso with a fixed, hand-picked penalty (alpha = 20) for comparison with
# the cross-validated fit.
lasso = Lasso(alpha=20)
lasso.fit(X_train, Y_train)

Y_pred_train = lasso.predict(X_train)
Y_pred_test = lasso.predict(X_test)

# Fit quality on both splits, then error metrics on the held-out rows.
# Y is log1p(saleprice), so every error below is on the log scale.
print('R-squared train: {} \nR-squared test: {}'.format(
    lasso.score(X_train, Y_train), lasso.score(X_test, Y_test)))
print('MAE: {}'.format(mean_absolute_error(Y_test, Y_pred_test)))
print('MSE: {}'.format(mse(Y_test, Y_pred_test)))
print('RMSE: {}'.format(rmse(Y_test, Y_pred_test)))
print('MAPE: {}'.format(((Y_test - Y_pred_test) / Y_test).abs().mean()))

# Seed the shuffled folds (same seed as the train/test split) so the
# cross-validation scores are identical on every run and every model in the
# notebook can be scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=390)
# 10-fold scores on the full data set, using the estimator's default scorer.
cross_val_score(lasso, X, Y, cv=kfold)

R-squared train: 0.6797034238536916 
R-squared test: 0.6966241311027542
MAE: 0.1566388272515221
MSE: 0.06172854957407428
RMSE: 0.24845230844987995
MAPE: 0.013431609802339041


array([ 0.74441083,  0.77334897,  0.76212048,  0.56322558,  0.7642841 ,
        0.77558904,  0.74943617,  0.74657636, -0.11237169,  0.74022257])

In [23]:
# ElasticNet regression with the penalty strength chosen by ElasticNetCV's
# built-in cross-validation (default settings).
elastic = ElasticNetCV()
elastic.fit(X_train, Y_train)

Y_pred_train = elastic.predict(X_train)
Y_pred_test = elastic.predict(X_test)

# Fit quality on both splits, then error metrics on the held-out rows.
# Y is log1p(saleprice), so every error below is on the log scale.
print('R-squared train: {} \n R-squared test: {}'.format(
    elastic.score(X_train, Y_train), elastic.score(X_test, Y_test)))
print('MAE: {}'.format(mean_absolute_error(Y_test, Y_pred_test)))
print('MSE: {}'.format(mse(Y_test, Y_pred_test)))
print('RMSE: {}'.format(rmse(Y_test, Y_pred_test)))
print('MAPE: {}'.format(((Y_test - Y_pred_test) / Y_test).abs().mean()))

# Seed the shuffled folds (same seed as the train/test split) so the
# cross-validation scores are identical on every run and every model in the
# notebook can be scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=390)
# 10-fold scores on the full data set, using the estimator's default scorer.
cross_val_score(elastic, X, Y, cv=kfold)

R-squared train: 0.6784631841945427 
 R-squared test: 0.6875696082891455
MAE: 0.1604439765872346
MSE: 0.06357089307489748
RMSE: 0.25213268942145817
MAPE: 0.013746629896268387


array([ 0.7398918 ,  0.78358196,  0.77702306,  0.57596488,  0.77186954,
        0.77099213,  0.75946983,  0.75340119, -0.22804695,  0.75599516])

In [36]:
# ElasticNet with an extreme, hand-picked penalty (alpha = 10**10,
# l1_ratio = 0.8). The recorded run reports a train R-squared of 0.0 for this
# setting, so it serves as a "fully shrunk" reference point rather than a
# candidate model.
elastic = ElasticNet(alpha=10**10, l1_ratio=0.8)
elastic.fit(X_train, Y_train)

Y_pred_train = elastic.predict(X_train)
Y_pred_test = elastic.predict(X_test)

# Fit quality on both splits, then error metrics on the held-out rows.
# Y is log1p(saleprice), so every error below is on the log scale.
print('R-squared train: {} \n R-squared test: {}'.format(
    elastic.score(X_train, Y_train), elastic.score(X_test, Y_test)))
print('MAE: {}'.format(mean_absolute_error(Y_test, Y_pred_test)))
print('MSE: {}'.format(mse(Y_test, Y_pred_test)))
print('RMSE: {}'.format(rmse(Y_test, Y_pred_test)))
print('MAPE: {}'.format(((Y_test - Y_pred_test) / Y_test).abs().mean()))

# Seed the shuffled folds (same seed as the train/test split) so the
# cross-validation scores are identical on every run and every model in the
# notebook can be scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=390)
# 10-fold scores on the full data set, using the estimator's default scorer.
cross_val_score(elastic, X, Y, cv=kfold)

R-squared train: 0.0 
 R-squared test: -0.002230960694857176
MAE: 0.33582277665744464
MSE: 0.20392611899820828
RMSE: 0.45158179657533615
MAPE: 0.028320178258867325


array([-0.0078519 , -0.00543366, -0.00068278, -0.00281817, -0.00060011,
       -0.02762842, -0.00895402, -0.0108221 , -0.01986626, -0.00277468])

Which model is the best? Why?

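Oddly, the first. Plain OLS has the best held-out fit (test R-squared 0.908, RMSE 0.137), narrowly ahead of RidgeCV (0.905, 0.139). Ridge with alpha=100 drops to a test R-squared of 0.821, the Lasso and ElasticNetCV fits sit at about 0.69, and the extreme-penalty Ridge and ElasticNet fits fall to roughly zero. In other words, none of the regularized models improves on OLS for this specification. One caveat: the test set is only 2% of the rows, so these test metrics are noisy.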