# Introduction

### Imports

In [25]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import SCORERS
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from statsmodels.tools.eval_measures import mse, rmse
from statsmodels.tsa.stattools import acf
%matplotlib inline

# Widen pandas' display limits so wide/long frames are shown without truncation.
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 150)

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Imports</a></span></li></ul></li></ul></li><li><span><a href="#Assignment" data-toc-modified-id="Assignment-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Assignment</a></span></li><li><span><a href="#OLS-Model" data-toc-modified-id="OLS-Model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>OLS Model</a></span></li><li><span><a href="#Lasso-Model" data-toc-modified-id="Lasso-Model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Lasso Model</a></span></li><li><span><a href="#Ridge-Model" data-toc-modified-id="Ridge-Model-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Ridge Model</a></span></li><li><span><a href="#Elastic-Net" data-toc-modified-id="Elastic-Net-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Elastic Net</a></span></li><li><span><a href="#Best-Model" data-toc-modified-id="Best-Model-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Best Model</a></span></li></ul></div>

# Assignment
Load the houseprices data from Thinkful's database.
Reimplement your model from the previous checkpoint.
Try OLS, Lasso, Ridge, and ElasticNet regression using the same model specification. This time, you need to do k-fold cross-validation to choose the best hyperparameter values for your models. Which model is the best? Why?

In [5]:
# Connection settings for the houseprices PostgreSQL database.
# Each value can be overridden through an environment variable so credentials
# do not have to be edited (or committed) in the notebook; the literals below
# are only fallbacks that keep the notebook runnable as before.
postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
try:
    # Pull the whole table into memory once; later cells work on this frame.
    housing_df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    # Release pooled connections even if the query fails.
    engine.dispose()


In [6]:
# Keep only rows that pass all three filters:
#  - totalbsmtsf < 5000 and grlivarea < 4000: remove major outliers
#    (they caused RMSE to be exceptionally high)
#  - saleprice < 400,000: not enough high-value houses in the dataset
# A single .copy() after filtering gives an independent frame, so the column
# assignment below does not write into a slice of the original.
housing_df = housing_df[
    (housing_df['totalbsmtsf'] < 5000)
    & (housing_df['grlivarea'] < 4000)
    & (housing_df['saleprice'] < 400000)
].copy()

# Total square footage = basement + first floor + second floor.
housing_df['totalsf'] = (
    housing_df['totalbsmtsf'] + housing_df['firstflrsf'] + housing_df['secondflrsf']
)

# OLS Model

In [80]:
# Columns of housing_df used as predictors by every model in this notebook.
feature_list = [
    'overallqual', 'yearbuilt', 'yearremodadd', 'totalbsmtsf',
    'firstflrsf', 'grlivarea', 'garagearea',
]

# Design matrix and response shared by the OLS, Lasso, Ridge and ElasticNet cells.
data = housing_df[feature_list]
target = housing_df['saleprice']

# Plain least-squares baseline, scored with 5-fold cross-validation on
# R^2, negated MAE and negated MSE (train-fold scores are kept as well).
model = LinearRegression()
linear_results = cross_validate(
    model,
    data,
    target,
    cv=5,
    scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'],
    return_train_score=True,
)

# Lasso Model

In [78]:
# L1-penalised regression with alpha = 250 (i.e. 10 * 25).
# NOTE(review): the Ridge and ElasticNet cells use 10**25; confirm which
# penalty strength is actually intended for each model.
lasso = Lasso(alpha=250)

# Same 5-fold evaluation and metrics as the OLS baseline.
lasso_results = cross_validate(
    lasso,
    data,
    target,
    cv=5,
    scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'],
    return_train_score=True,
)

# Ridge Model

In [79]:
# L2-penalised regression.
# NOTE(review): alpha = 10**25 is an extremely strong penalty and is not the
# result of any hyperparameter search in this notebook; confirm / tune it.
ridge = Ridge(alpha=10**25)

# Evaluate the Ridge estimator itself (previously the Lasso estimator was
# passed here, so ridge_results just duplicated the Lasso evaluation).
ridge_results = cross_validate(
    ridge,
    data,
    target,
    cv=5,
    scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'],
    return_train_score=True,
)

# Elastic Net

In [74]:
# Combined L1/L2-penalised regression.
# NOTE(review): alpha = 10**25 is an extremely strong penalty and is not the
# result of any hyperparameter search in this notebook; confirm / tune it.
elastic = ElasticNet(alpha=10**25)

# Evaluate the ElasticNet estimator itself (previously the Lasso estimator was
# passed here, so elastic_results just duplicated the Lasso evaluation).
elastic_results = cross_validate(
    elastic,
    data,
    target,
    cv=5,
    scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'],
    return_train_score=True,
)

# Best Model

In [83]:
# Display the raw cross_validate output for the Lasso model:
# per-fold fit/score times plus train and test values of each metric.
lasso_results

{'fit_time': array([0.00533915, 0.00359797, 0.00302577, 0.00505996, 0.00344706]),
 'score_time': array([0.00475812, 0.00660896, 0.00303102, 0.00355315, 0.00226188]),
 'test_r2': array([0.81595429, 0.84813905, 0.81961668, 0.8154111 , 0.79379942]),
 'train_r2': array([0.82460187, 0.81516833, 0.82351837, 0.82359382, 0.82956452]),
 'test_neg_mean_absolute_error': array([-20291.15956938, -19958.67977726, -19888.08529353, -20143.87817032,
        -21543.7079921 ]),
 'train_neg_mean_absolute_error': array([-20225.46046339, -20171.29526397, -20161.28153345, -20373.37753659,
        -19914.02277102]),
 'test_neg_mean_squared_error': array([-7.71600745e+08, -7.62264260e+08, -8.27568829e+08, -6.69865391e+08,
        -8.80368284e+08]),
 'train_neg_mean_squared_error': array([-7.69492994e+08, -7.72487523e+08, -7.56804292e+08, -7.96910617e+08,
        -7.44460613e+08])}

In [88]:
# Cross-validation result dicts of every model that takes part in the comparison
# (ridge_results was previously left out).
model_results = [elastic_results, lasso_results, linear_results, ridge_results]
scoring = ['test_neg_mean_absolute_error', 'train_neg_mean_absolute_error', 'test_neg_mean_squared_error', 'train_neg_mean_squared_error']

for results in model_results:
    # The 'neg_*' scorers report errors as negative numbers; store them as
    # positive magnitudes. np.abs (instead of multiplying by -1) keeps this
    # cell idempotent: re-running it does not flip the sign back and feed
    # negative values into the square root below.
    for score in scoring:
        results[score] = np.abs(results[score])
    # Derive RMSE from the two MSE entries (test and train).
    for score in scoring[2:]:
        results['root_' + score] = np.sqrt(results[score])

Create MAPE (mean absolute percentage error) for each model's results.

Create a function that sums all errors within each fold and finds the fold with the smallest sum of errors.

Compare the best fold of each model against the others to find the best model.