In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings('ignore')

In [5]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

In [6]:
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
df = pd.read_sql_query('select * from houseprices',con=engine)

engine.dispose()

In [7]:
variables = ['overallqual','firstflrsf','totrmsabvgrd','neighborhood','exterior1st','kitchenqual','saleprice']

In [8]:
def mape(y_values, y_preds):
    return np.mean(np.abs(y_values-y_preds)/y_values)

In [9]:
def print_prediction_errors(y_test, y_preds):
    print("The mean absolute error of the model: ",mean_absolute_error(y_test, y_preds))
    print("The mean absolute percentage error of the model: ",mape(y_test, y_preds))
    print("The mean square error of the model: ",mse(y_test,y_preds))
    print("The root mean square error of the model: ",rmse(y_test,y_preds))

# Preparing the model

In [10]:
df2 = pd.get_dummies(df[variables], drop_first=True)
X = df2.drop('saleprice', axis=1)
y = df2.saleprice

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 625)

# Ordinary Least Squares

In [12]:
lr = LinearRegression()

In [13]:
lr.fit(X_train,y_train)

LinearRegression()

In [14]:
y_preds = lr.predict(X_test)

In [31]:
r1 = lr.score(X_train,y_train)
r2 = lr.score(X_test,y_test)
print("R-squared for the training set is: ",r1)
print('-'*50)
print('R-squared for the test set is: ',r2)
print_prediction_errors(y_test,y_preds)
print('-'*50)
print("The difference in R squared between training and test sets is: ", r2-r1)

R-squared for the training set is:  0.8001678409709541
--------------------------------------------------
R-squared for the test set is:  0.8089281736956702
The mean absolute error of the model:  23151.99609765595
The mean absolute percentage error of the model:  0.1336155682334984
The mean square error of the model:  1055261030.7530787
The root mean square error of the model:  32484.781525401686
--------------------------------------------------
The difference in R squared between training and test sets is:  0.008760332724716102


# Ridge Regression

In [18]:
rr = RidgeCV(alphas = np.arange(0.5,10.5,0.5))

In [19]:
rr.fit(X_train,y_train)

RidgeCV(alphas=array([ 0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,  5.5,
        6. ,  6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. ]))

In [20]:
y_preds = rr.predict(X_test)

In [32]:
r1 = rr.score(X_train,y_train)
r2 = rr.score(X_test,y_test)
print("R-squared for the training set is: ",r1)
print('-'*50)
print('R-squared for the test set is: ',r2)
print_prediction_errors(y_test,y_preds)
print('-'*50)
print("The difference in R squared between training and test sets is: ", r2-r1)

R-squared for the training set is:  0.7991721608802357
--------------------------------------------------
R-squared for the test set is:  0.8086376092045602
The mean absolute error of the model:  23151.99609765595
The mean absolute percentage error of the model:  0.1336155682334984
The mean square error of the model:  1055261030.7530787
The root mean square error of the model:  32484.781525401686
--------------------------------------------------
The difference in R squared between training and test sets is:  0.009465448324324499


# Lasso Regression

In [23]:
lssr = LassoCV(alphas = np.arange(0.5,10.5,0.5))

In [24]:
lssr.fit(X_test,y_test)

LassoCV(alphas=array([ 0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,  5.5,
        6. ,  6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. ]))

In [25]:
y_preds = lssr.predict(X_test)

In [33]:
r1 = lssr.score(X_train,y_train)
r2 = lssr.score(X_test,y_test)
print("R-squared for the training set is: ",r1)
print('-'*50)
print('R-squared for the test set is: ',r2)
print_prediction_errors(y_test,y_preds)
print('-'*50)
print("The difference in R squared between training and test sets is: ", r2-r1)

R-squared for the training set is:  0.7440260881292402
--------------------------------------------------
R-squared for the test set is:  0.8495325033707486
The mean absolute error of the model:  23151.99609765595
The mean absolute percentage error of the model:  0.1336155682334984
The mean square error of the model:  1055261030.7530787
The root mean square error of the model:  32484.781525401686
--------------------------------------------------
The difference in R squared between training and test sets is:  0.10550641524150839


# ElasticNet Regression

In [34]:
er = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99])

In [35]:
er.fit(X_train,y_train)

ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99])

In [36]:
y_preds = er.predict(X_test)

In [37]:
r1 = er.score(X_train,y_train)
r2 = er.score(X_test,y_test)
print("R-squared for the training set is: ",r1)
print('-'*50)
print('R-squared for the test set is: ',r2)
print_prediction_errors(y_test,y_preds)
print('-'*50)
print("The difference in R squared between training and test sets is: ", r2-r1)

R-squared for the training set is:  0.36253264271056174
--------------------------------------------------
R-squared for the test set is:  0.40267551469367113
The mean absolute error of the model:  48036.60740468905
The mean absolute percentage error of the model:  0.28236185625823157
The mean square error of the model:  4189165541.9212313
The root mean square error of the model:  64723.76334794842
--------------------------------------------------
The difference in R squared between training and test sets is:  0.040142871983109396


# Evaluation of training

Based on the metrics, the best model is the original model using ordinary least squares.  The reason for this may be that as I am not using many parameters, overfitting is unlikely and so regularization is not needed.  Also as cross validation reduces the training sample size, and since the larger the training size the better, OLS is better.