In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from scipy import stats
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse, rmse

from sklearn import linear_model
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

warnings.filterwarnings('ignore')

# Connection settings for the PostgreSQL instance that hosts the data.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into the notebook; the fallbacks are
# the values this notebook has always used, so it still runs unchanged.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Pull the whole table into memory; release the engine's connections even if
# the query fails, so a failed run does not leave them open.
try:
    houseprices_df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    engine.dispose()

# Column overview (printed) followed by the first rows (cell output).
houseprices_df.info()
houseprices_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1460 non-null   int64  
 1   mssubclass     1460 non-null   int64  
 2   mszoning       1460 non-null   object 
 3   lotfrontage    1201 non-null   float64
 4   lotarea        1460 non-null   int64  
 5   street         1460 non-null   object 
 6   alley          91 non-null     object 
 7   lotshape       1460 non-null   object 
 8   landcontour    1460 non-null   object 
 9   utilities      1460 non-null   object 
 10  lotconfig      1460 non-null   object 
 11  landslope      1460 non-null   object 
 12  neighborhood   1460 non-null   object 
 13  condition1     1460 non-null   object 
 14  condition2     1460 non-null   object 
 15  bldgtype       1460 non-null   object 
 16  housestyle     1460 non-null   object 
 17  overallqual    1460 non-null   int64  
 18  overallc

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
# One-hot encode the two categorical predictors, dropping the first level of
# each, and remember the generated column names for the feature set.
mszoning_dummies = pd.get_dummies(houseprices_df.mszoning, prefix="mszoning", drop_first=True)
street_dummies = pd.get_dummies(houseprices_df.street, prefix="street", drop_first=True)

houseprices_df = pd.concat([houseprices_df, mszoning_dummies, street_dummies], axis=1)
dummy_column_names = list(mszoning_dummies.columns) + list(street_dummies.columns)

In [3]:
# Engineered features: combined square footage of the three *sf columns,
# and its interaction with the overall quality rating.
houseprices_df['totalsf'] = (
    houseprices_df['totalbsmtsf']
    .add(houseprices_df['firstflrsf'])
    .add(houseprices_df['secondflrsf'])
)
houseprices_df['int_over_sf'] = houseprices_df['totalsf'].mul(houseprices_df['overallqual'])

# Target: natural log of the sale price.
Y = np.log(houseprices_df['saleprice'])

# Predictors: numeric columns plus the dummy columns created earlier.
numeric_features = ['overallqual', 'grlivarea', 'garagecars', 'garagearea', 'totalsf', 'int_over_sf']
X = houseprices_df[numeric_features + dummy_column_names]

# Train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)


In [4]:
# Ordinary least squares baseline, fitted on the training split.
lrm = LinearRegression().fit(X_train, y_train)

# Predictions for both splits.
y_pred_train, y_pred_test = lrm.predict(X_train), lrm.predict(X_test)

In [5]:
# Fit quality of the linear model: R-squared on both splits, then error
# metrics of the test-set predictions. Each entry pairs a message template
# with the value to show.
lrm_report = [
    ('R-squared of the model in training set: {}', lrm.score(X_train, y_train)),
    ('R-squared of the model in test: {}', lrm.score(X_test, y_test)),
    ('Mean absolute error: {}', mean_absolute_error(y_test, y_pred_test)),
    ('Mean squared error: {}', mse(y_test, y_pred_test)),
    ('Root mean squared error: {}', rmse(y_test, y_pred_test)),
    ('Mean absolute percentage errors: {}', np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100),
]
for template, value in lrm_report:
    print(template.format(value))

R-squared of the model in training set: 0.8250626041648544
R-squared of the model in test: 0.8414430151723824
Mean absolute error: 0.12478414496185025
Mean squared error: 0.027518128707270235
Root mean squared error: 0.16588589062144565
Mean absolute percentage errors: 1.046495464038673


#### Try OLS, Lasso, Ridge, and ElasticNet regression 

In [6]:
# Lasso regression with alpha selected by 5-fold cross-validation.
lasso_ = LassoCV(cv=5)

# Kept as the cell's last expression so the fitted estimator is displayed.
lasso_.fit(X=X_train, y=y_train)

LassoCV(cv=5)

In [7]:
# Lasso predictions for the training and test splits.
y_pred_train, y_pred_test = lasso_.predict(X_train), lasso_.predict(X_test)

In [8]:
# Summary of the cross-validated Lasso fit: chosen alpha, R-squared on both
# splits, and error metrics of the test-set predictions.
lasso_report = [
    "Best alpha value is: {}".format(lasso_.alpha_),
    "R-squared of the model in training set is: {}".format(lasso_.score(X_train, y_train)),
    "-----Test set statistics-----",
    "R-squared of the model in test set is: {}".format(lasso_.score(X_test, y_test)),
    "Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_pred_test)),
    "Mean squared error of the prediction is: {}".format(mse(y_test, y_pred_test)),
    "Root mean squared error of the prediction is: {}".format(rmse(y_test, y_pred_test)),
    "Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100),
]
print("\n".join(lasso_report))

NameError: name 'lasso_cv' is not defined

In [None]:
# Ridge regression with alpha selected by 5-fold cross-validation.
ridge_ = RidgeCV(cv=5).fit(X_train, y_train)

# Ridge predictions for the training and test splits.
y_pred_train, y_pred_test = ridge_.predict(X_train), ridge_.predict(X_test)


In [None]:
# Summary of the cross-validated Ridge fit: chosen alpha, R-squared on both
# splits, and error metrics of the test-set predictions.
# Uses `ridge_` and `y_pred_test`, the names assigned by the Ridge fitting
# cell; `ridge_cv` and `y_preds_test` are not defined anywhere in the notebook.
print("Best alpha value is: {}".format(ridge_.alpha_))
print("R-squared of the model in training set is: {}".format(ridge_.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(ridge_.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100))

In [None]:
# ElasticNet regression with alpha selected by 5-fold cross-validation.
elasticnet_ = ElasticNetCV(cv=5).fit(X_train, y_train)

# ElasticNet predictions for the training and test splits.
y_pred_train, y_pred_test = elasticnet_.predict(X_train), elasticnet_.predict(X_test)

In [None]:
# Summary of the cross-validated ElasticNet fit: chosen alpha, R-squared on
# both splits, and error metrics of the test-set predictions.
# Uses `elasticnet_` and `y_pred_test`, the names assigned by the ElasticNet
# fitting cell; `elasticnet_cv` and `y_preds_test` are not defined anywhere
# in the notebook.
print("Best alpha value is: {}".format(elasticnet_.alpha_))
print("R-squared of the model in training set is: {}".format(elasticnet_.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(elasticnet_.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100))