In [7]:
# import libraries
import numpy as np
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV

1. Load the houseprices data from Thinkful's database.
2. Reimplement your model from the previous checkpoint.
3. Try OLS, Lasso, Ridge, and ElasticNet regression using the same model specification. This time, you need to do k-fold cross-validation to choose the best hyperparameter values for your models. Which model is the best? Why?

In [2]:
# import dataset
df = pd.read_csv(r'C:\Users\AP\Downloads\house prices.csv')

# columns to lower case
df.columns = map(str.lower, df.columns)

# list/number of numerical columns
num_col = df.select_dtypes(['int64', 'float64']).columns

# list/number of alpha columns
alpha_col = df.select_dtypes(['object']).columns

# convert alpha features to binary
df['mszoning'] = pd.get_dummies(df.mszoning, drop_first=True)
df['neighborhood'] = pd.get_dummies(df.neighborhood, drop_first=True)
df['exterior1st'] = pd.get_dummies(df.exterior1st, drop_first=True)

In [10]:
# create totalsf variable
df['totalsf'] = df['totalbsmtsf'] + df['1stflrsf'] + df['2ndflrsf']

# create interaction variable
df['int_over_sf'] = df['totalsf'] * df['overallqual']

# Y is the target variable
Y = np.log1p(df.saleprice)
# X is the feature set
X = df[['overallqual', 'grlivarea', 'garagecars', 'mszoning', 'neighborhood', 'exterior1st', 'totalsf', 'int_over_sf']]

X = sm.add_constant(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)

results = sm.OLS(y_train, X_train).fit()

alphas = [np.power(10.0,p) for p in np.arange(-10,40,1)]

results.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.805
Model:,OLS,Adj. R-squared:,0.804
Method:,Least Squares,F-statistic:,685.0
Date:,"Wed, 25 Sep 2019",Prob (F-statistic):,0.0
Time:,16:31:00,Log-Likelihood:,371.53
No. Observations:,1168,AIC:,-727.1
Df Residuals:,1160,BIC:,-686.6
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,10.0468,0.060,168.548,0.000,9.930,10.164
overallqual,0.2043,0.010,20.799,0.000,0.185,0.224
grlivarea,6.561e-05,2.02e-05,3.245,0.001,2.59e-05,0.000
garagecars,0.1205,0.009,13.362,0.000,0.103,0.138
mszoning,0.0358,0.025,1.438,0.151,-0.013,0.085
neighborhood,-0.1308,0.125,-1.044,0.297,-0.376,0.115
exterior1st,4.8e-16,4.85e-16,0.989,0.323,-4.72e-16,1.43e-15
totalsf,0.0004,2.76e-05,13.153,0.000,0.000,0.000
int_over_sf,-3.156e-05,3.21e-06,-9.846,0.000,-3.78e-05,-2.53e-05

0,1,2,3
Omnibus:,334.265,Durbin-Watson:,1.878
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1630.12
Skew:,-1.249,Prob(JB):,0.0
Kurtosis:,8.221,Cond. No.,4.02e+21


In [8]:
lrm = LinearRegression()

lrm.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

print("R-squared of model in training set: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of model in test set: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error prediction: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error prediction: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error prediction: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error prediction: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of model in training set: 0.8051956214887739
-----Test set statistics-----
R-squared of model in test set: 0.7978443814077518
Mean absolute error prediction: 0.13326851158756162
Mean squared error prediction: 0.03248165614363869
Root mean squared error prediction: 0.18022667988851898
Mean absolute percentage error prediction: 1.1108088717754452


In [11]:
lasso_cv = LassoCV(alphas=alphas, cv=5)

lasso_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)

print("Best alpha value is: {}".format(lasso_cv.alpha_))
print("R-squared of the model in training set is: {}".format(lasso_cv.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(lasso_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 1e-10
R-squared of the model in training set is: 0.8051956214887737
-----Test set statistics-----
R-squared of the model in test set is: 0.7978443816171521
Mean absolute error of the prediction is: 0.1332685115486375
Mean squared error of the prediction is: 0.032481656109993
Root mean squared error of the prediction is: 0.18022667979517626
Mean absolute percentage error of the prediction is: 1.1108088714556057


In [12]:
ridge_cv = RidgeCV(alphas=alphas, cv=5)

ridge_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)

print("Best alpha value is: {}".format(ridge_cv.alpha_))
print("R-squared of the model in training set is: {}".format(ridge_cv.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(ridge_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best alpha value is: 1e-10
R-squared of the model in training set is: 0.8051956214887739
-----Test set statistics-----
R-squared of the model in test set is: 0.7978443814077774
Mean absolute error of the prediction is: 0.13326851158756314
Mean squared error of the prediction is: 0.0324816561436346
Root mean squared error of the prediction is: 0.18022667988850763
Mean absolute percentage error of the prediction is: 1.1108088717754643


  overwrite_a=True).T


In [13]:
elasticnet_cv = ElasticNetCV(alphas=alphas, cv=5)

elasticnet_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = elasticnet_cv.predict(X_train)
y_preds_test = elasticnet_cv.predict(X_test)

print("Best alpha value is: {}".format(elasticnet_cv.alpha_))
print("R-squared of the model in training set is: {}".format(elasticnet_cv.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(elasticnet_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 1e-10
R-squared of the model in training set is: 0.8051956214887739
-----Test set statistics-----
R-squared of the model in test set is: 0.7978443815290192
Mean absolute error of the prediction is: 0.1332685115634973
Mean squared error of the prediction is: 0.03248165612415391
Root mean squared error of the prediction is: 0.18022667983446267
Mean absolute percentage error of the prediction is: 1.1108088715771247


All 4 models performed the same?? Cross validation??