In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import math
import statsmodels.api as sm

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

# Plot configuration: apply seaborn's default theme, render figures inline in
# the notebook, then switch matplotlib to the 'ggplot' style sheet.
# NOTE(review): the style sheet is applied after sns.set(), so it presumably
# wins for any settings both define — confirm this is the intended look.
sns.set()
%matplotlib inline
plt.style.use('ggplot')

# Load the cleaned training data; the first CSV column becomes the row index.
train = pd.read_csv('../data/train_clean.csv', index_col=0)

# Only the last expression of a cell is displayed, so the former bare
# `train.columns` / `train.shape` lines produced no output and were dropped.
train.head(3)

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,GrLivArea,BsmtFullBath,BedroomAbvGr,KitchenAbvGr,GarageType,GarageArea,SalePrice,Kitchen,Fireplace,ExterQ,BsmtQ,HeatingQ,n_toilets,n_showers
0,8450,2003,2003,2416,1,3,1,1,548,208500,4,0,4,4,5,4,3
1,9600,1976,1976,2240,0,3,1,1,460,181500,3,3,3,4,5,3,2
2,11250,2001,2002,2272,1,3,1,1,608,223500,4,3,4,4,5,4,3


In [2]:
# Turn the calendar-year columns into continuous "age" features
current_year = 2019  # reference year the ages are measured from
train = train.assign(
    HouseAge=current_year - train['YearBuilt'],
    RemodelAge=current_year - train['YearRemodAdd'],
)

# Keep only the continuous predictors in their own frame
numeric_vars = ['LotArea', 'HouseAge', 'RemodelAge', 'GrLivArea']
continuous_feat = train.loc[:, numeric_vars]


In [3]:
# Preview five random rows; the fixed seed keeps the preview identical across re-runs
continuous_feat.sample(5, random_state=0)

Unnamed: 0,LotArea,HouseAge,RemodelAge,GrLivArea
266,7917,43,43,1864
962,10800,70,69,1192
515,10918,93,15,2080
278,7200,13,13,2167
387,8405,119,69,1758


In [4]:
# Statsmodels OLS on the full dataset, used for its detailed summary table

y = train['SalePrice']
X = continuous_feat

# Add a constant column so the regression includes an intercept term
X_add_const = sm.add_constant(X)

model = sm.OLS(y, X_add_const).fit()

# In-sample predictions from the fitted model
predictions = model.predict(X_add_const)

model.summary()




0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.73
Model:,OLS,Adj. R-squared:,0.729
Method:,Least Squares,F-statistic:,966.3
Date:,"Fri, 01 Mar 2019",Prob (F-statistic):,0.0
Time:,10:04:36,Log-Likelihood:,-17267.0
No. Observations:,1437,AIC:,34540.0
Df Residuals:,1432,BIC:,34570.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.37e+04,4502.079,18.590,0.000,7.49e+04,9.25e+04
LotArea,2.5804,0.255,10.136,0.000,2.081,3.080
HouseAge,-564.8718,44.344,-12.738,0.000,-651.858,-477.886
RemodelAge,-673.5720,64.863,-10.385,0.000,-800.809,-546.335
GrLivArea,63.7336,1.737,36.688,0.000,60.326,67.141

0,1,2,3
Omnibus:,476.607,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3186.207
Skew:,1.375,Prob(JB):,0.0
Kurtosis:,9.756,Cond. No.,46800.0


In [7]:
# Fit scikit-learn's linear regression on all rows (no train/test split)
lm = LinearRegression()
model = lm.fit(X, y)

# Report the intercept and one coefficient per feature column
print("Intercept: %f" % lm.intercept_)
for coef, col_name in zip(lm.coef_, X.columns):
    print("Coefficient: %s : %s" % (coef, col_name))

print("R^2: %f" % (lm.score(X, y)))

# In-sample predictions, then show the unrounded R^2 as the cell output
predictions = lm.predict(X)
lm.score(X, y)


Intercept: 83695.193734
Coefficient: 2.580379744575241 : LotArea
Coefficient: -564.8717695159893 : HouseAge
Coefficient: -673.5719693763572 : RemodelAge
Coefficient: 63.73359549035173 : GrLivArea
R^2: 0.729670


0.7296695853751587

In [8]:
# Hold out 30% of the rows for validation; the seed is fixed so the split is repeatable

X, y = continuous_feat, train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)


In [10]:
# Baseline linear regression: fit on the training split, score on both splits

model = LinearRegression().fit(X_train, y_train)

print('Training score: {}'.format(model.score(X_train, y_train)))
print('Test score: {}'.format(model.score(X_test, y_test)))


Training score: 0.7232057104642942
Test score: 0.7431170051813005


In [11]:
# Polynomial regression pipeline:
#   1. standardize the features
#   2. expand them into degree-2 polynomial features
#   3. fit and score a linear regression on the expanded features

steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps).fit(X_train, y_train)

print('Training score: {}'.format(pipeline.score(X_train, y_train)))
print('Test score: {}'.format(pipeline.score(X_test, y_test)))

Training score: 0.7995520132108133
Test score: 0.8194607402988242


In [52]:
# Ridge regression on standardized, degree-2 polynomial features
# NOTE(review): alpha looks hand-picked; if it was chosen by watching the test
# score below, that score is optimistic — confirm how it was selected.
penalty = 120

steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge(alpha=penalty, fit_intercept=True)),
]

ridge_pipe = Pipeline(steps).fit(X_train, y_train)

print('Training Score: {}'.format(ridge_pipe.score(X_train, y_train)))
print('Test Score: {}'.format(ridge_pipe.score(X_test, y_test)))



Training Score: 0.792776630635791
Test Score: 0.8232744443063039


In [72]:
# Lasso regression on standardized, degree-2 polynomial features
# NOTE(review): alpha looks hand-picked; if it was chosen by watching the test
# score below, that score is optimistic — confirm how it was selected.
penalty = 900

steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Lasso(alpha=penalty, fit_intercept=True)),
]

lasso_pipe = Pipeline(steps).fit(X_train, y_train)

print('Training score: {}'.format(lasso_pipe.score(X_train, y_train)))
print('Test score: {}'.format(lasso_pipe.score(X_test, y_test)))




Training score: 0.7976711778776272
Test score: 0.8214074806497913
