In [18]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [19]:
house = pd.read_csv("house train.csv")
house.head(1)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500


In [20]:
# Y is the target variable
Y = house['SalePrice']
# X is the feature set which includes
X = house[['OverallQual','YearBuilt']]

In [21]:
import statsmodels.api as sm

# We need to add constant manually 
# in statsmodels' sm
X = sm.add_constant(X)

results = sm.OLS(Y, X).fit()

results.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.633
Model:,OLS,Adj. R-squared:,0.632
Method:,Least Squares,F-statistic:,1256.0
Date:,"Wed, 08 Jul 2020",Prob (F-statistic):,7.52e-318
Time:,00:08:07,Log-Likelihood:,-17812.0
No. Observations:,1460,AIC:,35630.0
Df Residuals:,1457,BIC:,35650.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6.166e+05,9.66e+04,-6.380,0.000,-8.06e+05,-4.27e+05
OverallQual,4.2e+04,1111.777,37.781,0.000,3.98e+04,4.42e+04
YearBuilt,274.5914,50.908,5.394,0.000,174.730,374.453

0,1,2,3
Omnibus:,649.933,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7045.246
Skew:,1.78,Prob(JB):,0.0
Kurtosis:,13.156,Cond. No.,151000.0


In [23]:
# Y is the target variable
Y = house['SalePrice']

# This is the interaction OverallQual and YearBuilt
house["Qual_Year"] = house.OverallQual * house.YearBuilt

# X is the feature set
X = house[['OverallQual','YearBuilt','Qual_Year']]

# We add constant to the model as it's a best practice
# to do so everytime!
X = sm.add_constant(X)

# We fit an OLS model using statsmodels
results = sm.OLS(Y, X).fit()

# We print the summary results
display(results.summary())

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.651
Model:,OLS,Adj. R-squared:,0.65
Method:,Least Squares,F-statistic:,904.3
Date:,"Wed, 08 Jul 2020",Prob (F-statistic):,0.0
Time:,00:26:00,Log-Likelihood:,-17776.0
No. Observations:,1460,AIC:,35560.0
Df Residuals:,1456,BIC:,35580.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.564e+06,3.81e+05,6.724,0.000,1.82e+06,3.31e+06
OverallQual,-4.783e+05,6.05e+04,-7.912,0.000,-5.97e+05,-3.6e+05
YearBuilt,-1340.1264,194.038,-6.907,0.000,-1720.751,-959.502
Qual_Year,263.6739,30.629,8.609,0.000,203.592,323.756

0,1,2,3
Omnibus:,618.099,Durbin-Watson:,1.967
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6633.87
Skew:,1.672,Prob(JB):,0.0
Kurtosis:,12.893,Cond. No.,3930000.0


In [None]:
# Our model's F statistic is 1256 and the associated p-value is very close to zero. This means that, our features add some
# information to the reduced model and our model is useful for explaining the Sold prices.
# R-squared, adjusted R-squared are in a good level which is 0.632 and can explain %63.2 the variance of the target.
# AIC and BIC values are high which is not so good.

# In order to improve the goodness of fit of our model, we included the interaction of OverallQual and YearBuilt to the model we
# estimated again the model. And the results improved in a good manner. This time explained ratio became %65.
# And AIC and BIC values decreased which means new model has developed.