# SLR Practice

### Imports

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor

import sklearn.metrics as metrics

import statsmodels.api as sm

from statsmodels.stats.stattools import durbin_watson


### Get the data

In [2]:
# Load the training set from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./data/train.csv')
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


I always fix my column names first. 

In [3]:
# Normalise every header: trim surrounding whitespace, then lowercase it,
# so columns can be referenced uniformly (e.g. 'saleprice', 'grlivarea').
df.columns = df.columns.map(lambda header: header.strip().lower())

In [6]:
# Inspect the column labels after the lowercase/strip normalisation.
df.columns

Index(['id', 'mssubclass', 'mszoning', 'lotfrontage', 'lotarea', 'street',
       'alley', 'lotshape', 'landcontour', 'utilities', 'lotconfig',
       'landslope', 'neighborhood', 'condition1', 'condition2', 'bldgtype',
       'housestyle', 'overallqual', 'overallcond', 'yearbuilt', 'yearremodadd',
       'roofstyle', 'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype',
       'masvnrarea', 'exterqual', 'extercond', 'foundation', 'bsmtqual',
       'bsmtcond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfinsf1',
       'bsmtfintype2', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'heating',
       'heatingqc', 'centralair', 'electrical', '1stflrsf', '2ndflrsf',
       'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath',
       'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'kitchenqual',
       'totrmsabvgrd', 'functional', 'fireplaces', 'fireplacequ', 'garagetype',
       'garageyrblt', 'garagefinish', 'garagecars', 'garagearea', 'garagequal',
       'garagecond', 'paveddrive

In [8]:
# Display the frame to confirm the renamed headers (the notebook elides the middle rows/columns).
df

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


I'll create an SLR using one feature and `saleprice` as the target.

In [9]:
# Single predictor (feature/input) and response (target/output) for the SLR.
x, y = df['grlivarea'], df['saleprice']

### Assumption: Linear Relationship between feature and target

In [23]:
# The array interface of OLS does not add an intercept on its own, so build a
# design matrix with an explicit constant column before creating the model.
x_with_const = sm.add_constant(x)
sm_lr = sm.OLS(endog=y, exog=x_with_const)


In [30]:
# Formula interface: `sm.formula.ols` takes a patsy formula written as
# 'target ~ feature' (a tilde, not a minus sign) together with the DataFrame
# that holds those columns. Both arguments are passed by keyword, which also
# avoids the "positional argument follows keyword argument" SyntaxError.
sm.formula.ols(formula='saleprice ~ grlivarea', data=df)

In [25]:
# Patsy-style model formula: response on the left of `~`, predictor on the right.
formula = 'saleprice ~ grlivarea'

In [28]:
# Build (but do not yet fit) the OLS model from the formula; column names are resolved against `df`.
sm_lr_formula = sm.formula.ols(formula=formula, data=df)

In [29]:
# Fit the formula-based model, then render its regression summary table.
formula_fit = sm_lr_formula.fit()
formula_fit.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.502
Method:,Least Squares,F-statistic:,1471.0
Date:,"Wed, 15 Jun 2022",Prob (F-statistic):,4.5200000000000005e-223
Time:,00:44:51,Log-Likelihood:,-18035.0
No. Observations:,1460,AIC:,36070.0
Df Residuals:,1458,BIC:,36080.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.857e+04,4480.755,4.144,0.000,9779.612,2.74e+04
grlivarea,107.1304,2.794,38.348,0.000,101.650,112.610

0,1,2,3
Omnibus:,261.166,Durbin-Watson:,2.025
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3432.287
Skew:,0.41,Prob(JB):,0.0
Kurtosis:,10.467,Cond. No.,4900.0


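Note: `sm.ols(formula='y-x', X)` is not valid — the formula interface is `sm.formula.ols(formula='saleprice ~ grlivarea', data=df)`, where `~` separates the target from the feature.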

In [22]:
# Fit the array-interface model (explicit constant column) and show its summary.
ols_fit = sm_lr.fit()
ols_fit.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.502
Method:,Least Squares,F-statistic:,1471.0
Date:,"Wed, 15 Jun 2022",Prob (F-statistic):,4.5200000000000005e-223
Time:,00:40:46,Log-Likelihood:,-18035.0
No. Observations:,1460,AIC:,36070.0
Df Residuals:,1458,BIC:,36080.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.857e+04,4480.755,4.144,0.000,9779.612,2.74e+04
grlivarea,107.1304,2.794,38.348,0.000,101.650,112.610

0,1,2,3
Omnibus:,261.166,Durbin-Watson:,2.025
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3432.287
Skew:,0.41,Prob(JB):,0.0
Kurtosis:,10.467,Cond. No.,4900.0


In [None]:
# And for Sale price, as well


What does this tell us?

In [None]:
# statsmodel first


In [None]:
# Summary


### How do we compare this?

Baseline!

There's a [class](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html) for it!

In [None]:
# Instantiate


In [None]:
# fit


In [None]:
# score


Well, that's a relief. We're at least better than that.

### Error assumptions

#### Normally Distributed with a mean of 0

In [None]:
# Instantiate


In [None]:
# fit


In [None]:
# score


In [None]:
# predict


In [None]:
# calculate residuals



In [None]:
# Histogram of residuals


In [None]:
# sns


In [None]:
# QQ!



#### Thoughts?

#### Heteroskedasticity

In [None]:
# resid plot sns.resid


In [None]:
# scatter


#### Thoughts?

#### No autocorrelation in residuals

Hello, [Durbin-Watson!](https://en.wikipedia.org/wiki/Durbin%E2%80%93Watson_statistic)

In [None]:
# Durbin-Watson check for autocorrelation in the model residuals.
# NOTE: expects `resids` (the model residuals) to have been computed in an
# earlier cell; it is not defined in this cell.
banner = (
    '\nPerforming Durbin-Watson Test',
    'Values of 1.5 < d < 2.5 generally show that there is no autocorrelation in the data',
    '0 to 2< is positive autocorrelation',
    '>2 to 4 is negative autocorrelation',
    '-------------------------------------',
)
for banner_line in banner:
    print(banner_line)

durbinWatson = durbin_watson(resids)
print('Durbin-Watson:', durbinWatson)

# Pick the finding/verdict pair first, then print once; the comparison order
# (low, then high, then fallthrough) is kept so edge cases behave the same.
if durbinWatson < 1.5:
    finding, verdict = 'Signs of positive autocorrelation', 'Assumption not satisfied'
elif durbinWatson > 2.5:
    finding, verdict = 'Signs of negative autocorrelation', 'Assumption not satisfied'
else:
    finding, verdict = 'Little to no autocorrelation', 'Assumption satisfied'

print(finding, '\n')
print(verdict)

The above code was audaciously stolen from [this](https://jeffmacaluso.github.io/post/LinearRegressionAssumptions/) excellent article.

#### Thoughts?

## Your turn!

Your goal is to find a feature or an interaction of features that outperforms this model. GO!