In [14]:
import numpy as np
import pandas as pd
%matplotlib inline 
# Option to display all dataframes columns
pd.options.display.max_columns = None
import matplotlib.pyplot as plt

housing = pd.read_csv('Ames_HousePrice_cleaned.csv', index_col=0)
housing.shape

(2580, 59)

In [22]:
#Whole data set, no split
from sklearn.linear_model import LinearRegression

X=housing.drop(columns='SalePrice')
y=housing.SalePrice
lin = LinearRegression().fit(X,y)
print(lin.score(X,y))

0.8876308381267729


In [2]:
from sklearn.model_selection import train_test_split

X=housing.drop(columns='SalePrice')
y=housing.SalePrice
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=42,shuffle=True)

In [3]:
#Baseline model with all variables
from sklearn.linear_model import LinearRegression

lin = LinearRegression().fit(X_train,y_train)
print(lin.score(X_train,y_train))
print(lin.score(X_test,y_test))

0.8864374204793761
0.8864374998432968


In [4]:
#Baseline model with all variables (90-10 train test split)
from sklearn.linear_model import LinearRegression

lin = LinearRegression().fit(X_train,y_train)
print(lin.score(X_train,y_train))
print(lin.score(X_test,y_test))

0.8864374204793761
0.8864374998432968


In [5]:
#K fold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

X=housing.drop(columns='SalePrice')
y=housing.SalePrice
kf = KFold(n_splits=5,shuffle=True,random_state=42)

for train_idx, test_idx in kf.split(X,y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    lin = LinearRegression().fit(X_train,y_train)
    print(lin.score(X_train,y_train))
    print(lin.score(X_test,y_test))



0.8864374204793761
0.8864374998432968
0.8980440677676648
0.8405802616258292
0.8861699444580762
0.8886632274573878
0.8848221966009314
0.894772876324662
0.8880798987466282
0.8792946363016549


In [73]:
from sklearn.model_selection import KFold
import statsmodels.api as sm 


X= housing.drop(columns='SalePrice')
X= sm.add_constant(X)
y= housing.SalePrice
kf = KFold(n_splits=5,shuffle=True,random_state=42)
results=pd.DataFrame(index=housing.columns)

for train_idx, test_idx in kf.split(X,y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    ols = sm.OLS(y_train, X_train)
    ans = ols.fit()
    print(ans.rsquared)
    results= pd.concat([results,ans.pvalues],axis=1)
results['Mean']=results.mean(axis=1)
results.sort_values(['Mean'],ascending=False)


0.8864374204793761
0.8980440677676649
0.8861699444580762
0.8848221966009314
0.8880798987466283


Unnamed: 0,0,0.1,0.2,0.3,0.4,Mean
FireplaceQu,0.8096507,0.9489494,0.9052294,0.438361,0.9765548,0.8157491
YearRemodAdd,0.6485412,0.9755034,0.971615,0.3142082,0.9467709,0.7713277
3SsnPorch,0.7761081,0.6793531,0.9410952,0.4871255,0.9395985,0.7646561
PavedDrive,0.7755261,0.9563179,0.7082893,0.6582061,0.6806121,0.7557903
Class_1.5story,0.992792,0.6703099,0.5924804,0.9778888,0.5005962,0.7468135
Baths,0.573102,0.7684513,0.8458439,0.9843909,0.5238145,0.7391205
ExterCond,0.2136511,0.954799,0.8818727,0.4712412,0.8853337,0.6813795
GarageQual,0.6208846,0.3203128,0.5227863,0.8876588,0.73116,0.6165605
Fence,0.812539,0.6644502,0.5057891,0.461988,0.4421461,0.5773825
GarageCars,0.53452,0.8913593,0.2227036,0.5983303,0.5301194,0.5554065


In [74]:
from sklearn.model_selection import KFold
import statsmodels.api as sm 
from sklearn.preprocessing import StandardScaler

#Get features and response
X= housing.drop(columns='SalePrice')
y= housing.SalePrice.to_numpy().reshape(-1,1)

#Scale X and y
scaler_X = StandardScaler().fit(X)
X_scaled = scaler_X.transform(X)
scaler_y = StandardScaler().fit(y)
y_scaled = scaler_y.transform(y)

#Convert back to dataframes
X= pd.DataFrame(data=X_scaled,index=X.index,columns=X.columns)
y= pd.DataFrame(data=y_scaled,index=X.index,columns=['SalePrice'])

#Add constant for stats model API
X= sm.add_constant(X)

#Initialize empty results dataframe and K-folds object
kf = KFold(n_splits=5,shuffle=True,random_state=42)
results=pd.DataFrame(index=housing.columns)

#Train statsmodel objects
for train_idx, test_idx in kf.split(X,y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    ols = sm.OLS(y_train, X_train)
    ans = ols.fit()
    print(ans.rsquared)
    results= pd.concat([results,ans.pvalues],axis=1)
    #print(ans.pvalues.sort_values(ascending=False))
results['Mean']=results.mean(axis=1)
results.sort_values(['Mean'],ascending=False)


0.8864374204793761
0.8980440677676649
0.8861699444580762
0.8848221966009313
0.8880798987466282


Unnamed: 0,0,0.1,0.2,0.3,0.4,Mean
FireplaceQu,0.8096507,0.9489494,0.9052294,0.438361,0.9765548,0.8157491
YearRemodAdd,0.6485412,0.9755034,0.971615,0.3142082,0.9467709,0.7713277
3SsnPorch,0.7761081,0.6793531,0.9410952,0.4871255,0.9395985,0.7646561
PavedDrive,0.7755261,0.9563179,0.7082893,0.6582061,0.6806121,0.7557903
Class_1.5story,0.992792,0.6703099,0.5924804,0.9778888,0.5005962,0.7468135
const,0.768925,0.7146174,0.5902295,0.8581581,0.7780947,0.7420049
Baths,0.573102,0.7684513,0.8458439,0.9843909,0.5238145,0.7391205
ExterCond,0.2136511,0.954799,0.8818727,0.4712412,0.8853337,0.6813795
GarageQual,0.6208846,0.3203128,0.5227863,0.8876588,0.73116,0.6165605
Fence,0.812539,0.6644502,0.5057891,0.461988,0.4421461,0.5773825


In [76]:
#Simple stats model API on entire dataset
import statsmodels.api as sm 

X=housing.drop(columns='SalePrice')
y=housing.SalePrice

X_add_const = sm.add_constant(X)
ols = sm.OLS(y, X_add_const)
ans = ols.fit()
print(ans.summary())
print(ans.rsquared)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.888
Model:                            OLS   Adj. R-squared:                  0.885
Method:                 Least Squares   F-statistic:                     349.5
Date:                Sun, 22 Nov 2020   Prob (F-statistic):               0.00
Time:                        14:43:08   Log-Likelihood:                -29803.
No. Observations:                2580   AIC:                         5.972e+04
Df Residuals:                    2522   BIC:                         6.006e+04
Df Model:                          57                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const               -7.121e+05   9

In [None]:
#First round of drops
