In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
#!pip install --upgrade git+https://github.com/statsmodels/statsmodels


# Colab-only helper; mounting will prompt for authorization.
from google.colab import drive

# Mount Google Drive at '/content/drive'. The CSV files read below are
# addressed under "/content/drive/My Drive/".
drive.mount('/content/drive')


#%% Import Data
# Read the train and test CSVs from the mounted Drive folder.
train = pd.read_csv('/content/drive/My Drive/ml_project/matt/train.csv')
test = pd.read_csv('/content/drive/My Drive/ml_project/matt/test.csv')

# Lowercase every column name so the rest of the notebook can use
# attribute access such as train.garagearea or test.street.
train.columns = list(map(str.lower, train.columns))
test.columns = list(map(str.lower, test.columns))


#outlier removal
# Keep only rows below both area thresholds. Rows with NaN in either column
# are removed as well, because NaN comparisons evaluate to False.
# .copy() detaches the filtered frame so the new-column assignments that
# follow do not raise SettingWithCopyWarning.
train = train[(train.garagearea < 1200) & (train.totalbsmtsf < 2500)].copy()

#street type to street bool
# One-hot encode `street` and drop the first level.
# Assumes `street` has exactly two levels, so a single indicator column
# remains and can be assigned to one column — TODO confirm.
# dtype is pinned to uint8: pandas >= 2.0 returns bool dummies by default,
# and the later select_dtypes(include=[np.number]) would silently drop a
# bool column from the model features.
train['enc_street'] = pd.get_dummies(train.street, drop_first = True, dtype = np.uint8)
test['enc_street'] = pd.get_dummies(test.street, drop_first = True, dtype = np.uint8)

#pool type to pool bool
def encode(x):
    """Return 1 when the pool area `x` is greater than zero, otherwise 0."""
    return int(x > 0)
train['enc_pool'] = train.poolarea.apply(encode)
# The test flag must come from test's own poolarea. Deriving it from
# train.poolarea would index-align training rows onto the test frame,
# copying unrelated values and leaving NaN where no training row shares
# the index label.
test['enc_pool'] = test.poolarea.apply(encode)

#financial crisis bool
def encode(x):
    """Return 1 when the sale year `x` is later than 2008, otherwise 0."""
    if x > 2008:
        return 1
    return 0
train['enc_after08'] = train.yrsold.apply(encode)
# The test flag must come from test's own yrsold. Using train.yrsold here
# would index-align training sale years onto the test rows.
test['enc_after08'] = test.yrsold.apply(encode)

#condition-partial bool
def encode(x):
    """Return 1 when the sale condition `x` equals 'Partial', otherwise 0."""
    if x == 'Partial':
        return 1
    return 0
# Flag 'Partial' sale conditions on each frame, using that frame's own column.
train['enc_condition'] = train['salecondition'].apply(encode)
test['enc_condition'] = test['salecondition'].apply(encode)

#interpolate
# Keep only numeric columns, fill gaps by interpolation, then drop any
# rows that still contain NaN afterwards.
train_data = (
    train
    .select_dtypes(include=[np.number])
    .interpolate()
    .dropna()
)
test_data = (
    test
    .select_dtypes(include=[np.number])
    .interpolate()
    .dropna()
)


##################
# Target: log of the sale price. It is read from train_data (not train) so
# that y has exactly the same rows as X even if dropna() removed some;
# otherwise train_test_split would see inconsistent sample counts.
y = np.log(train_data.saleprice)
# Features: every numeric column except the target, the row id, and the raw
# poolarea / yrsold columns (their enc_pool / enc_after08 flags stay in).
X = train_data.drop(columns = ['saleprice', 'id', 'poolarea', 'yrsold'])


#split
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100, test_size = .2)





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import statsmodels.api as sm

# Fit ordinary least squares of the log sale price on all candidate features.
# NOTE(review): no constant column is added to X_train, so the model has no
# intercept (the summary reports an uncentered R-squared) — confirm intended.
ols = sm.OLS(endog = y_train, exog = X_train)
ans = ols.fit()
print(ans.summary())

                                 OLS Regression Results                                
Dep. Variable:              saleprice   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          2.860e+05
Date:                Mon, 27 May 2019   Prob (F-statistic):                        0.00
Time:                        23:20:09   Log-Likelihood:                          759.63
No. Observations:                1159   AIC:                                     -1447.
Df Residuals:                    1123   BIC:                                     -1265.
Df Model:                          36                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

In [0]:
# Turn the coefficient table of the text summary into a typed DataFrame.
summary_columns = ['name','coef','std err','t value','p value','2.5% confidence','97.5% confidence']
coef_rows = ans.summary().tables[1].data[1:]  # first row is the header
table = pd.DataFrame(coef_rows, columns = summary_columns)
table = table.astype({col: (str if col == 'name' else float) for col in summary_columns})
# Show the coefficients whose p-value is below 0.05 (0 lies outside their CIs).
table[table['p value'] < 0.05]

Unnamed: 0,name,coef,std err,t value,p value,2.5% confidence,97.5% confidence
0,mssubclass,-0.0005,9.79e-05,-5.022,0.0,-0.001,-0.0
1,lotarea,3e-06,4.33e-07,6.631,0.0,2e-06,4e-06
2,overallqual,0.0681,0.004,15.647,0.0,0.06,0.077
3,overallcond,0.0572,0.004,13.622,0.0,0.049,0.065
4,yearbuilt,0.0036,0.0,17.579,0.0,0.003,0.004
5,yearremodadd,0.0017,0.0,8.236,0.0,0.001,0.002
6,bsmtfinsf1,7.8e-05,1.25e-05,6.22,0.0,5.3e-05,0.0
7,totalbsmtsf,0.0001,1.38e-05,7.331,0.0,7.4e-05,0.0
8,grlivarea,0.0003,1.07e-05,26.358,0.0,0.0,0.0
9,bsmtfullbath,0.0337,0.01,3.433,0.001,0.014,0.053


In [0]:
# Keep the features whose tabulated p-value is at most 0.05 and rebuild X
# from just those columns.
cols = table.loc[table['p value'] <= 0.05, 'name'].tolist()
X = train_data[cols]
X.columns


Index(['mssubclass', 'lotarea', 'overallqual', 'overallcond', 'yearbuilt',
       'yearremodadd', 'bsmtfinsf1', 'totalbsmtsf', 'grlivarea',
       'bsmtfullbath', 'fireplaces', 'garagecars', 'wooddecksf',
       'enclosedporch', 'screenporch', 'enc_street', 'enc_condition'],
      dtype='object')

In [0]:
# Re-split with the reduced feature set (same seed and test fraction as the
# first split) and refit the model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = .2, random_state = 100
)
ols = sm.OLS(endog = y_train, exog = X_train)
ans = ols.fit()
print(ans.summary())

                                 OLS Regression Results                                
Dep. Variable:              saleprice   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          6.046e+05
Date:                Mon, 27 May 2019   Prob (F-statistic):                        0.00
Time:                        23:26:44   Log-Likelihood:                          748.94
No. Observations:                1159   AIC:                                     -1464.
Df Residuals:                    1142   BIC:                                     -1378.
Df Model:                          17                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

In [0]:
# Rebuild the typed coefficient table from the refitted model's summary.
summary_columns = ['name','coef','std err','t value','p value','2.5% confidence','97.5% confidence']
coef_rows = ans.summary().tables[1].data[1:]  # first row is the header
table = pd.DataFrame(coef_rows, columns = summary_columns)
table = table.astype({col: (str if col == 'name' else float) for col in summary_columns})
# Show the coefficients whose p-value is below 0.05 (0 lies outside their CIs).
table[table['p value'] < 0.05]

Unnamed: 0,name,coef,std err,t value,p value,2.5% confidence,97.5% confidence
0,mssubclass,-0.0005,9.93e-05,-4.903,0.0,-0.001,-0.0
1,lotarea,3e-06,4.34e-07,6.621,0.0,2e-06,4e-06
2,overallqual,0.0681,0.004,15.504,0.0,0.059,0.077
3,overallcond,0.0571,0.004,13.569,0.0,0.049,0.065
4,yearbuilt,0.0035,0.0,17.281,0.0,0.003,0.004
5,yearremodadd,0.0017,0.0,8.266,0.0,0.001,0.002
6,bsmtfinsf1,7.7e-05,1.25e-05,6.183,0.0,5.3e-05,0.0
7,totalbsmtsf,0.0001,1.83e-05,5.454,0.0,6.4e-05,0.0
10,grlivarea,0.0002,8.33e-05,2.272,0.023,2.6e-05,0.0
11,bsmtfullbath,0.0337,0.01,3.429,0.001,0.014,0.053


In [0]:
# Keep the features whose tabulated p-value is at most 0.05 and rebuild X
# from just those columns.
cols = table.loc[table['p value'] <= 0.05, 'name'].tolist()
X = train_data[cols]
X.columns

Index(['mssubclass', 'lotarea', 'overallqual', 'overallcond', 'yearbuilt',
       'yearremodadd', 'bsmtfinsf1', 'totalbsmtsf', 'grlivarea',
       'bsmtfullbath', 'fireplaces', 'garagecars', 'wooddecksf',
       'enclosedporch', 'screenporch', 'enc_street', 'enc_condition'],
      dtype='object')

In [0]:
# Re-split with the current feature set (same seed and test fraction as
# before) and refit the model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = .2, random_state = 100
)
ols = sm.OLS(endog = y_train, exog = X_train)
ans = ols.fit()
print(ans.summary())

                                 OLS Regression Results                                
Dep. Variable:              saleprice   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          6.046e+05
Date:                Mon, 27 May 2019   Prob (F-statistic):                        0.00
Time:                        23:20:09   Log-Likelihood:                          748.94
No. Observations:                1159   AIC:                                     -1464.
Df Residuals:                    1142   BIC:                                     -1378.
Df Model:                          17                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

In [0]:
# Rebuild the typed coefficient table from the latest fit's text summary.
table = pd.DataFrame(ans.summary().tables[1].data[1:])
table.columns = ['name','coef','std err','t value','p value','2.5% confidence','97.5% confidence']
table = table.astype({'name':str,'coef':float,'std err':float, 't value':float, 'p value':float,'2.5% confidence':float, '97.5% confidence':float})

# Select the features with p-value <= 0.05. X already carries exactly these
# column names, so no column reassignment is needed.
cols = list(table[table['p value']<=0.05].name)
X = train_data.loc[:, cols]

# Re-split with the same seed and test fraction, then refit and report.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100, test_size = .2)
ols = sm.OLS(y_train, X_train)
ans = ols.fit()
print(ans.summary())

                                 OLS Regression Results                                
Dep. Variable:              saleprice   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          6.046e+05
Date:                Mon, 27 May 2019   Prob (F-statistic):                        0.00
Time:                        23:21:20   Log-Likelihood:                          748.94
No. Observations:                1159   AIC:                                     -1464.
Df Residuals:                    1142   BIC:                                     -1378.
Df Model:                          17                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

In [0]:
# Predict on the test set using the selected feature columns.
features = test_data.loc[:, cols]
predictions = ans.predict(features)
# The model was fit on log(saleprice); exponentiate to get prices back.
saleprice = np.exp(predictions)

# Build the submission in one expression, with no in-place mutation and no
# discarded reset_index() call.
# NOTE(review): Ids come from `test` while predictions follow `test_data`'s
# index; any test row removed by the earlier dropna() would end up with a
# NaN SalePrice — confirm no rows were dropped.
submission = (
    test[['id']]
    .rename(columns = {'id': 'Id'})
    .assign(SalePrice = saleprice)
    .set_index('Id')
)
submission.to_csv('/content/drive/My Drive/ml_project/matt/mult_linreg.csv')
submission

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,116656.498814
1462,150153.256853
1463,179067.641521
1464,199549.930676
1465,178513.294722
1466,172152.110733
1467,199657.923950
1468,164696.946767
1469,194554.073469
1470,116681.995685


Unnamed: 0,Id,SalePrice
0,1461,116656.498814
1,1462,150153.256853
2,1463,179067.641521
3,1464,199549.930676
4,1465,178513.294722
5,1466,172152.110733
6,1467,199657.923950
7,1468,164696.946767
8,1469,194554.073469
9,1470,116681.995685
