In [120]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [121]:
props = pd.read_csv('../datasets/model-ready-data.csv')

In [122]:
props.set_index('Id', inplace = True)

### Recast some of the stubborn numerics that don't want to stay objects

In [123]:
props['PID'] = props['PID'].astype(str)
props['MS SubClass'] = props['MS SubClass'].astype(str)
props['Overall Qual'] = props['Overall Qual'].astype(str)
props['Overall Cond'] = props['Overall Cond'].astype(str)
props['Mo Sold'] = props['Mo Sold'].astype(str)

### Drop PID since we know it should have no correlation to Sale Price

In [124]:
props.drop('PID', axis = 1, inplace = True)

In [125]:
non_numerics = props.select_dtypes(exclude = np.number)
non_numerics.head()

Unnamed: 0_level_0,MS SubClass,MS Zoning,Alley,Lot Shape,Land Contour,Lot Config,Neighborhood,Condition 1,Condition 2,Bldg Type,...,Fireplace Qu,Garage Type,Garage Finish,Garage Qual,Garage Cond,Pool QC,Fence,Misc Feature,Mo Sold,Sale Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,60,RL,,IR1,Lvl,CulDSac,Sawyer,RRAe,Norm,1Fam,...,,Attchd,RFn,TA,TA,,,,3,WD
544,60,RL,,IR1,Lvl,CulDSac,SawyerW,Norm,Norm,1Fam,...,TA,Attchd,RFn,TA,TA,,,,4,WD
153,20,RL,,Reg,Lvl,Inside,NAmes,Norm,Norm,1Fam,...,,Detchd,Unf,TA,TA,,,,1,WD
318,60,RL,,Reg,Lvl,Inside,Timber,Norm,Norm,1Fam,...,,BuiltIn,Fin,TA,TA,,,,4,WD
255,50,RL,,IR1,Lvl,Inside,SawyerW,Norm,Norm,1Fam,...,,Detchd,Unf,TA,TA,,,,3,WD


In [126]:
non_numerics.columns

Index(['MS SubClass', 'MS Zoning', 'Alley', 'Lot Shape', 'Land Contour',
       'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating',
       'Heating QC', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Pool QC', 'Fence', 'Misc Feature', 'Mo Sold',
       'Sale Type'],
      dtype='object')

In [127]:
props_dummy = pd.get_dummies(props, columns = non_numerics.columns, drop_first = True)

In [128]:
props_dummy.shape

(2026, 284)

In [129]:
corr = props_dummy.corr(numeric_only = True)[['SalePrice']].sort_values( by = 'SalePrice', ascending = False)

In [130]:
correlated_features = corr[(corr['SalePrice'] > 0.2) |  (corr['SalePrice'] < -0.2)]

In [131]:
correlated_features.index[1:]

Index(['Gr Liv Area', 'Garage Area', 'Garage Cars', 'Total Bsmt SF',
       '1st Flr SF', 'Garage Yr Blt', 'Full Bath', 'Foundation_PConc',
       'Mas Vnr Area', 'TotRms AbvGrd', 'Overall Qual_9', 'Fireplaces',
       'BsmtFin Type 1_GLQ', 'Exter Qual_Gd', 'Neighborhood_NridgHt',
       'BsmtFin SF 1', 'Overall Qual_8', 'Fireplace Qu_Gd', 'Bsmt Exposure_Gd',
       'Overall Cond_5', 'MS SubClass_60', 'Garage Type_Attchd',
       'Sale Type_New', 'Exterior 1st_VinylSd', 'Lot Frontage',
       'Exterior 2nd_VinylSd', 'Wood Deck SF', 'Open Porch SF',
       'Mas Vnr Type_Stone', 'Kitchen Qual_Gd', 'Overall Qual_10', 'Lot Area',
       'Paved Drive', 'Bsmt Full Bath', 'Half Bath', 'Garage Cond_TA',
       'Roof Style_Hip', 'Neighborhood_NoRidge', 'Mas Vnr Type_BrkFace',
       'Neighborhood_StoneBr', 'Electrical_SBrkr', '2nd Flr SF',
       'Garage Qual_TA', 'Bsmt Qual_Gd', 'MS Zoning_RL', 'Garage Type_BuiltIn',
       'Land Contour_HLS', 'House Style_2Story', 'Sale Type_WD ',
       'Nei

## Some dropping / feature engineering based on what we see above

The following are either sparse data or could introduce multicolinearity (such as ages of subsections of the house and totals of square footage already represented here)

In [132]:

props.drop(['Alley', 'Garage Yr Blt', 'Pool QC', 'Misc Feature', 'Fence' ], axis = 1, inplace = True )

In [133]:
sf = [feature for feature in props.columns if 'sf' in feature.lower() or 'tot' in feature.lower() ]

In [134]:
props.drop(['Total Bsmt SF', 'TotRms AbvGrd'], axis = 1, inplace = True)

In [135]:
non_numerics = C
props_dummy = pd.get_dummies(props, columns = non_numerics.columns, drop_first = True)
props_dummy.shape

NameError: name 'C' is not defined

In [None]:
corr = props_dummy.corr(numeric_only = True)[['SalePrice']].sort_values( by = 'SalePrice', ascending = False)

In [119]:
props.loc[: [item for item in correlated_features.index[1:]]]

InvalidIndexError: ['Gr Liv Area', 'Garage Area', 'Garage Cars', '1st Flr SF', 'Full Bath', 'Foundation_PConc', 'Mas Vnr Area', 'Overall Qual_9', 'Fireplaces', 'BsmtFin Type 1_GLQ', 'Exter Qual_Gd', 'Neighborhood_NridgHt', 'BsmtFin SF 1', 'Overall Qual_8', 'Fireplace Qu_Gd', 'Bsmt Exposure_Gd', 'Overall Cond_5', 'MS SubClass_60', 'Garage Type_Attchd', 'Sale Type_New', 'Exterior 1st_VinylSd', 'Lot Frontage', 'Exterior 2nd_VinylSd', 'Wood Deck SF', 'Open Porch SF', 'Mas Vnr Type_Stone', 'Kitchen Qual_Gd', 'Overall Qual_10', 'Lot Area', 'Paved Drive', 'Bsmt Full Bath', 'Half Bath', 'Garage Cond_TA', 'Roof Style_Hip', 'Neighborhood_NoRidge', 'Mas Vnr Type_BrkFace', 'Neighborhood_StoneBr', 'Electrical_SBrkr', '2nd Flr SF', 'Garage Qual_TA', 'Bsmt Qual_Gd', 'MS Zoning_RL', 'Garage Type_BuiltIn', 'Land Contour_HLS', 'House Style_2Story', 'Sale Type_WD ', 'Neighborhood_OldTown', 'MS SubClass_30', 'Roof Style_Gable', 'Overall Qual_4', 'MS Zoning_RM', 'Bsmt Exposure_No', 'Lot Shape_Reg', 'Heating QC_TA', 'Foundation_CBlock', 'Overall Qual_5', 'Garage Type_Detchd', 'Mas Vnr Type_None', 'Garage Finish_Unf', 'Bsmt Qual_TA', 'Kitchen Qual_TA', 'Year Remod/Add', 'Years since built', 'Exter Qual_TA']

In [100]:
ols = LinearRegression()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(