In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn import metrics

%run 00_Functions.ipynb
%run 01_Data_Cleaning.ipynb


In [23]:
# Show every column and every row when a frame is displayed -- no truncation.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Drop PID, since we know it should have no correlation with Sale Price

In [24]:
# PID is removed before modelling. Assigning the result (rather than mutating
# in place) together with errors='ignore' keeps this cell idempotent: running
# it again after PID is already gone no longer raises a KeyError.
props = props.drop(columns='PID', errors='ignore')

In [25]:
# Every column whose dtype is not numeric; these are one-hot encoded further down.
non_numerics = props.select_dtypes(exclude='number')
non_numerics.head()

Unnamed: 0_level_0,MS SubClass,MS Zoning,Alley,Lot Shape,Land Contour,Lot Config,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin Type 2,Heating,Heating QC,Electrical,Kitchen Qual,Functional,Fireplace Qu,Garage Type,Garage Finish,Garage Qual,Garage Cond,Pool QC,Fence,Misc Feature,Mo Sold,Sale Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
109,60,RL,,IR1,Lvl,CulDSac,Sawyer,RRAe,Norm,1Fam,2Story,6,8,Gable,CompShg,HdBoard,Plywood,BrkFace,Gd,TA,CBlock,TA,TA,No,GLQ,Unf,GasA,Ex,SBrkr,Gd,Typ,,Attchd,RFn,TA,TA,,,,3,WD
544,60,RL,,IR1,Lvl,CulDSac,SawyerW,Norm,Norm,1Fam,2Story,7,5,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,,,,4,WD
153,20,RL,,Reg,Lvl,Inside,NAmes,Norm,Norm,1Fam,1Story,5,7,Gable,CompShg,VinylSd,VinylSd,,TA,Gd,CBlock,TA,TA,No,GLQ,Unf,GasA,TA,SBrkr,Gd,Typ,,Detchd,Unf,TA,TA,,,,1,WD
318,60,RL,,Reg,Lvl,Inside,Timber,Norm,Norm,1Fam,2Story,5,5,Gable,CompShg,VinylSd,VinylSd,,TA,TA,PConc,Gd,TA,No,Unf,Unf,GasA,Gd,SBrkr,TA,Typ,,BuiltIn,Fin,TA,TA,,,,4,WD
255,50,RL,,IR1,Lvl,Inside,SawyerW,Norm,Norm,1Fam,1.5Fin,6,8,Gable,CompShg,Wd Sdng,Plywood,,TA,TA,PConc,Fa,Gd,No,Unf,Unf,GasA,TA,SBrkr,TA,Typ,,Detchd,Unf,TA,TA,,,,3,WD


## One-Hot Encode Non-Numeric Columns

In [26]:
# List the non-numeric column names; this is the set passed to get_dummies below.
non_numerics.columns

Index(['MS SubClass', 'MS Zoning', 'Alley', 'Lot Shape', 'Land Contour',
       'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating',
       'Heating QC', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Pool QC', 'Fence', 'Misc Feature', 'Mo Sold',
       'Sale Type'],
      dtype='object')

In [27]:
# One-hot encode every non-numeric column of `props`. drop_first=True drops the
# first level of each encoded column so the dummies are not perfectly collinear.
props_dummy = pd.get_dummies(
    props,
    columns=non_numerics.columns,
    drop_first=True,
)

In [28]:
# Correlation of every numeric/dummy column with SalePrice, strongest positive first.
corr = (
    props_dummy
    .corr(numeric_only=True)
    .loc[:, ['SalePrice']]
    .sort_values(by='SalePrice', ascending=False)
)

In [29]:
# Keep every feature whose absolute correlation with SalePrice exceeds 0.2.
# The target itself is removed by label rather than by position (`.index[1:]`),
# so the result no longer depends on SalePrice happening to sort first.
# NOTE(review): `corr` is computed on the full frame, before train_test_split is
# called, so this selection also sees rows that later land in the test set.
strongly_correlated = corr['SalePrice'].abs() > 0.2
correlated_features = corr.loc[strongly_correlated].index.drop('SalePrice', errors='ignore')

In [30]:
# Display the selected feature names (ordered by descending correlation with SalePrice).
correlated_features

Index(['Gr Liv Area', 'Garage Area', 'Garage Cars', 'Total Bsmt SF',
       '1st Flr SF', 'Garage Yr Blt', 'Full Bath', 'Foundation_PConc',
       'Mas Vnr Area', 'TotRms AbvGrd', 'Overall Qual_9', 'Fireplaces',
       'BsmtFin Type 1_GLQ', 'Exter Qual_Gd', 'Neighborhood_NridgHt',
       'BsmtFin SF 1', 'Overall Qual_8', 'Fireplace Qu_Gd', 'Bsmt Exposure_Gd',
       'Overall Cond_5', 'MS SubClass_60', 'Garage Type_Attchd',
       'Sale Type_New', 'Exterior 1st_VinylSd', 'Lot Frontage',
       'Exterior 2nd_VinylSd', 'Wood Deck SF', 'Open Porch SF',
       'Mas Vnr Type_Stone', 'Kitchen Qual_Gd', 'Overall Qual_10', 'Lot Area',
       'Paved Drive', 'Bsmt Full Bath', 'Half Bath', 'Central Air',
       'Garage Cond_TA', 'Roof Style_Hip', 'Neighborhood_NoRidge',
       'Mas Vnr Type_BrkFace', 'Neighborhood_StoneBr', 'Electrical_SBrkr',
       '2nd Flr SF', 'Garage Qual_TA', 'Bsmt Qual_Gd', 'MS Zoning_RL',
       'Garage Type_BuiltIn', 'Land Contour_HLS', 'House Style_2Story',
       'Sale 

## Some dropping / feature engineering based on what we see above

The features dropped below are either sparse or could introduce multicollinearity (for example, ages of subsections of the house, or square-footage totals that are already represented by other features).

In [31]:
# Features excluded by hand from the correlated set (see the note above this cell).
excluded_features = ('Total Bsmt SF', 'Garage Yr Blt', 'TotRms AbvGrd')
features_to_use_list = [
    name
    for name in correlated_features
    if name not in excluded_features
]

In [32]:
# Design matrix: the selected columns of the one-hot encoded frame.
X_1 = props_dummy[features_to_use_list]
# Target: the sale price column of the original frame.
y = props.loc[:, 'SalePrice']


In [33]:
# NOTE(review): disabled experiment kept for reference; nothing in this cell executes.
# `log_transform` is not defined in this notebook -- presumably it comes from
# 00_Functions.ipynb (TODO confirm).
# If the fillna line is ever re-enabled, verify that it actually updates `props`:
# fillna(inplace=True) on the result of `.loc[:, [...]]` likely fills a temporary
# copy only.
# log_transform(props, ['Gr Liv Area','1st Flr SF','2nd Flr SF','BsmtFin SF 1','BsmtFin SF 2'])
# props.loc[: , ['Gr Liv Area','1st Flr SF','2nd Flr SF','BsmtFin SF 1','BsmtFin SF 2']].fillna(0, inplace = True)

In [34]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X_1,
    y,
    test_size=0.3,
    random_state=2023,
)

In [35]:
# Imputer for missing feature values; 'mean' is SimpleImputer's default strategy.
si = SimpleImputer(strategy='mean')

In [36]:
# Learn the fill values from the training split only, then apply them to both splits.
si.fit(X_train)
X_train_si = si.transform(X_train)
X_test_si = si.transform(X_test)

In [37]:
# Ordinary least squares fitted on the imputed training features.
ols = LinearRegression()
ols.fit(X=X_train_si, y=y_train)