In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn import metrics

%run 00_Functions.ipynb
%run 01_Data_Cleaning.ipynb


In [31]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# see all of my columns, no truncating!

Make a new version of a usable dataframe

In [32]:
X_2 = props.copy()

### Dropping values that are either clearly colinear (square footage totals, for example) and PID)

In [33]:
X_2.drop(['PID','Garage Yr Blt'], axis = 1, inplace = True)

In [34]:
non_numerics = X_2.select_dtypes(exclude = np.number)

### Exploring correlated numeric values to decide which to use in the model / engineer

In [35]:
corr = X_2.corr(numeric_only = True)[['SalePrice']].sort_values( by = 'SalePrice', ascending = False)

In [36]:
corr

Unnamed: 0,SalePrice
SalePrice,1.0
Gr Liv Area,0.699026
Garage Area,0.648661
Garage Cars,0.647015
Total Bsmt SF,0.631975
1st Flr SF,0.623523
Full Bath,0.538471
Mas Vnr Area,0.512699
TotRms AbvGrd,0.505188
Fireplaces,0.471222


In [37]:
non_numerics.columns

Index(['MS SubClass', 'MS Zoning', 'Alley', 'Lot Shape', 'Land Contour',
       'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating',
       'Heating QC', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Pool QC', 'Fence', 'Misc Feature', 'Mo Sold',
       'Sale Type'],
      dtype='object')

### Selecting features based on what kind of correlation value they have

In [38]:
correlated_numeric_features = corr[(corr['SalePrice'] > 0.10) |  (corr['SalePrice'] < -0.10)].index[1:]

In [39]:
X_2_nums = X_2.loc[:, list(correlated_numeric_features)]


###  Converting categorical values to dummies

In [40]:
X_2_non_numerics = X_2.select_dtypes(exclude = np.number)

In [41]:
replace_NaN_with_NA(X_2, [column for column in X_2_non_numerics.columns if X_2_non_numerics[column].isnull().sum() > 0])

In [42]:
X_2_dummy = pd.get_dummies(X_2_non_numerics, columns = non_numerics.columns, drop_first = True)

Trimmed down the list of dummied categories to limit underrepresented and overrepresented values

In [43]:
X_2_means = X_2_dummy.mean().sort_values().to_frame()
X_2_dummy_trimmed = list(X_2_means[(X_2_means[0] > 0.10) & (X_2_means[0] < 0.90)].index)

In [44]:
X_2_dummy.shape

(2026, 257)

In [45]:
X_2_nums.shape

(2026, 26)

### Settling on X and Y for this iteration

In [46]:
X_2_use = X_2_nums.join(X_2_dummy.loc[:, X_2_dummy_trimmed], on='Id', how='left')
y2 = X_2['SalePrice']

In [47]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X_2_use, y2, test_size = 0.3, random_state=2023)

In [48]:
si = SimpleImputer()

In [49]:
X2_train_si = si.fit_transform(X2_train)
X2_test_si = si.transform(X2_test)

In [50]:
ols = LinearRegression()
ols.fit(X2_train_si, y2_train)

In [51]:
ols.coef_

array([ 1.04728573e+01,  4.96997962e+00,  1.38858350e+04,  2.20363508e+01,
        2.09077131e+01,  6.89599951e+03,  3.65794818e+01,  1.57737204e+03,
        9.76476709e+03, -1.46349654e+01, -6.84405236e+01,  1.53412878e+01,
       -5.58614147e+00,  3.94772446e-01,  8.25681846e+03,  1.04004583e+04,
        6.06504247e+02,  9.72413523e+03,  4.30240505e+01, -6.82151216e+00,
       -8.45281702e+02,  6.69202876e+01, -1.77409535e+04,  2.50509110e+01,
       -3.66416172e+02, -1.99901164e+02,  1.73780252e+03,  1.50983151e+04,
       -3.53183069e+03,  1.91527433e+04,  4.27329609e+03,  1.65227770e+04,
        8.56542064e+03, -9.47705958e+03,  7.39108424e+03,  8.34677302e+03,
       -6.04190916e+03, -4.33008937e+03, -3.95056907e+03, -2.20341528e+03,
        1.09155300e+04, -8.44114497e+03,  2.57430918e+03,  3.19745222e+03,
        9.79913658e+02,  3.07640467e+03, -1.58213194e+03, -2.70187844e+03,
       -7.00564894e+03,  4.25184384e+03,  4.13032817e+02, -4.83594245e+03,
        3.33453085e+02, -