In [1221]:
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels import regression as sm
import sklearn.linear_model as lm
from sklearn import model_selection as ms
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

In [1222]:
train_df = pd.read_csv(r'train.csv')
test_df = pd.read_csv(r'test.csv')

In [1223]:
# train_df.info()
categorical = list(train_df.select_dtypes('object').columns)
# categorical

There are 1460 samples in the training data set and 80 features. There are 43 columns with the 'object' data type,
meaning non-numeric categorical data. These features are contained in the "categorical" list.

I will select seven non-categorical features.

In [1224]:
# sns.pairplot(train_df[['SalePrice', 'LotArea', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'FullBath', '1stFlrSF', '2ndFlrSF']])

The plots we care about here are in row 1 (or column 1). There appears to be a correlation between sales price and:
Overall Quality, 1st Floor Area, 2nd Floor Area, and some slight correlations with Year of Remodelling, and
Number of Full Baths.

In [1225]:
# sm.linear_model.OLS()

In [1226]:
train_df = train_df.drop('Id', axis=1)
df_encoded = pd.get_dummies(train_df)
# df_encoded.info(verbose=True, null_counts=True)
# df_encoded

In [1227]:
columns = df_encoded.columns
df_encoded.columns[df_encoded.isna().any()].tolist()

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

Setting values to the mean or zeroes could highly skew the results of a regression model.
I will use KNN to perform multivariate imputation, filling in the above columns.

In [1228]:
imputer = KNNImputer(n_neighbors=3, weights="uniform")
imputer.fit(df_encoded)
df_encoded = pd.DataFrame(imputer.fit_transform(df_encoded), columns = columns)

In [1229]:
# encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
# encoder.fit(train_df)

In [1230]:
split = model_selection.train_test_split(df_encoded, train_size=0.8)
train_split = split[0]
test_split = split[1]
# train_split

In [1231]:
normalize_train = StandardScaler().fit(train_split)
normalize_test = StandardScaler().fit(test_split)

train_norm = normalize_train.transform(train_split)
train_norm = pd.DataFrame(train_norm, columns = columns)

test_norm = normalize_test.transform(test_split)
test_norm = pd.DataFrame(test_norm, columns = columns)

In [1232]:
lin_reg = lm.LinearRegression(normalize=False).fit(train_norm.drop(['SalePrice'], axis=1), train_norm['SalePrice'])
lin_pred = lin_reg.predict(test_norm.drop(['SalePrice'], axis=1))
# lin_reg.score(test_norm.drop(['SalePrice'], axis=1), test_norm['SalePrice'])
r2_score(test_norm['SalePrice'], lin_pred)

-5.417990108626332e+24

Interestingly, when we perform OLS on normalized data, there is some serious overfitting going on. Our predicted values
are much higher in magnitude than our actual values.

In [1233]:
lin_reg = lm.LinearRegression(normalize=False).fit(train_split.drop(['SalePrice'], axis=1), train_split['SalePrice'])
lin_pred = lin_reg.predict(test_split.drop(['SalePrice'], axis=1))
# lin_reg.score(test_norm.drop(['SalePrice'], axis=1), test_norm['SalePrice'])
r2_score(test_split['SalePrice'], lin_pred)

0.8534778960314928