In [6]:
# Implementation of https://www.jstor.org/stable/25734098
# Bayes Bayes Bayes

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset; the first CSV column is used as the row index
df_raw = pd.read_csv('./data/train_with_dummies.csv', index_col=[0])

# Columns whose names start with any of these prefixes are excluded from the
# feature matrix (this includes the target column, SalePrice)
prefixes_to_drop = ['Id', 'SaleType', 'SaleCondition', 'SalePrice']

# str.startswith accepts a tuple, so one call tests every prefix
columns_to_drop = [col for col in df_raw.columns if col.startswith(tuple(prefixes_to_drop))]
df_filtered = df_raw.drop(columns=columns_to_drop)

# Impute missing values with each column's mean.
# NOTE(review): the imputer (and the scaler below) are fitted on every row, before
# the train/test split, so held-out rows influence the preprocessing statistics.
# Consider fitting both on the training split only (e.g. inside a sklearn Pipeline).
imputer = SimpleImputer(strategy='mean')
# Only the column labels are passed when rebuilding the frame, so with the array
# output scikit-learn returns by default the rows get a fresh 0..n-1 index
df_imputed = pd.DataFrame(imputer.fit_transform(df_filtered), columns=df_filtered.columns)

# Target variable, taken from the raw frame because SalePrice was dropped above
sale_price_col = df_raw['SalePrice']
sale_price_mean = sale_price_col.mean()

# Standardize every feature (StandardScaler: zero mean, unit variance per column)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_imputed)
df_scaled = pd.DataFrame(scaled_data, columns=df_imputed.columns)

# Full standardized feature matrix
data_x = df_scaled

selected_columns = [
    'OverallQual',
    'MasVnrArea',
    'TotalBsmtSF',
    'GrLivArea',
    'GarageCars',
    'Neighborhood_NWAmes',
    'Neighborhood_NoRidge',
    'Neighborhood_NridgHt',
    'Exterior1st_CBlock',
    'ExterQual_TA',
    'Foundation_PConc',
    'BsmtFinType1_GLQ',
    'KitchenQual_Ex',
    'KitchenQual_TA',
    'GarageType_BuiltIn'
]

# Restrict the model input to the selected columns
data_x_selected = df_scaled[selected_columns]

# Drop the original index so the target aligns positionally with the feature rows
data_y = sale_price_col.reset_index(drop=True)

# Sanity check: preprocessing must not have changed the number of rows
assert len(data_x_selected) == len(data_y), "features and target have different row counts"

In [10]:
# Preview the first rows of the standardized, column-selected feature frame
data_x_selected.head()

Unnamed: 0,OverallQual,MasVnrArea,TotalBsmtSF,GrLivArea,GarageCars,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Exterior1st_CBlock,ExterQual_TA,Foundation_PConc,BsmtFinType1_GLQ,KitchenQual_Ex,KitchenQual_TA,GarageType_BuiltIn
0,0.651479,0.511418,-0.459303,0.370333,0.311725,-0.229416,-0.169981,-0.235958,-0.02618,-1.278819,1.120968,1.578868,-0.271163,-1.006873,-0.253259
1,-0.071836,-0.57441,0.466465,-0.482512,0.311725,-0.229416,-0.169981,-0.235958,-0.02618,0.781971,-0.892086,-0.633365,-0.271163,0.993174,-0.253259
2,0.651479,0.32306,-0.313369,0.515013,0.311725,-0.229416,-0.169981,-0.235958,-0.02618,-1.278819,1.120968,1.578868,-0.271163,-1.006873,-0.253259
3,0.651479,-0.57441,-0.687324,0.383659,1.650307,-0.229416,-0.169981,-0.235958,-0.02618,0.781971,-0.892086,-0.633365,-0.271163,-1.006873,-0.253259
4,1.374795,1.36457,0.19968,1.299326,1.650307,-0.229416,5.883006,-0.235958,-0.02618,-1.278819,1.120968,1.578868,-0.271163,-1.006873,-0.253259


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data_x_selected,
    data_y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fit a linear regression on the training split (fit() returns the estimator itself)
linear_reg = LinearRegression().fit(X_train, y_train)
# One fitted coefficient per input column, in the column order of X_train
coefficients = linear_reg.coef_

# Score the held-out rows
y_pred = linear_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report both error metrics, one per line
print(
    f'Root Mean Squared Error: {rmse}',
    f'Mean Squared Error: {mse}',
    sep='\n',
)

Root Mean Squared Error: 35615.149041670105
Mean Squared Error: 1268438841.260375


In [11]:
# Map coefficients to feature names
feature_importance = dict(zip(selected_columns, coefficients))

# Sort features by their absolute importance
sorted_feature_importance = sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True)

# Print sorted feature importance
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {round(importance, 2)}")

GrLivArea: 21242.12
OverallQual: 20444.39
GarageCars: 12095.79
KitchenQual_Ex: 10795.07
TotalBsmtSF: 9403.69
Neighborhood_NoRidge: 7003.22
BsmtFinType1_GLQ: 6460.28
Neighborhood_NridgHt: 5664.8
KitchenQual_TA: -4049.42
ExterQual_TA: -2345.93
GarageType_BuiltIn: 1808.35
MasVnrArea: 1231.61
Foundation_PConc: 886.91
Exterior1st_CBlock: -285.27
Neighborhood_NWAmes: 8.31
