In [1]:
# Implementation of https://www.jstor.org/stable/25734098
# Bayes Bayes Bayes

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pymc as pm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the dataset
df_raw = pd.read_csv('./data/train_with_dummies.csv', index_col=[0])

# Specify prefixes of columns to drop
prefixes_to_drop = ['Id', 'SaleType', 'SaleCondition', 'SalePrice']

# Drop specified columns before imputation
df_filtered = df_raw.drop([col for col in df_raw.columns if any(col.startswith(prefix) for prefix in prefixes_to_drop)], axis=1)

# Impute missing values in the filtered dataset
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df_filtered), columns=df_filtered.columns)

# Extract the SalePrice column from the original dataset for use as the target variable
sale_price_col = df_raw['SalePrice']
sale_price_mean = np.mean(sale_price_col)

# Scale the imputed dataset
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_imputed)
df_scaled = pd.DataFrame(scaled_data, columns=df_imputed.columns)

# Define data_x and data_y for model input
data_x = df_scaled

selected_columns = [
    'OverallQual',
    'MasVnrArea',
    'TotalBsmtSF',
    'GrLivArea',
    'GarageCars',
    'Neighborhood_NWAmes',
    'Neighborhood_NoRidge',
    'Neighborhood_NridgHt',
    'Exterior1st_CBlock',
    'ExterQual_TA',
    'Foundation_PConc',
    'BsmtFinType1_GLQ',
    'KitchenQual_Ex',
    'KitchenQual_TA',
    'GarageType_BuiltIn'
]

# Select only the specified columns for model input
data_x_selected = df_scaled[selected_columns]

data_y = sale_price_col.reset_index(drop=True)  # Reset index to ensure alignmen

In [13]:
data_x_selected.sample(50)

Unnamed: 0,OverallQual,MasVnrArea,TotalBsmtSF,GrLivArea,GarageCars,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Exterior1st_CBlock,ExterQual_TA,Foundation_PConc,BsmtFinType1_GLQ,KitchenQual_Ex,KitchenQual_TA,GarageType_BuiltIn
679,-0.795151,-0.258634,-0.158314,-1.004118,0.311725,-0.229416,-0.169981,-0.235958,-0.02618,0.781971,-0.892086,-0.633365,-0.271163,0.993174,-0.253259
1133,1.374795,-0.57441,0.160916,0.960472,0.311725,-0.229416,-0.169981,-0.235958,-0.02618,-1.278819,1.120968,1.578868,-0.271163,-1.006873,-0.253259
163,-1.518467,-0.57441,-0.400017,-1.205908,-2.36544,-0.229416,-0.169981,-0.235958,-0.02618,0.781971,-0.892086,-0.633365,-0.271163,0.993174,-0.253259
821,-1.518467,-0.57441,-0.276885,-1.103109,0.311725,-0.229416,-0.169981,-0.235958,-0.02618,-1.278819,-0.892086,-0.633365,-0.271163,0.993174,-0.253259
150,-0.795151,-0.57441,-0.201638,-1.040288,0.311725,-0.229416,-0.169981,-0.235958,-0.02618,0.781971,-0.892086,-0.633365,-0.271163,0.993174,-0.253259
478,1.374795,1.28701,1.4766,0.385563,1.650307,-0.229416,-0.169981,-0.235958,-0.02618,-1.278819,1.120968,1.578868,-0.271163,-1.006873,-0.253259
1363,-0.071836,-0.57441,-1.006554,-0.196961,0.311725,-0.229416,-0.169981,-0.235958,-0.02618,0.781971,1.120968,-0.633365,-0.271163,-1.006873,3.948533
61,-0.795151,-0.57441,-1.202653,-0.769966,-1.026858,-0.229416,-0.169981,-0.235958,-0.02618,0.781971,-0.892086,-0.633365,-0.271163,-1.006873,-0.253259
523,2.821425,3.647026,4.744149,6.016627,1.650307,-0.229416,-0.169981,-0.235958,-0.02618,-1.278819,1.120968,1.578868,3.687818,-1.006873,3.948533
1452,-0.795151,-0.131215,-1.163889,-0.84421,0.311725,-0.229416,-0.169981,-0.235958,-0.02618,0.781971,1.120968,1.578868,-0.271163,0.993174,-0.253259


In [3]:
X_train, X_test, y_train, y_test = train_test_split(data_x_selected, data_y, test_size=0.2, random_state=42)

In [4]:
n_features = X_train.shape[1]
with pm.Model() as model:
    # Priors
    lambda_ = pm.HalfCauchy('lambda_', beta=1, shape=n_features)
    tau = pm.HalfCauchy('tau', beta=1)
    beta = pm.Normal('beta', mu=0, sigma=tau * lambda_, shape=n_features)
    
    # Likelihood
    y_obs = pm.Normal('y_obs', mu=pm.math.dot(X_train, beta), sigma=0.1, observed=y_train)
    
    # Inference
    trace = pm.sample(1000, tune=1000, return_inferencedata=True, chains=1)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (1 chains in 1 job)
NUTS: [lambda_, tau, beta]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 350 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [6]:
# Access beta samples from the posterior for predictions
beta_samples = trace.posterior['beta'].values
mean_beta = np.mean(beta_samples, axis=(0, 1))  # Average over both chain and draw dimensions

# Predict y values for X_test using mean_beta
y_pred = np.dot(X_test, mean_beta)

# (Optional) Evaluate predictions
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Squared Error: {mse}')

Root Mean Squared Error: 187910.01890554023
Mean Squared Error: 35310175205.08048


In [10]:
# Assuming selected_columns is a list of your feature names corresponding to X_train
selected_columns = [
    'OverallQual',
    'MasVnrArea',
    'TotalBsmtSF',
    'GrLivArea',
    'GarageCars',
    'Neighborhood_NWAmes',
    'Neighborhood_NoRidge',
    'Neighborhood_NridgHt',
    'Exterior1st_CBlock',
    'ExterQual_TA',
    'Foundation_PConc',
    'BsmtFinType1_GLQ',
    'KitchenQual_Ex',
    'KitchenQual_TA',
    'GarageType_BuiltIn'
]

# Create a dictionary mapping feature names to their mean beta coefficient
feature_coefficients = dict(zip(selected_columns, mean_beta))

# Display the feature coefficients
for feature, coef in feature_coefficients.items():
    print(f"{feature}: {coef}")

OverallQual: 24969.06093854829
MasVnrArea: -1175.9659846532697
TotalBsmtSF: 10721.137333683522
GrLivArea: 23714.85887592559
GarageCars: 14908.437961401427
Neighborhood_NWAmes: 3769.4287079693327
Neighborhood_NoRidge: 6705.42211564125
Neighborhood_NridgHt: 5683.245784851849
Exterior1st_CBlock: 926.5552740226532
ExterQual_TA: 1158.3201567874223
Foundation_PConc: 2465.282408967908
BsmtFinType1_GLQ: 3387.2402332081506
KitchenQual_Ex: 6922.883090863846
KitchenQual_TA: -3168.0256266280926
GarageType_BuiltIn: 3735.534268353591
