In [46]:
# Implementation of https://www.jstor.org/stable/25734098
# Bayes Bayes Bayes

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pymc as pm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [39]:
# Load the dataset
df_raw = pd.read_csv('./data/train_with_dummies.csv', index_col=[0])

# Specify prefixes of columns to drop
prefixes_to_drop = ['Id', 'SaleType', 'SaleCondition', 'SalePrice']

# Drop specified columns before imputation
df_filtered = df_raw.drop([col for col in df_raw.columns if any(col.startswith(prefix) for prefix in prefixes_to_drop)], axis=1)

# Impute missing values in the filtered dataset
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df_filtered), columns=df_filtered.columns)

# Extract the SalePrice column from the original dataset for use as the target variable
sale_price_col = df_raw['SalePrice']
sale_price_mean = np.mean(sale_price_col)

# Scale the imputed dataset
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_imputed)
df_scaled = pd.DataFrame(scaled_data, columns=df_imputed.columns)

# Define data_x and data_y for model input
data_x = df_scaled

selected_columns = [
    'OverallQual',
    'MasVnrArea',
    'TotalBsmtSF',
    'GrLivArea',
    'GarageCars',
    'Neighborhood_NWAmes',
    'Neighborhood_NoRidge',
    'Neighborhood_NridgHt',
    'Exterior1st_CBlock',
    'ExterQual_TA',
    'Foundation_PConc',
    'BsmtFinType1_GLQ',
    'KitchenQual_Ex',
    'KitchenQual_TA',
    'GarageType_BuiltIn'
]

# Select only the specified columns for model input
data_x_selected = df_scaled[selected_columns]

data_y = sale_price_col.reset_index(drop=True)  # Reset index to ensure alignmen

In [40]:
X_train, X_test, y_train, y_test = train_test_split(data_x_selected, data_y, test_size=0.2, random_state=42)

In [45]:
n_features = X_train.shape[1]
with pm.Model() as model:
    # Priors
    lambda_ = pm.HalfCauchy('lambda_', beta=1, shape=n_features)
    tau = pm.HalfCauchy('tau', beta=1)
    beta = pm.Normal('beta', mu=0, sigma=tau * lambda_, shape=n_features)
    
    # Likelihood
    y_obs = pm.Normal('y_obs', mu=pm.math.dot(X_train, beta), sigma=0.1, observed=y_train)
    
    # Inference
    trace = pm.sample(1000, tune=1000, return_inferencedata=True, chains=1)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (1 chains in 1 job)
NUTS: [lambda_, tau, beta]


Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 339 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [47]:
# Access beta samples from the posterior for predictions
beta_samples = trace.posterior['beta'].values
mean_beta = np.mean(beta_samples, axis=(0, 1))  # Average over both chain and draw dimensions

# Predict y values for X_test using mean_beta
y_pred = np.dot(X_test, mean_beta)

# (Optional) Evaluate predictions
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 35310175430.63019


In [48]:
print(y_pred)

[ -41874.48672313  141376.2917222   -50524.80572607   -3849.64784612
  126717.44606483 -135706.24383148   24183.5402941   -16882.39799472
 -135706.24383148  -53733.90113905  -17683.29095658  -80953.62948342
  -77913.16702793   61665.82542341     331.1571015   -65047.65899668
   21641.66118796  -65939.82982589  -53044.96798066   32152.72295971
  -12893.42930797   32073.15363207   -2434.50510444  -76394.47347484
   30535.90065415   -3669.92448709   24203.0475267   -79509.45923626
   -3502.44623502   17544.78978648  -47770.54055701  106891.59193825
   50639.08429749  -93341.53404895   81146.01288856  -44415.5856971
  -26044.37295854   46898.93722633  156030.17740118  -96751.48559997
  -25513.75070746   49381.91031326  -75420.1883783   154420.70131728
  -53190.89248667    7277.52798578  -78064.67649746  -75072.22941526
  216632.38846747  -52537.38352016  -75513.70676736   16833.58081803
  -55550.36701594  105644.31774353  -29197.64842072   51622.21381479
   20787.47388624   12785.90771831 

In [50]:
print(y_test.values)

[154500 325000 115000 159000 315500  75500 311500 146000  84500 135500
 145000 130000  81000 214000 181000 134500 183500 135000 118400 226000
 155000 210000 173500 129000 192000 153900 181134 141000 181000 208900
 127000 284000 200500 135750 255000 140000 138000 219500 310000  97000
 114500 205000 119500 253293 128500 117500 115000 127000 451950 144000
 119000 196000 115000 287000 144500 260000 213000 175000 107000 107500
  68500 154000 317000 264132 283463 243000 109000 305000  93500 176000
 118858 134000 109008  93500 611657 173000 348000 341000 141000 124900
 118000  67000 113000  91300 149500 133000 266000 190000 155900 155835
 153500 152000 124500 301000 136500 169990 205000 183900 204900 260000
 163500 224900 244000 132000 194000 156500 156000 275000 145000 135000
  60000 124000 127000 137500 213500 119000 107900 123000 112000 284000
 133000 149000 169000 207000 175000 137000 236000  79500 144000 162900
 185900 369900 197900 104000  35311 337500 367294 130250 230000 755000
 40300