In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm

# Set seed for reproducibility
np.random.seed(1)

# Load the Craigslist housing data
housing = pd.read_csv('craigslist.csv')

# Fit initial model with basic features
model1 = sm.OLS.from_formula('price ~ type + sqfeet + beds + baths', data=housing).fit()
# Fit second model with additional features
model2 = sm.OLS.from_formula('price ~ type + sqfeet + beds + baths + comes_furnished + laundry_options + parking_options + smoking_allowed', data=housing).fit()
# Fit final model with all features
model3 = sm.OLS.from_formula('price ~ type + sqfeet + beds + baths + comes_furnished + laundry_options + parking_options + smoking_allowed + cats_allowed + dogs_allowed', data=housing).fit()

# Print R-squared and Adjusted R-squared for model comparison
print(f"Model 1 R-squared: {model1.rsquared.round(4)}, Adjusted R-squared: {model1.rsquared_adj.round(4)}")
print(f"Model 2 R-squared: {model2.rsquared.round(4)}, Adjusted R-squared: {model2.rsquared_adj.round(4)}")
print(f"Model 3 R-squared: {model3.rsquared.round(4)}, Adjusted R-squared: {model3.rsquared_adj.round(4)}")

# Run an F-test to compare model2 and model3
anova_results = anova_lm(model2, model3)
print("\nANOVA Results Comparing Model 2 and Model 3:")
print(anova_results)

# Log-Likelihood, AIC, and BIC for all models
print("\nModel Log-Likelihoods, AICs, and BICs:")
for i, model in enumerate([model1, model2, model3], 1):
    print(f"Model {i}: Log-Likelihood = {model.llf:.2f}, AIC = {model.aic:.2f}, BIC = {model.bic:.2f}")

# Split data into training and test sets
indices = range(len(housing))
train_size = int(0.8 * len(indices))
train_ind = np.random.choice(indices, size=train_size, replace=False)
test_ind = list(set(indices) - set(train_ind))

# Create training and testing datasets
housing_train = housing.iloc[train_ind]
housing_test = housing.iloc[test_ind]

# Fit models using training data
model2_train = sm.OLS.from_formula('price ~ type + sqfeet + beds + baths + comes_furnished + laundry_options + parking_options + smoking_allowed', data=housing_train).fit()
model3_train = sm.OLS.from_formula('price ~ type + sqfeet + beds + baths + comes_furnished + laundry_options + parking_options + smoking_allowed + cats_allowed + dogs_allowed', data=housing_train).fit()

# Predict prices on test set
predicted_mod2 = model2_train.predict(housing_test)
predicted_mod3 = model3_train.predict(housing_test)

# Calculate PRMSE for both models
prmse2 = np.mean((housing_test.price - predicted_mod2) ** 2) ** 0.5
prmse3 = np.mean((housing_test.price - predicted_mod3) ** 2) ** 0.5

# Print PRMSE for both models
print(f"\nPredictive RMSE for Model 2: {prmse2:.2f}")
print(f"Predictive RMSE for Model 3: {prmse3:.2f}")

Model 1 R-squared: 0.1278, Adjusted R-squared: 0.1257
Model 2 R-squared: 0.2819, Adjusted R-squared: 0.2763
Model 3 R-squared: 0.2838, Adjusted R-squared: 0.2777

ANOVA Results Comparing Model 2 and Model 3:
   df_resid           ssr  df_diff       ss_diff         F    Pr(>F)
0    3064.0  5.270049e+08      0.0           NaN       NaN       NaN
1    3062.0  5.256050e+08      2.0  1.399862e+06  4.077564  0.017041

Model Log-Likelihoods, AICs, and BICs:
Model 1: Log-Likelihood = -37528.12, AIC = 75082.25, BIC = 75166.97
Model 2: Log-Likelihood = -22989.87, AIC = 46029.75, BIC = 46180.64
Model 3: Log-Likelihood = -22985.77, AIC = 46025.53, BIC = 46188.49

Predictive RMSE for Model 2: 410.81
Predictive RMSE for Model 3: 409.93
