In [2]:
import pandas as pd
import statsmodels.api as sm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

# Read the housing CSV, then discard every row that contains a missing value.
house = pd.read_csv("housing_hw.csv")
house = house.dropna()

# The MEDV column is the response; all remaining columns are candidate predictors.
y = house["MEDV"]
X = house.drop("MEDV", axis=1)

# Custom AIC scoring function for feature selection
def aic_scorer(estimator, X, y):
    """Score a feature subset by the negated AIC of a statsmodels OLS fit.

    The signature follows the ``(estimator, X, y)`` scoring-callable form, but
    ``estimator`` is not used here: a separate OLS model is fitted to ``y`` on
    ``X`` after ``sm.add_constant`` has been applied to ``X``.

    Returns
    -------
    The fitted model's AIC multiplied by -1, so a lower AIC gives a higher score.
    """
    design = sm.add_constant(X)
    ols_fit = sm.OLS(y, design).fit()
    # Flip the sign so that "higher is better": a selector that maximises this
    # score is thereby minimising AIC.
    return -ols_fit.aic

# Initialize linear regression model
model = LinearRegression()


def _run_aic_selection(estimator, X, y, label, forward, floating, leading_newline=True):
    """Run one AIC-scored sequential feature search, refit OLS, and print a report.

    The three searches below differ only in their label and in the
    ``forward``/``floating`` flags, so the shared steps live here.

    Parameters
    ----------
    estimator : estimator object handed to the sequential feature selector.
    X : pandas.DataFrame
        Candidate predictors; the selected columns are picked from it by name.
    y : response values, one per row of ``X``.
    label : str
        Text that prefixes every printed report line.
    forward, floating : bool
        Passed straight through to the selector.
    leading_newline : bool, default True
        When True, a newline is printed before the first report line.

    Returns
    -------
    tuple
        ``(selector, X_selected, ols_fit)``: the fitted selector, the columns
        of ``X`` it chose, and a statsmodels OLS fit of ``y`` on those columns
        with ``sm.add_constant`` applied.
    """
    selector = SFS(
        estimator,
        k_features="best",
        forward=forward,
        floating=floating,
        scoring=aic_scorer,
        cv=0,
    )
    selector = selector.fit(X, y)

    chosen = list(selector.k_feature_names_)
    X_selected = X[chosen]
    ols_fit = sm.OLS(y, sm.add_constant(X_selected)).fit()

    lead = "\n" if leading_newline else ""
    print(f"{lead}{label} Features:", chosen)
    # aic_scorer returns the negated AIC, so negate again to report the AIC itself.
    print(f"{label} AIC:", -selector.k_score_)
    print(f"\n{label} Model Summary:\n", ols_fit.summary())
    return selector, X_selected, ols_fit


# Forward Selection using AIC
sfs_forward, X_forward, model_forward = _run_aic_selection(
    model, X, y, "Forward Selection",
    forward=True, floating=False, leading_newline=False,
)

# Backward Elimination using AIC
sfs_backward, X_backward, model_backward = _run_aic_selection(
    model, X, y, "Backward Elimination",
    forward=False, floating=False,
)

# Stepwise (Bidirectional) Selection using AIC
sfs_stepwise, X_stepwise, model_stepwise = _run_aic_selection(
    model, X, y, "Stepwise (Bidirectional) Selection",
    forward=True, floating=True,
)


Forward Selection Features: ['CRIM', 'NOX', 'RM', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']
Forward Selection AIC: 3050.390527574222

Forward Selection Model Summary:
                             OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.722
Model:                            OLS   Adj. R-squared:                  0.718
Method:                 Least Squares   F-statistic:                     161.5
Date:                Thu, 31 Oct 2024   Prob (F-statistic):          5.87e-133
Time:                        18:35:58   Log-Likelihood:                -1516.2
No. Observations:                 506   AIC:                             3050.
Df Residuals:                     497   BIC:                             3088.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t