In [2]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm
df = pd.read_csv('housing.csv')
print(df.head())
print(df.isnull().sum())

      RM  LSTAT  PTRATIO      MEDV
0  6.575   4.98     15.3  504000.0
1  6.421   9.14     17.8  453600.0
2  7.185   4.03     17.8  728700.0
3  6.998   2.94     18.7  701400.0
4  7.147   5.33     18.7  760200.0
RM         0
LSTAT      0
PTRATIO    0
MEDV       0
dtype: int64


In [3]:
X = df[['RM', 'LSTAT', 'PTRATIO']]
y = df['MEDV']

In [7]:
# Create polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
X_poly = pd.DataFrame(X_poly, columns=poly.get_feature_names_out())


In [8]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

In [9]:
# Function to evaluate models
def evaluate_model(X_train, X_test, y_train, y_test):
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    adj_r2_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
    adj_r2_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
    return r2_train, adj_r2_train, r2_test, adj_r2_test


In [10]:
# 1. Keep All Variables
r2_all, adj_r2_all, r2_test_all, adj_r2_test_all = evaluate_model(X_train, X_test, y_train, y_test)


In [11]:

# 2. Backward Elimination
X_train_be = sm.add_constant(X_train)
X_test_be = sm.add_constant(X_test)
model_be = sm.OLS(y_train, X_train_be).fit()
p_values = model_be.pvalues
selected_features = X_train.columns[p_values[1:] < 0.05]
X_train_be = X_train[selected_features]
X_test_be = X_test[selected_features]
r2_be, adj_r2_be, r2_test_be, adj_r2_test_be = evaluate_model(X_train_be, X_test_be, y_train, y_test)


In [12]:
# 3. Forward Selection
selected_features = []
remaining_features = list(X_train.columns)
best_adj_r2 = -np.inf
while remaining_features:
    adj_r2_candidates = {}
    for feature in remaining_features:
        X_train_fs = X_train[selected_features + [feature]]
        X_test_fs = X_test[selected_features + [feature]]
        _, adj_r2_train, _, adj_r2_test = evaluate_model(X_train_fs, X_test_fs, y_train, y_test)
        adj_r2_candidates[feature] = adj_r2_test
    best_feature = max(adj_r2_candidates, key=adj_r2_candidates.get)
    if adj_r2_candidates[best_feature] > best_adj_r2:
        selected_features.append(best_feature)
        best_adj_r2 = adj_r2_candidates[best_feature]
        remaining_features.remove(best_feature)
    else:
        break
X_train_fs = X_train[selected_features]
X_test_fs = X_test[selected_features]
r2_fs, adj_r2_fs, r2_test_fs, adj_r2_test_fs = evaluate_model(X_train_fs, X_test_fs, y_train, y_test)

In [14]:
selected_features = []
remaining_features = list(X_train.columns)
best_adj_r2 = -np.inf
while remaining_features:
    adj_r2_candidates = {}
    for feature in remaining_features:
        X_train_bs = X_train[selected_features + [feature]]
        X_test_bs = X_test[selected_features + [feature]]
        _, adj_r2_train, _, adj_r2_test = evaluate_model(X_train_bs, X_test_bs, y_train, y_test)
        adj_r2_candidates[feature] = adj_r2_test
    best_feature = max(adj_r2_candidates, key=adj_r2_candidates.get)
    if adj_r2_candidates[best_feature] > best_adj_r2:
        selected_features.append(best_feature)
        best_adj_r2 = adj_r2_candidates[best_feature]
        remaining_features.remove(best_feature)
        while len(selected_features) > 1:  # Ensure at least one feature remains
            adj_r2_candidates = {}
            for feature in selected_features:
                temp_features = selected_features.copy()
                temp_features.remove(feature)
                X_train_bs = X_train[temp_features]
                X_test_bs = X_test[temp_features]
                _, adj_r2_train, _, adj_r2_test = evaluate_model(X_train_bs, X_test_bs, y_train, y_test)
                adj_r2_candidates[feature] = adj_r2_test
            worst_feature = min(adj_r2_candidates, key=adj_r2_candidates.get)
            if adj_r2_candidates[worst_feature] < best_adj_r2:
                selected_features.remove(worst_feature)
                best_adj_r2 = adj_r2_candidates[worst_feature]
            else:
                break
    else:
        break
X_train_bs = X_train[selected_features]
X_test_bs = X_test[selected_features]
r2_bs, adj_r2_bs, r2_test_bs, adj_r2_test_bs = evaluate_model(X_train_bs, X_test_bs, y_train, y_test)


In [15]:
# Print results
print("Model Performance:")
print(f"Keep All Variables: R²={r2_all:.4f}, Adjusted R²={adj_r2_all:.4f}")
print(f"Backward Elimination: R²={r2_be:.4f}, Adjusted R²={adj_r2_be:.4f}")
print(f"Forward Selection: R²={r2_fs:.4f}, Adjusted R²={adj_r2_fs:.4f}")
print(f"Bidirectional Selection: R²={r2_bs:.4f}, Adjusted R²={adj_r2_bs:.4f}")


Model Performance:
Keep All Variables: R²=0.8378, Adjusted R²=0.8340
Backward Elimination: R²=0.8370, Adjusted R²=0.8340
Forward Selection: R²=0.8086, Adjusted R²=0.8066
Bidirectional Selection: R²=0.0039, Adjusted R²=0.0014
