In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from itertools import combinations
import numpy as np


In [3]:
# Read the raw dataset from CSV
df = pd.read_csv("IBMDataset.csv")

# Response variable for the regression
y = df["MonthlyIncome"]

# Predictors: every numeric column except the target and EmployeeNumber,
# keeping only the columns that contain no missing values
X = (
    df.select_dtypes(include=[np.number])
    .drop(columns=["MonthlyIncome", "EmployeeNumber"])
    .dropna(axis=1)
)


In [4]:
def forward_selection(X, y, max_features=None):
    """Greedy forward feature selection scored by adjusted R² of an OLS fit.

    Starting from an empty model, each round fits one OLS model per remaining
    column (current selection plus that column, passed through
    ``sm.add_constant``) and adds the column with the highest adjusted R².
    Selection stops when no candidate improves on the current adjusted R²,
    when ``max_features`` columns have been chosen, or when no columns remain.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors; columns are selected by label.
    y : array-like
        Response, passed as the first argument to ``sm.OLS``.
    max_features : int, optional
        Upper bound on the number of selected columns. ``None`` (default)
        means no bound beyond the number of columns in ``X``; ``0`` selects
        nothing.

    Returns
    -------
    list
        Selected column labels, in the order they were added.
    """
    remaining_features = list(X.columns)
    selected_features = []
    # Baseline score: a candidate must reach adjusted R² > 0 to enter the model.
    current_score = 0.0

    # Compare against None explicitly: `max_features or ...` would silently
    # turn an explicit 0 into "no limit".
    if max_features is None:
        max_features = len(remaining_features)

    while remaining_features and len(selected_features) < max_features:
        scores_with_candidates = []
        for candidate in remaining_features:
            trial_columns = selected_features + [candidate]
            fitted = sm.OLS(y, sm.add_constant(X[trial_columns])).fit()
            scores_with_candidates.append((fitted.rsquared_adj, candidate))

        # Highest adjusted R² wins; on equal scores the tuple comparison
        # falls back to the larger column label.
        best_new_score, best_candidate = max(scores_with_candidates)

        if best_new_score > current_score:
            remaining_features.remove(best_candidate)
            selected_features.append(best_candidate)
            current_score = best_new_score
            print(f"Added: {best_candidate} | Adjusted R²: {current_score:.4f}")
        else:
            # No remaining candidate improves the fit; stop early.
            break

    return selected_features


In [5]:
# Run the greedy selection over the prepared predictors, capped at 10 features
selected = forward_selection(X=X, y=y, max_features=10)
print("Final selected features:", selected)


Added: JobLevel | Adjusted R²: 0.9030
Added: TotalWorkingYears | Adjusted R²: 0.9052
Added: YearsWithCurrManager | Adjusted R²: 0.9059
Added: DistanceFromHome | Adjusted R²: 0.9063
Added: Age | Adjusted R²: 0.9065
Added: YearsSinceLastPromotion | Adjusted R²: 0.9066
Added: HourlyRate | Adjusted R²: 0.9066
Final selected features: ['JobLevel', 'TotalWorkingYears', 'YearsWithCurrManager', 'DistanceFromHome', 'Age', 'YearsSinceLastPromotion', 'HourlyRate']


In [6]:
# Refit OLS on the selected predictors only (passed through add_constant)
# and print the regression summary table
X_selected = sm.add_constant(X.loc[:, selected])
model = sm.OLS(endog=y, exog=X_selected).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:          MonthlyIncome   R-squared:                       0.907
Model:                            OLS   Adj. R-squared:                  0.907
Method:                 Least Squares   F-statistic:                     2038.
Date:                Mon, 05 May 2025   Prob (F-statistic):               0.00
Time:                        23:38:51   Log-Likelihood:                -12771.
No. Observations:                1470   AIC:                         2.556e+04
Df Residuals:                    1462   BIC:                         2.560e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                   -1