In [3]:
import numpy as np
import pandas as pd 
# Load the student performance data set from the working directory.
df = pd.read_csv('StudentsPerformance.csv')

# Quick sanity checks: the first rows, then the missing-value count per column.
first_rows = df.head()
print(first_rows)
missing_per_column = df.isna().sum()
print(missing_per_column)

   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course    

In [None]:
#preprocessing and train-test
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

target = 'math score'

# Name the two column groups explicitly instead of slicing `features` by
# position, so adding or reordering a column cannot silently change which
# columns get one-hot encoded.
categorical_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
numeric_features = ['reading score', 'writing score']
features = categorical_features + numeric_features

# One-hot encode the categorical columns; drop='first' removes one level per
# variable so the dummies are not collinear with the intercept added below.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = pd.DataFrame(
    encoder.fit_transform(df[categorical_features]),
    columns=encoder.get_feature_names_out(categorical_features),
    index=df.index,  # keep df's row labels so the axis=1 concat aligns row-for-row
)

X = pd.concat([encoded_features, df[numeric_features]], axis=1)
y = df[target]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline: ordinary least squares on every feature plus an intercept.
X_train_const = sm.add_constant(X_train)
model_all = sm.OLS(y_train, X_train_const).fit()


In [6]:
# Backward Elimination Function
def backward_elimination(X, y, significance_level=0.05):
    """Iteratively drop the least significant feature from an OLS model.

    Starting from all columns of ``X`` (plus an intercept), the feature with
    the largest p-value is removed and the model is refitted, until every
    remaining feature has a p-value at or below ``significance_level``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, without an intercept column.
    y : array-like
        Response variable passed to ``sm.OLS``.
    significance_level : float, default 0.05
        A feature is removed while its p-value is strictly greater than this.

    Returns
    -------
    (pandas.DataFrame, fitted OLS results)
        ``X`` restricted to the surviving columns, and the model fitted on
        those columns plus an intercept.
    """
    model = sm.OLS(y, sm.add_constant(X)).fit()
    while True:
        # Exclude the intercept by name rather than by position: add_constant
        # does not prepend 'const' when X already contains a constant column,
        # and slicing off element 0 would then hide a real feature.
        p_values = model.pvalues.drop('const', errors='ignore')
        max_p_value = p_values.max()
        # The comparison is False for NaN (no features left), which ends the loop.
        if not max_p_value > significance_level:
            break
        excluded_feature = p_values.idxmax()
        X = X.drop(columns=[excluded_feature])
        model = sm.OLS(y, sm.add_constant(X)).fit()
    return X, model

# Apply backward elimination to the training split using the default 0.05 threshold.
X_train_be, model_be = backward_elimination(X_train, y_train)

In [7]:

# Forward Selection Function
def forward_selection(X, y, significance_level=0.05):
    """Greedily add features to an OLS model, one at a time.

    In each round every remaining column is tried alongside the columns
    already selected (plus an intercept). The candidate with the smallest
    p-value is added, provided that p-value is strictly below
    ``significance_level``. The search stops when no candidate qualifies or
    no columns remain.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, without an intercept column.
    y : array-like
        Response variable passed to ``sm.OLS``.
    significance_level : float, default 0.05
        Entry threshold for a candidate's p-value.

    Returns
    -------
    (pandas.DataFrame, fitted OLS results)
        ``X`` restricted to the selected columns (in selection order), and
        the model fitted on those columns plus an intercept.
    """
    selected_features = []
    remaining_features = list(X.columns)
    while remaining_features:
        best_p_value = float('inf')
        best_feature = None
        for feature in remaining_features:
            candidate_design = sm.add_constant(X[selected_features + [feature]])
            p_value = sm.OLS(y, candidate_design).fit().pvalues[feature]
            if p_value < significance_level and p_value < best_p_value:
                best_p_value = p_value
                best_feature = feature
        # Compare against None explicitly: a falsy column label such as 0 or ''
        # is a valid feature and must not be mistaken for "nothing qualified".
        if best_feature is None:
            break
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
    X_selected = X[selected_features]
    return X_selected, sm.OLS(y, sm.add_constant(X_selected)).fit()

# Apply forward selection to the training split using the default 0.05 threshold.
X_train_fs, model_fs = forward_selection(X_train, y_train)


In [10]:

# Combined Method (Forward + Backward)
# Run backward elimination on the columns that forward selection kept.
X_train_combined, model_combined = backward_elimination(X_train_fs, y_train)

# Collect the summary table of each fitted model under a readable label.
results = {
    "No Elimination (All Variables)": model_all.summary(),
    "Backward Elimination": model_be.summary(),
    "Forward Selection": model_fs.summary(),
    "Combined (Forward + Backward)": model_combined.summary()
}

# Print each summary under its own heading. Printing the dict itself shows
# only the repr of each Summary object (class name plus a quoted blob), which
# is hard to read.
for title, summary in results.items():
    print(f"=== {title} ===")
    print(summary)
    print()

{'No Elimination (All Variables)': <class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results                            
Dep. Variable:             math score   R-squared:                       0.874
Model:                            OLS   Adj. R-squared:                  0.872
Method:                 Least Squares   F-statistic:                     390.1
Date:                Mon, 24 Feb 2025   Prob (F-statistic):               0.00
Time:                        20:27:31   Log-Likelihood:                -2472.8
No. Observations:                 800   AIC:                             4976.
Df Residuals:                     785   BIC:                             5046.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------