In [220]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from dmba import stepwise_selection
from dmba import AIC_score

In [221]:
# Location of the student-performance dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine where this file exists at this location.
CSV_PATH = "/home/satire/PycharmProjects/Statistics/csv/Student_Performance.csv"

# Load the data, and keep an independent copy (df1) alongside the working frame.
df = pd.read_csv(CSV_PATH)
df1 = df.copy()

In [222]:
# Predictor columns and the regression target.
pred = ['Hours Studied', 'Previous Scores', 'Extracurricular Activities',
        'Sleep Hours', 'Sample Question Papers Practiced']
outcome = 'Performance Index'

# One-hot encode the non-numeric predictors. drop_first=True drops the first
# level of each encoded column, and dtype=int makes the remaining indicator
# columns integer 0/1 directly, so 'Extracurricular Activities_Yes' needs no
# additional element-by-element recoding afterwards.
X = pd.get_dummies(df[pred], drop_first=True, dtype=int)

In [223]:
# Display the encoded predictor frame X as the cell's output.
X

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Extracurricular Activities_Yes
0,7,99,9,1,1
1,4,82,4,2,0
2,8,51,7,2,1
3,5,52,5,2,1
4,7,75,8,5,0
...,...,...,...,...,...
9995,1,49,4,2,1
9996,7,64,8,5,1
9997,6,83,8,5,1
9998,9,97,7,0,1


In [224]:
# Fit an ordinary least squares regression of the outcome on the encoded
# predictors. A column of ones named 'const' is appended so the model has an
# intercept term (it shows up as the 'const' row of the summary).
#
# The model object gets its own name rather than being assigned to `df`:
# overwriting `df` would replace the loaded DataFrame with the model, so this
# cell could not be executed a second time (df[outcome] would no longer work).
ols_model = sm.OLS(df[outcome], X.assign(const=1))
result = ols_model.fit()

# Last expression of the cell: render the regression summary table.
result.summary()

0,1,2,3
Dep. Variable:,Performance Index,R-squared:,0.989
Model:,OLS,Adj. R-squared:,0.989
Method:,Least Squares,F-statistic:,175700.0
Date:,"Tue, 11 Mar 2025",Prob (F-statistic):,0.0
Time:,14:06:55,Log-Likelihood:,-21307.0
No. Observations:,10000,AIC:,42630.0
Df Residuals:,9994,BIC:,42670.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Hours Studied,2.8530,0.008,362.353,0.000,2.838,2.868
Previous Scores,1.0184,0.001,866.450,0.000,1.016,1.021
Sleep Hours,0.4806,0.012,39.972,0.000,0.457,0.504
Sample Question Papers Practiced,0.1938,0.007,27.257,0.000,0.180,0.208
Extracurricular Activities_Yes,0.6129,0.041,15.029,0.000,0.533,0.693
const,-34.0756,0.127,-268.010,0.000,-34.325,-33.826

0,1,2,3
Omnibus:,3.851,Durbin-Watson:,2.001
Prob(Omnibus):,0.146,Jarque-Bera (JB):,4.036
Skew:,0.013,Prob(JB):,0.133
Kurtosis:,3.095,Cond. No.,452.0


In [225]:
# Target series used by train_model/score_model, read from df1 (the copy of the loaded data).
y = df1[outcome]

In [226]:
def train_model(variables):
    """Fit a linear regression on the chosen predictor columns.

    Parameters
    ----------
    variables : sequence of column names of the module-level frame ``X``.

    Returns
    -------
    A ``LinearRegression`` fitted on ``X[variables]`` against the module-level
    target ``y``, or ``None`` when ``variables`` is empty (there is nothing to
    fit for the constant-only case).
    """
    if len(variables) > 0:
        regressor = LinearRegression()
        regressor.fit(X[variables], y)
        return regressor
    return None

def score_model(model, variables):
    """Score a candidate model with ``AIC_score`` (lower is better in the search).

    Predictions are made on the same module-level ``X``/``y`` the model was
    trained on; no separate holdout set is involved here.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``, or ``None`` for the
        constant-only case.
    variables : sequence of column names of ``X`` the model was trained on.

    Returns
    -------
    The value returned by ``AIC_score``. For an empty ``variables`` the
    prediction is the mean of ``y`` repeated for every row and ``df=1`` is
    passed explicitly.
    """
    if len(variables) > 0:
        fitted_values = model.predict(X[variables])
        return AIC_score(y, fitted_values, model)
    # Constant-only baseline: predict the mean of y for every observation.
    baseline = [y.mean()] * len(y)
    return AIC_score(y, baseline, model, df=1)
# Run the stepwise search over every column of X, using train_model to fit each
# candidate and score_model to rate it; verbose=True prints one line per step.
# NOTE(review): the search direction is whatever dmba's stepwise_selection uses
# by default - confirm against its documentation before calling it "forward".
selection = stepwise_selection(X.columns, train_model, score_model, verbose=True)
best_model, best_variables = selection

# Report the intercept and the coefficient of each selected predictor.
print(f'Intercept: {best_model.intercept_:.3f}')
print('Coefficients')
selected_terms = zip(best_variables, best_model.coef_)
for name, coef in selected_terms:
    print(f' {name}: {coef}')

Variables: Hours Studied, Previous Scores, Sleep Hours, Sample Question Papers Practiced, Extracurricular Activities_Yes
Start: score=87493.05, constant
Step: score=69319.90, add Previous Scores
Step: score=44900.79, add Hours Studied
Step: score=43558.25, add Sleep Hours
Step: score=42848.60, add Sample Question Papers Practiced
Step: score=42627.11, add Extracurricular Activities_Yes
Step: score=42627.11, unchanged None
Intercept: -34.076
Coefficients
 Previous Scores: 1.0184341923340543
 Hours Studied: 2.852982053532594
 Sleep Hours: 0.4805597547118866
 Sample Question Papers Practiced: 0.1938021400698875
 Extracurricular Activities_Yes: 0.6128975819601042
