In [104]:
import pandas as pd 
import numpy as np 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

df = pd.read_csv("./data/student_lifestyle_dataset.csv")
df = pd.get_dummies(df,columns=["Stress_Level"], drop_first=True)
X = df.drop(labels = ["GPA", "Student_ID"], axis=1)
y = df["GPA"]
print(X.columns)
full_reg = LinearRegression().fit(X,y)

Index(['Study_Hours_Per_Day', 'Extracurricular_Hours_Per_Day',
       'Sleep_Hours_Per_Day', 'Social_Hours_Per_Day',
       'Physical_Activity_Hours_Per_Day', 'Stress_Level_Low',
       'Stress_Level_Moderate'],
      dtype='object')


In [None]:
print(df.var())

In [105]:
#evaluating the full model
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

y_pred = full_reg.predict(X)

n = X.shape[0] 
p = X.shape[1] 

r2 = r2_score(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"R2: {r2}, adjR2: {adj_r2}, MSE: {mse}, RMSE: {rmse}")

R2: 0.5417504784010405, adjR2: 0.5401401638171084, MSE: 0.040858113171951635, RMSE: 0.2021338991162829


In [106]:
#Using BIC as the selection criterion 
def bic(model, X, y):
    y_hat = model.predict(X) 
    residuals = y - y_hat 
    rss  = np.sum(residuals ** 2)
    p = X.shape[1]
    n = X.shape[0]
    return n*np.log(rss/n) + (p + 1)*np.log(n)


In [107]:
from  sklearn.feature_selection import SequentialFeatureSelector

forward_sfs = SequentialFeatureSelector(LinearRegression(),n_features_to_select="auto",direction="forward",scoring=bic)
forward_sfs.fit(X,y)
selected_mask = forward_sfs.get_support()
selected_features = X.columns[selected_mask]
print(selected_features)

backward = SequentialFeatureSelector(LinearRegression(),n_features_to_select="auto",direction="backward",scoring=bic)
backward_sfs.fit(X,y)
selected_mask = backward_sfs.get_support()
selected_features = X.columns[selected_mask]
print(selected_features)

Index(['Extracurricular_Hours_Per_Day', 'Sleep_Hours_Per_Day',
       'Social_Hours_Per_Day'],
      dtype='object')
Index(['Study_Hours_Per_Day', 'Sleep_Hours_Per_Day', 'Social_Hours_Per_Day',
       'Physical_Activity_Hours_Per_Day'],
      dtype='object')


In [86]:
#switching to stats models for the F test 
from statsmodels.stats.diagnostic import compare_encompassing
import statsmodels.formula.api as smf
m0 = smf.ols("GPA ~ Extracurricular_Hours_Per_Day + Sleep_Hours_Per_Day + Social_Hours_Per_Day", data=df).fit()
m1 = smf.ols("GPA ~ Study_Hours_Per_Day + Sleep_Hours_Per_Day + Social_Hours_Per_Day + Physical_Activity_Hours_Per_Day", data=df).fit()
cmp = compare_encompassing(m0, m1)
print(cmp)

ValueError: The exog in results_x and in results_z are nested. Testing encompassing requires that models are non-nested.


In [102]:
#the above failure shows there's some *exact* colinearity 
#this is confirmed by the rank of the predictors matrix becing 1 lower than its column count
#this is because the number of hours, for every row, will sum to 24
#to account for this we remove one of the hours columns from the data set, and refit the models
import numpy as np
import statsmodels.api as sm
X = df[[
    "Extracurricular_Hours_Per_Day",
    "Study_Hours_Per_Day",
    "Sleep_Hours_Per_Day",
    "Social_Hours_Per_Day",
    "Physical_Activity_Hours_Per_Day",
]]
X = sm.add_constant(X)

rank = np.linalg.matrix_rank(X)
n_cols = X.shape[1]
print("rank:", rank, "n_cols:", n_cols)
cond_number = np.linalg.cond(X)
print("condition number:", cond_number)

rank: 5 n_cols: 6
condition number: 1.5579303813916428e+16


In [109]:
from  sklearn.feature_selection import SequentialFeatureSelector
X = df.drop(labels = ["GPA", "Student_ID", "Extracurricular_Hours_Per_Day"], axis=1)
y = df["GPA"]

forward_sfs = SequentialFeatureSelector(LinearRegression(),n_features_to_select="auto",direction="forward",scoring=bic)
forward_sfs.fit(X,y)
selected_mask = forward_sfs.get_support()
selected_features = X.columns[selected_mask]
print(selected_features)

backward = SequentialFeatureSelector(LinearRegression(),n_features_to_select="auto",direction="backward",scoring=bic)
backward_sfs.fit(X,y)
selected_mask = backward_sfs.get_support()
selected_features = X.columns[selected_mask]
print(selected_features)
#since both the forward and backwards selection agree, we can conclude that this is the best model. 
#all of the hourly measures were meaningful, and the only column that was not effectual is the low stress column
#interpretively, we can say that this means that a student being low stress does not strongly correlate one way or the other
#but it matters much more if they are moderatly or high stress, and how they distribute their day

Index(['Sleep_Hours_Per_Day', 'Social_Hours_Per_Day', 'Stress_Level_Moderate'], dtype='object')
Index(['Sleep_Hours_Per_Day', 'Social_Hours_Per_Day', 'Stress_Level_Moderate'], dtype='object')
