In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))


False
0


AssertionError: Torch not compiled with CUDA enabled

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer, PolynomialFeatures
from category_encoders import BinaryEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor

KeyboardInterrupt: 

In [None]:
# Load the unpreprocessed data
with open('../data/unprocessed_data.pkl', 'rb') as f:
    X_train, y_train, X_test, y_test = pickle.load(f)
    
X_train

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender
2275,18,95,1,0,0,3,96,2,1,2,2,0,0,2,4,1,1,1,1
4603,16,89,0,2,1,4,58,1,1,2,1,2,1,1,3,0,0,1,1
2202,16,69,2,0,1,2,55,2,1,1,2,0,1,0,1,0,1,1,1
471,11,65,2,0,0,3,78,2,1,2,1,2,1,1,3,0,2,2,0
4060,21,95,2,0,1,4,57,1,0,1,1,2,1,2,3,0,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,30,70,0,1,0,2,84,0,1,2,2,2,1,2,3,0,2,2,1
5191,24,90,1,1,1,5,97,0,1,3,1,2,1,2,4,0,1,2,1
5226,24,65,1,0,1,3,52,2,1,1,0,2,1,1,1,0,2,1,0
5390,30,91,2,0,1,3,95,1,1,1,2,0,1,2,3,0,0,2,0


In [None]:
# Load the preprocessed data
with open('../data/processed_data.pkl', 'rb') as f:
    X_train_prep, y_train_log, X_test_prep, y_test_log = pickle.load(f)

X_train_prep

array([[-0.32767187,  1.29777176, -0.26517275, ...,  0.15444734,
        -0.74815373,  0.84930709],
       [-0.66082888,  0.77858843, -1.42150727, ..., -1.26791787,
        -0.74815373,  0.84930709],
       [-0.66082888, -0.95202267,  0.89116176, ...,  0.15444734,
        -0.74815373,  0.84930709],
       ...,
       [ 0.67179916, -1.29814488, -0.26517275, ...,  1.57681254,
        -0.74815373, -1.17743041],
       [ 1.67127019,  0.95164954,  0.89116176, ..., -1.26791787,
         0.74464265, -1.17743041],
       [ 1.33811318,  1.21124121,  0.89116176, ...,  1.57681254,
        -2.24095011,  0.84930709]])

# Modeling

## Liner Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_prep, y_train_log)   # Learn Weights from training data Using OLS (Normal Equation)


lr_scores = cross_val_score(lr, X_train_prep, y_train_log, cv=5, scoring='r2')
print(f"Linear Regression: {np.mean(lr_scores)}")

Linear Regression: 0.6471715480797441


In [None]:
scores_dict = {'Linear Regression': round(np.mean(lr_scores), 2)}
scores_dict

{'Linear Regression': 0.65}

## Polynomial Regression

In [None]:
#  Preprocessing & Base Model

num_cols=['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Internet_Access',
       'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
       'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities',
       'Parental_Education_Level', 'Distance_from_Home', 'Gender']


imputer = SimpleImputer(strategy="median")
scaler  = RobustScaler()

poly = PolynomialFeatures(degree=2, include_bias=False)

log_transformer = FunctionTransformer(np.log1p, feature_names_out="one-to-one" )  

num_pipeline = Pipeline(steps=[ ('imputer', imputer),
                                ('poly', poly),
                                ('scaler', scaler)
                              ])
log_pipeline = Pipeline(steps=[ ('imputer', imputer),
                              ('log_transform', log_transformer),
                              ('poly', poly),
                              ('scaler', scaler)
                            ])
transformer = ColumnTransformer(transformers=[
                                                ('num', num_pipeline, num_cols),
                                                ('log_transform', log_pipeline, num_cols)
                                            ], remainder='passthrough'
                                )


# Linear Regression
lr = LinearRegression()
pipe = Pipeline(steps=[ ('preprocessor', transformer),
                        ('model', lr)
                      ])


poly2_scores = cross_val_score(pipe, X_train, y_train_log, cv=5, scoring='r2')
print("Cross-validated R²: %.2f (± %.2f)" % (poly2_scores.mean(), poly2_scores.std() * 2))

Cross-validated R²: 0.69 (± 0.09)


In [None]:
scores_dict.update({'Polynomial Regression (2)': round(np.mean(poly2_scores), 2)})
scores_dict

{'Linear Regression': 0.65, 'Polynomial Regression (2)': 0.69}

## Ridge Regression 

In [None]:
# Grid Search with Ridge Regression (tuning alpha and polynomial degree)
ridge = Ridge()
pipe_ridge = Pipeline(steps=[ ('preprocessor', transformer),
                              ('model', ridge)
                            ])
param_grid = {
    'preprocessor__num__poly__degree': [1, 2, 3, 4],
    'model__alpha': [0.01, 0.1, 1.0, 10.0]
}
ridge_search = GridSearchCV(pipe_ridge, param_grid, cv=5, scoring='r2')
ridge_search.fit(X_train, y_train_log)
print("Best parameters:", ridge_search.best_params_)
print("Best cross-validated R²: %.2f" % ridge_search.best_score_)

  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos",

Best parameters: {'model__alpha': 10.0, 'preprocessor__num__poly__degree': 1}
Best cross-validated R²: 0.71


In [None]:
scores_dict.update({'Ridge Regression (3)': round(ridge_search.best_score_ , 2)})
scores_dict

{'Linear Regression': 0.65,
 'Polynomial Regression (2)': 0.69,
 'Ridge Regression (3)': 0.71}

## Descion Tree

In [None]:
# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
pipe_dt = Pipeline(steps=[ ('preprocessor', transformer),
                           ('model', dt)
                         ])
param_grid_dt = {
    'preprocessor__num__poly__degree': [1],
    'model__max_depth': [5, 10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}
grid_search_dt = GridSearchCV(pipe_dt, param_grid_dt, cv=5, scoring='r2')
grid_search_dt.fit(X_train, y_train_log)
print("Best parameters:", grid_search_dt.best_params_)
print("Best cross-validated R²: %.2f" % grid_search_dt.best_score_)

Best parameters: {'model__max_depth': 5, 'model__min_samples_leaf': 4, 'model__min_samples_split': 2, 'preprocessor__num__poly__degree': 1}
Best cross-validated R²: 0.49


In [None]:
scores_dict.update({'Decision Tree Regressor': round(grid_search_dt.best_score_, 2)})
scores_dict

{'Linear Regression': 0.65,
 'Polynomial Regression (2)': 0.69,
 'Ridge Regression (3)': 0.71,
 'Decision Tree Regressor': 0.49}

## GridSearch with random forest regressor

In [None]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
pipe_rf = Pipeline(steps=[ ('preprocessor', transformer),
                           ('model', rf)
                         ])
param_grid_rf = {
    'preprocessor__num__poly__degree': [1],
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}
grid_search_rf = GridSearchCV(pipe_rf, param_grid_dt, cv=5, scoring='r2')
grid_search_rf.fit(X_train, y_train_log)
print("Best parameters:", grid_search_rf.best_params_)
print("Best cross-validated R²: %.2f" % grid_search_rf.best_score_)

Best parameters: {'model__max_depth': 20, 'model__min_samples_leaf': 4, 'model__min_samples_split': 2, 'preprocessor__num__poly__degree': 1}
Best cross-validated R²: 0.65


In [None]:
scores_dict.update({'RandomForest Regressor': round(grid_search_rf.best_score_, 2)})
scores_dict

{'Linear Regression': 0.65,
 'Polynomial Regression (2)': 0.69,
 'Ridge Regression (3)': 0.71,
 'Decision Tree Regressor': 0.49,
 'RandomForest Regressor': 0.65}

## XGBOOST

In [None]:
xgb = XGBRegressor(random_state=42, n_jobs=-1)

# Pipeline with preprocessing + model
pipe_xgb = Pipeline(steps=[
    ('preprocessor', transformer),
    ('model', xgb)
])

# Hyperparameter grid
param_grid_xgb = {
    # 'preprocessor__num__poly__degree': [1],   
    # 'model__n_estimators': [100, 200, 300, 500],
    # 'model__max_depth': [3, 5, 7],
    # 'model__learning_rate': [0.01, 0.05, 0.1],
    # 'model__subsample': [0.8, 1.0],
    # 'model__colsample_bytree': [0.8, 1.0],
    # 'model__gamma': [0, 0.1, 0.2]
    'preprocessor__num__poly__degree': [1],
    'model__n_estimators': [200],
    'model__max_depth': [3],
    'model__learning_rate': [0.05],
    'model__subsample': [0.8],
    'model__colsample_bytree': [0.8],
    'model__gamma': [0]
}

# Grid search with 5-fold CV
grid_search_xgb = RandomizedSearchCV(
    pipe_xgb,
    param_grid_xgb,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

grid_search_xgb.fit(X_train, y_train_log)

print("Best parameters:", grid_search_xgb.best_params_)
print("Best cross-validated R²: %.2f" % grid_search_xgb.best_score_)




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters: {'preprocessor__num__poly__degree': 1, 'model__subsample': 0.8, 'model__n_estimators': 200, 'model__max_depth': 3, 'model__learning_rate': 0.05, 'model__gamma': 0, 'model__colsample_bytree': 0.8}
Best cross-validated R²: 0.69


In [None]:
scores_dict.update({'XGB Regressor': round(grid_search_xgb.best_score_, 2)})
scores_dict

{'Linear Regression': 0.65,
 'Polynomial Regression (2)': 0.69,
 'Ridge Regression (3)': 0.71,
 'Decision Tree Regressor': 0.49,
 'RandomForest Regressor': 0.65,
 'XGB Regressor': 0.69}

# Best Model

In [None]:
scores_df = pd.DataFrame(list(scores_dict.items()), columns=['Model', 'CV Avg R² Score'])
scores_df = scores_df.sort_values(by='CV Avg R² Score', ascending=False).reset_index(drop=True)
scores_df

Unnamed: 0,Model,CV Avg R² Score
0,Ridge Regression (3),0.71
1,Polynomial Regression (2),0.69
2,XGB Regressor,0.69
3,Linear Regression,0.65
4,RandomForest Regressor,0.65
5,Decision Tree Regressor,0.49


In [None]:
best_model = ridge_search.best_estimator_
y_pred_log = best_model.predict(X_test)

print('Testing R2 Score: ', round(best_model.score(X_test, y_test_log), 2))

Testing R2 Score:  0.73


In [None]:
best_model = ridge_search.best_estimator_
y_pred_log = best_model.predict(X_test)

print('Testing R2 Score: ', round(best_model.score(X_test, y_test_log), 2))

Testing R2 Score:  0.73


In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_absolute_percentage_error

y_pred = cross_val_predict(pipe_ridge, X_test, y_test_log, cv=5)
mape = mean_absolute_percentage_error(y_test_log, y_pred)
accuracy = 1 - mape
print("Approximate Accuracy (1 - MAPE): %.2f%%" % (accuracy * 100))

# from sklearn.model_selection import cross_val_predict
# from sklearn.metrics import mean_absolute_percentage_error

# y_pred = cross_val_predict(pipe, X_train, y_train, cv=5)
# mape = mean_absolute_percentage_error(y_test_log, y_pred)
# accuracy = 1 - mape
# print("Approximate Accuracy (1 - MAPE): %.2f%%" % (accuracy * 100))

Approximate Accuracy (1 - MAPE): 98.53%


# Final Evaluation

# Best Model Saving

In [None]:
# Create or open the models directory if it doesn't exist
import os
os.makedirs('../models', exist_ok=True)

# Save the best model
with open('../models/ridge_best_model.pkl', 'wb') as f:
    pickle.dump(ridge_search.best_estimator_, f)

print('Best Model Saved Successfully')

Best Model Saved Successfully
