In [143]:
import sklearn
print(sklearn.__version__)

1.7.2


In [144]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set_theme()

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


from sklearn.metrics import confusion_matrix,classification_report

import joblib

In [145]:
data_path = "../Data/student-por.csv"
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


splitting train and test data

In [146]:
X = data.drop('G3', axis=1)
y = data["G3"]

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def fit_preprocessor(data, save_path="../Model/preprocessor.pkl"):
    binary_features = ['school','sex','address','famsize','Pstatus','schoolsup','famsup',
                       'paid','activities','nursery','higher','internet','romantic']
    multi_categorical_features = ['Mjob', 'Fjob', 'reason', 'guardian']
    numeric_cont_features = ['age', 'absences', 'G1', 'G2']

    # Create and store label encoders for binary columns
    label_encoders = {}
    for col in binary_features:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le  # save the encoder for later use

    # Column transformer for numeric + multi-categorical
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cont_features),
            ('cat', OneHotEncoder(drop='first'), multi_categorical_features)
        ],
        remainder='passthrough'
    )

    # Fit column transformer
    preprocessor.fit(data)

    # Save both encoders and preprocessor together
    joblib.dump({
        'label_encoders': label_encoders,
        'preprocessor': preprocessor,
        'columns': data.columns.tolist()
    }, save_path)

    print(f"✅ Full preprocessor (LabelEncoders + ColumnTransformer) saved to {save_path}")
    return preprocessor

# fit_preprocessor(X_train)


✅ Full preprocessor (LabelEncoders + ColumnTransformer) saved to ../Model/preprocessor.pkl


0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [149]:
binary_features = []
multi_categorical_features = []
numeric_cont_features = []

for col in X_train.columns:
    if data[col].dtype == 'O':
        if data[col].value_counts().count() == 2:
            binary_features += [col]
        else:
            multi_categorical_features += [col]
    else:
        if data[col].value_counts().count() > 5:
            numeric_cont_features += [col]


In [150]:
binary_features, multi_categorical_features, numeric_cont_features


(['school',
  'sex',
  'address',
  'famsize',
  'Pstatus',
  'schoolsup',
  'famsup',
  'paid',
  'activities',
  'nursery',
  'higher',
  'internet',
  'romantic'],
 ['Mjob', 'Fjob', 'reason', 'guardian'],
 ['age', 'absences', 'G1', 'G2'])

Label encoding for binary features

In [151]:
le = LabelEncoder()

for col in binary_features:
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

ValueError: invalid literal for int() with base 10: 'MS'

In [None]:
X_train.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2
332,0,0,18,1,0,1,2,2,at_home,at_home,...,0,4,3,3,1,2,2,0,18,18
29,0,1,16,1,0,1,4,4,teacher,teacher,...,1,4,4,5,5,5,5,4,12,11
302,0,1,18,0,0,1,3,2,other,other,...,0,5,3,2,1,1,3,2,10,11
286,0,1,17,0,0,1,2,1,other,other,...,0,4,4,2,2,4,5,0,12,12
554,1,0,17,0,0,1,1,1,at_home,at_home,...,1,3,5,5,2,2,4,3,10,11


One-hot-encoding for multi-categorical features

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cont_features),
        ('cat', OneHotEncoder(drop='first'), multi_categorical_features)
    ],
    remainder='passthrough'
)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [None]:
# Convert back to DataFrame
X_train = pd.DataFrame(X_train, columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns=preprocessor.get_feature_names_out())

In [None]:
X_train.head()

Unnamed: 0,num__age,num__absences,num__G1,num__G2,cat__Mjob_health,cat__Mjob_other,cat__Mjob_services,cat__Mjob_teacher,cat__Fjob_health,cat__Fjob_other,...,remainder__nursery,remainder__higher,remainder__internet,remainder__romantic,remainder__famrel,remainder__freetime,remainder__goout,remainder__Dalc,remainder__Walc,remainder__health
0,0.987932,-0.789616,2.525549,2.272896,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,4.0,3.0,3.0,1.0,2.0,2.0
1,-0.629534,0.065939,0.281097,-0.159337,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,4.0,4.0,5.0,5.0,5.0,5.0
2,0.987932,-0.361839,-0.467054,-0.159337,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,5.0,3.0,2.0,1.0,1.0,3.0
3,0.179199,-0.789616,0.281097,0.188125,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,4.0,4.0,2.0,2.0,4.0,5.0
4,0.179199,-0.14795,-0.467054,-0.159337,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,3.0,5.0,5.0,2.0,2.0,4.0


## Modeling

#### Linear Regression

##### with all columns

In [None]:
def evaluate_regression_model(model, X_train, X_test, y_train, y_test, model_name="Model"):

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    return {
        "model name": model_name,
        "Mean Squared Error (MSE):": mse, # lower = better (best value 0)
        "Mean Absolute Error (MAE):" : mae, #  lower = better (best value 0)
        "R² Score:": r2, # higher = better (best value 1)
    }

In [None]:
lr = LinearRegression()
evaluate_regression_model(lr, X_train, X_test, y_train, y_test, "Linear Regression")

{'model name': 'Linear Regression',
 'Mean Squared Error (MSE):': 1.4759092563639329,
 'Mean Absolute Error (MAE):': 0.7650597682758269,
 'R² Score:': 0.8486513286537313}

In [None]:
ridge = Ridge()
evaluate_regression_model(ridge, X_train, X_test, y_train, y_test, "Ridge")

{'model name': 'Ridge',
 'Mean Squared Error (MSE):': 1.4747604122312319,
 'Mean Absolute Error (MAE):': 0.7639505243128645,
 'R² Score:': 0.8487691380870135}

In [None]:
lasso = Lasso()
evaluate_regression_model(lasso, X_train, X_test, y_train, y_test, "Lasso")

{'model name': 'Lasso',
 'Mean Squared Error (MSE):': 2.0278961980512236,
 'Mean Absolute Error (MAE):': 0.9482828913004037,
 'R² Score:': 0.7920472455336904}

In [None]:
svr = SVR(kernel='rbf', C=1.0, epsilon=0.2)
evaluate_regression_model(svr, X_train, X_test, y_train, y_test, "Support Vector Regressor (SVR)")

{'model name': 'Support Vector Regressor (SVR)',
 'Mean Squared Error (MSE):': 1.7493844997195658,
 'Mean Absolute Error (MAE):': 0.7642981517567519,
 'R² Score:': 0.8206075213874623}

In [None]:
dt = DecisionTreeRegressor()
evaluate_regression_model(dt, X_train, X_test, y_train, y_test, "Decision Tree Regressor")

{'model name': 'Decision Tree Regressor',
 'Mean Squared Error (MSE):': 2.3692307692307693,
 'Mean Absolute Error (MAE):': 0.7538461538461538,
 'R² Score:': 0.7570447319239824}

In [None]:
rf = RandomForestRegressor()
evaluate_regression_model(rf, X_train, X_test, y_train, y_test, "Random Forest Regressor")

{'model name': 'Random Forest Regressor',
 'Mean Squared Error (MSE):': 1.5870615384615387,
 'Mean Absolute Error (MAE):': 0.7684615384615385,
 'R² Score:': 0.8372531006528967}

Till now Ridge model is the best

In [None]:
ridge = Ridge()
ridge.fit(X_train, y_train)


0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [None]:
# save the model
joblib.dump(ridge, "../Model/trained_model.pkl")


['../Model/trained_model.pkl']