In [1]:
#import libraries 
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [3]:
#read the data
df = pd.read_csv("Student_performance_data _.csv")  # change filename
df.head()


Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [4]:
#describe the information
df.info()
df.describe()
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64

In [18]:
#drop if any null values are present 
df = df.dropna()   # simple & safe
print(df)

      StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
0          1001   17       1          0                  2        19.833723   
1          1002   18       0          0                  1        15.408756   
2          1003   15       0          2                  3         4.210570   
3          1004   17       1          0                  3        10.028829   
4          1005   17       1          0                  2         4.672495   
...         ...  ...     ...        ...                ...              ...   
2387       3388   18       1          0                  3        10.680555   
2388       3389   17       0          0                  1         7.583217   
2389       3390   16       1          0                  2         6.805500   
2390       3391   16       1          1                  0        12.416653   
2391       3392   16       1          0                  2        17.819907   

      Absences  Tutoring  ParentalSupport  Extracur

In [7]:
#set a target value or variable 
X = df.drop('Age', axis=1)
y = df['Age']


In [8]:
#train and test the data available 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [9]:
#evaluate the model 
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return rmse, mae, r2


In [10]:
#MULTIPLE LINEAR REGRESSION (BASELINE)
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

lr_pipeline.fit(X_train, y_train)
lr_results = evaluate_model(lr_pipeline, X_test, y_test)


In [11]:
#RIDGE REGRESSION (REGULARIZATION)
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Ridge(alpha=1.0))
])

ridge_pipeline.fit(X_train, y_train)
ridge_results = evaluate_model(ridge_pipeline, X_test, y_test)


In [12]:
#LASSO REGRESSION
lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Lasso(alpha=0.01))
])

lasso_pipeline.fit(X_train, y_train)
lasso_results = evaluate_model(lasso_pipeline, X_test, y_test)


In [13]:
#POLYNOMIAL REGRESSION + RIDGE
poly_ridge_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('scaler', StandardScaler()),
    ('model', Ridge(alpha=1.0))
])

poly_ridge_pipeline.fit(X_train, y_train)
poly_ridge_results = evaluate_model(poly_ridge_pipeline, X_test, y_test)


In [14]:
#DECISION TREE REGRESSOR
dt_model = DecisionTreeRegressor(
    max_depth=5,
    random_state=42
)

dt_model.fit(X_train, y_train)
dt_results = evaluate_model(dt_model, X_test, y_test)


In [15]:
#RANDOM FOREST REGRESSOR
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

rf_model.fit(X_train, y_train)
rf_results = evaluate_model(rf_model, X_test, y_test)


In [16]:
#GRADIENT BOOSTING REGRESSOR (ADVANCED)
gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

gb_model.fit(X_train, y_train)
gb_results = evaluate_model(gb_model, X_test, y_test)


In [17]:
#MODEL COMPARISON TABLE
results_df = pd.DataFrame({
    'Model': [
        'Linear Regression',
        'Ridge Regression',
        'Lasso Regression',
        'Polynomial Ridge',
        'Decision Tree',
        'Random Forest',
        'Gradient Boosting'
    ],
    'RMSE': [
        lr_results[0], ridge_results[0], lasso_results[0],
        poly_ridge_results[0], dt_results[0], rf_results[0], gb_results[0]
    ],
    'MAE': [
        lr_results[1], ridge_results[1], lasso_results[1],
        poly_ridge_results[1], dt_results[1], rf_results[1], gb_results[1]
    ],
    'R2 Score': [
        lr_results[2], ridge_results[2], lasso_results[2],
        poly_ridge_results[2], dt_results[2], rf_results[2], gb_results[2]
    ]
})

results_df.sort_values(by='R2 Score', ascending=False)


Unnamed: 0,Model,RMSE,MAE,R2 Score
2,Lasso Regression,1.128373,1.005934,-0.017138
1,Ridge Regression,1.130032,1.005392,-0.02013
0,Linear Regression,1.130036,1.005376,-0.020138
5,Random Forest,1.133315,1.006537,-0.026068
3,Polynomial Ridge,1.144799,1.005079,-0.046968
4,Decision Tree,1.16544,1.012419,-0.085062
6,Gradient Boosting,1.177965,1.028908,-0.108509
