In [1]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# 1. Load Dataset

DATA_PATH = "data/student_performance_data.csv"
dataset = pd.read_csv(DATA_PATH)
dataset

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.210570,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,3388,18,1,0,3,10.680555,2,0,4,1,0,0,0,3.455509,0.0
2388,3389,17,0,0,1,7.583217,4,1,4,0,1,0,0,3.279150,4.0
2389,3390,16,1,0,2,6.805500,20,0,2,0,0,0,1,1.142333,2.0
2390,3391,16,1,1,0,12.416653,17,0,2,0,1,1,0,1.803297,1.0


In [3]:
# -------------------------------
# 2. Target Variable (Pass / Fail)
# -------------------------------
# GradeClass: 0,1,2 = Pass | 3,4 = Fail
dataset["Result"] = dataset["GradeClass"].apply(lambda x: 1 if x <= 2 else 0)
dataset

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass,Result
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0,1
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0,1
2,1003,15,0,2,3,4.210570,26,0,2,0,0,0,0,0.112602,4.0,0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0,0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,3388,18,1,0,3,10.680555,2,0,4,1,0,0,0,3.455509,0.0,1
2388,3389,17,0,0,1,7.583217,4,1,4,0,1,0,0,3.279150,4.0,0
2389,3390,16,1,0,2,6.805500,20,0,2,0,0,0,1,1.142333,2.0,1
2390,3391,16,1,1,0,12.416653,17,0,2,0,1,1,0,1.803297,1.0,1


In [4]:
# -------------------------------
# 3. Feature Engineering
# -------------------------------
TOTAL_CLASSES = 100  # assumed constant

dataset["PresentClasses"] = TOTAL_CLASSES - dataset["Absences"]
dataset["AttendanceRatio"] = dataset["PresentClasses"] / TOTAL_CLASSES
dataset

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass,Result,PresentClasses,AttendanceRatio
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0,1,93,0.93
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0,1,100,1.00
2,1003,15,0,2,3,4.210570,26,0,2,0,0,0,0,0.112602,4.0,0,74,0.74
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0,0,86,0.86
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0,0,83,0.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,3388,18,1,0,3,10.680555,2,0,4,1,0,0,0,3.455509,0.0,1,98,0.98
2388,3389,17,0,0,1,7.583217,4,1,4,0,1,0,0,3.279150,4.0,0,96,0.96
2389,3390,16,1,0,2,6.805500,20,0,2,0,0,0,1,1.142333,2.0,1,80,0.80
2390,3391,16,1,1,0,12.416653,17,0,2,0,1,1,0,1.803297,1.0,1,83,0.83


In [5]:
# -------------------------------
# 4. Feature Selection
# -------------------------------
features = [
    "Gender",
    "StudyTimeWeekly",
    "AttendanceRatio",
    "Tutoring",
    "ParentalSupport"
]

X = dataset[features]
y = dataset["Result"]


In [6]:
# -------------------------------
# 5. Train-Test Split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [7]:
# -------------------------------
# 6. Preprocessing
# -------------------------------
categorical_features = ["Gender", "Tutoring", "ParentalSupport"]
numerical_features = ["StudyTimeWeekly", "AttendanceRatio"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features)
    ]
)


In [8]:
# -------------------------------
# 7. Models
# -------------------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42)
}

best_model = None
best_f1 = 0
best_name = ""

In [9]:
# -------------------------------
# 8. Training & Evaluation
# -------------------------------
for name, clf in models.items():
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", clf)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\n{name}")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        best_model = pipeline
        best_name = name


Logistic Regression
Accuracy: 0.8914
F1 Score: 0.8156

Random Forest
Accuracy: 0.8622
F1 Score: 0.7857


In [10]:
# -------------------------------
# 9. Save Best Model
# -------------------------------
joblib.dump(best_model, "model.pkl")

print("\n✅ Best Model Saved")
print(f"Model: {best_name}")
print(f"Best F1 Score: {best_f1:.4f}")


✅ Best Model Saved
Model: Logistic Regression
Best F1 Score: 0.8156
