In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, accuracy_score

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Directory that receives every artifact written by this notebook (CSVs, figures).
os.makedirs("outputs_Test", exist_ok=True)

# Load the raw table and derive the target: a student passes ("geçti") when the
# mean of the three exam scores is at least 40, and fails ("kaldı") otherwise.
df = pd.read_csv("../data/StudentsPerformance.csv")
score_cols = ["math score", "reading score", "writing score"]
df["average"] = df[score_cols].mean(axis=1)
df["result"] = df["average"].map(lambda avg: "geçti" if avg >= 40 else "kaldı")

# Integer-encode the categorical columns. The same encoder object is refit on
# each column in turn, so afterwards `le` only holds the classes of the last one.
label_cols = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
]
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])

# Features: every column except the two derived ones.
# NOTE(review): X still contains the three score columns that `result` is
# computed from, so the target is a deterministic function of the features.
X = df.drop(columns=["average", "result"])

# LabelEncoder assigns codes in sorted class order, so with both classes present
# "geçti" -> 0 and "kaldı" -> 1 (i.e. the positive class 1 means "failed").
y = LabelEncoder().fit_transform(df["result"])

# MODEL DEFINITIONS
# Candidate classifiers, keyed by the short name used in the results table.
# Every estimator with internal randomness is given a fixed seed so that
# re-running the notebook reproduces the same metrics.
models = {
    "KNN": KNeighborsClassifier(n_neighbors=13),
    "DT": DecisionTreeClassifier(random_state=42),
    "NB": GaussianNB(),
    "RF": RandomForestClassifier(n_estimators=100, random_state=42),
    # With probability=True the SVC shuffles data internally to fit its
    # probability estimates, which is why it also needs a seed.
    "SVM": SVC(kernel="rbf", C=2, probability=True, random_state=42),
    "LR": LogisticRegression(max_iter=250),
    "LDA": LinearDiscriminantAnalysis(),
}

# CROSS-VALIDATION METRICS
# Stratified 5-fold CV. Each model is fitted exactly once per fold and all four
# metrics are computed from that single set of predictions, so the reported
# accuracy and precision/recall/F1 always describe the same fitted models.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, model in models.items():
    accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []
    for train_idx, test_idx in cv.split(X, y):
        model.fit(X.iloc[train_idx], y[train_idx])
        y_true_fold = y[test_idx]
        y_pred = model.predict(X.iloc[test_idx])

        accuracy_scores.append(accuracy_score(y_true_fold, y_pred))
        # Precision/recall/F1 are computed for class 1 (the default pos_label).
        # zero_division=0 keeps the score at 0 for a fold where the metric is
        # undefined (e.g. the model predicts no positives) without a warning.
        precision_scores.append(precision_score(y_true_fold, y_pred, zero_division=0))
        recall_scores.append(recall_score(y_true_fold, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_true_fold, y_pred, zero_division=0))

    # Mean over folds, matching what cross_val_score(...).mean() would report.
    acc = np.mean(accuracy_scores)

    results.append({
        "Model": name,
        "Accuracy": round(acc, 2),
        "Precision": round(np.mean(precision_scores), 2),
        "Recall": round(np.mean(recall_scores), 2),
        "F1-Score": round(np.mean(f1_scores), 2)
    })

# One row per model, best accuracy first; the same table is printed and saved.
results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)
print("CROSS-VALIDATION METRICS")
print(results_df)
results_df.to_csv("outputs_Test/cv_metrics.csv", index=False)

# TRAIN/TEST SPLIT (STUDENT-BASED TEST)
# Split BEFORE scaling so the scaler never sees the held-out students. The split
# depends only on y, the sample count and the seed, so the same rows end up in
# the test set as when the scaled frame was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# NORMALIZATION
# Fit the scaler on the training rows only, then apply it to both partitions.
# The original row labels are kept as the index so test rows can be traced back
# to the source table.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)
# Full scaled table, still available under its original name; it now uses the
# training-set statistics rather than statistics of the whole dataset.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# Model selection: Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# TRUE AND FALSE PREDICTIONS
acc = accuracy_score(y_test, y_pred)
print("\nTEST SET METRITS (Random Forest)")
print(f"Accuracy: {acc:.2f}")

# y_test and y_pred are positional NumPy arrays of test-set length, while
# X_test.index holds row labels of the full table. Select the misclassified
# rows with a boolean mask so the arrays are never indexed by those labels
# (which would raise IndexError or pick the wrong students).
y_test_arr = np.asarray(y_test)
y_pred_arr = np.asarray(y_pred)
wrong_mask = y_test_arr != y_pred_arr

wrong_idx = X_test.index[wrong_mask]
wrong_df = X_test.loc[wrong_idx].copy()
wrong_df["TrueResult"] = y_test_arr[wrong_mask]
wrong_df["Predicted"] = y_pred_arr[wrong_mask]
print("Students who guessed wrong:")
print(wrong_df)
# The index (original row label of each student) is written out as well.
wrong_df.to_csv("outputs_Test/wrong_predictions.csv")

# CONFUSION MATRIX
# The target was label-encoded in sorted class order: 0 = "geçti" (passed),
# 1 = "kaldı" (failed). Rows/columns of the matrix follow labels=[0, 1], so the
# display labels must be given in that same order. Pinning `labels` also keeps
# the matrix 2x2 even if one class is absent from y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(cm, display_labels=["Geçti", "Kaldı"])
disp.plot()
plt.title("Random Forest – Confusion Matrix")
plt.savefig("outputs_Test/confusion_matrix.png")
plt.close()

# ROC CURVE
# Score each test sample with the predicted probability of class 1, then trace
# the ROC curve and its area from those scores.
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
auc_val = auc(fpr, tpr)

# Draw on an explicit figure/axes pair instead of the implicit pyplot state.
roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot(fpr, tpr, label=f"Random Forest (AUC={auc_val:.2f})")
roc_ax.plot([0, 1], [0, 1], "k--", alpha=0.6)  # chance-level diagonal
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve – Random Forest")
roc_ax.legend()
roc_ax.grid(alpha=0.3)

# Save to disk and release the figure so it is not rendered inline.
roc_fig.savefig("outputs_Test/roc_curve.png")
plt.close(roc_fig)

CROSS-VALIDATION METRICS
  Model  Accuracy  Precision  Recall  F1-Score
1    DT      1.00       0.92    0.87      0.87
3    RF      1.00       0.97    0.90      0.92
5    LR      1.00       0.97    0.97      0.97
0   KNN      0.99       1.00    0.77      0.86
4   SVM      0.99       1.00    0.60      0.74
6   LDA      0.99       1.00    0.57      0.69
2    NB      0.98       0.64    0.97      0.77

TEST SET METRITS (Random Forest)
Accuracy: 1.00
Students who guessed wrong:
Empty DataFrame
Columns: [gender, race/ethnicity, parental level of education, lunch, test preparation course, math score, reading score, writing score, TrueResult, Predicted]
Index: []
