In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report
)

# Train and evaluate a random-forest classifier that predicts the binary
# `financial_stress` column from the demographic and spending columns below.

df = pd.read_csv("data/preprocessed_student_spending.csv")

# Columns that are one-hot encoded before reaching the model.
categorical = ["gender", "year_in_school", "major", "preferred_payment_method"]

# Columns that are handed to the model unchanged.
numerical = [
    "age", "monthly_income", "financial_aid", "tuition", "housing",
    "food", "transportation", "books_supplies", "entertainment",
    "personal_care", "technology", "health_wellness", "miscellaneous"
]

X = df[categorical + numerical]
y = df["financial_stress"]

# NOTE(review): the recorded run of this cell scored 1.000 on every metric
# with a 194/6 class split in the test set. Please confirm that
# `financial_stress` is not computed directly from the income/expense columns
# used as features here; if it is, these scores reflect target leakage rather
# than predictive skill.

# ---------------------
# PREPROCESSING
# ---------------------

# handle_unknown="ignore": a category that only appears at predict time is
# encoded as all zeros instead of raising.
preprocess_clf = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", "passthrough", numerical)
])

# ---------------------
# MODEL
# ---------------------

# class_weight="balanced" reweights samples inversely to class frequency so
# the rarer class is not ignored by the forest.
stress_model = Pipeline([
    ("prep", preprocess_clf),
    ("clf", RandomForestClassifier(
        n_estimators=400,
        class_weight="balanced",
        random_state=42
    ))
])

# ---------------------
# TRAIN/TEST SPLIT
# ---------------------

# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------
# TRAIN
# ---------------------

stress_model.fit(X_train, y_train)

# ---------------------
# PREDICT
# ---------------------

# Column 1 of predict_proba corresponds to classes_[1] (classes_ is sorted).
# Resolve that label once so the thresholded metrics below score the same
# class as the probability column, instead of relying on the metric default
# pos_label=1, which only matches 0/1 or boolean targets.
positive_label = stress_model.named_steps["clf"].classes_[1]

preds = stress_model.predict(X_test)
probs = stress_model.predict_proba(X_test)[:, 1]

# ---------------------
# METRICS
# ---------------------

accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, pos_label=positive_label)
recall = recall_score(y_test, preds, pos_label=positive_label)
f1 = f1_score(y_test, preds, pos_label=positive_label)
# For a binary target roc_auc_score treats the scores as belonging to the
# class with the greater label, i.e. the same classes_[1] used above.
roc_auc = roc_auc_score(y_test, probs)

print("\n=== FINANCIAL STRESS CLASSIFICATION MODEL PERFORMANCE ===")
print(f"Accuracy:  {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1 Score:  {f1:.3f}")
print(f"ROC-AUC:   {roc_auc:.3f}")

# Per-class breakdown; with a heavily imbalanced target the minority-class
# row is the one to read, not the headline numbers above.
print("\nClassification Report:")
print(classification_report(y_test, preds))


  from scipy.sparse import csr_matrix, issparse



=== FINANCIAL STRESS CLASSIFICATION MODEL PERFORMANCE ===
Accuracy:  1.000
Precision: 1.000
Recall:    1.000
F1 Score:  1.000
ROC-AUC:   1.000

Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00         6
        True       1.00      1.00      1.00       194

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

