**Load + clean + Prediction week**

In [None]:
import pandas as pd
from pathlib import Path

from src.data.cleaning import clean_for_classification

# --- Configuration -----------------------------------------------------------
# Feature-store CSV, resolved relative to the notebook's working directory.
DATA_PATH = Path("../../data/processed/ews_feature_store.csv")

# Early-warning horizon: only rows with week <= PRED_WEEK are kept for modelling.
PRED_WEEK = 2  # change to 0,2,4 etc.

# --- Load and clean ----------------------------------------------------------
df = pd.read_csv(DATA_PATH)

# NOTE(review): clean_for_classification is project code not visible here; the
# cells below rely on its output containing "week", "target_pass" and
# "student_id" columns.
df_cls = clean_for_classification(df)

# --- Restrict to the prediction horizon --------------------------------------
# .copy() detaches the slice so later edits to df_w never touch df_cls.
within_horizon = df_cls["week"].le(PRED_WEEK)
df_w = df_cls.loc[within_horizon].copy()

# Quick sanity check: resulting shape plus the first few week/target pairs.
df_w.shape, df_w[["week", "target_pass"]].head()


**Train/Test no leakage**

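Each student contributes multiple rows, so a random row-level split would put the same student in both train and test and leak information. Split by `student_id` instead, so that all rows of a given student end up in exactly one of the two sets.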



In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Build the feature matrix, target vector and grouping key for a group-aware split.
#
# `student_id` is used ONLY as the grouping key. It is deliberately removed from
# the feature matrix: an identifier carries no signal that transfers to unseen
# students, and leaving it in lets the model memorise individual training
# students (as a spurious numeric feature, or as one one-hot column per student).
groups = df_w["student_id"]
y = df_w["target_pass"].astype(int)
X = df_w.drop(columns=["target_pass", "student_id"])

# Single 80/20 split in which every student's rows land entirely on one side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Guard against leakage: no student may appear in both train and test.
assert set(groups.iloc[train_idx]).isdisjoint(groups.iloc[test_idx]), \
    "student_id overlap between train and test"


**Preprocess: impute missing values (median for numerics, most frequent for categoricals), one-hot encode categoricals, pass numerics through unscaled**

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Column roles are inferred from dtype: pandas `object` columns are treated as
# categorical, every remaining column is treated as numeric.
cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()
num_cols = [col for col in X_train.columns if col not in cat_cols]

# Numeric branch: fill gaps with the per-column median; values are not scaled.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
]

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" keeps transform from raising on categories that were
# not present when the encoder was fitted.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
]

# remainder="drop": any column not listed in num_cols / cat_cols is discarded.
preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=numeric_steps), num_cols),
        ("cat", Pipeline(steps=categorical_steps), cat_cols),
    ],
    remainder="drop",
)


**Logistic regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier. class_weight="balanced" reweights samples
# inversely to class frequency; max_iter is raised to 2000 to give the solver
# more iterations to converge.
classifier = LogisticRegression(class_weight="balanced", max_iter=2000)

# Full model: shared preprocessing followed by the classifier, fitted as one unit.
logreg = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", classifier),
    ]
)


**Evaluation**

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix

# Fit on the training students, then score the held-out students.
# Column 1 of predict_proba is the probability of the second entry in the
# model's classes_ (the positive class when labels are 0/1).
proba = logreg.fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Hard labels at a fixed 0.5 probability cut-off.
pred = (proba >= 0.5).astype(int)

# Threshold-free ranking metrics, computed from the probabilities.
roc_auc = roc_auc_score(y_test, proba)
pr_auc = average_precision_score(y_test, proba)

# Threshold-dependent summaries, computed from the hard labels.
conf_mat = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred, digits=3)

print("ROC-AUC:", roc_auc)
print("PR-AUC :", pr_auc)
print("\nConfusion matrix:\n", conf_mat)
print("\nReport:\n", report)
