In [None]:
import json, re
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import joblib

In [None]:
# ---------------- Config ----------------
# Input dataset, read with pd.read_csv further down.
# Alternative file kept for reference:
# DATA_CSV = "simulated_alevel_dataset_5.csv"
DATA_CSV = "a_level_simulated_dataset_with_programPerStudent_BALANCED.csv"

# Directory that receives the fitted pipeline, label encoder, importances
# and metadata; created (with parents) if it does not exist yet.
MODEL_DIR = Path("models") / "alevel"
MODEL_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# Accepted spellings for the target (university program) column.
# Order matters: find_first_col returns the first candidate that matches.
TARGET_CANDIDATES = [
    "university_program", "University_Program",
    "uni_program",
    "program", "Program",
    "target_program",
]

In [None]:
# Accepted spellings for the year column, in priority order
# (find_first_col returns the first candidate that matches).
YEAR_CANDIDATES = [
    "academic_year", "Academic_Year",
    "year", "Year",
    "promotion_year", "Promotion_Year",
    "cohort_year", "Cohort_Year",
    "exam_year", "Exam_Year",
    "graduation_year", "Graduation_Year",
    "sitting_year", "Sitting_Year",
]

# Identifier-style column names; any of these present in the data are
# excluded from the model features.
ID_LIKE = {
    "student_id", "Student_ID",
    "studentId", "StudentId",
    "id", "ID",
}

### ---------------- Helpers ----------------

In [None]:
def find_first_col(df, candidates):
    """Return the label of the first column in ``df`` matching ``candidates``.

    Matching runs in two passes, each in candidate priority order:

    1. exact match against ``df.columns``;
    2. fuzzy match, comparing names after lowercasing and removing every
       character other than ``a``-``z`` (so ``"Academic Year"`` matches the
       candidate ``"academic_year"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are searched.
    candidates : iterable
        Acceptable column names, most preferred first.

    Returns
    -------
    The matching column label exactly as it appears in ``df.columns`` (or the
    candidate itself for an exact match), or ``None`` when nothing matches.
    """
    def _normalize(name):
        # str() first: column labels may be ints or other non-string objects,
        # which have no .lower().
        return re.sub(r"[^a-z]", "", str(name).lower())

    for c in candidates:
        if c in df.columns:
            return c

    # Fuzzy lookup table: normalized name -> original label. When several
    # columns collapse to the same key, the first one in the frame is kept.
    # Labels that normalize to "" (e.g. purely numeric names) are skipped so
    # they can never match by accident.
    norm = {}
    for col in df.columns:
        key = _normalize(col)
        if key:
            norm.setdefault(key, col)

    for c in candidates:
        k = _normalize(c)
        if k in norm:
            return norm[k]
    return None

In [None]:
def coerce_year(s: pd.Series) -> pd.Series:
    """Extract a calendar year from every element of ``s``.

    Per element:

    * missing values (``pd.isna``) and non-finite floats (``inf``/``-inf``)
      become ``<NA>``;
    * numbers whose integer part lies in 1900..2100 are returned as that
      integer;
    * anything else is converted to ``str`` and the first ``19xx``/``20xx``
      substring is used (e.g. ``"AY 2018/2019"`` -> 2018); ``<NA>`` when no
      such substring exists.

    Note that the substring search has no digit boundaries, so it can also
    match inside a longer digit run (``20200101`` -> 2020).

    Returns a nullable ``Int64`` Series with the same index as ``s``.
    """
    def to_year(v):
        if pd.isna(v):
            return np.nan
        if isinstance(v, (float, np.floating)) and not np.isfinite(v):
            # inf is not NA for pandas, but int(inf) raises OverflowError.
            return np.nan
        if isinstance(v, (int, np.integer, float, np.floating)):
            if 1900 <= int(v) <= 2100:
                return int(v)
        # Out-of-range numbers and all non-numeric values fall through here.
        m = re.search(r"(19|20)\d{2}", str(v))
        return int(m.group(0)) if m else np.nan

    # Build as float first so NaN is representable, then switch to nullable Int64.
    return pd.Series([to_year(v) for v in s], index=s.index, dtype="float").astype("Int64")

In [None]:
def time_aware_split(df, year_col, n_test_years=2):
    """Build boolean train/test row masks that hold out the latest years.

    The cutoff is ``max(df[year_col]) - n_test_years``. Rows whose year is at
    or before the cutoff are marked as train; rows after it are marked as test.

    Returns
    -------
    tuple
        ``(train_mask, test_mask, split_year, max_year)`` where the masks are
        boolean Series aligned with ``df`` and the two years are ints.
    """
    years = df[year_col]
    latest = int(years.max())
    cutoff = latest - n_test_years
    train_mask = years.le(cutoff)
    test_mask = years.gt(cutoff)
    return train_mask, test_mask, cutoff, latest

In [None]:
def build_pipeline(num_cols, cat_cols):
    """Assemble the (unfitted) preprocessing + calibrated random-forest pipeline.

    Parameters
    ----------
    num_cols : list
        Columns routed through median imputation followed by standard scaling.
    cat_cols : list
        Columns routed through most-frequent imputation followed by dense
        one-hot encoding; categories unseen during fit are ignored.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``"pre"`` (the ColumnTransformer) and ``"clf"`` (a
        RandomForestClassifier wrapped in isotonic calibration with 3-fold CV).
    """
    numeric_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    preprocessor = ColumnTransformer([
        ("num", numeric_branch, num_cols),
        ("cat", categorical_branch, cat_cols),
    ])

    forest = RandomForestClassifier(
        n_estimators=400,
        min_samples_split=4,
        min_samples_leaf=2,
        class_weight="balanced_subsample",
        n_jobs=-1,
        random_state=42,
    )
    calibrated = CalibratedClassifierCV(forest, method="isotonic", cv=3)

    return Pipeline([("pre", preprocessor), ("clf", calibrated)])

### ---------------- Load & detect ----------------

In [None]:
df = pd.read_csv(DATA_CSV)
target_col = find_first_col(df, TARGET_CANDIDATES)
year_col   = find_first_col(df, YEAR_CANDIDATES)

if target_col is None:
    raise ValueError(f"Could not find target column among: {TARGET_CANDIDATES}")
if year_col is None:
    # Heuristic fallback: take the first column where more than 70% of the
    # values parse as a year and more than 90% of those lie in 1990..2100.
    # The target and ID-like columns are skipped: coerce_year matches any
    # 19xx/20xx digit run, so numeric identifiers could otherwise be
    # mistaken for a year column.
    not_a_year = {target_col} | ID_LIKE
    for c in df.columns:
        if c in not_a_year:
            continue
        y = coerce_year(df[c])
        if (y.notna().mean() > 0.7) and (y.dropna().between(1990, 2100).mean() > 0.9):
            year_col = c
            break
    if year_col is None:
        raise ValueError("Could not find a year/academic-year column.")

# coerce_year already returns nullable Int64; rows without a parseable year
# or without a target are dropped before the year is cast to plain int.
df[year_col] = coerce_year(df[year_col])
df = df.dropna(subset=[year_col, target_col]).copy()
df[year_col] = df[year_col].astype(int)

#### features

In [None]:
# Model inputs are every column except the target, the year and any ID-like
# column; they are then partitioned by dtype for the two preprocessing branches.
exclude = {target_col, year_col}.union(ID_LIKE.intersection(df.columns))
feature_cols = [c for c in df.columns if c not in exclude]

is_numeric = {c: pd.api.types.is_numeric_dtype(df[c]) for c in feature_cols}
num_cols = [c for c in feature_cols if is_numeric[c]]
cat_cols = [c for c in feature_cols if not is_numeric[c]]

#### target

In [None]:
# Encode the target as integer class ids. Labels are cast to str first, and
# the encoder is fitted on every row (train and test years alike).
target_labels = df[target_col].astype(str)
le = LabelEncoder().fit(target_labels)
y_all = le.transform(target_labels)
classes = le.classes_.tolist()   # class names, positioned by their integer id

#### split (time-aware)

In [14]:
# Hold out the most recent years: rows with year <= split_year train the
# model, rows with year > split_year are used for evaluation.
tr_idx, te_idx, split_year, max_year = time_aware_split(df, year_col, n_test_years=2)

X_train = df.loc[tr_idx, feature_cols]
X_test = df.loc[te_idx, feature_cols]
# y_all is a NumPy array in the same row order as df, so the boolean masks
# select the matching targets.
y_train = y_all[tr_idx]
y_test = y_all[te_idx]

#### ---------------- Train & evaluate ----------------

In [16]:
pipe = build_pipeline(num_cols, cat_cols)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print("\n=== A-Level → University Program (Calibrated RF) ===")
# Pass the full label set explicitly. With a time-based split a program may be
# missing from the held-out years; classification_report would then infer
# fewer labels than `classes` has names and raise a ValueError.
# zero_division=0 reports 0.0 (without a warning) for classes that have no
# predictions or no support.
print(classification_report(
    y_test, y_pred,
    labels=np.arange(len(classes)),
    target_names=classes,
    digits=3,
    zero_division=0,
))
print("Accuracy:", accuracy_score(y_test, y_pred))


=== A-Level → University Program (Calibrated RF) ===
                         precision    recall  f1-score   support

            Agriculture      0.949     0.951     0.950     11746
           Architecture      0.953     0.956     0.954     11810
Business Administration      0.969     0.960     0.964     11753
      Civil Engineering      0.974     0.971     0.973     11733
       Computer Science      0.950     0.960     0.955     11848
              Education      0.961     0.962     0.961     11838
 Electrical Engineering      0.959     0.960     0.960     11752
                    Law      0.990     0.985     0.988     11754
               Medicine      0.979     0.977     0.978     11642
           No Placement      0.976     0.976     0.976     35775
                Nursing      0.953     0.959     0.956     11921
            Social Work      0.937     0.946     0.941      1002
  Tourism & Hospitality      0.965     0.937     0.951      2814

               accuracy           

####  ---------------- Importances snapshot (train another RF without calibration) ----------------


In [17]:
rf_plain = RandomForestClassifier(
    n_estimators=500, min_samples_split=4, min_samples_leaf=2,
    class_weight="balanced_subsample", n_jobs=-1, random_state=42
)
# Re-use the preprocessor that was already fitted inside `pipe`. Calling
# Pipeline.fit on a pipeline that shares this object would refit it in place
# (steps are not cloned when no memory cache is configured), i.e. it would
# mutate the preprocessor of the model that gets saved and repeat the
# preprocessing fit. Transform once and fit only the forest instead.
pre = pipe.named_steps["pre"]
rf_plain.fit(pre.transform(X_train), y_train)

# Assembled from already-fitted steps so later cells can keep addressing
# rf_pipe.named_steps["rf"]; no further .fit() call is needed.
rf_pipe = Pipeline([("pre", pre), ("rf", rf_plain)])

#### expanded feature names after OHE

In [18]:
ohe = pre.named_transformers_["cat"].named_steps["ohe"] if cat_cols else None

# Ask the fitted ColumnTransformer which columns it actually emits rather than
# rebuilding the list from num_cols/cat_cols: an imputer drops columns that
# were entirely missing during fit, which would shift every later name.
# Names come back as "<branch>__<feature>" ("num__..." / "cat__..."); the
# branch prefix is split off so the stored names stay un-prefixed.
emitted = [name.split("__", 1) for name in pre.get_feature_names_out()]
num_feats = [feat for branch, feat in emitted if branch == "num"]
cat_feats = [feat for branch, feat in emitted if branch == "cat"]
feat_names = num_feats + cat_feats

imps = rf_pipe.named_steps["rf"].feature_importances_
if len(feat_names) != len(imps):
    raise ValueError(
        f"Feature name count ({len(feat_names)}) does not match the number of "
        f"importances ({len(imps)})."
    )
imp_df = pd.DataFrame({"feature": feat_names, "importance": imps}).sort_values("importance", ascending=False)


#### ---------------- Save artifacts ----------------

In [19]:
# Metadata describing how the model was trained: detected columns, the feature
# lists, class names (ordered by encoded id) and the time-split boundaries.
meta = {
    "target": target_col,
    "year_col": year_col,
    "feature_cols": feature_cols,
    "numeric_cols": num_cols,
    "categorical_cols": cat_cols,
    "classes": classes,
    "split_year": int(split_year),
    "max_year": int(max_year)
}

# Fitted estimators are pickled with joblib.
for artifact, filename in (
    (pipe, "alevel_pipeline.pkl"),
    (le, "alevel_label_encoder.pkl"),
):
    joblib.dump(artifact, MODEL_DIR / filename)

# Feature importances as a plain CSV, most important first.
imp_df.to_csv(MODEL_DIR / "alevel_importances.csv", index=False)

# ensure_ascii=False keeps non-ASCII class or column names readable in the file.
with open(MODEL_DIR / "alevel_meta.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(meta, ensure_ascii=False, indent=2))

print(f"\nSaved to: {MODEL_DIR.resolve()}")


Saved to: D:\Projects\bigDataFinalProject\finalProject\models\A-Level\models\alevel
