In [17]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import joblib

In [18]:
# === 1) Load ===
df = pd.read_csv("simulated_rwanda_primary_promotions_1996_2023_with_locations_01.csv")

In [19]:
# === 2) Columns ===
subjects = ["Kinyarwanda", "English", "Mathematics", "Science", "Social_Studies", "Creative_Arts"]
grades_hist = ["P1", "P2", "P3", "P4", "P5"]  # exclude P6 to avoid leakage
subject_cols = [f"{s}_{g}" for g in grades_hist for s in subjects]

In [20]:
cat_cols = ["Gender", "School_Location", "Residence_Location", "Parental_Education_Level"]
num_cols = subject_cols + ["Has_Electricity"]  # keep as numeric 0/1


In [21]:
X = df[num_cols + cat_cols].copy()
y = df["Passed_National_Exam"].astype(int)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [23]:
# === 3) Preprocessors ===
# Dense one-hot to keep life simple (works across sklearn versions)
ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)


In [24]:
rf_pre = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", ohe, cat_cols),
    ],
    remainder="drop",
)

lr_pre = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("scaler", StandardScaler())]), num_cols),
        ("cat", ohe, cat_cols),
    ],
    remainder="drop",
)

In [25]:
# === 4) Pipelines ===
rf_pipe = Pipeline([
    ("pre", rf_pre),
    ("clf", RandomForestClassifier(
        n_estimators=300, random_state=42, n_jobs=-1, class_weight="balanced_subsample"
    )),
])

In [26]:
lr_pipe = Pipeline([
    ("pre", lr_pre),
    ("clf", LogisticRegression(max_iter=1000, random_state=42)),
])


In [27]:
# === 5) Fit & quick checks ===
rf_pipe.fit(X_train, y_train)
lr_pipe.fit(X_train, y_train)




In [31]:
print("RF test acc:", rf_pipe.score(X_test, y_test))
print("LR test acc:", lr_pipe.score(X_test, y_test))

RF test acc: 0.9445531637312459
LR test acc: 0.9442270058708415


In [32]:
# === 6) Save to your Django models folder ===
out_dir = Path("finalProject/models")  # adjust if your manage.py is elsewhere
out_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(rf_pipe, out_dir / "primary_rf_model.pkl")
joblib.dump(lr_pipe, out_dir / "primary_logreg_model.pkl")
print("Saved:", out_dir / "primary_rf_model.pkl")
print("Saved:", out_dir / "primary_logreg_model.pkl")

Saved: finalProject\models\primary_rf_model.pkl
Saved: finalProject\models\primary_logreg_model.pkl


In [45]:
# train_and_save_primary_pipelines.py
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import joblib

# === load ===
df = pd.read_csv("simulated_rwanda_primary_promotions_1996_2023_with_locations_01.csv")

subjects = ["Kinyarwanda", "English", "Mathematics", "Science", "Social_Studies", "Creative_Arts"]
grades = ["P1", "P2", "P3", "P4", "P5"]  # exclude P6 to avoid leakage
subject_cols = [f"{s}_{g}" for g in grades for s in subjects]

cat_cols = ["Gender", "School_Location", "Residence_Location", "Parental_Education_Level"]
num_cols = subject_cols + ["Has_Electricity"]

X = df[num_cols + cat_cols].copy()
y = df["Passed_National_Exam"].astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

rf_pre = ColumnTransformer(
    [("num", "passthrough", num_cols),
     ("cat", ohe, cat_cols)],
    remainder="drop",
)

lr_pre = ColumnTransformer(
    [("num", Pipeline([("scaler", StandardScaler())]), num_cols),
     ("cat", ohe, cat_cols)],
    remainder="drop",
)

rf_pipe = Pipeline([
    ("pre", rf_pre),
    ("clf", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1,
                                   class_weight="balanced_subsample")),
])

lr_pipe = Pipeline([
    ("pre", lr_pre),
    ("clf", LogisticRegression(max_iter=1000, random_state=42)),
])

rf_pipe.fit(X_tr, y_tr)
lr_pipe.fit(X_tr, y_tr)

print("RF test acc:", rf_pipe.score(X_te, y_te))
print("LR test acc:", lr_pipe.score(X_te, y_te))

out_dir = Path("finalProject/models")
out_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(rf_pipe, out_dir / "primary_rf_model.pkl")
joblib.dump(lr_pipe, out_dir / "primary_logreg_model.pkl")
print("Saved:", out_dir / "primary_rf_model.pkl")
print("Saved:", out_dir / "primary_logreg_model.pkl")




RF test acc: 0.9445531637312459
LR test acc: 0.9442270058708415
Saved: finalProject\models\primary_rf_model.pkl
Saved: finalProject\models\primary_logreg_model.pkl


In [47]:
from manage import m_rf, m_lr
import pandas as pd

df = pd.read_csv("simulated_rwanda_primary_promotions_1996_2023_with_locations_01.csv")
subjects = ["Kinyarwanda", "English", "Mathematics", "Science", "Social_Studies", "Creative_Arts"]
grades = ["P1", "P2", "P3", "P4", "P5"]
subject_cols = [f"{s}_{g}" for g in grades for s in subjects]
cat_cols = ["Gender", "School_Location", "Residence_Location", "Parental_Education_Level"]
num_cols = subject_cols + ["Has_Electricity"]
X = df[num_cols + cat_cols].copy()

print(m_rf.predict(X.head(3)))
print(m_lr.predict(X.head(3)))


AttributeError: 'numpy.ndarray' object has no attribute 'predict'

## 10-September-2025  Version

In [48]:
import os
import numpy as np
import pandas as pd

from pathlib import Path
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    average_precision_score, balanced_accuracy_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

import joblib
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks




In [49]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

In [50]:
CANDIDATES = [
    "/mnt/data/simulated_rwanda_primary_promotions_1996_2023_with_locations_01.csv",
    "/mnt/data/simulated_rwanda_primary_promotions_1996_2023_V6_with_locations.csv",
    "simulated_rwanda_primary_promotions_1996_2023_with_locations_01.csv",
    "simulated_rwanda_primary_promotions_1996_2023_V6_with_locations.csv",
]
data_path = next((p for p in CANDIDATES if Path(p).exists()), None)
if data_path is None:
    raise FileNotFoundError(
        "Could not find the dataset. Checked:\n  - " + "\n  - ".join(CANDIDATES)
    )

df = pd.read_csv(data_path)
print(f"[OK] Loaded: {data_path}  shape={df.shape}")

[OK] Loaded: simulated_rwanda_primary_promotions_1996_2023_with_locations_01.csv  shape=(15330, 52)


In [51]:
subjects = ["Kinyarwanda", "English", "Mathematics", "Science", "Social_Studies", "Creative_Arts"]
grades_all = ["P1", "P2", "P3", "P4", "P5", "P6"]
grades_seq = ["P1", "P2", "P3", "P4", "P5"]  # LSTM uses P1–P5 as the sequence

subject_cols_all = [f"{s}_{g}" for g in grades_all for s in subjects]  # for RF/LR
subject_cols_seq = [f"{s}_{g}" for g in grades_seq for s in subjects]  # for LSTM

categorical_cols = ["Gender", "School_Location", "Residence_Location", "Has_Electricity", "Parental_Education_Level"]
required_cols = subject_cols_all + categorical_cols + ["Passed_National_Exam"]

missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise RuntimeError(f"Missing required columns: {missing}")

In [52]:
# Target as int {0,1}
y = df["Passed_National_Exam"].astype(int)

In [53]:
# Features for tabular models (RF & LR): P1–P6 subjects + demographics
X = df[subject_cols_all + categorical_cols].copy()

In [54]:
print("\n[Info] Target distribution:", Counter(y))
print(pd.Series(y).value_counts(normalize=True).rename("class_ratio").round(3))


[Info] Target distribution: Counter({1: 13781, 0: 1549})
Passed_National_Exam
1    0.899
0    0.101
Name: class_ratio, dtype: float64


In [55]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
)

In [56]:
numeric_cols = subject_cols_all  # all subject scores are numeric
cat_cols = categorical_cols

numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),  # helps LR; RF is robust but OK with scaled too
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop",
)

In [57]:
# Fit preprocessor on TRAIN only
preprocessor.fit(X_train)

In [61]:
# Transform both splits
X_train_proc = preprocessor.transform(X_train)
X_test_proc = preprocessor.transform(X_test)
print("\n[OK] Preprocessing done.")
print("Processed train shape:", X_train_proc.shape, "| test shape:", X_test_proc.shape)


[OK] Preprocessing done.
Processed train shape: (12264, 48) | test shape: (3066, 48)


In [62]:
def evaluate_model(name, y_true, y_pred, y_proba=None):
    print(f"\n==== {name} ====")
    print(classification_report(y_true, y_pred, digits=3))
    print("Balanced Acc.:", round(balanced_accuracy_score(y_true, y_pred), 4))
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", cm)
    if y_proba is not None and len(np.unique(y_true)) == 2:
        try:
            roc = roc_auc_score(y_true, y_proba)
            ap = average_precision_score(y_true, y_proba)
            print("ROC AUC:", round(roc, 4))
            print("PR AUC (Avg Precision):", round(ap, 4))
        except ValueError as e:
            print("AUC metrics not available:", e)

#### 6) Train Random Forest (with class_weight 'balanced' in case of imbalance)

In [63]:
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    class_weight="balanced"
)
rf_clf.fit(X_train_proc, y_train)
rf_pred = rf_clf.predict(X_test_proc)
rf_proba = rf_clf.predict_proba(X_test_proc)[:, 1]
evaluate_model("Random Forest", y_test, rf_pred, rf_proba)


==== Random Forest ====
              precision    recall  f1-score   support

           0      0.923     0.700     0.796       310
           1      0.967     0.993     0.980      2756

    accuracy                          0.964      3066
   macro avg      0.945     0.847     0.888      3066
weighted avg      0.963     0.964     0.962      3066

Balanced Acc.: 0.8467
Confusion Matrix:
 [[ 217   93]
 [  18 2738]]
ROC AUC: 0.9919
PR AUC (Avg Precision): 0.9991


### 7) Train Logistic Regression (uses the same preprocessed features)

In [64]:
logreg_clf = LogisticRegression(
    max_iter=2000,
    solver="lbfgs",
    n_jobs=-1,
    class_weight="balanced",
    random_state=RANDOM_STATE,
)
logreg_clf.fit(X_train_proc, y_train)
lr_pred = logreg_clf.predict(X_test_proc)
lr_proba = logreg_clf.predict_proba(X_test_proc)[:, 1]
evaluate_model("Logistic Regression", y_test, lr_pred, lr_proba)


==== Logistic Regression ====
              precision    recall  f1-score   support

           0      0.794     0.997     0.884       310
           1      1.000     0.971     0.985      2756

    accuracy                          0.974      3066
   macro avg      0.897     0.984     0.935      3066
weighted avg      0.979     0.974     0.975      3066

Balanced Acc.: 0.9839
Confusion Matrix:
 [[ 309    1]
 [  80 2676]]
ROC AUC: 0.9976
PR AUC (Avg Precision): 0.9997


### 8) Save end-to-end pipelines for easy reuse in Django

In [65]:
#    (preprocessor + classifier) -> single artifact
models_dir = Path("models");
models_dir.mkdir(exist_ok=True)

rf_pipeline = Pipeline(steps=[("pre", preprocessor), ("clf", rf_clf)])
lr_pipeline = Pipeline(steps=[("pre", preprocessor), ("clf", logreg_clf)])

joblib.dump(rf_pipeline, models_dir / "primary_rf_pipeline.pkl")
joblib.dump(lr_pipeline, models_dir / "primary_logreg_pipeline.pkl")
print("\n[OK] Saved sklearn pipelines to /models")


[OK] Saved sklearn pipelines to /models


### 9) Build LSTM sequence data (P1–P5 subjects only) with proper train-only fitting

In [66]:
# Flatten sequence features to 2D so we can impute/scale with sklearn, then reshape to 3D

seq_feature_order = subject_cols_seq  # exact order matters
M = df[seq_feature_order].copy()


In [67]:
# Split indices using the SAME split as tabular (aligns samples)
train_idx = X_train.index.values
test_idx = X_test.index.values

In [68]:
M_train = M.loc[train_idx].values  # shape: (n_train, 5*6)
M_test = M.loc[test_idx].values

In [69]:
# Impute+scale FIT on TRAIN only
seq_imputer = SimpleImputer(strategy="median")
seq_scaler = StandardScaler()

In [70]:
M_train_imp = seq_imputer.fit_transform(M_train)
M_test_imp = seq_imputer.transform(M_test)

In [71]:
M_train_scaled = seq_scaler.fit_transform(M_train_imp)
M_test_scaled = seq_scaler.transform(M_test_imp)

In [72]:
# Reshape to (samples, timesteps=5, features=6)
N_SUBJECTS = len(subjects)  # 6
TIMESTEPS = len(grades_seq)  # 5

In [73]:
def to_3d(m2d):
    return m2d.reshape(m2d.shape[0], TIMESTEPS, N_SUBJECTS)


X_seq_train = to_3d(M_train_scaled)
X_seq_test = to_3d(M_test_scaled)

y_seq_train = y.loc[train_idx].values
y_seq_test = y.loc[test_idx].values

print("\n[OK] LSTM data shapes:", X_seq_train.shape, X_seq_test.shape)



[OK] LSTM data shapes: (12264, 5, 6) (3066, 5, 6)


### 10) LSTM model with EarlyStopping + class weights (helps with imbalance)

In [74]:
classes = np.unique(y_seq_train)
class_w = compute_class_weight(class_weight="balanced", classes=classes, y=y_seq_train)
class_weight_dict = {int(c): w for c, w in zip(classes, class_w)}
print("[Info] LSTM class weights:", class_weight_dict)

lstm_model = models.Sequential([
    layers.Input(shape=(TIMESTEPS, N_SUBJECTS)),
    layers.Masking(mask_value=0.0),
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.25),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

[Info] LSTM class weights: {0: 4.9491525423728815, 1: 0.5561904761904762}



In [75]:
lstm_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

es = callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

history = lstm_model.fit(
    X_seq_train, y_seq_train,
    epochs=50,
    batch_size=64,
    validation_split=0.2,
    callbacks=[es],
    class_weight=class_weight_dict,
    verbose=2
)


Epoch 1/50


154/154 - 11s - loss: 0.3167 - accuracy: 0.7531 - val_loss: 0.1946 - val_accuracy: 0.8985 - 11s/epoch - 71ms/step
Epoch 2/50
154/154 - 2s - loss: 0.2099 - accuracy: 0.8856 - val_loss: 0.2421 - val_accuracy: 0.8765 - 2s/epoch - 12ms/step
Epoch 3/50
154/154 - 2s - loss: 0.2052 - accuracy: 0.8893 - val_loss: 0.1989 - val_accuracy: 0.8956 - 2s/epoch - 12ms/step
Epoch 4/50
154/154 - 3s - loss: 0.2031 - accuracy: 0.8936 - val_loss: 0.1894 - val_accuracy: 0.8993 - 3s/epoch - 19ms/step
Epoch 5/50
154/154 - 3s - loss: 0.2043 - accuracy: 0.8923 - val_loss: 0.1928 - val_accuracy: 0.8997 - 3s/epoch - 17ms/step
Epoch 6/50
154/154 - 2s - loss: 0.2020 - accuracy: 0.8944 - val_loss: 0.2309 - val_accuracy: 0.8797 - 2s/epoch - 12ms/step
Epoch 7/50
154/154 - 4s - loss: 0.2014 - accuracy: 0.8942 - val_loss: 0.2644 - val_accuracy: 0.8724 - 4s/epoch - 26ms/step
Epoch 8/50
154/154 - 2s - loss: 0.2012 - accuracy: 0.8917 - val_loss: 0.2099 - val_accuracy: 0.8846 - 2s/epoch - 15ms/step
Epoch 9/50


### 11) LSTM evaluation

In [76]:
lstm_proba = lstm_model.predict(X_seq_test, verbose=0).ravel()
lstm_pred = (lstm_proba >= 0.5).astype(int)

evaluate_model("LSTM (P1–P5 sequence)", y_seq_test, lstm_pred, lstm_proba)


==== LSTM (P1–P5 sequence) ====
              precision    recall  f1-score   support

           0      0.470     0.945     0.628       310
           1      0.993     0.880     0.933      2756

    accuracy                          0.887      3066
   macro avg      0.732     0.913     0.781      3066
weighted avg      0.940     0.887     0.902      3066

Balanced Acc.: 0.9127
Confusion Matrix:
 [[ 293   17]
 [ 330 2426]]
ROC AUC: 0.9712
PR AUC (Avg Precision): 0.9967


### 12) Save the actual LSTM model + its preprocessor (imputer & scaler & column order)

In [77]:
lstm_dir = models_dir / "primary_lstm"
lstm_dir.mkdir(exist_ok=True)

In [78]:
# Save Keras model
lstm_model_path = lstm_dir / "primary_lstm_model.keras"
lstm_model.save(lstm_model_path)

In [79]:
# Save sequence preprocessor pack (for consistent inference later)
lstm_preproc_pack = {
    "seq_feature_order": seq_feature_order,
    "imputer": seq_imputer,
    "scaler": seq_scaler,
    "timesteps": TIMESTEPS,
    "n_subjects": N_SUBJECTS,
}
joblib.dump(lstm_preproc_pack, lstm_dir / "primary_lstm_preprocessor.pkl")

print(f"\n[OK] Saved LSTM model to {lstm_model_path}")
print("[OK] Saved LSTM preprocessor to /models/primary_lstm/primary_lstm_preprocessor.pkl")


[OK] Saved LSTM model to models\primary_lstm\primary_lstm_model.keras
[OK] Saved LSTM preprocessor to /models/primary_lstm/primary_lstm_preprocessor.pkl
