In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [37]:
# === 1. Load the Dataset ===
# df = pd.read_csv("simulated_rwanda_primary_promotions_1996_2023_V3.csv")
# df = pd.read_csv("simulated_rwanda_primary_promotions_1996_2023_V6_with_locations.csv")
df = pd.read_csv("simulated_rwanda_primary_promotions_1996_2023_with_locations_01.csv")

In [18]:
# === 2. Prepare Features & Target ===
# Use only up to P5 for prediction (simulate "future" forecast before P6 is known)
subject_cols = []
for grade in ["P1", "P2", "P3", "P4", "P5", "P6"]:
    subject_cols += [f"{subj}_{grade}" for subj in
                     ["Kinyarwanda", "English", "Mathematics", "Science", "Social_Studies", "Creative_Arts"]]

demo_cols = [
    "Gender", "School_Location", "Residence_Location",
    "Has_Electricity", "Parental_Education_Level"
]

X = df[subject_cols + demo_cols].copy()
y = df["Passed_National_Exam"].astype(int)

In [19]:
# === 3. Encode Categorical Features ===
for col in ["Gender", "School_Location", "Residence_Location", "Parental_Education_Level"]:
    X[col] = LabelEncoder().fit_transform(X[col])

In [20]:
# Optional: Scale the features for logistic regression
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [21]:
# === 4. Split Train/Test ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)


In [22]:
# === 5. Train Models ===

# -- Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)


In [23]:
# -- Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_scaled, y_train)
logreg_pred = logreg.predict(X_test_scaled)


In [24]:
# === 6. Evaluate Models ===
print("==== Random Forest Results ====")
print(classification_report(y_test, rf_pred))
print("ROC AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

print("\n==== Logistic Regression Results ====")
print(classification_report(y_test, logreg_pred))
print("ROC AUC:", roc_auc_score(y_test, logreg.predict_proba(X_test_scaled)[:, 1]))

==== Random Forest Results ====
              precision    recall  f1-score   support

           0       0.90      0.78      0.84       310
           1       0.98      0.99      0.98      2756

    accuracy                           0.97      3066
   macro avg       0.94      0.89      0.91      3066
weighted avg       0.97      0.97      0.97      3066

ROC AUC: 0.9911202537571984

==== Logistic Regression Results ====
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       310
           1       0.99      0.99      0.99      2756

    accuracy                           0.98      3066
   macro avg       0.95      0.95      0.95      3066
weighted avg       0.98      0.98      0.98      3066

ROC AUC: 0.997738658176881


In [38]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [39]:
# Prepare LSTM input: reshape to (samples, timesteps, features)
# Each subject per grade as a time step (P1â€“P5)
n_subjects = 6
X_seq = df[[f"{subj}_{grade}" for grade in ["P1", "P2", "P3", "P4", "P5"] for subj in
            ["Kinyarwanda", "English", "Mathematics", "Science", "Social_Studies", "Creative_Arts"]]].values
X_seq = X_seq.reshape(-1, 5, n_subjects)  # [samples, timesteps, features]
y_seq = y.values

In [40]:
# Split for LSTM
idx = np.arange(X_seq.shape[0])
train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=42, stratify=y_seq)
X_seq_train, X_seq_test = X_seq[train_idx], X_seq[test_idx]
y_seq_train, y_seq_test = y_seq[train_idx], y_seq[test_idx]

In [41]:
# Simple LSTM
model = models.Sequential([
    layers.Input(shape=(5, n_subjects)),
    layers.LSTM(32, return_sequences=False),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_seq_train, y_seq_train, epochs=30, batch_size=32,
                    validation_split=0.2, verbose=2)

Epoch 1/30
307/307 - 4s - loss: 0.3001 - accuracy: 0.9009 - val_loss: 0.2077 - val_accuracy: 0.9213 - 4s/epoch - 12ms/step
Epoch 2/30
307/307 - 1s - loss: 0.1986 - accuracy: 0.9197 - val_loss: 0.1689 - val_accuracy: 0.9344 - 1s/epoch - 4ms/step
Epoch 3/30
307/307 - 1s - loss: 0.1777 - accuracy: 0.9268 - val_loss: 0.1891 - val_accuracy: 0.9250 - 1s/epoch - 4ms/step
Epoch 4/30
307/307 - 2s - loss: 0.1675 - accuracy: 0.9338 - val_loss: 0.1520 - val_accuracy: 0.9417 - 2s/epoch - 6ms/step
Epoch 5/30
307/307 - 1s - loss: 0.1590 - accuracy: 0.9347 - val_loss: 0.1448 - val_accuracy: 0.9433 - 1s/epoch - 5ms/step
Epoch 6/30
307/307 - 1s - loss: 0.1551 - accuracy: 0.9368 - val_loss: 0.1387 - val_accuracy: 0.9450 - 1s/epoch - 4ms/step
Epoch 7/30
307/307 - 1s - loss: 0.1569 - accuracy: 0.9363 - val_loss: 0.1602 - val_accuracy: 0.9323 - 1s/epoch - 4ms/step
Epoch 8/30
307/307 - 1s - loss: 0.1439 - accuracy: 0.9401 - val_loss: 0.1568 - val_accuracy: 0.9348 - 1s/epoch - 4ms/step
Epoch 9/30
307/307 - 1s

In [42]:
lstm_pred = (model.predict(X_seq_test) > 0.5).astype(int)
print("\n==== LSTM Results ====")
print(classification_report(y_seq_test, lstm_pred))


==== LSTM Results ====
              precision    recall  f1-score   support

           0       0.78      0.59      0.68       310
           1       0.96      0.98      0.97      2756

    accuracy                           0.94      3066
   macro avg       0.87      0.79      0.82      3066
weighted avg       0.94      0.94      0.94      3066



In [53]:
import joblib

joblib.dump(rf, "primary_rf_model.pkl")
joblib.dump(logreg, "primary_logreg_model.pkl")
joblib.dump(lstm_pred, "primary_lstm_model.pkl")
print("Models saved for later use.")


Models saved for later use.
