In [1]:
import json, os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import joblib




In [3]:
# -------------------- Config --------------------
# Input CSV, resolved relative to the notebook's working directory.
DATA_PATH = "O_Level_Dataset_with_Schools.csv"  # change if needed

# All trained artifacts are written under this directory; create it up front.
MODEL_DIR = Path("models") / "olevel"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Feature columns are named "<year>_<subject>", one per (year, subject) pair.
YEARS = ["S1", "S2", "S3"]
SUBJECTS = [
    "Mathematics",
    "Physics",
    "Chemistry",
    "Biology",
    "Geography",
    "History",
    "Economics",
    "English",
    "Kinyarwanda",
    "ICT",
]

# Name of the label column in the CSV.
TARGET = "A_Level_Stream"

In [4]:
def required_cols():
    """Return the feature column names, ordered year-major then by subject.

    Each name has the form ``"<year>_<subject>"`` and is built from the
    module-level ``YEARS`` and ``SUBJECTS`` lists.
    """
    return [f"{year}_{subject}" for year in YEARS for subject in SUBJECTS]


REQ_COLS = required_cols()

In [5]:
# -------------------- Load & validate --------------------
df = pd.read_csv(DATA_PATH)

# Fail fast, listing every expected feature/target column the CSV lacks
# (reported in declaration order).
missing = [col for col in (*REQ_COLS, TARGET) if col not in df.columns]
if len(missing) > 0:
    raise ValueError(f"Missing required columns: {missing}")

In [6]:
# Keep only the rows in which every feature column and the target are present
rows_complete = df[[*REQ_COLS, TARGET]].notna().all(axis=1)
df = df.loc[rows_complete].copy()


In [7]:
# -------------------- Encode target --------------------
# Map each stream name to an integer id; the fitted encoder is kept so that
# reports can show the original class names.
le_stream = LabelEncoder().fit(df[TARGET])
df["A_Level_Stream_Label"] = le_stream.transform(df[TARGET])
n_classes = int(df["A_Level_Stream_Label"].nunique())

In [8]:
# -------------------- Split --------------------
# 80/20 hold-out split, stratified on the encoded stream label and seeded
# so the same partition is produced on every run.
stratify_labels = df["A_Level_Stream_Label"]
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)


In [9]:
# -------------------- Scale features (FIT ON TRAIN ONLY) --------------------
# Work on explicit copies: the frames returned by train_test_split are slices
# of `df`, and assigning into them can raise SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()

# Always read the *unscaled* values from `df` (selected by index label) rather
# than from train_df/test_df themselves. This keeps the cell idempotent:
# re-running it used to refit the scaler on already-scaled data, turning the
# scaler that is saved later into an identity transform.
# Relies on `df` having unique index labels, which holds for the default
# RangeIndex produced by pd.read_csv.
scaler = MinMaxScaler()
train_df[REQ_COLS] = scaler.fit_transform(df.loc[train_df.index, REQ_COLS])
test_df[REQ_COLS] = scaler.transform(df.loc[test_df.index, REQ_COLS])

In [10]:
# -------------------- Build sequences (LSTM) --------------------
def make_lstm_sequences(fr, years=None, subjects=None):
    """Stack the per-year subject columns of ``fr`` into a 3-D float32 array.

    Parameters
    ----------
    fr : pandas.DataFrame
        Frame containing one ``"<year>_<subject>"`` column for every
        (year, subject) pair.
    years : sequence of str, optional
        Time steps, in order. Defaults to the module-level ``YEARS``.
    subjects : sequence of str, optional
        Features per time step, in order. Defaults to the module-level
        ``SUBJECTS``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(fr), len(years), len(subjects))``.
        An empty frame yields an empty array that still has three axes.

    Raises
    ------
    KeyError
        If any expected column is missing from ``fr``.
    """
    years = YEARS if years is None else list(years)
    subjects = SUBJECTS if subjects is None else list(subjects)

    # Select columns year-major so that a row-major reshape puts the year on
    # axis 1 and the subject on axis 2. This replaces a per-row iterrows()
    # loop with a single vectorised conversion.
    ordered_cols = [f"{y}_{s}" for y in years for s in subjects]
    flat = fr[ordered_cols].to_numpy(dtype="float32")
    return flat.reshape(len(fr), len(years), len(subjects))

# Sequence tensors for the LSTM: one (years x subjects) matrix per row
X_train_seq = make_lstm_sequences(train_df)     # (N, 3, 10)
X_test_seq = make_lstm_sequences(test_df)

# Integer class ids, in the same row order as the tensors above
y_train = train_df["A_Level_Stream_Label"].to_numpy()
y_test = test_df["A_Level_Stream_Label"].to_numpy()

In [11]:
# -------------------- Random Forest on flattened --------------------
# Collapse the (time step, subject) axes into one feature axis per sample.
X_train_rf = X_train_seq.reshape(len(X_train_seq), -1)
X_test_rf = X_test_seq.reshape(len(X_test_seq), -1)


In [12]:
# Fit a 200-tree forest on the flattened features, then score the hold-out set.
rf = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train_rf, y_train)
y_pred_rf = rf.predict(X_test_rf)

rf_report = classification_report(y_test, y_pred_rf, target_names=le_stream.classes_)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("\n=== Random Forest ===")
print(rf_report)
print("RF Accuracy:", rf_accuracy)


=== Random Forest ===
              precision    recall  f1-score   support

         HEG       0.91      0.84      0.87      1546
         HEL       0.86      0.89      0.88      2203
         HGL       0.86      0.89      0.88      2345
         ICT       0.86      0.90      0.88      2497
         MCE       0.91      0.84      0.87      1307
         MEG       0.91      0.82      0.86      1445
         PCB       0.87      0.90      0.89      2237
         PCM       0.88      0.91      0.89      1864

    accuracy                           0.88     15444
   macro avg       0.88      0.87      0.88     15444
weighted avg       0.88      0.88      0.88     15444

RF Accuracy: 0.8785288785288785


In [13]:
# -------------------- LSTM (sequence classifier) --------------------
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat across Restart & Run All. Without this the
# reported LSTM and ensemble metrics change on every run.
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reproducibility is required.
tf.keras.utils.set_random_seed(42)

# One time step per school year, one feature per subject.
input_shape = (len(YEARS), len(SUBJECTS))
lstm = models.Sequential([
    layers.Input(shape=input_shape),
    layers.LSTM(64, return_sequences=False),  # only the final hidden state feeds the classifier head
    layers.Dense(32, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(n_classes, activation="softmax"),
])
# Labels are integer class ids, hence the sparse variant of the loss.
lstm.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
early = callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
hist = lstm.fit(
    X_train_seq, y_train,
    validation_split=0.2,
    epochs=40, batch_size=64,
    callbacks=[early], verbose=2
)

# Predicted class = index of the highest softmax probability.
y_pred_lstm = np.argmax(lstm.predict(X_test_seq), axis=1)
print("\n=== LSTM ===")
print(classification_report(y_test, y_pred_lstm, target_names=le_stream.classes_))
print("LSTM Accuracy:", accuracy_score(y_test, y_pred_lstm))



Epoch 1/40


773/773 - 10s - loss: 1.7357 - accuracy: 0.3427 - val_loss: 1.1240 - val_accuracy: 0.6780 - 10s/epoch - 13ms/step
Epoch 2/40
773/773 - 4s - loss: 0.9683 - accuracy: 0.6356 - val_loss: 0.6725 - val_accuracy: 0.7838 - 4s/epoch - 5ms/step
Epoch 3/40
773/773 - 5s - loss: 0.7207 - accuracy: 0.7232 - val_loss: 0.4750 - val_accuracy: 0.8563 - 5s/epoch - 7ms/step
Epoch 4/40
773/773 - 6s - loss: 0.6254 - accuracy: 0.7577 - val_loss: 0.4206 - val_accuracy: 0.8617 - 6s/epoch - 7ms/step
Epoch 5/40
773/773 - 6s - loss: 0.5683 - accuracy: 0.7788 - val_loss: 0.4035 - val_accuracy: 0.8600 - 6s/epoch - 7ms/step
Epoch 6/40
773/773 - 7s - loss: 0.5268 - accuracy: 0.7935 - val_loss: 0.3455 - val_accuracy: 0.8870 - 7s/epoch - 9ms/step
Epoch 7/40
773/773 - 9s - loss: 0.4961 - accuracy: 0.8059 - val_loss: 0.3234 - val_accuracy: 0.8918 - 9s/epoch - 12ms/step
Epoch 8/40
773/773 - 9s - loss: 0.4791 - accuracy: 0.8157 - val_loss: 0.3047 - val_accuracy: 0.8979 - 9s/epoch - 11ms/step
Epoch 9/40
773/

In [14]:
# -------------------- Soft-voting ensemble --------------------
# Average the two models' class-probability matrices with equal weight and
# pick the most probable class per row.
# NOTE(review): assumes the columns of rf.predict_proba (ordered by
# rf.classes_) line up with the LSTM's output indices - confirm that every
# class is present in y_train.
lstm_probs = lstm.predict(X_test_seq)
rf_probs = rf.predict_proba(X_test_rf)
ens_probs = 0.5 * (lstm_probs + rf_probs)
y_pred_ens = ens_probs.argmax(axis=1)

ens_report = classification_report(y_test, y_pred_ens, target_names=le_stream.classes_)
ens_accuracy = accuracy_score(y_test, y_pred_ens)
print("\n=== Ensemble (avg) ===")
print(ens_report)
print("Ensemble Accuracy:", ens_accuracy)


=== Ensemble (avg) ===
              precision    recall  f1-score   support

         HEG       0.88      0.98      0.93      1546
         HEL       0.96      0.95      0.95      2203
         HGL       0.97      0.94      0.95      2345
         ICT       0.97      0.95      0.96      2497
         MCE       0.91      0.92      0.91      1307
         MEG       0.90      0.97      0.94      1445
         PCB       0.96      0.95      0.95      2237
         PCM       0.97      0.90      0.94      1864

    accuracy                           0.94     15444
   macro avg       0.94      0.94      0.94     15444
weighted avg       0.95      0.94      0.94     15444

Ensemble Accuracy: 0.9446386946386947


In [15]:
# -------------------- Save artifacts --------------------
# The forest, scaler and label encoder are pickled with joblib; the Keras
# model is written through its own save() method.
pickled_artifacts = {
    "olevel_rf.pkl": rf,
    "olevel_scaler.pkl": scaler,
    "olevel_label_encoder.pkl": le_stream,
}
for filename, artifact in pickled_artifacts.items():
    joblib.dump(artifact, MODEL_DIR / filename)
lstm.save(MODEL_DIR / "olevel_lstm.keras")

# Sidecar metadata describing the expected inputs and the class names.
meta = dict(
    subjects=SUBJECTS,
    years=YEARS,
    target=TARGET,
    classes=le_stream.classes_.tolist(),
    required_columns=REQ_COLS,
)
meta_path = MODEL_DIR / "olevel_meta.json"
with meta_path.open("w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print(f"\nSaved to {MODEL_DIR.resolve()}")


Saved to D:\Projects\bigDataFinalProject\finalProject\models\O-Level\models\olevel
