In [None]:
# Installs
%pip install pandas
%pip install scikit-learn
%pip install seaborn
%pip install matplotlib
%pip install q
%pip install joblib

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder


# Apply the seaborn theme to every plot drawn below. `sns.set` is only an alias of
# `set_theme`, which the seaborn documentation names as the preferred interface.
sns.set_theme(style="whitegrid", context="notebook")

In [None]:
## Predicting GCE A-Level Maths grade from prior attainment

# Load the dataset (the path is relative to the notebook's working directory)
file_path = "Data/synthetic_uk_attainment_10000_clean_1.csv"
df = pd.read_csv(file_path)

# Quick check: first rows, dtypes / non-null counts, missing values per column
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
print(df.isna().sum())

In [None]:
# Expected dataset columns (name, dtype, allowed values):
# ref_id               int
# SATS_score           int
# GCSE_grade           int (9-1)
# GCE_AS_grade         object (A, B, C, D, E, U)
# GCE_A_grade          object (A*, A, B, C, D, E, U)
# Alevel_Maths_grade   object (A*, A, B, C, D, E, U)

In [None]:
# %%
# Grade mappings

# Ordinal encodings: grades are numbered from the lowest ("U" = 0) upwards,
# so a larger code always means a better grade.
_AS_GRADES_LOW_TO_HIGH = ("U", "E", "D", "C", "B", "A")

AS_TO_NUM = {grade: rank for rank, grade in enumerate(_AS_GRADES_LOW_TO_HIGH)}

# A-level Maths uses the same scale with "A*" added on top.
ALEVEL_MATHS_TO_NUM = {
    grade: rank for rank, grade in enumerate(_AS_GRADES_LOW_TO_HIGH + ("A*",))
}

# Reverse lookup: numeric code -> A-level Maths grade label.
NUM_TO_ALEVEL_MATHS = dict(
    zip(ALEVEL_MATHS_TO_NUM.values(), ALEVEL_MATHS_TO_NUM.keys())
)

# %%
# Encoding helper
def encode_grade_series(s, mapping, col_name):
    """Map a Series of grade labels to their numeric codes.

    Parameters
    ----------
    s : pandas.Series
        Grade labels to encode.
    mapping : dict
        Grade label -> numeric code.
    col_name : str
        Column name; used only in the warning messages.

    Returns
    -------
    pandas.Series
        Same index as ``s``. Labels that are not keys of ``mapping`` and
        missing values both come back as NaN.
    """
    # Missing values are not "unknown grades": report them on their own so the
    # unknown-grade warning lists only real labels that the mapping lacks.
    n_missing = int(s.isna().sum())
    if n_missing:
        print(f"Warning: {col_name} has {n_missing} missing value(s)")

    unknown = set(s.dropna().unique()) - set(mapping.keys())
    if unknown:
        print(f"Warning: {col_name} has unknown grades: {unknown}")
    return s.map(mapping)

# %%
# Create working copy
# Work on a copy so the raw frame `df` stays untouched.
data = df.copy()

# GCSE grades are already numeric (9-1); only force an integer dtype.
data["GCSE_grade_num"] = data["GCSE_grade"].astype(int)

# Letter-grade columns: AS grades first, then A-level Maths, each written to
# "<source column>_num" using its own label -> code mapping.
for source_col, grade_map in (
    ("GCE_AS_grade", AS_TO_NUM),
    ("Alevel_Maths_grade", ALEVEL_MATHS_TO_NUM),
):
    data[source_col + "_num"] = encode_grade_series(
        data[source_col], grade_map, source_col
    )

# %%
# EDA pairplot
# Numeric attainment columns to compare pairwise (the three predictors plus the target).
eda_cols = ["SATS_score", "GCSE_grade_num", "GCE_AS_grade_num", "Alevel_Maths_grade_num"]

# Scatter panel for every pair of variables, KDE curve on the diagonal.
attainment_view = data[eda_cols]
sns.pairplot(attainment_view, diag_kind="kde")

# y > 1 places the figure-level title above the top row of panels.
plt.suptitle("Pairplot of attainment variables", y=1.02)
plt.show()

# %%
# Train–test split
# Predictors (prior attainment) and the encoded A-level Maths grade to predict.
feature_cols = ["SATS_score", "GCSE_grade_num", "GCE_AS_grade_num"]
target_col = "Alevel_Maths_grade_num"

X, y = data[feature_cols], data[target_col]

# 80/20 hold-out with a fixed seed; stratifying on y keeps the grade proportions
# the same in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# Per-grade share of each split, ordered by grade code.
train_share = y_train.value_counts(normalize=True).sort_index()
test_share = y_test.value_counts(normalize=True).sort_index()
print("Train target distribution:\n", train_share)
print("Test target distribution:\n", test_share)

# %%
# Improved model: RandomForest + class_weight="balanced"

# Requires `from sklearn.ensemble import RandomForestClassifier` in the import cell;
# without it this cell raises NameError on a fresh kernel.
rf_clf = Pipeline(
    steps=[
        # Tree splits do not depend on feature scale; the scaler is kept so the
        # fitted (and saved) pipeline keeps its two-step layout.
        ("scaler", StandardScaler()),
        ("rf", RandomForestClassifier(
            n_estimators=300,
            class_weight="balanced",  # weight classes inversely to their frequency
            random_state=42,  # fixed seed so the forest is reproducible
        )),
    ]
)

# Train on the training split only
rf_clf.fit(X_train, y_train)

# Predict numeric grade codes for the held-out students
y_pred = rf_clf.predict(X_test)

# Evaluate; zero_division=0 reports 0 (instead of warning) for grades never predicted
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report (numeric grades):")
print(classification_report(y_test, y_pred, zero_division=0))

# %%
# Confusion matrix (numeric)
# Grade codes that actually occur in the truth or the predictions, in ascending order
# (the same set confusion_matrix infers by default). They are passed on as tick labels
# because ConfusionMatrixDisplay otherwise numbers the ticks 0..n-1, which mislabels
# the axes whenever some grade code is absent from the data.
class_codes = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_codes)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_codes)
disp.plot(cmap="Blues", xticks_rotation=45)
plt.title("Confusion matrix for A-level Maths (numeric encoding)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# %%
# Convert numeric back to grade labels
# Derive the label order and the code -> label lookup from ALEVEL_MATHS_TO_NUM so the
# decoding can never drift from the encoding used for training.
grade_order = sorted(ALEVEL_MATHS_TO_NUM, key=ALEVEL_MATHS_TO_NUM.get)  # lowest grade first
num_to_grade = {code: grade for grade, code in ALEVEL_MATHS_TO_NUM.items()}

y_test_labels = y_test.map(num_to_grade)
# Re-attach the test index so predictions line up row-for-row with y_test_labels.
y_pred_labels = pd.Series(y_pred, index=y_test.index).map(num_to_grade)

print("\nClassification report (grade labels):")
# labels=grade_order lists the rows from U up to A* (the same order as the matrix
# below) instead of alphabetically.
print(
    classification_report(
        y_test_labels, y_pred_labels, labels=grade_order, zero_division=0
    )
)

cm_labels = confusion_matrix(y_test_labels, y_pred_labels, labels=grade_order)

disp_labels = ConfusionMatrixDisplay(
    confusion_matrix=cm_labels,
    display_labels=grade_order,
)
disp_labels.plot(cmap="Blues", xticks_rotation=45)
plt.title("Confusion matrix for A-level Maths (grade labels)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# %%
# Binary pass/fail confusion matrix
# Collapse grades to two classes: code of "C" or higher -> 1, anything lower -> 0.
# NOTE(review): the plot calls class 1 a "pass"; confirm that grade C (rather than E)
# is the intended pass boundary.
pass_threshold = ALEVEL_MATHS_TO_NUM["C"]
y_test_binary = (y_test >= pass_threshold).astype(int)
y_pred_binary = (y_pred >= pass_threshold).astype(int)

# labels=[0, 1] keeps the matrix 2x2 even if only one class occurs, so it always
# matches the two tick labels given to the heatmap.
cm = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1])

plt.figure(figsize=(4, 3))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",  # cell values are integer counts
    cmap="Blues",
    xticklabels=["Pred fail", "Pred pass"],
    yticklabels=["True fail", "True pass"],
)
plt.ylabel("True")
plt.xlabel("Predicted")
plt.title("Confusion Matrix – RandomForest (Math pass)")
plt.show()

# %%
# Save model
# Persist the whole fitted pipeline (scaler + forest) next to the notebook.
model_path = "uk_attainment_rf_math.pkl"
joblib.dump(value=rf_clf, filename=model_path)
print(f"Model saved to {model_path}")

# %%
# Load model and predict new students
# joblib.load unpickles the file, so only load model files you created or trust.
loaded_model = joblib.load(model_path)

# Two example students, one record each, with the same columns used for training.
new_students = pd.DataFrame(
    [
        {"SATS_score": 105, "GCSE_grade_num": 6, "GCE_AS_grade_num": 4},
        {"SATS_score": 115, "GCSE_grade_num": 8, "GCE_AS_grade_num": 5},
    ]
)

# Predict numeric grade codes, then translate them back to letter grades.
pred_nums = loaded_model.predict(new_students)
pred_grades = pd.Series(pred_nums).map(num_to_grade)

print("Predicted A-level Maths grades:", list(pred_grades))
