In [None]:
# Installs
%pip install pandas
%pip install scikit-learn
%pip install seaborn
%pip install matplotlib
%pip install q
%pip install joblib

In [None]:
import sys

# A bare `sys.version` in the middle of a cell is evaluated but never shown
# (only a cell's final expression is echoed), so print it explicitly.
print(sys.version)

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import onnx

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn

# Set seaborn's global plot theme (whitegrid style, notebook context) for the figures drawn later.
sns.set(style="whitegrid", context="notebook")

In [None]:
##  Predicting GCE A-Level Maths grade from prior attainment

# Load the dataset (the path is relative to the notebook's working directory)
file_path = "Data/synthetic_uk_attainment_10000_clean_1.csv"
df = pd.read_csv(file_path)

# Quick check: first rows, column summary, and missing-value counts per column
print(df.head())
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df.info()
print(df.isna().sum())

In [None]:
# Dataset columns (name, dtype, allowed values)
# ref_id               int
# SATS_score           int
# GCSE_grade           int (9-1)
# GCE_AS_grade         object (A, B, C, D, E, U)
# GCE_A_grade          object (A*, A, B, C, D, E, U)
# Alevel_Maths_grade   object (A*, A, B, C, D, E, U)

In [None]:
# ---------------------------------------------------------
# 1. Grade mappings
# ---------------------------------------------------------
# AS-level grades listed from lowest to highest; a grade's position in the
# tuple is its numeric code (U=0 ... A=5).
AS_TO_NUM = {
    grade: code
    for code, grade in enumerate(("U", "E", "D", "C", "B", "A"))
}

# A-level Maths grades mapped to the numeric value(s) each grade covers.
# A grade that spans more than one value is stored as a half-open range().
ALEVEL_MATHS_TO_RANGE = dict(
    [
        ("U", 0),
        ("E", 1),
        ("D", range(2, 4)),  # 2, 3
        ("C", 4),
        ("B", range(5, 7)),  # 5, 6
        ("A", range(7, 9)),  # 7, 8
        ("A*", 9),
    ]
)


def range_to_value(r):
    """Collapse a grade's numeric band to a single number.

    A ``range`` is reduced to its first value (``r.start``); anything else is
    returned unchanged.
    """
    return r.start if isinstance(r, range) else r


# One numeric code per A-level Maths grade: the first value of the grade's band.
ALEVEL_MATHS_TO_NUM = dict(
    zip(
        ALEVEL_MATHS_TO_RANGE.keys(),
        map(range_to_value, ALEVEL_MATHS_TO_RANGE.values()),
    )
)

# Reverse lookups (numeric code -> grade letter). The A-level one only holds
# the codes present in ALEVEL_MATHS_TO_NUM, i.e. the first value of each band.
NUM_TO_AS = dict(zip(AS_TO_NUM.values(), AS_TO_NUM.keys()))
NUM_TO_ALEVEL_MATHS = dict(
    zip(ALEVEL_MATHS_TO_NUM.values(), ALEVEL_MATHS_TO_NUM.keys())
)

# ---------------------------------------------------------
# 2. Validate expected columns
# ---------------------------------------------------------

# Raw columns the rest of the notebook reads from the dataframe.
expected_cols = [
    "SATS_score",
    "GCSE_grade",
    "GCE_AS_grade",
    "Alevel_Maths_grade"
]

missing = [c for c in expected_cols if c not in df.columns]

# Fail fast if the schema is wrong. The working copy `data` is created in the
# next section, after this check has passed, so nothing is copied beforehand.
if missing:
    raise KeyError(f"Your dataframe is missing required columns: {missing}")

# ---------------------------------------------------------
# 3. Create working copy and clean string grades
# ---------------------------------------------------------

# Work on a copy so the raw `df` stays untouched for the exploration cells.
data = df.copy()

# Normalise the letter grades: cast to text, trim whitespace, upper-case.
# astype(str) alone would also turn a missing value into the literal text
# "nan" (then "NAN"), disguising it as a grade; .where(notna) restores the
# missing marker so isna()/dropna() still see those rows.
for grade_col in ("GCE_AS_grade", "Alevel_Maths_grade"):
    raw_grades = data[grade_col]
    data[grade_col] = (
        raw_grades
        .astype(str)
        .str.strip()
        .str.upper()
        .where(raw_grades.notna())
    )

# ---------------------------------------------------------
# 4. Helper: safe encoder with warnings
# ---------------------------------------------------------

def encode_grade_series(s, mapping, col_name):
    """Translate a Series of grade labels into numbers via ``mapping``.

    Parameters
    ----------
    s : pandas.Series
        Grade labels to encode.
    mapping : dict
        Grade label -> numeric code.
    col_name : str
        Column name used in the warning messages.

    Returns
    -------
    pandas.Series
        ``s.map(mapping)``; labels absent from ``mapping`` (and missing
        values) come back as NaN.

    Prints one warning listing labels that are not in ``mapping`` and a
    separate one counting missing values, so that an absent grade is not
    reported as if it were an unrecognised grade.
    """
    # Missing values are excluded here and reported on their own below.
    observed = set(s.dropna().unique())
    unknown = observed - set(mapping)
    if unknown:
        # Sorted so the message is identical from run to run (set order is not).
        print(f"Warning: {col_name} has unknown grades: {sorted(unknown, key=str)}")

    n_missing = int(s.isna().sum())
    if n_missing:
        print(f"Warning: {col_name} has {n_missing} missing value(s)")

    return s.map(mapping)

# ---------------------------------------------------------
# 5. Encode numeric columns
# ---------------------------------------------------------

# GCSE grades only need casting to an integer dtype
data["GCSE_grade_num"] = data["GCSE_grade"].astype(int)

# Letter-graded columns: (source column, encoded column, grade -> number mapping)
letter_grade_specs = (
    ("GCE_AS_grade", "GCE_AS_grade_num", AS_TO_NUM),
    ("Alevel_Maths_grade", "Alevel_Maths_grade_num", ALEVEL_MATHS_TO_NUM),
)
for source_col, encoded_col, grade_map in letter_grade_specs:
    data[encoded_col] = encode_grade_series(data[source_col], grade_map, source_col)

# ---------------------------------------------------------
# 6. EDA pairplot
# ---------------------------------------------------------

# Numeric columns to compare pairwise: the SATS score plus the three encoded grades
eda_cols = ["SATS_score", "GCSE_grade_num", "GCE_AS_grade_num", "Alevel_Maths_grade_num"]

eda_frame = data[eda_cols]
sns.pairplot(eda_frame, diag_kind="kde")
# y > 1 places the title above the grid of panels
plt.suptitle("Pairplot of attainment variables", y=1.02)
plt.show()


In [None]:
# Data exploration
# Column summary of the raw dataframe (dtypes and non-null counts)
df.info()

In [None]:
# Summary statistics for every column: include="all" covers the non-numeric
# (grade) columns as well as the numeric ones
df.describe(include="all")

In [None]:
# Understand structure and grade distributions
# Basic stats for SATS
print(df["SATS_score"].describe())

# Frequency of each value per grade column, ordered by the value itself rather
# than by count. These read the raw `df`, not the cleaned working copy.
grade_sections = (
    ("\nGCSE_grade distribution:", "GCSE_grade"),
    ("\nGCE_AS_grade distribution:", "GCE_AS_grade"),
    ("\nAlevel_Maths_grade distribution:", "Alevel_Maths_grade"),
)
for heading, grade_col in grade_sections:
    print(heading)
    print(df[grade_col].value_counts().sort_index())


In [None]:
# Implement encoders
# Ordered mapping for A-level style grades (position in the list = code).
# NOTE(review): this plain 0-6 coding is not the one applied to the data - the
# *_num columns are built with AS_TO_NUM / ALEVEL_MATHS_TO_NUM in the
# grade-mapping cell. Kept in case other cells still reference it; confirm and remove.
grade_order = ["U", "E", "D", "C", "B", "A", "A*"]
grade_to_num = {g: i for i, g in enumerate(grade_order)}

# encode_grade_series() and the encoded columns of `data` are created in the
# grade-mapping cell (sections 4 and 5). They are deliberately not redefined
# here, so the notebook has a single definition of the helper and a single
# encoding pass.

# Check encodings: each raw grade next to its numeric code
print(
    data[
        [
            "GCSE_grade",
            "GCSE_grade_num",
            "GCE_AS_grade",
            "GCE_AS_grade_num",
            "Alevel_Maths_grade",
            "Alevel_Maths_grade_num",
        ]
    ].head()
)


In [None]:
# Exploratory data analysis (EDA)
# Columns used for the pairplot of key variables.
# NOTE(review): this repeats the eda_cols list from section 6 of the
# grade-mapping cell, where the pairplot is actually drawn; the plotting calls
# below are disabled here.
eda_cols = [
    "SATS_score",
    "GCSE_grade_num",
    "GCE_AS_grade_num",
    "Alevel_Maths_grade_num",
]

#sns.pairplot(data[eda_cols], diag_kind="kde")
#plt.suptitle("Pairplot of attainment variables", y=1.02)
#plt.show()


In [None]:
# Train–test split with stratification
# Define features and target
feature_cols = [
    "SATS_score",
    "GCSE_grade_num",
    "GCE_AS_grade_num",
]
target_col = "Alevel_Maths_grade_num"

# Grades that were missing or not in the mapping are NaN after encoding, and
# LogisticRegression rejects NaN in its inputs. Drop such rows here and report
# how many were removed, rather than failing later with a less obvious error.
# With fully populated data this is a no-op.
model_data = data.dropna(subset=feature_cols + [target_col])
n_dropped = len(data) - len(model_data)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing feature/target values")

X = model_data[feature_cols]
# No NaN remains, so the grade codes can be held as integers; the class labels
# then read 0, 1, 2, ... instead of 0.0, 1.0, 2.0.
y = model_data[target_col].astype("int64")

# Stratify on the target so train and test keep the same grade proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train target distribution:\n", y_train.value_counts(normalize=True).sort_index())
print("Test target distribution:\n", y_test.value_counts(normalize=True).sort_index())


In [None]:
# Logistic-regression pipeline, two named steps:
#   "scaler" - StandardScaler: scale features
#   "logreg" - LogisticRegression (lbfgs solver), max_iter raised for convergence
scaling_step = ("scaler", StandardScaler())
classifier_step = ("logreg", LogisticRegression(solver="lbfgs", max_iter=2000))

log_reg_clf = Pipeline(steps=[scaling_step, classifier_step])

# Train; fit() is the cell's last expression, so its return value is echoed as the cell output
log_reg_clf.fit(X_train, y_train)


In [None]:
# Evaluate the model on the held-out test set

# accuracy_score and classification_report are already imported in the
# notebook's sklearn import cell, so nothing is re-imported here.
y_pred = log_reg_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report (numeric grades):")
print(classification_report(y_test, y_pred))


In [None]:
# Confusion matrix (numeric)
# numpy (np) is already imported in the notebook's import cell.

# Only include classes that actually appear among the true or predicted values
present_classes = np.unique(np.concatenate([y_test, y_pred]))

cm = confusion_matrix(y_test, y_pred, labels=present_classes)

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=present_classes
)

disp.plot(
    cmap="Blues",
    xticks_rotation=45,
)
plt.title("Confusion matrix for A-level Maths (numeric encoding)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
# Map numeric codes back to grade labels for interpretability
def num_to_grade(v, mapping=ALEVEL_MATHS_TO_RANGE):
    """Return the grade whose numeric band contains ``v``.

    ``mapping`` maps grade -> either a single number or a ``range``. Entries
    are checked in the mapping's own order and the first match wins; ``None``
    is returned when no entry matches.
    """
    for grade, band in mapping.items():
        # range entries match by membership, single-value entries by equality
        hit = (v in band) if isinstance(band, range) else (v == band)
        if hit:
            return grade
    return None

# Grade letters counted as a pass in the binary view
PASS_GRADES = {"C", "B", "A", "A*"}

# Numeric codes -> grade letters (predictions reuse y_test's index so the two align)
y_test_labels = y_test.map(num_to_grade)
y_pred_labels = pd.Series(y_pred, index=y_test.index).map(num_to_grade)

# 1 = pass, 0 = fail, decided on the grade letter
y_test_binary, y_pred_binary = (
    labels.isin(PASS_GRADES).astype(int)
    for labels in (y_test_labels, y_pred_labels)
)

cm = confusion_matrix(y_test_binary, y_pred_binary)

heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Pred fail', 'Pred pass'],
    yticklabels=['True fail', 'True pass'],
)

plt.figure(figsize=(4, 3))
sns.heatmap(cm, **heatmap_options)
plt.ylabel('True')
plt.xlabel('Predicted')
plt.title('Confusion Matrix – Logistic Regression (Math pass)')
plt.show()

In [None]:
# Confusion matrix + heatmap (binary pass/fail)
# Fail = 0, Pass = 1, using y_test_binary / y_pred_binary from the previous
# cell, where a pass means the grade letter is in PASS_GRADES.
# Numeric coding of the grades (ALEVEL_MATHS_TO_RANGE):
# U=0, E=1, D=2-3, C=4, B=5-6, A=7-8, A*=9
#
# The multiclass matrix and display object that were rebuilt here were never
# plotted and `cm` was overwritten straight away, so only the binary matrix
# is computed.
cm = confusion_matrix(y_test_binary, y_pred_binary)

plt.figure(figsize=(4,3))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Pred fail', 'Pred pass'],
    yticklabels=['True fail', 'True pass']
)
plt.ylabel('True')
plt.xlabel('Predicted')
plt.title('Confusion Matrix – Logistic Regression (Math pass)')
plt.show()


In [None]:
# Write the fitted pipeline to disk with joblib
model_path = "uk_attainment_logreg_math.pkl"
joblib.dump(value=log_reg_clf, filename=model_path)

print(f"Model saved to {model_path}")

In [None]:
# Load and use later (deployment-style usage)
# In a different script / environment the model would be read back like this:
loaded_model = joblib.load(model_path)

# One row per student, with the same feature columns the model was trained on.
# Several students can be scored at once by listing more values per column, e.g.
#   "SATS_score": [105, 115], "GCSE_grade_num": [6, 8], "GCE_AS_grade_num": [4, 5]  # B, A
student_features = {
    "SATS_score": [105],
    "GCSE_grade_num": [6],
    "GCE_AS_grade_num": [4],  # B
}
new_students = pd.DataFrame(student_features)

# Predict numeric codes, then translate them back to grade letters
pred_nums = loaded_model.predict(new_students)
pred_grades = pd.Series(pred_nums).map(num_to_grade)

print("Predicted A-level Maths grades:", list(pred_grades))