In [None]:
# Installs
%pip install pandas
%pip install scikit-learn
%pip install seaborn
%pip install matplotlib
%pip install q
%pip install joblib

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# Use seaborn's white-grid look, sized for notebook display, for all figures.
sns.set_theme(context="notebook", style="whitegrid")

In [None]:
##  Predicting GCE A-Level Maths grade from prior attainment

# Load the dataset (the path is relative to the notebook's working directory).
file_path = "Data/synthetic_uk_attainment_10000_clean_1.csv"
df = pd.read_csv(file_path)

# Quick check: first rows, schema/dtypes, and missing values per column.
print(df.head())
# info() prints its own report and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
df.info()
print(df.isna().sum())

In [None]:
# ref_id               int
# SATS_score           int
# GCSE_grade           int (9-1)
# GCE_AS_grade         object (A, B, C, D, E, U)
# GCE_A_grade          object (A*, A, B, C, D, E, U)
# GCE_A_Maths_grade    object (A*, A, B, C, D, E, U)

In [None]:
# Data exploration
# Schema overview: info() prints each column's dtype and non-null count,
# plus the frame's memory usage.
df.info()

In [None]:
# Summary statistics for every column, not only the numeric ones:
# include="all" makes describe() report the object-typed grade columns too.
df.describe(include="all")

In [None]:
# Understand structure and grade distributions

# Summary statistics for the SATS score
sats_summary = df["SATS_score"].describe()
print(sats_summary)

# Frequency table for each grade column, sorted by grade label
grade_columns = ["GCSE_grade", "GCE_AS_grade", "GCE_A_grade", "GCE_A_Maths_grade"]
for grade_column in grade_columns:
    print(f"\n{grade_column} distribution:")
    grade_counts = df[grade_column].value_counts().sort_index()
    print(grade_counts)


In [None]:
# Implement encoders
# A-level style grades listed from lowest (U) to highest (A*); a grade's
# position in this list is its numeric code.
grade_order = ["U", "E", "D", "C", "B", "A", "A*"]
grade_to_num = dict(zip(grade_order, range(len(grade_order))))

def encode_grade_series(s, mapping, col_name):
    """Translate a column of grade labels into their numeric codes.

    Parameters
    ----------
    s : pandas.Series
        Grade labels to encode.
    mapping : dict
        Lookup from grade label to numeric code.
    col_name : str
        Column name quoted in the warning message.

    Returns
    -------
    pandas.Series
        ``s.map(mapping)``; values that are not keys of ``mapping`` become NaN.

    Notes
    -----
    When ``s`` holds values that are not keys of ``mapping`` a warning is
    printed; nothing is raised.
    """
    unmapped = set(s.unique()).difference(mapping)
    if len(unmapped) > 0:
        print(f"Warning: {col_name} has unknown grades: {unmapped}")
    encoded = s.map(mapping)
    return encoded

# Work on a copy so the raw frame stays untouched
data = df.copy()

# GCSE grades are already numeric in the file; store them as an integer column
data["GCSE_grade_num"] = data["GCSE_grade"].astype(int)

# Encode the AS and A-level Maths letter grades as ordinal numbers
letter_grade_cols = ["GCE_AS_grade", "GCE_A_Maths_grade"]
for col in letter_grade_cols:
    data[f"{col}_num"] = encode_grade_series(data[col], grade_to_num, col)

# Show each original grade column next to its numeric encoding
encoding_check_cols = [
    "GCSE_grade",
    "GCSE_grade_num",
    "GCE_AS_grade",
    "GCE_AS_grade_num",
    "GCE_A_Maths_grade",
    "GCE_A_Maths_grade_num",
]
print(data[encoding_check_cols].head())


In [None]:
# Exploratory data analysis (EDA)
# Pairwise plots of the numeric attainment variables, with a KDE of each
# variable on the diagonal.
eda_cols = ["SATS_score", "GCSE_grade_num", "GCE_AS_grade_num", "GCE_A_Maths_grade_num"]
eda_frame = data[eda_cols]

sns.pairplot(eda_frame, diag_kind="kde")
plt.suptitle("Pairplot of attainment variables", y=1.02)
plt.show()

In [None]:
# Train–test split with stratification
# Predictors (prior attainment) and the target (encoded A-level Maths grade)
feature_cols = ["SATS_score", "GCSE_grade_num", "GCE_AS_grade_num"]
target_col = "GCE_A_Maths_grade_num"

X, y = data[feature_cols], data[target_col]

# Hold out 20% of the rows for testing. Stratifying on y keeps the grade
# proportions alike in both partitions; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
for split_name, split_target in (("Train", y_train), ("Test", y_test)):
    print(f"{split_name} target distribution:\n", split_target.value_counts(normalize=True).sort_index())


In [None]:
# Build a logistic regression pipeline
# Step "scaler": standardise the features.
# Step "logreg": multiclass logistic regression; max_iter is raised to 1000
# to give the lbfgs solver room to converge.
scaling_step = ("scaler", StandardScaler())
classifier_step = ("logreg", LogisticRegression(solver="lbfgs", max_iter=1000))
log_reg_clf = Pipeline(steps=[scaling_step, classifier_step])

# Train
log_reg_clf.fit(X_train, y_train)


In [None]:
# Evaluate the model on the held-out test set

# Predict the encoded grade for every test student
y_pred = log_reg_clf.predict(X_test)

# accuracy_score and classification_report are already imported in the
# notebook's sklearn import cell, so no import is repeated here.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report (numeric grades):")
print(classification_report(y_test, y_pred))


In [None]:
# Confusion matrix (numeric)

# Classes that actually occur in the true or predicted labels (sorted, unique)
present_classes = np.unique(np.concatenate([y_test, y_pred]))

# Compute the multiclass matrix in this cell. It was previously read from a
# variable `cm` that no earlier cell defines, which fails with a NameError on
# Restart & Run All. Passing labels=present_classes keeps the rows/columns in
# the same order as the tick labels given to the display.
cm_numeric = confusion_matrix(y_test, y_pred, labels=present_classes)

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm_numeric,
    display_labels=present_classes,
)

disp.plot(
    cmap="Blues",
    xticks_rotation=45,
)
plt.title("Confusion matrix for A-level Maths (numeric encoding)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
# Map numeric codes back to grade labels for interpretability
# Invert the grade -> number lookup
num_to_grade = {num: grade for grade, num in grade_to_num.items()}

y_test_labels = y_test.map(num_to_grade)
# Re-use the test index so predictions stay aligned with the true labels
y_pred_series = pd.Series(y_pred, index=y_test.index)
y_pred_labels = y_pred_series.map(num_to_grade)

print("\nClassification report (grade labels):")
print(classification_report(y_test_labels, y_pred_labels))

# Fix the row/column order to grade_order so the matrix reads from U up to A*
cm_labels = confusion_matrix(y_test_labels, y_pred_labels, labels=grade_order)
disp_labels = ConfusionMatrixDisplay(confusion_matrix=cm_labels, display_labels=grade_order)

disp_labels.plot(cmap="Blues", xticks_rotation=45)
plt.title("Confusion matrix for A-level Maths (grade labels)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
# Confusion matrix + heatmap
# Collapse the numeric grades into a binary outcome: Fail = 0, Pass = 1.
# grade_to_num encodes U=0, E=1, D=2, C=3, B=4, A=5, A*=6, so the threshold
# below counts grade C or better as a pass.
pass_threshold = grade_to_num["C"]

y_test_binary = (y_test >= pass_threshold).astype(int)
y_pred_binary = (y_pred >= pass_threshold).astype(int)

# Confusion matrix (binary). labels=[0, 1] forces a 2x2 result even when one
# class is missing from both vectors, so the fixed tick labels below always
# line up with the cells.
cm = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1])

plt.figure(figsize=(4, 3))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Pred fail', 'Pred pass'],
    yticklabels=['True fail', 'True pass'],
)
plt.ylabel('True')
plt.xlabel('Predicted')
plt.title('Confusion Matrix – Logistic Regression (Math pass)')
plt.show()


In [None]:
# Serialise the fitted pipeline (scaler + classifier) to disk with joblib
model_path = "uk_attainment_logreg_math.pkl"
joblib.dump(value=log_reg_clf, filename=model_path)

print(f"Model saved to {model_path}")

In [None]:
# Load and use later (deployment-style usage)
# Later, in a different script / environment:
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
loaded_model = joblib.load(model_path)

# Example: predict for some new students.
# The frame holds exactly the columns the pipeline was fitted on. An extra
# "GCE_A_grade_num" column was never a training feature and makes predict()
# reject the input, so it is not included.
new_students = pd.DataFrame(
    {
        "SATS_score": [105, 115],
        "GCSE_grade_num": [6, 8],
        "GCE_AS_grade_num": [4, 5],  # B, A
    }
)

# Select by feature_cols so the column order always matches the training data
pred_nums = loaded_model.predict(new_students[feature_cols])
pred_grades = pd.Series(pred_nums).map(num_to_grade)

print("Predicted A-level Maths grades:", list(pred_grades))