In [None]:
# Installs
%pip install pandas
%pip install scikit-learn
%pip install seaborn
%pip install matplotlib
%pip install q
%pip install joblib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import joblib

# Apply seaborn's global plot theme for every figure in this notebook.
# set_theme() is the current name of this call; sns.set() is only an alias for it.
sns.set_theme(style="whitegrid", context="notebook")

In [None]:
# Goal: predict the GCE A-Level Maths grade from prior attainment.

# Location of the synthetic attainment CSV (relative path).
file_path = "Data/synthetic_uk_attainment_10000_1.csv"

# Read the file into a DataFrame and preview its first rows.
df = pd.read_csv(file_path)
df.head()

In [None]:
# ref_id               int
# SATS_score           int
# GCSE_grade           int (1–9)
# GCE_AS_grade         object (A, B, C, D, E, U)
# GCE_A_grade          object (A*, A, B, C, D, E, U)
# GCE_A_Maths_grade    object (A*, A, B, C, D, E, U)


In [None]:
# Data exploration
# Print a concise summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for every column: include="all" covers the non-numeric
# columns as well as the numeric ones.
df.describe(include="all")

In [None]:
# Basic EDA: how are SATS scores distributed?
# Histogram (20 bins) with a KDE curve overlaid, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df["SATS_score"], bins=20, kde=True, ax=ax)
ax.set_title("Distribution of SATS scores")
ax.set_xlabel("SATS_score")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Distribution of each grade column (one count plot per column)
grade_cols = ["GCSE_grade", "GCE_AS_grade", "GCE_A_grade", "GCE_A_Maths_grade"]

# Letter grades from worst to best. A plain alphabetical sort would place "A*"
# after "A", which misorders the x-axis for an ordinal scale.
letter_grade_order = ["U", "E", "D", "C", "B", "A", "A*"]

for col in grade_cols:
    # Drop missing values first: NaN mixed with strings makes sorted() raise TypeError.
    observed = df[col].dropna().unique()

    if pd.api.types.is_numeric_dtype(df[col]):
        # Numeric grades sort naturally.
        order = sorted(observed)
    else:
        # Known letter grades in ordinal order, then any unexpected labels
        # (sorted by their string form) so nothing is silently dropped from the plot.
        seen = set(observed)
        order = [grade for grade in letter_grade_order if grade in seen]
        order += sorted(seen.difference(letter_grade_order), key=str)

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=df[col], order=order, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Ordinal-encode the letter-grade columns so that a larger number means a better grade.
# The scale is listed worst -> best; a grade's position in the list is its encoded value.
grade_order = [["U", "E", "D", "C", "B", "A", "A*"]]

# All grade columns (used for the preview at the end of this cell).
grade_cols = ["GCSE_grade", "GCE_AS_grade", "GCE_A_grade", "GCE_A_Maths_grade"]

# Encode only the non-numeric columns. GCSE_grade is documented in this notebook's
# schema notes as an int (1–9), so its values are not on the letter scale and the
# encoder would reject them. Skipping numeric columns also makes the cell safe to
# re-run after the letter columns have already been converted.
letter_grade_cols = [
    col for col in grade_cols if not pd.api.types.is_numeric_dtype(df[col])
]

# OrdinalEncoder expects one category list per fitted column, so repeat the shared
# scale once for each column being encoded.
encoder = OrdinalEncoder(categories=grade_order * len(letter_grade_cols))

# Fit and transform (skipped when there is nothing left to encode)
if letter_grade_cols:
    df[letter_grade_cols] = encoder.fit_transform(df[letter_grade_cols])

df[grade_cols].head()