In [1]:
# Install the third-party packages this notebook imports (versions are not pinned)
%pip install pandas
%pip install scikit-learn
%pip install seaborn
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [4]:
# -----------------------
# 1. Helper definitions
# -----------------------

# Number of synthetic pupils (rows) to generate.
n_rows = 10_000

# Grade scales, each ordered from lowest to highest attainment.
GCSE_SCALE = [1, 2, 3, 4, 5, 6, 7, 8, 9]
AS_SCALE = list("UEDCBA")
A_SCALE = AS_SCALE + ['A*']

# GCSE lookup: the grade written as a string -> the same grade as an int.
GCSE_TO_NUM = {str(grade): grade for grade in GCSE_SCALE}

# AS-level lookup: each letter maps to its position in AS_SCALE (U=0 ... A=5).
AS_TO_NUM = {letter: rank for rank, letter in enumerate(AS_SCALE)}

# A-level lookup onto a 0-9 scale.
# NOTE(review): 'D', 'B' and 'A' map to range objects, not numbers, and
# range(2, 3) holds only the value 2 (likewise 5 and 7). Confirm whether plain
# numbers or 2-3 / 5-6 / 7-8 bands were intended before using this mapping.
ALEVEL_TO_NUM = {
    'U': 0,
    'E': 1,
    'D': range(2, 3),
    'C': 4,
    'B': range(5, 6),
    'A': range(7, 8),
    'A*': 9,
}

# -----------------------
# 2. Generate SATS scores
# -----------------------

# Seed NumPy's global random generator once, before the first draw, so that
# this and every later np.random.* call in the notebook produces the same
# data on each Restart & Run All. Change RANDOM_SEED to get a different sample.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

sats_mean = 105
sats_std = 8

# Draw one score per pupil from Normal(sats_mean, sats_std), round to whole
# marks, then clamp into 90-120. Clamping (rather than re-drawing) moves
# every out-of-range draw onto the boundary values 90 and 120.
sats = np.random.normal(loc=sats_mean, scale=sats_std, size=n_rows)
sats = np.clip(np.round(sats), 90, 120).astype(int)

# -----------------------
# 3. SATS bands
# -----------------------

def get_indices(scale, min_grade, max_grade):
    """Return the grades of ``scale`` from ``min_grade`` to ``max_grade`` inclusive.

    Despite the name, the result is a slice of ``scale`` (the grades
    themselves), not their positions. If ``min_grade`` comes after
    ``max_grade`` in ``scale`` the slice is empty. ``scale.index`` raises
    ``ValueError`` when either grade is not present in ``scale``.
    """
    first = scale.index(min_grade)
    stop = scale.index(max_grade) + 1  # +1 makes the upper bound inclusive
    return scale[first:stop]

# Each band ties an inclusive SATS score range ("min".."max") to the inclusive
# GCSE (numeric) and AS (letter) grade ranges a pupil in that band may receive.
SATS_BANDS = [
    {
        "min": 90, "max": 99,
        "gcse_min": 3, "gcse_max": 5, "as_min": 'D', "as_max": 'B',
    },
    {
        "min": 100, "max": 109,
        "gcse_min": 4, "gcse_max": 7, "as_min": 'C', "as_max": 'A',
    },
    {
        "min": 110, "max": 120,
        "gcse_min": 6, "gcse_max": 9, "as_min": 'B', "as_max": 'A',
    },
]

# -----------------------
# 4. Base grade assignment
# -----------------------

# Work out each band's candidate grades once rather than once per pupil.
band_options = [
    (
        band,
        list(range(band["gcse_min"], band["gcse_max"] + 1)),
        get_indices(AS_SCALE, band["as_min"], band["as_max"]),
    )
    for band in SATS_BANDS
]

gcse_drawn = []
as_drawn = []

for score in sats:
    # Use the first band whose inclusive range contains the score; next()
    # raises StopIteration if no band matches.
    _, gcse_options, as_options = next(
        entry for entry in band_options
        if entry[0]["min"] <= score <= entry[0]["max"]
    )
    # Draw uniformly from the allowed grades: GCSE first, then AS.
    gcse_drawn.append(np.random.choice(gcse_options))
    as_drawn.append(np.random.choice(as_options))

gcse_grades = np.array(gcse_drawn)
as_grades = np.array(as_drawn, dtype=object)

# -----------------------
# 5. Apply 5% exceptions
# -----------------------

def apply_exceptions(sats, gcse, asg, bands, exception_rate=0.05, as_scale=None):
    """Shift the grades of a random sample of pupils in every SATS band.

    For each band, a fraction ``exception_rate`` of the pupils whose SATS
    score lies inside the band is sampled without replacement. Each sampled
    pupil has the GCSE grade and the AS grade moved by the same number of
    steps (1 or 2) in the same direction (up or down). Results are clamped
    to 1-9 for GCSE and to the ends of the AS scale, so a pupil already at a
    boundary can end up unchanged.

    Parameters
    ----------
    sats : array-like of numbers
        SATS score per pupil; compared against each band's "min"/"max".
    gcse : array of int
        GCSE grade per pupil, same length and order as ``sats``.
    asg : array of str
        AS letter grade per pupil, same length and order as ``sats``.
    bands : iterable of dict
        Each dict needs inclusive "min" and "max" SATS bounds.
    exception_rate : float, default 0.05
        Fraction of each band to adjust, in [0, 1]. A rate of 0 adjusts
        nobody; any positive rate adjusts at least one pupil per non-empty
        band.
    as_scale : sequence of str, optional
        AS letters ordered lowest to highest. Defaults to the module-level
        ``AS_SCALE``.

    Returns
    -------
    tuple
        ``(gcse, asg)`` as adjusted copies; the inputs are not modified.

    Raises
    ------
    ValueError
        If ``exception_rate`` is outside [0, 1], or if an AS grade of a
        sampled pupil is not present in the scale.
    """
    if not 0 <= exception_rate <= 1:
        raise ValueError(
            f"exception_rate must be between 0 and 1, got {exception_rate!r}"
        )

    scale = AS_SCALE if as_scale is None else list(as_scale)
    sats = np.asarray(sats)
    gcse = gcse.copy()
    asg = asg.copy()

    # A zero rate means "no exceptions": return untouched copies instead of
    # letting the at-least-one rule below force an adjustment.
    if exception_rate == 0:
        return gcse, asg

    top_pos = len(scale) - 1

    for band in bands:
        idx = np.where((sats >= band["min"]) & (sats <= band["max"]))[0]
        if len(idx) == 0:
            continue

        # At least one exception per non-empty band, even for tiny bands.
        n_exc = max(1, int(len(idx) * exception_rate))
        exc_idx = np.random.choice(idx, size=n_exc, replace=False)

        for i in exc_idx:
            direction = np.random.choice([-1, 1])
            step = np.random.choice([1, 2])
            shift = direction * step

            # GCSE grades are numeric: shift and clamp to the 1-9 scale.
            gcse[i] = int(np.clip(gcse[i] + shift, 1, 9))

            # AS grades are letters: shift the position within the scale.
            pos_as = scale.index(asg[i])
            new_pos_as = int(np.clip(pos_as + shift, 0, top_pos))
            asg[i] = scale[new_pos_as]

    return gcse, asg

# Replace the base grades with copies in which a sample of pupils per SATS
# band has been shifted, using the function's default exception rate.
gcse_grades, as_grades = apply_exceptions(sats, gcse_grades, as_grades, bands=SATS_BANDS)

# -----------------------
# 6. Derive A-level Maths (rule-based)
# -----------------------

def sats_to_numeric(s):
    """Rescale a SATS score linearly so that 90 maps to 0 and 130 would map to 9.

    Over the 90-120 range used in this notebook the result runs from 0 to
    6.75 (not 0-6).
    """
    points_above_floor = s - 90
    return points_above_floor / 40 * 9.0

def gcse_to_numeric(g):
    """Return the GCSE grade unchanged; it is used directly as its numeric value."""
    numeric_grade = g
    return numeric_grade

def as_to_numeric(letter):
    """Look up the numeric score of an AS-level letter grade in ``AS_TO_NUM``.

    Raises ``KeyError`` when ``letter`` is not a key of ``AS_TO_NUM``.
    """
    score = AS_TO_NUM[letter]
    return score

def composite_to_alevel_grade(c):
    """Convert a composite attainment score into an A-level letter grade.

    Each grade covers a half-open interval of the score: below 0.5 is 'U',
    [0.5, 1.5) 'E', [1.5, 2.5) 'D', [2.5, 4.0) 'C', [4.0, 5.5) 'B',
    [5.5, 7.0) 'A', and anything not below 7.0 is 'A*'.
    """
    # (exclusive upper bound, grade) pairs, checked from lowest to highest.
    cutoffs = (
        (0.5, 'U'),
        (1.5, 'E'),
        (2.5, 'D'),
        (4.0, 'C'),
        (5.5, 'B'),
        (7.0, 'A'),
    )
    for upper_bound, grade in cutoffs:
        if c < upper_bound:
            return grade
    # No cut-off exceeded the score, so it earns the top grade.
    return 'A*'

# Blend the three attainment signals into one composite score per pupil and
# bucket it into an A-level grade. The GCSE grade is scaled by 6/9 before
# weighting.
# NOTE(review): the weights 0.30 + 0.50 + 0.40 sum to 1.2, not 1.0 - confirm
# that this is intended.
maths_grades = np.array(
    [
        composite_to_alevel_grade(
            0.30 * sats_to_numeric(sats_score)
            + 0.50 * (gcse_to_numeric(gcse_grade) / 9 * 6)
            + 0.40 * as_to_numeric(as_grade)
        )
        for sats_score, gcse_grade, as_grade in zip(sats, gcse_grades, as_grades)
    ],
    dtype=object,
)

# -----------------------
# 7. Unique reference numbers
# -----------------------

# Shuffle a block of n_rows consecutive ids starting at 1,000,000 so that
# every row gets a distinct reference in random order.
first_ref = 1_000_000
refs = np.random.permutation(np.arange(first_ref, first_ref + n_rows))

# -----------------------
# 8. Build DataFrame
# -----------------------

# One column per generated array; all arrays hold one entry per pupil.
columns = {
    "ref_id": refs,
    "SATS_score": sats,
    "GCSE_grade": gcse_grades,
    "GCE_AS_grade": as_grades,
    "Alevel_Maths_grade": maths_grades,
}
df = pd.DataFrame(columns)

# Write the table to disk without the pandas row index.
df.to_csv("synthetic_uk_attainment_10000_clean_1.csv", index=False)

# Show a preview, confirm the save, and report each A-level grade's share.
preview = df.head()
print(preview)
print("\nSaved to synthetic_uk_attainment_10000_clean_1.csv")
grade_share = df['Alevel_Maths_grade'].value_counts(normalize=True)
print(grade_share)

    ref_id  SATS_score  GCSE_grade GCE_AS_grade Alevel_Maths_grade
0  1002670         120           6            A                  A
1  1004752         108           5            B                  B
2  1002953         105           7            B                  B
3  1007545          93           4            D                  D
4  1005538         113           6            A                  A

Saved to synthetic_uk_attainment_10000_clean_1.csv
Alevel_Maths_grade
B     0.4113
C     0.2925
A     0.2320
D     0.0588
A*    0.0037
E     0.0015
U     0.0002
Name: proportion, dtype: float64
