In [None]:
# Installs
%pip install pandas
%pip install scikit-learn
%pip install seaborn
%pip install matplotlib

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [None]:
# Seed NumPy's global RNG so every run of this cell produces the same dataset
np.random.seed(42)

# Number of synthetic student records to generate
n_rows = 10000

# -----------------------
# 1. Helper definitions
# -----------------------

# GCSE grades are numeric, ordered from 1 (lowest) to 9 (highest)
GCSE_SCALE = [grade for grade in range(1, 10)]

# GCE AS grades, ordered lowest to highest; AS tops out at A (there is no A*)
AS_SCALE = ['U', 'E', 'D', 'C', 'B', 'A']

# GCE A level grades: the AS ladder plus A* on top
A_SCALE = [*AS_SCALE, 'A*']

# Grade label -> number, used when building the composite maths score.
# GCSE labels are the digit strings '1'..'9' mapped to their own value;
# letter grades map to their position on the A level ladder (U=0 ... A*=6).
GRADE_TO_NUM = {str(grade): grade for grade in GCSE_SCALE}
GRADE_TO_NUM.update((letter, rank) for rank, letter in enumerate(A_SCALE))

# -----------------------
# 2. Generate SATS scores (bell curve)
# -----------------------

# Centre and spread of the normal distribution the raw scores are drawn from
sats_mean = 105
sats_std = 8

# Draw n_rows scores, round to whole numbers, clamp to 80–120 and store as ints.
# NOTE(review): the lower clamp (80) is below the lowest SATS band defined in
# section 3 (90) — confirm that the band lookup copes with scores of 80–89.
sats = (
    np.random.normal(sats_mean, sats_std, n_rows)
    .round()
    .clip(80, 120)
    .astype(int)
)

# -----------------------
# 3. SATS bands and allowed grade ranges
# -----------------------

def get_indices(scale, min_grade, max_grade):
    """Return the grades of ``scale`` from ``min_grade`` to ``max_grade``, inclusive.

    Despite the name, the result is a list of grades (a slice of ``scale``),
    not a list of positions.

    Args:
        scale: Ordered list of grades, lowest first.
        min_grade: Grade at which the slice starts.
        max_grade: Grade at which the slice ends (included in the result).

    Returns:
        A new list holding the contiguous run of grades. It is empty when
        ``max_grade`` sits before ``min_grade`` in ``scale``.

    Raises:
        ValueError: If either grade is not present in ``scale``.
    """
    first, last = (scale.index(grade) for grade in (min_grade, max_grade))
    return scale[first:last + 1]

# One entry per SATS score band. Each table row below is
#   (min, max, gcse_min, gcse_max, as_min, as_max, a_min, a_max)
# where min/max are the inclusive SATS score limits of the band and the
# remaining pairs are the lowest/highest grade allowed for that band.
SATS_BANDS = [
    dict(zip(
        ("min", "max", "gcse_min", "gcse_max", "as_min", "as_max", "a_min", "a_max"),
        band_row,
    ))
    for band_row in (
        (90, 99, 3, 5, 'D', 'B', 'D', 'B'),      # 90–99:   GCSE 3–5, AS D–B, A level D–B
        (100, 109, 4, 7, 'C', 'A', 'C', 'A'),    # 100–109: GCSE 4–7, AS C–A, A level C–A
        (110, 120, 6, 9, 'B', 'A', 'C', 'A'),    # 110–120: GCSE 6–9, AS B–A, A level C–A
    )
]

# -----------------------
# 4. Base grade assignment
# -----------------------

def _band_for_score(score, bands):
    """Return the band that covers ``score``, or the nearest band if none does.

    SATS scores are clipped to 80–120 when they are generated, but the bands
    in ``SATS_BANDS`` start at 90. A bare ``next(...)`` over the matching bands
    therefore raises ``StopIteration`` for any score of 80–89. Instead of
    crashing, an uncovered score reuses the band whose limits are closest to it
    (so 80–89 gets the 90–99 ranges).

    NOTE(review): add an explicit band to ``SATS_BANDS`` if scores below 90
    should get their own grade ranges.

    Args:
        score: A single SATS score.
        bands: Non-empty list of band dicts with inclusive "min"/"max" keys.

    Returns:
        The matching (or nearest) band dict.
    """
    for band in bands:
        if band["min"] <= score <= band["max"]:
            return band
    # No band covers the score: pick the one with the closest limit
    return min(
        bands,
        key=lambda band: min(abs(score - band["min"]), abs(score - band["max"])),
    )


gcse_grades = []
as_grades = []
a_grades = []

for score in sats:
    band = _band_for_score(score, SATS_BANDS)

    # Inclusive grade ranges permitted for this student's band
    gcse_allowed = list(range(band["gcse_min"], band["gcse_max"] + 1))
    as_allowed = get_indices(AS_SCALE, band["as_min"], band["as_max"])
    a_allowed = get_indices(A_SCALE, band["a_min"], band["a_max"])

    # One uniform draw per qualification, always in the order GCSE, AS, A level,
    # so the seeded random stream is consumed the same way on every run
    gcse_grades.append(np.random.choice(gcse_allowed))
    as_grades.append(np.random.choice(as_allowed))
    a_grades.append(np.random.choice(a_allowed))

# Letter grades are kept as object arrays so 'A' and 'A*' can share a column
gcse_grades = np.array(gcse_grades)
as_grades = np.array(as_grades, dtype=object)
a_grades = np.array(a_grades, dtype=object)

# -----------------------
# 5. Apply 5% exceptions per band
# -----------------------

def apply_exceptions(sats, gcse, asg, ag, bands, exception_rate=0.05):
    """Move a random subset of students in each band off their base grades.

    For every band, the students whose SATS score lies inside the band are
    collected and ``max(1, int(count * exception_rate))`` of them are sampled
    without replacement. Each sampled student gets one random offset
    (direction -1 or +1, size 1 or 2) which is applied to all three grades:
    the GCSE grade (clipped to 1–9) and the positions of the AS and A level
    grades on ``AS_SCALE`` / ``A_SCALE`` (clipped to the ends of each scale).

    Args:
        sats: Array of SATS scores, one per student.
        gcse: Array of numeric GCSE grades.
        asg: Array of AS letter grades.
        ag: Array of A level letter grades.
        bands: Iterable of band dicts with inclusive "min"/"max" score limits.
        exception_rate: Fraction of each band to perturb.

    Returns:
        Tuple ``(gcse, asg, ag)`` of modified copies; the inputs are untouched.
    """
    gcse_out = gcse.copy()
    as_out = asg.copy()
    a_out = ag.copy()

    for band in bands:
        in_band = (sats >= band["min"]) & (sats <= band["max"])
        members = np.where(in_band)[0]
        if len(members) == 0:
            continue

        # A non-empty band always gets at least one exception
        n_exceptions = max(1, int(len(members) * exception_rate))
        chosen = np.random.choice(members, size=n_exceptions, replace=False)

        for row in chosen:
            # Draw order (direction, then step) fixes the seeded random stream
            direction = np.random.choice([-1, 1])
            step = np.random.choice([1, 2])
            shift = direction * step

            # GCSE: shift the numeric grade itself
            gcse_out[row] = int(np.clip(gcse_out[row] + shift, 1, 9))

            # AS: shift the position on the AS ladder
            as_pos = AS_SCALE.index(as_out[row]) + shift
            as_out[row] = AS_SCALE[int(np.clip(as_pos, 0, len(AS_SCALE) - 1))]

            # A level: shift the position on the A level ladder
            a_pos = A_SCALE.index(a_out[row]) + shift
            a_out[row] = A_SCALE[int(np.clip(a_pos, 0, len(A_SCALE) - 1))]

    return gcse_out, as_out, a_out

# Swap the base grades for copies carrying the per-band exceptions (default rate)
gcse_grades, as_grades, a_grades = apply_exceptions(
    sats=sats,
    gcse=gcse_grades,
    asg=as_grades,
    ag=a_grades,
    bands=SATS_BANDS,
)

# -----------------------
# 6. Derive A-level Maths (rule-based)
# -----------------------

def sats_to_numeric(s):
    """Linearly rescale a SATS score so that 80 maps to 0 and 120 maps to 6."""
    points_above_floor = s - 80
    return points_above_floor / 40 * 6.0

def gcse_to_numeric(g):
    """Return the GCSE grade unchanged.

    GCSE grades are already numbers, so no conversion is applied; the function
    exists so GCSE sits alongside the other ``*_to_numeric`` converters.
    """
    return g

def letter_to_numeric(letter):
    """Look up the numeric value of a grade label in ``GRADE_TO_NUM``.

    Raises:
        KeyError: If ``letter`` is not a key of ``GRADE_TO_NUM``.
    """
    numeric_value = GRADE_TO_NUM[letter]
    return numeric_value

def composite_to_alevel_grade(c):
    """Convert a composite score into an A level letter grade.

    The score is compared against ascending, exclusive upper bounds; the first
    bound it falls below decides the grade. Anything at or above 5.7 (or any
    value that compares below none of the bounds) is graded 'A*'.
    """
    # (exclusive upper bound, grade), lowest grade first
    cutoffs = (
        (1, 'U'),
        (2, 'E'),
        (3, 'D'),
        (4, 'C'),
        (5, 'B'),
        (5.7, 'A'),
    )
    for upper_bound, grade in cutoffs:
        if c < upper_bound:
            return grade
    return 'A*'

# Composite score per student: 25% SATS (rescaled to 0–6), 30% GCSE (1–9
# rescaled by /9*6), 15% AS and 30% A level (letter grades via GRADE_TO_NUM),
# then mapped onto an A level letter grade.
maths_grades = np.array(
    [
        composite_to_alevel_grade(
            0.25 * sats_to_numeric(sats_score)
            + 0.30 * (gcse_to_numeric(gcse_value) / 9 * 6)
            + 0.15 * letter_to_numeric(as_letter)
            + 0.30 * letter_to_numeric(a_letter)
        )
        for sats_score, gcse_value, as_letter, a_letter in zip(
            sats, gcse_grades, as_grades, a_grades
        )
    ],
    dtype=object,
)

# -----------------------
# 7. Unique reference numbers
# -----------------------

# Consecutive IDs starting at 1_000_000 (one per row), shuffled so that the
# ID order says nothing about the row order
refs = np.random.permutation(np.arange(n_rows) + 1_000_000)

# -----------------------
# 8. Build DataFrame and export CSV
# -----------------------

# Build the file name from n_rows so the label always matches the row count,
# and keep it in one variable so the path written and the path reported agree.
output_csv = f"synthetic_uk_attainment_{n_rows}_1.csv"

# One row per synthetic student; the column order here is the CSV column order
df = pd.DataFrame({
    "ref_id": refs,
    "SATS_score": sats,
    "GCSE_grade": gcse_grades,
    "GCE_AS_grade": as_grades,
    "GCE_A_grade": a_grades,
    "GCE_A_Maths_grade": maths_grades
})

# Cheap sanity checks before anything is written to disk
assert len(df) == n_rows, "row count does not match n_rows"
assert df["ref_id"].is_unique, "ref_id values must be unique"

# index=False: the positional index carries no information, ref_id is the key
df.to_csv(output_csv, index=False)

print(df.head())
print(f"\nSaved to {output_csv}")
