In [1]:
# Installs
%pip install pandas
%pip install scikit-learn
%pip install seaborn
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [4]:
# For reproducibility (change or remove if you want different random data each run)
np.random.seed(42)

n_rows = 10000

# -------------------------------
# 1. Generate SATS with a bell curve
# -------------------------------
# Approximate: SATS ~ Normal(mean=104, sd=10), clipped to [80, 120]
sats_scores = np.random.normal(loc=104, scale=10, size=n_rows)
sats_scores = np.clip(sats_scores, 80, 120).round().astype(int)

# -------------------------------
# 2. Define grade options per band
# -------------------------------

# GCSE numeric grades
gcse_all = list(range(1, 9))  # 1–9

# AS level grades (A–U, including E)
as_all = ['A', 'B', 'C', 'D', 'E', 'U']
# A level grades (A*–U, including E)
a_all = ['A*', 'A', 'B', 'C', 'D', 'E', 'U']

# Band-specific ranges (inclusive)
def gcse_range_for_sats(s):
    if 80 <= s <= 89:
        return list(range(1, 4))          # 1–3
    elif 90 <= s <= 99:
        return list(range(1, 5))          # 1–4
    elif 100 <= s <= 109:
        return list(range(1, 6))          # 1–5
    elif 110 <= s <= 114:
        return list(range(1, 8))          # 1–7
    elif 115 <= s <= 120:
        return list(range(1, 10))         # 1–9
    else:
        # Shouldn't happen due to clipping, but be safe
        return list(range(1, 10))

def as_range_for_sats(s):
    if 80 <= s <= 89:
        return ['D', 'E', 'U']
    elif 90 <= s <= 99:
        return ['C', 'D', 'E', 'U']
    elif 100 <= s <= 109:
        return ['B', 'C', 'D']
    elif 110 <= s <= 114:
        return ['A', 'B', 'C']
    elif 115 <= s <= 120:
        return ['A*', 'A', 'B']
    else:
        return as_all

def a_level_range_for_sats(s):
    if 80 <= s <= 89:
        return ['D', 'E', 'U']
    elif 90 <= s <= 99:
        return ['C', 'D', 'E', 'U']
    elif 100 <= s <= 109:
        return ['B', 'C', 'D']
    elif 110 <= s <= 114:
        return ['A', 'B', 'C']
    elif 115 <= s <= 120:
        # You specified A–C here
        return ['A', 'B', 'C']
    else:
        return a_all

# -------------------------------
# 3. Generate grades row-by-row with 5% off-band exceptions
# -------------------------------

gcse_grades = []
as_grades = []
a_level_grades = []

# Probability of "one or 2 grades higher or lower than suggested restrictions"
off_band_prob = 0.10

def expand_gcse_range(base_range, step=2):
    low = max(1, min(base_range) - step)
    high = min(9, max(base_range) + step)
    return list(range(low, high + 1))

def expand_as_range(base_range):
    order = ['A*', 'A', 'B', 'C', 'D', 'E', 'U']
    idxs = [order.index(g) for g in base_range]
    low = max(0, min(idxs) - 2)
    high = min(len(order) - 1, max(idxs) + 2)
    return order[low:high + 1]

def expand_a_level_range(base_range):
    order = ['A*', 'A', 'B', 'C', 'D', 'E', 'U']
    idxs = [order.index(g) for g in base_range]
    low = max(0, min(idxs) - 2)
    high = min(len(order) - 1, max(idxs) + 2)
    return order[low:high + 1]

# -------------------------------
# 4. Add random unique reference numbers
# -------------------------------
# Use a large range and sample without replacement
ref_numbers = np.random.choice(
    np.arange(10_000_000, 99_999_999), size=n_rows, replace=False
)

# -------------------------------
# 5. Build DataFrame and save to CSV
# -------------------------------
df = pd.DataFrame({
    'Ref_ID': ref_numbers,
    'SATS': sats_scores,
    'GCSE': gcse_grades,
    'GCE_AS': as_grades,
    'GCE_A_level': a_level_grades
})

# Ensure a nice, simple CSV with no index
output_path = "synthetic_uk_attainment_10000.csv"
df.to_csv(output_path, index=False)

print(f"Saved {n_rows} rows to {output_path}")
print(df.head())


ValueError: All arrays must be of the same length