In [1]:
# Installs: third-party packages this notebook relies on, installed with the
# %pip magic. No versions are pinned. numpy is imported further down but is
# not listed here -- presumably it arrives as a dependency of the packages
# below (TODO confirm, or add it explicitly).
%pip install pandas
%pip install scikit-learn
%pip install seaborn
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [4]:
# Single seeded generator shared by every sampling step in this notebook, so
# the generated dataset is the same on each fresh run.
rng = np.random.default_rng(seed=42)

# Number of synthetic rows to generate (10_000 was used for quicker trial runs).
n_rows = 200_000

# ---------------------------------------------------------
# 1. GRADE SCALES (UPDATED)
# ---------------------------------------------------------
# Each scale is ordered from the lowest grade to the highest.

gcse_grades = list(range(1, 10))  # GCSE: numeric grades 1..9

gce_AS_grades = list("UEDCBA")           # AS levels: U, E, D, C, B, A
gce_A_grades = gce_AS_grades + ["A*"]    # A levels: same ladder plus A* (separate list object)

# Grade -> position within its scale (0 = lowest grade).
gcse_index = dict(zip(gcse_grades, range(len(gcse_grades))))
gce_AS_index = dict(zip(gce_AS_grades, range(len(gce_AS_grades))))
gce_A_index = dict(zip(gce_A_grades, range(len(gce_A_grades))))


# ---------------------------------------------------------
# 2. HELPER FUNCTIONS
# ---------------------------------------------------------

def sample_from_range(values, low, high, size=1, generator=None):
    """
    Draw `size` grades uniformly at random from the slice of `values`
    between `low` and `high`, both inclusive.

    Parameters
    ----------
    values : list
        Grade scale ordered from lowest to highest.
    low, high : grade
        Inclusive bounds. When the scale's first entry is an int the bounds
        are compared numerically; otherwise they are compared by their
        position within `values`.
    size : int, default 1
        Number of grades to draw.
    generator : numpy.random.Generator, optional
        Source of randomness. Defaults to the notebook-level `rng`, so
        existing calls keep drawing from the shared seeded stream.

    Returns
    -------
    numpy.ndarray
        The sampled grades.

    Raises
    ------
    KeyError
        If a positional bound is not a member of `values`.
    """
    if isinstance(values[0], int):
        allowed = [g for g in values if low <= g <= high]
    else:
        # Positions are derived from `values` itself rather than from the
        # module-level AS/A index maps, so any ordered scale (including a
        # copy of a built-in one) is handled correctly.
        position_of = {g: i for i, g in enumerate(values)}
        li, hi = position_of[low], position_of[high]
        allowed = values[li:hi + 1]

    gen = rng if generator is None else generator
    return gen.choice(allowed, size=size)


def apply_exception(grade, values, max_shift=2, generator=None):
    """
    Move `grade` up or down its scale by a random 1..`max_shift` steps.

    The direction (down or up) and the step count are each drawn at random.
    The result is clamped to the ends of the scale, so a grade at or near an
    extreme can come back unchanged.

    Parameters
    ----------
    grade : grade
        Starting grade; must be a member of `values`.
    values : list
        Grade scale ordered from lowest to highest.
    max_shift : int, default 2
        Largest number of steps the grade may move.
    generator : numpy.random.Generator, optional
        Source of randomness. Defaults to the notebook-level `rng`, so
        existing calls keep drawing from the shared seeded stream.

    Returns
    -------
    grade
        An element of `values`.

    Raises
    ------
    KeyError
        If `grade` is not a member of `values`.
    """
    # Position is looked up in `values` itself rather than in the
    # module-level index maps, so any ordered scale works.
    idx = {g: i for i, g in enumerate(values)}[grade]

    gen = rng if generator is None else generator
    # Draw order (direction first, then shift) is kept as-is so a given seed
    # keeps producing the same data.
    direction = gen.choice([-1, 1])
    shift = gen.integers(1, max_shift + 1)
    new_idx = max(0, min(idx + direction * shift, len(values) - 1))
    return values[new_idx]


# ---------------------------------------------------------
# 3. GENERATE SATS SCORES (BELL CURVE)
# ---------------------------------------------------------

# One normally distributed draw per row (mean 105, standard deviation 8),
# rounded to whole numbers and then pinned into the 90..120 window.
sats_raw = rng.normal(105, 8, n_rows)
sats_scores = sats_raw.round().clip(90, 120).astype(int)


# ---------------------------------------------------------
# 4. PREPARE OUTPUT ARRAYS
# ---------------------------------------------------------

# Pre-filled with sentinels instead of np.empty, whose integer output is
# arbitrary leftover memory: if the band table is ever edited so that some
# SATS score matches no band, the affected rows show up as -1 / None rather
# than as plausible-looking garbage grades.
gcse_results = np.full(n_rows, -1, dtype=int)
gce_as_results = np.full(n_rows, None, dtype=object)
gce_a_results = np.full(n_rows, None, dtype=object)


# ---------------------------------------------------------
# 5. SATS BANDS + GRADE RANGES
# ---------------------------------------------------------

def _sats_between(lowest, highest):
    """Build a predicate that is True where lowest <= score <= highest."""
    return lambda s: (s >= lowest) & (s <= highest)


# One row per SATS band. All bounds are inclusive.
# (An earlier 80-89 band and a wider 1-4 / U-C range for 90-99 have been retired.)
_BAND_TABLE = [
    # sats_lo, sats_hi, gcse_lo, gcse_hi, as_lo, as_hi, a_lo, a_hi
    (90, 99, 3, 5, "D", "B", "D", "B"),
    (100, 109, 4, 7, "C", "A", "C", "A"),
    (110, 120, 6, 9, "B", "A", "C", "A*"),
]

bands = [
    {
        "mask": _sats_between(sats_lo, sats_hi),
        "gcse_low": gcse_lo, "gcse_high": gcse_hi,
        "as_low": as_lo, "as_high": as_hi,
        "a_low": a_lo, "a_high": a_hi,
    }
    for sats_lo, sats_hi, gcse_lo, gcse_hi, as_lo, as_hi, a_lo, a_hi in _BAND_TABLE
]

# Share of the rows in each band whose grades are nudged away from the
# band's usual range.
exception_rate = 0.05


# ---------------------------------------------------------
# 6. ASSIGN GRADES PER BAND
# ---------------------------------------------------------

# Fill the three result arrays band by band. Rows whose SATS score matches no
# band are left untouched by this loop.
# NOTE: every draw below comes from the shared seeded `rng`, so the order of
# these calls determines the generated data; reordering them changes the output.
for band in bands:
    # Row positions whose SATS score falls inside this band.
    mask = band["mask"](sats_scores)
    idx = np.where(mask)[0]
    n_band = len(idx)
    if n_band == 0:
        continue

    # Split the band's rows: exception_rate of them (rounded) are picked
    # without replacement as "exceptions"; the rest are "normal" rows.
    n_exceptions = int(np.round(exception_rate * n_band))
    exception_indices = rng.choice(idx, size=n_exceptions, replace=False) if n_exceptions > 0 else np.array([], dtype=int)
    normal_indices = np.setdiff1d(idx, exception_indices, assume_unique=True)

    # Normal rows: each qualification is sampled independently from the
    # band's inclusive grade range.
    gcse_results[normal_indices] = sample_from_range(
        gcse_grades, band["gcse_low"], band["gcse_high"], size=len(normal_indices)
    )
    gce_as_results[normal_indices] = sample_from_range(
        gce_AS_grades, band["as_low"], band["as_high"], size=len(normal_indices)
    )
    gce_a_results[normal_indices] = sample_from_range(
        gce_A_grades, band["a_low"], band["a_high"], size=len(normal_indices)
    )

    # Exception rows: start from an in-band grade, then shift it with
    # apply_exception (one call per row and per qualification).
    if len(exception_indices) > 0:
        # All three base samples are drawn first ...
        gcse_base = sample_from_range(gcse_grades, band["gcse_low"], band["gcse_high"], size=len(exception_indices))
        as_base = sample_from_range(gce_AS_grades, band["as_low"], band["as_high"], size=len(exception_indices))
        a_base = sample_from_range(gce_A_grades, band["a_low"], band["a_high"], size=len(exception_indices))

        # ... and only then shifted, qualification by qualification.
        gcse_exc = [apply_exception(g, gcse_grades) for g in gcse_base]
        as_exc = [apply_exception(g, gce_AS_grades) for g in as_base]
        a_exc = [apply_exception(g, gce_A_grades) for g in a_base]

        gcse_results[exception_indices] = gcse_exc
        gce_as_results[exception_indices] = as_exc
        gce_a_results[exception_indices] = a_exc


# ---------------------------------------------------------
# 7. UNIQUE REFERENCE IDs
# ---------------------------------------------------------

def generate_unique_ids(n, length=9, generator=None):
    """
    Generate `n` distinct random integers that each have exactly `length`
    decimal digits (no leading zero).

    Candidates are drawn in batches of twice the number still needed;
    duplicates are discarded and the first `n` distinct values are kept in
    the order they were drawn.

    Parameters
    ----------
    n : int
        Number of IDs to produce.
    length : int, default 9
        Number of decimal digits in each ID; must be at least 1.
    generator : numpy.random.Generator, optional
        Source of randomness. Defaults to the notebook-level `rng`, so
        existing calls keep drawing from the shared seeded stream.

    Returns
    -------
    list
        `n` distinct integers (numpy integer scalars) in draw order.

    Raises
    ------
    ValueError
        If `length` is less than 1, or if `n` exceeds the number of distinct
        `length`-digit integers. Without this check the sampling loop would
        never terminate for such a request.
    """
    if length < 1:
        raise ValueError(f"length must be at least 1, got {length}")

    low, high = 10 ** (length - 1), 10 ** length
    capacity = high - low  # how many distinct `length`-digit integers exist
    if n > capacity:
        raise ValueError(
            f"cannot generate {n} unique IDs: only {capacity} distinct "
            f"{length}-digit values exist"
        )

    gen = rng if generator is None else generator
    ids = set()
    results = []
    while len(results) < n:
        remaining = n - len(results)
        # Oversample by 2x so collisions rarely force another round.
        candidates = gen.integers(low, high, size=remaining * 2)
        for c in candidates:
            if c not in ids:
                ids.add(c)
                results.append(c)
                if len(results) == n:
                    break
    return results

# One unique reference ID per generated row.
reference_ids = generate_unique_ids(n_rows)


# ---------------------------------------------------------
# 8. BUILD DATAFRAME + EXPORT
# ---------------------------------------------------------

# Pair each column name with its data, in output column order.
df = pd.DataFrame(
    dict(
        zip(
            ("reference_id", "SATS", "GCSE", "GCE_AS", "GCE_A"),
            (reference_ids, sats_scores, gcse_results, gce_as_results, gce_a_results),
        )
    )
)

# Write the dataset next to the notebook, without the index column.
df.to_csv("synthetic_exam_data2.csv", index=False)

# Quick sanity check: first few rows and the total row count.
print(df.head())
print("Rows:", df.shape[0])


   reference_id  SATS  GCSE GCE_AS GCE_A
0     835724108   107     5      B     B
1     489964504    97     4      D     B
2     153659217   111     6      A    A*
3     301853339   113     8      A     A
4     968991804    90     3      D     D
Rows: 200000
