In [1]:
import pandas as pd
import yaml

# Schema dtypes are assigned by column NAME, not inferred from the data: the
# workbook is read with dtype="object", so the frame itself carries no usable
# type information.
NUMERIC_COLUMNS = frozenset({
    "movement_intensity_raw",
    "movement_intensity_z",
    "age_years",
    "elapsed_time_sec_total",
    "sus_total",
    "nasa_tlx_weighted",
    "nasa_tlx_unweighted",
})
INTEGER_COLUMNS = frozenset({
    "movement_intensity_bin",
    "split_seed",
    "cv_fold_iid",
    "cv_fold_lodo",
})

# Non-numeric columns with at most this many distinct values get their values
# listed explicitly; larger ones are summarised as "<n> unique values".
MAX_DOMAIN_VALUES = 20

# description — minimal stub, can be expanded
COLUMN_DESCRIPTIONS = {
    "dataset": "Source dataset (MMASD or Engagnition)",
    "sample_id": "Unique identifier for sample",
    "participant_id": "Participant code (Pxx)",
    "condition": "Experimental condition (Baseline, LPE, HPE, NA)",
    "unit_level": "Granularity (session or block)",
    "modality": "Signal modality (ACC, GSR, TMP, ENG, GAZE, PERF)",
    "movement_intensity_raw": "Median raw movement intensity (numeric)",
    "movement_intensity_z": "Robust z-score of movement intensity",
    "movement_intensity_bin": "Binary proxy label: 1 if z>=0 else 0",
    "engagement_level": "Categorical engagement level (if available)",
    "sex": "Participant sex (M/F/unknown)",
    "age_years": "Participant age in years",
    "age_group": "Age group (child, teen, adult, unknown)",
    "intervention_type": "Type of intervention (from Engagnition XLSX)",
    "intervention_timestamps_raw": "Raw timestamps string",
    "elapsed_time_sec_total": "Session total elapsed time (sec)",
    "sus_total": "System Usability Scale score",
    "nasa_tlx_weighted": "NASA-TLX weighted score",
    "nasa_tlx_unweighted": "NASA-TLX unweighted score",
    "split_seed": "Random seed used for split",
    "split_iid": "Split assignment (train/val/test, IID strategy)",
    "split_lodo": "Split assignment (train/val/test, LODO strategy)",
    "cv_fold_iid": "Fold index (IID stratified group split)",
    "cv_fold_lodo": "Fold index (LODO group split)",
    "block_field": "Block column name (Engagnition only)",
    "block_id": "Block identifier",
}


def classify_column(col) -> str:
    """Return the schema dtype ("numeric", "integer" or "string") for *col*.

    The decision is a pure name lookup; any column that is not listed in
    NUMERIC_COLUMNS or INTEGER_COLUMNS is reported as "string".
    """
    if col in NUMERIC_COLUMNS:
        return "numeric"
    if col in INTEGER_COLUMNS:
        return "integer"
    return "string"


def _domain_labels(values, numeric_order=False) -> list:
    """Return the distinct string forms of *values* in a deterministic order.

    With ``numeric_order=True`` the labels are ordered by numeric value so
    that "2" precedes "10". If any label is not a finite-orderable number the
    plain lexicographic order is used instead.
    """
    labels = sorted({str(value) for value in values})
    if not numeric_order:
        return labels
    try:
        keys = [float(label) for label in labels]
    except ValueError:
        # Mixed or non-numeric content: keep the lexicographic order.
        return labels
    if any(key != key for key in keys):
        # A "nan" label has no defined position in a numeric ordering.
        return labels
    return [label for _, label in sorted(zip(keys, labels))]


def build_schema(df) -> dict:
    """Build the schema dictionary for every column of *df*.

    Each column maps to a dict holding:
      * "dtype": result of ``classify_column``;
      * "domain" (non-numeric columns only): the list of distinct non-null
        values as strings when there are at most MAX_DOMAIN_VALUES of them,
        otherwise the summary string "<n> unique values";
      * "description": present only for columns in COLUMN_DESCRIPTIONS.

    Column order follows ``df.columns``.
    """
    schema = {}
    for col in df.columns:
        dtype = classify_column(col)
        col_info = {"dtype": dtype}

        if dtype != "numeric":
            # domain: unique values (for categoricals, limited to 20)
            uniq = df[col].dropna().unique().tolist()
            if len(uniq) <= MAX_DOMAIN_VALUES:
                col_info["domain"] = _domain_labels(
                    uniq, numeric_order=(dtype == "integer")
                )
            else:
                col_info["domain"] = f"{len(uniq)} unique values"

        if col in COLUMN_DESCRIPTIONS:
            col_info["description"] = COLUMN_DESCRIPTIONS[col]

        schema[col] = col_info
    return schema


# The guard is true when this cell runs in a notebook kernel or as a script,
# so `path`, `df` and `schema` are still created as globals; it only keeps the
# file I/O from running if the code is imported.
if __name__ == "__main__":
    # 1. Load the table
    path = "metadata_ml_ready_splits.xlsx"
    df = pd.read_excel(path, sheet_name=0, dtype="object")

    print("Loaded:", df.shape)

    # 2. Build schema dictionary
    schema = build_schema(df)

    # 3. Save schema.yaml
    with open("schema.yaml", "w", encoding="utf-8") as f:
        yaml.dump(schema, f, allow_unicode=True, sort_keys=False)

    print("Saved schema.yaml")


Loaded: (1431, 20)
Saved schema.yaml
