In [4]:
# build_metadata_master.py
# -*- coding: utf-8 -*-
"""
Парсер для MMASD: собираем metadata_master.csv
- Берём sample_id, participant_id, activity_class (по имени папки/файла)
- Мерджим с ADOS_rating.xlsx (sex, age_years)
- Никаких вычислений, только сбор известного
"""

import os
import re
import argparse
import pandas as pd

# ----- activity prefix dictionary -----
# Maps the lowercase prefix at the start of a clip name (e.g. "as", "mfs")
# to the human-readable activity class name written to the metadata table.
# Prefixes that are not listed here are reported as "Unknown" by scan_openpose.
ACTIVITY_MAP = {
    "as": "Arm Swing",
    "bs": "Body Swing",
    "ce": "Chest Expansion",
    "sq": "Squat",
    "dr": "Drumming",
    "mfs": "Maracas Forward Shaking",
    "ms": "Maracas Shaking",
    "sac": "Sing and Clap",
    "fg": "Frog Pose",
    "tr": "Tree Pose",
    "tw": "Twist Pose",
}

# Clip names look like "<letters>_<digits>_<rest>"; matching is case-insensitive
# and the underscore after the digits is required.
SAMPLE_ID_RE = re.compile(r"^([a-z]+)_([0-9]+)_", re.IGNORECASE)

def parse_sample(sample_id: str):
    """Split a clip name into its activity prefix and participant id.

    Args:
        sample_id: Clip (directory) name, e.g. ``"as_20583_D_1"``.

    Returns:
        A ``(activity_prefix, participant_id)`` tuple of strings. The prefix is
        lower-cased; the participant id is the digit run exactly as written.
        When the name does not follow the pattern, ``("", "")`` is returned.
    """
    found = SAMPLE_ID_RE.match(sample_id)
    if found is None:
        return "", ""
    activity_prefix, participant_id = found.groups()
    return activity_prefix.lower(), participant_id

def scan_openpose(root: str):
    """
    Walk ``<root>/2D skeleton/output/`` and list every clip directory.

    The expected layout is ``<activity folder>/<sample_id>/``. Entries that are
    not directories are skipped at both levels.

    Args:
        root: Path to the MMASD root folder.

    Returns:
        pandas.DataFrame with one row per clip directory and the columns
        ``sample_id``, ``participant_id``, ``activity_prefix``,
        ``activity_class`` and ``rel_path_openpose`` (path relative to
        ``root``). The columns are always present, even when no clip is found,
        so the result can be merged on ``participant_id`` unconditionally.

    Raises:
        OSError: propagated from ``os.listdir`` when the output folder is
            missing or unreadable.
    """
    columns = [
        "sample_id",
        "participant_id",
        "activity_prefix",
        "activity_class",
        "rel_path_openpose",
    ]
    base_dir = os.path.join(root, "2D skeleton", "output")
    rows = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the written CSV) reproducible between runs.
    for activity_folder in sorted(os.listdir(base_dir)):
        activity_dir = os.path.join(base_dir, activity_folder)
        if not os.path.isdir(activity_dir):
            continue

        for sample_id in sorted(os.listdir(activity_dir)):
            sample_path = os.path.join(activity_dir, sample_id)
            if not os.path.isdir(sample_path):
                continue

            prefix, participant_id = parse_sample(sample_id)
            # Unrecognised (or unparsable) prefixes are kept and labelled "Unknown".
            activity_class = ACTIVITY_MAP.get(prefix, "Unknown")

            rows.append({
                "sample_id": sample_id,
                "participant_id": participant_id,
                "activity_prefix": prefix,
                "activity_class": activity_class,
                "rel_path_openpose": os.path.relpath(sample_path, root),
            })

    # Explicit columns: an empty scan must still yield a frame with the schema.
    return pd.DataFrame(rows, columns=columns)

def read_ados(ados_path: str):
    """Read ADOS_rating.xlsx and extract participant id, sex and age.

    Column headers are matched case-insensitively with spaces and underscores
    removed. For each field, a column whose normalized name is exactly one of
    the expected names wins; otherwise the first column whose name merely
    contains the keyword is used (the original, looser heuristic).

    Args:
        ados_path: Path to the ADOS rating spreadsheet.

    Returns:
        pandas.DataFrame with ``participant_id`` (first run of digits found in
        the id column, as a string) plus ``sex`` and/or ``age_years`` when such
        columns exist. Rows without a numeric id are dropped and only the first
        row per participant is kept.

    Raises:
        KeyError: if no participant-id column can be identified.
    """
    df = pd.read_excel(ados_path)

    # (normalized name, original label) pairs in sheet order. str() guards
    # against non-string headers (numbers, dates) that have no .lower().
    normalized = [
        (str(col).lower().replace(" ", "").replace("_", ""), col)
        for col in df.columns
    ]

    def _find_column(exact_names, fragments):
        """Return the best matching original column label, or None."""
        for name, col in normalized:
            if name in exact_names:
                return col
        for name, col in normalized:
            if any(fragment in name for fragment in fragments):
                return col
        return None

    pid_col = _find_column({"id", "pid", "participantid", "subjectid"}, ("id",))
    sex_col = _find_column({"sex", "gender"}, ("sex", "gender"))
    age_col = _find_column({"age", "ageyears"}, ("age",))

    if pid_col is None:
        # Fail with a clear message instead of an opaque KeyError from dropna().
        raise KeyError(
            f"No participant id column found in {ados_path!r}; "
            f"columns: {list(df.columns)}"
        )

    out = pd.DataFrame()
    # Keep only the first run of digits so that e.g. "P20583" and 20583 agree.
    out["participant_id"] = df[pid_col].astype(str).str.extract(r"(\d+)")[0]
    if sex_col is not None:
        out["sex"] = df[sex_col]
    if age_col is not None:
        out["age_years"] = df[age_col]

    return out.dropna(subset=["participant_id"]).drop_duplicates("participant_id")

def main(argv=None):
    """Build metadata_master.csv for MMASD from the command line.

    Scans the OpenPose output tree, tags every row with the dataset name,
    left-joins the ADOS spreadsheet on ``participant_id`` and writes the
    result as UTF-8 CSV.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before. Passing a list
            lets the function be called from a notebook, where ``sys.argv``
            belongs to the kernel, e.g.
            ``main(["--root", "...", "--ados", "...", "--out", "..."])``.

    Raises:
        SystemExit: raised by argparse when required arguments are missing.
    """
    ap = argparse.ArgumentParser(description="Сбор metadata_master.csv (MMASD).")
    ap.add_argument("--root", required=True, help="Путь к корневой папке MMASD")
    ap.add_argument("--ados", required=True, help="Путь к ADOS_rating.xlsx")
    ap.add_argument("--out", required=True, help="Куда сохранить metadata_master.csv")
    args = ap.parse_args(argv)

    # 1) scan the OpenPose output tree
    df_openpose = scan_openpose(args.root)

    # 2) add the dataset column
    df_openpose["dataset"] = "MMASD"

    # 3) left-join ADOS data so clips without an ADOS record are kept
    ados = read_ados(args.ados)
    df = df_openpose.merge(ados, on="participant_id", how="left")

    # 4) save; a bare filename has an empty dirname, and os.makedirs("")
    #    would raise FileNotFoundError, so only create a real directory
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(args.out, index=False, encoding="utf-8")

    print(f"[OK] Сохранено {len(df)} строк в {args.out}")

# Script entry point.
# NOTE: inside a Jupyter kernel __name__ is also "__main__", so running this
# cell calls main(), which parses the kernel's own sys.argv. The required
# --root/--ados/--out options are then missing and argparse exits with
# SystemExit: 2 (this is the error recorded in this cell's output).
if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] --root ROOT --ados ADOS --out OUT
ipykernel_launcher.py: error: the following arguments are required: --root, --ados, --out


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [5]:
import pandas as pd

# Sanity check: number of distinct clip ids in the collected table
# (a missing value, if any, is counted as one distinct entry).
df = pd.read_csv("data/metadata_master.csv")
print(df["sample_id"].nunique(dropna=False))


1693


In [6]:
import pandas as pd

# Load the collected metadata table.
df = pd.read_csv("data/metadata_master.csv")

# Rows per activity class, ordered alphabetically by class name.
counts = df["activity_class"].value_counts().sort_index()

print("Распределение по activity_class:", counts, sep="\n")

# Overall size of the table and number of distinct clips / participants.
print("\nОбщее количество строк:", df.shape[0])
print("Уникальных sample_id:", df["sample_id"].nunique())
print("Уникальных participant_id:", df["participant_id"].nunique())


Распределение по activity_class:
activity_class
Arm Swing                  105
Body Swing                 119
Chest Expansion            114
Drumming                   545
Frog Pose                  113
Maracas Forward Shaking    103
Maracas Shaking            130
Sing and Clap              113
Squat                      102
Tree Pose                  129
Twist Pose                 120
Name: count, dtype: int64

Общее количество строк: 1693
Уникальных sample_id: 1693
Уникальных participant_id: 32
