In [1]:
import numpy as np
import pandas as pd
import os

# Create LOSO split

In [1]:
import os
import random
import pandas as pd

def generate_leave_n_subject_splits(
    subject_ids,
    N_test=1,
    M_val=0,
    is_test_val=False,
    out_dir="LOSO",
    shuffle=True,
    seed=42,
):
    """
    Generate one CSV split file per fold for leave-N-subject-out evaluation.

    The (optionally shuffled, otherwise sorted) subject list is cut into
    contiguous chunks of ``N_test`` subjects; chunk ``k`` is the test set of
    fold ``k`` and is written to ``<out_dir>/fold_{k:02d}.csv``.

    Each CSV:
      - has columns: subject_id, split
      - lists every subject; with ``is_test_val=False`` each subject appears
        exactly once in {train, val, test}
      - with ``is_test_val=True`` and ``M_val > 0`` the first ``M_val`` test
        subjects are listed twice: once as "val" and once as "test"

    A summary of every fold is printed to stdout. Nothing is returned.

    Parameters
    ----------
    subject_ids : list of str
        e.g. ["S2", "S3", "S4", ...]. Must not contain duplicates.
    N_test : int
        Number of subjects in the test split (leave-N-subject-out).
    M_val : int
        Number of subjects in the validation split (0 means no val).
    is_test_val : bool
        If False, validation subjects are the first ``M_val`` non-test
        subjects and are removed from train. If True, validation subjects are
        the first ``M_val`` *test* subjects and train keeps every non-test
        subject.
    out_dir : str
        Directory to save CSV files (created if missing).
    shuffle : bool
        Whether to shuffle the subject_ids before building splits. When False
        the ids are sorted with plain string ordering, so "S10" comes before
        "S2".
    seed : int
        Random seed for reproducibility when shuffle=True.

    Raises
    ------
    ValueError
        If N_test or M_val are out of range, if subject_ids contains
        duplicates, or if is_test_val=True and M_val > N_test.
    """
    subject_ids = list(subject_ids)
    num_subjects = len(subject_ids)

    # ---- sanity checks (done before touching the filesystem) ----
    if N_test <= 0 or N_test > num_subjects:
        raise ValueError("N_test must be in [1, len(subject_ids)].")
    if M_val < 0 or (N_test + M_val) > num_subjects:
        raise ValueError("M_val must be >= 0 and N_test + M_val <= len(subject_ids).")
    if len(set(subject_ids)) != num_subjects:
        # Duplicates would make a subject vanish from train or repeat as test.
        raise ValueError("subject_ids must not contain duplicates.")
    if is_test_val and M_val > N_test:
        # Validation is drawn from the test chunk, which only holds N_test ids.
        raise ValueError("With is_test_val=True, M_val must be <= N_test.")

    os.makedirs(out_dir, exist_ok=True)

    # Optional shuffle for randomness. A private Random instance yields the
    # same order as random.seed(seed) + random.shuffle(...) but leaves the
    # process-wide RNG state untouched.
    if shuffle:
        random.Random(seed).shuffle(subject_ids)
    else:
        subject_ids.sort()

    print(f"Subjects used (order): {subject_ids}")

    # number of folds: partition subjects into contiguous chunks of size N_test for test
    num_fold = num_subjects // N_test
    if num_subjects % N_test != 0:
        print(
            f"Warning: len(subject_ids)={num_subjects} is not divisible by N_test={N_test}. "
            f"The last few subjects will never be in test."
        )

    for k in range(num_fold):
        # ---- choose test subjects for this fold ----
        start = k * N_test
        test_subjects = subject_ids[start:start + N_test]

        # remaining subjects are candidates for train/val (order preserved)
        held_out = set(test_subjects)
        remaining = [s for s in subject_ids if s not in held_out]

        if M_val > 0 and not is_test_val:
            # validation carved out of the non-test subjects
            val_subjects = remaining[:M_val]
            train_subjects = remaining[M_val:]
        elif M_val > 0:
            # validation reuses test subjects; train keeps all non-test subjects
            val_subjects = test_subjects[:M_val]
            train_subjects = remaining
        else:
            val_subjects = []
            train_subjects = remaining

        # ---- build rows: train first, then val, then test ----
        rows = [{"subject_id": s, "split": "train"} for s in train_subjects]
        rows += [{"subject_id": s, "split": "val"} for s in val_subjects]
        rows += [{"subject_id": s, "split": "test"} for s in test_subjects]

        df = pd.DataFrame(rows, columns=["subject_id", "split"])

        # e.g. <out_dir>/fold_00.csv, <out_dir>/fold_01.csv, ...
        out_path = os.path.join(out_dir, f"fold_{k:02d}.csv")
        df.to_csv(out_path, index=False)

        print(
            f"[fold_{k:02d}.csv] "
            f"train={len(train_subjects)}, val={len(val_subjects)}, test={len(test_subjects)}"
        )
        print(f"  train: {train_subjects}")
        if val_subjects:
            print(f"  val:   {val_subjects}")
        print(f"  test:  {test_subjects}")


In [2]:
# Subject labels S2 .. S17; the list has no S12.
subject_ids = [f"S{number}" for number in range(2, 18) if number != 12]

# One fold per subject; the single held-out subject is written as both
# the validation and the test entry of its fold.
generate_leave_n_subject_splits(
    subject_ids=subject_ids,
    N_test=1,
    M_val=1,
    is_test_val=True,
    out_dir="LOSO_testval",
    shuffle=False,  # switch to True for a randomised subject order
)

Subjects used (order): ['S10', 'S11', 'S13', 'S14', 'S15', 'S16', 'S17', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9']
[fold_00.csv] train=14, val=1, test=1
  train: ['S11', 'S13', 'S14', 'S15', 'S16', 'S17', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9']
  val:   ['S10']
  test:  ['S10']
[fold_01.csv] train=14, val=1, test=1
  train: ['S10', 'S13', 'S14', 'S15', 'S16', 'S17', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9']
  val:   ['S11']
  test:  ['S11']
[fold_02.csv] train=14, val=1, test=1
  train: ['S10', 'S11', 'S14', 'S15', 'S16', 'S17', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9']
  val:   ['S13']
  test:  ['S13']
[fold_03.csv] train=14, val=1, test=1
  train: ['S10', 'S11', 'S13', 'S15', 'S16', 'S17', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9']
  val:   ['S14']
  test:  ['S14']
[fold_04.csv] train=14, val=1, test=1
  train: ['S10', 'S11', 'S13', 'S14', 'S16', 'S17', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9']
  val:   ['S15']
  test:  ['S15']
[fold_05.csv] train