In [1]:
from pathlib import Path
import pandas as pd

# ==== Config ====
# Settings for inspecting an existing aggregate subject list: the code below
# reports which source dataset each contiguous run of subject IDs comes from.

# Root directory; each dataset is addressed as ROOT / <dataset name> / ...
ROOT = Path("/neurospin/dico/data/deep_folding/current/datasets")
# Path component naming the region whose subject list is inspected.
ROI = "OCCIPITAL"
# File-name prefix (presumably the hemisphere): "L" -> "Lskeleton_subject.csv".
HEMI = "L"
# Candidate source datasets. An ID found in several of them is attributed to
# the earliest entry of this list.
DATASETS = ["cnp", "candi", "bsnip1", "schizconnect-vip-prague"]  # check order of appearance among these
SUBDIR = "crops/2mm"
MASKDIR = "mask"

# Subject list of the aggregate dataset whose block order is reported.
agg_file = ROOT / "aggregate_schizophrenia" / SUBDIR / ROI / MASKDIR / f"{HEMI}skeleton_subject.csv"

# First-column values treated as header rows and dropped by read_ids;
# values are lower-cased before the membership test.
HEADER_TOKENS = {"subject", "subjects", "id", "ids"}  # case-insensitive

def read_ids(p: Path, header_tokens=None):
    """Read subject IDs from the first column of a CSV-like file.

    Parameters
    ----------
    p : Path
        File to read. A missing file yields an empty list.
    header_tokens : iterable of str, optional
        Values (compared case-insensitively) that mark a header row and are
        dropped. Defaults to the module-level ``HEADER_TOKENS``.

    Returns
    -------
    list of str
        Stripped, non-empty IDs in file order, header-like rows removed.
    """
    if not p.exists():
        return []
    wanted = HEADER_TOKENS if header_tokens is None else header_tokens
    tokens = {str(t).lower() for t in wanted}
    try:
        # keep_default_na=False: otherwise pandas turns empty fields (and
        # literal IDs such as "NA") into NaN, which astype(str) would render
        # as the bogus ID "nan" that the emptiness filter cannot catch.
        s = pd.read_csv(p, header=None, dtype=str, engine="python", keep_default_na=False)
        raw = s.iloc[:, 0].astype(str).tolist()
    except Exception:
        # Best-effort fallback for files pandas cannot parse (e.g. an empty
        # file or ragged rows): take the text before the first comma.
        text = p.read_text(encoding="utf-8", errors="ignore")
        raw = [line.split(",")[0] for line in text.splitlines()]
    # Same clean-up for both paths: strip, drop empties, drop header tokens.
    vals = [x.strip() for x in raw]
    return [v for v in vals if v and v.lower() not in tokens]

# IDs of the aggregate file, in file order
agg_ids = read_ids(agg_file)

# One set of IDs per source dataset, keyed by dataset name
dataset_sets = {
    name: set(read_ids(ROOT / name / SUBDIR / ROI / MASKDIR / f"{HEMI}skeleton_subject.csv"))
    for name in DATASETS
}

# Resolve an aggregate ID to its source dataset: DATASETS is scanned in order
# and the first dataset containing the ID is returned, "UNKNOWN" if none does.
def which_dataset(sid: str):
    """Return the name of the first dataset in DATASETS that contains sid."""
    matches = (name for name in DATASETS if sid in dataset_sets[name])
    return next(matches, "UNKNOWN")

labels = list(map(which_dataset, agg_ids))

# Contiguous runs of identical labels, stored as (label, start, end) with the
# end index exclusive.
blocks = []
if labels:
    # Cut points: position 0, every index whose label differs from the one
    # before it, and the total length as the closing boundary.
    cuts = [0]
    cuts += [k for k in range(1, len(labels)) if labels[k] != labels[k - 1]]
    cuts.append(len(labels))
    blocks = [(labels[lo], lo, hi) for lo, hi in zip(cuts, cuts[1:])]

# Ordinal helper for the "1st, 2nd, ..." listing of blocks
suffix = ["st", "nd", "rd"] + ["th"] * 10
def ordinal(n):
    """Return n followed by its English ordinal suffix, e.g. 1 -> "1st", 12 -> "12th"."""
    last_two = n % 100
    last_one = n % 10
    if 10 <= last_two <= 20:
        # 11th, 12th, 13th (and the rest of 10..20) always take "th"
        ending = "th"
    elif 1 <= last_one <= 3:
        ending = suffix[last_one - 1]
    else:
        ending = "th"
    return f"{n}{ending}"

# Summary header
print(f"Aggregate: {agg_file}")
print(f"Total subjects: {len(agg_ids)}\n")

if blocks:
    # One line per contiguous block: rank, dataset, inclusive index range, size
    print("Block order in aggregate:")
    for k, (ds, a, b) in enumerate(blocks, start=1):
        count = b - a
        print(f"  {ordinal(k)}: {ds:24s}  idx[{a}..{b-1}]  count={count}")
    order = [entry[0] for entry in blocks]
    print("\nDatasets encountered in order (collapsed):")
    # Keep a name only when it differs from the one right before it
    collapsed = [name for pos, name in enumerate(order) if pos == 0 or order[pos - 1] != name]
    print("  " + "  ->  ".join(collapsed))
else:
    print("No subjects found.")


Aggregate: /neurospin/dico/data/deep_folding/current/datasets/aggregate_schizophrenia/crops/2mm/OCCIPITAL/mask/Lskeleton_subject.csv
Total subjects: 2406

Block order in aggregate:
  1st: cnp                       idx[0..263]  count=264
  2nd: bsnip1                    idx[264..1566]  count=1303
  3rd: candi                     idx[1567..1669]  count=103
  4th: schizconnect-vip-prague   idx[1670..2405]  count=736

Datasets encountered in order (collapsed):
  cnp  ->  bsnip1  ->  candi  ->  schizconnect-vip-prague


In [5]:
import numpy as np
from pathlib import Path
import pandas as pd
import shutil

# =========================
# Config
# =========================
# Settings for building the aggregate dataset of one ROI by concatenating the
# per-dataset subject lists and arrays in DATASETS_ORDER.

# Root directory; each dataset is addressed as ROOT / <dataset name> / ...
ROOT = Path("/neurospin/dico/data/deep_folding/current/datasets")
# Used verbatim as a path component, trailing dot included.
ROI = "INSULA."
# File-name prefixes to process (presumably hemispheres), e.g. "L" -> "Lskeleton.npy".
HEMIS = ["L"]
# Concatenation order of the source datasets; it fixes the row order of the aggregate.
DATASETS_ORDER = ["cnp", "bsnip1", "candi", "schizconnect-vip-prague"]  # order as in OCCIPITAL
SUBDIR = "crops/2mm"
MASKDIR = "mask"
# Directory name of the aggregate dataset under ROOT.
AGG_NAME = "aggregate_schizophrenia"

# First-column values treated as header rows and dropped by read_ids;
# values are lower-cased before the membership test.
HEADER_TOKENS = {"subject", "subjects", "id", "ids"}  # drop if seen

# Output directory of the aggregate; created here (with parents) if missing.
agg_dir = ROOT / AGG_NAME / SUBDIR / ROI / MASKDIR
agg_dir.mkdir(parents=True, exist_ok=True)

# =========================
# Helpers
# =========================
def read_ids(p: Path, header_tokens=None):
    """Read subject IDs from the first column of a CSV-like file.

    Parameters
    ----------
    p : Path
        File to read. A missing file yields an empty list.
    header_tokens : iterable of str, optional
        Values (compared case-insensitively) that mark a header row and are
        dropped. Defaults to the module-level ``HEADER_TOKENS``.

    Returns
    -------
    list of str
        Stripped, non-empty IDs in file order, header-like rows removed.
    """
    if not p.exists():
        return []
    wanted = HEADER_TOKENS if header_tokens is None else header_tokens
    tokens = {str(t).lower() for t in wanted}
    try:
        # keep_default_na=False: otherwise pandas turns empty fields (and
        # literal IDs such as "NA") into NaN, which astype(str) would render
        # as the bogus ID "nan" that the emptiness filter cannot catch.
        s = pd.read_csv(p, header=None, dtype=str, engine="python", keep_default_na=False)
        raw = s.iloc[:, 0].astype(str).tolist()
    except Exception:
        # Best-effort fallback for files pandas cannot parse (e.g. an empty
        # file or ragged rows): take the text before the first comma.
        text = p.read_text(encoding="utf-8", errors="ignore")
        raw = [line.split(",")[0] for line in text.splitlines()]
    # Same clean-up for both paths: strip, drop empties, drop header tokens.
    vals = [x.strip() for x in raw]
    return [v for v in vals if v and v.lower() not in tokens]

def concat_subjects(hemi):
    """Return the subject IDs of every dataset in DATASETS_ORDER, concatenated in that order."""
    per_dataset = (
        read_ids(ROOT / name / SUBDIR / ROI / MASKDIR / f"{hemi}skeleton_subject.csv")
        for name in DATASETS_ORDER
    )
    # Flatten while keeping dataset order, then file order inside each dataset
    return [sid for ids in per_dataset for sid in ids]

# =========================
# 1. Build subject lists
# =========================
# Gather every list first, then write one ID per line (no header row).
all_subjects = {}
for h in HEMIS:
    all_subjects[h] = concat_subjects(h)

for h in HEMIS:
    hemi_ids = all_subjects[h]
    subj_csv = agg_dir / f"{h}skeleton_subject.csv"
    subj_csv.write_text("".join(f"{sid}\n" for sid in hemi_ids), encoding="utf-8")
    print(f"Wrote {subj_csv} with {len(all_subjects[h])} subjects")

# =========================
# 2. Build skeleton arrays (.npy)
# =========================
# For each hemisphere prefix, stack the per-dataset arrays along axis 0 in
# DATASETS_ORDER, save the result, then check that the subject CSV in agg_dir
# describes exactly the stacked rows (same IDs, same order).
for h in HEMIS:
    stacked_parts = []
    subj_total = []   # IDs belonging to the stacked rows, in row order
    misaligned = []   # datasets whose row count differs from their ID count
    for ds in DATASETS_ORDER:
        src_dir = ROOT / ds / SUBDIR / ROI / MASKDIR
        ids = read_ids(src_dir / f"{h}skeleton_subject.csv")
        npy_path = src_dir / f"{h}skeleton.npy"
        if not npy_path.exists() or not ids:
            print(f"⚠️ Skipping {ds} {h}: no npy or no subjects")
            continue
        arr = np.load(npy_path)  # shape should be (N_subjects, ...)
        if arr.shape[0] != len(ids):
            print(f"⚠️ WARNING: mismatch in {ds} {h}: {arr.shape[0]} rows vs {len(ids)} IDs")
            # Remember it: two opposite mismatches could cancel out in the
            # total-count comparison below.
            misaligned.append(ds)
        stacked_parts.append(arr)
        subj_total.extend(ids)
    if stacked_parts:
        stacked = np.concatenate(stacked_parts, axis=0)
        out_path = agg_dir / f"{h}skeleton.npy"
        np.save(out_path, stacked)
        print(f"Wrote {out_path} with shape {stacked.shape}")
        # sanity check: counts first, then per-dataset alignment, then the
        # actual ID sequence (a dataset skipped above may still be in the CSV)
        agg_ids = read_ids(agg_dir / f"{h}skeleton_subject.csv")
        if len(agg_ids) != stacked.shape[0]:
            print(f" ERROR: mismatch for {h}: {len(agg_ids)} IDs vs {stacked.shape[0]} rows")
        elif misaligned:
            print(f" ERROR: rows and IDs are misaligned for {h} in: {', '.join(misaligned)}")
        elif agg_ids != subj_total:
            print(f" ERROR: subject CSV for {h} does not list the stacked rows' IDs in the same order")
        else:
            print(f" Sanity check passed for {h}: {len(agg_ids)} subjects")
    else:
        print(f" No arrays found for {h}, nothing written")

# =========================
# 3. Copy mask .minf
# =========================
# The .minf file is taken from the first dataset of DATASETS_ORDER only.
for h in HEMIS:
    minf_name = f"{h}mask_cropped.nii.gz.minf"
    src_minf = ROOT / DATASETS_ORDER[0] / SUBDIR / ROI / MASKDIR / minf_name
    dst_minf = agg_dir / minf_name
    if not src_minf.exists():
        print(f"⚠️ Missing mask .minf for {h} in {DATASETS_ORDER[0]}")
        continue
    shutil.copy(src_minf, dst_minf)
    print(f"Copied {src_minf} -> {dst_minf}")


Wrote /neurospin/dico/data/deep_folding/current/datasets/aggregate_schizophrenia/crops/2mm/INSULA./mask/Lskeleton_subject.csv with 2406 subjects
Wrote /neurospin/dico/data/deep_folding/current/datasets/aggregate_schizophrenia/crops/2mm/INSULA./mask/Lskeleton.npy with shape (2406, 27, 52, 39, 1)
 Sanity check passed for L: 2406 subjects
Copied /neurospin/dico/data/deep_folding/current/datasets/cnp/crops/2mm/INSULA./mask/Lmask_cropped.nii.gz.minf -> /neurospin/dico/data/deep_folding/current/datasets/aggregate_schizophrenia/crops/2mm/INSULA./mask/Lmask_cropped.nii.gz.minf


In [None]:
from pathlib import Path
import shutil
import os
import pandas as pd

# ========= config =========
# Settings for completing the aggregate ROI directory: common mask files,
# the merged crops directory and the subject CSV header.

# Root directory; each dataset is addressed as ROOT / <dataset name> / ...
ROOT = Path("/neurospin/dico/data/deep_folding/current/datasets")
# Directory name of the aggregate dataset under ROOT.
AGG_NAME = "aggregate_schizophrenia"
SUBDIR = "crops/2mm"
# Used verbatim as a path component, trailing dot included.
ROI = "INSULA."                   # <- you used INSULA. (with the dot)
MASKDIR = "mask"
# Source datasets, searched in this order; the first one providing a file wins.
DATASETS_ORDER = ["cnp", "bsnip1", "candi", "schizconnect-vip-prague"]
# File-name prefixes to process (presumably hemispheres).
HEMIS = ["L"]                # will skip missing ones automatically
# Read by copy_or_link and by the crops merge loop below.
USE_SYMLINKS = True               # set False to copy files instead of symlink

# Destination directory of the aggregate; created here (with parents) if missing.
agg_mask_dir = ROOT / AGG_NAME / SUBDIR / ROI / MASKDIR
agg_mask_dir.mkdir(parents=True, exist_ok=True)

def first_existing(path_list):
    """Return the first path of path_list that exists, or None when none does."""
    return next((candidate for candidate in path_list if candidate.exists()), None)

def copy_or_link(src: Path, dst: Path, use_symlinks=None):
    """Create dst from src as a symlink or as a copy, unless dst already exists.

    Parameters
    ----------
    src : Path
        Source file.
    dst : Path
        Destination path; missing parent directories are created.
    use_symlinks : bool, optional
        True to symlink, False to copy with ``shutil.copy2``. Defaults to the
        module-level ``USE_SYMLINKS``.

    An existing destination is left untouched. A dangling symlink at dst is
    replaced.
    """
    if use_symlinks is None:
        use_symlinks = USE_SYMLINKS
    dst.parent.mkdir(parents=True, exist_ok=True)
    if dst.exists():
        return
    if dst.is_symlink():
        # exists() follows symlinks, so a dangling link lands here. Left in
        # place it would make os.symlink fail (silently, below) and would let
        # copy2 write through the link to wherever it points.
        dst.unlink()
    if use_symlinks:
        try:
            os.symlink(src, dst)
        except FileExistsError:
            # dst appeared between the check and the call; keep what is there
            pass
    else:
        shutil.copy2(src, dst)

# 1) Copy common mask files per hemisphere (take the first dataset that has them)
for h in HEMIS:
    candidates = [ROOT / ds / SUBDIR / ROI / MASKDIR for ds in DATASETS_ORDER]

    # Same lookup / copy / report sequence for each of the four artifacts
    for base in (
        "mask_cropped.nii.gz",
        "mask_cropped.nii.gz.minf",
        "mask_skeleton.nii.gz",
        "mask_skeleton.nii.gz.minf",
    ):
        fname = f"{h}{base}"
        found = first_existing([c / fname for c in candidates])
        if found is None:
            print(f"[{h}] no {base} found in any dataset")
            continue
        copy_or_link(found, agg_mask_dir / fname)
        print(f"[{h}] copied {base} from {found}")

# 2) Merge Lcrops/Rcrops directories
# Entries are merged by name; when several datasets provide the same name, the
# first dataset in DATASETS_ORDER wins and later ones are skipped.
for h in HEMIS:
    agg_crops_dir = agg_mask_dir / f"{h}crops"
    added = 0
    found_source = False
    for ds in DATASETS_ORDER:
        src_crops = ROOT / ds / SUBDIR / ROI / MASKDIR / f"{h}crops"
        if not src_crops.is_dir():
            continue
        found_source = True
        for item in sorted(src_crops.iterdir()):
            dst = agg_crops_dir / item.name
            if dst.exists():
                continue
            if dst.is_symlink():
                # exists() follows symlinks: this is a dangling link, which
                # would make os.symlink raise FileExistsError. Replace it.
                dst.unlink()
            if item.is_dir():
                if USE_SYMLINKS:
                    # os.symlink does not create missing parent directories
                    dst.parent.mkdir(parents=True, exist_ok=True)
                    os.symlink(item, dst)
                else:
                    shutil.copytree(item, dst)
            else:
                copy_or_link(item, dst)
            added += 1
    if added > 0:
        print(f"[{h}] merged {added} entries into {agg_crops_dir}")
    elif found_source:
        # Sources exist but nothing was new (e.g. a re-run)
        print(f"[{h}] {h}crops entries already present in {agg_crops_dir}; nothing added")
    else:
        print(f"[{h}] no {h}crops directory found in any dataset; skipped")

# 3) Reformat skeleton_subject.csv files with "Subject" header
for h in HEMIS:
    subj_csv = agg_mask_dir / f"{h}skeleton_subject.csv"
    if subj_csv.exists():
        # Read as UTF-8 explicitly rather than relying on the locale default.
        stripped = (line.strip() for line in subj_csv.read_text(encoding="utf-8").splitlines())
        # Compare the stripped value so a header line with surrounding
        # whitespace is not kept as a subject ID.
        ids = [sid for sid in stripped if sid and sid.lower() != "subject"]
        pd.DataFrame(ids, columns=["Subject"]).to_csv(subj_csv, index=False)
        print(f"[{h}] rewrote {subj_csv} with header 'Subject' and {len(ids)} rows")

print("\nDone. If something is still missing, it means no dataset provided that artifact for the given ROI/hemi.")


[L] copied mask_cropped.nii.gz from /neurospin/dico/data/deep_folding/current/datasets/cnp/crops/2mm/INSULA./mask/Lmask_cropped.nii.gz
[L] copied mask_cropped.nii.gz.minf from /neurospin/dico/data/deep_folding/current/datasets/cnp/crops/2mm/INSULA./mask/Lmask_cropped.nii.gz.minf
[L] copied mask_skeleton.nii.gz from /neurospin/dico/data/deep_folding/current/datasets/cnp/crops/2mm/INSULA./mask/Lmask_skeleton.nii.gz
[L] copied mask_skeleton.nii.gz.minf from /neurospin/dico/data/deep_folding/current/datasets/cnp/crops/2mm/INSULA./mask/Lmask_skeleton.nii.gz.minf
[L] wrote /neurospin/dico/data/deep_folding/current/datasets/aggregate_schizophrenia/crops/2mm/INSULA./mask/Lskeleton_subject.csv with 2406 subjects
[L] no Lcrops directory found in any dataset; skipped

Done. If something is still missing, it means no dataset provided that artifact for the given ROI/hemi.
