In [51]:
# Re-run the full pipeline now that we've inspected the column names.
import os
import re
from pathlib import Path
import pandas as pd
import numpy as np
from scipy import stats

from typing import Dict, List, Tuple



In [52]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose string column labels have tidy whitespace.

    For every label that is a ``str``: surrounding whitespace is stripped,
    internal whitespace runs are collapsed to one space, and a space directly
    before or after an underscore is removed. Non-string labels are kept
    unchanged. The input frame is not modified.
    """
    tidied = []
    for label in df.columns:
        if isinstance(label, str):
            label = re.sub(r"\s+", " ", label.strip())
            label = label.replace(" _", "_").replace("_ ", "_")
        tidied.append(label)

    result = df.copy()
    result.columns = tidied
    return result

In [53]:

def normalize_name(name: str) -> str:
    """Canonicalise a column / feature name for matching.

    Steps: strip surrounding whitespace; turn whitespace runs, hyphens and
    dots into underscores; collapse repeated underscores; lowercase; finally
    drop a trailing session suffix (``_ses1``, ``_ses2``, ``_ses_1``,
    ``_ses_2``).

    Example: ``"Left-Accumbens-area_ses1"`` -> ``"left_accumbens_area"``.
    """
    s = name.strip()
    s = re.sub(r"\s+", "_", s)      # spaces -> _
    s = s.replace("-", "_")         # hyphens -> _
    s = s.replace(".", "_")         # dots -> _
    s = re.sub(r"_+", "_", s)       # collapse multiple _
    s = s.lower()
    # Drop session suffixes. By this point any whitespace between "ses" and
    # the digit has already become "_", so the optional separator must be an
    # underscore (a "\s*" here could never match anything).
    s = re.sub(r"_ses_?[12]$", "", s)
    return s


In [54]:
# ------------- Helpers -------------
def cohens_d(x: np.ndarray, y: np.ndarray) -> float:
    """Cohen's d for two independent samples, using the pooled standard deviation.

    Returns NaN when either sample has fewer than two observations, and 0.0
    when the pooled standard deviation is exactly zero. Inputs must be numpy
    arrays (``.var`` / ``.mean`` are called on them).
    """
    n_x = len(x)
    n_y = len(y)
    if min(n_x, n_y) < 2:
        return np.nan

    # Pooled variance: each sample variance weighted by its degrees of freedom.
    weighted_var_sum = (n_x - 1) * x.var(ddof=1) + (n_y - 1) * y.var(ddof=1)
    pooled_sd = np.sqrt(weighted_var_sum / (n_x + n_y - 2))
    if pooled_sd == 0:
        return 0.0

    mean_gap = x.mean() - y.mean()
    return mean_gap / pooled_sd



In [55]:
def welch_ttest(x: np.ndarray, y: np.ndarray) -> Tuple[float, float]:
    """Welch's unequal-variance t-test; returns ``(t_statistic, p_value)``.

    If either input has fewer than two elements, ``(nan, nan)`` is returned
    without calling scipy. NaNs inside the inputs are omitted by scipy.
    """
    if min(len(x), len(y)) < 2:
        return np.nan, np.nan
    outcome = stats.ttest_ind(x, y, equal_var=False, nan_policy="omit")
    return outcome[0], outcome[1]

In [56]:


def fdr_bh(pvals: np.ndarray, alpha: float = 0.05) -> Tuple[np.ndarray, np.ndarray, float]:
    """
    Benjamini-Hochberg FDR.

    NaN p-values are ignored: they are never rejected and their adjusted
    value stays NaN. Outputs are aligned with the input order.

    Returns: rejected(bool array), pvals_corrected, critical_value
    (critical_value is 0.0 when nothing is rejected).
    """
    p = np.array(pvals, dtype=float)
    is_valid = ~np.isnan(p)
    n = np.sum(is_valid)

    # Ascending order; NaNs are given a sentinel of 1.1 so they sort last.
    order = np.argsort(np.where(is_valid, p, 1.1))
    sorted_p = p[order]
    sorted_valid = sorted_p[~np.isnan(sorted_p)]

    ranks = np.arange(1, n + 1)
    crit = alpha * (ranks / n)

    rejected = np.zeros_like(p, dtype=bool)
    p_adj = np.full_like(p, np.nan, dtype=float)

    # Step-up rule: the largest rank whose p-value is under its threshold
    # decides how many of the smallest p-values are rejected.
    passing = np.flatnonzero(sorted_valid <= crit)
    max_i = passing[-1] + 1 if passing.size else 0
    if max_i > 0:
        rejected[order[:max_i]] = True

    # Adjusted p-values: p_adj_i = min_{j>=i} (n/j * p_j), capped at 1.
    scaled = (n / ranks) * sorted_valid
    running_min = np.minimum.accumulate(scaled[::-1])[::-1]
    adj_vals = np.minimum(running_min, 1.0)

    # The first len(adj_vals) entries of `order` are the non-NaN positions.
    p_adj[order[:len(adj_vals)]] = adj_vals

    critical_value = crit[max_i - 1] if max_i > 0 else 0.0
    return rejected, p_adj, critical_value

In [57]:

# Inputs and output of the prune-and-merge step.
BIG_FILE = Path("ttest_yeo/fmri_T1_clinical_merged_updated.csv")
CLUSTERS_FILE = Path("ttest_yeo/session1_clusters.csv")
OUTPUT_FILE = Path("ttest_yeo/bigfile_pruned_merged.csv")



big_df = pd.read_csv(BIG_FILE)
clusters_df = pd.read_csv(CLUSTERS_FILE)

# Tidy whitespace in the headers of both files before matching on names.
big_df = normalize_columns(big_df)
clusters_df = normalize_columns(clusters_df)

# Columns removed from the big file before merging in the cluster labels.
columns_to_drop: List[str] = [
    "age_at_baseline","b_ctq_total","b_ctq_cutoff","b_ctq_NEGLECT","b_ctq_ABUSE",
    "b_ctq_sexual_abuse","b_ctq_physical_abuse","b_ctq_emotional_abuse","b_ctq_physical_neglect",
    "b_ctq_emotional_neglect","b_ctq_sexual_abuse_cutoff","b_ctq_physical_abuse_cutoff",
    "b_ctq_emotional_abuse_cutoff","b_ctq_physical_neglect_cutoff","b_ctq_emotional_neglect_cutoff",
    "b_ctq_denial_score","b_lec_0_to_16_total","b_lec_interpersonal_events","b_lec_non_interpersonal_events",
    "b_strength_average","b_PHQ_total","b_GAD7_total","b_social_support_total","b_pcl_total",
    "b_DERS_total","b_DERS_Nonacceptance_Emotional_Responses","b_DERS_Goal_Directed_Behavior",
    "b_DERS_Impulse_Control","b_DERS_Lack_Emotional_Awareness","b_DERS_Emotion_Regulation_Strategies",
    "b_DERS_Lack_Emotional_Clarity","b_DES_average","b_DES_Absorption","b_DES_Amnesia","b_DES_Depersonalization",
    "b_LHQ_total","b_PBI_mom_care","b_PBI_mom_overprotection","b_PBI_dad_care","b_PBI_dad_overprotection",
    "b_IRI_Perspective_Taking","b_IRI_Empathic_Concern","b_IRI_Personal_Distress","b_IRI_Fantasy",
    "T1_PHQ_total","T2_PHQ_total","T3_PHQ_total","T1_GAD7_total","T2_GAD7_total","T3_GAD7_total",
    "after_bits_PTSD_total","after_bits_birth_symptoms","after_bits_General_symptoms","after_bits_Dissociatie_symptoms",
    "after_bits_PTSD_criterion","after_bits_Re_experiencing","after_bits_Avoidance","after_bits_Negative_Cognitions",
    "after_bits_Hyperarousal","after_MPAS_total","after_MPAS_proximity","after_MPAS_Acceptance","after_MPAS_Tolerance",
    "after_MPAS_Competence","after_MPAS_Attachment","after_MPAS_Hostility","after_MPAS_Interaction","after_CTQ_total",
    "after_CTQ_cutoff","after_CTQ_NEGLECT","after_CTQ_ABUSE","after_CTQ_sexual_abuse","after_CTQ_physical_abuse",
    "after_CTQ_emotional_abuse","after_CTQ_physical_neglect","after_CTQ_emotional_neglect","after_CTQ_sexual_abuse_cutoff",
    "after_CTQ_physical_abuse_cutoff","after_CTQ_emotional_abuse_cutoff","after_CTQ_physical_neglect_cutoff",
    "after_CTQ_emotional_neglect_cutoff","after_CTQ_denial_score","after_DERS_total",
    "after_DERS_Nonacceptance_Emotional_Responses","after_DERS_Goal_Directed_Behavior","after_DERS_Impulse_Control",
    "after_DERS_Lack_Emotional_Awareness","after_DERS_Emotion_Regulation_Strategies","after_DERS_Lack_Emotional_Clarity",
    "after_DES_total","after_DES_Absorption","after_DES_Amnesia","after_DES_Depersonalization",
    "VIS","SMN","DA","VAN","LIM","FPN","DMN",
    "SMN-VIS","DA-VIS","VAN-VIS","LIM-VIS","FPN-VIS","DMN-VIS","DA-SMN","VAN-SMN","LIM-SMN","FPN-SMN","DMN-SMN",
    "VAN-DA","LIM-DAN","FPN-DA","DMN-DAN","LIM-VAN","FPN-VAN","DMN-VAN","FPN-LIM","DMN-LIM","DMN-FPN",
    "VIS-Brain","SMN-Brain","DAN-Brain","VAN-Brain","LIM-Brain","FPN-Brain","DMN-Brain",
    "seg_VIS","seg_SMN","seg_DAN","seg_VAN","seg_LIM","seg_FPN",
    "FD","total_euler","Clustering","total_euler_before","seg_DMN"
]

# Clean the drop list with the same rules normalize_columns applies to the
# headers (strip, collapse whitespace, no spaces around "_"), so a name can
# only be reported as missing when it is really absent from the file.
columns_to_drop = list(
    pd.Index(columns_to_drop).map(
        lambda c: re.sub(r"\s+", " ", c.strip()).replace(" _","_").replace("_ ","_")
    )
)

present_to_drop = [c for c in columns_to_drop if c in big_df.columns]
missing_to_drop = sorted(set(columns_to_drop) - set(present_to_drop))

pruned_df = big_df.drop(columns=present_to_drop, errors="ignore")

# Use explicit columns we observed
merge_key = "Subject_Code"
cluster_col = "kmeans_label"

# Left merge keeps every subject of the big file; validate="m:1" makes pandas
# raise if a subject code appears more than once in the clusters file.
merged_df = pruned_df.merge(
    clusters_df[[merge_key, cluster_col]],
    on=merge_key,
    how="left",
    validate="m:1"
)

# Subjects with no row in the clusters file end up with a NaN label after the
# left merge and silently fall out of every group comparison downstream.
# Count them here so the loss is visible in the summary.
unmatched_mask = ~pruned_df[merge_key].isin(clusters_df[merge_key])
unmatched_subjects = pruned_df.loc[unmatched_mask, merge_key].tolist()

# Save
merged_df.to_csv(OUTPUT_FILE, index=False)

# Show a compact summary and a preview
summary = {
    "big_file_rows": len(big_df),
    "big_file_cols_before": len(big_df.columns),
    "dropped_columns_found": len(present_to_drop),
    "dropped_columns_missing": len(missing_to_drop),
    "big_file_cols_after_drop": len(pruned_df.columns),
    "clusters_file_rows": len(clusters_df),
    "clusters_file_cols": len(clusters_df.columns),
    "merge_strategy": "key 'Subject_Code'",
    "cluster_column_used": cluster_col,
    "output_path": str(OUTPUT_FILE),
    "output_rows": len(merged_df),
    "output_cols": len(merged_df.columns),
    "example_dropped_columns": present_to_drop[:8],
    "example_missing_columns": missing_to_drop[:8],
    "rows_without_cluster": len(unmatched_subjects),
    "example_unmatched_subjects": unmatched_subjects[:8],
}

print("Summary:", summary)
print(merged_df.head(50))


Summary: {'big_file_rows': 117, 'big_file_cols_before': 267, 'dropped_columns_found': 138, 'dropped_columns_missing': 1, 'big_file_cols_after_drop': 129, 'clusters_file_rows': 115, 'clusters_file_cols': 2, 'merge_strategy': "key 'Subject_Code'", 'cluster_column_used': 'kmeans_label', 'output_path': 'ttest_yeo\\bigfile_pruned_merged.csv', 'output_rows': 117, 'output_cols': 130, 'example_dropped_columns': ['age_at_baseline', 'b_ctq_total', 'b_ctq_cutoff', 'b_ctq_NEGLECT', 'b_ctq_ABUSE', 'b_ctq_sexual_abuse', 'b_ctq_physical_abuse', 'b_ctq_emotional_abuse'], 'example_missing_columns': ['total_euler']}
   Subject_Code         eTIV  Brain-Stem     CSF  Cingulate  Cingulate_lh  \
0         NT005  1440326.175     20336.4  1050.0      16031          7347   
1         NT006  1567791.439     18653.6  1238.7      22877         12661   
2         NT002  1410700.638     19351.4   846.2      19813          9761   
3         NT003  1657960.938     23059.0  1091.7      24754         12000   
4        

3rd_ventricle
4th_ventricle
5th_ventricle
brain_stem
cc_anterior
cc_central
cc_mid_anterior
cc_mid_posterior
cc_posterior
csf
left_accumbens_area
left_amygdala
left_caudate
left_cerebellum_cortex
left_cerebellum_white_matter
left_choroid_plexus
left_hippocampus
left_inf_lat_vent
left_lateral_ventricle
left_non_wm_hypointensities
left_pallidum
left_putamen
left_thalamus
left_ventraldc
left_vessel
left_wm_hypointensities
non_wm_hypointensities
optic_chiasm
right_accumbens_area
right_amygdala
right_caudate
right_cerebellum_cortex
right_cerebellum_white_matter
right_choroid_plexus
right_hippocampus
right_inf_lat_vent
right_lateral_ventricle
right_non_wm_hypointensities
right_pallidum
right_putamen
right_thalamus
right_ventraldc
right_vessel
right_wm_hypointensities
wm_hypointensities


In [58]:
# Subcortical features: match requested base names to the CSV columns.
# Steps carried out in this cell:
# 1) Load the merged CSV
# 2) Normalize names so that e.g. "Left-Accumbens-area_ses1" becomes "left_accumbens_area"
#    (lowercase, replace '-' and spaces with '_', collapse repeats, drop _ses1/_ses2)
# 3) Print the original -> normalized mapping for inspection
# 4) Select, for each requested base name, the original column with the same normalized name
# 5) Run the subcortical t-tests/correlations on the selected columns


# Merged file read by this cell, and the column that holds the group label
DATA_FILE = Path("ttest_yeo/bigfile_pruned_merged.csv")
GROUP_COL = "kmeans_label"

# Requested base names (underscored, lowercase, no session suffix)
base_features = [
    "3rd_ventricle","4th_ventricle","5th_ventricle","brain_stem","cc_anterior","cc_central",
    "cc_mid_anterior","cc_mid_posterior","cc_posterior","csf","left_accumbens_area","left_amygdala",
    "left_caudate","left_cerebellum_cortex","left_cerebellum_white_matter","left_choroid_plexus",
    "left_hippocampus","left_inf_lat_vent","left_lateral_ventricle","left_non_wm_hypointensities",
    "left_pallidum","left_putamen","left_thalamus","left_ventraldc","left_vessel","left_wm_hypointensities",
    "non_wm_hypointensities","optic_chiasm","right_accumbens_area","right_amygdala","right_caudate",
    "right_cerebellum_cortex","right_cerebellum_white_matter","right_choroid_plexus","right_hippocampus",
    "right_inf_lat_vent","right_lateral_ventricle","right_non_wm_hypointensities","right_pallidum",
    "right_putamen","right_thalamus","right_ventraldc","right_vessel","right_wm_hypointensities",
    "wm_hypointensities"
]

def normalize_name(name: str) -> str:
    """Canonicalise a column / feature name for matching.

    Steps: strip surrounding whitespace; turn whitespace runs, hyphens and
    dots into underscores; collapse repeated underscores; lowercase; finally
    drop a trailing session suffix (``_ses1``, ``_ses2``, ``_ses_1``,
    ``_ses_2``).

    Example: ``"Left-Accumbens-area_ses1"`` -> ``"left_accumbens_area"``.
    """
    s = name.strip()
    s = re.sub(r"\s+", "_", s)      # spaces -> _
    s = s.replace("-", "_")         # hyphens -> _
    s = s.replace(".", "_")         # dots -> _
    s = re.sub(r"_+", "_", s)       # collapse multiple _
    s = s.lower()
    # Drop session suffixes. By this point any whitespace between "ses" and
    # the digit has already become "_", so the optional separator must be an
    # underscore (a "\s*" here could never match anything).
    s = re.sub(r"_ses_?[12]$", "", s)
    return s

# Load data
df = pd.read_csv(DATA_FILE)

if GROUP_COL not in df.columns:
    raise ValueError(f"{GROUP_COL} not in file.")

# Original column names and their normalized forms, in file order
orig_cols = df.columns.tolist()
norm_cols = [normalize_name(c) for c in orig_cols]

map_orig_to_norm: Dict[str, str] = dict(zip(orig_cols, norm_cols))

# Build a "best original" picker for each normalized name:
# - prefer originals ending with _ses1
# - else originals ending with _ses2
# - else the first seen
# The session suffix has to be detected on the ORIGINAL name: normalize_name()
# strips it, so the normalized name can never end in "_ses1". The separator
# class mirrors what normalize_name() turns into "_" (whitespace, '_', '.', '-').
session_suffix = re.compile(r"[\s_.\-]+ses[\s_.\-]*([12])$", re.IGNORECASE)
pref: Dict[str, str] = {}
pref_rank: Dict[str, int] = {}  # 1 = ses1, 2 = ses2, 3 = no session suffix
for orig, norm in zip(orig_cols, norm_cols):
    hit = session_suffix.search(orig.strip())
    rank = int(hit.group(1)) if hit else 3
    # Strictly better rank replaces the current pick; ties keep the first seen.
    if norm not in pref or rank < pref_rank[norm]:
        pref[norm] = orig
        pref_rank[norm] = rank

# Make a mapping table so you can inspect
mapping_table = pd.DataFrame({
    "original": orig_cols,
    "normalized": norm_cols
}).sort_values(["normalized","original"])

print(f"Column name mapping (original -> normalized)  {mapping_table.head(200)}")

# Now map requested base names to selected original columns using normalized keys
selected_originals: List[str] = []
selection_reason: List[Tuple[str,str,str]] = []  # (base, original, reason)
for base in base_features:
    n = normalize_name(base)
    if n in pref:
        selected_originals.append(pref[n])
        selection_reason.append((base, pref[n], "matched_normalized"))
    else:
        selection_reason.append((base, "", "not_found"))

selection_df = pd.DataFrame(selection_reason, columns=["base_name","selected_original","reason"])
print(f"Requested base names -> selected original column  {selection_df} ")

# Helper stats functions
def welch_ttest(x: np.ndarray, y: np.ndarray):
    """Welch's unequal-variance t-test; returns ``(t, p)`` as Python floats.

    ``(nan, nan)`` is returned without calling scipy when either input has
    fewer than two elements. NaNs inside the inputs are omitted by scipy.
    """
    too_small = min(len(x), len(y)) < 2
    if too_small:
        return np.nan, np.nan
    outcome = stats.ttest_ind(x, y, equal_var=False, nan_policy="omit")
    return float(outcome[0]), float(outcome[1])

def cohens_d(x: np.ndarray, y: np.ndarray) -> float:
    """Cohen's d for two independent samples, using the pooled standard deviation.

    Returns NaN when either sample has fewer than two observations, 0.0 when
    the pooled standard deviation is exactly zero, and otherwise the
    standardized mean difference as a Python float.
    """
    n_x = len(x)
    n_y = len(y)
    if min(n_x, n_y) < 2:
        return np.nan

    # Pooled variance: each sample variance weighted by its degrees of freedom.
    weighted_var_sum = (n_x - 1) * x.var(ddof=1) + (n_y - 1) * y.var(ddof=1)
    pooled_sd = np.sqrt(weighted_var_sum / (n_x + n_y - 2))
    if pooled_sd == 0:
        return 0.0

    mean_gap = x.mean() - y.mean()
    return float(mean_gap / pooled_sd)

def fdr_bh(pvals: np.ndarray, alpha: float = 0.05):
    """Benjamini-Hochberg FDR correction.

    NaN p-values are ignored: they are never rejected and their adjusted
    value stays NaN. Outputs are aligned with the input order.

    Returns ``(rejected, p_adjusted, critical_value)``; ``critical_value`` is
    0.0 when nothing is rejected or when there is no non-NaN p-value.
    """
    p = np.array(pvals, dtype=float)
    n = np.sum(~np.isnan(p))
    if n == 0:
        return np.array([False]*len(p)), np.array([np.nan]*len(p)), 0.0

    # Ascending order; NaNs are given a sentinel of 1.1 so they sort last.
    order = np.argsort(np.where(np.isnan(p), 1.1, p))
    sorted_p = p[order]
    ranked_nonan = sorted_p[~np.isnan(sorted_p)]
    m = len(ranked_nonan)
    ranks = np.arange(1, m + 1)
    crit = alpha * (ranks / m)

    rejected = np.zeros_like(p, dtype=bool)
    p_adj = np.full_like(p, np.nan, dtype=float)

    # Step-up rule: the largest rank whose p-value is under its threshold
    # decides how many of the smallest p-values are rejected.
    passing = np.flatnonzero(ranked_nonan <= crit)
    max_i = passing[-1] + 1 if passing.size else 0
    if max_i > 0:
        rejected[order[:max_i]] = True

    # Adjusted p-values: running minimum (from the largest rank down) of
    # m/rank * p, capped at 1.
    scaled = (m / ranks) * ranked_nonan
    adj = np.minimum(np.minimum.accumulate(scaled[::-1])[::-1], 1.0)

    # The first len(adj) entries of `order` are the non-NaN positions.
    p_adj[order[:len(adj)]] = adj
    crit_val = crit[max_i - 1] if max_i > 0 else 0.0
    return rejected, p_adj, crit_val

# Proceed with tests if we found any selected columns
if len(selected_originals) == 0:
    print("None of the requested base names matched available columns after normalization.")
else:
    # ensure binary groups
    g = df[GROUP_COL]
    if g.dtype.kind not in "biu":
        try:
            g = g.astype(int)
        except (ValueError, TypeError):
            # Not castable to int (e.g. NaN labels or string labels):
            # map the two distinct non-missing labels to 0 and 1.
            uniq = sorted(g.dropna().unique().tolist())
            if len(uniq) != 2:
                raise ValueError(f"{GROUP_COL} must have 2 groups, found: {uniq}")
            g = g.map({uniq[0]:0, uniq[1]:1})
    df[GROUP_COL] = g
    uniq_g = sorted(df[GROUP_COL].dropna().unique().tolist())
    # The two-sample comparison below is only defined for exactly two groups.
    # Fail loudly instead of raising IndexError (fewer than 2) or silently
    # ignoring every group after the first two (more than 2).
    if len(uniq_g) != 2:
        raise ValueError(f"{GROUP_COL} must have 2 groups, found: {uniq_g}")
    g0, g1 = uniq_g

    # T-tests: one Welch test and Cohen's d per selected column (NaNs dropped per group)
    rows = []
    for col in selected_originals:
        x = df.loc[df[GROUP_COL]==g0, col].astype(float).dropna().values
        y = df.loc[df[GROUP_COL]==g1, col].astype(float).dropna().values
        t, p = welch_ttest(x, y)
        d = cohens_d(x, y)
        rows.append({
            "feature_original": col,
            "feature_normalized": normalize_name(col),
            f"n_{g0}": len(x), f"mean_{g0}": float(np.mean(x)) if len(x) else np.nan, f"sd_{g0}": float(np.std(x, ddof=1)) if len(x) else np.nan,
            f"n_{g1}": len(y), f"mean_{g1}": float(np.mean(y)) if len(y) else np.nan, f"sd_{g1}": float(np.std(y, ddof=1)) if len(y) else np.nan,
            "t_stat": t, "p_value": p, "cohens_d": d
        })
    ttest_df = pd.DataFrame(rows).sort_values("p_value", na_position="last")
    # fdr_bh returns arrays in the order of its input, i.e. the sorted row order
    rej, p_adj, crit = fdr_bh(ttest_df["p_value"].values)
    ttest_df["p_value_fdr_bh"] = p_adj
    ttest_df["significant_fdr_0.05"] = rej

    # Correlations: Pearson r between the group label and each feature,
    # using rows where both are present (at least 3 required)
    corr_rows = []
    gg = df[GROUP_COL].astype(float)
    for col in selected_originals:
        xx = df[col].astype(float)
        m = gg.notna() & xx.notna()
        if m.sum() < 3:
            r, p = (np.nan, np.nan)
        else:
            r, p = stats.pearsonr(gg[m], xx[m])
        corr_rows.append({"feature_original": col, "feature_normalized": normalize_name(col), "r_pointbiserial": r, "p_value": p})
    corr_df = pd.DataFrame(corr_rows).sort_values("p_value", na_position="last")
    rej_c, p_adj_c, crit_c = fdr_bh(corr_df["p_value"].values)
    corr_df["p_value_fdr_bh"] = p_adj_c
    corr_df["significant_fdr_0.05"] = rej_c

    # Save and display
    t_out = Path("ttest_yeo/ttest_subcortical_normalized.csv")
    c_out = Path("ttest_yeo/corr_subcortical_normalized.csv")
    map_out = Path("ttest_yeo/column_mapping_normalized.csv")
    ttest_df.to_csv(t_out, index=False)
    corr_df.to_csv(c_out, index=False)
    mapping_table.to_csv(map_out, index=False)

    print(f"T-tests (normalized mapping) {ttest_df}")
    print(f"Correlations (normalized mapping) {corr_df}")

    print({
        "downloads": {
            "ttest_results_csv": str(t_out),
            "corr_results_csv": str(c_out),
            "column_mapping_csv": str(map_out)
        },
        "tested_feature_count": len(selected_originals)
    })


Column name mapping (original -> normalized)                        original                  normalized
56                 bankssts_lh                 bankssts_lh
57                 bankssts_rh                 bankssts_rh
2                   Brain-Stem                  brain_stem
58  caudalanteriorcingulate_lh  caudalanteriorcingulate_lh
59  caudalanteriorcingulate_rh  caudalanteriorcingulate_rh
..                         ...                         ...
51                      VAN_GM                      van_gm
52                      VIS_GM                      vis_gm
55               WhiteSurfArea               whitesurfarea
53            WhiteSurfArea_lh            whitesurfarea_lh
54            WhiteSurfArea_rh            whitesurfarea_rh

[130 rows x 2 columns]
Requested base names -> selected original column                          base_name        selected_original              reason
0                   3rd_ventricle                                    not_found
1             

In [59]:
# Cortical parcels (left/right hemisphere): group comparison by cluster label.
# Steps carried out in this cell:
# 1) Load the merged CSV
# 2) Normalize column names (lowercase, '-'/'.'/spaces -> '_', collapse repeats, drop session suffix)
# 3) Print the original -> normalized mapping for inspection
# 4) Select, for each requested base name, the original column with the same normalized name
# 5) Run the cortical t-tests/correlations on the selected columns


DATA_FILE = Path("ttest_yeo/bigfile_pruned_merged.csv")
GROUP_COL = "kmeans_label"

# Requested base names (underscored, lowercase, no session suffix)
base_features= [
    'bankssts_lh', 'bankssts_rh', 'caudalanteriorcingulate_lh', 'caudalanteriorcingulate_rh',
    'caudalmiddlefrontal_lh', 'caudalmiddlefrontal_rh', 'cuneus_lh', 'cuneus_rh',
    'entorhinal_lh', 'entorhinal_rh', 'frontalpole_lh', 'frontalpole_rh',
    'fusiform_lh', 'fusiform_rh', 'inferiorparietal_lh', 'inferiorparietal_rh',
    'inferiortemporal_lh', 'inferiortemporal_rh', 'insula_lh', 'insula_rh',
    'isthmuscingulate_lh', 'isthmuscingulate_rh', 'lateraloccipital_lh', 'lateraloccipital_rh',
    'lateralorbitofrontal_lh', 'lateralorbitofrontal_rh', 'lingual_lh', 'lingual_rh',
    'medialorbitofrontal_lh', 'medialorbitofrontal_rh', 'middletemporal_lh', 'middletemporal_rh',
    'paracentral_lh', 'paracentral_rh', 'parahippocampal_lh', 'parahippocampal_rh',
    'parsopercularis_lh', 'parsopercularis_rh', 'parsorbitalis_lh', 'parsorbitalis_rh',
    'parstriangularis_lh', 'parstriangularis_rh', 'pericalcarine_lh', 'pericalcarine_rh',
    'postcentral_lh', 'postcentral_rh', 'posteriorcingulate_lh', 'posteriorcingulate_rh',
    'precentral_lh', 'precentral_rh', 'precuneus_lh', 'precuneus_rh',
    'rostralanteriorcingulate_lh', 'rostralanteriorcingulate_rh',
    'rostralmiddlefrontal_lh', 'rostralmiddlefrontal_rh',
    'superiorfrontal_lh', 'superiorfrontal_rh', 'superiorparietal_lh', 'superiorparietal_rh',
    'superiortemporal_lh', 'superiortemporal_rh', 'supramarginal_lh', 'supramarginal_rh',
    'temporalpole_lh', 'temporalpole_rh', 'transversetemporal_lh', 'transversetemporal_rh'
]


# Load data
df = pd.read_csv(DATA_FILE)

if GROUP_COL not in df.columns:
    raise ValueError(f"{GROUP_COL} not in file.")

# Original column names and their normalized forms, in file order
orig_cols = df.columns.tolist()
norm_cols = [normalize_name(c) for c in orig_cols]

map_orig_to_norm: Dict[str, str] = dict(zip(orig_cols, norm_cols))

# Build a "best original" picker for each normalized name:
# - prefer originals ending with _ses1
# - else originals ending with _ses2
# - else the first seen
# The session suffix has to be detected on the ORIGINAL name: normalize_name()
# strips it, so the normalized name can never end in "_ses1". The separator
# class mirrors what normalize_name() turns into "_" (whitespace, '_', '.', '-').
session_suffix = re.compile(r"[\s_.\-]+ses[\s_.\-]*([12])$", re.IGNORECASE)
pref: Dict[str, str] = {}
pref_rank: Dict[str, int] = {}  # 1 = ses1, 2 = ses2, 3 = no session suffix
for orig, norm in zip(orig_cols, norm_cols):
    hit = session_suffix.search(orig.strip())
    rank = int(hit.group(1)) if hit else 3
    # Strictly better rank replaces the current pick; ties keep the first seen.
    if norm not in pref or rank < pref_rank[norm]:
        pref[norm] = orig
        pref_rank[norm] = rank

# Make a mapping table so you can inspect
mapping_table = pd.DataFrame({
    "original": orig_cols,
    "normalized": norm_cols
}).sort_values(["normalized","original"])

print(f"Column name mapping (original -> normalized)  {mapping_table.head(200)}")

# Now map requested base names to selected original columns using normalized keys
selected_originals: List[str] = []
selection_reason: List[Tuple[str,str,str]] = []  # (base, original, reason)
for base in base_features:
    n = normalize_name(base)
    if n in pref:
        selected_originals.append(pref[n])
        selection_reason.append((base, pref[n], "matched_normalized"))
    else:
        selection_reason.append((base, "", "not_found"))

selection_df = pd.DataFrame(selection_reason, columns=["base_name","selected_original","reason"])
print(f"Requested base names -> selected original column  {selection_df} ")



# Proceed with tests if we found any selected columns
if len(selected_originals) == 0:
    print("None of the requested base names matched available columns after normalization.")
else:
    # ensure binary groups
    g = df[GROUP_COL]
    if g.dtype.kind not in "biu":
        try:
            g = g.astype(int)
        except (ValueError, TypeError):
            # Not castable to int (e.g. NaN labels or string labels):
            # map the two distinct non-missing labels to 0 and 1.
            uniq = sorted(g.dropna().unique().tolist())
            if len(uniq) != 2:
                raise ValueError(f"{GROUP_COL} must have 2 groups, found: {uniq}")
            g = g.map({uniq[0]:0, uniq[1]:1})
    df[GROUP_COL] = g
    uniq_g = sorted(df[GROUP_COL].dropna().unique().tolist())
    # The two-sample comparison below is only defined for exactly two groups.
    # Fail loudly instead of raising IndexError (fewer than 2) or silently
    # ignoring every group after the first two (more than 2).
    if len(uniq_g) != 2:
        raise ValueError(f"{GROUP_COL} must have 2 groups, found: {uniq_g}")
    g0, g1 = uniq_g

    # T-tests: one Welch test and Cohen's d per selected column (NaNs dropped per group)
    rows = []
    for col in selected_originals:
        x = df.loc[df[GROUP_COL]==g0, col].astype(float).dropna().values
        y = df.loc[df[GROUP_COL]==g1, col].astype(float).dropna().values
        t, p = welch_ttest(x, y)
        d = cohens_d(x, y)
        rows.append({
            "feature_original": col,
            "feature_normalized": normalize_name(col),
            f"n_{g0}": len(x), f"mean_{g0}": float(np.mean(x)) if len(x) else np.nan, f"sd_{g0}": float(np.std(x, ddof=1)) if len(x) else np.nan,
            f"n_{g1}": len(y), f"mean_{g1}": float(np.mean(y)) if len(y) else np.nan, f"sd_{g1}": float(np.std(y, ddof=1)) if len(y) else np.nan,
            "t_stat": t, "p_value": p, "cohens_d": d
        })
    ttest_df = pd.DataFrame(rows).sort_values("p_value", na_position="last")
    # fdr_bh returns arrays in the order of its input, i.e. the sorted row order
    rej, p_adj, crit = fdr_bh(ttest_df["p_value"].values)
    ttest_df["p_value_fdr_bh"] = p_adj
    ttest_df["significant_fdr_0.05"] = rej

    # Correlations: Pearson r between the group label and each feature,
    # using rows where both are present (at least 3 required)
    corr_rows = []
    gg = df[GROUP_COL].astype(float)
    for col in selected_originals:
        xx = df[col].astype(float)
        m = gg.notna() & xx.notna()
        if m.sum() < 3:
            r, p = (np.nan, np.nan)
        else:
            r, p = stats.pearsonr(gg[m], xx[m])
        corr_rows.append({"feature_original": col, "feature_normalized": normalize_name(col), "r_pointbiserial": r, "p_value": p})
    corr_df = pd.DataFrame(corr_rows).sort_values("p_value", na_position="last")
    rej_c, p_adj_c, crit_c = fdr_bh(corr_df["p_value"].values)
    corr_df["p_value_fdr_bh"] = p_adj_c
    corr_df["significant_fdr_0.05"] = rej_c

    # Save and display
    t_out = Path("ttest_yeo/ttest_cortical_lr.csv")
    c_out = Path("ttest_yeo/corr_cortical_lr.csv")
    # Defined here so the cell does not depend on a variable that another
    # cell only creates inside its own else-branch. Same path as before.
    map_out = Path("ttest_yeo/column_mapping_normalized.csv")
    ttest_df.to_csv(t_out, index=False)
    corr_df.to_csv(c_out, index=False)
    mapping_table.to_csv(map_out, index=False)

    print(f"T-tests (normalized mapping) {ttest_df}")
    print(f"Correlations (normalized mapping) {corr_df}")

    print({
        "downloads": {
            "ttest_results_cortical_lr_csv": str(t_out),
            "corr_results_cortical_lr_csv": str(c_out),
        },
        "tested_feature_count": len(selected_originals)
    })


Column name mapping (original -> normalized)                        original                  normalized
56                 bankssts_lh                 bankssts_lh
57                 bankssts_rh                 bankssts_rh
2                   Brain-Stem                  brain_stem
58  caudalanteriorcingulate_lh  caudalanteriorcingulate_lh
59  caudalanteriorcingulate_rh  caudalanteriorcingulate_rh
..                         ...                         ...
51                      VAN_GM                      van_gm
52                      VIS_GM                      vis_gm
55               WhiteSurfArea               whitesurfarea
53            WhiteSurfArea_lh            whitesurfarea_lh
54            WhiteSurfArea_rh            whitesurfarea_rh

[130 rows x 2 columns]
Requested base names -> selected original column                       base_name           selected_original              reason
0                  bankssts_lh                 bankssts_lh  matched_normalized
1             

In [60]:
# Lobe-level measures (left/right hemisphere): group comparison by cluster label.
# Steps carried out in this cell:
# 1) Load the merged CSV
# 2) Normalize column names (lowercase, '-'/'.'/spaces -> '_', collapse repeats, drop session suffix)
# 3) Print the original -> normalized mapping for inspection
# 4) Select, for each requested base name, the original column with the same normalized name
# 5) Run the lobe t-tests/correlations on the selected columns


DATA_FILE = Path("ttest_yeo/bigfile_pruned_merged.csv")
GROUP_COL = "kmeans_label"

# Requested base names (underscored, lowercase, no session suffix)
base_features= [
    "cingulate_lh",
    "cingulate_rh",
    "frontal_lh",
    "frontal_rh",
    "insula_lh",
    "insula_rh",
    "occipital_lh",
    "occipital_rh",
    "parietal_lh",
    "parietal_rh",
    "temporal_lh",
    "temporal_rh",
]



# Load data
df = pd.read_csv(DATA_FILE)

if GROUP_COL not in df.columns:
    raise ValueError(f"{GROUP_COL} not in file.")

# Original column names and their normalized forms, in file order
orig_cols = df.columns.tolist()
norm_cols = [normalize_name(c) for c in orig_cols]

map_orig_to_norm: Dict[str, str] = dict(zip(orig_cols, norm_cols))

# Build a "best original" picker for each normalized name:
# - prefer originals ending with _ses1
# - else originals ending with _ses2
# - else the first seen
# The session suffix has to be detected on the ORIGINAL name: normalize_name()
# strips it, so the normalized name can never end in "_ses1". The separator
# class mirrors what normalize_name() turns into "_" (whitespace, '_', '.', '-').
session_suffix = re.compile(r"[\s_.\-]+ses[\s_.\-]*([12])$", re.IGNORECASE)
pref: Dict[str, str] = {}
pref_rank: Dict[str, int] = {}  # 1 = ses1, 2 = ses2, 3 = no session suffix
for orig, norm in zip(orig_cols, norm_cols):
    hit = session_suffix.search(orig.strip())
    rank = int(hit.group(1)) if hit else 3
    # Strictly better rank replaces the current pick; ties keep the first seen.
    if norm not in pref or rank < pref_rank[norm]:
        pref[norm] = orig
        pref_rank[norm] = rank

# Make a mapping table so you can inspect
mapping_table = pd.DataFrame({
    "original": orig_cols,
    "normalized": norm_cols
}).sort_values(["normalized","original"])

print(f"Column name mapping (original -> normalized)  {mapping_table.head(200)}")

# Now map requested base names to selected original columns using normalized keys
selected_originals: List[str] = []
selection_reason: List[Tuple[str,str,str]] = []  # (base, original, reason)
for base in base_features:
    n = normalize_name(base)
    if n in pref:
        selected_originals.append(pref[n])
        selection_reason.append((base, pref[n], "matched_normalized"))
    else:
        selection_reason.append((base, "", "not_found"))

selection_df = pd.DataFrame(selection_reason, columns=["base_name","selected_original","reason"])
print(f"Requested base names -> selected original column  {selection_df} ")



# Proceed with tests if we found any selected columns
if len(selected_originals) == 0:
    print("None of the requested base names matched available columns after normalization.")
else:
    # ensure binary groups
    g = df[GROUP_COL]
    if g.dtype.kind not in "biu":
        try:
            g = g.astype(int)
        except (ValueError, TypeError):
            # Not castable to int (e.g. NaN labels or string labels):
            # map the two distinct non-missing labels to 0 and 1.
            uniq = sorted(g.dropna().unique().tolist())
            if len(uniq) != 2:
                raise ValueError(f"{GROUP_COL} must have 2 groups, found: {uniq}")
            g = g.map({uniq[0]:0, uniq[1]:1})
    df[GROUP_COL] = g
    uniq_g = sorted(df[GROUP_COL].dropna().unique().tolist())
    # The two-sample comparison below is only defined for exactly two groups.
    # Fail loudly instead of raising IndexError (fewer than 2) or silently
    # ignoring every group after the first two (more than 2).
    if len(uniq_g) != 2:
        raise ValueError(f"{GROUP_COL} must have 2 groups, found: {uniq_g}")
    g0, g1 = uniq_g

    # T-tests: one Welch test and Cohen's d per selected column (NaNs dropped per group)
    rows = []
    for col in selected_originals:
        x = df.loc[df[GROUP_COL]==g0, col].astype(float).dropna().values
        y = df.loc[df[GROUP_COL]==g1, col].astype(float).dropna().values
        t, p = welch_ttest(x, y)
        d = cohens_d(x, y)
        rows.append({
            "feature_original": col,
            "feature_normalized": normalize_name(col),
            f"n_{g0}": len(x), f"mean_{g0}": float(np.mean(x)) if len(x) else np.nan, f"sd_{g0}": float(np.std(x, ddof=1)) if len(x) else np.nan,
            f"n_{g1}": len(y), f"mean_{g1}": float(np.mean(y)) if len(y) else np.nan, f"sd_{g1}": float(np.std(y, ddof=1)) if len(y) else np.nan,
            "t_stat": t, "p_value": p, "cohens_d": d
        })
    ttest_df = pd.DataFrame(rows).sort_values("p_value", na_position="last")
    # fdr_bh returns arrays in the order of its input, i.e. the sorted row order
    rej, p_adj, crit = fdr_bh(ttest_df["p_value"].values)
    ttest_df["p_value_fdr_bh"] = p_adj
    ttest_df["significant_fdr_0.05"] = rej

    # Correlations: Pearson r between the group label and each feature,
    # using rows where both are present (at least 3 required)
    corr_rows = []
    gg = df[GROUP_COL].astype(float)
    for col in selected_originals:
        xx = df[col].astype(float)
        m = gg.notna() & xx.notna()
        if m.sum() < 3:
            r, p = (np.nan, np.nan)
        else:
            r, p = stats.pearsonr(gg[m], xx[m])
        # "feature_original" is included so this table has the same columns
        # as the t-test table and the other correlation outputs.
        corr_rows.append({"feature_original": col, "feature_normalized": normalize_name(col), "r_pointbiserial": r, "p_value": p})
    corr_df = pd.DataFrame(corr_rows).sort_values("p_value", na_position="last")
    rej_c, p_adj_c, crit_c = fdr_bh(corr_df["p_value"].values)
    corr_df["p_value_fdr_bh"] = p_adj_c
    corr_df["significant_fdr_0.05"] = rej_c

    # Save and display
    t_out = Path("ttest_yeo/ttest_lobes_lr.csv")
    c_out = Path("ttest_yeo/corr_lobes_lr.csv")
    # Defined here so the cell does not depend on a variable that another
    # cell only creates inside its own else-branch. Same path as before.
    map_out = Path("ttest_yeo/column_mapping_normalized.csv")
    ttest_df.to_csv(t_out, index=False)
    corr_df.to_csv(c_out, index=False)
    mapping_table.to_csv(map_out, index=False)

    print(f"T-tests (normalized mapping) {ttest_df}")
    print(f"Correlations (normalized mapping) {corr_df}")

    print({
        "downloads": {
            "ttest_results_lobes_lr.csv": str(t_out),
            "corr_results_lobes_lr.csv": str(c_out),
        },
        "tested_feature_count": len(selected_originals)
    })


Column name mapping (original -> normalized)                        original                  normalized
56                 bankssts_lh                 bankssts_lh
57                 bankssts_rh                 bankssts_rh
2                   Brain-Stem                  brain_stem
58  caudalanteriorcingulate_lh  caudalanteriorcingulate_lh
59  caudalanteriorcingulate_rh  caudalanteriorcingulate_rh
..                         ...                         ...
51                      VAN_GM                      van_gm
52                      VIS_GM                      vis_gm
55               WhiteSurfArea               whitesurfarea
53            WhiteSurfArea_lh            whitesurfarea_lh
54            WhiteSurfArea_rh            whitesurfarea_rh

[130 rows x 2 columns]
Requested base names -> selected original column         base_name selected_original              reason
0   cingulate_lh      Cingulate_lh  matched_normalized
1   cingulate_rh      Cingulate_rh  matched_normalized
2     f

In [61]:
# Lobe-level (whole-brain) group comparison on the merged CSV.
# We will:
# 1) Load the merged CSV
# 2) Normalize column names with normalize_name, which turns names like
#    "Left-Accumbens-area_ses1" into "left_accumbens_area"
#    (lowercase, replace '-', '.' and spaces with '_', collapse repeats, drop _ses1/_ses2)
# 3) Show a mapping of original -> normalized so you can see what's happening
# 4) Match the requested base names against these normalized names
# 5) Run Welch t-tests and point-biserial correlations on the matched columns,
#    each with Benjamini-Hochberg FDR correction


DATA_FILE = Path("ttest_yeo/bigfile_pruned_merged.csv")
GROUP_COL = "kmeans_label"

# Your base names (underscored, no session suffix)
base_features = [
    "cingulate",
    "frontal",
    "insula",
    "occipital",
    "parietal",
    "temporal",
]



# Load data
df = pd.read_csv(DATA_FILE)

if GROUP_COL not in df.columns:
    raise ValueError(f"{GROUP_COL} not in file.")

# Build the original -> normalized map
orig_cols = df.columns.tolist()
norm_cols = [normalize_name(c) for c in orig_cols]

map_orig_to_norm: Dict[str, str] = dict(zip(orig_cols, norm_cols))

# Build a "best original" picker for each normalized name:
# - prefer originals ending with _ses1
# - else originals ending with _ses2
# - else the first seen
# The session suffix has to be detected on the ORIGINAL name: normalize_name()
# strips _ses1/_ses2, so testing its output for "_ses1" can never match.
pref: Dict[str, str] = {}
pref_rank: Dict[str, int] = {}  # rank of the currently chosen original (lower is better)
for orig, norm in zip(orig_cols, norm_cols):
    # Same separator handling as normalize_name, but keeping the session suffix.
    suffix_key = re.sub(r"[\s\-.]+", "_", str(orig).strip())
    suffix_key = re.sub(r"_+", "_", suffix_key).lower()
    if suffix_key.endswith("_ses1"):
        rank = 0
    elif suffix_key.endswith("_ses2"):
        rank = 1
    else:
        rank = 2
    # Strict '<' keeps the first-seen column when two candidates tie.
    if norm not in pref or rank < pref_rank[norm]:
        pref[norm] = orig
        pref_rank[norm] = rank

# Make a mapping table so you can inspect
mapping_table = pd.DataFrame({
    "original": orig_cols,
    "normalized": norm_cols
}).sort_values(["normalized","original"])

print(f"Column name mapping (original -> normalized)  {mapping_table.head(200)}")

# Now map requested base names to selected original columns using normalized keys
selected_originals: List[str] = []
selection_reason: List[Tuple[str,str,str]] = []  # (base, original, reason)
for base in base_features:
    n = normalize_name(base)
    if n in pref:
        selected_originals.append(pref[n])
        selection_reason.append((base, pref[n], "matched_normalized"))
    else:
        selection_reason.append((base, "", "not_found"))

selection_df = pd.DataFrame(selection_reason, columns=["base_name","selected_original","reason"])
print(f"Requested base names -> selected original column  {selection_df} ")



# Proceed with tests if we found any selected columns
if len(selected_originals) == 0:
    print("None of the requested base names matched available columns after normalization.")
else:
    # ensure binary groups
    g = df[GROUP_COL]
    if g.dtype.kind not in "biu":
        try:
            g = g.astype(int)
        except Exception:
            # Non-integer labels (or labels with NaN): recode the two sorted values as 0/1.
            uniq = sorted(g.dropna().unique().tolist())
            if len(uniq) != 2:
                raise ValueError(f"{GROUP_COL} must have 2 groups, found: {uniq}")
            g = g.map({uniq[0]:0, uniq[1]:1})
    df[GROUP_COL] = g
    uniq_g = sorted(df[GROUP_COL].dropna().unique().tolist())
    # Integer labels skip the check above, so validate here too: a single group
    # would raise IndexError below, and extra groups would be silently left out
    # of the t-tests while still entering the correlations.
    if len(uniq_g) != 2:
        raise ValueError(f"{GROUP_COL} must have 2 groups, found: {uniq_g}")
    g0, g1 = uniq_g

    # T-tests (Welch) with Cohen's d, one row per selected column
    rows = []
    for col in selected_originals:
        x = df.loc[df[GROUP_COL]==g0, col].astype(float).dropna().values
        y = df.loc[df[GROUP_COL]==g1, col].astype(float).dropna().values
        t, p = welch_ttest(x, y)
        d = cohens_d(x, y)
        rows.append({
            "feature_original": col,
            "feature_normalized": normalize_name(col),
            f"n_{g0}": len(x), f"mean_{g0}": float(np.mean(x)) if len(x) else np.nan, f"sd_{g0}": float(np.std(x, ddof=1)) if len(x) else np.nan,
            f"n_{g1}": len(y), f"mean_{g1}": float(np.mean(y)) if len(y) else np.nan, f"sd_{g1}": float(np.std(y, ddof=1)) if len(y) else np.nan,
            "t_stat": t, "p_value": p, "cohens_d": d
        })
    ttest_df = pd.DataFrame(rows).sort_values("p_value", na_position="last")
    rej, p_adj, crit = fdr_bh(ttest_df["p_value"].values)
    ttest_df["p_value_fdr_bh"] = p_adj
    ttest_df["significant_fdr_0.05"] = rej

    # Correlations: Pearson r between the group label and each feature
    corr_rows = []
    gg = df[GROUP_COL].astype(float)
    for col in selected_originals:
        xx = df[col].astype(float)
        m = gg.notna() & xx.notna()
        if m.sum() < 3:
            r, p = (np.nan, np.nan)
        else:
            r, p = stats.pearsonr(gg[m], xx[m])
        corr_rows.append({"feature_normalized": normalize_name(col), "r_pointbiserial": r, "p_value": p})
    corr_df = pd.DataFrame(corr_rows).sort_values("p_value", na_position="last")
    rej_c, p_adj_c, crit_c = fdr_bh(corr_df["p_value"].values)
    corr_df["p_value_fdr_bh"] = p_adj_c
    corr_df["significant_fdr_0.05"] = rej_c

    # Save and display
    t_out = Path("ttest_yeo/ttest_lobes_full_brain.csv")
    c_out = Path("ttest_yeo/corr_lobes_full_brain.csv")
    ttest_df.to_csv(t_out, index=False)
    corr_df.to_csv(c_out, index=False)
    # NOTE(review): map_out is not assigned in this cell; it presumably comes from
    # an earlier cell. Confirm it is defined after Restart & Run All.
    mapping_table.to_csv(map_out, index=False)

    print(f"T-tests (normalized mapping) {ttest_df}")
    print(f"Correlations (normalized mapping) {corr_df}")

    print({
        "downloads": {
            "ttest_results_lobes_full_brain.csv": str(t_out),
            "corr_results__lobes_full_brain.csv": str(c_out),
        },
        "tested_feature_count": len(selected_originals)
    })


Column name mapping (original -> normalized)                        original                  normalized
56                 bankssts_lh                 bankssts_lh
57                 bankssts_rh                 bankssts_rh
2                   Brain-Stem                  brain_stem
58  caudalanteriorcingulate_lh  caudalanteriorcingulate_lh
59  caudalanteriorcingulate_rh  caudalanteriorcingulate_rh
..                         ...                         ...
51                      VAN_GM                      van_gm
52                      VIS_GM                      vis_gm
55               WhiteSurfArea               whitesurfarea
53            WhiteSurfArea_lh            whitesurfarea_lh
54            WhiteSurfArea_rh            whitesurfarea_rh

[130 rows x 2 columns]
Requested base names -> selected original column     base_name selected_original              reason
0  cingulate         Cingulate  matched_normalized
1    frontal           Frontal  matched_normalized
2     insula       

In [63]:
# Network-level (whole-brain *_GM features) group comparison on the merged CSV.
# We will:
# 1) Load the merged CSV
# 2) Normalize column names with normalize_name, which turns names like
#    "Left-Accumbens-area_ses1" into "left_accumbens_area"
#    (lowercase, replace '-', '.' and spaces with '_', collapse repeats, drop _ses1/_ses2)
# 3) Show a mapping of original -> normalized so you can see what's happening
# 4) Match the requested base names against these normalized names
# 5) Run Welch t-tests and point-biserial correlations on the matched columns,
#    each with Benjamini-Hochberg FDR correction


DATA_FILE = Path("ttest_yeo/bigfile_pruned_merged.csv")
GROUP_COL = "kmeans_label"

# Your base names (underscored, no session suffix)
# NOTE(review): the recorded output of this cell reports "DA_GM" as not_found;
# presumably the column has a different name in the CSV. Check the mapping
# table and correct the entry, otherwise this network is silently untested.
base_features = [
    "DA_GM",
    "VAN_GM",
    "VIS_GM",
    "FPN_GM",
    "SMN_GM",
    "DMN_GM",
]



# Load data
df = pd.read_csv(DATA_FILE)

if GROUP_COL not in df.columns:
    raise ValueError(f"{GROUP_COL} not in file.")

# Build the original -> normalized map
orig_cols = df.columns.tolist()
norm_cols = [normalize_name(c) for c in orig_cols]

map_orig_to_norm: Dict[str, str] = dict(zip(orig_cols, norm_cols))

# Build a "best original" picker for each normalized name:
# - prefer originals ending with _ses1
# - else originals ending with _ses2
# - else the first seen
# The session suffix has to be detected on the ORIGINAL name: normalize_name()
# strips _ses1/_ses2, so testing its output for "_ses1" can never match.
pref: Dict[str, str] = {}
pref_rank: Dict[str, int] = {}  # rank of the currently chosen original (lower is better)
for orig, norm in zip(orig_cols, norm_cols):
    # Same separator handling as normalize_name, but keeping the session suffix.
    suffix_key = re.sub(r"[\s\-.]+", "_", str(orig).strip())
    suffix_key = re.sub(r"_+", "_", suffix_key).lower()
    if suffix_key.endswith("_ses1"):
        rank = 0
    elif suffix_key.endswith("_ses2"):
        rank = 1
    else:
        rank = 2
    # Strict '<' keeps the first-seen column when two candidates tie.
    if norm not in pref or rank < pref_rank[norm]:
        pref[norm] = orig
        pref_rank[norm] = rank

# Make a mapping table so you can inspect
mapping_table = pd.DataFrame({
    "original": orig_cols,
    "normalized": norm_cols
}).sort_values(["normalized","original"])

print(f"Column name mapping (original -> normalized)  {mapping_table.head(200)}")

# Now map requested base names to selected original columns using normalized keys
selected_originals: List[str] = []
selection_reason: List[Tuple[str,str,str]] = []  # (base, original, reason)
for base in base_features:
    n = normalize_name(base)
    if n in pref:
        selected_originals.append(pref[n])
        selection_reason.append((base, pref[n], "matched_normalized"))
    else:
        selection_reason.append((base, "", "not_found"))

selection_df = pd.DataFrame(selection_reason, columns=["base_name","selected_original","reason"])
print(f"Requested base names -> selected original column  {selection_df} ")



# Proceed with tests if we found any selected columns
if len(selected_originals) == 0:
    print("None of the requested base names matched available columns after normalization.")
else:
    # ensure binary groups
    g = df[GROUP_COL]
    if g.dtype.kind not in "biu":
        try:
            g = g.astype(int)
        except Exception:
            # Non-integer labels (or labels with NaN): recode the two sorted values as 0/1.
            uniq = sorted(g.dropna().unique().tolist())
            if len(uniq) != 2:
                raise ValueError(f"{GROUP_COL} must have 2 groups, found: {uniq}")
            g = g.map({uniq[0]:0, uniq[1]:1})
    df[GROUP_COL] = g
    uniq_g = sorted(df[GROUP_COL].dropna().unique().tolist())
    # Integer labels skip the check above, so validate here too: a single group
    # would raise IndexError below, and extra groups would be silently left out
    # of the t-tests while still entering the correlations.
    if len(uniq_g) != 2:
        raise ValueError(f"{GROUP_COL} must have 2 groups, found: {uniq_g}")
    g0, g1 = uniq_g

    # T-tests (Welch) with Cohen's d, one row per selected column
    rows = []
    for col in selected_originals:
        x = df.loc[df[GROUP_COL]==g0, col].astype(float).dropna().values
        y = df.loc[df[GROUP_COL]==g1, col].astype(float).dropna().values
        t, p = welch_ttest(x, y)
        d = cohens_d(x, y)
        rows.append({
            "feature_original": col,
            "feature_normalized": normalize_name(col),
            f"n_{g0}": len(x), f"mean_{g0}": float(np.mean(x)) if len(x) else np.nan, f"sd_{g0}": float(np.std(x, ddof=1)) if len(x) else np.nan,
            f"n_{g1}": len(y), f"mean_{g1}": float(np.mean(y)) if len(y) else np.nan, f"sd_{g1}": float(np.std(y, ddof=1)) if len(y) else np.nan,
            "t_stat": t, "p_value": p, "cohens_d": d
        })
    ttest_df = pd.DataFrame(rows).sort_values("p_value", na_position="last")
    rej, p_adj, crit = fdr_bh(ttest_df["p_value"].values)
    ttest_df["p_value_fdr_bh"] = p_adj
    ttest_df["significant_fdr_0.05"] = rej

    # Correlations: Pearson r between the group label and each feature
    corr_rows = []
    gg = df[GROUP_COL].astype(float)
    for col in selected_originals:
        xx = df[col].astype(float)
        m = gg.notna() & xx.notna()
        if m.sum() < 3:
            r, p = (np.nan, np.nan)
        else:
            r, p = stats.pearsonr(gg[m], xx[m])
        corr_rows.append({"feature_normalized": normalize_name(col), "r_pointbiserial": r, "p_value": p})
    corr_df = pd.DataFrame(corr_rows).sort_values("p_value", na_position="last")
    rej_c, p_adj_c, crit_c = fdr_bh(corr_df["p_value"].values)
    corr_df["p_value_fdr_bh"] = p_adj_c
    corr_df["significant_fdr_0.05"] = rej_c

    # Save and display
    # Network results get their own files: the previous "lobes" paths here
    # overwrote the lobe-level CSVs written by the lobes cell.
    t_out = Path("ttest_yeo/ttest_networks_full_brain.csv")
    c_out = Path("ttest_yeo/corr_networks_full_brain.csv")
    ttest_df.to_csv(t_out, index=False)
    corr_df.to_csv(c_out, index=False)
    # NOTE(review): map_out is not assigned in this cell; it presumably comes from
    # an earlier cell. Confirm it is defined after Restart & Run All.
    mapping_table.to_csv(map_out, index=False)

    print(f"T-tests (normalized mapping) {ttest_df}")
    print(f"Correlations (normalized mapping) {corr_df}")

    print({
        "downloads": {
            "ttest_results_networks_full_brain.csv": str(t_out),
            "corr_results_networks_full_brain.csv": str(c_out),
        },
        "tested_feature_count": len(selected_originals)
    })


Column name mapping (original -> normalized)                        original                  normalized
56                 bankssts_lh                 bankssts_lh
57                 bankssts_rh                 bankssts_rh
2                   Brain-Stem                  brain_stem
58  caudalanteriorcingulate_lh  caudalanteriorcingulate_lh
59  caudalanteriorcingulate_rh  caudalanteriorcingulate_rh
..                         ...                         ...
51                      VAN_GM                      van_gm
52                      VIS_GM                      vis_gm
55               WhiteSurfArea               whitesurfarea
53            WhiteSurfArea_lh            whitesurfarea_lh
54            WhiteSurfArea_rh            whitesurfarea_rh

[130 rows x 2 columns]
Requested base names -> selected original column    base_name selected_original              reason
0     DA_GM                             not_found
1    VAN_GM            VAN_GM  matched_normalized
2    VIS_GM           