In [None]:
# uses a simple linear regression to get the trajectory of each parcel

# 1. do it per task/contrast/individual and save (also save significance, intercept, etc)
# 2. average the slopes per parcel in each task/contrast and save the group average

In [1]:
import os
import numpy as np
import pandas as pd
import nibabel as nib
import nilearn
import json
import datetime
import pickle
import seaborn as sns
import gc
import psutil
import math
import scipy.stats as stats
from matplotlib.patches import Patch
from nilearn import plotting
from nilearn.glm.first_level import FirstLevelModel
from nilearn.glm.second_level import SecondLevelModel
from nilearn.glm import threshold_stats_img
from nilearn.image import concat_imgs, mean_img, index_img
from nilearn.reporting import make_glm_report
from nilearn import masking, image
from nilearn import datasets
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from collections import defaultdict
from nilearn.maskers import NiftiLabelsMasker
from nilearn.plotting.find_cuts import find_cut_slices

# Import shared utilities and configuration
# need to do it this way because in a sub-directory (later turn config and utils into part of a package)
from utils import (
    TASKS, CONTRASTS, SUBJECTS, SESSIONS, ENCOUNTERS,
    build_first_level_contrast_map_path, is_valid_contrast_map, clean_z_map_data,
    convert_to_regular_dict, create_smor_atlas,load_smor_atlas, load_schaefer_atlas, cleanup_memory
)
from config import BASE_DIR, OUTPUT_DIRS

In [2]:
# Unique contrast names across all tasks, kept in first-seen order.
# dict.fromkeys de-duplicates while preserving insertion order.
compiled_req_contrasts = list(
    dict.fromkeys(
        contrast
        for task in TASKS
        for contrast in CONTRASTS[task]
    )
)

In [3]:
# --- Smorgasbord atlas: where the parcellated pickles live and how they are named ---
SMORG_PARCELLATED_DIR = OUTPUT_DIRS["smor"]
smor_files = {'mean': 'discovery_parcel_indiv_mean_updated'}  # filename stem per statistic
smor_date_updated = '0111'                                    # date tag embedded in the filenames
indices = list(range(1, 4))                                   # chunk numbers of the pickle files: 1, 2, 3

# Load the atlas object and keep handles to its label image and voxel data
smorgasbord_atlas = load_smor_atlas()
SMORG_IMG = smorgasbord_atlas.maps
SMORG_DATA = SMORG_IMG.get_fdata()


# --- Atlas selection: generic aliases used by the cells below ---
req_atlas = "smor"
atlas_obj = smorgasbord_atlas
main_dir = SMORG_PARCELLATED_DIR
main_files = smor_files
date_updated = smor_date_updated

Loading Smorgasbord atlas...
Atlas loaded with 429 regions
Atlas shape: (193, 229, 193)


# load the parcellated files

In [4]:
# Which parcellated files to read:
#   "z"             -> z-scored files ("..._z_scored.pkl")
#   anything else   -> beta files     ("..._betas.pkl"), e.g. the current "default"
file_type = "default"
output_ending = "_z_scored" if file_type == "z" else "_betas"

# Common filename stem; each chunk file is "<stem>_<index><output_ending>.pkl"
mean_filename = f"{main_dir}/{main_files['mean']}_{date_updated}"

# Merge the per-chunk dictionaries (keyed by subject) into one dictionary.
# A chunk that cannot be read is reported and skipped so the others still load.
loaded_mean_parcel_dict = {}
for part_idx in indices:
    part_path = f"{mean_filename}_{part_idx}{output_ending}.pkl"
    print(f"Loading: {part_path}")

    try:
        with open(part_path, 'rb') as handle:
            part_subjects = pickle.load(handle)
            loaded_mean_parcel_dict.update(part_subjects)
            print(f"Loaded {len(part_subjects)} subjects")
    except Exception as e:
        if isinstance(e, FileNotFoundError):
            print(f"Warning: File not found - {part_path}")
        else:
            print(f"Error loading {part_path}: {e}")

print(f"\nTotal subjects loaded: {len(loaded_mean_parcel_dict)}")
print(f"Atlas: {req_atlas} ({len(atlas_obj.labels)} regions)")

Loading: processed_data_dfs_2026/smor_parcel_dfs/discovery_parcel_indiv_mean_updated_0111_1_betas.pkl
Loaded 2 subjects
Loading: processed_data_dfs_2026/smor_parcel_dfs/discovery_parcel_indiv_mean_updated_0111_2_betas.pkl
Loaded 2 subjects
Loading: processed_data_dfs_2026/smor_parcel_dfs/discovery_parcel_indiv_mean_updated_0111_3_betas.pkl
Loaded 1 subjects

Total subjects loaded: 5
Atlas: smor (429 regions)


# parcel trajectory modeling:

In [5]:
N_ENCOUNTERS_EXPECTED = 5
# Fixed centering point (3.0 for five encounters): the intercept then refers to
# the same encounter for everybody, regardless of which encounters are present.
ENCOUNTER_CENTER = np.mean(np.arange(1, N_ENCOUNTERS_EXPECTED + 1))


def _activation_lookup_per_encounter(contrast_data, encounters_str, subject, task, contrast):
    """Return [(encounter_number, region -> activation Series)] for every usable encounter.

    Encounters that are absent (or whose entry cannot be indexed by 'region' /
    'activation') are reported once and skipped instead of aborting the scan.
    """
    lookups = []
    for encounter_number, enc in enumerate(encounters_str, start=1):  # 1-indexed (1, 2, 3, ...)
        try:
            enc_df = contrast_data[enc]
            # keep='first' mirrors the previous `.iloc[0]` choice for duplicated regions
            activation_by_region = (
                enc_df.drop_duplicates(subset='region', keep='first')
                      .set_index('region')['activation']
            )
        except (KeyError, IndexError, TypeError, AttributeError):
            print(f"for {subject}, encounter {enc} is missing for {task} {contrast}")
            continue
        lookups.append((encounter_number, activation_by_region))
    return lookups


def analyze_parcel_practice_effects(parcel_dict, subject, task, contrast, 
                                      encounters_str=ENCOUNTERS,
                                      center_value=ENCOUNTER_CENTER):
    """
    Analyze practice effects for individual parcels using betas as DVs.
    
    Model (fit separately for every parcel with ordinary least squares):
        beta_activation = B0 + B1 * encounter_centered
    
    IMPORTANT: Uses fixed centering (encounter number - center_value), so B0 is
               the predicted activation at the same encounter for everybody,
               even when some encounters are missing. Missing encounters
               (anywhere in the sequence, not only at the end) and NaN
               activations are dropped for that parcel; the remaining points
               keep their true encounter numbers as predictor values.
    
    Parameters:
    -----------
    parcel_dict : dict
        Format: [subject][task][contrast][encounter] -> DataFrame with
        'region' and 'activation' columns
    subject : str
        Subject ID
    task : str
        Task name
    contrast : str
        Contrast name
    encounters_str : list
        Encounter keys in chronological order (default: ENCOUNTERS);
        position i (1-indexed) is used as the encounter number
    center_value : float
        Fixed centering value (default: ENCOUNTER_CENTER, the mean of
        1..N_ENCOUNTERS_EXPECTED)
    
    Returns:
    --------
    parcel_results : dict
        parcel -> dict of model parameters, test statistics, fit diagnostics
        and descriptive statistics. Parcels with fewer than 3 valid data
        points are omitted.
    
    Raises:
    -------
    KeyError
        If subject/task/contrast is not in parcel_dict, or if none of the
        requested encounters is available.
    """
    print(f"{subject}/{task}/{contrast}")
    
    contrast_data = parcel_dict[subject][task][contrast]
    
    # Index every available encounter by region once, instead of scanning the
    # DataFrame again for each parcel.
    encounter_lookups = _activation_lookup_per_encounter(
        contrast_data, encounters_str, subject, task, contrast
    )
    if not encounter_lookups:
        raise KeyError(f"no encounters available for {subject}/{task}/{contrast}")
    
    # Parcel list comes from the earliest available encounter
    all_parcels = encounter_lookups[0][1].index.tolist()
    parcel_results = {}
    
    for parcel in all_parcels:
        # Collect this parcel's trajectory together with the encounter numbers
        # that actually contributed a value
        encounter_numbers = []
        activations = []
        
        for encounter_number, activation_by_region in encounter_lookups:
            if parcel not in activation_by_region.index:
                continue  # parcel absent from this encounter only
            raw_activation = activation_by_region.loc[parcel]
            
            try:
                activation = float(raw_activation)
            except (ValueError, TypeError):
                print(f"Warning: Could not convert activation '{raw_activation}' to float")
                activation = np.nan
            
            if np.isnan(activation):
                continue  # drop NaN values
            
            encounter_numbers.append(encounter_number)
            activations.append(activation)
        
        if len(activations) < 3:  # Need minimum 3 points for regression
            print(f"Insufficient data for parcel {parcel}")
            continue
        
        trajectory_clean = np.array(activations, dtype=float)
        encounter_indices_clean = np.array(encounter_numbers)
        
        # CENTER using FIXED center value (same for all parcels/subjects)
        # ex (1,2,3,4,5) with center 3: centered = [-2, -1, 0, 1, 2]
        encounters_centered = encounter_indices_clean - center_value
        
        n_encounters_actual = len(trajectory_clean)
        
        # Fit model: beta ~ encounter_centered (linregress takes x, then y)
        slope, intercept, r_value, p_value, std_err = stats.linregress(
            encounters_centered, 
            trajectory_clean
        )
        
        # Residual degrees of freedom of the two-parameter model (>= 1 here)
        dof = n_encounters_actual - 2
        
        # t-statistic for the slope (reported as 0 when the standard error is 0)
        t_stat_slope = slope / std_err if std_err > 0 else 0
        
        # 95% CI half-width from the t distribution: with at most a handful of
        # encounters the normal quantile (1.96) would be far too narrow, and
        # the p-value from linregress is t-based as well.
        ci_half_width = stats.t.ppf(0.975, dof) * std_err
        
        # Descriptive statistics (first/last refer to the first/last VALID encounter)
        initial_activation = trajectory_clean[0]
        final_activation = trajectory_clean[-1]
        mean_activation = np.mean(trajectory_clean)
        max_activation = np.max(trajectory_clean)
        min_activation = np.min(trajectory_clean)
        
        # Effect sizes
        absolute_change = final_activation - initial_activation
        
        # Percent change is undefined when the starting value is (near) zero
        if abs(initial_activation) > 0.001:
            percent_change = (absolute_change / abs(initial_activation)) * 100
        else:
            percent_change = np.nan
        
        # Change from first to last point, scaled by the (population) SD of the trajectory
        trajectory_std = np.std(trajectory_clean)
        if trajectory_std > 0:
            cohens_d = absolute_change / trajectory_std
        else:
            cohens_d = 0
        
        # Store results
        parcel_results[parcel] = {
            # Effect Sizes & Model Parameters
            'beta_intercept': intercept,  # B0 - predicted activation at the centering encounter
            'beta_slope': slope,  # B1 - practice effect (change per encounter)
            'std_error': std_err,  # Standard error of slope
            'ci_lower': slope - ci_half_width,  # 95% CI lower bound (t-based)
            'ci_upper': slope + ci_half_width,  # 95% CI upper bound (t-based)
            
            # Statistical Tests
            't_stat_slope': t_stat_slope,
            'p_value': p_value,
            'df': dof,
            'significant_change': (p_value < 0.05),
            
            # Model Fit & Diagnostics
            'r_squared': r_value**2,
            'n_encounters_actual': n_encounters_actual,
            'encounters_included': encounter_indices_clean.tolist(),
            'trajectory': trajectory_clean,
            'centered_predictor': encounters_centered.tolist(),
            
            # Misc
            'mean_activation': mean_activation,
            'initial_activation': initial_activation,
            'final_activation': final_activation,
            'max_activation': max_activation,
            'min_activation': min_activation,
            'activation_range': max_activation - min_activation,
            'absolute_change': absolute_change,
            'percent_change': percent_change,
            'cohens_d': cohens_d,
        }
    
    return parcel_results

In [None]:
# Fit the per-parcel trajectory model for every subject / task / contrast.
# A combination that raises is reported and left out of the nested result dict.
parcel_traj_results = {}
for subj in SUBJECTS:
    subj_results = parcel_traj_results[subj] = {}

    for task in TASKS:
        task_results = subj_results[task] = {}

        for contrast in CONTRASTS[task]:
            try:
                fitted = analyze_parcel_practice_effects(
                    loaded_mean_parcel_dict, subj, task, contrast
                )
            except Exception as e:
                print(f"Error processing {subj}/{task}/{contrast}: {e}")
            else:
                task_results[contrast] = fitted

In [7]:
# Sanity check: number of task/contrast combinations that produced results, per subject
for subj in SUBJECTS:
    count = sum(
        len(contrast_results)
        for contrast_results in parcel_traj_results[subj].values()
    )
    print(f"for {subj} there are {count} specific task/contrast combos loaded")

for sub-s03 there are 40 specific task/contrast combos loaded
for sub-s10 there are 40 specific task/contrast combos loaded
for sub-s19 there are 40 specific task/contrast combos loaded
for sub-s29 there are 40 specific task/contrast combos loaded
for sub-s43 there are 40 specific task/contrast combos loaded


In [8]:
# Persist the per-subject regression results next to the input pickles
indiv_slopes_path = f'{mean_filename}{output_ending}_indiv_slopes.pkl'
with open(indiv_slopes_path, 'wb') as out_file:
    pickle.dump(parcel_traj_results, out_file)

# step 2: per task/contrast average the slopes across all subjects:

In [11]:
# create an averaged parcel summary across all participants (saved to a file in a later cell)

def _sample_std(values):
    """Sample standard deviation (ddof=1); NaN when fewer than two values are given."""
    return np.std(values, ddof=1) if len(values) > 1 else np.nan


def _align_trajectories(subject_data_list):
    """Stack subject trajectories into a (n_subjects, n_encounters) array aligned by encounter number.

    Each value is placed in the column of the encounter it was measured at
    (taken from 'encounters_included', 1-indexed); encounters a subject does
    not have stay NaN. Stacking by position instead would average different
    encounters together whenever a subject's trajectory skips an encounter.
    """
    rows = []
    for data in subject_data_list:
        values = np.asarray(data['trajectory'], dtype=float)
        encounters = data.get('encounters_included')
        if encounters is None or len(encounters) != len(values):
            # Fallback for results without usable encounter numbers:
            # assume consecutive encounters starting at 1
            encounters = range(1, len(values) + 1)
        rows.append((np.asarray(list(encounters), dtype=int), values))

    # Width = latest encounter observed in any subject
    n_columns = max((int(enc.max()) for enc, _ in rows if len(enc) > 0), default=0)
    aligned = np.full((len(rows), n_columns), np.nan)
    for row_idx, (enc, values) in enumerate(rows):
        aligned[row_idx, enc - 1] = values
    return aligned


def _summarize_parcel_across_subjects(subject_data_list):
    """Group-level summary of one parcel from the per-subject regression results.

    subject_data_list holds one result dict per subject (as produced by
    analyze_parcel_practice_effects). Returns a dict with the group mean slope,
    a one-sample t-test of the slopes against 0, and means/SDs/SEMs of the
    descriptive statistics, plus the encounter-aligned group trajectory.
    """
    n_subjects = len(subject_data_list)

    def collect(key):
        return np.array([data[key] for data in subject_data_list])

    # INDIVIDUAL
    slopes = collect('beta_slope')
    intercepts = collect('beta_intercept')
    r_squareds = collect('r_squared')
    individual_p_values = collect('p_value')
    initial_activations = collect('initial_activation')
    final_activations = collect('final_activation')
    cohens_ds = collect('cohens_d')
    max_activations = collect('max_activation')
    min_activations = collect('min_activation')
    activation_ranges = collect('activation_range')
    # percent change is NaN for subjects whose starting activation was ~0
    percent_changes = np.array([data['percent_change'] for data in subject_data_list
                                if not np.isnan(data['percent_change'])])

    # GROUP-LEVEL
    slope_mean = np.mean(slopes)
    slope_std = _sample_std(slopes)
    slope_sem = slope_std / np.sqrt(n_subjects)

    if n_subjects > 1:
        # One-sample t-test of the subject slopes against 0
        group_t_stat, group_p_value = stats.ttest_1samp(slopes, 0)

        # 95% confidence interval of the mean slope
        t_crit = stats.t.ppf(0.975, df=n_subjects - 1)
        slope_ci_lower = slope_mean - t_crit * slope_sem
        slope_ci_upper = slope_mean + t_crit * slope_sem

        # Cohen's d for the group effect
        group_cohens_d = slope_mean / slope_std if slope_std > 0 else 0
    else:
        group_t_stat = np.nan
        group_p_value = np.nan
        slope_ci_lower = np.nan
        slope_ci_upper = np.nan
        group_cohens_d = 0

    # Group trajectory: per-encounter statistics over the subjects that have that encounter
    trajectory_array = _align_trajectories(subject_data_list)
    trajectory_n = np.sum(~np.isnan(trajectory_array), axis=0)
    trajectory_mean = np.nanmean(trajectory_array, axis=0)
    trajectory_std = np.nanstd(trajectory_array, axis=0, ddof=1)
    trajectory_sem = trajectory_std / np.sqrt(trajectory_n)

    intercept_std = _sample_std(intercepts)
    initial_activation_std = _sample_std(initial_activations)
    final_activation_std = _sample_std(final_activations)

    return {
        'n_subjects': n_subjects,

        # GROUP-LEVEL
        'slope_mean': slope_mean,
        'slope_std': slope_std,
        'slope_sem': slope_sem,
        'slope_ci_lower': slope_ci_lower,
        'slope_ci_upper': slope_ci_upper,

        'group_t_stat': group_t_stat,
        'group_p_value': group_p_value,
        'group_cohens_d': group_cohens_d,
        # NaN p-values (single subject) compare as False
        'group_significant': bool(group_p_value < 0.05),

        'intercept_mean': np.mean(intercepts),
        'intercept_std': intercept_std,
        'intercept_sem': intercept_std / np.sqrt(n_subjects),

        'r_squared_mean': np.mean(r_squareds),
        'r_squared_std': _sample_std(r_squareds),

        'individual_p_significant_proportion': np.mean(individual_p_values < 0.05),

        'initial_activation_mean': np.mean(initial_activations),
        'initial_activation_std': initial_activation_std,
        'initial_activation_sem': initial_activation_std / np.sqrt(n_subjects),

        'final_activation_mean': np.mean(final_activations),
        'final_activation_std': final_activation_std,
        'final_activation_sem': final_activation_std / np.sqrt(n_subjects),

        'percent_change_mean': np.mean(percent_changes) if len(percent_changes) > 0 else np.nan,
        'percent_change_std': _sample_std(percent_changes),

        'cohens_d_mean': np.mean(cohens_ds),
        'cohens_d_std': _sample_std(cohens_ds),

        'max_activation_mean': np.mean(max_activations),
        'min_activation_mean': np.mean(min_activations),
        'activation_range_mean': np.mean(activation_ranges),

        'trajectory_mean': trajectory_mean,
        'trajectory_std': trajectory_std,
        'trajectory_sem': trajectory_sem,
        'trajectory_n_subjects': trajectory_n,

        'positive_slope_proportion': np.mean(slopes > 0),

        'individual_slopes': slopes.tolist(),
        'individual_intercepts': intercepts.tolist(),
    }


avg_parcel_traj_results = {}

count_success = 0
for task in TASKS:
    avg_parcel_traj_results[task] = {}

    for contrast in CONTRASTS[task]:
        print(f"Processing {task}/{contrast}:")

        # Collect every subject's result dict per parcel
        parcel_data = defaultdict(list)

        for subj in SUBJECTS:
            try:
                curr_res = parcel_traj_results[subj][task][contrast]

                for parcel_name, parcel_stats in curr_res.items():
                    parcel_data[parcel_name].append(parcel_stats)

            except (KeyError, TypeError, AttributeError) as e:
                # subject has no results for this task/contrast
                print(f"Error processing {subj}/{task}/{contrast}: {e}")
                continue

        # Group-level summary for each parcel that has at least one subject
        avg_parcel_traj_results[task][contrast] = {
            parcel_name: _summarize_parcel_across_subjects(subject_data_list)
            for parcel_name, subject_data_list in parcel_data.items()
            if len(subject_data_list) > 0
        }

        n_parcels = len(avg_parcel_traj_results[task][contrast])

        if n_parcels > 0:
            # NOTE: these p-values are not corrected for the number of parcels tested
            n_sig = sum(1 for p in avg_parcel_traj_results[task][contrast].values() 
                       if p['group_significant'])
            print(f"Completed: {n_parcels} parcels")
            print(f"Group-level significant (p<0.05): {n_sig} ({100*n_sig/n_parcels:.1f}%)")
            count_success += 1
        else:
            print(f"No parcels found (no subjects have data for this contrast): {task}|{contrast}")

print(f"\nAveraging complete for {count_success} task/contrasts!")

Processing nBack/twoBack-oneBack:
Completed: 429 parcels
Group-level significant (p<0.05): 14 (3.3%)
Processing nBack/match-mismatch:
Completed: 429 parcels
Group-level significant (p<0.05): 23 (5.4%)
Processing nBack/task-baseline:
Completed: 429 parcels
Group-level significant (p<0.05): 29 (6.8%)
Processing nBack/response_time:
Completed: 429 parcels
Group-level significant (p<0.05): 5 (1.2%)
Processing flanker/incongruent-congruent:
Completed: 429 parcels
Group-level significant (p<0.05): 10 (2.3%)
Processing flanker/task-baseline:
Completed: 429 parcels
Group-level significant (p<0.05): 14 (3.3%)
Processing directedForgetting/neg-con:
Completed: 429 parcels
Group-level significant (p<0.05): 21 (4.9%)
Processing directedForgetting/task-baseline:
Completed: 429 parcels
Group-level significant (p<0.05): 23 (5.4%)
Processing directedForgetting/response_time:
Completed: 429 parcels
Group-level significant (p<0.05): 43 (10.0%)
Processing goNogo/nogo_success-go:
Completed: 429 parcels
Gro

In [13]:
# Persist the group-averaged results next to the input pickles
averaged_path = f'{mean_filename}{output_ending}_averaged.pkl'
with open(averaged_path, 'wb') as out_file:
    pickle.dump(avg_parcel_traj_results, out_file)