In [None]:
import pandas as pd
import numpy as np

# Load participants
participants = pd.read_csv('/home/dingxuan/hu_project/ds005899/participants.tsv', sep='\t')

# Select 10 ADHD and 10 TD
adhd_subjects = participants[participants['ADHD'] == 1].sample(n=10, random_state=42)
td_subjects = participants[participants['ADHD'] == 0].sample(n=10, random_state=42)
selected_subjects = pd.concat([adhd_subjects, td_subjects])

# Save
selected_subjects.to_csv('selected_subjects.tsv', sep='\t', index=False)
print("Selected subjects:", selected_subjects['participant_id'].tolist())

In [13]:
from bids import BIDSLayout
import pandas as pd

# Load dataset and participants
layout = BIDSLayout('/home/dingxuan/hu_project/ds005899')
participants = pd.read_csv('/home/dingxuan/hu_project/ds005899/participants.tsv', sep='\t')

# Check data for all subjects
data_info = []
for sub in participants['participant_id']:
    t1w_files = layout.get(subject=sub, suffix='T1w', extension='.nii')
    bold_files = layout.get(subject=sub, task='csst', suffix='bold', run=None, extension='.nii.gz')
    event_files = layout.get(subject=sub, task='csst', suffix='events', run=None, extension='.tsv')
    data_info.append({
        'subject': sub,
        'has_t1w': len(t1w_files) > 0,
        'num_bold_runs': len(bold_files),
        'num_event_files': len(event_files),
        'ADHD': participants[participants['participant_id'] == sub]['ADHD'].iloc[0]
    })
data_df = pd.DataFrame(data_info)
valid_subjects = data_df[(data_df['has_t1w']) & (data_df['num_bold_runs'] > 0) & (data_df['num_event_files'] > 0)]
print("Valid subjects with complete data:")
print(valid_subjects)
valid_subjects.to_csv('/home/dingxuan/hu_project/valid_subjects.csv', index=False)

Valid subjects with complete data:
Empty DataFrame
Columns: [subject, has_t1w, num_bold_runs, num_event_files, ADHD]
Index: []


In [16]:
import pandas as pd
import os

# Load selected subject IDs
selected_ids = pd.read_csv('/home/dingxuan/hu_project/selected_subjects.tsv', sep='\t')['participant_id'].tolist()

# Process event files
for sub in selected_ids:
    event_file = f'{sub}/func/{sub}_task-csst_run-01_events.tsv'
    if os.path.exists(event_file):
        df = pd.read_csv(event_file, sep='\t')
        df = df.fillna('n/a')
        df.to_csv(event_file, sep='\t', index=False)
        print(f"Updated {event_file}")

In [24]:
import pandas as pd
from bids import BIDSLayout
import os
import subprocess

# Load dataset
dataset_path = '/home/dingxuan/hu_project/ds005899'
print(f"Checking dataset path: {dataset_path}")
if not os.path.exists(dataset_path):
    print("Error: Dataset path does not exist")
    exit(1)

try:
    layout = BIDSLayout(dataset_path, validate=False)
except Exception as e:
    print(f"Error loading BIDS layout: {e}")
    exit(1)

# Load all subjects from participants.tsv
participants_file = f'{dataset_path}/participants.tsv'
if not os.path.exists(participants_file):
    print(f"Error: {participants_file} not found")
    exit(1)
participants = pd.read_csv(participants_file, sep='\t')
selected_ids = participants['participant_id'].tolist()

# Fix event files for all runs
modified_files = []
for sub in selected_ids:
    event_files = layout.get(subject=sub, task='csst', suffix='events', run=None, extension='.tsv')
    if not event_files:
        print(f"No event files found for {sub}")
        continue
    for event_file in event_files:
        try:
            # Unlock file to allow modifications
            print(f"Unlocking {event_file.path}")
            subprocess.run(['git', 'annex', 'unlock', event_file.path], check=True, cwd=dataset_path)

            # Read TSV, treating multiple NaN-like values as NaN
            df = pd.read_csv(event_file.path, sep='\t', na_values=['NaN', 'nan', 'NA', '', ' ', 'None'])
            if df.isna().any().any():
                print(f"Found NaN values in {event_file.path}")
                df = df.fillna('n/a')
                df.to_csv(event_file.path, sep='\t', index=False)
                print(f"Updated {event_file.path}")
                modified_files.append(event_file.path)
            else:
                print(f"No NaN values found in {event_file.path}")

            # Verify changes
            df_verify = pd.read_csv(event_file.path, sep='\t')
            if df_verify.isin(['NaN', 'nan', 'NA', '', ' ', 'None']).any().any() or df_verify.isna().any().any():
                print(f"Warning: {event_file.path} still contains NaN-like values or NaN")
            else:
                print(f"Verified: {event_file.path} has no NaN-like values")
        except Exception as e:
            print(f"Error processing {event_file.path}: {e}")

# Commit changes to prevent DataLad reversion
if modified_files:
    try:
        subprocess.run(['git', 'add'] + modified_files, check=True, cwd=dataset_path)
        subprocess.run(['git', 'commit', '-m', 'Fixed NaN values in event files'], check=True, cwd=dataset_path)
        print("Committed modified event files")
    except Exception as e:
        print(f"Error committing changes: {e}")

print("Finished updating event files")

Checking dataset path: /home/dingxuan/hu_project/ds005899
No event files found for sub-7155
No event files found for sub-7183
No event files found for sub-7223
No event files found for sub-7348
No event files found for sub-7384
No event files found for sub-7386
No event files found for sub-7395
No event files found for sub-7402
No event files found for sub-7498
No event files found for sub-7565
No event files found for sub-7605
No event files found for sub-7664
No event files found for sub-7714
No event files found for sub-7727
No event files found for sub-7733
No event files found for sub-7755
No event files found for sub-7780
No event files found for sub-7878
No event files found for sub-7889
No event files found for sub-7891
No event files found for sub-7899
No event files found for sub-7957
No event files found for sub-7960
No event files found for sub-7961
No event files found for sub-7979
No event files found for sub-8005
No event files found for sub-8008
No event files found for

In [30]:
## check selected subjects' data

from bids import BIDSLayout
import pandas as pd

layout = BIDSLayout('/home/dingxuan/hu_project/ds005899', validate=False)
selected_ids = pd.read_csv('/home/dingxuan/hu_project/selected_subjects.tsv', sep='\t')['participant_id'].tolist()
participants = pd.read_csv('/home/dingxuan/hu_project/ds005899/participants.tsv', sep='\t')

data_info = []
for sub in selected_ids:
    t1w_files = layout.get(subject=sub, suffix='T1w', extension='.nii')
    bold_files = layout.get(subject=sub, task='csst', suffix='bold', run=None, extension='.nii.gz')
    event_files = layout.get(subject=sub, task='csst', suffix='events', run=None, extension='.tsv')
    data_info.append({
        'subject': sub,
        'has_t1w': len(t1w_files) > 0,
        'num_bold_runs': len(bold_files),
        'num_event_files': len(event_files),
        'ADHD': participants[participants['participant_id'] == sub]['ADHD'].iloc[0]
    })
data_df = pd.DataFrame(data_info)
valid_subjects = data_df[(data_df['has_t1w']) & (data_df['num_bold_runs'] > 0) & (data_df['num_event_files'] > 0)]
print("Valid selected subjects:")
print(valid_subjects)
data_df.to_csv('/home/dingxuan/hu_project/selected_subjects_data.csv', index=False)

Valid selected subjects:
Empty DataFrame
Columns: [subject, has_t1w, num_bold_runs, num_event_files, ADHD]
Index: []


In [28]:
from bids import BIDSLayout
import os
layout = BIDSLayout('/home/dingxuan/hu_project/ds005899', validate=False)
selected_ids = pd.read_csv('/home/dingxuan/hu_project/selected_subjects.tsv', sep='\t')['participant_id'].tolist()
for sub in selected_ids:
    bold_files = layout.get(subject=sub, task='csst', suffix='bold', run=None, extension='.nii.gz')
    event_files = layout.get(subject=sub, task='csst', suffix='events', run=None, extension='.tsv')
    t1w_files = layout.get(subject=sub, suffix='T1w', extension='.nii')
    for f in bold_files + event_files + t1w_files:
        print(f"Getting {f.path}")
        os.system(f"datalad get {f.path}")

In [29]:
# replace invalid subjects
import pandas as pd
import numpy as np

# Load all subjects' data
all_data = pd.read_csv('/home/dingxuan/hu_project/selected_subjects_data.csv')
valid_subjects = all_data[(all_data['has_t1w']) & (all_data['num_bold_runs'] > 0) & (all_data['num_event_files'] > 0)]
participants = pd.read_csv('/home/dingxuan/hu_project/ds005899/participants.tsv', sep='\t')

# Load current selected subjects
selected_subjects = pd.read_csv('/home/dingxuan/hu_project/selected_subjects.tsv', sep='\t')
invalid_subjects = selected_subjects[~selected_subjects['participant_id'].isin(valid_subjects['subject'])]

# Replace invalid subjects
for _, invalid in invalid_subjects.iterrows():
    adhd_status = invalid['ADHD']
    available = participants[(participants['participant_id'].isin(valid_subjects['subject'])) & 
                            (participants['ADHD'] == adhd_status) & 
                            (~participants['participant_id'].isin(selected_subjects['participant_id']))]
    if not available.empty:
        new_sub = available.sample(n=1, random_state=42)
        selected_subjects = selected_subjects[selected_subjects['participant_id'] != invalid['participant_id']]
        selected_subjects = pd.concat([selected_subjects, new_sub], ignore_index=True)
        print(f"Replaced {invalid['participant_id']} with {new_sub['participant_id'].iloc[0]} (ADHD={adhd_status})")
    else:
        print(f"No replacement for {invalid['participant_id']} (ADHD={adhd_status})")

# Save updated list
selected_subjects.to_csv('/home/dingxuan/hu_project/selected_subjects.tsv', sep='\t', index=False)
selected_subjects['participant_id'].to_csv('/home/dingxuan/hu_project/selected_ids.txt', index=False, header=False)
print("Updated subjects:", selected_subjects['participant_id'].tolist())

No replacement for sub-8013 (ADHD=1)
No replacement for sub-8096 (ADHD=1)
No replacement for sub-7565 (ADHD=1)
No replacement for sub-8175 (ADHD=1)
No replacement for sub-8058 (ADHD=1)
No replacement for sub-8017 (ADHD=1)
No replacement for sub-8091 (ADHD=1)
No replacement for sub-7878 (ADHD=1)
No replacement for sub-8168 (ADHD=1)
No replacement for sub-7957 (ADHD=1)
No replacement for sub-8150 (ADHD=0)
No replacement for sub-7733 (ADHD=0)
No replacement for sub-8144 (ADHD=0)
No replacement for sub-8098 (ADHD=0)
No replacement for sub-7780 (ADHD=0)
No replacement for sub-8157 (ADHD=0)
No replacement for sub-8028 (ADHD=0)
No replacement for sub-7727 (ADHD=0)
No replacement for sub-7498 (ADHD=0)
No replacement for sub-7960 (ADHD=0)
Updated subjects: ['sub-8013', 'sub-8096', 'sub-7565', 'sub-8175', 'sub-8058', 'sub-8017', 'sub-8091', 'sub-7878', 'sub-8168', 'sub-7957', 'sub-8150', 'sub-7733', 'sub-8144', 'sub-8098', 'sub-7780', 'sub-8157', 'sub-8028', 'sub-7727', 'sub-7498', 'sub-7960']


In [31]:
# replacement
import pandas as pd

# Load data
dataset_path = '/home/dingxuan/hu_project/ds005899'
participants = pd.read_csv(f'{dataset_path}/participants.tsv', sep='\t')
selected_subjects = pd.read_csv('/home/dingxuan/hu_project/selected_subjects.tsv', sep='\t')

# Verify subjects exist
replacements = {'sub-8168': 'sub-8121', 'sub-7957': 'sub-7899'}
for old_sub, new_sub in replacements.items():
    if old_sub not in selected_subjects['participant_id'].values:
        print(f"Error: {old_sub} not in selected subjects")
        continue
    if new_sub not in participants['participant_id'].values:
        print(f"Error: {new_sub} not in participants.tsv")
        continue

    # Get ADHD status
    old_adhd = selected_subjects[selected_subjects['participant_id'] == old_sub]['ADHD'].iloc[0]
    new_adhd = participants[participants['participant_id'] == new_sub]['ADHD'].iloc[0]
    if old_adhd != new_adhd:
        print(f"Warning: ADHD mismatch for {old_sub} (ADHD={old_adhd}) and {new_sub} (ADHD={new_adhd})")

    # Replace subject
    new_sub_data = participants[participants['participant_id'] == new_sub][['participant_id', 'ADHD']]
    selected_subjects = selected_subjects[selected_subjects['participant_id'] != old_sub]
    selected_subjects = pd.concat([selected_subjects, new_sub_data], ignore_index=True)
    print(f"Replaced {old_sub} with {new_sub} (ADHD={new_adhd})")

# Verify ADHD/TD balance
adhd_count = selected_subjects['ADHD'].sum()
td_count = len(selected_subjects) - adhd_count
print(f"ADHD count: {adhd_count}, TD count: {td_count}")
if adhd_count != 10 or td_count != 10:
    print("Warning: ADHD/TD balance not maintained")

# Save updated lists
selected_subjects.to_csv('/home/dingxuan/hu_project/selected_subjects.tsv', sep='\t', index=False)
selected_subjects['participant_id'].to_csv('/home/dingxuan/hu_project/selected_ids.txt', index=False, header=False)
print("Updated subjects:", selected_subjects['participant_id'].tolist())

Replaced sub-8168 with sub-8121 (ADHD=1)
Replaced sub-7957 with sub-7899 (ADHD=1)
ADHD count: 10, TD count: 10
Updated subjects: ['sub-8013', 'sub-8096', 'sub-7565', 'sub-8175', 'sub-8058', 'sub-8017', 'sub-8091', 'sub-7878', 'sub-8150', 'sub-7733', 'sub-8144', 'sub-8098', 'sub-7780', 'sub-8157', 'sub-8028', 'sub-7727', 'sub-7498', 'sub-7960', 'sub-8121', 'sub-7899']
