<a href="https://colab.research.google.com/github/nourhan254/Autism-/blob/main/Untitled26.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
import zipfile
import numpy as np
import pandas as pd
import nibabel as nib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

Mounted at /content/drive


In [3]:
# Define paths
base_path = '/content/drive/MyDrive/ADHD1/'

# Unzip the files (only need to do this once)
def unzip_files():
    sites = ['Peking_1', 'NeuroIMAGE', 'KKI']
    for site in sites:
        zip_path = os.path.join(base_path, f'{site}.zip')
        extract_path = os.path.join(base_path, site)
        if not os.path.exists(extract_path):
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_path)

unzip_files()

In [4]:
def load_phenotypic_data():
    # Load each site's phenotypic data
    peking_pheno = pd.read_csv(os.path.join(base_path, 'Peking_1_phenotypic.csv'))
    neuro_pheno = pd.read_csv(os.path.join(base_path, 'NeuroIMAGE_phenotypic.csv'))
    kki_pheno = pd.read_csv(os.path.join(base_path, 'KKI_phenotypic.csv'))

    # Standardize column names across sites
    def standardize_columns(df):
        # Create mapping of possible column names to standard names
        column_map = {
            'subject': ['subject', 'SUBID', 'Subject', 'ID', 'participant_id'],
            'diagnosis': ['diagnosis', 'DX', 'ADHD', 'label', 'group'],
            'site': ['site', 'dataset', 'Site']
        }

        # Find matching columns
        standardized_df = pd.DataFrame()
        for standard_name, possible_names in column_map.items():
            for name in possible_names:
                if name in df.columns:
                    standardized_df[standard_name] = df[name]
                    break

        return standardized_df

    # Apply standardization to each dataframe
    peking_pheno = standardize_columns(peking_pheno)
    neuro_pheno = standardize_columns(neuro_pheno)
    kki_pheno = standardize_columns(kki_pheno)

    # Add site identifiers if not present
    if 'site' not in peking_pheno.columns:
        peking_pheno['site'] = 'Peking'
    if 'site' not in neuro_pheno.columns:
        neuro_pheno['site'] = 'NeuroIMAGE'
    if 'site' not in kki_pheno.columns:
        kki_pheno['site'] = 'KKI'

    # Combine all phenotypic data
    combined_pheno = pd.concat([peking_pheno, neuro_pheno, kki_pheno], ignore_index=True)

    # First clean the diagnosis column (convert to string for consistent handling)
    combined_pheno['diagnosis'] = combined_pheno['diagnosis'].astype(str).str.lower().str.strip()

    # Create mapping dictionary for diagnosis values
    diagnosis_map = {
        '1': 1, 'adhd': 1, 'adhd-i': 1, 'adhd-c': 1, 'adhd-h': 1,
        '0': 0, 'control': 0, 'healthy': 0, 'typical': 0, 'normal': 0
    }

    # Map diagnosis values
    combined_pheno['diagnosis'] = combined_pheno['diagnosis'].map(diagnosis_map)

    # Drop rows with NaN in diagnosis (couldn't be mapped)
    combined_pheno = combined_pheno.dropna(subset=['diagnosis'])

    # Convert diagnosis to integer
    combined_pheno['diagnosis'] = combined_pheno['diagnosis'].astype(int)

    # Verify we have reasonable data
    print("Diagnosis value counts:")
    print(combined_pheno['diagnosis'].value_counts())

    return combined_pheno

pheno_data = load_phenotypic_data()

Diagnosis value counts:
diagnosis
0    145
1     41
Name: count, dtype: int64


In [5]:
def inspect_csv_files():
    files = {
        'Peking': 'Peking_1_phenotypic.csv',
        'NeuroIMAGE': 'NeuroIMAGE_phenotypic.csv',
        'KKI': 'KKI_phenotypic.csv'
    }

    for site, filename in files.items():
        filepath = os.path.join(base_path, filename)
        df = pd.read_csv(filepath)
        print(f"\n{site} file columns:")
        print(df.columns.tolist())
        print("\nFirst 3 rows:")
        print(df.head(3))

inspect_csv_files()


Peking file columns:
['ScanDir ID', 'Site', 'Gender', 'Age', 'Handedness', 'DX', 'Secondary Dx ', 'ADHD Measure', 'ADHD Index', 'Inattentive', 'Hyper/Impulsive', 'IQ Measure', 'Verbal IQ', 'Performance IQ', 'Full2 IQ', 'Full4 IQ', 'Med Status', 'QC_Rest_1', 'QC_Rest_2', 'QC_Rest_3', 'QC_Rest_4', 'QC_Anatomical_1', 'QC_Anatomical_2']

First 3 rows:
   ScanDir ID  Site  Gender    Age  Handedness  DX Secondary Dx   \
0     1056121     1       1  13.92           1   0           NaN   
1     1113498     1       0  14.83           1   0           NaN   
2     1133221     1       1  12.33           1   1           ODD   

   ADHD Measure  ADHD Index  Inattentive  ...  Performance IQ  Full2 IQ  \
0             1        30.0         15.0  ...             136       NaN   
1             1        20.0         11.0  ...             135       NaN   
2             1        64.0         32.0  ...             108       NaN   

   Full4 IQ  Med Status  QC_Rest_1  QC_Rest_2  QC_Rest_3  QC_Rest_4  \
0   

In [6]:
def load_phenotypic_data():
    # Define the exact file names we know you have
    files = {
        'Peking': 'Peking_1_phenotypic.csv',
        'NeuroIMAGE': 'NeuroIMAGE_phenotypic.csv',
        'KKI': 'KKI_phenotypic.csv'
    }

    dfs = []

    for site, filename in files.items():
        filepath = os.path.join(base_path, filename)

        # Check if file exists before trying to read it
        if not os.path.exists(filepath):
            print(f"Warning: File not found - {filepath}")
            continue

        df = pd.read_csv(filepath)
        print(f"\nLoaded {site} data with shape: {df.shape}")

        # Create standardized dataframe with the columns we know exist
        standardized_df = pd.DataFrame()

        # Use 'ScanDir ID' as subject ID
        if 'ScanDir ID' in df.columns:
            standardized_df['subject'] = df['ScanDir ID'].astype(str).str.strip()
        else:
            print(f"Could not find subject column in {site} data")
            continue

        # Use 'DX' as diagnosis
        if 'DX' in df.columns:
            standardized_df['diagnosis'] = df['DX']
        else:
            print(f"Could not find diagnosis column in {site} data")
            continue

        # Add site identifier
        standardized_df['site'] = site

        dfs.append(standardized_df)

    if not dfs:
        raise ValueError("No valid data files could be loaded")

    # Combine all data
    combined = pd.concat(dfs, ignore_index=True)

    # Clean diagnosis data
    combined['diagnosis'] = combined['diagnosis'].astype(str).str.upper().str.strip()

    # Map to binary classification (ADHD=1, Control=0)
    diagnosis_map = {
        '1': 1, 'ADHD': 1, 'ADHD-I': 1, 'ADHD-C': 1, 'ADHD-H': 1, 'CASE': 1,
        '0': 0, 'CONTROL': 0, 'HEALTHY': 0, 'TYPICAL': 0, 'NORMAL': 0, 'TD': 0
    }

    combined['diagnosis'] = combined['diagnosis'].map(diagnosis_map)

    # Drop rows with invalid diagnosis
    combined = combined.dropna(subset=['diagnosis'])
    combined['diagnosis'] = combined['diagnosis'].astype(int)

    print("\nFinal dataset summary:")
    print(f"Total subjects: {len(combined)}")
    print("Diagnosis counts:")
    print(combined['diagnosis'].value_counts())
    print("\nSites distribution:")
    print(combined['site'].value_counts())

    return combined

pheno_data = load_phenotypic_data()


Loaded Peking data with shape: (85, 23)

Loaded NeuroIMAGE data with shape: (48, 23)

Loaded KKI data with shape: (83, 23)

Final dataset summary:
Total subjects: 186
Diagnosis counts:
diagnosis
0    145
1     41
Name: count, dtype: int64

Sites distribution:
site
KKI           77
Peking        68
NeuroIMAGE    41
Name: count, dtype: int64



Testing image paths for 5 random subjects:
✗ Peking subject 3004580: NOT FOUND
✗ Peking subject 2123983: NOT FOUND
✗ Peking subject 1056121: NOT FOUND
✗ KKI subject 8083695: NOT FOUND
✗ KKI subject 2138826: NOT FOUND


In [8]:
def explore_directory(path, max_depth=3):
    print(f"\nExploring: {path}")
    for root, dirs, files in os.walk(path):
        level = root.replace(path, '').count(os.sep)
        if level < max_depth:
            indent = ' ' * 4 * level
            print(f"{indent}{os.path.basename(root)}/")
            subindent = ' ' * 4 * (level + 1)
            for f in files[:5]:  # Print first 5 files
                print(f"{subindent}{f}")
            if len(files) > 5:
                print(f"{subindent}... and {len(files)-5} more")

explore_directory(base_path)


Exploring: /content/drive/MyDrive/ADHD1/
/
    KKI_phenotypic.csv
    NeuroIMAGE_phenotypic.csv
    Peking_1_TestRelease_phenotypic.csv
    Peking_1_phenotypic.csv
    KKI.zip
    ... and 2 more
Peking_1/
    Peking_1/
        anat_processed_paths.csv
        dataset_description.json
        filtered_participants.tsv
        filtered_paths.csv
        missing_participants_ids.csv
        ... and 4 more
        sub-1038415/
            normalized_resampled_128_sub-1038415_T1_biascorr_brain.nii
            normalized_resampled_224_sub-1038415_T1_biascorr_brain.nii
            resampled_128_sub-1038415_T1_biascorr_brain.nii
            resampled_224_sub-1038415_T1_biascorr_brain.nii
            sub-1038415_T1_biascorr_brain.nii
        sub-1056121/
            normalized_resampled_128_sub-1056121_T1_biascorr_brain.nii
            normalized_resampled_224_sub-1056121_T1_biascorr_brain.nii
            resampled_128_sub-1056121_T1_biascorr_brain.nii
            resampled_224_sub-1056121_T1_

In [9]:
def find_t1_image_path(subject_id, site):
    """Find the T1-weighted structural image path"""
    site_folder = {
        'Peking': 'Peking_1/Peking_1',
        'NeuroIMAGE': 'NeuroIMAGE/NeuroIMAGE',
        'KKI': 'KKI/KKI'
    }.get(site)

    # Try multiple naming patterns
    patterns = [
        f"{base_path}/{site_folder}/sub-{subject_id}/sub-{subject_id}_T1_biascorr_brain.nii",
        f"{base_path}/{site_folder}/sub-{subject_id}/normalized_resampled_128_sub-{subject_id}_T1_biascorr_brain.nii",
        f"{base_path}/{site_folder}/sub-{subject_id}/resampled_128_sub-{subject_id}_T1_biascorr_brain.nii"
    ]

    for path in patterns:
        if os.path.exists(path):
            return path
    return None

# Test with sample subjects
print("\nTesting T1 image paths:")
sample_subjects = pheno_data.sample(5)
for _, row in sample_subjects.iterrows():
    path = find_t1_image_path(str(row['subject']).strip(), row['site'])
    if path:
        print(f"✓ Found: {row['site']} subject {row['subject']} at {path}")
    else:
        print(f"✗ Missing: {row['site']} subject {row['subject']}")

# Create image dataframe with T1 paths
image_df = pheno_data.copy()
image_df['image_path'] = image_df.apply(
    lambda row: find_t1_image_path(str(row['subject']).strip(), row['site']),
    axis=1
)

# Remove subjects without images
image_df = image_df.dropna(subset=['image_path'])
print(f"\nFound {len(image_df)} subjects with T1 images")


Testing T1 image paths:
✓ Found: KKI subject 4104523 at /content/drive/MyDrive/ADHD1//KKI/KKI/sub-4104523/sub-4104523_T1_biascorr_brain.nii
✓ Found: KKI subject 9922944 at /content/drive/MyDrive/ADHD1//KKI/KKI/sub-9922944/sub-9922944_T1_biascorr_brain.nii
✗ Missing: Peking subject 3983607
✗ Missing: Peking subject 3262042
✗ Missing: Peking subject 3889095

Found 141 subjects with T1 images
