In [10]:
# Import necessary libraries for data preprocessing
import xml.etree.ElementTree as ET
import os
import requests
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [11]:
# STEP #1a: DATA COLLECTION (Catalog XML file of fMRI dataset)

# Path to the catalog XML, resolved relative to the notebook's working directory
xml_file_path = "ABIDE1_Dataset.xml"   # ABIDE I (fMRI) Dataset

# Decode with an explicit encoding rather than the platform's locale default, so
# the catalog is read the same way on every machine. "utf-8-sig" also drops a
# leading byte-order mark, which str.lstrip() does not treat as whitespace.
# NOTE(review): assumes the catalog file is UTF-8 encoded -- confirm against the file.
with open(xml_file_path, "r", encoding="utf-8-sig") as xml_file:
    # Strip leading whitespace so the text starts at its first XML character
    xml_data = xml_file.read().lstrip()

# Parse the XML text; `root` is the top-level element of the document
root = ET.fromstring(xml_data)

In [None]:
# STEP #2a: DATA PREPROCESSING (function)

def preprocess_fmri_data(fmri_data_path):
    """Placeholder for the fMRI preprocessing stage.

    Nothing is implemented yet: the argument is accepted and ignored, and the
    function returns None. The planned steps (motion correction, slice timing
    correction, etc.) still have to be written.

    Args:
        fmri_data_path: Path of the fMRI file to preprocess (currently unused).

    Returns:
        None.
    """
    # TODO: implement the preprocessing steps described in the docstring
    return None

In [None]:
# STEP #1b: DATA COLLECTION (extracting fMRI in batches) AND
# STEP #2b: DATA PREPROCESSING (preprocessing each fMRI batch)

# Directory to save downloaded fMRI datasets
download_dir = "downloaded_fmri_datasets"
os.makedirs(download_dir, exist_ok=True)

# Loop-invariant settings, defined once instead of on every iteration
catalog_ns = {'cat': 'http://nrg.wustl.edu/catalog'}
base_url = "https://www.nitrc.org/ir/data"
base_dir = "/workspaces/ASD-Diagnostic_Research_2023-24/ABIDE I/"
request_timeout = (10, 300)          # (connect, read) seconds, so a stalled server cannot hang the cell
download_chunk_size = 1024 * 1024    # bytes written per chunk (1 MiB)

# One (url, reason) tuple for every entry that was skipped
failed_downloads = []


def _download_to_file(url, destination, timeout, chunk_size):
    """Stream `url` into `destination`, leaving no partial file behind on failure.

    The body is written to `destination + ".part"` first and only moved to
    `destination` once the whole transfer has succeeded.

    Args:
        url: Address to download.
        destination: Path of the file to create or overwrite.
        timeout: Timeout passed to `requests.get`.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an HTTP error status.
        OSError: If the file cannot be written or moved into place.
    """
    partial_path = destination + ".part"
    try:
        # The context manager closes the streamed response even when an error occurs
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail here rather than saving an HTTP error page as if it were data
            response.raise_for_status()
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)
        os.replace(partial_path, destination)
    finally:
        # Still present only when the transfer did not complete
        if os.path.exists(partial_path):
            os.remove(partial_path)


# Navigate to entrySet elements with the ID "RAW"
raw_entry_sets = root.findall('.//cat:entrySet[@ID="RAW"]', namespaces=catalog_ns)

# Iterate through RAW entrySets to extract and download fMRI data
for raw_entry_set in raw_entry_sets:
    # Navigate to entries element under the RAW entrySet
    entries_element = raw_entry_set.find('.//cat:entries', namespaces=catalog_ns)
    if entries_element is None:
        # An entrySet without an entries child has nothing to download
        continue

    # Navigate to entry elements under the entries element
    entries = entries_element.findall('.//cat:entry', namespaces=catalog_ns)

    # NOTE(review): the XPath above selects only entrySets whose ID is "RAW", so
    # this value is always "RAW" and is not a per-subject name -- confirm which
    # attribute or element of the catalog actually carries the subject.
    subject_name = raw_entry_set.get('ID')

    # NOTE(review): this directory is created but nothing is written into it;
    # the files below are saved under download_dir. The path is also absolute
    # and environment-specific -- confirm the intended save location.
    save_dir = os.path.join(base_dir, subject_name)
    os.makedirs(save_dir, exist_ok=True)

    # Iterate through entries and download fMRI data in batches
    for entry in entries:
        # Extract relevant information, such as URI and name
        entry_uri = entry.get('URI')
        entry_name = entry.get('name')
        if not entry_uri or not entry_name:
            # Without both attributes there is no valid URL or file name to use
            failed_downloads.append((entry_uri, "entry is missing its URI or name attribute"))
            continue

        # Construct a URL based on the extracted URI
        full_url = f"{base_url}{entry_uri}"

        # Save the downloaded fMRI files with the entry name
        file_path = os.path.join(download_dir, entry_name)
        # The entry name may include sub-directories; create them before writing
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        try:
            _download_to_file(full_url, file_path, request_timeout, download_chunk_size)
        except requests.RequestException as error:
            # Record the failure and keep going so one bad entry does not end the batch
            failed_downloads.append((full_url, str(error)))
            print(f"Skipping {full_url}: {error}")
            continue

        # Perform preprocessing on the downloaded fMRI data
        preprocess_fmri_data(file_path)

if failed_downloads:
    print(f"{len(failed_downloads)} entries could not be downloaded; see failed_downloads")