In [15]:
import zipfile
import os
import pandas as pd
import numpy as np
import shutil
import re


In [30]:
# Directory containing the per-site ZIP archives.
directory_path = '/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200'
# Phenotypic table; rows are matched to subjects later through the SUB_ID column.
csv_path = './ABIDEII_Composite_Phenotypic.csv'

# The file is not valid UTF-8, hence the explicit ISO-8859-1 encoding.
csv_data = pd.read_csv(csv_path, encoding='ISO-8859-1')

# Recode the value 2 as 0 so SEX and DX_GROUP become 0/1 columns.
# NOTE(review): presumably 1/2 are male/female and ASD/control in the ABIDE
# phenotypic key -- confirm against the data dictionary.
# The result is assigned back instead of calling
# `csv_data[col].replace(..., inplace=True)`: that chained in-place form acts
# on a temporary Series, warns on recent pandas and leaves `csv_data` unchanged
# once Copy-on-Write is enabled.
csv_data['SEX'] = csv_data['SEX'].replace({2: 0})
csv_data['DX_GROUP'] = csv_data['DX_GROUP'].replace({2: 0})


In [17]:
# Preview step: show where each site archive would be unpacked.
base_path = '/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/'

# Keep only the ZIP archives found directly under the base directory.
zip_files = list(filter(lambda entry: entry.endswith('.zip'),
                        os.listdir(base_path)))

for zip_file_name in zip_files:
    zip_path = os.path.join(base_path, zip_file_name)
    # Target folder = archive name with the trailing ".zip" removed.
    folder_name = zip_file_name[:-len('.zip')]
    extracted_folder_path = os.path.join(base_path, folder_name)
    print(extracted_folder_path)


/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-NYU_2
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII_GU_1
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII_IP_1
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII_EMC_1
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-NYU_1
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-UCLA_1
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-KKI_1_29424_29485
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-KKI_1_29273_29322
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-KKI_1_29373_29423
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-U_MIA_1
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII_IU_1
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-SDSU_1
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-USM_1
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-KKI_1_29323_29372
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-UCD_1
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII-KUL_3
/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/ABIDEII_BNI_1

In [8]:
# ### Combining KKI_1 data

# # Create a target directory for combined data
# combined_path = os.path.join(base_path, 'ABIDEII-KKI_1')
# os.makedirs(combined_path, exist_ok=True)

# # List the ZIP files in the directory
# zip_files = [f for f in os.listdir(base_path) if f.endswith('.zip') and 'KKI_1' in f]

# for zip_file_name in zip_files:
#     zip_path = os.path.join(base_path, zip_file_name)

#     # Temporary extraction path
#     extracted_folder_path = os.path.join(base_path, zip_file_name[:-4])

#     # Unzip the file
#     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#         zip_ref.extractall(extracted_folder_path)

#     # Move the extracted contents to the combined directory
#     for subject_file in os.listdir(extracted_folder_path):
#         shutil.move(os.path.join(extracted_folder_path,
#                     subject_file), combined_path)

#     # Remove temporary extraction path
#     shutil.rmtree(extracted_folder_path)

# print(f"Combined all KKI_1 data into: {combined_path}")


In [63]:
def _site_name_from_archive(archive_name):
    """Return the site label (e.g. ``'NYU_2'``) encoded in an archive name.

    The label is the last two ``_``/``-`` separated tokens of the archive's
    base name, so ``'ABIDEII-NYU_2.zip'`` gives ``'NYU_2'`` and
    ``'ABIDEII-U_MIA_1.zip'`` gives ``'MIA_1'``.

    Only the file name is parsed, never the directory in front of it, so
    separators in the base path cannot end up in the label. Archives that are
    split by subject-ID range (``'ABIDEII-KKI_1_29424_29485.zip'``) have the
    trailing five-digit range tokens removed first, so every part maps to the
    same site (``'KKI_1'``).

    Args:
        archive_name: File or folder name, with or without a ``.zip`` suffix.

    Returns:
        The site label as a string.
    """
    stem = archive_name[:-4] if archive_name.endswith('.zip') else archive_name
    stem = re.sub(r'(?:_\d{5})+$', '', stem)
    return '_'.join(re.split('[_-]', stem)[-2:])


# Per-subject fields are collected as parallel lists (index i of every list
# belongs to the same subject) and turned into arrays at the end.
fc_data = {
    'corr': [],
    'label': [],
    'site': [],
    'age': [],
    'sex': []
}

base_path = '/mnt/ssd1/mehul_data/ABIDE2/abide2_fc_200/'

# os.listdir() returns entries in arbitrary order; sorting keeps the sample
# order identical from one run to the next.
zip_files = sorted(f for f in os.listdir(base_path) if f.endswith('.zip'))

for zip_file_name in zip_files:
    zip_path = os.path.join(base_path, zip_file_name)
    # Extraction folder = archive path without the trailing ".zip".
    extracted_folder_path = os.path.join(base_path, zip_file_name[:-4])

    site_name = _site_name_from_archive(zip_file_name)
    print("Site name: ", site_name)

    # Unzip the file if not already unzipped
    if not os.path.exists(extracted_folder_path):
        os.mkdir(extracted_folder_path)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_folder_path)

    # Iterate through subject data
    for subject_folder in sorted(os.listdir(extracted_folder_path)):
        # Folder names are read as "<prefix>-<numeric id>[_...]".
        subject_id_str = subject_folder.split('_')[0]
        try:
            subject_id = int(subject_id_str.split('-')[1])
        except (IndexError, ValueError):
            # Not a subject entry (stray file or archive metadata): skip it
            # rather than abort the whole run.
            continue

        # Subjects without a phenotypic record cannot be labelled; drop them.
        row = csv_data[csv_data['SUB_ID'] == subject_id]
        if row.empty:
            continue

        # The connectome is expected at <subject_folder>/<id>_connectomes.csv.
        corr_file_name = f'{subject_id_str}_connectomes.csv'
        corr_file_path = os.path.join(
            extracted_folder_path, subject_folder, corr_file_name)
        if not os.path.isfile(corr_file_path):
            print(f"Skipping {subject_folder}: {corr_file_name} not found")
            continue

        # The CSV has no header row; every line is a row of the matrix.
        corr_data = pd.read_csv(corr_file_path, header=None).values

        fc_data['corr'].append(corr_data)
        fc_data['label'].append(row['DX_GROUP'].values[0])
        fc_data['site'].append(site_name)
        # The trailing space in the column key is intentional: this lookup is
        # how the column is addressed in the phenotypic CSV -- do not strip it.
        fc_data['age'].append(row['AGE_AT_SCAN '].values[0])
        fc_data['sex'].append(row['SEX'].values[0])


# Convert lists to numpy arrays
fc_data = {key: np.array(values) for key, values in fc_data.items()}


Site name:  NYU_2
Site name:  GU_1
Site name:  IP_1
Site name:  EMC_1
Site name:  NYU_1
Site name:  UCLA_1
Site name:  MIA_1
Site name:  IU_1
Site name:  SDSU_1
Site name:  USM_1
Site name:  UCD_1
Site name:  KUL_3
Site name:  KKI_1
Site name:  BNI_1
Site name:  ETH_1
Site name:  OHSU_1
Site name:  TCD_1


In [64]:
# Sanity check: list the available fields, then the array shape of each one.
print(fc_data.keys())
for field_name in ('corr', 'label', 'site', 'age', 'sex'):
    print(fc_data[field_name].shape)


dict_keys(['corr', 'label', 'site', 'age', 'sex'])
(812, 200, 200)
(812,)
(812,)
(812,)
(812,)


In [67]:
# Distinct values per field; 'age' is deliberately left out of this listing.
for field_name in ('label', 'site', 'sex'):
    print(np.unique(fc_data[field_name]))


[0 1]
['BNI_1' 'EMC_1' 'ETH_1' 'GU_1' 'IP_1' 'IU_1' 'KKI_1' 'KUL_3' 'MIA_1'
 'NYU_1' 'NYU_2' 'OHSU_1' 'SDSU_1' 'TCD_1' 'UCD_1' 'UCLA_1' 'USM_1']
[0 1]


In [68]:
# Shape of the stacked correlation matrices.
corr_matrices = fc_data['corr']
print(corr_matrices.shape)

(812, 200, 200)


In [69]:
# Write the whole dict to a single .npy file. Because `fc_data` is a dict and
# not an array, reading it back needs allow_pickle=True followed by .item().
save_path = "/mnt/ssd1/mehul_data/research/fc_abide2_200.npy"
np.save(file=save_path, arr=fc_data)


In [74]:
# Round-trip check: read the saved file back and confirm its contents.
load_path = "/mnt/ssd1/mehul_data/research/fc_abide2_200.npy"

# The file stores a pickled dict wrapped in a single-element object array, so
# pickling must be allowed and .item() unwraps the dict. Unpickling can run
# arbitrary code: only do this with files you produced yourself.
packed = np.load(load_path, allow_pickle=True)
loaded_data = packed.item()

# Expect the same keys and 'corr' shape as the in-memory `fc_data`.
print(loaded_data.keys())
print(loaded_data['corr'].shape)


dict_keys(['corr', 'label', 'site', 'age', 'sex'])
(812, 200, 200)
