<span style="font-family: Century Gothic"> Load libraries</span>


In [125]:
import pandas as pd
import openpyxl
import numpy as np
from sklearn.preprocessing import StandardScaler

### <span style="font-family: Century Gothic"> Load and Explore the Dataset</span>

- <span style="font-family: Century Gothic"> Initial exploration to understand the structure, features, and potential issues </span>
- <span style="font-family: Century Gothic">Use libraries like Pandas to load the dataset.</span>
- <span style="font-family: Century Gothic">Check the size of the dataset (number of rows and columns).</span>
- <span style="font-family: Century Gothic">Inspect the first few rows to understand the features and their formats.</span>

In [3]:
# Read every input table into its own DataFrame.
# The data dictionary sits next to the notebook rather than inside Data/.
df_dictionary = pd.read_excel('Data Dictionary.xlsx')

# NOTE(review): 'METADATA_b.xlsx' uses a lowercase "b" unlike 'METADATA_A.xlsx' --
# confirm the real file name, since this matters on case-sensitive filesystems.
df_labels, df_meta_a, df_meta_b = (
    pd.read_excel(workbook)
    for workbook in ('Data/LABELS.xlsx', 'Data/METADATA_A.xlsx', 'Data/METADATA_b.xlsx')
)

# The connectome matrix is the only CSV input.
df_matric = pd.read_csv('Data/FUNCTIONAL_CONNECTOME_MATRICES.csv')

### <span style="font-family: Century Gothic"> Data Dictionary</span>
#### <p style="font-family: Century Gothic; font-size: 15px">Purpose: Understand the variables in the dataset.</p>

- <span style="font-family: Century Gothic; font-size: 15px"> Inspect the file to see how variables are described.</span>
- <span style="font-family: Century Gothic; font-size: 15px">Identify key variables.</span>

In [None]:
# Preview the first two rows of the data dictionary to see how variables are described
df_dictionary.head(2)

In [68]:
# How many dictionary entries fall under each DataType label
df_dictionary['DataType'].value_counts()

DataType
Quantitative    18
Categorical      9
Target           2
METADATA A       1
METADATA B       1
TARGETS          1
Name: count, dtype: int64

In [69]:
# Work on a copy so the raw data dictionary stays untouched.
df_dictionary_copy = df_dictionary.copy()

# Compare against whitespace-stripped labels. The previous filter had to match
# 'Categorical ' (with a trailing space), which silently returns an empty frame
# as soon as the sheet is cleaned up; stripping matches with or without padding.
data_type = df_dictionary_copy['DataType'].str.strip()

# Filter rows by DataType
df_dictionary_meta_a = df_dictionary_copy[data_type == 'Quantitative']
df_dictionary_meta_b = df_dictionary_copy[data_type == 'Categorical']
df_dictionary_target = df_dictionary_copy[data_type == 'Target']

In [71]:
# Display the data-dictionary rows whose DataType is 'Target'
df_dictionary_target

Unnamed: 0,DataType,Instrument,Field,Description,Type,Labels
35,Target,Diagnosis:ADHD_type,ADHD_Outcome,Type of Diagnosis,str,"0= Other/None, 1=ADHD"
36,Target,"Demographics:Basic_Demos,Sex",Sex_F,Sex of participant,categorical int,0=Male\n1=Female


### <span style="font-family: Century Gothic">Labels</span>
#### <p style="font-family: Century Gothic; font-size: 15px">Purpose: Contains the target variables.</p>
- <span style="font-family: Century Gothic; font-size: 15px">Check the structure of the file.</span>
- <span style="font-family: Century Gothic; font-size: 15px">Identify the columns for ADHD diagnosis and sex.</span>
- <span style="font-family: Century Gothic; font-size: 15px">Check for missing values or inconsistencies.</span>

In [74]:
# Preview the first three rows of the labels table
df_labels.head(3)

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0


In [82]:
# (rows, columns) of the labels table
df_labels.shape

(1213, 3)

In [76]:
# Missing-value count per label column
df_labels.isna().sum()

participant_id    0
ADHD_Outcome      0
Sex_F             0
dtype: int64

In [None]:
# Class balance of the diagnosis target (0 = Other/None, 1 = ADHD)
df_labels['ADHD_Outcome'].value_counts()

ADHD_Outcome
1    831
0    382
Name: count, dtype: int64

In [None]:
# Class balance of the sex target (0 = Male, 1 = Female)
df_labels['Sex_F'].value_counts()

Sex_F
0    797
1    416
Name: count, dtype: int64

In [None]:
# Male participants with an ADHD diagnosis: ADHD_Outcome == 1 and Sex_F == 0.
df_male_with_adhd = df_labels.query("ADHD_Outcome == 1 and Sex_F == 0")
df_male_with_adhd.shape

(581, 3)

In [None]:
# Female participants with an ADHD diagnosis: ADHD_Outcome == 1 and Sex_F == 1.
df_female_with_adhd = df_labels.query("ADHD_Outcome == 1 and Sex_F == 1")
df_female_with_adhd.shape

(250, 3)

### <span style="font-family: Century Gothic">Meta A and Meta B</span>
#### <p style="font-family: Century Gothic; font-size: 15px">Purpose: Contains additional metadata about the subjects.</p>
- <span style="font-family: Century Gothic; font-size: 15px">Inspect the files to understand the features.</span>
- <span style="font-family: Century Gothic; font-size: 15px">Check for missing values and inconsistencies.</span>
- <span style="font-family: Century Gothic; font-size: 15px">Merge these files if they contain complementary information.</span>

In [101]:
# Preview the first rows of METADATA_A
df_meta_a.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,UmrK0vMLopoR,40.0,13,3,10,47,13,11,28,0,6,1,5,0,5,1,0,10,
1,CPaeQkhcjg7d,-94.47,14,3,13,34,18,23,30,0,18,6,8,7,8,10,4,5,
2,Nb4EetVPm3gs,-46.67,14,4,10,35,16,10,29,1,14,2,8,5,7,6,4,9,8.239904
3,p4vPhVu91o4b,-26.68,10,5,12,39,19,16,28,6,24,4,16,9,10,8,4,6,
4,M09PXs7arQ5E,0.0,14,5,15,40,20,24,28,1,18,4,11,4,10,7,3,9,8.940679


In [106]:
# Preview the first rows of METADATA_B
df_meta_b.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0


In [96]:
# Missing-value count per METADATA_A column
df_meta_a.isna().sum()

participant_id                  0
EHQ_EHQ_Total                   0
ColorVision_CV_Score            0
APQ_P_APQ_P_CP                  0
APQ_P_APQ_P_ID                  0
APQ_P_APQ_P_INV                 0
APQ_P_APQ_P_OPD                 0
APQ_P_APQ_P_PM                  0
APQ_P_APQ_P_PP                  0
SDQ_SDQ_Conduct_Problems        0
SDQ_SDQ_Difficulties_Total      0
SDQ_SDQ_Emotional_Problems      0
SDQ_SDQ_Externalizing           0
SDQ_SDQ_Generating_Impact       0
SDQ_SDQ_Hyperactivity           0
SDQ_SDQ_Internalizing           0
SDQ_SDQ_Peer_Problems           0
SDQ_SDQ_Prosocial               0
MRI_Track_Age_at_Scan         360
dtype: int64

In [97]:
# Missing-value count per METADATA_B column
df_meta_b.isna().sum()

participant_id                       0
Basic_Demos_Enroll_Year              0
Basic_Demos_Study_Site               0
PreInt_Demos_Fam_Child_Ethnicity    11
PreInt_Demos_Fam_Child_Race          0
MRI_Track_Scan_Location              0
Barratt_Barratt_P1_Edu               0
Barratt_Barratt_P1_Occ               0
Barratt_Barratt_P2_Edu               0
Barratt_Barratt_P2_Occ               0
dtype: int64

In [116]:
# Combine both metadata tables into one row per participant.
# how='outer' keeps participants that appear in only one of the two tables
# (their missing side becomes NaN) and orders the result by participant_id.
# validate='one_to_one' makes pandas raise MergeError if either table repeats a
# participant_id, instead of silently multiplying rows in the merged frame.
df_meta = pd.merge(
    df_meta_a,
    df_meta_b,
    on='participant_id',
    how='outer',
    validate='one_to_one',
)
df_meta.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,...,MRI_Track_Age_at_Scan,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,00aIpNTbG5uh,100.0,13,3,15,44,14,20,27,3,...,14.274127,2019,4,1.0,0,3,21,45,0,0
1,00fV0OyyoLfw,92.27,14,3,12,35,25,28,30,5,...,,2017,1,0.0,9,2,21,0,21,45
2,04X1eiS79T4B,86.67,14,3,21,37,18,26,28,3,...,13.463381,2017,1,1.0,2,2,9,0,0,0
3,05ocQutkURd6,93.34,14,3,11,42,15,20,28,0,...,9.572553,2018,1,3.0,8,2,18,10,18,0
4,06YUNBA9ZRLq,0.0,14,8,12,35,22,12,24,6,...,6.654574,2018,1,0.0,1,2,12,0,0,0


In [117]:
# Missing-value count per column of the merged metadata
df_meta.isna().sum()

participant_id                        0
EHQ_EHQ_Total                         0
ColorVision_CV_Score                  0
APQ_P_APQ_P_CP                        0
APQ_P_APQ_P_ID                        0
APQ_P_APQ_P_INV                       0
APQ_P_APQ_P_OPD                       0
APQ_P_APQ_P_PM                        0
APQ_P_APQ_P_PP                        0
SDQ_SDQ_Conduct_Problems              0
SDQ_SDQ_Difficulties_Total            0
SDQ_SDQ_Emotional_Problems            0
SDQ_SDQ_Externalizing                 0
SDQ_SDQ_Generating_Impact             0
SDQ_SDQ_Hyperactivity                 0
SDQ_SDQ_Internalizing                 0
SDQ_SDQ_Peer_Problems                 0
SDQ_SDQ_Prosocial                     0
MRI_Track_Age_at_Scan               360
Basic_Demos_Enroll_Year               0
Basic_Demos_Study_Site                0
PreInt_Demos_Fam_Child_Ethnicity     11
PreInt_Demos_Fam_Child_Race           0
MRI_Track_Scan_Location               0
Barratt_Barratt_P1_Edu                0


### <span style="font-family: Century Gothic">Clean and Preprocess the data</span>

- <span style="font-family: Century Gothic; font-size: 15px">Handle missing values.</span>
- <span style="font-family: Century Gothic; font-size: 15px">Encode categorical variables.</span>
- <span style="font-family: Century Gothic; font-size: 15px">Normalize/scale numerical features.</span>

In [None]:
# Impute missing 'MRI_Track_Age_at_Scan' values with the column median.
# The filled column is assigned back to df_meta explicitly: calling
# fillna(..., inplace=True) on a column pulled out of the frame is chained
# assignment, which raises a FutureWarning in pandas 2.2 and no longer updates
# df_meta once Copy-on-Write is enabled.
age_median = df_meta['MRI_Track_Age_at_Scan'].median()
df_meta['MRI_Track_Age_at_Scan'] = df_meta['MRI_Track_Age_at_Scan'].fillna(age_median)

In [None]:
# Impute missing 'PreInt_Demos_Fam_Child_Ethnicity' values with the most
# frequent code. The column comes from METADATA_B and holds category codes, so
# the mode is used rather than the median: a median of codes can land on a
# value that is not a valid category.
# The result is assigned back explicitly because fillna(..., inplace=True) on a
# column pulled out of the frame is chained assignment and stops updating
# df_meta once Copy-on-Write is enabled.
ethnicity_mode = df_meta['PreInt_Demos_Fam_Child_Ethnicity'].mode().iloc[0]
df_meta['PreInt_Demos_Fam_Child_Ethnicity'] = (
    df_meta['PreInt_Demos_Fam_Child_Ethnicity'].fillna(ethnicity_mode)
)

In [121]:
# Re-count missing values per column of df_meta
df_meta.isna().sum()

participant_id                      0
EHQ_EHQ_Total                       0
ColorVision_CV_Score                0
APQ_P_APQ_P_CP                      0
APQ_P_APQ_P_ID                      0
APQ_P_APQ_P_INV                     0
APQ_P_APQ_P_OPD                     0
APQ_P_APQ_P_PM                      0
APQ_P_APQ_P_PP                      0
SDQ_SDQ_Conduct_Problems            0
SDQ_SDQ_Difficulties_Total          0
SDQ_SDQ_Emotional_Problems          0
SDQ_SDQ_Externalizing               0
SDQ_SDQ_Generating_Impact           0
SDQ_SDQ_Hyperactivity               0
SDQ_SDQ_Internalizing               0
SDQ_SDQ_Peer_Problems               0
SDQ_SDQ_Prosocial                   0
MRI_Track_Age_at_Scan               0
Basic_Demos_Enroll_Year             0
Basic_Demos_Study_Site              0
PreInt_Demos_Fam_Child_Ethnicity    0
PreInt_Demos_Fam_Child_Race         0
MRI_Track_Scan_Location             0
Barratt_Barratt_P1_Edu              0
Barratt_Barratt_P1_Occ              0
Barratt_Barr

In [123]:
# Preview the first rows of the merged metadata
df_meta.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,...,MRI_Track_Age_at_Scan,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,00aIpNTbG5uh,100.0,13,3,15,44,14,20,27,3,...,14.274127,2019,4,1.0,0,3,21,45,0,0
1,00fV0OyyoLfw,92.27,14,3,12,35,25,28,30,5,...,10.739219,2017,1,0.0,9,2,21,0,21,45
2,04X1eiS79T4B,86.67,14,3,21,37,18,26,28,3,...,13.463381,2017,1,1.0,2,2,9,0,0,0
3,05ocQutkURd6,93.34,14,3,11,42,15,20,28,0,...,9.572553,2018,1,3.0,8,2,18,10,18,0
4,06YUNBA9ZRLq,0.0,14,8,12,35,22,12,24,6,...,6.654574,2018,1,0.0,1,2,12,0,0,0


In [126]:
# Inspect the dtype of each df_meta column
df_meta.dtypes

participant_id                       object
EHQ_EHQ_Total                       float64
ColorVision_CV_Score                  int64
APQ_P_APQ_P_CP                        int64
APQ_P_APQ_P_ID                        int64
APQ_P_APQ_P_INV                       int64
APQ_P_APQ_P_OPD                       int64
APQ_P_APQ_P_PM                        int64
APQ_P_APQ_P_PP                        int64
SDQ_SDQ_Conduct_Problems              int64
SDQ_SDQ_Difficulties_Total            int64
SDQ_SDQ_Emotional_Problems            int64
SDQ_SDQ_Externalizing                 int64
SDQ_SDQ_Generating_Impact             int64
SDQ_SDQ_Hyperactivity                 int64
SDQ_SDQ_Internalizing                 int64
SDQ_SDQ_Peer_Problems                 int64
SDQ_SDQ_Prosocial                     int64
MRI_Track_Age_at_Scan               float64
Basic_Demos_Enroll_Year               int64
Basic_Demos_Study_Site                int64
PreInt_Demos_Fam_Child_Ethnicity    float64
PreInt_Demos_Fam_Child_Race     

In [None]:
# Standardise the numeric columns of df_meta (zero mean, unit variance).
# `y` holds the names of the float64/int64 columns being scaled -- it is not a
# target vector.
# NOTE(review): this selection also picks up the integer-coded columns that come
# from METADATA_B (e.g. Basic_Demos_Study_Site, PreInt_Demos_Fam_Child_Race) --
# confirm that scaling those codes is intended.
y = df_meta.select_dtypes(include=['float64', 'int64']).columns

scaler = StandardScaler()
scaler.fit(df_meta[y])
df_meta[y] = scaler.transform(df_meta[y])

In [128]:
# Inspect the dtype of each df_meta column again
df_meta.dtypes

participant_id                       object
EHQ_EHQ_Total                       float64
ColorVision_CV_Score                float64
APQ_P_APQ_P_CP                      float64
APQ_P_APQ_P_ID                      float64
APQ_P_APQ_P_INV                     float64
APQ_P_APQ_P_OPD                     float64
APQ_P_APQ_P_PM                      float64
APQ_P_APQ_P_PP                      float64
SDQ_SDQ_Conduct_Problems            float64
SDQ_SDQ_Difficulties_Total          float64
SDQ_SDQ_Emotional_Problems          float64
SDQ_SDQ_Externalizing               float64
SDQ_SDQ_Generating_Impact           float64
SDQ_SDQ_Hyperactivity               float64
SDQ_SDQ_Internalizing               float64
SDQ_SDQ_Peer_Problems               float64
SDQ_SDQ_Prosocial                   float64
MRI_Track_Age_at_Scan               float64
Basic_Demos_Enroll_Year             float64
Basic_Demos_Study_Site              float64
PreInt_Demos_Fam_Child_Ethnicity    float64
PreInt_Demos_Fam_Child_Race     

In [None]:
# Sanity check: after StandardScaler each scaled column should have mean ~0 and
# standard deviation ~1.
# StandardScaler divides by the population standard deviation (ddof=0), whereas
# pandas' .std() reports the sample standard deviation (ddof=1), so the values
# shown here are sqrt(n / (n - 1)) rather than exactly 1 (about 1.000412 for
# n = 1213 rows).
df_meta[y].std(ddof=1)

EHQ_EHQ_Total                       1.000412
ColorVision_CV_Score                1.000412
APQ_P_APQ_P_CP                      1.000412
APQ_P_APQ_P_ID                      1.000412
APQ_P_APQ_P_INV                     1.000412
APQ_P_APQ_P_OPD                     1.000412
APQ_P_APQ_P_PM                      1.000412
APQ_P_APQ_P_PP                      1.000412
SDQ_SDQ_Conduct_Problems            1.000412
SDQ_SDQ_Difficulties_Total          1.000412
SDQ_SDQ_Emotional_Problems          1.000412
SDQ_SDQ_Externalizing               1.000412
SDQ_SDQ_Generating_Impact           1.000412
SDQ_SDQ_Hyperactivity               1.000412
SDQ_SDQ_Internalizing               1.000412
SDQ_SDQ_Peer_Problems               1.000412
SDQ_SDQ_Prosocial                   1.000412
MRI_Track_Age_at_Scan               1.000412
Basic_Demos_Enroll_Year             1.000412
Basic_Demos_Study_Site              1.000412
PreInt_Demos_Fam_Child_Ethnicity    1.000412
PreInt_Demos_Fam_Child_Race         1.000412
MRI_Track_

In [132]:
# Write df_meta to an Excel workbook in the working directory, leaving out the
# DataFrame index
df_meta.to_excel(excel_writer='METADATA.xlsx', index=False)

### <span style="font-family: Century Gothic">Functional Connectome</span>
#### <p style="font-family: Century Gothic; font-size: 15px">Purpose: Contains fMRI data (functional connectivity between brain regions).</p>
- <span style="font-family: Century Gothic; font-size: 15px">Inspect the file to understand the structure (e.g., rows = subjects, columns = brain region pairs).</span>
- <span style="font-family: Century Gothic; font-size: 15px">Check for missing values and inconsistencies.</span>
- <span style="font-family: Century Gothic; font-size: 15px">Normalize or standardize the data if necessary.</span>


In [107]:
# Preview the first rows of the functional connectome matrix
df_matric.head()

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,70z8Q2xdTXM3,0.093473,0.146902,0.067893,0.015141,0.070221,0.063997,0.055382,-0.035335,0.068583,...,0.003404,-0.010359,-0.050968,-0.014365,0.128066,0.112646,-0.05898,0.028228,0.133582,0.143372
1,WHWymJu6zNZi,0.02958,0.179323,0.112933,0.038291,0.104899,0.06425,0.008488,0.077505,-0.00475,...,-0.008409,-0.008479,0.020891,0.017754,0.09404,0.035141,0.032537,0.075007,0.11535,0.1382
2,4PAQp1M6EyAo,-0.05158,0.139734,0.068295,0.046991,0.111085,0.026978,0.151377,0.021198,0.083721,...,0.053245,-0.028003,0.028773,0.024556,0.166343,0.058925,0.035485,0.063661,0.042862,0.162162
3,obEacy4Of68I,0.016273,0.204702,0.11598,0.043103,0.056431,0.057615,0.055773,0.07503,0.001033,...,-0.023918,-0.005356,0.018607,0.016193,0.072955,0.130135,0.05612,0.084784,0.114148,0.190584
4,s7WzzDcmDOhF,0.065771,0.098714,0.097604,0.112988,0.071139,0.085607,0.019392,-0.036403,-0.020375,...,0.066439,-0.07668,-0.04753,-0.031443,0.221213,0.007343,0.005763,0.08382,0.079582,0.067269


In [111]:
# Average number of missing values per column; 0.0 means no column has any
df_matric.isna().sum().mean()

np.float64(0.0)

In [112]:
# (rows, columns) of the functional connectome matrix
df_matric.shape

(1213, 19901)