In [14]:
#!/usr/bin/python3

# ***PURPOSE***
# This script generates the vars.txt file (a subjects x subject-measures matrix) used by Smith et al. in their analysis of the HCP_500 data
# See reference: https://www.fmrib.ox.ac.uk/datasets/HCP-CCA/

# ***USAGE***
# Multiple files are needed:
# 1. a .txt file containing the names of the subject measures (SMs) to be used in the analysis
# 2. a .txt file containing the names of all subjects to be analyzed (their subject IDs)
# 3. the behavioral data from HCP
# 4. the 'restricted' data from HCP (requires special access, must request this)
# 5. the rfMRI_motion.txt file
# 6. the quarter/release info file (named varsQconf.txt)

# ***NOTE***
# Files 1, 2, 5, and 6 are included in our GitHub repo (named subject_measure_names.txt, subject_ids.txt, rfMRI_motion.txt, and varsQconf.txt, respectively)

# ***EXAMPLE USAGE ON CMD LINE***
# ./generate_vars.py column_headers.txt subjects.txt <behavioral data> <restricted data> rfMRI_motion.txt varsQconf.txt

import numpy as np
import pandas as pd
from pandas import DataFrame
from numpy import genfromtxt
import os
import sys
from pprint import pprint

cwd = os.getcwd()
# Input/output folders live next to the current working directory
# (i.e. <parent of cwd>/inputs and <parent of cwd>/outputs). The previous
# "__file__" + "/../../inputs" form used the *string* "__file__" as a dummy
# path component and resolved to exactly these locations.
inputs = os.path.abspath(os.path.join(cwd, os.pardir, 'inputs'))
outputs = os.path.abspath(os.path.join(cwd, os.pardir, 'outputs'))  # NOTE CHANGE THIS TO YOUR DESIRED OUTPUT PATH!

column_headers_fp = os.path.join(inputs, 'subject_measure_names.txt')
subject_ids_fp = os.path.join(inputs, 'subject_ids.txt')
behavioral_data_fp = os.path.join(inputs, 'unrestricted_500_release.csv')
restricted_data_fp = os.path.join(inputs, 'restricted_500_release.csv')
rfMRI_data_fp = os.path.join(inputs, 'rfMRI_motion.txt')
varsQconf_fp = os.path.join(inputs, 'varsQconf.txt')


def _strip_suffix(text, suffix):
    """Return `text` without a trailing `suffix` (unchanged if it does not end with it)."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


# get the column headers, and names of subjects
# The paths above are already absolute, so they are opened directly; `with`
# guarantees the file handles are closed.
with open(column_headers_fp) as headers_file:
    column_headers = [line.rstrip('\n') for line in headers_file]

# str.rstrip('.pconn.nii\n') strips any trailing run of those *characters*, not
# the literal suffix, and only worked because the IDs end in digits. Remove the
# suffix explicitly instead, and ignore blank lines.
with open(subject_ids_fp) as subjects_file:
    subjects = [
        _strip_suffix(line.strip(), '.pconn.nii')
        for line in subjects_file
        if line.strip()
    ]

# now import "behavioral" and "restricted" datasets into Pandas dataframes
behavioral_data = pd.read_csv(behavioral_data_fp)
restricted_data = pd.read_csv(restricted_data_fp)


# Now we will filter out only the rows that correspond to the subjects specified in subjects.txt
# Sanity check, making sure that the filtering occurs correctly
print('behavior shape before', behavioral_data.shape)
print('shape of restricted before', restricted_data.shape)

# Keep only the rows whose Subject ID appears in the subject list.
# `subjects` holds strings, while read_csv presumably parses the 'Subject'
# column as integers; compare as strings so the match does not depend on
# pandas implicitly coercing between the two types.
# NOTE(review): an earlier comment here mentioned a duplicated subject (142626)
# being deleted; nothing in this cell removes it explicitly -- confirm it is
# simply absent from the subject list.
behavioral_data = behavioral_data[behavioral_data['Subject'].astype(str).isin(subjects)]
restricted_data = restricted_data[restricted_data['Subject'].astype(str).isin(subjects)]

print('behavior shape after', behavioral_data.shape)
print('shape of restricted after', restricted_data.shape)

behavior shape before (527, 338)
shape of restricted before (542, 198)
behavior shape after (461, 338)
shape of restricted after (461, 198)


In [15]:
# Drop the 'Subject' identifier column so that duplicate detection compares the
# remaining columns only. errors='ignore' keeps this cell re-runnable: a second
# run (when 'Subject' is already gone) is a no-op instead of a KeyError.
behavioral_data = behavioral_data.drop(columns='Subject', errors='ignore')

# A bare `behavioral_data.shape` in the middle of a cell is evaluated and
# discarded; print it so the shape is actually shown.
print(behavioral_data.shape)

# Boolean Series: True for every row that repeats an earlier row in all columns.
behavioral_data.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
522    False
523    False
524    False
525    False
526    False
Length: 461, dtype: bool

In [13]:
# Flag rows that repeat an earlier row across all columns, then keep just those rows.
repeated_row_mask = behavioral_data.duplicated()
duplicateRowsDF = behavioral_data[repeated_row_mask]

# Header line followed by the (possibly empty) frame of duplicated rows.
print("Duplicate Rows based on column are:")
print(duplicateRowsDF)

Duplicate Rows based on column are:
Empty DataFrame
Columns: [Release, Gender, Age, Full_MR_Compl, T1_Count, T2_Count, RS-fMRI_Count, Full_Task_fMRI, fMRI_WM_Compl, fMRI_Gamb_Compl, fMRI_Mot_Compl, fMRI_Lang_Compl, fMRI_Soc_Compl, fMRI_Rel_Compl, fMRI_Emo_Compl, dMRI_Compl, dMRI_3T_ReconVrs, fMRI_3T_ReconVrs, MEG_AnyData, MEG_FullProt_Compl, MEG_HeadModel_Avail, MEG_CortRibn_Avail, MEG_Anatomy_Avail, MEG_Anatomy_Compl, MEG_Noise_Avail, MEG_Noise_Compl, MEG_RS_Avail, MEG_RS_Compl, MEG_WM_Avail, MEG_WM_Compl, MEG_StoryMath_Avail, MEG_StoryMath_Compl, MEG_Motor_Avail, MEG_Motor_Compl, Non-TB_Compl, VisProc_Compl, DelDisc_Compl, SCPT_Compl, IWRD_Compl, PMAT_Compl, VSPLOT_Compl, EmoRecog_Compl, NEO-FFI_Compl, ASR-Syn_Compl, ASR-DSM_Compl, Toolbox_Compl, MMSE_Compl, PSQI_Compl, Alert_Compl, ASQ_Compl, MRsession_Scanner, MRsession_Scans, MRsession_Label, MMSE_Score, PSQI_Score, PicSeq_Unadj, PicSeq_AgeAdj, CardSort_Unadj, CardSort_AgeAdj, Flanker_Unadj, Flanker_AgeAdj, PMAT24_A_CR, PMAT24_A_S

In [7]:
# Show the column labels of behavioral_data with 'Subject' excluded.
# The labels come back sorted (see the output below), not in the table's column order.
behavioral_data.columns.difference(['Subject'])

Index(['ASQ_Compl', 'ASR-DSM_Compl', 'ASR-Syn_Compl', 'Age', 'Alert_Compl',
       'AngAffect_Unadj', 'AngAggr_Unadj', 'AngHostil_Unadj',
       'CardSort_AgeAdj', 'CardSort_Unadj',
       ...
       'dMRI_3T_ReconVrs', 'dMRI_Compl', 'fMRI_3T_ReconVrs', 'fMRI_Emo_Compl',
       'fMRI_Gamb_Compl', 'fMRI_Lang_Compl', 'fMRI_Mot_Compl',
       'fMRI_Rel_Compl', 'fMRI_Soc_Compl', 'fMRI_WM_Compl'],
      dtype='object', length=337)