In [1]:
#!/usr/bin/python3

# ***PURPOSE***
# This script generates the vars.txt file (which is a subject x Subject Measures matrix) used by Smith et al. in their analysis of the HCP_500 data
# See reference: https://www.fmrib.ox.ac.uk/datasets/HCP-CCA/

# ***USAGE***
# Multiple files are needed:
# 1. a .txt file containing the names of the subject measures (SMs) to be used in the analysis
# 2. a .txt file containing the names of all subjects to be analyzed (their subject IDs)
# 3. the behavioral data from HCP
# 4. the 'restricted' data from HCP (requires special access, must request this)
# 5. the rfMRI_motion.txt file
# 6. the quarter/release info file (named varsQconf.txt)

# ***NOTE***
# Files 1, 2, 5, and 6 are included in our GitHub repo (named subject_measure_names.txt, subject_ids.txt, rfMRI_motion.txt, and varsQconf.txt, respectively)

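# ***EXAMPLE USAGE ON CMD LINE***
# ./generate_vars.py column_headers.txt subjects.txt <behavioral data> <restricted data> rfMRI_motion.txt varsQconf.txt
# NOTE: the code below does not read command-line arguments; it loads the six files from the hard-coded 'inputs' folder paths defined after the imports.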

import numpy as np
import pandas as pd
from pandas import DataFrame
from numpy import genfromtxt
import os
import sys
from pprint import pprint

# Locate the data folders relative to the current working directory.
# The original expression concatenated the *string* "__file__" (not the __file__
# variable) with "/../../inputs"; after normalisation that dummy component cancels
# against one "..", leaving <cwd>/../inputs. The same location is spelled out
# explicitly here so the intent is visible.
cwd = os.getcwd()
inputs = os.path.abspath(os.path.join(cwd, os.pardir, 'inputs'))
outputs = os.path.abspath(os.path.join(cwd, os.pardir, 'outputs')) # NOTE CHANGE THIS TO YOUR DESIRED OUTPUT PATH!

# Full paths of the six input files, all expected inside the 'inputs' folder.
column_headers_fp = os.path.join(inputs, 'subject_measure_names.txt')
subject_ids_fp = os.path.join(inputs, 'subject_ids.txt')
behavioral_data_fp = os.path.join(inputs, 'unrestricted_500_release.csv')
restricted_data_fp = os.path.join(inputs, 'restricted_500_release.csv')
rfMRI_data_fp = os.path.join(inputs, 'rfMRI_motion.txt')
varsQconf_fp = os.path.join(inputs, 'varsQconf.txt')

# get the column headers, and names of subjects
# Both files are read one entry per line inside `with` blocks so the file handles
# are closed as soon as the lists are built.
with open(os.path.join(cwd, column_headers_fp)) as column_headers_file:
    column_headers = [line.rstrip('\n') for line in column_headers_file]

# Subject lines presumably look like '<id>.pconn.nii' (or just '<id>') -- verify
# against subject_ids.txt. str.rstrip() takes a *set of characters*, not a suffix,
# so rstrip('.pconn.nii\n') would also eat any trailing '.', 'p', 'c', 'o', 'n' or
# 'i' that belongs to the ID itself. Remove the exact suffix instead.
SUBJECT_FILE_SUFFIX = '.pconn.nii'
subjects = []
with open(os.path.join(cwd, subject_ids_fp)) as subject_ids_file:
    for line in subject_ids_file:
        subject = line.strip()
        if subject.endswith(SUBJECT_FILE_SUFFIX):
            subject = subject[:-len(SUBJECT_FILE_SUFFIX)]
        if subject:  # ignore blank lines
            subjects.append(subject)

# now import "behavioral" and "restricted" datasets into Pandas dataframes
behavioral_data = pd.read_csv(behavioral_data_fp)
restricted_data = pd.read_csv(restricted_data_fp)


# Now we will filter out only the rows that correspond to the subjects specified in subjects.txt
# Sanity check, making sure that the filtering occurs correctly
print('behavior shape before', behavioral_data.shape)
print('shape of restricted before', restricted_data.shape)

# Filter the behavioral and restricted datasets down to the listed subjects.
# `subjects` holds strings read from a text file, while read_csv normally parses the
# 'Subject' column as integers. Recent pandas versions do not treat the string
# '100307' as equal to the integer 100307 inside isin(), which would silently drop
# every row, so the comparison is done on the string form of the column.
behavioral_data = behavioral_data[behavioral_data['Subject'].astype(str).isin(subjects)]
restricted_data = restricted_data[restricted_data['Subject'].astype(str).isin(subjects)]

print('behavior shape after', behavioral_data.shape)
print('shape of restricted after', restricted_data.shape)

behavior shape before (527, 338)
shape of restricted before (542, 198)
behavior shape after (461, 338)
shape of restricted after (461, 198)


In [2]:
# Load the two remaining text inputs:
#   * the rfMRI motion file (space-separated), its data column labelled 'rfmri_motion'
#   * the quarter/release file (varsQconf), its single column labelled 'quarter/release'
rfmri = pd.read_csv(
    rfMRI_data_fp,
    sep=" ",
    names=['rfmri_motion'],
)
varsqconf = pd.read_csv(
    varsQconf_fp,
    names=['quarter/release'],
)

In [3]:
# Give varsqconf the same row labels as rfmri by overwriting its index in place.
# This is a positional relabel (row i of varsqconf receives label i of rfmri), not an
# alignment on values, so it relies on both files listing subjects in the same order
# -- TODO confirm whenever the input files change.
# NOTE(review): rfmri's index presumably holds the subject IDs -- verify against
# rfMRI_motion.txt.
varsqconf.index = rfmri.index

In [4]:
# Put the motion column and the quarter/release column side by side in one frame;
# this combined frame is joined with the other measures further down.
rfmri_varsqconf = pd.concat(
    [rfmri, varsqconf],
    axis='columns',
)

In [5]:
# Build lower-cased versions of all three header lists so the requested measure
# names can be compared with the dataset column names regardless of capitalisation.
column_headers = [name.lower() for name in column_headers]
behav_headers = [name.lower() for name in behavioral_data.columns.values]
restrict_headers = [name.lower() for name in restricted_data.columns.values]

In [6]:
# Find which of the requested (lower-cased) measure names are present in each
# dataset. np.intersect1d returns the common names as a sorted array without
# duplicates.
overlap_in_restrict = np.intersect1d(
    column_headers,
    restrict_headers,
)
overlap_in_behav = np.intersect1d(
    column_headers,
    behav_headers,
)

In [7]:
# Step 1: rename the columns of both datasets to lower case (in place) so that the
# lower-cased overlap names are valid column labels.
restricted_data.columns = restricted_data.columns.str.lower()
behavioral_data.columns = behavioral_data.columns.str.lower()

# Step 2: keep only the requested measures found in each dataset.
restricted_data_filtered_cols = restricted_data[overlap_in_restrict]
behavioral_data_filtered_cols = behavioral_data[overlap_in_behav]

In [8]:
# Show the (rows, columns) shape of every frame that is about to be concatenated;
# the row counts should all agree before the frames are combined.
for frame_to_check in (
    behavioral_data_filtered_cols,
    restricted_data_filtered_cols,
    rfmri_varsqconf,
):
    print(frame_to_check.shape)

(461, 285)
(461, 189)
(461, 2)


In [9]:
# Combine the behavioral, restricted, and rfMRI/quarter data into one wide frame.

# First give both filtered frames the row labels of rfmri_varsqconf.
# These assignments overwrite the index in place and purely by position: row i of
# each frame receives label i of rfmri_varsqconf, and nothing is matched on subject
# ID. A length mismatch raises an error; a different row order would not be detected.
# NOTE(review): this assumes the CSV rows and the rfMRI file list the subjects in
# the same order -- TODO confirm for any new release of the input files.
behavioral_data_filtered_cols.index = rfmri_varsqconf.index
restricted_data_filtered_cols.index = rfmri_varsqconf.index

# Column-wise concatenation on the (now shared) index.
# NOTE: the name `vars` shadows Python's built-in vars() from this point on.
vars = pd.concat([behavioral_data_filtered_cols, restricted_data_filtered_cols, rfmri_varsqconf], axis = 1)

In [11]:
# Put the columns in the order given by column_headers.
# reindex() keeps only the requested names and silently creates an all-NaN column
# for every requested name that is absent from `vars`. List those names first so the
# gap is visible here instead of being discovered later in the output file.
missing_measures = [name for name in column_headers if name not in vars.columns]
if missing_measures:
    print('WARNING: requested subject measures not found in the input data '
          '(these columns will be all NaN):', missing_measures)
vars = vars.reindex(columns = column_headers)

In [13]:
# Display the assembled subject x measure matrix for a visual check
# (the rendered table is abbreviated with "..." for large frames).
vars

Unnamed: 0,subject id,quarter/release,sex,age,handedness,race,rfmri_motion,ssaga_employ,ssaga_income,ssaga_educ,...,neofac_e,noise_comp,odor_unadj,odor_ageadj,paininterf_tscore,taste_unadj,taste_ageadj,mars_log_score,mars_errs,mars_final
100307,,0.0,,26-30,95,White,0.065499,2.0,7.0,16.0,...,37.0,3.6,101.12,86.45,38.6,71.69,71.76,1.76,0.0,1.76
100408,,0.0,,31-35,55,White,0.098191,2.0,7.0,16.0,...,33.0,2.0,108.79,98.04,52.6,114.01,113.59,1.76,2.0,1.68
101006,,1.0,,31-35,90,Black or African Am.,0.086306,2.0,3.0,12.0,...,29.0,6.0,122.25,111.41,38.6,123.80,123.31,1.80,0.0,1.80
101107,,1.0,,22-25,5,White,0.100864,2.0,3.0,12.0,...,28.0,6.8,108.79,97.19,50.1,134.65,131.38,1.84,0.0,1.84
101309,,1.0,,26-30,45,Black or African Am.,0.059464,2.0,3.0,16.0,...,26.0,5.2,122.25,110.45,38.6,106.39,104.39,1.80,0.0,1.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984472,,0.0,,26-30,100,Asian/Nat. Hawaiian/Othr Pacific Is.,0.094989,0.0,1.0,16.0,...,25.0,2.8,122.25,110.45,38.6,108.73,108.00,1.76,0.0,1.76
987983,,1.0,,26-30,70,White,0.054518,2.0,7.0,16.0,...,34.0,5.2,108.79,97.19,56.4,88.02,87.70,1.88,0.0,1.88
991267,,1.0,,26-30,75,White,0.083035,2.0,7.0,14.0,...,27.0,2.8,122.25,111.41,46.6,83.23,81.55,1.84,0.0,1.84
992774,,0.0,,31-35,100,White,0.071538,0.0,3.0,12.0,...,32.0,8.4,122.25,111.41,50.1,107.17,103.55,1.76,0.0,1.76


In [12]:
# output vars.txt to the 'outputs' folder
# to_csv does not create missing directories, so make sure the folder exists first
# (a no-op when it is already there).
os.makedirs(outputs, exist_ok=True)
vars.to_csv(os.path.join(outputs, "vars_500_release_test.txt"))