In [1]:
# Code used to generate the subject measures "vars" matrix for the HCP_1200 data
# NOTE: this 'vars.txt' file will NOT include all 478 SMs
# instead will just use the specific 158 that Smith et al. fed into their CCA analysis

# Files needed:
# 1. the unrestricted behavioral data
# 2. the 'restricted' dataset
# 3. list of subjects in the HCP_1200 release (1003 people)
# 4. list of the SMs to be used

import numpy as np
import pandas as pd
from pandas import DataFrame
from numpy import genfromtxt
import os
import sys
from pprint import pprint

# A notebook has no usable __file__, so the input/output folders are located
# relative to the current working directory: <cwd>/../inputs and <cwd>/../outputs.
cwd = os.getcwd()
inputs = os.path.abspath(os.path.join(cwd, os.pardir, "inputs"))
outputs = os.path.abspath(os.path.join(cwd, os.pardir, "outputs")) # NOTE CHANGE THIS TO YOUR DESIRED OUTPUT PATH!

subject_measures_fp = os.path.join(inputs, '158_SMs.txt')
subject_ids_fp = os.path.join(inputs, 'subjectIDs.txt')
behavioral_data_fp = os.path.join(inputs, 'unrestricted.csv')
restricted_data_fp = os.path.join(inputs, 'restricted.csv')
# rfMRI_data_fp = os.path.join(inputs, 'rfMRI_motion.txt')
# varsQconf_fp = os.path.join(inputs, 'varsQconf.txt')


# The measure names are read from the first row of a whitespace-separated file.
# sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True.
subject_measures = list(
    pd.read_csv(subject_measures_fp, sep=r'\s+', header=None).iloc[0, :]
)

# One subject ID per line, kept as strings. strip() also removes a stray '\r'
# from Windows line endings, and blank lines are skipped so they cannot turn
# into an empty subject ID. The file is closed as soon as it has been read.
with open(subject_ids_fp) as subject_ids_file:
    subjects = [line.strip() for line in subject_ids_file if line.strip()]

In [2]:
# Import "behavioral" (unrestricted) and "restricted" datasets into Pandas dataframes
behavioral_data = pd.read_csv(behavioral_data_fp)
restricted_data = pd.read_csv(restricted_data_fp)

# Sanity check: print the shapes before and after filtering so that it is
# visible how many rows the subject filter keeps
print('behavior shape before', behavioral_data.shape)
print('shape of restricted before', restricted_data.shape)

# Keep only the rows whose Subject ID is listed in the subject-ID file.
# The IDs read from that file are strings, while read_csv may parse the
# 'Subject' column as integers; both sides are therefore compared as strings
# (stray whitespace removed) so the match does not depend on the pandas version.
wanted_subject_ids = {str(subject_id).strip() for subject_id in subjects}
behavioral_data = behavioral_data[
    behavioral_data['Subject'].astype(str).isin(wanted_subject_ids)
]
restricted_data = restricted_data[
    restricted_data['Subject'].astype(str).isin(wanted_subject_ids)
]

print('behavior shape after', behavioral_data.shape)
print('shape of restricted after', restricted_data.shape)

behavior shape before (1206, 582)
shape of restricted before (1206, 201)
behavior shape after (1003, 582)
shape of restricted after (1003, 201)


In [6]:
# Normalise every name to lowercase so that the requested measures can be
# matched against the column headers regardless of capitalisation.
behavioral_data.columns = behavioral_data.columns.str.lower()
restricted_data.columns = restricted_data.columns.str.lower()

# Plain Python lists of the (now lowercase) column headers of each table
behav_headers = behavioral_data.columns.tolist()
restrict_headers = restricted_data.columns.tolist()

# The requested subject measures, lowercased the same way
subject_measures = [measure.lower() for measure in subject_measures]

In [7]:
# Work out which of the requested measures are available in each table
# (all names were lowercased beforehand, so the comparison is case-insensitive).
overlap_in_behav = np.intersect1d(subject_measures, behav_headers)
# A measure that exists in BOTH files is taken from the unrestricted file only.
# Otherwise the combined table would carry two columns with the same label and
# the later reindex over the measure names would fail on the duplicate labels.
overlap_in_restrict = np.setdiff1d(
    np.intersect1d(subject_measures, restrict_headers),
    overlap_in_behav,
)

In [18]:
# Keep, from each table, only the columns that are requested subject measures
behavioral_data_filtered = behavioral_data.loc[:, overlap_in_behav]
restricted_data_filtered = restricted_data.loc[:, overlap_in_restrict]

In [20]:
# (rows, columns) of the selection taken from the unrestricted file:
# rows are the kept subjects, columns the requested measures found there
behavioral_data_filtered.shape

(1003, 61)

In [21]:
# (rows, columns) of the selection taken from the restricted file:
# rows are the kept subjects, columns the requested measures found there
restricted_data_filtered.shape

(1003, 84)

In [22]:
# Total number of requested measures located so far, computed from the two
# selections themselves instead of typing the column counts in by hand
behavioral_data_filtered.shape[1] + restricted_data_filtered.shape[1]

145

In [24]:
# Fewer measures were found than were requested (158 - 145 = 13), so list the
# requested names that are absent from each table ...
missing_in_restrict = np.setdiff1d(subject_measures, restrict_headers)
missing_in_behav = np.setdiff1d(subject_measures, behav_headers)
# ... and the ones that appear in neither table, i.e. not in the combined
# set of headers from both files
missing_in_behav_and_restrict = np.setdiff1d(
    subject_measures,
    list(behav_headers) + list(restrict_headers),
)

In [29]:
# Requested measures that are present in neither the unrestricted nor the
# restricted table (in the recorded output below, every name ends in '_pct').
# Their origin is unknown, so they will just have to be blank columns in the
# final 'vars' matrix.
missing_in_behav_and_restrict

array(['asr_aggr_pct', 'asr_attn_pct', 'asr_intr_pct', 'asr_rule_pct',
       'asr_soma_pct', 'asr_thot_pct', 'asr_witd_pct', 'dsm_adh_pct',
       'dsm_antis_pct', 'dsm_anxi_pct', 'dsm_avoid_pct', 'dsm_depr_pct',
       'dsm_somp_pct'], dtype='<U34')

In [31]:
def _rows_by_subject(full_table, measure_columns, subject_order):
    """Return the chosen measure columns with one row per subject, in a fixed order.

    Parameters
    ----------
    full_table : DataFrame that contains a 'subject' column (lowercase header).
    measure_columns : labels of the columns to keep.
    subject_order : list of subject IDs (strings) giving the row order of the result.

    Returns
    -------
    DataFrame indexed by the IDs in ``subject_order``, in that order.

    Raises
    ------
    ValueError if an ID in ``subject_order`` has no row in ``full_table``.
    """
    selected = full_table[measure_columns].copy()
    # Label each row with the ID stored in that same row. Writing the ID list
    # straight onto the index would only be right if the table happened to be
    # sorted exactly like the list.
    selected.index = full_table['subject'].astype(str).values
    absent = [subject_id for subject_id in subject_order if subject_id not in selected.index]
    if absent:
        raise ValueError(
            "%d subject(s) have no row in the table, e.g. %s" % (len(absent), absent[:5])
        )
    return selected.reindex(subject_order)


# Put both tables on the same subject-labelled index (in the order of
# `subjects`) so that concat lines the rows up by subject, not by position.
behavioral_data_filtered = _rows_by_subject(behavioral_data, overlap_in_behav, subjects)
restricted_data_filtered = _rows_by_subject(restricted_data, overlap_in_restrict, subjects)

# Join the two tables column-wise.
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# because the following cells refer to it.
vars = pd.concat([behavioral_data_filtered, restricted_data_filtered], axis = 1)

In [32]:
# (rows, columns) of the combined table: one row per subject and one column
# per requested measure that was found in either file
vars.shape

(1003, 145)

In [35]:
# Arrange the columns in the order of the requested measure list; measures
# that were found in neither file become empty (NaN) columns.
vars = vars.reindex(subject_measures, axis="columns")

In [36]:
# (rows, columns) after reindexing: the column count now equals the number of
# requested measures, including the ones that could not be found
vars.shape

(1003, 158)

In [37]:
# output vars.txt to the 'outputs' folder
# (the folder is created first, so the export does not fail when it is missing)
os.makedirs(outputs, exist_ok=True)
vars.to_csv(os.path.join(outputs, "vars.txt"))