In [87]:
from __future__ import division
import matplotlib.pylab as plt
%matplotlib inline
import numpy as np
import os
import pandas as pd
from scipy.stats import linregress
import nibabel as nib
import urllib
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook')
import scipy 

In [7]:
# Path to the phenotypic / behavioural spreadsheet.
behav_data_f = '../../Phenotypic_V1_0b_preprocessed1.csv'
df_raw = pd.read_csv(behav_data_f)

# Keep only participants that have a motion measure and a usable file name.
# The explicit .copy() makes `df` an independent frame so that adding a
# column below does not trigger pandas' SettingWithCopyWarning.
has_motion_and_file = (df_raw['func_perc_fd'].notnull()
                       & (df_raw['FILE_ID'] != 'no_filename'))
df = df_raw.loc[has_motion_and_file, :].copy()

# Age in whole years (floor of the age at scan).
df['AGE_YRS'] = np.floor(df['AGE_AT_SCAN'])

## Correlate each vertex with age

In [17]:
def split_two_matched_samples(df, motion_thresh, age_l, age_u, n, seed=None):
    """
    Draw two matched sub-samples of participants, stratified by motion.

    The data frame is restricted to participants whose percentage of bad
    frames (``func_perc_fd``) is strictly less than ``motion_thresh`` and
    whose age in whole years (``AGE_YRS``) lies between ``age_l`` and
    ``age_u`` (both inclusive). The remaining participants are ranked by
    ``func_perc_fd`` and cut into four non-overlapping quartile bins, and
    ``int(2 * n / 4)`` participants are drawn at random from each bin so
    that the pooled sample covers the whole motion range. The pooled
    sample is sorted by ``DSM_IV_TR``, ``DX_GROUP``, ``SITE_ID``, ``SEX``
    and ``AGE_YRS`` and then dealt alternately into two groups, which
    matches the groups on those columns.

    Information about the motion measure is here:
    http://preprocessed-connectomes-project.org/quality-assessment-protocol/

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``func_perc_fd``, ``AGE_YRS``,
        ``DSM_IV_TR``, ``DX_GROUP``, ``SITE_ID`` and ``SEX``.
    motion_thresh : number
        Exclusive upper limit on ``func_perc_fd``.
    age_l, age_u : number
        Inclusive lower / upper limits on ``AGE_YRS``.
    n : int
        Requested size of each group. Each group has ``n`` rows only when
        ``n`` is even and every quartile bin holds at least ``n / 2``
        participants; otherwise the groups are smaller.
    seed : int or None, optional
        When given, the shuffling uses its own ``RandomState(seed)`` so the
        split is reproducible. When ``None`` (the default) the global
        ``np.random`` state is used, as before.

    Returns
    -------
    (df_grp_A, df_grp_B) : tuple of pandas.DataFrame
        Rows of ``df`` with an extra leading ``rank`` column holding the
        zero-based motion rank within the thresholded sample.
    """
    # Keep the participants whose motion is below the threshold.
    low_motion = df.loc[df['func_perc_fd'] < motion_thresh, :]

    # Keep the participants inside the age range. Both limits are inclusive,
    # so with limits 6 and 10 people aged 6 and 10 stay in the sample.
    in_age_range = ((low_motion['AGE_YRS'] >= age_l)
                    & (low_motion['AGE_YRS'] <= age_u))
    df_samp = low_motion.loc[in_age_range, :]

    # Sort by motion and record the zero-based rank as the first column.
    df_ranked = df_samp.sort_values(by=['func_perc_fd']).copy()
    n_subs = len(df_ranked)
    df_ranked.insert(0, 'rank', np.arange(n_subs, dtype='int64'))

    # Cut the ranks into quartiles using half-open intervals [lo, hi) so
    # that every participant falls into exactly one bin. Closed intervals
    # on both sides would place the boundary rank in two bins and allow the
    # same participant to be sampled twice.
    chunk = n_subs / 4.0
    rank = df_ranked['rank']
    quartiles = [
        df_ranked[rank < chunk],
        df_ranked[(rank >= chunk) & (rank < 2 * chunk)],
        df_ranked[(rank >= 2 * chunk) & (rank < 3 * chunk)],
        df_ranked[rank >= 3 * chunk],
    ]

    # Number of participants to take from each bin (2n in total).
    n_samp = int((n * 2) / 4.0)

    # Shuffle each bin so repeated calls give different sub-samples (unless
    # a seed is supplied), then keep the first n_samp rows of each.
    rng = np.random if seed is None else np.random.RandomState(seed)
    sampled = []
    for quartile in quartiles:
        shuffled = quartile.reindex(rng.permutation(quartile.index))
        sampled.append(shuffled.iloc[:n_samp, :])
    final_df = pd.concat(sampled)

    # Sort on the matching variables so that neighbouring rows are as
    # similar as possible.
    sort_column_list = ['DSM_IV_TR', 'DX_GROUP', 'SITE_ID', 'SEX', 'AGE_YRS']
    df_samp_2n_sorted = final_df.sort_values(by=sort_column_list)

    # Deal the rows alternately: even positions to group A, odd to group B.
    df_grp_A = df_samp_2n_sorted.iloc[::2, :]
    df_grp_B = df_samp_2n_sorted.iloc[1::2, :]

    return df_grp_A, df_grp_B

In [19]:
# Draw two matched groups (motion threshold 50, ages 6-18, 100 per group)
# and compare their age, diagnosis and sex distributions.
df_A, df_B = split_two_matched_samples(df, 50, 6, 18, 100)
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(df_A[['AGE_AT_SCAN', 'DX_GROUP', 'SEX']].describe())
print(df_B[['AGE_AT_SCAN', 'DX_GROUP', 'SEX']].describe())

       AGE_AT_SCAN  DX_GROUP         SEX
count   100.000000    100.00  100.000000
mean     13.321114      1.55    1.180000
std       2.852127      0.50    0.386123
min       7.290000      1.00    1.000000
25%      11.000000      1.00    1.000000
50%      13.265000      2.00    1.000000
75%      15.363000      2.00    1.000000
max      18.900000      2.00    2.000000
       AGE_AT_SCAN  DX_GROUP         SEX
count   100.000000    100.00  100.000000
mean     13.032420      1.55    1.160000
std       2.698831      0.50    0.368453
min       7.190000      1.00    1.000000
25%      11.003750      1.00    1.000000
50%      13.145000      2.00    1.000000
75%      14.872500      2.00    1.000000
max      18.658500      2.00    2.000000


In [20]:
# Try the per-subject pipeline on one time series file: read the table,
# correlate every pair of columns, then apply arctanh (Fisher z).
ts_df = pd.read_table('DATA/Caltech_0051456_rois_aal.1D')
corr_mat_r = ts_df.corr()
# NOTE(review): a column's correlation with itself is 1 and arctanh(1) is
# inf, so the diagonal presumably triggers the runtime warning whose tail is
# shown in this cell's output - confirm.
corr_mat_z = np.arctanh(corr_mat_r)


  app.launch_new_instance()


In [None]:
#loop through correlation matrix and correlate each value with age

In [21]:
def make_group_corr_mat(df):
    """
    Build Fisher z-transformed ROI-ROI correlation matrices for a group.

    For every row of ``df`` the tab-separated time series file
    ``DATA/<FILE_ID>_rois_aal.1D`` is read, the correlation between every
    pair of its columns is computed and the result is transformed with
    arctanh (the Fisher z transform). The per-subject matrices are stacked
    along a third axis; they are not summed.

    **NOTE WELL** All returned values are based on the z-transformed
    correlations, not on raw r values. Entries where r is exactly 1 (a
    column with itself) become inf under arctanh.

    NOTE: a later cell in this notebook defines a function with the same
    name but a different return signature; whichever cell ran last wins.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per subject; must contain a ``FILE_ID`` column.

    Returns
    -------
    all_corr_mat : numpy.ndarray, shape (n_roi, n_roi, n_subjects)
        The z-transformed correlation matrix of every subject.
    av_corr_mat : numpy.ndarray, shape (n_roi, n_roi)
        Mean across subjects, ignoring NaNs.
    var_corr_mat : numpy.ndarray, shape (n_roi, n_roi)
        Element-wise variance across subjects, ignoring NaNs (this is a
        variance per ROI pair, not a covariance matrix).

    Raises
    ------
    ValueError
        If ``df`` has no rows, because the matrix size is only known after
        the first file has been read.
    """
    file_ids = df['FILE_ID'].values
    if len(file_ids) == 0:
        raise ValueError('make_group_corr_mat needs at least one subject')

    all_corr_mat = None
    for i, f_id in enumerate(file_ids):
        # Read this subject's ROI time series (tab separated, one column
        # per ROI).
        ts_df = pd.read_csv('DATA/{}_rois_aal.1D'.format(f_id), sep='\t')

        # Correlate every pair of columns, then apply the Fisher z
        # transform, which is the arctanh function.
        corr_mat_z = np.arctanh(ts_df.corr().values)

        # The matrix size is only known once the first file is read, so
        # the 3D container is allocated on the first pass.
        if all_corr_mat is None:
            all_corr_mat = np.zeros([corr_mat_z.shape[0],
                                     corr_mat_z.shape[1],
                                     len(file_ids)])

        all_corr_mat[:, :, i] = corr_mat_z

    # Mean and variance across subjects (ignore NaNs - sometimes there are
    # some).
    av_corr_mat = np.nanmean(all_corr_mat, axis=2)
    var_corr_mat = np.nanvar(all_corr_mat, axis=2)

    return all_corr_mat, av_corr_mat, var_corr_mat

In [22]:
# Per-subject z matrices for group A, plus their mean and variance across
# subjects.
all_corr_mat_A, av_corr_mat_A, var_corr_mat_A = make_group_corr_mat(df_A)




In [23]:
# Check the stacked array's dimensions: ROI x ROI x subject.
all_corr_mat_A.shape

(116, 116, 100)

## For each subject we have an ROI-ROI correlation matrix and one age value.
### Then create a 116x116 matrix of age values per subject (in the same 3D array).
### Then correlate the age matrix with the ROI-ROI matrix.
### Put the age and ROI-ROI matrices into one line.


In [41]:
# Age at scan for every participant in group A.
age = df_A.loc[:, 'AGE_AT_SCAN']

# Single-column frame re-indexed 0..n-1 so that its rows line up by position
# with the subject axis of all_corr_mat_A.
age_df = age.to_frame().reset_index(drop=True)

# Output matrix: one age correlation per ROI pair, initialised to ones.
# Sized from the data instead of a hard-coded 116 x 116.
age_roi_corr = np.ones(all_corr_mat_A.shape[:2])

age_df.head(10)


Unnamed: 0,AGE_AT_SCAN
0,9.73
1,13.99
2,12.63
3,11.6982
4,11.2628
5,10.8652
6,9.15
7,9.7201
8,10.6927
9,7.7502


In [46]:
# z values of one ROI pair (row 1, column 2) across all subjects.
all_corr_mat_A[1,2,:]

array([ 0.65691784,  0.23128996,  0.69851264,  0.71805923,  0.74720872,
        0.74754804,  0.24581721,  0.86322516,  0.48608233,  0.43203345,
        0.85399245, -0.05374002,  0.38370457,  0.59347863,  0.34366852,
        0.65074167,  0.32499882,  0.4906399 ,  0.44083767,  0.41443276,
        0.16604599,  0.56843077,  0.31380302,  0.31325585,  0.56895794,
        0.40284926,  0.93789404,  0.48238168,  0.86762259,  0.26203163,
        0.00569282,  0.22160202,  0.15952851,  0.21377688,  0.55122523,
        0.23810659, -0.06483061,  0.45108884, -0.47212159,  0.54462073,
        0.71281875,  0.44090874,  0.31000203,  0.80029935,  0.50226682,
        0.46569185,  0.53294123,  0.35982994,  0.60877792,  0.15180832,
       -0.06773877,  0.24463589,  1.08214376,  0.79627112,  0.3293669 ,
        0.20421669,  0.88653539,  0.79559591,  0.54083736,  0.45755944,
        0.13855241,  0.46829449,  0.24857392,  0.92588276,  0.72724967,
        0.26526113,  0.1889162 ,  0.53164814,  0.42696645,  0.24

In [43]:
# Trial run for a single ROI pair before looping over all pairs.
# Column 0 holds the pair's z value for every subject.
corr_dat=pd.DataFrame(all_corr_mat_A[1,2,:]) # one row per subject
# Add age as a second column; age_df was re-indexed 0..n-1 so rows align.
corr_dat["age"]=age_df
# Correlation between column 0 and age. Only the last expression of a cell
# is displayed, so this value and the head() below are computed but not shown.
corr_dat.corr()["age"][0]
corr_dat.head(20)
corr_dat.shape

(100, 2)

In [50]:
#age_roi_corr[r,rn]=corr_dat.corr()

In [51]:
# For every off-diagonal ROI pair, correlate that pair's z values across
# subjects with age and store the coefficient. Diagonal cells are skipped
# and keep whatever value age_roi_corr was initialised with.
# TODO: the same loop could be repeated with motion in place of age.
for r in range(all_corr_mat_A.shape[0]):
    for rn in range(all_corr_mat_A.shape[0]):
        if rn == r:
            continue
        corr_dat = pd.DataFrame(all_corr_mat_A[r, rn, :])
        corr_dat["age"] = age_df
        age_roi_corr[r, rn] = corr_dat.corr()["age"][0]

In [54]:
# Confirm the result holds one value per ROI pair.
age_roi_corr.shape

(116, 116)

## make the new function

In [72]:
def make_group_corr_mat(df):
    """
    Build per-subject ROI-ROI matrices and correlate every ROI pair with age.

    For every row of ``df`` the tab-separated time series file
    ``DATA/<FILE_ID>_rois_aal.1D`` is read, the correlation between every
    pair of its columns is computed and the result is transformed with
    arctanh (the Fisher z transform). The per-subject matrices are stacked
    along a third axis. Then, for every off-diagonal ROI pair, that pair's
    z values across subjects are correlated with ``AGE_AT_SCAN``.

    **NOTE WELL** The per-subject matrices are z-transformed; the age
    correlations themselves are plain correlation coefficients.

    NOTE: this redefines the earlier ``make_group_corr_mat`` in this
    notebook, which returned three values (stack, mean, variance). The mean
    and variance are no longer computed here.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per subject; must contain ``FILE_ID`` and ``AGE_AT_SCAN``.

    Returns
    -------
    all_corr_mat : numpy.ndarray, shape (n_roi, n_roi, n_subjects)
        The z-transformed correlation matrix of every subject.
    age_roi_corr : numpy.ndarray, shape (n_roi, n_roi)
        Correlation of each ROI pair's z values with age. The diagonal is
        fixed at 1. A fresh array is created on every call, so results
        from different groups never share storage.

    Raises
    ------
    ValueError
        If ``df`` has no rows, because the matrix size is only known after
        the first file has been read.
    """
    file_ids = df['FILE_ID'].values
    if len(file_ids) == 0:
        raise ValueError('make_group_corr_mat needs at least one subject')

    all_corr_mat = None
    for i, f_id in enumerate(file_ids):
        # Read this subject's ROI time series (tab separated, one column
        # per ROI).
        ts_df = pd.read_csv('DATA/{}_rois_aal.1D'.format(f_id), sep='\t')

        # Correlate every pair of columns, then apply the Fisher z
        # transform, which is the arctanh function.
        corr_mat_z = np.arctanh(ts_df.corr().values)

        # The matrix size is only known once the first file is read, so
        # the 3D container is allocated on the first pass.
        if all_corr_mat is None:
            all_corr_mat = np.zeros([corr_mat_z.shape[0],
                                     corr_mat_z.shape[1],
                                     len(file_ids)])

        all_corr_mat[:, :, i] = corr_mat_z

    # Ages re-indexed 0..n-1 so they line up by position with the subject
    # axis of all_corr_mat.
    age = pd.Series(df['AGE_AT_SCAN'].values)

    # Local output matrix (previously a notebook-level global was written
    # to, which broke on a fresh kernel and overwrote earlier results).
    n_roi = all_corr_mat.shape[0]
    age_roi_corr = np.ones((n_roi, n_roi))

    # Each subject's matrix is symmetric, so the age correlation for
    # (r, rn) equals the one for (rn, r): compute the upper triangle once
    # and mirror it.
    for r in range(n_roi):
        for rn in range(r + 1, n_roi):
            pair_r = pd.Series(all_corr_mat[r, rn, :]).corr(age)
            age_roi_corr[r, rn] = pair_r
            age_roi_corr[rn, r] = pair_r

    return all_corr_mat, age_roi_corr

In [73]:
# Group B results go into *_B names so that group A's stacked array from the
# earlier cell is not overwritten with group B's data.
all_corr_mat_B, age_roi_mat_B = make_group_corr_mat(df_B)



In [74]:
# Display group B's ROI-pair-by-age correlation matrix.
age_roi_mat_B

array([[  1.00000000e+00,  -5.65776966e-04,  -3.65131334e-02, ...,
         -1.84722567e-01,  -2.62490568e-02,   1.69916609e-02],
       [ -5.65776966e-04,   1.00000000e+00,  -6.68782233e-02, ...,
         -2.44823388e-01,  -1.61156888e-01,  -1.34590782e-01],
       [ -3.65131334e-02,  -6.68782233e-02,   1.00000000e+00, ...,
         -7.31339317e-02,  -3.21894827e-03,  -3.22128755e-02],
       ..., 
       [ -1.84722567e-01,  -2.44823388e-01,  -7.31339317e-02, ...,
          1.00000000e+00,  -3.80152859e-02,  -1.65075016e-01],
       [ -2.62490568e-02,  -1.61156888e-01,  -3.21894827e-03, ...,
         -3.80152859e-02,   1.00000000e+00,  -4.54409080e-02],
       [  1.69916609e-02,  -1.34590782e-01,  -3.22128755e-02, ...,
         -1.65075016e-01,  -4.54409080e-02,   1.00000000e+00]])

In [91]:
# Draw a new pair of matched groups (motion threshold 40, ages 6-18, 50 per
# group). This replaces the df_A / df_B used by the cells above.
df_A, df_B = split_two_matched_samples(df, 40, 6, 18, 50)


## Do age and motion correlate in these split samples?

In [92]:
# Correlation between age at scan and the motion measure in group A.
age=df_A["AGE_AT_SCAN"]
motion=df_A["func_perc_fd"]
scipy.stats.pearsonr(age, motion) #returns r and p

(-0.058761134814792254, 0.68522374395478358)

In [86]:
# Same age-motion correlation via pandas, as a 2 x 2 table.
# NOTE(review): this cell's execution count (86) is lower than that of the
# re-split cell (91), so the output shown presumably comes from an earlier
# random sample and need not match the pearsonr value above - re-run to confirm.
df_A[["AGE_AT_SCAN","func_perc_fd"]].corr()

Unnamed: 0,AGE_AT_SCAN,func_perc_fd
AGE_AT_SCAN,1.0,-0.13244
func_perc_fd,-0.13244,1.0
