In [2]:
import matplotlib.pylab as plt
%matplotlib inline
import numpy as np
import os
import pandas as pd
from scipy.stats import linregress
import nibabel as nib
import urllib
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook')
from __future__ import division

In [3]:
behav_data_f = '../../Phenotypic_V1_0b_preprocessed1.csv'
df = pd.read_csv(behav_data_f)
df = df.loc[df['func_perc_fd'].notnull(), :]
df = df.loc[df['FILE_ID']!='no_filename', :]
df['AGE_YRS'] = np.floor(df['AGE_AT_SCAN'])

## Correlate each vertex with age

In [5]:
def split_two_matched_samples(df, motion_thresh, age_l, age_u, n):
    """
    This function takes in a data frame, thresholds it to only include
    participants whose percentage bad frames are less than motion_thresh
    and participants who are between the lower and upper age limits (inclusive),
    then returns two matched samples of size n. The samples are matched on
    age in years, autism diagnosis, gender and scanning site. This function also selectively samples the
    func_perc_fd
    Information about the motion measure is here:
    http://preprocessed-connectomes-project.org/quality-assessment-protocol/
    """
    
    # Start by removing all participants whose data is below a certain
    # motion threshold.
    df_samp_motion =  df.loc[df['func_perc_fd'] < motion_thresh, :]

    # Then remove participants who are younger (in years) than age_l and older
    # than age_u. Note that this means people who are age_l and age_u
    # (eg 6 and 10) will be included in the sample.
    df_samp = df_samp_motion.loc[(df_samp_motion['AGE_YRS']>=age_l)
                                    & (df_samp_motion['AGE_YRS']<=age_u), :]
                                    
    ##sort subjects based on motion
    sort_column_list = ['func_perc_fd']
    df_motion_sorted = df_samp.sort_values(by=sort_column_list)
    
    ##rank subjects by motion
    r=range(len(df_motion_sorted))
    r_df=pd.DataFrame(r)
    r_df.columns = ['rank']
    r_df['newcol'] = df_motion_sorted.index
    r_df.set_index('newcol', inplace=True)
    r_df.index.names = [None]
    df_motion_sorted_rank=pd.concat ([r_df,df_motion_sorted], axis=1)
    
    ##create bins of subjects in quartiles
    l=len(df_motion_sorted_rank)
    chunk=l/4
    chunk1=chunk
    chunk2=2*chunk
    chunk3=3*chunk
    chunk4=l
    
    first=df_motion_sorted_rank[df_motion_sorted_rank['rank']<=chunk1]
    second=df_motion_sorted_rank[(df_motion_sorted_rank['rank']>chunk1) & (df_motion_sorted_rank['rank']<=chunk2)]
    third=df_motion_sorted_rank[(df_motion_sorted_rank['rank']>chunk2) & (df_motion_sorted_rank['rank']<=chunk3)]
    fourth=df_motion_sorted_rank[df_motion_sorted_rank['rank']>=chunk3]
    
    ##take 2n/4 from each bin
    n_samp=(n*2)/4
    n_samp

    # Shuffle these remaining participants to ensure you get different sub
    # samples each time you run the code.
    first_rand = first.reindex(np.random.permutation(first.index))
    second_rand = second.reindex(np.random.permutation(second.index))
    third_rand = third.reindex(np.random.permutation(third.index))
    fourth_rand = fourth.reindex(np.random.permutation(fourth.index))

    # Only keep the top 2*n/4 participants.
    first_samp_2n = first_rand.iloc[:n_samp, :]
    second_samp_2n = second_rand.iloc[:n_samp, :]
    third_samp_2n = third_rand.iloc[:n_samp, :]
    fourth_samp_2n = fourth_rand.iloc[:n_samp, :]
    
    #append these together
    frames = [first_samp_2n, second_samp_2n, third_samp_2n,fourth_samp_2n]
    final_df = pd.concat(frames)

    # Sort these participants according to the sort columns of interest
    sort_column_list = ['DSM_IV_TR', 'DX_GROUP', 'SITE_ID', 'SEX', 'AGE_YRS']
    df_samp_2n_sorted = final_df.sort_values(by=sort_column_list)

    # Now put all even numbered participants in group A and all odd numbered
    # participants in group B.
    df_grp_A = df_samp_2n_sorted.iloc[::2, :]
    df_grp_B = df_samp_2n_sorted.iloc[1::2, :]

    # Boom! Return these two data frames
    return df_grp_A, df_grp_B

In [6]:
df_A, df_B = split_two_matched_samples(df, 50, 6, 18, 100)
print df_A[['AGE_AT_SCAN', 'DX_GROUP', 'SEX']].describe()
print df_B[['AGE_AT_SCAN', 'DX_GROUP', 'SEX']].describe()

       AGE_AT_SCAN    DX_GROUP        SEX
count   100.000000  100.000000  100.00000
mean     13.601489    1.500000    1.21000
std       2.908456    0.502519    0.40936
min       7.933600    1.000000    1.00000
25%      10.980000    1.000000    1.00000
50%      13.985000    1.500000    1.00000
75%      15.927500    2.000000    1.00000
max      18.900000    2.000000    2.00000
       AGE_AT_SCAN    DX_GROUP        SEX
count   100.000000  100.000000  100.00000
mean     13.097167    1.510000    1.21000
std       3.031123    0.502418    0.40936
min       7.228000    1.000000    1.00000
25%      10.737500    1.000000    1.00000
50%      13.090000    2.000000    1.00000
75%      15.547500    2.000000    1.00000
max      18.900000    2.000000    2.00000


  return self._getitem_tuple(key)


In [25]:
ts_df = pd.read_table('DATA/Caltech_0051456_rois_aal.1D')
corr_mat_r = ts_df.corr()
corr_mat_z = np.arctanh(corr_mat_r)

In [None]:
#loop through correlation matrix and correlate each value with age

In [8]:
def make_group_corr_mat(df):
    """
    This function reads in each subject's aal roi time series files and creates roi-roi correlation matrices
    for each subject and then sums them all together. The final output is a 3d matrix of all subjects 
    roi-roi correlations, a mean roi-roi correlation matrix and a roi-roi covariance matrix. 
    **NOTE WELL** This returns correlations transformed by the Fisher z, aka arctanh, function.    
    """

    # for each subject do the following
    
    for i, (sub, f_id, age) in enumerate(df[['SUB_ID', 'FILE_ID', 'AGE_AT_SCAN']].values):
        
        #read each subjects aal roi time series files
        ts_df = pd.read_table('DATA/{}_rois_aal.1D'.format(f_id))

        #create a correlation matrix from the roi all time series files
        corr_mat_r = ts_df.corr()
        #the correlations need to be transformed to Fisher z, which is
        #equivalent to the arctanh function.
        corr_mat_z = np.arctanh(corr_mat_r)
        
        #for the first subject, add a correlation matrix of zeros that is the same dimensions as the aal roi-roi matrix
        if i == 0:
            all_corr_mat = np.zeros([corr_mat_z.shape[0], corr_mat_z.shape[1], len(df)])

        #now add the correlation matrix you just created for each subject to the all_corr_mat matrix (3D)
        all_corr_mat[:, :, i] = corr_mat_z
    
    #create the mean correlation matrix (ignore nas - sometime there are some...)
    av_corr_mat = np.nanmean(all_corr_mat, axis=2)
    #create the group covariance matrix (ignore nas - sometime there are some...)
    var_corr_mat = np.nanvar(all_corr_mat, axis=2)
        
    return all_corr_mat, av_corr_mat, var_corr_mat

In [38]:
all_corr_mat_A, av_corr_mat_A, var_corr_mat_A = make_group_corr_mat(df_A)


In [39]:
all_corr_mat_A.shape

(116, 116, 100)

In [33]:
## For each subject we have roi-roi correlation matrix and one age value.
#Then create 116x116 matrix of age value per subject (in same 3d array)
#Then correlate age matrix with roi-roi matrix
#put age - roiroi matrixes into 1 line

In [106]:
age=df_A.loc[:, 'AGE_AT_SCAN']
age_df=pd.DataFrame(age)
age_roi_corr =np.ones((116,116))
age_roi_corr.shape ##creating the output I think
age_df.head()
#need to make the index 0-100


Unnamed: 0,AGE_AT_SCAN
101,9.42
95,10.22
127,15.4
838,7.9336
848,10.8652


In [107]:
r=range(len(age))
r_df=pd.DataFrame(r)
r_df.columns = ['rank']
r_df['newcol'] = age_df.index
r_df.set_index('newcol', inplace=True)
r_df.index.names = [None]
#r_df['age']=age_df['AGE_AT_SCAN']
r_df.head(3)

Unnamed: 0,rank
101,0
95,1
127,2


In [110]:
corr_dat=pd.DataFrame(all_corr_mat_A[1,2,:]) #length of matrix
corr_dat["age"]=age
corr_dat.corr()["age"][0]
corr_dat.head(20)


Unnamed: 0,0,age
0,0.058755,
1,0.912022,
2,0.354744,
3,0.150852,
4,0.747548,
5,1.074216,
6,0.411452,
7,0.427048,
8,0.596651,
9,0.587405,


In [54]:
#age_roi_corr[r,rn]=corr_dat.corr()

In [55]:
for  r in range(all_corr_mat_A.shape[0]):
            for rn in range(all_corr_mat_A.shape[0]):
                if rn != r:
                    corr_dat=pd.DataFrame(all_corr_mat_A[r,rn,:])
                    corr_dat["age"]=age
                    age_roi_corr[r,rn]=corr_dat.corr()["age"][0]
                    #motion_roi_corr[r,rn] = np.arctanh(np.corrcoef(corr_dat,motion)[0,1]) 

In [57]:
age_roi_corr

array([[ 1.        , -0.05788752, -0.27590469, ..., -0.5725525 ,
        -0.8222929 , -0.66760981],
       [-0.05788752,  1.        , -0.19144147, ..., -0.72691603,
        -0.81954384, -0.65546931],
       [-0.27590469, -0.19144147,  1.        , ..., -0.38396543,
        -0.71823665, -0.54922236],
       ..., 
       [-0.5725525 , -0.72691603, -0.38396543, ...,  1.        ,
        -0.50417864, -0.02371075],
       [-0.8222929 , -0.81954384, -0.71823665, ..., -0.50417864,
         1.        ,  0.1239588 ],
       [-0.66760981, -0.65546931, -0.54922236, ..., -0.02371075,
         0.1239588 ,  1.        ]])