In [1]:
import matplotlib.pylab as plt
%matplotlib inline
import numpy as np
import os
import pandas as pd
from scipy.stats import linregress
import nibabel as nib
import urllib
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook')
from __future__ import division

The minimum supported version is 2.1



In [3]:
# Load the phenotypic table and keep only rows usable for the motion analysis.
behav_data_f = '../../Phenotypic_V1_0b_preprocessed1.csv'
df = pd.read_csv(behav_data_f)

# Drop rows with no motion measure (func_perc_fd) and rows flagged as having
# no associated file. Take an explicit copy so that adding a column below
# modifies an independent frame rather than a slice of the raw table
# (avoids pandas' SettingWithCopyWarning).
df = df.loc[df['func_perc_fd'].notnull()
            & (df['FILE_ID'] != 'no_filename'), :].copy()

# Age in whole years (age at scan rounded down), used for age-range filtering.
df['AGE_YRS'] = np.floor(df['AGE_AT_SCAN'])

## Create a function that tests motion bins - so each sample contains only high, medium, or low motion participants

### The idea here is that perhaps motion really matters if you have a homogeneous high-motion group, rather than one saturated by participants with little or no motion

In [12]:
def split_two_matched_samples(df, motion_thresh, age_l, age_u, n, motion_band=10):
    """
    Draw two roughly matched random samples from a single motion band.

    The data frame is restricted to participants whose percentage of bad
    frames (``func_perc_fd``) lies in the closed interval
    ``[motion_thresh - motion_band, motion_thresh]`` and whose age in years
    (``AGE_YRS``) lies in the closed interval ``[age_l, age_u]``. Up to
    ``2 * n`` of the remaining participants are picked at random, sorted by
    the matching columns and dealt alternately into two groups, so that
    neighbouring (most similar) participants end up in different groups.

    Information about the motion measure is here:
    http://preprocessed-connectomes-project.org/quality-assessment-protocol/

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``func_perc_fd``, ``AGE_YRS``,
        ``DSM_IV_TR``, ``DX_GROUP``, ``SITE_ID`` and ``SEX``.
    motion_thresh : number
        Upper edge (inclusive) of the motion band.
    age_l, age_u : number
        Lower and upper age limits in years (both inclusive).
    n : int
        Requested size of each sample.
    motion_band : number, optional
        Width of the motion band below ``motion_thresh``. Defaults to 10,
        which reproduces the previously hard-coded behaviour.

    Returns
    -------
    (df_grp_A, df_grp_B) : tuple of pandas.DataFrame
        The two samples. If fewer than ``2 * n`` participants pass the
        motion and age filters, the samples are smaller than ``n`` (group A
        receives the extra participant when the total is odd).

    Notes
    -----
    Shuffling uses numpy's global random state; call ``np.random.seed``
    beforehand for reproducible samples.
    """

    # Keep only participants inside the motion band. This is a band, not a
    # simple upper threshold: participants moving *less* than the lower edge
    # are excluded too, so each call looks at one homogeneous motion stratum.
    motion_thresh_lower = motion_thresh - motion_band
    in_motion_band = ((df['func_perc_fd'] >= motion_thresh_lower)
                      & (df['func_perc_fd'] <= motion_thresh))

    # Keep only participants aged between age_l and age_u. Both limits are
    # inclusive, so (eg) 6 and 10 year olds are kept for age_l=6, age_u=10.
    in_age_range = (df['AGE_YRS'] >= age_l) & (df['AGE_YRS'] <= age_u)

    df_samp = df.loc[in_motion_band & in_age_range, :]

    # Shuffle the remaining participants so each run gives a different
    # sub-sample, and keep the first 2*n of them. Shuffling row *positions*
    # (rather than index labels) also works when the index is not unique.
    shuffled_positions = np.random.permutation(len(df_samp))
    df_samp_2n = df_samp.iloc[shuffled_positions[:2*n], :]

    # Sort these participants according to the sort columns of interest so
    # that similar participants sit next to each other.
    sort_column_list = ['DSM_IV_TR', 'DX_GROUP', 'SITE_ID', 'SEX', 'AGE_YRS']
    df_samp_2n_sorted = df_samp_2n.sort_values(by=sort_column_list)

    # Deal alternately: even positions go to group A, odd positions to
    # group B.
    df_grp_A = df_samp_2n_sorted.iloc[::2, :]
    df_grp_B = df_samp_2n_sorted.iloc[1::2, :]

    return df_grp_A, df_grp_B


In [18]:
# Draw one pair of samples (motion_thresh=10, ages 6-18 inclusive, n=10 per
# group) and summarise each group to check how well they are matched.
# The parenthesised single-argument print behaves the same on Python 2 and 3.
df_A, df_B = split_two_matched_samples(df, 10, 6, 18, 10)
print(df_A[['AGE_AT_SCAN', 'DX_GROUP', 'SEX', 'func_perc_fd']].describe())
print(df_B[['AGE_AT_SCAN', 'DX_GROUP', 'SEX', 'func_perc_fd']].describe())

       AGE_AT_SCAN   DX_GROUP        SEX  func_perc_fd
count    10.000000  10.000000  10.000000     10.000000
mean     13.402180   1.700000   1.100000      3.332739
std       3.027525   0.483046   0.316228      2.709015
min       9.660000   1.000000   1.000000      0.000000
25%      11.515000   1.250000   1.000000      0.621834
50%      12.535000   2.000000   1.000000      3.927057
75%      13.802500   2.000000   1.000000      5.916480
max      18.737900   2.000000   2.000000      6.369427
       AGE_AT_SCAN   DX_GROUP        SEX  func_perc_fd
count    10.000000  10.000000  10.000000     10.000000
mean     13.383800   1.600000   1.300000      2.499048
std       3.393531   0.516398   0.483046      2.321534
min       7.660000   1.000000   1.000000      0.473934
25%      11.410000   1.000000   1.000000      0.620976
50%      13.914000   2.000000   1.000000      1.617666
75%      15.700000   2.000000   1.750000      3.818613
max      18.010000   2.000000   2.000000      7.462687
