# Statistical Tests for fMRI preprocessing paper

This notebook performs a repeated measures anova on the data that is shown in the boxplots in the manuscript. These data are contained in three .csv files: Segmentation.csv, Modularity.csv, Mean_Strength.csv. 


In [1]:
from scipy.io import loadmat
import numpy as np 
import pandas as pd 

from statsmodels.stats.anova import AnovaRM


In [20]:
subjs=[1,2,4,*range(6,12),*range(13,26),27,28,29,*range(31,37),*range(39,42),43,44]
remove=[2,5,6,11,12,18,23,30] #manual removal of outliers, these are CONN IDs which correspond to the position of the subject in subjs
new_inds={*range(36)}.difference({num-1 for num in remove})
subset_subjs=[subjs[ind] for ind in new_inds]

[1, 4, 6, 9, 10, 11, 13, 16, 17, 18, 19, 20, 22, 23, 24, 25, 28, 29, 31, 32, 33, 34, 36, 39, 40, 41, 43, 44]


In [None]:
# TODO 
#add post hoc tests

In [2]:
def print_mean_std(df,class_1,class_2,output):
    header_1=list(class_1.keys())[0]
    header_2=list(class_2.keys())[0]
    print(f'{output} across {header_1} and {header_2}')
    for it_1 in list(class_1.values())[0]:
        for it_2 in list(class_2.values())[0]:
            print_mean=df.loc[df[header_1]==it_1].loc[df[header_2]==it_2][output].mean()
            print_std=df.loc[df[header_1]==it_1].loc[df[header_2]==it_2][output].std()
            print(f'For {it_1} and {it_2} the mean is {print_mean:.3f} and the std is {print_std:.3f}')




In [3]:
df_seg = pd.read_csv('Segmentation.csv',index_col=0)
df_seg.head()


Unnamed: 0,Subject,Pipeline,Tissue,Lesion Overlap,Lesion Fraction,Intersection Overlap,Intersection No Lesion Overlap
0,1,Default,CSF,103521,0.081483,0.95901,0.956535
1,2,Default,CSF,29617,0.03238,0.970924,0.970166
2,4,Default,CSF,17478,0.014303,0.969307,0.969372
3,6,Default,CSF,25468,0.021833,0.958084,0.957977
4,7,Default,CSF,620,0.000478,0.958721,0.958702


In [8]:

print('CSF')
print(AnovaRM(df_seg.loc[df_seg['Tissue']=='CSF'],depvar='Lesion Overlap',subject='Subject',within=['Pipeline']).fit()) #run seperate tests to match manuscript, could run this as a two way

print('WM')
print(AnovaRM(df_seg.loc[df_seg['Tissue']=='WM'],depvar='Lesion Overlap',subject='Subject',within=['Pipeline']).fit())

print('GM')
print(AnovaRM(df_seg.loc[df_seg['Tissue']=='GM'],depvar='Lesion Overlap',subject='Subject',within=['Pipeline']).fit())

print_mean_std(df_seg,{'Tissue':['WM','GM','CSF']},{'Pipeline':['Default','Tpm','Fmriprep']},'Lesion Overlap')

CSF
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 19.5258 2.0000 70.0000 0.0000

WM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 13.6640 2.0000 70.0000 0.0000

GM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 14.4469 2.0000 70.0000 0.0000

Lesion Overlap across Tissue and Pipeline
For WM and Default the mean is 38992.000 and the std is 49979.676
For WM and Tpm the mean is 24893.278 and the std is 34383.576
For WM and Fmriprep the mean is 40779.750 and the std is 48325.425
For GM and Default the mean is 60043.750 and the std is 69259.082
For GM and Tpm the mean is 38052.500 and the std is 46299.587
For GM and Fmriprep the mean is 47882.944 and the std is 55845.347
For CSF and Default the mean is 45547.250 and the std is 56876.057
For CSF and Tpm the mean is 23000.750 and the std is 38392.689
For CSF 

In [35]:
print('WM')
print(AnovaRM(df_seg.loc[df_seg['Tissue']=='WM'],depvar='Lesion Fraction',subject='Subject',within=['Pipeline']).fit())

print('GM')
print(AnovaRM(df_seg.loc[df_seg['Tissue']=='GM'],depvar='Lesion Fraction',subject='Subject',within=['Pipeline']).fit())

print('CSF')
print(AnovaRM(df_seg.loc[df_seg['Tissue']=='CSF'],depvar='Lesion Fraction',subject='Subject',within=['Pipeline']).fit()) 

print_mean_std(df_seg,{'Tissue':['WM','GM','CSF']},{'Pipeline':['Default','Tpm','Fmriprep']},'Lesion Fraction')

WM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 14.6704 2.0000 70.0000 0.0000

GM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 15.2542 2.0000 70.0000 0.0000

CSF
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  6.7349 2.0000 70.0000 0.0021

Lesion Fraction across Tissue and Pipeline
For WM and Default the mean is 0.036 and the std is 0.042
For WM and Tpm the mean is 0.024 and the std is 0.029
For WM and Fmriprep the mean is 0.035 and the std is 0.041
For GM and Default the mean is 0.038 and the std is 0.042
For GM and Tpm the mean is 0.025 and the std is 0.029
For GM and Fmriprep the mean is 0.031 and the std is 0.037
For CSF and Default the mean is 0.034 and the std is 0.040
For CSF and Tpm the mean is 0.018 and the std is 0.025
For CSF and Fmriprep the mean is 0.030 and the std is 0.052


In [36]:

print('WM')
print(AnovaRM(df_seg.loc[df_seg['Tissue']=='WM'],depvar='Intersection No Lesion Overlap',subject='Subject',within=['Pipeline']).fit())

print('GM')
print(AnovaRM(df_seg.loc[df_seg['Tissue']=='GM'],depvar='Intersection No Lesion Overlap',subject='Subject',within=['Pipeline']).fit())

print('CSF')
print(AnovaRM(df_seg.loc[df_seg['Tissue']=='CSF'],depvar='Intersection No Lesion Overlap',subject='Subject',within=['Pipeline']).fit()) 

print_mean_std(df_seg,{'Tissue':['WM','GM','CSF']},{'Pipeline':['Default','Tpm','Fmriprep']},'Intersection No Lesion Overlap')

WM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 85.6049 2.0000 70.0000 0.0000

GM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  0.0473 2.0000 70.0000 0.9539

CSF
                 Anova
          F Value  Num DF  Den DF Pr > F
----------------------------------------
Pipeline 2383.0164 2.0000 70.0000 0.0000

Intersection No Lesion Overlap across Tissue and Pipeline
For WM and Default the mean is 0.186 and the std is 0.032
For WM and Tpm the mean is 0.188 and the std is 0.033
For WM and Fmriprep the mean is 0.299 and the std is 0.052
For GM and Default the mean is 0.171 and the std is 0.030
For GM and Tpm the mean is 0.169 and the std is 0.024
For GM and Fmriprep the mean is 0.171 and the std is 0.047
For CSF and Default the mean is 0.960 and the std is 0.016
For CSF and Tpm the mean is 0.960 and the std is 0.015
For CSF and Fmriprep the mean is 0.121 and the st

In [31]:
# redo tests with removal of outliers

df_seg_sub = df_seg.loc[[ind for ind in range(len(df_seg)) if df_seg['Subject'][ind] in subset_subjs]]
print(f'Number of subjects in df_seg is {len(set(df_seg["Subject"]))}')
print(f'Number of subjects in df_seg_sub is {len(set(df_seg_sub["Subject"]))}')

Number of subjects in df_seg is 36
Number of subjects in df_seg_sub is 28


In [32]:
print('CSF')
print(AnovaRM(df_seg_sub.loc[df_seg_sub['Tissue']=='CSF'],depvar='Lesion Overlap',subject='Subject',within=['Pipeline']).fit()) #run seperate tests to match manuscript, could run this as a two way

print('WM')
print(AnovaRM(df_seg_sub.loc[df_seg_sub['Tissue']=='WM'],depvar='Lesion Overlap',subject='Subject',within=['Pipeline']).fit())

print('GM')
print(AnovaRM(df_seg_sub.loc[df_seg_sub['Tissue']=='GM'],depvar='Lesion Overlap',subject='Subject',within=['Pipeline']).fit())

print_mean_std(df_seg_sub,{'Tissue':['WM','GM','CSF']},{'Pipeline':['Default','Tpm','Fmriprep']},'Lesion Overlap')

CSF
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 14.9400 2.0000 54.0000 0.0000

WM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  9.9912 2.0000 54.0000 0.0002

GM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 11.3662 2.0000 54.0000 0.0001

Lesion Overlap across Tissue and Pipeline
For WM and Default the mean is 40690.464 and the std is 51771.695
For WM and Tpm the mean is 26191.179 and the std is 36307.406
For WM and Fmriprep the mean is 40835.786 and the std is 46763.584
For GM and Default the mean is 59840.179 and the std is 64645.876
For GM and Tpm the mean is 39341.643 and the std is 47332.781
For GM and Fmriprep the mean is 47942.179 and the std is 54564.349
For CSF and Default the mean is 47989.250 and the std is 59625.200
For CSF and Tpm the mean is 24711.357 and the std is 42182.723
For CSF 

In [33]:
print('WM')
print(AnovaRM(df_seg_sub.loc[df_seg_sub['Tissue']=='WM'],depvar='Lesion Fraction',subject='Subject',within=['Pipeline']).fit())

print('GM')
print(AnovaRM(df_seg_sub.loc[df_seg_sub['Tissue']=='GM'],depvar='Lesion Fraction',subject='Subject',within=['Pipeline']).fit())

print('CSF')
print(AnovaRM(df_seg_sub.loc[df_seg_sub['Tissue']=='CSF'],depvar='Lesion Fraction',subject='Subject',within=['Pipeline']).fit()) 

print_mean_std(df_seg_sub,{'Tissue':['WM','GM','CSF']},{'Pipeline':['Default','Tpm','Fmriprep']},'Lesion Fraction')

WM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 10.8946 2.0000 54.0000 0.0001

GM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 11.2603 2.0000 54.0000 0.0001

CSF
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  5.3273 2.0000 54.0000 0.0077

Lesion Fraction across Tissue and Pipeline
For WM and Default the mean is 0.037 and the std is 0.042
For WM and Tpm the mean is 0.025 and the std is 0.030
For WM and Fmriprep the mean is 0.035 and the std is 0.039
For GM and Default the mean is 0.038 and the std is 0.041
For GM and Tpm the mean is 0.026 and the std is 0.030
For GM and Fmriprep the mean is 0.031 and the std is 0.036
For CSF and Default the mean is 0.035 and the std is 0.039
For CSF and Tpm the mean is 0.018 and the std is 0.027
For CSF and Fmriprep the mean is 0.029 and the std is 0.050


In [34]:
print('WM')
print(AnovaRM(df_seg_sub.loc[df_seg_sub['Tissue']=='WM'],depvar='Intersection No Lesion Overlap',subject='Subject',within=['Pipeline']).fit())

print('GM')
print(AnovaRM(df_seg_sub.loc[df_seg_sub['Tissue']=='GM'],depvar='Intersection No Lesion Overlap',subject='Subject',within=['Pipeline']).fit())

print('CSF')
print(AnovaRM(df_seg_sub.loc[df_seg_sub['Tissue']=='CSF'],depvar='Intersection No Lesion Overlap',subject='Subject',within=['Pipeline']).fit()) 

print_mean_std(df_seg_sub,{'Tissue':['WM','GM','CSF']},{'Pipeline':['Default','Tpm','Fmriprep']},'Intersection No Lesion Overlap')

WM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 61.2423 2.0000 54.0000 0.0000

GM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  0.0680 2.0000 54.0000 0.9344

CSF
                 Anova
          F Value  Num DF  Den DF Pr > F
----------------------------------------
Pipeline 1741.4310 2.0000 54.0000 0.0000

Intersection No Lesion Overlap across Tissue and Pipeline
For WM and Default the mean is 0.186 and the std is 0.034
For WM and Tpm the mean is 0.183 and the std is 0.026
For WM and Fmriprep the mean is 0.294 and the std is 0.056
For GM and Default the mean is 0.168 and the std is 0.027
For GM and Tpm the mean is 0.166 and the std is 0.023
For GM and Fmriprep the mean is 0.166 and the std is 0.041
For CSF and Default the mean is 0.959 and the std is 0.018
For CSF and Tpm the mean is 0.958 and the std is 0.017
For CSF and Fmriprep the mean is 0.125 and the st

In [4]:
df_mod = pd.read_csv('Modularity.csv',index_col=0)
df_mod.head()

Unnamed: 0,Subject,Pipeline,Atlas,Modularity
0,1,Default,AAL,0.386872
1,2,Default,AAL,0.352606
2,4,Default,AAL,0.328717
3,6,Default,AAL,0.376981
4,7,Default,AAL,0.449123


In [52]:
print(AnovaRM(df_mod.loc[df_mod['Atlas']=='AAL'],depvar='Modularity',subject='Subject',within=['Pipeline']).fit())

print(AnovaRM(df_mod.loc[df_mod['Atlas']=='SCH'],depvar='Modularity',subject='Subject',within=['Pipeline']).fit())
print_mean_std(df_mod,{'Atlas':['AAL','SCH']},{'Pipeline':['Default','Tpm','Fmriprep']},'Modularity')

                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 10.5605 2.0000 70.0000 0.0001

                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  1.4828 2.0000 70.0000 0.2340

Modularity across Atlas and Pipeline
For AAL and Default the mean is 0.37 and the std is 0.07
For AAL and Tpm the mean is 0.37 and the std is 0.07
For AAL and Fmriprep the mean is 0.33 and the std is 0.07
For SCH and Default the mean is 0.34 and the std is 0.08
For SCH and Tpm the mean is 0.34 and the std is 0.08
For SCH and Fmriprep the mean is 0.32 and the std is 0.10


In [5]:
df_str = pd.read_csv('Mean_Strength.csv',index_col=0)
df_str.head()

Unnamed: 0,Subject,Pipeline,Atlas,Mean Ov Str,Mean No Ov Str
0,1,Default,AAL,0.002957,0.081233
1,2,Default,AAL,0.004769,0.082893
2,4,Default,AAL,0.000199,0.124326
3,6,Default,AAL,-0.000546,-0.049359
4,7,Default,AAL,-1.7e-05,0.060569


In [53]:
print(AnovaRM(df_str.loc[df_str['Atlas']=='SCH'],depvar='Mean Ov Str',subject='Subject',within=['Pipeline']).fit())

print(AnovaRM(df_str.loc[df_str['Atlas']=='AAL'],depvar='Mean Ov Str',subject='Subject',within=['Pipeline']).fit())
print_mean_std(df_str,{'Atlas':['AAL','SCH']},{'Pipeline':['Default','Tpm','Fmriprep']},'Mean Ov Str')

                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  0.8620 2.0000 70.0000 0.4268

                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  0.2422 2.0000 70.0000 0.7856

Mean Ov Str across Atlas and Pipeline
For AAL and Default the mean is 0.00 and the std is 0.00
For AAL and Tpm the mean is 0.00 and the std is 0.00
For AAL and Fmriprep the mean is 0.00 and the std is 0.00
For SCH and Default the mean is 0.00 and the std is 0.00
For SCH and Tpm the mean is 0.00 and the std is 0.00
For SCH and Fmriprep the mean is 0.00 and the std is 0.00


In [54]:
print(AnovaRM(df_str.loc[df_str['Atlas']=='SCH'],depvar='Mean No Ov Str',subject='Subject',within=['Pipeline']).fit())

print(AnovaRM(df_str.loc[df_str['Atlas']=='AAL'],depvar='Mean No Ov Str',subject='Subject',within=['Pipeline']).fit())
print_mean_std(df_str,{'Atlas':['AAL','SCH']},{'Pipeline':['Default','Tpm','Fmriprep']},'Mean No Ov Str')

                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  3.2786 2.0000 70.0000 0.0435

                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  4.9648 2.0000 70.0000 0.0096

Mean No Ov Str across Atlas and Pipeline
For AAL and Default the mean is 0.05 and the std is 0.04
For AAL and Tpm the mean is 0.05 and the std is 0.04
For AAL and Fmriprep the mean is 0.09 and the std is 0.11
For SCH and Default the mean is 0.02 and the std is 0.04
For SCH and Tpm the mean is 0.02 and the std is 0.04
For SCH and Fmriprep the mean is 0.04 and the std is 0.07


In [6]:
df_ts = pd.read_csv('Time_Series.csv',index_col=0)
df_ts.head()

Unnamed: 0,Subject,Pipeline,Lesion Var,Rel Var CSF,Rel Var GM,Rel Var WM,Rel Var Mean,Rel Var AAL,Rel Var SCH
0,1,Default,0.001905,0.020813,1.336829,0.381818,0.058351,0.030954,0.071011
1,2,Default,0.000802,0.009592,1.768672,0.283971,0.027691,0.045219,0.059368
2,4,Default,0.002672,0.015222,1.753608,0.66341,0.044266,0.080141,0.151632
3,6,Default,0.00751,0.114544,4.240808,0.877394,0.296859,0.104457,0.207845
4,7,Default,0.210434,0.395258,73.808262,10.405097,1.136516,0.660114,2.778838


In [6]:
print('Lesion')
print(AnovaRM(df_ts,depvar='Rel Var WM',subject='Subject',within=['Pipeline']).fit())

print('WM')
print(AnovaRM(df_ts,depvar='Rel Var WM',subject='Subject',within=['Pipeline']).fit())

print('GM')
print(AnovaRM(df_ts,depvar='Rel Var GM',subject='Subject',within=['Pipeline']).fit())

print('CSF')
print(AnovaRM(df_ts,depvar='Rel Var CSF',subject='Subject',within=['Pipeline']).fit())

print('AAL')
print(AnovaRM(df_ts,depvar='Rel Var AAL',subject='Subject',within=['Pipeline']).fit())

print('SCH')
print(AnovaRM(df_ts,depvar='Rel Var SCH',subject='Subject',within=['Pipeline']).fit())

print('Mean')
print(AnovaRM(df_ts,depvar='Rel Var Mean',subject='Subject',within=['Pipeline']).fit())

Lesion
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  2.0669 2.0000 70.0000 0.1342

WM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  2.0669 2.0000 70.0000 0.1342

GM
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  4.3971 2.0000 70.0000 0.0159

CSF
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline 11.0665 2.0000 70.0000 0.0001

AAL
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  0.4996 2.0000 70.0000 0.6089

SCH
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  2.8924 2.0000 70.0000 0.0621

Mean
                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
Pipeline  9.0013 2.0000 70.000