In [45]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import scipy.stats as stats
from scipy.stats import ttest_rel

# create a function for performing the paired t-test 
#     and for calculating the effect size of the paired t-test
#     and also returning the means of the two samples 
def paired_ttest(x,y,alternative='two-sided'):
    x_arr = np.array(x)
    y_arr = np.array(y)
    # check for number of outliers in difference between two arrays
    #    based on a threshold of differences IQD * 3 and remove outliers 
    #    from both arrays and add number of outliers to output
    # differences = x_arr - y_arr
    # outliers = []
    # for i in range(len(x_arr)):
    #    if abs(x_arr[i] - y_arr[i]) > (3 * (np.percentile(differences,75) - np.percentile(differences,25))):
    #       outliers.append(i)
    # number_of_outliers = len(outliers)
    # print("Number of outliers removed: " + str(number_of_outliers))
    # x_arr = np.delete(x_arr,outliers)
    # y_arr = np.delete(y_arr,outliers)    
    # paired t-test from scipy.stats
    t_statistic, p_value = ttest_rel(x_arr,y_arr,alternative=alternative)
    # Calculate effect size (Cohen's d)
    # calculate mean of differences between two arrays
    mean_diff = np.mean(x_arr - y_arr)
    mean_x = np.mean(x_arr)
    mean_y = np.mean(y_arr)
    # calculate cohen's d' corrected for related samples
    # calculate standard deviation of differences between two arrays
    differences = x_arr - y_arr
    std_diff = np.std(differences)
    # calculate cohen's d
    d = mean_diff / std_diff
    # calculate hedge's g
    g = d * (1 - (3 / (4 * (len(x_arr) - 2) - 1)))
    # pooled_std = np.sqrt((np.std(x_arr) ** 2 + np.std(y_arr) ** 2) / 2)
    # d = mean_diff / pooled_std
    # if sample size is < 20, calculate hedge's g instead of cohen's d
    #    hedge's g is a correction for small sample sizes
    # if len(x_arr) < 20:
    #     g = d * (1 - (3 / (4 * (len(x_arr) - 2) - 1)))
    #     return t_statistic, p_value, g, mean_x, mean_y, number_of_outliers
    return t_statistic, p_value, g, mean_x, mean_y

# create a function for performing the shapiro-wilk test for normality
#     of differences between two paired samples and for creating a qq plot
#     of the differences between the two paired samples
def shapiro_wilk_qq(x,y,recon1,recon2,walk_length):
    # shapiro-wilk test for normality
    shapiro_stat, shapiro_p = stats.shapiro(x-y)
    # # make figure
    # plt.figure(figsize=(8,6))
    # # qq plot
    # qq = stats.probplot(x-y, dist="norm", plot=plt)
    # # plot name
    # # qq_plot_name = '/datain/dataset/plots/normality/qq_plot_differences'+str(x)+'_'+str(y)+'.png' # Uncomment for use in singularity
    # qq_plot_name = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/plots/normality/qq_plot_differences'+str(recon1)+'_'+str(recon2)+'_walklength_'+str(walk_length)+'.png'
    # # make matplotlib plot
    # plt.title('QQ Plot for Differences between '+str(recon1)+' and '+str(recon2))
    # plt.xlabel('Theoretical Quantiles')
    # plt.ylabel('Ordered Values')
    # plt.tight_layout()
    # # save qq plot
    # plt.savefig(qq_plot_name)
    # plt.close()
    return shapiro_stat, shapiro_p

# create a function for performing the wilcoxon signed rank test with paired samples 
#     and calculating the effect size of the paired samples wilcoxon signed rank test
#     and also returning the medians of the two samples
def wilcoxon_test(x,y,alternative='two-sided'):
    x_arr = np.array(x)
    y_arr = np.array(y)
    w_statistic, p_value = stats.wilcoxon(x_arr,y_arr,alternative=alternative,zero_method='pratt',mode='exact')
    median_x = np.median(x_arr)
    median_y = np.median(y_arr)
    # Calculate effect size (r)
    r = w_statistic / (len(x_arr) * (len(y_arr) + 1) / 2)
    return w_statistic, p_value, r, median_x, median_y


# create a function that reads a dataframe and returns a dataframe with the
#     values of column i for which the row values in the 'Recon 1' column equal to recon1 
#     and the values in the 'Recon 2' column equal to recon2
def get_df_int(df,recon1,recon2,i):
    if i is not None:
        df_i = df[(df['Recon 1'][i] == recon1) & (df['Recon 2'][i] == recon2)]
    else:
        df_i = df[(df['Recon 1'] == recon1) & (df['Recon 2'] == recon2)]
    return df_i

# # create a function for calculating the mean, standard deviation, and median of the
# #     values in each column of a dataframe - where the 'Recon 1' is equal to recon1
# #     and 'Recon 2' is equal to recon2
# def get_stats(df,recon1,recon2):
#     df_i = df.loc[(df['Recon 1'] == recon1) & (df['Recon 2'] == recon2)]
#     mean = df_i.mean(axis=0)
#     std = df_i.std(axis=0)
#     median = df_i.median(axis=0)
#     return mean, std, median

def print_stats(df):
    print('Mean:')
    print(df.mean(axis=0))
    print('Standard Deviation:')
    print(df.std(axis=0))
    print('Median:')
    print(df.median(axis=0))
    print('Min:')
    print(df.min(axis=0))
    print('Max:')
    print(df.max(axis=0))


Read in files and perform statistical tests of independence for Pearson scores at each walk length

In [47]:
# Read data
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/'
verbose = False
# main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v3/scrambled_dataset/'
file_list = Path(main_path).glob('*batch??.csv')
for file in file_list:
    if verbose is True:
        print(file)
    df = pd.read_csv(file)
    # ,ave to a copy of df, then perform Fisher's r to z transformation on Pearson scores
    df_z = df.copy()
    for i in range(1, 6):
        df_z[str(i)] = np.arctanh(df_z[str(i)])
    # print(df_z)
    if verbose is True:
        print(file.stem)
    # compare difference in score between Recon methods for each walk length column
    for i in range(1, 6):
        if verbose is True:
            print('Walk Length: '+str(i))
        # get unique values of Recon column
        recon_list = df['Recon'].unique()
        df_stats_all = pd.DataFrame()
        df_stats = pd.DataFrame()
        # loop through each unique value of Recon column
        for recon in recon_list:
            if verbose is True:
                print(recon)
            # combine statistics from each Recon method into dataframe
            df_stats = pd.DataFrame({'Recon': recon_list, 'Mean': df[df['Recon'] == str(recon)][str(i)].mean(), 'Stdev': df[df['Recon'] == str(recon)][str(i)].std(
            ), 'Median': df[df['Recon'] == str(recon)][str(i)].median(), 'IQR': df[df['Recon'] == str(recon)][str(i)].quantile(q=0.75)-df[df['Recon'] == str(recon)][str(i)].quantile(q=0.25)})
            # combine df_stats from each Recon method into dataframe
            df_stats_all = pd.concat([df_stats, df_stats_all], axis=0)
        # print(df_stats_all)
        # save dataframe to csv
        df_stats_all.to_csv(main_path+'/stats/'+file.stem +
                            '_walk_length_'+str(i)+'_stats.csv', index=False)
        # calculate p-value for difference in score between Recon methods in recon_list, with comparisons between each pair of Recon methods, save all statistics and p-values to csv
        df_z_ttest_rel_results = pd.DataFrame()
        df_z_shapiro_wilk_results = pd.DataFrame()
        df_z_wilcoxon_results = pd.DataFrame()
        for recon in recon_list:
            for recon2 in recon_list:
                if recon != recon2:
                    if verbose is True:
                        print(recon)
                        print(recon2)
                    # calculate t-test on z-scores, get t-statistic and p-value and effect size, means
                    ttest_rel_z_score = paired_ttest(df_z[df_z['Recon'] == str(
                        recon)][str(i)], df_z[df_z['Recon'] == str(recon2)][str(i)], alternative='less')
                    if verbose:
                        print(ttest_rel_z_score[2])
                    # calculate shapiro-wilk test on z-scores, get W-statistic and p-value
                    # get array of df_z values for recon and recon2
                    recon_1_z_scores = np.array(df_z[df_z['Recon'] == str(recon)][str(i)])
                    recon_2_z_scores = np.array(df_z[df_z['Recon'] == str(recon2)][str(i)])
                    # calculate differences between arrays of recon and recon2 z-scores
                    differences = recon_1_z_scores - recon_2_z_scores
                    shapiro_wilk_w, shapiro_wilk_p = stats.shapiro(differences)
                    if verbose:
                        print(shapiro_wilk_p)
                    # calculate wilcoxon test on z-scores, get test statistic and p-value and effect size, medians
                    wilcoxon_results = wilcoxon_test(df_z[df_z['Recon'] == str(
                        recon)][str(i)], df_z[df_z['Recon'] == str(recon2)][str(i)], alternative='less')
                    # combine stats into dataframe
                    df_z_ttest_rel_results = pd.concat([df_z_ttest_rel_results, pd.DataFrame({'Recon 1': recon, 'Recon 2': recon2, 'T-statistic Paired T-test': ttest_rel_z_score[0],
                                                                                                'p-value Paired T-test': ttest_rel_z_score[1], 'Effect Size d': ttest_rel_z_score[2],
                                                                                                'Mean 1': ttest_rel_z_score[3], 'Mean 2': ttest_rel_z_score[4]}, index=[0])], axis=0)
                    df_z_shapiro_wilk_results = pd.concat([df_z_shapiro_wilk_results, pd.DataFrame({'Recon 1': recon, 'Recon 2': recon2, 'W-statistic Shapiro-Wilk test': shapiro_wilk_w,
                                                                                                    'p-value Shapiro-Wilk test': shapiro_wilk_p}, index=[0])], axis=0)
                    df_z_wilcoxon_results = pd.concat([df_z_wilcoxon_results, pd.DataFrame({'Recon 1': recon, 'Recon 2': recon2, 'Test statistic Wilcoxon Signed Rank Test': wilcoxon_results[0],
                                                                                            'p-value Wilcoxon Signed Rank Test': wilcoxon_results[1], 'Effect Size r': wilcoxon_results[2],
                                                                                            'Median 1': wilcoxon_results[3], 'Median 2': wilcoxon_results[4]}, index=[0])], axis=0)
        # print('T-test rel z-score results')
        # print(df_z_ttest_rel_results)
        # print('Shapiro-Wilk z-score results')
        # print(df_z_shapiro_wilk_results)
        # print('Wilcoxon z-score results')
        # print(df_z_wilcoxon_results)

        # combine df_z_ttest_rel_results, df_z_shapiro_wilk_results, df_z_wilcoxon_results into one dataframe "stats_all"
        df_z_ttest_rel_results = df_z_ttest_rel_results.reset_index(
            drop=True)
        df_z_shapiro_wilk_results = df_z_shapiro_wilk_results.reset_index(
            drop=True)
        df_z_wilcoxon_results = df_z_wilcoxon_results.reset_index(drop=True)
        stats_all = pd.concat(
            [df_z_ttest_rel_results, df_z_shapiro_wilk_results, df_z_wilcoxon_results], axis=1)
        # check if the p-value for the Shapiro-Wilk test is less than 0.05,
        # if so, check if the p-value for the Wilcoxon test is less than 0.005,
        # if so, then add a tally to the "significant_result" column. 
        # If the p-value for the Shapiro-Wilk test is greater than 0.05 and 
        # the p-value of the paired ttest is less than 0.005, then add a tally to the "significant_result" column.
        stats_all['significant_result'] = 0
        # make lists for getting the average of test statistic and p-value for paired ttest and wilcoxon
        # 'Average T-Statistic (Paired T-test)'
        # 'Average p-value (Paired T-test)'
        # 'Average Effect Size (Cohens d)'
        # 'Average W-Statistic (Wilcoxon SR Test)'
        # 'Average p-value (Wilcoxon SR Test)'
        # 'Average Effect Size (Wilcoxon r)'
        alpha_shapiro_wilk = 0.05
        alpha_wilcoxon = 0.05 / 9
        alpha_paired_ttest = 0.05 / 9
        for index, row in stats_all.iterrows():
            if float(row['p-value Shapiro-Wilk test']) < 0.05:
                if float(row['p-value Wilcoxon Signed Rank Test']) < alpha_wilcoxon:
                    stats_all.at[index, 'significant_result'] = 1
            elif float(row['p-value Shapiro-Wilk test']) > 0.05:
                if float(row['p-value Paired T-test']) < alpha_paired_ttest:
                    stats_all.at[index, 'significant_result'] = 1
        # print(stats_all)
        stats_all.to_csv(main_path+'/stats/'+file.stem + '_walk_length_' +
                         str(i)+'_stats_all.csv', index=False)
        print(str(stats_all['significant_result'].sum())+' significant results out of '+str(len(stats_all['significant_result'])/2)+' total results')



        # save dataframe to csv
        df_z_ttest_rel_results.to_csv(
            main_path+'/stats/'+file.stem+'_walk_length_'+str(i)+'_z_score_ttest_rel_results.csv', index=False)
        df_z_shapiro_wilk_results.to_csv(
            main_path+'/stats/'+file.stem+'_walk_length_'+str(i)+'_shapiro_wilk_results.csv', index=False)
        df_z_wilcoxon_results.to_csv(
            main_path+'/stats/'+file.stem+'_walk_length_'+str(i)+'_wilcoxon_results.csv', index=False)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  cond2 = (x >= np.asarray(_b)) & cond0
  r_plus = np.sum((d > 0) * r)
  r_minus = np.sum((d < 0) * r)


0 significant results out of 3.0 total results
1 significant results out of 3.0 total results
1 significant results out of 3.0 total results
1 significant results out of 3.0 total results
1 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
3 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
3 significant results out of 3.0 total results
3 significant results out of 3.0 total results
3 significant results out of 3.0 total results
3 significant results out of 3.0 total results
2 significant results out of 3.0 total results
3 significant results out of 3.0 total results
3 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  cond2 = (x >= np.asarray(_b)) & cond0
  r_plus = np.sum((d > 0) * r)
  r_minus = np.sum((d < 0) * r)


2 significant results out of 3.0 total results
1 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
3 significant results out of 3.0 total results
3 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
1 significant results out of 3.0 total results
0 significant results out of 3.0 total results
0 significant results out of 3.0 total results
0 significant results out of 3.0 total results
3 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
2 significant results out of 3.0 total results
3 significant results out of 3.0 total results
3 significant results out of 3.0 total results
3 significant results out of 3.0 total results
3 significant

Make Box Plots

In [36]:
# Read data
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/'

file_list = Path(main_path).glob('*batch??.csv')
for file in file_list:
    print(file)
    df = pd.read_csv(file)
    print(file.stem)
    # melt data for boxplot
    dd=pd.melt(df,id_vars=['Recon'],value_vars=['1','2','3','4','5'],var_name='Walk Length')
    # seaborn boxplot with hue based on recon method
    sns.boxplot(x='Walk Length',y='value',data=dd,hue='Recon')
    # plt.show()
    plt.ylabel('Pearson Score')
    # save figure
    plt.savefig(main_path+'/plots/'+file.stem+'_box_plot.png')
    plt.close()

/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/count_all_percent_batch24.csv
count_all_percent_batch24
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/count_all_percent_batch33.csv
count_all_percent_batch33
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/count_all_percent_batch39.csv
count_all_percent_batch39
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/volume_weighted_all_percent_batch47.csv
volume_weighted_all_percent_batch47
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/volume_weighted_all_percent_batch16.csv
volume_weighted_all_percent_batch16
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/count_all_percent_batch11.csv
count_all_percent_batch11
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/count_all_percent_batch42.csv
count_all_percent_batch42
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/mean_path_length_all_percent_batch19.csv
mean_path_length_all_percent_batch19
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/count_all_percent_batch21.csv
c

In [None]:
# Fix _stats.csv files to have correct summary statistics for each method sorted !!!


In [41]:
!for i in `seq 0 49`; do ii="/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/count_all_percent_batch${i}_walk_length_4_stats.csv"; echo ${ii}; head -n 1 ${ii} > tmp; tail -n 3 ${ii} | head -n 1 >> tmp; head -n 6 ${ii} | tail -n 1 >> tmp; head -n 4 ${ii} | tail -n 1 >> tmp; mv tmp ${ii::-4}_clean.csv; done
!for i in `seq 0 49`; do ii="/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/mean_path_length_all_percent_batch${i}_walk_length_4_stats.csv"; echo ${ii}; head -n 1 ${ii} > tmp; tail -n 3 ${ii} | head -n 1 >> tmp; head -n 6 ${ii} | tail -n 1 >> tmp; head -n 4 ${ii} | tail -n 1 >> tmp; mv tmp ${ii::-4}_clean.csv; done
!for i in `seq 0 49`; do ii="/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/volume_weighted_all_percent_batch${i}_walk_length_4_stats.csv"; echo ${ii}; head -n 1 ${ii} > tmp; tail -n 3 ${ii} | head -n 1 >> tmp; head -n 6 ${ii} | tail -n 1 >> tmp; head -n 4 ${ii} | tail -n 1 >> tmp; mv tmp ${ii::-4}_clean.csv; done


/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/volume_weighted_all_percent_batch0_walk_length_4_stats.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/volume_weighted_all_percent_batch1_walk_length_4_stats.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/volume_weighted_all_percent_batch2_walk_length_4_stats.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/volume_weighted_all_percent_batch3_walk_length_4_stats.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/volume_weighted_all_percent_batch4_walk_length_4_stats.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/volume_weighted_all_percent_batch5_walk_length_4_stats.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/volume_weighted_all_percent_batch6_walk_length_4_stats.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/volume_weighted_all_percent_batch7_walk_length_4_stats.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/volume_weighted

In [48]:
import math
# get mean pearson scores and standard deviations for pearson scores for each recon method from _walk_length_4_stats.csv files
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats'
for weighting in ['count', 'volume_weighted', 'mean_path_length']:
    batch_list = []
    file_list = Path(main_path).glob(str(weighting)+'_all_percent_batch*_walk_length_4_stats_clean.csv')
    for file in file_list:
        df = pd.read_csv(file)
        batch_list.append(df)
    df = pd.concat(batch_list, axis=0)
    # save dataframe to csv
    df.to_csv(main_path+'/'+str(weighting)+'_walk_length_4_stats_50batches.csv', index=False)
    # get mean and range for Mean and Stdev for each recon method from df, save to summary csv
    recons = df['Recon'].unique()
    for recon in recons:
        df_recon = df[df['Recon'] == recon]
        mean_mean_recon = df_recon['Mean'].mean()
        # range_mean_recon = df_recon['Mean'].max() - df_recon['Mean'].min()
        # std_mean_recon = df_recon['Mean'].std()
        # mean_std_recon = df_recon['Stdev'].mean()
        # calculate mean of standard deviations: Average S.D. = √((s12 +  s22 + … + sk2) / k)
        mean_std_recon = math.sqrt(
            (df_recon['Stdev']**2).sum() / len(df_recon['Stdev']))
        # range_std_recon = df_recon['Stdev'].max() - df_recon['Stdev'].min()
        # std_std_recon = df_recon['Stdev'].std()
        df_recon_summary = pd.DataFrame(
            {'Recon': [recon], 'Mean of Means': [mean_mean_recon], 'Mean of Standard Deviations': [mean_std_recon]}) # 'Range of Mean': [range_mean_recon],  'Standard Deviation of Means': [std_mean_recon],  'Range of Standard Deviations': [range_std_recon], 'Standard deviation of Standard Deviations': [std_std_recon]
        df_recon_summary.to_csv(
            main_path+'/'+str(weighting)+'_walk_length_4_stats_50batches_summary.csv', index=False, mode='a', header=False)

# # combine all summary csvs into one csv
# main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats'
# for weighting in ['count', 'volume_weighted', 'mean_path_length']:
#     df_list = []
#     file_list = Path(main_path).glob(str(weighting)+'_walk_length_4_stats_50batches_summary.csv')
#     for file in file_list:
#         df = pd.read_csv(file)
#         df_list.append(df)
#     df = pd.concat(df_list, axis=0)
#     # save dataframe to csv
#     df.to_csv(main_path+'/'+str(weighting)+'_walk_length_4_stats_50batches_summary_all.csv', index=False)



In [49]:
# define a function to read in all _stats.csv files FROM WALK LENGTH 4
#     and combine them into one dataframe
def combine_stats_csv(main_path,weighting_method):
    file_list = Path(main_path).glob(str(weighting_method)+'_*_walk_length_4*_stats.csv')
    df = pd.DataFrame()
    for file in file_list:
        df = pd.concat([df, pd.read_csv(file)], axis=0)
    return df

# define a function to take the mean of the Pearson Scores for each recon method
#     and return a dataframe with the mean Pearson Scores and standard deviations

def get_mean_stdev(df):
    df_mean_stdev = pd.DataFrame()
    for recon in df['Recon'].unique():
        df_mean = pd.concat([df_mean, pd.DataFrame({'Recon': recon, 'Mean': df.loc[df['Recon'] == recon]['Mean'],
                                                    'Standard Deviation': df.loc[df['Recon'] == recon]['Pearson Score'].std()}, index=[0])], axis=0)
    return df_mean_stdev

Summarizing Number of Significant Results

In [50]:
# read in all stats_all files from walk length 4
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats'
file_list = Path(main_path).glob('*walk_length_4_stats_all.csv')
df_list_count = []
df_list_vol = []
df_list_mean_length = []
for file in file_list:
    if 'count' in file.stem:
        df_list_count.append(pd.read_csv(file))
    elif 'vol' in file.stem:
        df_list_vol.append(pd.read_csv(file))
    elif 'mean_path_length' in file.stem:
        df_list_mean_length.append(pd.read_csv(file))
df_count = pd.concat(df_list_count)
df_vol = pd.concat(df_list_vol)
df_mean_length = pd.concat(df_list_mean_length)

# define a function to calculate the Fraction of significant results for each Recon 1 vs Recon 2 method comparison
# where the value of 'Recon 1' contains recon1 and of 'Recon 2' contains recon2
def sig_frac(df, recon1, recon2):
    df = df[(df['Recon 1'].str.contains(recon1)) & (df['Recon 2'].str.contains(recon2))]
    sig_frac = df['significant_result'].sum() / (len(df['significant_result']))
    return sig_frac

def mean_stats(df, recon1, recon2):
    df = df[(df['Recon 1'].str.contains(recon1)) & (df['Recon 2'].str.contains(recon2))]
    test_statistic_list_ptt = []
    p_value_list_ptt = []
    effect_size_list_ptt = []
    test_statistic_list_wsr = []
    p_value_list_wsr = []
    effect_size_list_wsr = []
    # report mean t-statistic, p-value, effect size if shapiro-wilk test is significant
    for index, row in df.iterrows():
        if row['p-value Shapiro-Wilk test'] < 0.05:
            test_statistic_list_wsr.append(row['Test statistic Wilcoxon Signed Rank Test'])
            p_value_list_wsr.append(row['p-value Wilcoxon Signed Rank Test'])
            effect_size_list_wsr.append(row['Effect Size r'])
        elif row['p-value Shapiro-Wilk test'] > 0.05:
            test_statistic_list_ptt.append(row['T-statistic Paired T-test'])
            p_value_list_ptt.append(row['p-value Paired T-test'])
            effect_size_list_ptt.append(row['Effect Size d'])
    mean_test_statistic_ptt = np.mean(np.array(test_statistic_list_ptt))
    mean_p_value_ptt = np.mean(np.array(p_value_list_ptt))
    mean_effect_size_ptt = np.mean(np.array(effect_size_list_ptt))
    mean_test_statistic_wsr = np.mean(np.array(test_statistic_list_wsr))
    mean_p_value_wsr = np.mean(np.array(p_value_list_wsr))
    mean_effect_size_wsr = np.mean(np.array(effect_size_list_wsr))
    return mean_test_statistic_ptt, mean_p_value_ptt, mean_effect_size_ptt, mean_test_statistic_wsr, mean_p_value_wsr, mean_effect_size_wsr

# Calculate the Fraction of significant results for each Recon 1 vs Recon 2 method comparison
# for count where the value of 'Recon 1' contains DTI and of 'Recon 2' contains MSMT
sig_frac_count_dm = sig_frac(df_count, 'DTI', 'MSMT')
print("Fraction of significant paired differences Streamline Count: DTI vs MSMT")
print(sig_frac_count_dm)
print("Mean Test Statistic, Mean p-value, Mean Effect Size for Streamline Count: DTI vs MSMT")
print(mean_stats(df_count, 'DTI', 'MSMT'))
# for vol where the value of 'Recon 1' contains DTI and of 'Recon 2' contains MSMT
sig_frac_vol_dm = sig_frac(df_vol, 'DTI', 'MSMT')
print("Fraction of significant paired differences Volume Weighted Streamline Count: DTI vs MSMT")
print(sig_frac_vol_dm)
print("Mean Test Statistic, Mean p-value, Mean Effect Size for Volume Weighted Streamline Count: DTI vs MSMT")
print(mean_stats(df_vol, 'DTI', 'MSMT'))
# for mean length where the value of 'Recon 1' contains DTI and of 'Recon 2' contains MSMT
sig_frac_mean_length_dm = sig_frac(df_mean_length, 'MSMT', 'DTI')
print("Fraction of significant paired differences Mean Length: MSMT vs DTI")
print(sig_frac_mean_length_dm)
print("Mean Test Statistic, Mean p-value, Mean Effect Size for Mean Length: MSMT vs DTI")
print(mean_stats(df_mean_length, 'MSMT', 'DTI'))
sig_frac_count_gm = sig_frac(df_count, 'GQI', 'MSMT')
print("Fraction of significant paired differences Streamline Count: GQI vs MSMT")
print(sig_frac_count_gm)
print("Mean Test Statistic, Mean p-value, Mean Effect Size for Streamline Count: GQI vs MSMT")
print(mean_stats(df_count, 'GQI', 'MSMT'))
sig_frac_vol_gm = sig_frac(df_vol, 'GQI', 'MSMT')
print("Fraction of significant paired differences Volume Weighted Streamline Count: GQI vs MSMT")
print(sig_frac_vol_gm)
print("Mean Test Statistic, Mean p-value, Mean Effect Size for Volume Weighted Streamline Count: GQI vs MSMT")
print(mean_stats(df_vol, 'GQI', 'MSMT'))
sig_frac_mean_length_gm = sig_frac(df_mean_length, 'MSMT', 'GQI')
print("Fraction of significant paired differences Mean Length: MSMT vs GQI")
print(sig_frac_mean_length_gm)
print("Mean Test Statistic, Mean p-value, Mean Effect Size for Mean Length: MSMT vs GQI")
print(mean_stats(df_mean_length, 'MSMT', 'GQI'))
sig_frac_count_dg = sig_frac(df_count, 'DTI', 'GQI')
print("Fraction of significant paired differences Streamline Count: DTI vs GQI")
print(sig_frac_count_dg)
print("Mean Test Statistic, Mean p-value, Mean Effect Size for Streamline Count: DTI vs GQI")
print(mean_stats(df_count, 'DTI', 'GQI'))
sig_frac_vol_dg = sig_frac(df_vol, 'DTI', 'GQI')
print("Fraction of significant paired differences Volume Weighted Streamline Count: DTI vs GQI")
print(sig_frac_vol_dg)
print("Mean Test Statistic, Mean p-value, Mean Effect Size for Volume Weighted Streamline Count: DTI vs GQI")
print(mean_stats(df_vol, 'DTI', 'GQI'))
sig_frac_mean_length_dg = sig_frac(df_mean_length, 'GQI', 'DTI')
print("Fraction of significant paired differences Mean Length: GQI vs DTI")
print(sig_frac_mean_length_dg)
print("Mean Test Statistic, Mean p-value, Mean Effect Size for Mean Length: GQI vs DTI")
print(mean_stats(df_mean_length, 'GQI', 'DTI'))


# ### Other tail

# # Calculate the Fraction of significant results for each Recon 1 vs Recon 2 method comparison
# # for count where the value of 'Recon 1' contains DTI and of 'Recon 2' contains MSMT
# sig_frac_count_dm = sig_frac(df_count, 'MSMT', 'DTI')
# print("Fraction of significant paired differences Streamline Count: MSMT vs DTI")
# print(sig_frac_count_dm)
# # for vol where the value of 'Recon 1' contains DTI and of 'Recon 2' contains MSMT
# sig_frac_vol_dm = sig_frac(df_vol, 'MSMT', 'DTI')
# print("Fraction of significant paired differences Volume Weighted Streamline Count: MSMT vs DTI")
# print(sig_frac_vol_dm)
# # for mean length where the value of 'Recon 1' contains DTI and of 'Recon 2' contains MSMT
# sig_frac_mean_length_dm = sig_frac(df_mean_length, 'DTI', 'MSMT')
# print("Fraction of significant paired differences Mean Length: MSMT vs DTI")
# print(sig_frac_mean_length_dm)
# sig_frac_count_gm = sig_frac(df_count, 'MSMT', 'GQI')
# print("Fraction of significant paired differences Streamline Count: MSMT vs GQI")
# print(sig_frac_count_gm)
# sig_frac_vol_gm = sig_frac(df_vol, 'MSMT', 'GQI')
# print("Fraction of significant paired differences Volume Weighted Streamline Count: MSMT vs GQI")
# print(sig_frac_vol_gm)
# sig_frac_mean_length_gm = sig_frac(df_mean_length, 'GQI', 'MSMT')
# print("Fraction of significant paired differences Mean Length: GQI vs MSMT")
# print(sig_frac_mean_length_gm)
# sig_frac_count_dg = sig_frac(df_count, 'GQI', 'DTI')
# print("Fraction of significant paired differences Streamline Count: GQI vs DTI")
# print(sig_frac_count_dg)
# sig_frac_vol_dg = sig_frac(df_vol, 'GQI', 'DTI')
# print("Fraction of significant paired differences Volume Weighted Streamline Count: GQI vs DTI")
# print(sig_frac_vol_dg)
# sig_frac_mean_length_dg = sig_frac(df_mean_length, 'DTI', 'GQI')
# print("Fraction of significant paired differences Mean Length: DTI vs GQI")
# print(sig_frac_mean_length_dg)

# Fraction of significant paired differences Streamline Count: MSMT vs DTI
# 0.1
# Fraction of significant paired differences Volume Weighted Streamline Count: MSMT vs DTI
# 0.0
# Fraction of significant paired differences Mean Length: MSMT vs DTI
# 0.0
# Fraction of significant paired differences Streamline Count: MSMT vs GQI
# 0.08
# Fraction of significant paired differences Volume Weighted Streamline Count: MSMT vs GQI
# 0.0
# Fraction of significant paired differences Mean Length: GQI vs MSMT
# 0.0
# Fraction of significant paired differences Streamline Count: GQI vs DTI
# 0.2
# Fraction of significant paired differences Volume Weighted Streamline Count: GQI vs DTI
# 0.08
# Fraction of significant paired differences Mean Length: DTI vs GQI
# 0.0


Fraction of significant paired differences Streamline Count: DTI vs MSMT
0.36
Mean Test Statistic, Mean p-value, Mean Effect Size for Streamline Count: DTI vs MSMT
(-1.716862891463359, 0.2820768212181645, -0.36854612480245574, 85.5, 0.4580886363983154, 0.3701298701298701)
Fraction of significant paired differences Volume Weighted Streamline Count: DTI vs MSMT
0.9
Mean Test Statistic, Mean p-value, Mean Effect Size for Volume Weighted Streamline Count: DTI vs MSMT
(-6.109079351107941, 0.010810647001937974, -1.3113904040656958, 18.0, 0.00057442982991535, 0.07792207792207789)
Fraction of significant paired differences Mean Length: MSMT vs DTI
0.98
Mean Test Statistic, Mean p-value, Mean Effect Size for Mean Length: MSMT vs DTI
(-14.104268544875435, 0.0010585575498094915, -3.0276579109683373, 1.2105263157894737, 1.9826387104235198e-06, 0.005240373661426279)
Fraction of significant paired differences Streamline Count: GQI vs MSMT
0.38
Mean Test Statistic, Mean p-value, Mean Effect Size for 

In [51]:
# read in z-score paired t-test results from all batches in main_path/stats with walk length 4
verbose = False
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/'
# main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v3/scrambled_dataset/'
file_list = Path(main_path+'/stats').glob('volume_*_walk_length_4_z_score_ttest_rel_results.csv')
df_ii = pd.DataFrame()
for file in file_list:
    if verbose:
        print(file)
    df_ii = pd.concat([df,pd.read_csv(file)],axis=0)

df_dm = get_df_int(df=df_ii, recon1='DTI Node Volume Weighted Streamline Count', recon2='MSMT CSD SIFT2 Node Volume Weighted Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dm)
# save out mean, std, max, min to one csv
df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_paired_ttest_results_mean.csv')
df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_paired_ttest_results_std.csv')
df_dm.max().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_paired_ttest_results_max.csv')
df_dm.min().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_paired_ttest_results_min.csv')

df_gm = get_df_int(df=df_ii, recon1='GQI Node Volume Weighted Streamline Count', recon2='MSMT CSD SIFT2 Node Volume Weighted Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_gm)
# save out mean, std, max, min to one csv
df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_paired_ttest_results_mean.csv')
df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_paired_ttest_results_std.csv')
df_gm.max().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_paired_ttest_results_max.csv')
df_gm.min().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_paired_ttest_results_min.csv')

df_dq = get_df_int(df=df_ii, recon1='DTI Node Volume Weighted Streamline Count', recon2='GQI Node Volume Weighted Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dq)
# save out mean, std, max, min to one csv
df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_paired_ttest_results_mean.csv')
df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_paired_ttest_results_std.csv')
df_dq.max().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_paired_ttest_results_max.csv')
df_dq.min().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_paired_ttest_results_min.csv')

# read in z-score paired t-test results from all batches in main_path/stats with walk length 4
verbose = False
df_vol_ttest = pd.DataFrame()
df_vol_ttest = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)
df_vol_ttest.columns = ['DTI_MSMT_mean','DTI_MSMT_std','GQI_MSMT_mean','GQI_MSMT_std','DTI_GQI_mean','DTI_GQI_std']
df_vol_ttest.to_csv(main_path+'/stats/walk_4_volume_weighted_paired_ttest_results.csv')

  df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_paired_ttest_results_mean.csv')
  df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_paired_ttest_results_std.csv')
  df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_paired_ttest_results_mean.csv')
  df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_paired_ttest_results_std.csv')
  df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_paired_ttest_results_mean.csv')
  df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_paired_ttest_results_std.csv')
  df_vol_ttest = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)


In [52]:
# read in z-score wilcoxon signed rank test results from all batches in main_path/stats with walk length 4
verbose = False
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/'
# main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v3/scrambled_dataset/'
file_list = Path(main_path+'/stats').glob('volume_*_walk_length_4_wilcoxon_results.csv')
df_ii = pd.DataFrame()
for file in file_list:
    if verbose:
        print(file)
    df_ii = pd.concat([df,pd.read_csv(file)],axis=0)

df_dm = get_df_int(df=df_ii, recon1='DTI Node Volume Weighted Streamline Count', recon2='MSMT CSD SIFT2 Node Volume Weighted Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dm)
# save out mean, std, max, min to one csv
df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_wilcoxon_results_mean.csv')
df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_wilcoxon_results_std.csv')
df_dm.max().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_wilcoxon_results_max.csv')
df_dm.min().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_wilcoxon_results_min.csv')

df_gm = get_df_int(df=df_ii, recon1='GQI Node Volume Weighted Streamline Count', recon2='MSMT CSD SIFT2 Node Volume Weighted Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_gm)
# save out mean, std, max, min to one csv
df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_wilcoxon_results_mean.csv')
df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_wilcoxon_results_std.csv')
df_gm.max().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_wilcoxon_results_max.csv')
df_gm.min().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_wilcoxon_results_min.csv')

df_dq = get_df_int(df=df_ii, recon1='DTI Node Volume Weighted Streamline Count', recon2='GQI Node Volume Weighted Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dq)
# save out mean, std, max, min to one csv
df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_wilcoxon_results_mean.csv')
df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_wilcoxon_results_std.csv')
df_dq.max().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_wilcoxon_results_max.csv')
df_dq.min().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_wilcoxon_results_min.csv')

# combine all volume weighted wilcoxon signed rank test results at walk length 4
verbose = False
df_vol_wil = pd.DataFrame()
df_vol_wil = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)
df_vol_wil.columns = ['DTI_MSMT_mean','DTI_MSMT_std','GQI_MSMT_mean','GQI_MSMT_std','DTI_GQI_mean','DTI_GQI_std']
df_vol_wil.to_csv(main_path+'/stats/walk_4_volume_weighted_wilcoxon_results.csv')

  df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_wilcoxon_results_mean.csv')
  df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_volume_weighted_wilcoxon_results_std.csv')
  df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_wilcoxon_results_mean.csv')
  df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_volume_weighted_wilcoxon_results_std.csv')
  df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_wilcoxon_results_mean.csv')
  df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_volume_weighted_wilcoxon_results_std.csv')
  df_vol_wil = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)


In [53]:
# read in z-score paired t-test results from all batches in main_path/stats with walk length 4
verbose = False
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/'
# main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v3/scrambled_dataset/'
file_list = Path(main_path+'/stats').glob('count_*_walk_length_4_z_score_ttest_rel_results.csv')
df_ii = pd.DataFrame()
for file in file_list:
    if verbose:
        print(file)
    df_ii = pd.concat([df,pd.read_csv(file)],axis=0)

df_dm = get_df_int(df=df_ii, recon1='DTI Streamline Count', recon2='MSMT CSD SIFT2 Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dm)
# save out mean, std, max, min to one csv
df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_paired_ttest_results_mean.csv')
df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_paired_ttest_results_std.csv')
df_dm.max().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_paired_ttest_results_max.csv')
df_dm.min().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_paired_ttest_results_min.csv')

df_gm = get_df_int(df=df_ii, recon1='GQI Streamline Count', recon2='MSMT CSD SIFT2 Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_gm)
# save out mean, std, max, min to one csv
df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_paired_ttest_results_mean.csv')
df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_paired_ttest_results_std.csv')
df_gm.max().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_paired_ttest_results_max.csv')
df_gm.min().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_paired_ttest_results_min.csv')

df_dq = get_df_int(df=df_ii, recon1='DTI Streamline Count', recon2='GQI Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dq)
# save out mean, std, max, min to one csv
df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_paired_ttest_results_mean.csv')
df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_paired_ttest_results_std.csv')
df_dq.max().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_paired_ttest_results_max.csv')
df_dq.min().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_paired_ttest_results_min.csv')


# combine all count t-test results at walk length 4
verbose = False
df_count_ttest = pd.DataFrame()
df_count_ttest = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)
df_count_ttest.columns = ['DTI_MSMT_mean','DTI_MSMT_std','GQI_MSMT_mean','GQI_MSMT_std','DTI_GQI_mean','DTI_GQI_std']
df_count_ttest.to_csv(main_path+'/stats/walk_4_count_paired_ttest_results.csv')

  df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_paired_ttest_results_mean.csv')
  df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_paired_ttest_results_std.csv')
  df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_paired_ttest_results_mean.csv')
  df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_paired_ttest_results_std.csv')
  df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_paired_ttest_results_mean.csv')
  df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_paired_ttest_results_std.csv')
  df_count_ttest = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)


In [54]:
# read in z-score wilcoxon signed rank test results from all batches in main_path/stats with walk length 4
verbose = False
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/'
# main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v3/scrambled_dataset/'
file_list = Path(main_path+'/stats').glob('count_*_walk_length_4_wilcoxon_results.csv')
df_ii = pd.DataFrame()
for file in file_list:
    if verbose:
        print(file)
    df_ii = pd.concat([df,pd.read_csv(file)],axis=0)

df_dm = get_df_int(df=df_ii, recon1='DTI Streamline Count', recon2='MSMT CSD SIFT2 Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dm)
# save out mean, std, max, min to one csv
df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_wilcoxon_results_mean.csv')
df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_wilcoxon_results_std.csv')
df_dm.max().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_wilcoxon_results_max.csv')
df_dm.min().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_wilcoxon_results_min.csv')

df_gm = get_df_int(df=df_ii, recon1='GQI Streamline Count', recon2='MSMT CSD SIFT2 Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_gm)
# save out mean, std, max, min to one csv
df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_wilcoxon_results_mean.csv')
df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_wilcoxon_results_std.csv')
df_gm.max().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_wilcoxon_results_max.csv')
df_gm.min().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_wilcoxon_results_min.csv')

df_dq = get_df_int(df=df_ii, recon1='DTI Streamline Count', recon2='GQI Streamline Count', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dq)
# save out mean, std, max, min to one csv
df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_wilcoxon_results_mean.csv')
df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_wilcoxon_results_std.csv')
df_dq.max().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_wilcoxon_results_max.csv')
df_dq.min().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_wilcoxon_results_min.csv')

# combine all volume weighted wilcoxon signed rank test results at walk length 4
verbose = False
df_count_wil = pd.DataFrame()
df_count_wil = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)
df_count_wil.columns = ['DTI_MSMT_mean','DTI_MSMT_std','GQI_MSMT_mean','GQI_MSMT_std','DTI_GQI_mean','DTI_GQI_std']
df_count_wil.to_csv(main_path+'/stats/walk_4_count_wilcoxon_results.csv')

  df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_wilcoxon_results_mean.csv')
  df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_count_wilcoxon_results_std.csv')
  df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_wilcoxon_results_mean.csv')
  df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_count_wilcoxon_results_std.csv')
  df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_wilcoxon_results_mean.csv')
  df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_count_wilcoxon_results_std.csv')
  df_count_wil = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)


In [55]:
# read in shapiro-wilk results from all batches in main_path/stats with walk length 4
verbose = False
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/'

shapiro_file_list = Path(main_path).glob('volume_*_walk_length_4_shapiro_wilk_results.csv')
df_sh = pd.DataFrame()
for sh_file in shapiro_file_list:
    if verbose:
        print(sh_file)
    df_sh = pd.concat([df_sh,pd.read_csv(sh_file)],axis=0)
df_sh_dm = get_df_int(df=df_sh, recon1='DTI Node Volume Weighted Streamline Count', recon2='MSMT CSD SIFT2 Node Volume Weighted Streamline Count', i=None)
# get mean and standard deviation of W statistic and p-value
if verbose:
    print_stats(df=df_sh_dm)
# save out mean, std to one csv
df_sh_dm.mean().to_csv(main_path+'/walk_4_DTI_msmt_volume_weighted_shapiro_wilk_results_mean.csv')
df_sh_dm.std().to_csv(main_path+'/walk_4_DTI_msmt_volume_weighted_shapiro_wilk_results_std.csv')

df_sh_gm = get_df_int(df=df_sh, recon1='GQI Node Volume Weighted Streamline Count', recon2='MSMT CSD SIFT2 Node Volume Weighted Streamline Count', i=None)
# get mean and standard deviation of W statistic and p-value
if verbose:
    print_stats(df=df_sh_gm)
# save out mean, std to one csv
df_sh_gm.mean().to_csv(main_path+'/walk_4_GQI_msmt_volume_weighted_shapiro_wilk_results_mean.csv')
df_sh_gm.std().to_csv(main_path+'/walk_4_GQI_msmt_volume_weighted_shapiro_wilk_results_std.csv')

df_sh_dq = get_df_int(df=df_sh, recon1='DTI Node Volume Weighted Streamline Count', recon2='GQI Node Volume Weighted Streamline Count', i=None)
# get mean and standard deviation of W statistic and p-value
if verbose:
    print_stats(df=df_sh_dq)
# save out mean, std to one csv
df_sh_dq.mean().to_csv(main_path+'/walk_4_DTI_GQI_volume_weighted_shapiro_wilk_results_mean.csv')
df_sh_dq.std().to_csv(main_path+'/walk_4_DTI_GQI_volume_weighted_shapiro_wilk_results_std.csv')

# read in mean and std files for shapiro-wilk at walk 4, summarize results for different recon1 reecon2 combinations
verbose = False
shapiro_list = ['walk_4_DTI_msmt_volume_weighted_shapiro_wilk_results_mean.csv',
              'walk_4_DTI_msmt_volume_weighted_shapiro_wilk_results_std.csv',
              'walk_4_DTI_GQI_volume_weighted_shapiro_wilk_results_mean.csv',
              'walk_4_DTI_GQI_volume_weighted_shapiro_wilk_results_std.csv',
              'walk_4_GQI_msmt_volume_weighted_shapiro_wilk_results_mean.csv',
              'walk_4_GQI_msmt_volume_weighted_shapiro_wilk_results_std.csv']
df_summary = pd.DataFrame()
for s_file in shapiro_list:
    if verbose:
        print(s_file)
    df = pd.read_csv(main_path+'/'+s_file, index_col=0)
    df_summary = pd.concat([df_summary,df],axis=1)

df_summary.columns = ['DTI MSMT mean', 'DTI MSMT std', 'DTI GQI mean', 'DTI GQI std', 'GQI MSMT mean', 'GQI MSMT std']
df_summary.to_csv(main_path+'/walk_4_volume_weighted_shapiro_wilk_summary.csv')


shapiro_file_list = Path(main_path).glob('count_*_walk_length_4_shapiro_wilk_results.csv')
df_sh = pd.DataFrame()
for sh_file in shapiro_file_list:
    if verbose:
        print(sh_file)
    df_sh = pd.concat([df_sh,pd.read_csv(sh_file)],axis=0)
df_sh_dm = get_df_int(df=df_sh, recon1='DTI Streamline Count', recon2='MSMT CSD SIFT2 Streamline Count', i=None)
# get mean and standard deviation of W statistic and p-value
if verbose:
    print_stats(df=df_sh_dm)
# save out mean, std to one csv
df_sh_dm.mean().to_csv(main_path+'/walk_4_DTI_msmt_count_shapiro_wilk_results_mean.csv')
df_sh_dm.std().to_csv(main_path+'/walk_4_DTI_msmt_count_shapiro_wilk_results_std.csv')

df_sh_gm = get_df_int(df=df_sh, recon1='GQI Streamline Count', recon2='MSMT CSD SIFT2 Streamline Count', i=None)
# get mean and standard deviation of W statistic and p-value
if verbose:
    print_stats(df=df_sh_gm)
# save out mean, std to one csv
df_sh_gm.mean().to_csv(main_path+'/walk_4_GQI_msmt_count_shapiro_wilk_results_mean.csv')
df_sh_gm.std().to_csv(main_path+'/walk_4_GQI_msmt_count_shapiro_wilk_results_std.csv')

df_sh_dq = get_df_int(df=df_sh, recon1='DTI Streamline Count', recon2='GQI Streamline Count', i=None)
# get mean and standard deviation of W statistic and p-value
if verbose:
    print_stats(df=df_sh_dq)
# save out mean, std to one csv
df_sh_dq.mean().to_csv(main_path+'/walk_4_DTI_GQI_count_shapiro_wilk_results_mean.csv')
df_sh_dq.std().to_csv(main_path+'/walk_4_DTI_GQI_count_shapiro_wilk_results_std.csv')

# read in mean and std files for shapiro-wilk at walk 4, summarize results for different recon1 reecon2 combinations
verbose = False
shapiro_list = ['walk_4_DTI_msmt_count_shapiro_wilk_results_mean.csv',
              'walk_4_DTI_msmt_count_shapiro_wilk_results_std.csv',
              'walk_4_DTI_GQI_count_shapiro_wilk_results_mean.csv',
              'walk_4_DTI_GQI_count_shapiro_wilk_results_std.csv',
              'walk_4_GQI_msmt_count_shapiro_wilk_results_mean.csv',
              'walk_4_GQI_msmt_count_shapiro_wilk_results_std.csv']
df_summary = pd.DataFrame()
for s_file in shapiro_list:
    if verbose:
        print(s_file)
    df = pd.read_csv(main_path+'/'+s_file, index_col=0)
    df_summary = pd.concat([df_summary,df],axis=1)

df_summary.columns = ['DTI MSMT mean', 'DTI MSMT std', 'DTI GQI mean', 'DTI GQI std', 'GQI MSMT mean', 'GQI MSMT std']
df_summary.to_csv(main_path+'/walk_4_count_shapiro_wilk_summary.csv')




  df_sh_dm.mean().to_csv(main_path+'/walk_4_DTI_msmt_volume_weighted_shapiro_wilk_results_mean.csv')
  df_sh_dm.std().to_csv(main_path+'/walk_4_DTI_msmt_volume_weighted_shapiro_wilk_results_std.csv')
  df_sh_gm.mean().to_csv(main_path+'/walk_4_GQI_msmt_volume_weighted_shapiro_wilk_results_mean.csv')
  df_sh_gm.std().to_csv(main_path+'/walk_4_GQI_msmt_volume_weighted_shapiro_wilk_results_std.csv')
  df_sh_dq.mean().to_csv(main_path+'/walk_4_DTI_GQI_volume_weighted_shapiro_wilk_results_mean.csv')
  df_sh_dq.std().to_csv(main_path+'/walk_4_DTI_GQI_volume_weighted_shapiro_wilk_results_std.csv')
  df_sh_dm.mean().to_csv(main_path+'/walk_4_DTI_msmt_count_shapiro_wilk_results_mean.csv')
  df_sh_dm.std().to_csv(main_path+'/walk_4_DTI_msmt_count_shapiro_wilk_results_std.csv')
  df_sh_gm.mean().to_csv(main_path+'/walk_4_GQI_msmt_count_shapiro_wilk_results_mean.csv')
  df_sh_gm.std().to_csv(main_path+'/walk_4_GQI_msmt_count_shapiro_wilk_results_std.csv')
  df_sh_dq.mean().to_csv(main_path+'/walk_

In [56]:
# read in z-score paired t-test results from all batches in main_path/stats with walk length 4
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/'
# main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v3/scrambled_dataset/'
file_list = Path(main_path+'/stats').glob('count_*_walk_length_4_z_score_ttest_rel_results.csv')
df = pd.DataFrame()
for file in file_list:
    print(file)
    df = pd.concat([df,pd.read_csv(file)],axis=0)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is GQI Streamline Count and Recon 2 is MSMT CSD SIFT2 Streamline Count
print('mean')
print(df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].mean())
print('std')
print(df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].std())
print('max')
print(df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].max())
print('min')
print(df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].min())

# save out mean, std, max, min to csv
df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].mean().to_csv(main_path+'/stats/walk_4_gqi_msmt_count_paired_ttest_results_mean.csv')
df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].std().to_csv(main_path+'/stats/walk_4_gqi_msmt_count_paired_ttest_results_std.csv')
df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].max().to_csv(main_path+'/stats/walk_4_gqi_msmt_count_paired_ttest_results_max.csv')
df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].min().to_csv(main_path+'/stats/walk_4_gqi_msmt_count_paired_ttest_results_min.csv')

/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/count_all_percent_batch41_walk_length_4_z_score_ttest_rel_results.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/count_all_percent_batch11_walk_length_4_z_score_ttest_rel_results.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/count_all_percent_batch29_walk_length_4_z_score_ttest_rel_results.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/count_all_percent_batch4_walk_length_4_z_score_ttest_rel_results.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/count_all_percent_batch12_walk_length_4_z_score_ttest_rel_results.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/count_all_percent_batch2_walk_length_4_z_score_ttest_rel_results.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/count_all_percent_batch48_walk_length_4_z_score_ttest_rel_results.csv
/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats/count_all_percent_batch28_walk_length_4_z_score_ttest_re

  print(df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].mean())
  print(df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].std())
  df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].mean().to_csv(main_path+'/stats/walk_4_gqi_msmt_count_paired_ttest_results_mean.csv')
  df[(df['Recon 1']=='GQI Streamline Count') & (df['Recon 2']=='MSMT CSD SIFT2 Streamline Count')].std().to_csv(main_path+'/stats/walk_4_gqi_msmt_count_paired_ttest_results_std.csv')


In [57]:
# read in paired t-test results from all batches in main_path/stats with walk length 4
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/'
# main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v3/scrambled_dataset/'
file_list = Path(main_path+'/stats').glob('mean_path_length_*_walk_length_4_z_score_ttest_rel_results.csv')
df_ii = pd.DataFrame()
for file in file_list:
    if verbose:
        print(file)
    df_ii = pd.concat([df,pd.read_csv(file)],axis=0)
df_dm = get_df_int(df=df_ii, recon1='DTI Mean Length', recon2='MSMT CSD Mean Length', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dm)
# save out mean, std, max, min to one csv
df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_paired_ttest_results_mean.csv')
df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_paired_ttest_results_std.csv')
df_dm.max().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_paired_ttest_results_max.csv')
df_dm.min().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_paired_ttest_results_min.csv')

df_gm = get_df_int(df=df_ii, recon1='GQI Mean Length', recon2='MSMT CSD Mean Length', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_gm)
# save out mean, std, max, min to one csv
df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_paired_ttest_results_mean.csv')
df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_paired_ttest_results_std.csv')
df_gm.max().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_paired_ttest_results_max.csv')
df_gm.min().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_paired_ttest_results_min.csv')

df_dq = get_df_int(df=df_ii, recon1='DTI Mean Length', recon2='GQI Mean Length', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dq)
# save out mean, std, max, min to one csv
df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_mean_length_paired_ttest_results_mean.csv')
df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_mean_length_paired_ttest_results_std.csv')
df_dq.max().to_csv(main_path+'/stats/walk_4_DTI_GQI_mean_length_paired_ttest_results_max.csv')
df_dq.min().to_csv(main_path+'/stats/walk_4_DTI_GQI_mean_length_paired_ttest_results_min.csv')

# combine all volume weighted wilcoxon signed rank test results at walk length 4
verbose = False
df_ml_wil = pd.DataFrame()
df_ml_wil = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)
df_ml_wil.columns = ['DTI_MSMT_mean','DTI_MSMT_std','GQI_MSMT_mean','GQI_MSMT_std','DTI_GQI_mean','DTI_GQI_std']
df_ml_wil.to_csv(main_path+'/stats/walk_4_mean_length_paired_ttest_results.csv')



# main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v3/scrambled_dataset/'
file_list = Path(main_path+'/stats').glob('mean_path_length_*_walk_length_4_wilcoxon_results.csv')
df_ii = pd.DataFrame()
for file in file_list:
    if verbose:
        print(file)
    df_ii = pd.concat([df,pd.read_csv(file)],axis=0)
df_dm = get_df_int(df=df_ii, recon1='DTI Mean Length', recon2='MSMT CSD Mean Length', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dm)
# save out mean, std, max, min to one csv
df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_wilcoxon_results_mean.csv')
df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_wilcoxon_results_std.csv')
df_dm.max().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_wilcoxon_results_max.csv')
df_dm.min().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_wilcoxon_results_min.csv')

df_gm = get_df_int(df=df_ii, recon1='GQI Mean Length', recon2='MSMT CSD Mean Length', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_gm)
# save out mean, std, max, min to one csv
df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_wilcoxon_results_mean.csv')
df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_wilcoxon_results_std.csv')
df_gm.max().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_wilcoxon_results_max.csv')
df_gm.min().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_wilcoxon_results_min.csv')

df_dq = get_df_int(df=df_ii, recon1='DTI Mean Length', recon2='GQI Mean Length', i=None)
# get mean and standard deviation, max, min of t-statistic and p-value for the row where Recon 1 is DTI Node Volume Weighted Streamline Count and Recon 2 is MSMT CSD SIFT2 Node Volume Weighted Streamline Count
if verbose:
    print_stats(df=df_dq)
# save out mean, std, max, min to one csv
df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_mean_length_wilcoxon_results_mean.csv')
df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_mean_length_wilcoxon_results_std.csv')
df_dq.max().to_csv(main_path+'/stats/walk_4_DTI_GQI_mean_length_wilcoxon_results_max.csv')
df_dq.min().to_csv(main_path+'/stats/walk_4_DTI_GQI_mean_length_wilcoxon_results_min.csv')

# combine all volume weighted wilcoxon signed rank test results at walk length 4
verbose = False
df_ml_wil = pd.DataFrame()
df_ml_wil = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)
df_ml_wil.columns = ['DTI_MSMT_mean','DTI_MSMT_std','GQI_MSMT_mean','GQI_MSMT_std','DTI_GQI_mean','DTI_GQI_std']
df_ml_wil.to_csv(main_path+'/stats/walk_4_mean_length_wilcoxon_results.csv')

  df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_paired_ttest_results_mean.csv')
  df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_paired_ttest_results_std.csv')
  df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_paired_ttest_results_mean.csv')
  df_gm.std().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_paired_ttest_results_std.csv')
  df_dq.mean().to_csv(main_path+'/stats/walk_4_DTI_GQI_mean_length_paired_ttest_results_mean.csv')
  df_dq.std().to_csv(main_path+'/stats/walk_4_DTI_GQI_mean_length_paired_ttest_results_std.csv')
  df_ml_wil = pd.concat([df_dm.mean(),df_dm.std(),df_gm.mean(),df_gm.std(),df_dq.mean(),df_dq.std()],axis=1)
  df_dm.mean().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_wilcoxon_results_mean.csv')
  df_dm.std().to_csv(main_path+'/stats/walk_4_DTI_msmt_mean_length_wilcoxon_results_std.csv')
  df_gm.mean().to_csv(main_path+'/stats/walk_4_GQI_msmt_mean_length_wilcoxon_results_mean.csv')
  df_gm.std()

In [58]:
main_path = '/home/paul/thesis/dev/SAY_sf_prediction_v4/dataset/stats'
shapiro_file_list = Path(main_path).glob('mean_path_length_*_walk_length_4_shapiro_wilk_results.csv')
df_sh = pd.DataFrame()
for sh_file in shapiro_file_list:
    if verbose:
        print(sh_file)
    df_sh = pd.concat([df_sh,pd.read_csv(sh_file)],axis=0)

df_sh_dm = get_df_int(df=df_sh, recon1='DTI Mean Length', recon2='MSMT CSD Mean Length', i=None)
# get mean and standard deviation of W statistic and p-value
if verbose:
    print_stats(df=df_sh_dm)
# save out mean, std to one csv
df_sh_dm.mean().to_csv(main_path+'/walk_4_DTI_msmt_mean_length_shapiro_wilk_results_mean.csv')
df_sh_dm.std().to_csv(main_path+'/walk_4_DTI_msmt_mean_length_shapiro_wilk_results_std.csv')

df_sh_gm = get_df_int(df=df_sh, recon1='GQI Mean Length', recon2='MSMT CSD Mean Length', i=None)
# get mean and standard deviation of W statistic and p-value
if verbose:
    print_stats(df=df_sh_gm)
# save out mean, std to one csv
df_sh_gm.mean().to_csv(main_path+'/walk_4_GQI_msmt_mean_length_shapiro_wilk_results_mean.csv')
df_sh_gm.std().to_csv(main_path+'/walk_4_GQI_msmt_mean_length_shapiro_wilk_results_std.csv')

df_sh_dq = get_df_int(df=df_sh, recon1='DTI Mean Length', recon2='GQI Mean Length', i=None)
# get mean and standard deviation of W statistic and p-value
if verbose:
    print_stats(df=df_sh_dq)
# save out mean, std to one csv
df_sh_dq.mean().to_csv(main_path+'/walk_4_DTI_GQI_mean_length_shapiro_wilk_results_mean.csv')
df_sh_dq.std().to_csv(main_path+'/walk_4_DTI_GQI_mean_length_shapiro_wilk_results_std.csv')

# read in mean and std files for shapiro-wilk at walk 4, summarize results for different recon1 reecon2 combinations
verbose = False
shapiro_list = ['walk_4_DTI_msmt_mean_length_shapiro_wilk_results_mean.csv',
              'walk_4_DTI_msmt_mean_length_shapiro_wilk_results_std.csv',
              'walk_4_DTI_GQI_mean_length_shapiro_wilk_results_mean.csv',
              'walk_4_DTI_GQI_mean_length_shapiro_wilk_results_std.csv',
              'walk_4_GQI_msmt_mean_length_shapiro_wilk_results_mean.csv',
              'walk_4_GQI_msmt_mean_length_shapiro_wilk_results_std.csv']
df_summary = pd.DataFrame()
for s_file in shapiro_list:
    if verbose:
        print(s_file)
    df = pd.read_csv(main_path+'/'+s_file, index_col=0)
    df_summary = pd.concat([df_summary,df],axis=1)

df_summary.columns = ['DTI MSMT mean', 'DTI MSMT std', 'DTI GQI mean', 'DTI GQI std', 'GQI MSMT mean', 'GQI MSMT std']
df_summary.to_csv(main_path+'/walk_4_mean_length_shapiro_wilk_summary.csv')


  df_sh_dm.mean().to_csv(main_path+'/walk_4_DTI_msmt_mean_length_shapiro_wilk_results_mean.csv')
  df_sh_dm.std().to_csv(main_path+'/walk_4_DTI_msmt_mean_length_shapiro_wilk_results_std.csv')
  df_sh_gm.mean().to_csv(main_path+'/walk_4_GQI_msmt_mean_length_shapiro_wilk_results_mean.csv')
  df_sh_gm.std().to_csv(main_path+'/walk_4_GQI_msmt_mean_length_shapiro_wilk_results_std.csv')
  df_sh_dq.mean().to_csv(main_path+'/walk_4_DTI_GQI_mean_length_shapiro_wilk_results_mean.csv')
  df_sh_dq.std().to_csv(main_path+'/walk_4_DTI_GQI_mean_length_shapiro_wilk_results_std.csv')
