In [34]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from scipy import stats

In [2]:
# Metric name fragments used throughout the notebook; the aggregation cell
# selects result columns whose name contains each of these strings.
metrics = [
    'precision',
    'recall',
    'dice',
    'hd',
    'sd',
]

In [16]:
def concat_all_folds(experiment_path):
    """Concatenate the post-processed test results of all folds of one experiment.

    Every CSV matching ``<experiment_path>/*/*/test_postprocess/*.csv`` is read
    and the frames are stacked row-wise. No averaging happens here; callers
    aggregate the returned rows themselves.

    Files are read in sorted path order so that the row order is reproducible
    (``glob.glob`` returns matches in arbitrary order).
    NOTE(review): the paired tests later in the notebook presumably need rows
    to line up across experiments -- confirm that sorted path order yields the
    same case order for every experiment.

    Parameters
    ----------
    experiment_path : str
        Root directory of a single experiment.

    Returns
    -------
    pandas.DataFrame
        All rows of all matched CSV files (per-file index kept as read), with
        every column whose name contains ``'background'`` removed and the
        ``'present labels'`` and ``'mean_dice'`` columns dropped.

    Raises
    ------
    ValueError
        If no CSV file matches the pattern.
    KeyError
        If ``'present labels'`` or ``'mean_dice'`` is not a column of the
        concatenated frame.
    """
    pattern = os.path.join(experiment_path, '*', '*', 'test_postprocess', '*.csv')
    filepaths = sorted(glob.glob(pattern))
    if not filepaths:
        # Fail with an actionable message instead of pandas' generic
        # "No objects to concatenate".
        raise ValueError(f"No result CSV files found matching {pattern!r}")

    all_df = pd.concat([pd.read_csv(filepath) for filepath in filepaths], axis=0)
    # remove background
    all_df = all_df.loc[:, ~all_df.columns.str.contains('background')]
    # remove 'present labels' and 'mean_dice'
    all_df = all_df.drop(['present labels', 'mean_dice'], axis=1)
    return all_df
    


In [85]:
# Root directory that holds one sub-directory per experiment. It defaults to
# the original cluster location and can be overridden with the
# OAR_RUNS_BASEPATH environment variable, so the notebook can run where the
# results live under a different path.
basepath = os.environ.get(
    'OAR_RUNS_BASEPATH',
    '/export/scratch3/grewal/OAR_segmentation/runs/final_experiments_new',
)
# Last expression of the cell: displays the available experiment directories.
os.listdir(basepath)


['good-baseline-folds-noisy-data_05122022_160609',
 'good-baseline-folds-32',
 'basic-teacher-basic-student-folds',
 'robust-teacher-folds_05122022_161207',
 'robust-teacher-robust-student-folds',
 'basic-teacher-folds-32',
 'basic-teacher-robust-student-folds_02122022']

In [86]:
# Display name of each experiment -> its run directory below `basepath`.
experiments_info = {
    'basic-student': 'basic-teacher-basic-student-folds',
    'robust-teacher': 'robust-teacher-folds_05122022_161207',
    'robust-teacher-robust-student': 'robust-teacher-robust-student-folds',
    'robust-student': 'basic-teacher-robust-student-folds_02122022',
    'unet-noisy': 'good-baseline-folds-noisy-data_05122022_160609',
    'unet': 'good-baseline-folds-32',
    'basic-teacher': 'basic-teacher-folds-32',
}

# Build one table per experiment (one column per metric plus the experiment
# label) and stack them into `df`.
all_df = []
for experiment_name, experiment_path in experiments_info.items():
    fold_results = concat_all_folds(os.path.join(basepath, experiment_path))

    # Row-wise mean over every column whose name contains the metric string.
    # NOTE(review): `filter(like=...)` is a substring match, so short fragments
    # such as 'hd' or 'sd' would also pick up unrelated columns containing
    # them -- confirm against the CSV header.
    per_metric_mean = {
        metric: fold_results.filter(like=metric).mean(axis=1)
        for metric in metrics
    }

    summary = pd.DataFrame.from_dict(per_metric_mean).assign(
        experiment_name=experiment_name
    )
    all_df.append(summary)

df = pd.concat(all_df, axis=0)

In [50]:
# Preview the first rows of the combined per-experiment metric table.
df.head()

Unnamed: 0,precision,recall,dice,hd,sd,experiment_name
0,0.8875,0.865,0.86,5.7725,0.85875,basic-student
1,0.8725,0.86,0.86,13.71775,0.78775,basic-student
2,0.9125,0.8825,0.895,9.184,0.864,basic-student
3,0.885,0.9725,0.93,5.259,0.915,basic-student
4,0.9175,0.885,0.8925,7.698,0.89125,basic-student


## Descriptive statistics and normality test
### We use the Kolmogorov-Smirnov test because N > 50

In [87]:
# Print the mean and standard deviation of each numeric column, one
# experiment at a time, in the order given by `experiments_info`.
for name in experiments_info:
    print(name)
    rows = df[df["experiment_name"] == name]
    print(rows.describe().loc[["mean", "std"]])
    print("")


# Disabled normality check, kept for reference.
# NOTE(review): `stats.ks_1samp(x, stats.norm.cdf)` compares x against the
# *standard* normal (mean 0, std 1). The raw metric values are not
# standardized here, so standardize x or pass loc/scale through `args`
# before relying on this test.
# for experiment_name in experiments_info.keys():
#     print(experiment_name)
#     df_experiment = df.loc[df["experiment_name"]==experiment_name]
#     # kstest
#     for metric in metrics:
#         x = df_experiment.loc[:, metric].values
#         ks_result = stats.ks_1samp(x, stats.norm.cdf)
#         print(metric, ks_result)
#     print("")



basic-student
          dice         hd        sd
mean  0.870086  10.635553  0.846372
std   0.046246   7.996050  0.051777

robust-teacher
          dice         hd        sd
mean  0.853086  11.567255  0.823047
std   0.052472   7.727776  0.057154

robust-teacher-robust-student
          dice        hd        sd
mean  0.871571  9.921549  0.848246
std   0.041884  4.715523  0.046790

robust-student
         dice         hd        sd
mean  0.87109  10.392840  0.847620
std   0.04275   6.675385  0.048497

unet-noisy
          dice         hd        sd
mean  0.834676  16.063017  0.802284
std   0.061577   9.067847  0.068164

unet
          dice         hd        sd
mean  0.850167  12.439081  0.819993
std   0.059150  10.584513  0.065504

basic-teacher
          dice         hd        sd
mean  0.853638  11.610893  0.823302
std   0.055356   7.940141  0.061791



### The distributions are not normal, so we use Friedman's test for the main effect

In [79]:
# Friedman test for a main effect of the experiment on each metric.
#
# The metric subset has its own name on purpose: rebinding the notebook-wide
# `metrics` list here would make a later re-run of the aggregation cell
# silently drop 'precision' and 'recall'.
friedman_metrics = ["dice", "hd", "sd"]
for metric in friedman_metrics:
    # One array of values per experiment, in `experiments_info` order.
    # friedmanchisquare treats position i of every array as the same subject,
    # so the rows of all experiments must be in the same order.
    data = [
        df.loc[df.experiment_name == experiment_name, metric].values
        for experiment_name in experiments_info
    ]

    test_stat = stats.friedmanchisquare(*data)
    print(metric)
    print(test_stat)
    print("")

precision
FriedmanchisquareResult(statistic=16.05325329202224, pvalue=0.0011058295831555085)

recall
FriedmanchisquareResult(statistic=408.57903879560035, pvalue=3.0677939118411136e-88)

dice
FriedmanchisquareResult(statistic=256.94183095842305, pvalue=2.062188102852081e-55)

hd
FriedmanchisquareResult(statistic=73.88942031758283, pvalue=6.267728008645628e-16)

sd
FriedmanchisquareResult(statistic=267.0085927057482, pvalue=1.3697488436069074e-57)



### Significant main effect in all metrics, so we use the Wilcoxon signed-rank test for post-hoc comparisons

In [88]:
# Post-hoc pairwise comparison of experiments with the Wilcoxon signed-rank
# test, run separately for each metric.
#
# The metric subset has its own name on purpose: rebinding the notebook-wide
# `metrics` list here would make a later re-run of the aggregation cell
# silently drop 'precision' and 'recall'.
posthoc_metrics = ["dice", "hd", "sd"]
experiment_names = list(experiments_info.keys())
# Every unordered pair of experiments, each listed once.
experiment_pairs = [
    (a, b)
    for idx, a in enumerate(experiment_names)
    for b in experiment_names[idx + 1:]
]
for metric in posthoc_metrics:
    print(metric)
    # Values of this metric per experiment; the test pairs position i of one
    # array with position i of the other.
    data = {
        name: df.loc[df.experiment_name == name, metric].values
        for name in experiment_names
    }

    # NOTE(review): the printed p-values are not corrected for the
    # len(experiment_pairs) comparisons made per metric.
    for first, second in experiment_pairs:
        test_stat = stats.wilcoxon(data[first], data[second])
        print(f"{first} vs. {second}", test_stat)
    print("")

dice
basic-student vs. robust-teacher WilcoxonResult(statistic=23161.5, pvalue=1.80360512459764e-36)
basic-student vs. robust-teacher-robust-student WilcoxonResult(statistic=60722.0, pvalue=0.718460130921065)
basic-student vs. robust-student WilcoxonResult(statistic=60930.5, pvalue=0.6538551549722539)
basic-student vs. unet-noisy WilcoxonResult(statistic=10994.5, pvalue=6.41158955300171e-61)
basic-student vs. unet WilcoxonResult(statistic=22854.5, pvalue=4.5126471640319576e-38)
basic-student vs. basic-teacher WilcoxonResult(statistic=20046.5, pvalue=2.20615275903994e-41)
robust-teacher vs. robust-teacher-robust-student WilcoxonResult(statistic=17704.0, pvalue=4.3389033085445e-46)
robust-teacher vs. robust-student WilcoxonResult(statistic=22318.5, pvalue=1.553205221617875e-38)
robust-teacher vs. unet-noisy WilcoxonResult(statistic=34556.5, pvalue=1.5302008162909071e-21)
robust-teacher vs. unet WilcoxonResult(statistic=62780.5, pvalue=0.7950701554635415)
robust-teacher vs. basic-teacher 

In [76]:
# NOTE(review): leftover inspection cell. The recorded output shows `x`
# yielding (name, boolean Series) pairs, but no visible cell creates such an
# object; the preceding cells bind `x` to a NumPy array, which has no
# `.items()` method. Presumably `x` came from a since-removed cell -- confirm,
# then define it here or drop this cell.
for grp in x.items():
    print(grp)

('dice', experiment_name
basic-student                    True
robust-student                   True
robust-teacher                   True
robust-teacher-robust-student    True
Name: dice, dtype: bool)
