In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from scipy import stats
from collections import OrderedDict

In [2]:
metrics  = ['dice', 'sd', 'hd']
basepath='/export/scratch3/grewal/OAR_segmentation/runs/final_experiments_new'
print(os.listdir(basepath))
experiments_info = OrderedDict({'unet-noisy': 'good-baseline-folds-noisy-data_05122022_160609',
                    'unet': 'good-baseline-folds-32',
                    'basic-teacher': 'basic-teacher-folds-32',
                    'basic-student': 'basic-teacher-basic-student-folds',
                    'robust-teacher': 'robust-teacher-folds_05122022_161207',
                    'robust-student': 'basic-teacher-robust-student-folds_02122022',
                    'robust-teacher-robust-student': 'robust-teacher-robust-student-folds',
                    'robust-teacher-robust-student - iter2': 'robust-teacher-robust-student-iter2-folds',
                    'robust-teacher-robust-student - iter3': 'robust-teacher-robust-student-iter3-folds',
                     })

['robust-teacher-robust-student-iter3-folds', 'robust-teacher-robust-student-iter2-folds', 'good-baseline-folds-noisy-data_05122022_160609', 'good-baseline-folds-32', 'basic-teacher-basic-student-folds', 'robust-teacher-folds_05122022_161207', 'robust-teacher-robust-student-folds', 'basic-teacher-folds-32', 'basic-teacher-robust-student-folds_02122022']


In [3]:
def concat_all_folds(experiment_path):
    """This function retrieves the test results located in test_postprocess and 
    returns the averages performance per organ and overall.
    """
    result = []
    all_df = []
    for filepath in glob.glob(experiment_path + '/*/*/test_postprocess/*.csv'):
        df = pd.read_csv(filepath)
        all_df.append(df)

    all_df = pd.concat(all_df, axis=0)
    # remove background
    all_df = all_df.loc[:, ~all_df.columns.str.contains('background')]
    # remove 'present labels' and 'mean_dice'
    all_df = all_df.drop(['present labels', 'mean_dice'], axis=1)
    return all_df
    


## OAR wise metrics

In [4]:
all_df = []
for experiment_name, experiment_path in experiments_info.items():
    experiment_fullpath = os.path.join(basepath, experiment_path)
    df = concat_all_folds(experiment_fullpath)

    # filter metrics
    all_metrics = []
    for metric in metrics:
        all_metrics.append(df.filter(like=metric))
    
    df_metrics = pd.concat(all_metrics, axis=1)

    # describe
    df_summary = df_metrics.describe().loc[["mean", "std"], :]
    df_summary["experiment_name"] = experiment_name

    all_df.append(df_summary)

df = pd.concat(all_df, axis=0)
print(df.head())
df.to_csv("../outputs/results_table.csv")

      dice_bowel_bag  dice_bladder  dice_hip  dice_rectum  sd_bowel_bag  \
mean        0.794705      0.892514  0.915695     0.735790      0.615531   
std         0.112442      0.160511  0.037321     0.139310      0.107703   
mean        0.847657      0.902419  0.919086     0.731505      0.663699   
std         0.067057      0.156189  0.022342     0.153214      0.092677   
mean        0.848819      0.906114  0.918095     0.741524      0.664528   

      sd_bladder    sd_hip  sd_rectum  hd_bowel_bag  hd_bladder     hd_hip  \
mean    0.882371  0.966181   0.745053     35.273295    7.835055   4.164103   
std     0.169874  0.047100   0.149898     24.767987   10.514225  20.143955   
mean    0.900714  0.970312   0.745246     19.343290    9.681867   2.933217   
std     0.163579  0.032738   0.155091     11.699754   37.379887   1.002871   
mean    0.905482  0.968602   0.754598     18.434255    7.959590   2.948901   

      hd_rectum experiment_name  
mean  16.979613      unet-noisy  
std   11.862

## per scan mean metrics

In [5]:
# make latex table
for i in range(0, len(df), 2):
    mean = df.iloc[i].to_list()[:-1]
    mean = [str(np.round(item, 4)) for item in mean]
    std = df.iloc[i+1].to_list()[:-1]
    std = [str(np.round(item, 4)) for item in std]
    entry_list = [f"& {item1} ({item2}) " for item1, item2 in zip(mean, std)]
    entry_str = "".join(entry_list)

    experiment_name = df.iloc[i].to_list()[-1]
    entry_str = experiment_name +  " " + entry_str + '\\'
    print(entry_str)

unet-noisy & 0.7947 (0.1124) & 0.8925 (0.1605) & 0.9157 (0.0373) & 0.7358 (0.1393) & 0.6155 (0.1077) & 0.8824 (0.1699) & 0.9662 (0.0471) & 0.7451 (0.1499) & 35.2733 (24.768) & 7.8351 (10.5142) & 4.1641 (20.144) & 16.9796 (11.8622) \
unet & 0.8477 (0.0671) & 0.9024 (0.1562) & 0.9191 (0.0223) & 0.7315 (0.1532) & 0.6637 (0.0927) & 0.9007 (0.1636) & 0.9703 (0.0327) & 0.7452 (0.1551) & 19.3433 (11.6998) & 9.6819 (37.3799) & 2.9332 (1.0029) & 17.798 (13.2601) \
basic-teacher & 0.8488 (0.0621) & 0.9061 (0.1459) & 0.9181 (0.025) & 0.7415 (0.1405) & 0.6645 (0.0868) & 0.9055 (0.1561) & 0.9686 (0.0384) & 0.7546 (0.141) & 18.4343 (9.9719) & 7.9596 (26.749) & 2.9489 (1.0336) & 17.1008 (12.607) \
basic-student & 0.8631 (0.0559) & 0.9213 (0.1143) & 0.9265 (0.0214) & 0.7695 (0.1263) & 0.6891 (0.0857) & 0.9313 (0.1197) & 0.9755 (0.0318) & 0.7896 (0.1292) & 17.2601 (10.4694) & 6.3063 (22.6983) & 2.8656 (1.0433) & 16.1102 (18.3493) \
robust-teacher & 0.8469 (0.0701) & 0.9023 (0.1543) & 0.9173 (0.024) & 0

In [6]:
all_df = []
for experiment_name, experiment_path in experiments_info.items():
    experiment_fullpath = os.path.join(basepath, experiment_path)
    df = concat_all_folds(experiment_fullpath)

    # all metrics average over OARs
    mean_metrics = {}
    for metric in metrics:
        mean_metrics[f"{metric}"] = df.filter(like=metric).mean(axis=1)

    df_metrics = pd.DataFrame.from_dict(mean_metrics)
    df_metrics["experiment_name"] = experiment_name
    all_df.append(df_metrics)

df = pd.concat(all_df, axis=0)

In [50]:
df.head()

Unnamed: 0,precision,recall,dice,hd,sd,experiment_name
0,0.8875,0.865,0.86,5.7725,0.85875,basic-student
1,0.8725,0.86,0.86,13.71775,0.78775,basic-student
2,0.9125,0.8825,0.895,9.184,0.864,basic-student
3,0.885,0.9725,0.93,5.259,0.915,basic-student
4,0.9175,0.885,0.8925,7.698,0.89125,basic-student


## describe and normality test
### We do Kolmogorov-Smirnov test because N>50

In [7]:
for experiment_name in experiments_info.keys():
    print(experiment_name)
    df_experiment = df.loc[df["experiment_name"]==experiment_name]
    print(df_experiment.describe().loc[["mean", "std"], :])
    print("")


# for experiment_name in experiments_info.keys():
#     print(experiment_name)
#     df_experiment = df.loc[df["experiment_name"]==experiment_name]
#     # kstest
#     for metric in metrics:
#         x = df_experiment.loc[:, metric].values
#         ks_result = stats.ks_1samp(x, stats.norm.cdf)
#         print(metric, ks_result)
#     print("")



unet-noisy
          dice        sd         hd
mean  0.834676  0.802284  16.063017
std   0.061577  0.068164   9.067847

unet
          dice        sd         hd
mean  0.850167  0.819993  12.439081
std   0.059150  0.065504  10.584513

basic-teacher
          dice        sd         hd
mean  0.853638  0.823302  11.610893
std   0.055356  0.061791   7.940141

basic-student
          dice        sd         hd
mean  0.870086  0.846372  10.635553
std   0.046246  0.051777   7.996050

robust-teacher
          dice        sd         hd
mean  0.853086  0.823047  11.567255
std   0.052472  0.057154   7.727776

robust-student
         dice        sd         hd
mean  0.87109  0.847620  10.392840
std   0.04275  0.048497   6.675385

robust-teacher-robust-student
          dice        sd        hd
mean  0.871571  0.848246  9.921549
std   0.041884  0.046790  4.715523

robust-teacher-robust-student - iter2
          dice        sd        hd
mean  0.873995  0.852945  9.852228
std   0.041282  0.046036  4.861

### distributions not normal, so Friedman's test for main effect

In [79]:
metrics = ["dice", "hd", "sd"]
for metric in metrics:
    data = []
    for experiment_name in experiments_info.keys():
        x = df.loc[df.experiment_name==experiment_name, metric].values
        data.append(x)
    
    test_stat = stats.friedmanchisquare(*data)
    print(metric)
    print(test_stat)
    print("")

precision
FriedmanchisquareResult(statistic=16.05325329202224, pvalue=0.0011058295831555085)

recall
FriedmanchisquareResult(statistic=408.57903879560035, pvalue=3.0677939118411136e-88)

dice
FriedmanchisquareResult(statistic=256.94183095842305, pvalue=2.062188102852081e-55)

hd
FriedmanchisquareResult(statistic=73.88942031758283, pvalue=6.267728008645628e-16)

sd
FriedmanchisquareResult(statistic=267.0085927057482, pvalue=1.3697488436069074e-57)



### significant main effect in all metrics, so wilcoxon signed rank test for post-hoc comparisons

In [88]:
metrics = ["dice", "hd", "sd"]
experiment_names = list(experiments_info.keys())
experiment_pairs = [(a, b) for idx, a in enumerate(experiment_names) for b in experiment_names[idx+1:]]
for metric in metrics:
    print(metric)
    data ={}
    for experiment_name in experiment_names:
        x = df.loc[df.experiment_name==experiment_name, metric].values
        data[experiment_name] = x
    
    for pair in experiment_pairs:
        x, y = data[pair[0]], data[pair[1]]
        test_stat = stats.wilcoxon(x, y)
        print(f"{pair[0]} vs. {pair[1]}", test_stat)
    print("")

dice
basic-student vs. robust-teacher WilcoxonResult(statistic=23161.5, pvalue=1.80360512459764e-36)
basic-student vs. robust-teacher-robust-student WilcoxonResult(statistic=60722.0, pvalue=0.718460130921065)
basic-student vs. robust-student WilcoxonResult(statistic=60930.5, pvalue=0.6538551549722539)
basic-student vs. unet-noisy WilcoxonResult(statistic=10994.5, pvalue=6.41158955300171e-61)
basic-student vs. unet WilcoxonResult(statistic=22854.5, pvalue=4.5126471640319576e-38)
basic-student vs. basic-teacher WilcoxonResult(statistic=20046.5, pvalue=2.20615275903994e-41)
robust-teacher vs. robust-teacher-robust-student WilcoxonResult(statistic=17704.0, pvalue=4.3389033085445e-46)
robust-teacher vs. robust-student WilcoxonResult(statistic=22318.5, pvalue=1.553205221617875e-38)
robust-teacher vs. unet-noisy WilcoxonResult(statistic=34556.5, pvalue=1.5302008162909071e-21)
robust-teacher vs. unet WilcoxonResult(statistic=62780.5, pvalue=0.7950701554635415)
robust-teacher vs. basic-teacher 

In [76]:
for grp in x.items():
    print(grp)

('dice', experiment_name
basic-student                    True
robust-student                   True
robust-teacher                   True
robust-teacher-robust-student    True
Name: dice, dtype: bool)
