In [1]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro, kruskal, mannwhitneyu
import matplotlib.pyplot as plt

In [2]:
defa_ft = pd.read_csv('data/scores/scores_default_finetune.csv')
defa_st = pd.read_csv('data/scores/scores_default_scratch.csv')
segr_ft = pd.read_csv('data/scores/scores_segresnet_finetune.csv')
segr_st = pd.read_csv('data/scores/scores_segresnet_scratch.csv')
tver_ft = pd.read_csv('data/scores/scores_tverskybce_finetune.csv')
tver_st = pd.read_csv('data/scores/scores_tverskybce_scratch.csv')

In [3]:
columns = defa_ft.columns.tolist()
columns = [col for col in columns if col != 'sample'] # ignore the ids
ids = defa_ft['sample'].tolist()

In [4]:
columns, ids

(['DICE_et',
  'DICE_tc',
  'DICE_wt',
  'Hausdorff_et',
  'Hausdorff_tc',
  'Hausdorff_wt',
  'Sensitivity_et',
  'Sensitivity_tc',
  'Sensitivity_wt',
  'Specificity_et',
  'Specificity_tc',
  'Specificity_wt',
  'True_volume_et',
  'True_volume_tc',
  'True_volume_wt',
  'Predicted_volume_et',
  'Predicted_volume_tc',
  'Predicted_volume_wt'],
 ['SBT-MET-B000730-20180517.nii.gz',
  'SBT-MET-A000726-20180528.nii.gz',
  'SBT-MET-A000727-20180724.nii.gz',
  'SBT-MET-A000728-20180326.nii.gz',
  'SBT-MET-A000729-20181109.nii.gz',
  'SBT-MET-B000731-20180316.nii.gz',
  'SBT-MET-B000732-20161017.nii.gz',
  'SBT-MET-B000733-20181114.nii.gz',
  'SBT-MET-B000734-20180423.nii.gz',
  'SBT-MET-D000739-20180319.nii.gz'])

## Fine-tuned vs scratch-trained models

#### Check normality of Dice

In [5]:
dataframes = {
    'default_finetune'   : defa_ft,
    'default_scratch'    : defa_st,
    'segresnet_finetune' : segr_ft,
    'segresnet_scratch'  : segr_st,
    'tverskybce_finetune': tver_ft,
    'tverskybce_scratch' : tver_st
}

def check_normality(dataframes, metric_columns):
    normality_results = {}
    for model, df in dataframes.items():
        normality_results[model] = {}
        for metric_column in metric_columns:
            stat, p_value = shapiro(df[metric_column])
            is_normal = "Normal" if p_value > 0.05 else "Not Normal"
            normality_results[model][metric_column] = {'W-statistic': stat, 'p-value': p_value, 'Normality': is_normal}
    
    return normality_results

In [6]:
for metric in ['DICE', 'Hausdorff', 'Sensitivity', 'Specificity']:
    normality_results_all = check_normality(
        dataframes,
        metric_columns=[f'{metric}_et', f'{metric}_tc', f'{metric}_wt']
    )
    normality_df = pd.DataFrame.from_dict({(i,j): normality_results_all[i][j] 
                                        for i in normality_results_all.keys() 
                                        for j in normality_results_all[i].keys()},
                                        orient='index')

    print(normality_df)
    print()

                             W-statistic   p-value   Normality
default_finetune    DICE_et     0.912360  0.297596      Normal
                    DICE_tc     0.596355  0.000050  Not Normal
                    DICE_wt     0.885041  0.149012      Normal
default_scratch     DICE_et     0.937942  0.530351      Normal
                    DICE_tc     0.731380  0.002118  Not Normal
                    DICE_wt     0.960990  0.797102      Normal
segresnet_finetune  DICE_et     0.897431  0.205272      Normal
                    DICE_tc     0.522722  0.000007  Not Normal
                    DICE_wt     0.901460  0.227315      Normal
segresnet_scratch   DICE_et     0.890584  0.172164      Normal
                    DICE_tc     0.627135  0.000117  Not Normal
                    DICE_wt     0.926187  0.411448      Normal
tverskybce_finetune DICE_et     0.866469  0.090908      Normal
                    DICE_tc     0.884365  0.146394      Normal
                    DICE_wt     0.929221  0.440274     

#### Non-parametric test

In [7]:
def compare_finetuned_vs_scratch(dataframes, fine_tuned_models, scratch_models, metric_columns):
    comparison_results = {}
    for metric_column in metric_columns:
        fine_tuned_scores = pd.concat([dataframes[model][metric_column] for model in fine_tuned_models])
        scratch_scores = pd.concat([dataframes[model][metric_column] for model in scratch_models])
        u_stat, p_value = mannwhitneyu(fine_tuned_scores, scratch_scores, alternative='greater')
        
        is_significant = "Significant" if p_value < 0.05 else "Not Significant"
        comparison_results[metric_column] = {
            'Mann-Whitney U-statistic': u_stat,
            'p-value': p_value,
            'Significant Difference': is_significant
        }
    return comparison_results

In [8]:
fine_tuned_models = ['default_finetune', 'segresnet_finetune', 'tverskybce_finetune']
scratch_models    = ['default_scratch' , 'segresnet_scratch' , 'tverskybce_scratch' ]

for metric in ['DICE', 'Hausdorff', 'Sensitivity', 'Specificity']:
    comparison_results = compare_finetuned_vs_scratch(
    dataframes,
    fine_tuned_models,
    scratch_models,
    metric_columns=[f'{metric}_et', f'{metric}_tc', f'{metric}_wt']
)
    comparison_df = pd.DataFrame(comparison_results).T
    print(comparison_df)
    print('')

        Mann-Whitney U-statistic   p-value Significant Difference
DICE_et                    481.0  0.326022        Not Significant
DICE_tc                    551.0  0.068661        Not Significant
DICE_wt                    465.0  0.415128        Not Significant

             Mann-Whitney U-statistic   p-value Significant Difference
Hausdorff_et                    364.0  0.899701        Not Significant
Hausdorff_tc                    233.5  0.999393        Not Significant
Hausdorff_wt                    451.0  0.497045        Not Significant

               Mann-Whitney U-statistic   p-value Significant Difference
Sensitivity_et                    491.0  0.274663        Not Significant
Sensitivity_tc                    517.0  0.162763        Not Significant
Sensitivity_wt                    475.0  0.358594        Not Significant

               Mann-Whitney U-statistic   p-value Significant Difference
Specificity_et                    434.0  0.596364        Not Significant
Specificity

## Architectures comparision

In [9]:
def check_normality(dataframes, metric_columns):
    normality_results = {}
    for model, df in dataframes.items():
        normality_results[model] = {}
        for metric_column in metric_columns:
            stat, p_value = shapiro(df[metric_column])
            is_normal = "Normal" if p_value > 0.05 else "Not Normal"
            normality_results[model][metric_column] = {'W-statistic': stat, 'p-value': p_value, 'Normality': is_normal}
    
    return normality_results

def compare_architectures(dataframes, architecture_groups, metric_columns):
    comparison_results = {}
    for metric_column in metric_columns:

        # Gather the scores for each architecture
        default_scores = pd.concat([dataframes[model][metric_column] for model in architecture_groups['default']])
        segresnet_scores = pd.concat([dataframes[model][metric_column] for model in architecture_groups['segresnet']])
        tverskybce_scores = pd.concat([dataframes[model][metric_column] for model in architecture_groups['tverskybce']])
        
        # Perform the Kruskal-Wallis test for significant differences
        stat, p_value = kruskal(default_scores, segresnet_scores, tverskybce_scores)
        is_significant = "Significant" if p_value < 0.05 else "Not Significant"
        
        # Store the results
        comparison_results[metric_column] = {
            'Kruskal-Wallis H-statistic': stat,
            'p-value': p_value,
            'Significant Difference': is_significant
        }
    
    return comparison_results

def pairwise_compare_architectures(dataframes, architecture_groups, metric_columns):
    pairwise_comparisons = {}
    architecture_pairs = [
        ('tverskybce', 'segresnet'),
        ('tverskybce', 'default'),
        ('segresnet', 'default'),
    ]
    
    for metric_column in metric_columns:
        pairwise_comparisons[metric_column] = {}
        
        for arch1, arch2 in architecture_pairs:
            # Gather the scores for the two architectures
            arch1_scores = pd.concat([dataframes[model][metric_column] for model in architecture_groups[arch1]])
            arch2_scores = pd.concat([dataframes[model][metric_column] for model in architecture_groups[arch2]])
            
            # Perform the Mann-Whitney U test
            stat, p_value = mannwhitneyu(arch1_scores, arch2_scores, alternative='greater')
            is_significant = "Significantly greater" if p_value < 0.05 else ""
            
            # Store the results
            pairwise_comparisons[metric_column][f'{arch1} vs {arch2}'] = f'p_value={p_value:.2f} => {is_significant}'
            # pairwise_comparisons[metric_column][f'{arch1} vs {arch2}'] = {
            #     'Mann-Whitney U-statistic': stat,
            #     'p-value': p_value,
            #     'Significant Difference': is_significant
            # }
    
    return pairwise_comparisons

In [10]:
architecture_groups = {
    'default'   : ['default_finetune', 'default_scratch'],
    'segresnet' : ['segresnet_finetune', 'segresnet_scratch'],
    'tverskybce': ['tverskybce_finetune', 'tverskybce_scratch']
}
architecture_comparison_dfs = []

for metric in ['DICE', 'Hausdorff', 'Sensitivity', 'Specificity']:
    architecture_comparison_results = pairwise_compare_architectures(
        dataframes,
        architecture_groups,
        metric_columns=[f'{metric}_et', f'{metric}_tc', f'{metric}_wt']
    )

    architecture_comparison_df = pd.DataFrame(architecture_comparison_results).T
    architecture_comparison_dfs.append(architecture_comparison_df)
    print(architecture_comparison_df)

                       tverskybce vs segresnet  \
DICE_et                       p_value=0.48 =>    
DICE_tc                       p_value=0.59 =>    
DICE_wt  p_value=0.04 => Significantly greater   

                         tverskybce vs default segresnet vs default  
DICE_et                       p_value=0.55 =>      p_value=0.62 =>   
DICE_tc                       p_value=0.69 =>      p_value=0.63 =>   
DICE_wt  p_value=0.04 => Significantly greater     p_value=0.49 =>   
             tverskybce vs segresnet tverskybce vs default  \
Hausdorff_et        p_value=0.37 =>       p_value=0.42 =>    
Hausdorff_tc        p_value=0.72 =>       p_value=0.82 =>    
Hausdorff_wt        p_value=0.75 =>       p_value=0.88 =>    

             segresnet vs default  
Hausdorff_et     p_value=0.53 =>   
Hausdorff_tc     p_value=0.54 =>   
Hausdorff_wt     p_value=0.72 =>   
                              tverskybce vs segresnet  \
Sensitivity_et  p_value=0.01 => Significantly greater   
Sensitivity_

In [11]:
ac_df = pd.concat(architecture_comparison_dfs)
ac_df

Unnamed: 0,tverskybce vs segresnet,tverskybce vs default,segresnet vs default
DICE_et,p_value=0.48 =>,p_value=0.55 =>,p_value=0.62 =>
DICE_tc,p_value=0.59 =>,p_value=0.69 =>,p_value=0.63 =>
DICE_wt,p_value=0.04 => Significantly greater,p_value=0.04 => Significantly greater,p_value=0.49 =>
Hausdorff_et,p_value=0.37 =>,p_value=0.42 =>,p_value=0.53 =>
Hausdorff_tc,p_value=0.72 =>,p_value=0.82 =>,p_value=0.54 =>
Hausdorff_wt,p_value=0.75 =>,p_value=0.88 =>,p_value=0.72 =>
Sensitivity_et,p_value=0.01 => Significantly greater,p_value=0.01 => Significantly greater,p_value=0.57 =>
Sensitivity_tc,p_value=0.01 => Significantly greater,p_value=0.01 => Significantly greater,p_value=0.51 =>
Sensitivity_wt,p_value=0.00 => Significantly greater,p_value=0.00 => Significantly greater,p_value=0.59 =>
Specificity_et,p_value=0.98 =>,p_value=0.98 =>,p_value=0.53 =>
