In [13]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Function to read and process the evaluation results from a CSV file
def process_results(csv_file, model_name):
    """Load one evaluation CSV and tag every row with its model name.

    Args:
        csv_file: Path or file-like object passed straight to ``pd.read_csv``.
        model_name: Value written into the ``Model`` column of every row.

    Returns:
        A DataFrame holding only the columns ``Model``, ``Subpart``, ``MSE``,
        ``RMSE`` and ``MAE``, in that order.
    """
    kept_columns = ['Model', 'Subpart', 'MSE', 'RMSE', 'MAE']
    tagged = pd.read_csv(csv_file).assign(Model=model_name)
    return tagged[kept_columns]

# Directory containing the evaluation results
results_dir = "../Predictions/Volumetric/"

# Per-model frames are collected in their own list so that `all_results`
# only ever refers to the final concatenated DataFrame.
per_model_results = []

# Iterate over the models in the directory
for model_name in os.listdir(results_dir):
    model_path = os.path.join(results_dir, model_name)
    eval_file = os.path.join(model_path, "eval.csv")
    # Plain existence check instead of `assert`: assertions are removed when
    # Python runs with -O, which would let a missing file reach read_csv.
    if not os.path.exists(eval_file):
        print(f"Missing evaluation results for {model_name}")
        continue
    # Process the evaluation results for the current model
    per_model_results.append(process_results(eval_file, model_name))

# Fail with a message that names the directory instead of pandas' generic
# "No objects to concatenate" when nothing was found.
if not per_model_results:
    raise ValueError(f"No eval.csv files found under {results_dir!r}")

all_results = pd.concat(per_model_results, ignore_index=True)

def get_nice_dataframe_sub(df, metrics, subpart = "Full", vertical = False):
    """Build a table of "mean (std)" strings per model for one subpart.

    Only the rows whose ``Subpart`` equals ``subpart`` are used. For each
    model and metric, the mean and standard deviation are rounded to two
    decimals and rendered as ``"<mean> (<std>)"`` with exactly two decimal
    places each. A standard deviation that is undefined (e.g. a single row
    for the model) is rendered as ``nan``.

    Args:
        df: DataFrame with a ``Model`` column, a ``Subpart`` column and one
            numeric column per entry of ``metrics``.
        metrics: Names of the metric columns to summarise.
        subpart: Value of the ``Subpart`` column to keep.
        vertical: If truthy, return the transposed table (metrics as rows,
            models as columns).

    Returns:
        A DataFrame of strings indexed by model (in order of first
        appearance among the kept rows) with one column per metric, or its
        transpose when ``vertical`` is truthy. Nothing is written to disk.
    """
    metrics = list(metrics)
    subset = df[df["Subpart"] == subpart]

    # Take the models from the filtered rows so that a model without any row
    # for this subpart is left out instead of triggering a KeyError below.
    models = subset["Model"].unique()

    grouped = subset.groupby("Model")[metrics]
    df_mean = grouped.mean().round(2)
    df_std = grouped.std().round(2)

    # Fixed two-decimal formatting handles every magnitude (and NaN)
    # uniformly, unlike padding based on the length of str(value).
    table = pd.DataFrame(
        {
            metric: [
                f"{df_mean.loc[model, metric]:.2f} ({df_std.loc[model, metric]:.2f})"
                for model in models
            ]
            for metric in metrics
        },
        index=list(models),
        columns=metrics,
    )

    # transposes the dataframe
    if vertical:
        table = table.T

    return table

# Metric columns to summarise and the Subpart values to report on.
metrics = ['MAE', 'MSE', 'RMSE']
subparts = ['Full', 'LV', 'RV', 'LA', 'RA', 'aorta']

# Show one transposed "mean (std)" table per subpart, preceded by its name.
for subpart in subparts:
    print(subpart)
    nice = get_nice_dataframe_sub(
        all_results,
        metrics,
        subpart=subpart,
        vertical=True,
    )
    display(nice)


Full


Unnamed: 0,VOL_ROI_DS_1_REG_0.01,VOL_ROI_DS_1_REG_0,VOL_ROI_DS_1_REG_0.0001,VOL_ROI_DS_1_REG_0.001
MAE,2.11 (0.61),2.08 (0.63),2.07 (0.64),2.04 (0.61)
MSE,8.39 (6.00),8.25 (6.14),8.22 (6.12),7.93 (5.63)
RMSE,2.77 (0.84),2.74 (0.88),2.73 (0.88),2.69 (0.84)


LV


Unnamed: 0,VOL_ROI_DS_1_REG_0.01,VOL_ROI_DS_1_REG_0,VOL_ROI_DS_1_REG_0.0001,VOL_ROI_DS_1_REG_0.001
MAE,1.66 (0.54),1.57 (0.56),1.57 (0.57),1.56 (0.57)
MSE,4.85 (3.67),4.41 (3.72),4.37 (3.67),4.37 (3.74)
RMSE,2.09 (0.70),1.97 (0.71),1.96 (0.73),1.96 (0.73)


RV


Unnamed: 0,VOL_ROI_DS_1_REG_0.01,VOL_ROI_DS_1_REG_0,VOL_ROI_DS_1_REG_0.0001,VOL_ROI_DS_1_REG_0.001
MAE,1.94 (0.71),1.91 (0.74),1.90 (0.75),1.86 (0.70)
MSE,7.09 (6.72),7.03 (7.02),7.00 (7.07),6.56 (6.08)
RMSE,2.48 (0.96),2.45 (1.01),2.45 (1.01),2.38 (0.93)


LA


Unnamed: 0,VOL_ROI_DS_1_REG_0.01,VOL_ROI_DS_1_REG_0,VOL_ROI_DS_1_REG_0.0001,VOL_ROI_DS_1_REG_0.001
MAE,2.16 (0.69),2.13 (0.70),2.14 (0.71),2.11 (0.70)
MSE,8.39 (7.31),8.28 (7.16),8.33 (7.60),8.10 (7.30)
RMSE,2.75 (0.92),2.73 (0.93),2.73 (0.95),2.69 (0.93)


RA


Unnamed: 0,VOL_ROI_DS_1_REG_0.01,VOL_ROI_DS_1_REG_0,VOL_ROI_DS_1_REG_0.0001,VOL_ROI_DS_1_REG_0.001
MAE,2.40 (0.76),2.35 (0.78),2.35 (0.77),2.33 (0.77)
MSE,10.60 (9.18),10.28 (9.03),10.27 (8.95),10.08 (8.81)
RMSE,3.08 (1.06),3.02 (1.07),3.03 (1.05),3.00 (1.05)


aorta


Unnamed: 0,VOL_ROI_DS_1_REG_0.01,VOL_ROI_DS_1_REG_0,VOL_ROI_DS_1_REG_0.0001,VOL_ROI_DS_1_REG_0.001
MAE,2.17 (0.80),2.16 (0.84),2.12 (0.82),2.13 (0.82)
MSE,8.71 (7.85),8.68 (8.12),8.40 (7.72),8.44 (7.67)
RMSE,2.76 (1.05),2.74 (1.09),2.69 (1.07),2.71 (1.05)


In [14]:
import pandas as pd
from scipy.stats import ttest_ind


# Only the rows of the 'Full' subpart take part in this comparison.
df = all_results[all_results['Subpart'] == 'Full']

metrics = ['MAE', 'MSE']

models = df['Model'].unique()

# Significance level that decides which message is printed.
alpha = 0.05

# Compare all pairs of models
for i in range(len(models)):
    for j in range(i+1, len(models)):
        model1 = models[i]
        model2 = models[j]

        print(f"Comparing {model1} and {model2}:")

        for metric in metrics:
            # Select the values for each model
            values_model1 = df.loc[df['Model'] == model1, metric]
            values_model2 = df.loc[df['Model'] == model2, metric]

            # Independent two-sample t-test (scipy.stats.ttest_ind): the two
            # groups are treated as unrelated samples.
            # NOTE(review): if every model was scored on the same cases, a
            # paired test (ttest_rel / wilcoxon) may be more appropriate - confirm.
            stat, p = ttest_ind(values_model1, values_model2)

            # `p <= alpha` is False for NaN, so an undefined p-value is
            # printed as-is instead of being reported as significant.
            if p <= alpha:
                print(f"Reject the null hypothesis, suggesting a significant difference for {metric}")
            else:
                print(f"Metric {metric}: p-value={p}")
        print("\n")

Comparing VOL_ROI_DS_1_REG_0.01 and VOL_ROI_DS_1_REG_0:
Metric MAE: p-value=0.1565173366054036
Metric MSE: p-value=0.573063875701248


Comparing VOL_ROI_DS_1_REG_0.01 and VOL_ROI_DS_1_REG_0.0001:
Metric MAE: p-value=0.0950431647626491
Metric MSE: p-value=0.4875303514864634


Comparing VOL_ROI_DS_1_REG_0.01 and VOL_ROI_DS_1_REG_0.001:
Reject the null hypothesis, suggesting a significant difference for MAE
Metric MSE: p-value=0.05127937242289168


Comparing VOL_ROI_DS_1_REG_0 and VOL_ROI_DS_1_REG_0.0001:
Metric MAE: p-value=0.8001946957682271
Metric MSE: p-value=0.8979903093443942


Comparing VOL_ROI_DS_1_REG_0 and VOL_ROI_DS_1_REG_0.001:
Metric MAE: p-value=0.18896094803042218
Metric MSE: p-value=0.17883546604740855


Comparing VOL_ROI_DS_1_REG_0.0001 and VOL_ROI_DS_1_REG_0.001:
Metric MAE: p-value=0.2931472694238161
Metric MSE: p-value=0.22502174462885152




In [15]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Function to read and process the evaluation results from a CSV file
def process_results(csv_file, model_name):
    """Load one CSV and tag every row with its model name.

    Note: this definition replaces the earlier ``process_results`` in this
    notebook; unlike that one, it keeps every column of the CSV.

    Args:
        csv_file: Path or file-like object passed straight to ``pd.read_csv``.
        model_name: Value written into the ``Model`` column of every row.

    Returns:
        The full DataFrame read from ``csv_file`` with the ``Model`` column set.
    """
    return pd.read_csv(csv_file).assign(Model=model_name)

# Root folder with one sub-directory per model
results_dir = "../Predictions/Volumetric/"

# Frames read so far, one per model that provides a metrics.csv
per_model_frames = []

# Walk the model folders; entries without a metrics.csv are skipped silently
for model_name in os.listdir(results_dir):
    model_path = os.path.join(results_dir, model_name)
    eval_file = os.path.join(model_path, "metrics.csv")

    if os.path.exists(eval_file):
        # Read this model's metrics and tag the rows with the model name
        per_model_frames.append(process_results(eval_file, model_name))

# Stack all per-model frames into a single table with a fresh index
all_results = pd.concat(per_model_frames, ignore_index=True)

def get_nice_dataframe(df, metrics, vertical = False):
    """Build a table of "mean (std)" strings per model.

    For each model and metric, the mean and standard deviation over that
    model's rows are rounded to two decimals and rendered as
    ``"<mean> (<std>)"`` with exactly two decimal places each. A standard
    deviation that is undefined (e.g. a single row for the model) is
    rendered as ``nan``.

    Args:
        df: DataFrame with a ``Model`` column and one numeric column per
            entry of ``metrics``.
        metrics: Names of the metric columns to summarise.
        vertical: If truthy, return the transposed table (metrics as rows,
            models as columns).

    Returns:
        A DataFrame of strings indexed by model (in order of first
        appearance) with one column per metric, or its transpose when
        ``vertical`` is truthy. Nothing is written to disk.
    """
    metrics = list(metrics)
    models = df["Model"].unique()

    grouped = df.groupby("Model")[metrics]
    df_mean = grouped.mean().round(2)
    df_std = grouped.std().round(2)

    # Fixed two-decimal formatting handles every magnitude (and NaN)
    # uniformly, unlike padding based on the length of str(value).
    table = pd.DataFrame(
        {
            metric: [
                f"{df_mean.loc[model, metric]:.2f} ({df_std.loc[model, metric]:.2f})"
                for model in models
            ]
            for metric in metrics
        },
        index=list(models),
        columns=metrics,
    )

    # transposes the dataframe
    if vertical:
        table = table.T

    return table


# Column names follow the pattern "<structure> - <measure>"; build them in
# structure-major order so the table rows keep the same order as before.
metrics = [
    f"{structure} - {measure}"
    for structure in ('LV Endo', 'LV Myo', 'RV Endo')
    for measure in ('DC', 'HD', 'MCD')
]

# Transposed layout: one row per metric, one column per model.
nice = get_nice_dataframe(all_results, metrics, vertical=True)

display(nice)

Unnamed: 0,VOL_ROI_DS_1_REG_0.01,VOL_ROI_DS_1_REG_0,VOL_ROI_DS_1_REG_0.0001,VOL_ROI_DS_1_REG_0.001
LV Endo - DC,0.88 (0.05),0.90 (0.04),0.90 (0.04),0.90 (0.05)
LV Endo - HD,5.21 (1.42),4.36 (1.22),4.32 (1.24),4.41 (1.35)
LV Endo - MCD,1.89 (0.62),1.52 (0.46),1.51 (0.49),1.58 (0.54)
LV Myo - DC,0.74 (0.06),0.78 (0.04),0.78 (0.04),0.76 (0.05)
LV Myo - HD,5.30 (1.57),5.27 (1.47),4.98 (1.40),5.17 (1.50)
LV Myo - MCD,1.96 (0.77),1.86 (0.61),1.81 (0.64),1.95 (0.72)
RV Endo - DC,0.85 (0.06),0.85 (0.06),0.86 (0.05),0.85 (0.05)
RV Endo - HD,7.55 (2.80),7.22 (2.76),6.97 (2.54),7.38 (2.67)
RV Endo - MCD,2.13 (0.69),2.05 (0.64),2.02 (0.63),2.09 (0.64)


In [16]:
import pandas as pd
# `ttest_ind` is what the loop below calls; import it here so this cell does
# not depend on an earlier cell having brought it into scope.
# NOTE(review): `wilcoxon` is imported but not used - confirm which test is intended.
from scipy.stats import ttest_ind, wilcoxon


df = all_results


metrics = ['LV Endo - DC', 'LV Endo - HD', 'LV Endo - MCD', 
           'LV Myo - DC', 'LV Myo - HD', 'LV Myo - MCD', 
           'RV Endo - DC', 'RV Endo - HD', 'RV Endo - MCD']

models = df['Model'].unique()

# Significance level that decides which message is printed.
alpha = 0.05

# Compare all pairs of models
for i in range(len(models)):
    for j in range(i+1, len(models)):
        model1 = models[i]
        model2 = models[j]

        print(f"Comparing {model1} and {model2}:")

        for metric in metrics:
            # Select the values for each model
            values_model1 = df.loc[df['Model'] == model1, metric]
            values_model2 = df.loc[df['Model'] == model2, metric]

            # Independent two-sample t-test (scipy.stats.ttest_ind): the two
            # groups are treated as unrelated samples.
            # NOTE(review): if every model was scored on the same cases, a
            # paired test (ttest_rel / wilcoxon) may be more appropriate - confirm.
            stat, p = ttest_ind(values_model1, values_model2)

            # `p <= alpha` is False for NaN, so an undefined p-value is
            # printed as-is instead of being reported as significant.
            if p <= alpha:
                print(f"Reject the null hypothesis, suggesting a significant difference for {metric}")
            else:
                print(f"Metric {metric}: p-value={p}")
        print("\n")

Comparing VOL_ROI_DS_1_REG_0.01 and VOL_ROI_DS_1_REG_0:
Reject the null hypothesis, suggesting a significant difference for LV Endo - DC
Reject the null hypothesis, suggesting a significant difference for LV Endo - HD
Reject the null hypothesis, suggesting a significant difference for LV Endo - MCD
Reject the null hypothesis, suggesting a significant difference for LV Myo - DC
Metric LV Myo - HD: p-value=0.621523895489626
Reject the null hypothesis, suggesting a significant difference for LV Myo - MCD
Reject the null hypothesis, suggesting a significant difference for RV Endo - DC
Reject the null hypothesis, suggesting a significant difference for RV Endo - HD
Reject the null hypothesis, suggesting a significant difference for RV Endo - MCD


Comparing VOL_ROI_DS_1_REG_0.01 and VOL_ROI_DS_1_REG_0.0001:
Reject the null hypothesis, suggesting a significant difference for LV Endo - DC
Reject the null hypothesis, suggesting a significant difference for LV Endo - HD
Reject the null hypothes