In [1]:
import sys, os, glob
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from PIL import Image
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from DGXutils import GetFileNames, GetLowestGPU

sys.path.append('../')

from utils.Metrics import iou

In [2]:
# Directories holding each model's test predictions, plus the directory of
# ground-truth masks they are compared against.
base_path = '../data/test/{}_test_predictions_by_pod/'
vanilla_path, up_path, down_path = (
    base_path.format(variant) for variant in ('vanilla', 'up', 'down')
)

gt_path = '../data/test/test_masks_by_pod/'

# A single listing of the mask directory is enough: the file names are the
# same for every model.
gt_names = GetFileNames(gt_path)

In [19]:
# load the data
def _load_scaled(path):
    """Read the image at `path` and return its pixels divided by 255.

    The file is opened in a `with` block so the handle is released as soon as
    the pixels have been copied into the array, rather than staying open until
    the image object happens to be garbage collected.
    """
    with Image.open(path) as img:
        return np.array(img) / 255


def _load_prediction(directory, name):
    """Load the prediction `directory + 'pred_' + name` with its background blacked out.

    A pixel is treated as background when its values along axis 2 sum to
    exactly 3 after scaling; such pixels are set to 0 in every channel.
    NOTE(review): this presumably targets pure-white pixels of 8-bit,
    3-channel images -- confirm the prediction files have no alpha channel.
    """
    pred = _load_scaled(directory + 'pred_' + name)
    pred[pred.sum(axis=2) == 3] = 0
    return pred


vanilla_preds, up_preds, down_preds, gt_masks = [], [], [], []

print('Loading data...')
for name in tqdm(gt_names):
    # predictions of the three models for this image, background turned black
    vanilla_pred = _load_prediction(vanilla_path, name)
    up_pred = _load_prediction(up_path, name)
    down_pred = _load_prediction(down_path, name)

    vanilla_preds.append(vanilla_pred)
    up_preds.append(up_pred)
    down_preds.append(down_pred)

    # ground-truth mask is only rescaled; its background is left untouched
    gt_masks.append(_load_scaled(gt_path + name))
print("Done.")

Loading data...


  0%|          | 0/64 [00:00<?, ?it/s]

Done.


In [20]:
# Score every model against the ground truth, one image at a time.
# Only channel index 2 of each image is compared; the score is whatever
# `iou` returns for (prediction channel, mask channel).
vanilla_seed_acc, up_seed_acc, down_seed_acc = [], [], []

print("Calculating accuracies...")
for i, mask in enumerate(tqdm(gt_masks)):
    mask_seed = mask[:, :, 2]

    # same order as the result lists: vanilla, up, down
    for preds, scores in (
        (vanilla_preds, vanilla_seed_acc),
        (up_preds, up_seed_acc),
        (down_preds, down_seed_acc),
    ):
        scores.append(iou(preds[i][:, :, 2], mask_seed))
print("Done.")
    

Calculating accuracies...


  0%|          | 0/64 [00:00<?, ?it/s]

Done.


In [22]:
# mean per-image score for (vanilla, up, down)
tuple(np.mean(scores) for scores in (vanilla_seed_acc, up_seed_acc, down_seed_acc))

(0.7692997804533273, 0.7666305824958907, 0.7718830387601895)

# Seed Model Significance Test

In [24]:
# Wide table: one row per image, one column per model.
seed_df = pd.DataFrame({
    'vanilla': vanilla_seed_acc,
    'upweighted': up_seed_acc,
    'downweighted': down_seed_acc,
})

# Long table for the statistical tests: columns are 'index' (row number of the
# image in seed_df), 'treatments' (model name) and 'value' (that model's score).
seed_df_melt = seed_df.reset_index().melt(
    id_vars=['index'],
    value_vars=['vanilla', 'upweighted', 'downweighted'],
    var_name='treatments',
    value_name='value',
)

In [25]:
# One-way ANOVA (type 2 table) of the score on the model factor.
# NOTE(review): every model is scored on the same images (seed_df_melt keeps
# the image row number in 'index'), so the three groups are paired, while this
# model treats them as independent samples -- consider whether a
# repeated-measures analysis is more appropriate.
model = ols(formula='value ~ C(treatments)', data=seed_df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table, end='\n\n')

                 sum_sq     df         F    PR(>F)
C(treatments)  0.000883    2.0  0.035066  0.965548
Residual       2.379375  189.0       NaN       NaN



In [26]:
# Tukey HSD pairwise comparison of the model means at a 0.05 family-wise error rate.
m_comp = pairwise_tukeyhsd(
    seed_df_melt['value'],
    seed_df_melt['treatments'],
    alpha=0.05,
)
print(m_comp)

     Multiple Comparison of Means - Tukey HSD, FWER=0.05     
   group1      group2   meandiff p-adj   lower  upper  reject
-------------------------------------------------------------
downweighted upweighted  -0.0053 0.9621 -0.0521 0.0416  False
downweighted    vanilla  -0.0026 0.9907 -0.0494 0.0443  False
  upweighted    vanilla   0.0027 0.9901 -0.0442 0.0495  False
-------------------------------------------------------------
