In [1]:
import argparse
import math
import os
from heapq import heappush, heappop, heappushpop

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import rankdata, norm
from sklearn.metrics import roc_auc_score, roc_curve
from IPython.display import Image 
from IPython.display import Markdown as md
from tqdm import tqdm
import re
from collections import defaultdict

from vilio.utils.ens import *

In [2]:
pd.set_option("display.width", 180)
pd.set_option("display.max_rows", None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

## Utils

In [3]:
def average(data, weights=None):
    N = data.shape[1]
    if weights is None:
        weights = [1/N] * N
    elif np.sum(weights) != 1.:
        weights = weights / np.sum(weights)
    
    # Compute weighted avg
    return data.apply(lambda row: row.multiply(weights).sum(), axis=1)

In [4]:
def acc_from_roc(labels, probas, splits=None):
    '''Determines the greatest achievable accuracy from the ROC curve.'''
    if splits is None:
        splits = (250, 250)

    fpr, tpr, thresholds = roc_curve(labels, probas)
    tp = tpr * splits[0]
    tn = (1 - fpr) * splits[1]
    acc = (tp + tn) / np.sum(splits)
    best_threshold = thresholds[np.argmax(acc)]

    return np.amax(acc), best_threshold

## Load Data

In [5]:
data_dir = 'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data'
feature_dir = os.path.join(data_dir, 'features')
anno_dir = os.path.join(feature_dir, 'annotations')
gt_dir = os.path.join(anno_dir, 'gt')
img_dir = os.path.join(feature_dir, 'img')

In [6]:
paths = ['C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v3\\98',
        'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v3\\61',
        'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v3\\43']
gt_path = 'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\features\\annotations\\gt'

In [7]:
# Ground truth
gt_dev = pd.read_json(os.path.join(gt_path, 'dev_all.jsonl'), lines=True)
gt_ts = pd.read_json(os.path.join(gt_path, 'test_seen.jsonl'), lines=True)
gt_tu = pd.read_json(os.path.join(gt_path, 'test_unseen.jsonl'), lines=True)
gt_test = gt_ts.append(gt_tu)
dev_or = {'Correct': gt_dev}

# Load data
dev, ts, tu = {}, {}, {}
experiments = []
for path in tqdm(paths):
    seed = path.split('\\')[-1]
    for csv in sorted(os.listdir(path)):
        if ".csv" in csv and "test_unseen" in csv:
            exp = csv.split('_')[0]
            name = exp + seed
            experiments.append(name)            
            dev[name] = pd.read_csv(os.path.join(path, f'{exp}_dev_all.csv'))
            dev_idx = dev[name].id.values
            tu[name] = pd.read_csv(os.path.join(path, f'{exp}_test_unseen.csv'))
            tu_idx = tu[name].id.values
            ts[name] = pd.read_csv(os.path.join(path, f'{exp}_test_seen.csv'))
            ts_idx = ts[name].id.values

dev_or.update(dev)
dev_probas = pd.DataFrame({k: v.proba.values for k, v in dev.items()})
ts_probas = pd.DataFrame({k: v.proba.values for k, v in ts.items()})
tu_probas = pd.DataFrame({k: v.proba.values for k, v in tu.items()})

100%|██████████| 3/3 [00:07<00:00,  2.50s/it]


In [8]:
# Average over seeds
seeds = [43, 61, 98]
for model in ['U', 'O', 'D', 'X']:
    for num in [1, 5, 10, 15, 20, 36, 50, 72]:
        for flag in ['', 'a', 'c', 'ac']:
            try:
                subset = dev_probas[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                dev_probas[f'{model}{num}{flag}m'] = mean
                #dev_probas[f'{model}{num}{flag}sd'] = std_dev(subset, mean)
                subset = ts_probas[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                ts_probas[f'{model}{num}{flag}m'] = mean
                #ts_probas[f'{model}{num}{flag}sd'] = std_dev(subset, mean)
                subset = tu_probas[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                tu_probas[f'{model}{num}{flag}m'] = mean
                #tu_probas[f'{model}{num}{flag}sd'] = std_dev(subset, mean)
            except:
                print(f'Missing {[f"{model}{num}{flag}{seed}" for seed in [43, 61, 98]]}')
test_probas = ts_probas.append(tu_probas)
test_idx = np.concatenate((ts_idx, tu_idx))

Missing ['U72c43', 'U72c61', 'U72c98']
Missing ['U72ac43', 'U72ac61', 'U72ac98']
Missing ['O72c43', 'O72c61', 'O72c98']
Missing ['O72ac43', 'O72ac61', 'O72ac98']
Missing ['D72c43', 'D72c61', 'D72c98']
Missing ['D72ac43', 'D72ac61', 'D72ac98']
Missing ['X72c43', 'X72c61', 'X72c98']
Missing ['X72ac43', 'X72ac61', 'X72ac98']


In [9]:
experiments

['D1098',
 'D10a98',
 'D10ac98',
 'D10c98',
 'D1598',
 'D15a98',
 'D15ac98',
 'D15c98',
 'D198',
 'D1a98',
 'D1ac98',
 'D1c98',
 'D2098',
 'D20a98',
 'D20ac98',
 'D20c98',
 'D3698',
 'D36a98',
 'D36ac98',
 'D36c98',
 'D5098',
 'D50a98',
 'D50ac98',
 'D50c98',
 'D598',
 'D5a98',
 'D5ac98',
 'D5c98',
 'D7298',
 'D72a98',
 'O1098',
 'O10a98',
 'O10ac98',
 'O10c98',
 'O1598',
 'O15a98',
 'O15ac98',
 'O15c98',
 'O198',
 'O1a98',
 'O1ac98',
 'O1c98',
 'O2098',
 'O20a98',
 'O20ac98',
 'O20c98',
 'O3698',
 'O36a98',
 'O36ac98',
 'O36c98',
 'O5098',
 'O50a98',
 'O50ac98',
 'O50c98',
 'O598',
 'O5a98',
 'O5ac98',
 'O5c98',
 'O7298',
 'O72a98',
 'U1098',
 'U10a98',
 'U10ac98',
 'U10c98',
 'U1598',
 'U15a98',
 'U15ac98',
 'U15c98',
 'U198',
 'U1a98',
 'U1ac98',
 'U1c98',
 'U2098',
 'U20a98',
 'U20ac98',
 'U20c98',
 'U3698',
 'U36a98',
 'U36ac98',
 'U36c98',
 'U5098',
 'U50a98',
 'U50ac98',
 'U50c98',
 'U598',
 'U5a98',
 'U5ac98',
 'U5c98',
 'U7298',
 'U72a98',
 'X1098',
 'X10a98',
 'X10ac98',
 'X1

## Display Scores

In [10]:
seed = 'm'

### Dev

In [11]:
scores = {}
for exp in dev_probas.keys():
    scores[exp]=roc_auc_score(gt_dev.label, dev_probas[exp])
    
out = '| 1 | 5 | 10 | 15 | 20 | 36 | 50 | 72 |\n'
out += '| --- | --- | --- | --- | --- | --- | --- | --- |\n'
for model in ['U', 'O', 'D', 'X']:   
    for flag in ['', 'a', 'c', 'ac']:
        row = [f'{scores.get(f"{model}{num}{flag}{seed}")}' for num in [1, 5, 10, 15, 20, 36, 50, 72]]
        out += f'{" | ".join(row)} |\n'
md(out)

| 1 | 5 | 10 | 15 | 20 | 36 | 50 | 72 |
| --- | --- | --- | --- | --- | --- | --- | --- |
0.7331437813559148 | 0.7686126649565782 | 0.776205045791225 | 0.7775133665049293 | 0.7937901123919605 | 0.7816752686178158 | 0.7827672528355534 | 0.7738768530251053 |
0.7294248539728652 | 0.7624213204767643 | 0.7778945308073473 | 0.7873618279403736 | 0.7887937695089162 | 0.7869703619000524 | 0.7864037663153773 | 0.7869600601621494 |
0.7162798364084022 | 0.7724140062428532 | 0.7889482955774638 | 0.782293372892007 | 0.7823963902710387 | 0.7826745371944247 | 0.7801403096702415 | None |
0.7176705710253319 | 0.7650585653799796 | 0.7773794439121879 | 0.7828084597871661 | 0.7806966035170133 | 0.7908953240411656 | 0.782015225968621 | None |
0.6940692894891368 | 0.7603300676824181 | 0.771229306383987 | 0.7676443015936789 | 0.7710335733638265 | 0.770497882992861 | 0.7709202542468914 | 0.7640077881138548 |
0.6848698375415933 | 0.7587126948316181 | 0.7675206807388406 | 0.7716207724243082 | 0.7707863316541501 | 0.7734750852468811 | 0.7779357377589601 | 0.7729806018275284 |
0.6856424678843321 | 0.7614735605896715 | 0.7714868498315666 | 0.7703021499727003 | 0.7680254658960967 | 0.773804740859783 | 0.770394865613829 | None |
0.680161943319838 | 0.7530673424606731 | 0.7647907201944969 | 0.7685302510533527 | 0.7774000473879945 | 0.7776472890976708 | 0.7783787124887969 | None |
0.6756806873319529 | 0.728683128843836 | 0.747864964819565 | 0.7466596614848925 | 0.743672157492969 | 0.7452689268679626 | 0.7439915113679678 | 0.753170359839705 |
0.6747741343964726 | 0.7194012629930668 | 0.7434352175211958 | 0.7506361323155216 | 0.7435176314244213 | 0.7369348209042865 | 0.7492144924848823 | 0.7513160470171318 |
0.6700765419126206 | 0.7443108652429664 | 0.7529025146542222 | 0.7542520423195394 | 0.7548804483316335 | 0.760505197226772 | 0.7555294578195342 | None |
0.6649874833884476 | 0.7274366185575506 | 0.7521916947389025 | 0.7426213802268442 | 0.7459900485211856 | 0.7700446065251207 | 0.7593514025816155 | None |
0.6314965334651955 | 0.6831288438359551 | 0.7054321063963491 | 0.7086565503600457 | 0.704298915226999 | 0.7018058946544281 | 0.6997455470737913 | 0.7115307352350342 |
0.6295701084773002 | 0.6967992500334806 | 0.6977161047068641 | 0.7085329295052075 | 0.7196175994890338 | 0.7084711190777885 | 0.7103460353761679 | 0.723429242513212 |
0.6297246345458479 | 0.6875997980859372 | 0.6944401520536514 | 0.705421804658446 | 0.7040001648278064 | 0.7057308567955414 | 0.7018367998681377 | None |
0.629559806739397 | 0.6849007427553028 | 0.7069670653439235 | 0.7012496008076563 | 0.7065755993036026 | 0.7015277477310422 | 0.7124887968600302 | None |


### Test Seen

In [12]:
scores = {}
for exp in ts_probas.keys():
    scores[exp]=roc_auc_score(gt_ts.label, ts_probas[exp])
    
out = '| 1 | 5 | 10 | 15 | 20 | 36 | 50 | 72 |\n'
out += '| --- | --- | --- | --- | --- | --- | --- | --- |\n'
for model in ['U', 'O', 'D', 'X']:   
    for flag in ['', 'a', 'c', 'ac']:
        row = [f'{scores.get(f"{model}{num}{flag}{seed}")}' for num in [1, 5, 10, 15, 20, 36, 50, 72]]
        out += f'{" | ".join(row)} |\n'
md(out)

| 1 | 5 | 10 | 15 | 20 | 36 | 50 | 72 |
| --- | --- | --- | --- | --- | --- | --- | --- |
0.7214725890356142 | 0.7873389355742296 | 0.7948259303721489 | 0.7912685074029612 | 0.8045978391356543 | 0.8056862745098038 | 0.8071788715486194 | 0.7975590236094438 |
0.7304721888755501 | 0.7840136054421769 | 0.7967386954781913 | 0.7955702280912365 | 0.8000240096038416 | 0.8030572228891556 | 0.8014405762304923 | 0.7964465786314525 |
0.7356502601040416 | 0.7864105642256903 | 0.8037094837935175 | 0.8047579031612644 | 0.8095038015206082 | 0.8080232092837135 | 0.8106962785114046 | None |
0.7326610644257703 | 0.778155262104842 | 0.7984153661464586 | 0.7982392957182873 | 0.8042617046818727 | 0.8038875550220088 | 0.8084473789515805 | None |
0.7199559823929571 | 0.7803241296518608 | 0.7886994797919168 | 0.7877991196478592 | 0.7940016006402562 | 0.7948419367747097 | 0.7956222488995599 | 0.7976910764305722 |
0.7147338935574228 | 0.7733333333333333 | 0.791156462585034 | 0.7890716286514605 | 0.7930372148859544 | 0.8013525410164065 | 0.7924649859943977 | 0.7951260504201679 |
0.717374949979992 | 0.7869707883153261 | 0.7941296518607444 | 0.8004761904761903 | 0.7957743097238895 | 0.7946218487394958 | 0.7984833933573429 | None |
0.7176990796318528 | 0.7734973989595838 | 0.790656262505002 | 0.7975070028011204 | 0.7995438175270109 | 0.7970108043217285 | 0.796922769107643 | None |
0.708795518207283 | 0.7450740296118448 | 0.7572909163665467 | 0.7632252901160465 | 0.7694117647058824 | 0.7704321728691477 | 0.7709163665466187 | 0.776734693877551 |
0.7003841536614647 | 0.7303521408563426 | 0.749547819127651 | 0.7595078031212485 | 0.7564785914365747 | 0.7464025610244098 | 0.7564265706282514 | 0.7654901960784314 |
0.7111084433773509 | 0.7473909563825529 | 0.7634293717486995 | 0.762060824329732 | 0.7728171268507402 | 0.7764865946378551 | 0.7764945978391357 | None |
0.7111964785914366 | 0.7497318927571027 | 0.7653021208483393 | 0.7644817927170868 | 0.7690556222488996 | 0.7739255702280914 | 0.7818007202881152 | None |
0.6707523009203682 | 0.7139095638255303 | 0.7246498599439777 | 0.7161344537815126 | 0.7282392957182873 | 0.7238495398159264 | 0.7254421768707483 | 0.7272068827531013 |
0.6618007202881153 | 0.7096118447378952 | 0.7316526610644257 | 0.7270748299319728 | 0.7327931172468987 | 0.7288115246098439 | 0.7247779111644659 | 0.7301040416166467 |
0.6655022008803522 | 0.7216006402561026 | 0.7304121648659464 | 0.7246098439375751 | 0.7364985994397759 | 0.736546618647459 | 0.7392877150860344 | None |
0.6612484993997599 | 0.7294157663065226 | 0.7418367346938776 | 0.7331772709083634 | 0.7388995598239294 | 0.7391316526610644 | 0.7440896358543417 | None |


### Test Unseen

In [13]:
scores = {}
for exp in tu_probas.keys():
    scores[exp]=roc_auc_score(gt_tu.label, tu_probas[exp])
    
out = '| 1 | 5 | 10 | 15 | 20 | 36 | 50 | 72 |\n'
out += '| --- | --- | --- | --- | --- | --- | --- | --- |\n'
for model in ['U', 'O', 'D', 'X']:   
    for flag in ['', 'a', 'c', 'ac']:
        row = [f'{scores.get(f"{model}{num}{flag}{seed}")}' for num in [1, 5, 10, 15, 20, 36, 50, 72]]
        out += f'{" | ".join(row)} |\n'
md(out)

| 1 | 5 | 10 | 15 | 20 | 36 | 50 | 72 |
| --- | --- | --- | --- | --- | --- | --- | --- |
0.7282538666666667 | 0.7962613333333334 | 0.7975509333333333 | 0.7957653333333333 | 0.7958602666666668 | 0.7983946666666666 | 0.7971530666666666 | 0.7895146666666667 |
0.7145493333333331 | 0.7793365333333333 | 0.7897504 | 0.7937034666666667 | 0.794176 | 0.7944192000000001 | 0.7951552000000001 | 0.7923754666666667 |
0.7248618666666667 | 0.7957685333333333 | 0.7978901333333334 | 0.8003669333333334 | 0.8036021333333332 | 0.8069674666666667 | 0.7991711999999999 | None |
0.7169557333333334 | 0.7849621333333333 | 0.7969344 | 0.7951744 | 0.7994474666666666 | 0.7998773333333334 | 0.8021632 | None |
0.6907498666666667 | 0.7659285333333334 | 0.7735370666666667 | 0.7758805333333333 | 0.7784928 | 0.7783914666666665 | 0.7869802666666668 | 0.7851850666666667 |
0.6835381333333335 | 0.7524330666666665 | 0.7737845333333333 | 0.7770325333333332 | 0.7831989333333332 | 0.7906794666666668 | 0.7856736000000001 | 0.7934186666666667 |
0.6897866666666667 | 0.7635594666666665 | 0.7781951999999999 | 0.7883359999999999 | 0.7868298666666667 | 0.7870293333333334 | 0.7912181333333332 | None |
0.6820106666666665 | 0.7552192 | 0.7754133333333333 | 0.7845408 | 0.7894570666666667 | 0.7924650666666666 | 0.7936501333333333 | None |
0.6943786666666667 | 0.7455050666666666 | 0.7520650666666666 | 0.7650912 | 0.7652288 | 0.7726912 | 0.7675445333333334 | 0.7737824 |
0.6815882666666666 | 0.7299690666666667 | 0.7474581333333332 | 0.7545845333333332 | 0.7536330666666666 | 0.7541877333333333 | 0.7583157333333334 | 0.7635242666666667 |
0.6929333333333333 | 0.7468117333333334 | 0.7645909333333333 | 0.7654240000000001 | 0.7688458666666667 | 0.7743466666666667 | 0.7676490666666667 | None |
0.685808 | 0.7413653333333332 | 0.7573589333333334 | 0.7613429333333332 | 0.7596415999999999 | 0.7690591999999999 | 0.7739626666666668 | None |
0.6509066666666667 | 0.7084085333333334 | 0.721344 | 0.7197770666666666 | 0.7226112 | 0.7136197333333334 | 0.7223904 | 0.7252288 |
0.6448330666666666 | 0.7078485333333334 | 0.7273088 | 0.7276981333333332 | 0.7270058666666668 | 0.733536 | 0.7279765333333332 | 0.7310432 |
0.6444501333333333 | 0.703824 | 0.7156010666666668 | 0.7193749333333332 | 0.7205194666666667 | 0.7119711999999999 | 0.7216842666666667 | None |
0.6423349333333332 | 0.7119765333333333 | 0.7300789333333333 | 0.7304490666666668 | 0.7283050666666666 | 0.7333354666666666 | 0.7325535999999999 | None |


### Test

In [14]:
scores = {}
for exp in test_probas.keys():
    scores[exp]=roc_auc_score(gt_test.label, test_probas[exp])
    
out = '| 1 | 5 | 10 | 15 | 20 | 36 | 50 | 72 |\n'
out += '| --- | --- | --- | --- | --- | --- | --- | --- |\n'
for model in ['U', 'O', 'D', 'X']:   
    for flag in ['', 'a', 'c', 'ac']:
        row = [f'{scores.get(f"{model}{num}{flag}{seed}")}' for num in [1, 5, 10, 15, 20, 36, 50, 72]]
        out += f'{" | ".join(row)} |\n'
md(out)

| 1 | 5 | 10 | 15 | 20 | 36 | 50 | 72 |
| --- | --- | --- | --- | --- | --- | --- | --- |
0.7261235337243402 | 0.7929971590909091 | 0.7946856671554251 | 0.791702254398827 | 0.7960373900293255 | 0.7984934017595308 | 0.798163947947214 | 0.789562866568915 |
0.7200664406158358 | 0.7802167338709678 | 0.7904916605571848 | 0.7919515212609971 | 0.7923015945747801 | 0.7946531341642228 | 0.7937587060117302 | 0.7911780608504398 |
0.7295408724340177 | 0.7923680351906158 | 0.7998680351906159 | 0.8013407258064515 | 0.8047960960410557 | 0.8067082111436951 | 0.8023946114369501 | None |
0.7231020894428153 | 0.7821050219941349 | 0.7970798203812317 | 0.7950311583577713 | 0.7997736436950147 | 0.8005798662023461 | 0.803207478005865 | None |
0.7014910190615836 | 0.770893969941349 | 0.7780599340175953 | 0.7793580461876832 | 0.7829316348973607 | 0.7829197214076247 | 0.7886235337243402 | 0.787451887829912 |
0.6951539589442816 | 0.7598020527859237 | 0.7792948130498534 | 0.7799963343108505 | 0.7851310483870968 | 0.7916688049853372 | 0.7855150293255133 | 0.7913957111436949 |
0.7001273826979472 | 0.7722218658357771 | 0.7837353372434017 | 0.7922598973607038 | 0.7890510447214076 | 0.7892269978005865 | 0.7929893695014663 | None |
0.6953395344574781 | 0.7618227639296188 | 0.7804802052785924 | 0.7883536473607039 | 0.7924239369501467 | 0.793078262463343 | 0.7940918255131966 | None |
0.6985447214076246 | 0.7438228555718475 | 0.7510213526392963 | 0.760883431085044 | 0.7634943181818182 | 0.7681231671554252 | 0.7649381414956011 | 0.7713677602639297 |
0.6883715175953078 | 0.7290844941348973 | 0.7461565249266863 | 0.7528913123167156 | 0.7503871884164223 | 0.7465968658357771 | 0.7530681818181818 | 0.7596822305718475 |
0.6989598607038122 | 0.7473185483870968 | 0.7637820747800586 | 0.7643158907624634 | 0.769646719208211 | 0.7743241385630498 | 0.7703207478005865 | None |
0.6947305718475074 | 0.7444716825513196 | 0.7599372250733137 | 0.7622539406158357 | 0.7620555351906159 | 0.7704806634897361 | 0.7760167705278592 | None |
0.6579169721407624 | 0.7090904508797655 | 0.7212825329912023 | 0.7171792521994135 | 0.7230241935483871 | 0.7148879673753665 | 0.7214561950146627 | 0.724002474340176 |
0.6512839076246335 | 0.7084594941348974 | 0.7278761913489736 | 0.7255576429618769 | 0.7264523002199414 | 0.7292696114369501 | 0.7240313416422288 | 0.7284462060117302 |
0.6523932368035191 | 0.7102332294721407 | 0.7211326979472141 | 0.7219070747800587 | 0.7261872250733139 | 0.7212138013196481 | 0.7285346407624634 | None |
0.650032991202346 | 0.7183289039589442 | 0.7343177236070381 | 0.7315382148093841 | 0.7321792521994135 | 0.7358486070381232 | 0.7367379948680352 | None |


## Correlation

In [15]:
# Spearman Correlations:
print('\n' + '-' * 50)
print("Spearman Corrs:")
dev_corr = dev_probas.corr(method='spearman')
test_seen_corr = ts_probas.corr(method='spearman')
test_unseen_corr = tu_probas.corr(method='spearman')

print('\n', dev_corr)
print('\n', test_seen_corr)
print('\n', test_unseen_corr)
print('\n' + '-' * 50)


--------------------------------------------------
Spearman Corrs:

          D1098  D10a98  D10ac98  D10c98  D1598  D15a98  D15ac98  D15c98  D198  D1a98  ...  X36m  X36am  X36cm  X36acm  X50m  X50am  X50cm  X50acm  X72m  X72am
D1098     1.00    0.89     0.85    0.84   0.89    0.89     0.83    0.80  0.72   0.73  ...  0.67   0.65   0.64    0.62  0.67   0.66   0.65    0.63  0.67   0.65
D10a98    0.89    1.00     0.89    0.85   0.88    0.91     0.87    0.81  0.70   0.74  ...  0.66   0.64   0.64    0.63  0.66   0.65   0.65    0.63  0.66   0.65
D10ac98   0.85    0.89     1.00    0.89   0.87    0.86     0.90    0.86  0.69   0.71  ...  0.63   0.61   0.61    0.61  0.62   0.62   0.62    0.61  0.63   0.62
D10c98    0.84    0.85     0.89    1.00   0.87    0.83     0.85    0.87  0.68   0.69  ...  0.62   0.60   0.62    0.60  0.61   0.61   0.62    0.60  0.62   0.61
D1598     0.89    0.88     0.87    0.87   1.00    0.91     0.87    0.88  0.74   0.72  ...  0.68   0.65   0.64    0.63  0.67   0.67   0.

## Ensembling

In [16]:
mean_dev_probas = dev_probas[[k for k in dev_probas.keys() if '20m' in k or '20am' in k]].copy()
mean_test_probas = test_probas[[k for k in test_probas.keys() if '20m' in k or '20am' in k]].copy()

### Arithmetic Mean

In [17]:
dev_av = average(mean_dev_probas.apply(np.exp)).apply(np.log)
test_av = average(mean_test_probas.apply(np.exp)).apply(np.log)
print('Valid')
print(f"AUROC: {roc_auc_score(gt_dev.label, dev_av):.4f}")
print(f"Accuracy: {acc_from_roc(gt_dev.label, dev_av)[0]:.4f}")
print('Test')
print(f"AUROC: {roc_auc_score(gt_test.label, test_av):.4f}")
print(f"Accuracy: {acc_from_roc(gt_test.label, test_av)[0]:.4f}")

Valid
AUROC: 0.7832
Accuracy: 0.7197
Test
AUROC: 0.7908
Accuracy: 0.7266


### Geometric Mean

In [18]:
dev_av = average(mean_dev_probas)
test_av = average(mean_test_probas)
print('Valid')
print(f"AUROC: {roc_auc_score(gt_dev.label, dev_av):.4f}")
print(f"Accuracy: {acc_from_roc(gt_dev.label, dev_av)[0]:.4f}")
print('Test')
print(f"AUROC: {roc_auc_score(gt_test.label, test_av):.4f}")
print(f"Accuracy: {acc_from_roc(gt_test.label, test_av)[0]:.4f}")

Valid
AUROC: 0.7795
Accuracy: 0.7139
Test
AUROC: 0.7940
Accuracy: 0.7283


### Rank Average

In [19]:
dev_av = average(mean_dev_probas.apply(lambda col: rankdata(col) / len(col)))
test_av = average(mean_test_probas.apply(lambda col: rankdata(col) / len(col)))
print('Valid')
print(f"AUROC: {roc_auc_score(gt_dev.label, dev_av):.4f}")
print(f"Accuracy: {acc_from_roc(gt_dev.label, dev_av)[0]:.4f}")
print('Test')
print(f"AUROC: {roc_auc_score(gt_test.label, test_av):.4f}")
print(f"Accuracy: {acc_from_roc(gt_test.label, test_av)[0]:.4f}")

Valid
AUROC: 0.7851
Accuracy: 0.7178
Test
AUROC: 0.7977
Accuracy: 0.7291


### Simple 1

In [20]:
weights_dev = Simplex(mean_dev_probas, gt_dev.label)
print(list(zip(mean_dev_probas.keys(), weights_dev)))
dev_av = average(mean_dev_probas, weights=weights_dev)
test_av = average(mean_test_probas, weights=weights_dev)
print('Valid')
print(f"AUROC: {roc_auc_score(gt_dev.label, dev_av):.4f}")
print(f"Accuracy: {acc_from_roc(gt_dev.label, dev_av)[0]:.4f}")
print('Test')
print(f"AUROC: {roc_auc_score(gt_test.label, test_av):.4f}")
print(f"Accuracy: {acc_from_roc(gt_test.label, test_av)[0]:.4f}")

640
Optimizing 8 inputs.
Optimized = 0.7998269308032265
Weights = [0.30127431 0.30127431 0.30127431 0.04606828 0.027949   0.01624437
 0.00205635 0.00344334]
[('U20m', 0.3012743066889827), ('U20am', 0.3012743066889827), ('O20m', 0.3012743066889827), ('O20am', 0.046068275195908934), ('D20m', 0.02794900155202536), ('D20am', 0.01624436737708213), ('X20m', 0.0020563461505954126), ('X20am', 0.0034433435147859623)]
Valid
AUROC: 0.7998
Accuracy: 0.7324
Test
AUROC: 0.8085
Accuracy: 0.7334


In [21]:
 # Get accuracy thresholds & optimize (This does not add value to the roc auc, but just to also have an acc score)
acc, threshold = acc_from_roc(gt_dev.label, dev_av)
dev_labels = dev_av.apply(lambda x: 1 if x > threshold else 0)
dev_out = pd.DataFrame({'id': dev_idx, 'proba': dev_av, 'label': dev_labels})
test_labels = test_av.apply(lambda x: 1 if x > threshold else 0)
test_out = pd.DataFrame({'id': test_idx, 'proba': test_av, 'label': test_labels})

In [22]:
threshold

-5.098829766206683

### Simple 2 - Main Loop

In [23]:
loop, last_score, delta = 0, 0, 0.1
mean_dev_probas = dev_probas[[k for k in dev_probas.keys() if '20m' in k or '20am' in k]].copy()
mean_test_probas = test_probas[[k for k in test_probas.keys() if '20m' in k or '20am' in k]].copy()

while delta > 0.0001:

    # Individual AUROCs
    print('\n' + '-' * 21 , 'ROUND ' + str(loop) , '-' * 21)
    print("Individual AUROCs for Validation Sets:\n")
    for i, column in enumerate(mean_dev_probas):   
        score = roc_auc_score(gt_dev.label, mean_dev_probas.iloc[:, i])
        print(column, score)

    # Drop worst performing sets
    if loop > -1:
        print('\n' + '-' * 50)
        scores = mean_dev_probas.apply(lambda col: roc_auc_score(gt_dev.label, col), result_type='reduce')
        num = len(scores)
        while len(scores) > np.floor(num/2) and len(scores) > 10:
            worst = scores.idxmin()
            #del dev[worst]
            mean_dev_probas.drop(worst, axis=1, inplace=True)
            mean_test_probas.drop(worst, axis=1, inplace=True)
            scores.drop(worst, inplace=True)
            print("Dropped:", worst)

    # Simple
    print('Simple:')
    weights_dev = Simplex(mean_dev_probas, gt_dev.label)
    mean_dev_probas[f'SX_{loop}'] = average(mean_dev_probas, weights=weights_dev)
    mean_test_probas[f'SX_{loop}'] = average(mean_test_probas, weights=weights_dev)
    score = roc_auc_score(gt_dev.label, mean_dev_probas[f'SX_{loop}'])
    print(f"AUROC: {score:.4f}")
    print(f"Accuracy: {acc_from_roc(gt_dev.label, mean_dev_probas[f'SX_{loop}'])[0]:.4f}")
    print('\n' + '-' * 50)
    
    # Arithmetic Mean
    print('Arithmetic Mean:')
    mean_dev_probas[f'AM_{loop}'] = average(mean_dev_probas.apply(np.exp)).apply(np.log)
    mean_test_probas[f'AM_{loop}'] = average(mean_test_probas.apply(np.exp)).apply(np.log)
    print(f"AUROC: {roc_auc_score(gt_dev.label, mean_dev_probas[f'AM_{loop}']):.4f}")
    print(f"Accuracy: {acc_from_roc(gt_dev.label, mean_dev_probas[f'AM_{loop}'])[0]:.4f}")
    print('\n' + '-' * 50)
    
    # Geometric Mean (remain in logspace)
    print('Geometric Mean:')
    mean_dev_probas[f'GM_{loop}'] = average(mean_dev_probas)
    mean_test_probas[f'GM_{loop}'] = average(mean_test_probas)
    print(f"AUROC: {roc_auc_score(gt_dev.label, mean_dev_probas[f'GM_{loop}']):.4f}")
    print(f"Accuracy: {acc_from_roc(gt_dev.label, mean_dev_probas[f'GM_{loop}'])[0]:.4f}")
    print('\n' + '-' * 50)

    # TODO: Power Average
    '''
    print('Power Average:')
    dev_PA = simple_average(dev_probas, dev[0], power=2, normalize=True)
    test_PA = simple_average(test_probas, test[0], power=2, normalize=True)
    test_unseen_PA = simple_average(test_unseen_probas, test_unseen[0], power=2, normalize=True)
    print(roc_auc_score(dev_df.label, dev_PA.proba), accuracy_score(dev_df.label, dev_PA.label))
    print('\n' + '-' * 50)
    '''
    
    # Rank Average
    print('Rank Average:')
    mean_dev_probas[f'RA_{loop}'] = average(mean_dev_probas.apply(lambda col: rankdata(col) / len(col)))
    mean_test_probas[f'RA_{loop}'] = average(mean_test_probas.apply(lambda col: rankdata(col) / len(col)))
    print(f"AUROC: {roc_auc_score(gt_dev.label, mean_dev_probas[f'RA_{loop}']):.4f}")
    print(f"Accuracy: {acc_from_roc(gt_dev.label, mean_dev_probas[f'RA_{loop}'])[0]:.4f}")
    print('\n' + '-' * 50)
    
    # Calculate Delta & increment loop
    delta = abs(score - last_score)
    last_score = score

    loop += 1

    print("Currently at {} after {} loops.".format(last_score, loop))

# As Simplex at some point simply weighs the highest of all - lets take sx as the final prediction after x loops
dev_best = mean_dev_probas[f'SX_{loop-1}']
test_best = mean_test_probas[f'SX_{loop-1}']

 # Get accuracy thresholds & optimize (This does not add value to the roc auc, but just to also have an acc score)
acc, threshold = acc_from_roc(gt_dev.label, dev_best)
dev_labels = dev_best.apply(lambda x: 1 if x > threshold else 0)
dev_out = pd.DataFrame({'id': dev_idx, 'proba': dev_best, 'label': dev_labels})
test_labels = ts_best.apply(lambda x: 1 if x > threshold else 0)
test_out = pd.DataFrame({'id': ts_idx, 'proba': test_best, 'label': ts_labels})

print("Finished!")


--------------------- ROUND 0 ---------------------
Individual AUROCs for Validation Sets:

U20m 0.7937901123919605
U20am 0.7887937695089162
O20m 0.7710335733638265
O20am 0.7707863316541501
D20m 0.743672157492969
D20am 0.7435176314244213
X20m 0.704298915226999
X20am 0.7196175994890338

--------------------------------------------------
Simple:
640
Optimizing 8 inputs.
Optimized = 0.7998269308032265
Weights = [0.30127431 0.30127431 0.30127431 0.04606828 0.027949   0.01624437
 0.00205635 0.00344334]
AUROC: 0.7998
Accuracy: 0.7324

--------------------------------------------------
Arithmetic Mean:
AUROC: 0.7843
Accuracy: 0.7185

--------------------------------------------------
Geometric Mean:
AUROC: 0.7833
Accuracy: 0.7180

--------------------------------------------------
Rank Average:
AUROC: 0.7876
Accuracy: 0.7183

--------------------------------------------------
Currently at 0.7998269308032265 after 1 loops.

--------------------- ROUND 1 ---------------------
Individual AUROCs

NameError: name 'ts_best' is not defined

In [None]:
print('Valid')
print(f"AUROC: {roc_auc_score(gt_dev.label, dev_best):.4f}")
print(f"Accuracy: {acc_from_roc(gt_dev.label, dev_best)[0]:.4f}")
print('Test')
print(f"AUROC: {roc_auc_score(gt_test.label, test_best):.4f}")
print(f"Accuracy: {acc_from_roc(gt_test.label, test_best)[0]:.4f}")

## Score

In [None]:
print('Dev')
score = roc_auc_score(gt_dev.label, dev_out.proba)
acc, threshold = acc_from_roc(gt_dev.label, dev_out.proba)
print(f'AUROC: {score:.4f}')
print(f'Accuracy: {acc:.4f}')
print(f'Threshold: {threshold:.4f}')
print('\nTest Seen')
score = roc_auc_score(gt_ts.label, ts_out.proba)
acc, threshold = acc_from_roc(gt_ts.label, ts_out.proba)
print(f'AUROC: {score:.4f}')
print(f'Accuracy: {acc:.4f}')
print(f'Threshold: {threshold:.4f}')
print('\nTest Unseen')
score = roc_auc_score(gt_tu.label, tu_out.proba)
acc, threshold = acc_from_roc(gt_tu.label, tu_out.proba)
print(f'AUROC: {score:.4f}')
print(f'Accuracy: {acc:.4f}')
print(f'Threshold: {threshold:.4f}')

In [None]:
new_acc = 1 - np.sum(abs(gt_dev.label - dev_labels)) / len(dev_labels)
new_acc

## Dump Output

In [None]:
experiment = '98'
ts_out.to_csv(os.path.join(path, f"final/FIN_test_seen_{experiment}_{loop}.csv"), index=False)
tu_out.to_csv(os.path.join(path, f"final/FIN_test_unseen_{experiment}_{loop}.csv"), index=False)

## Model Comparison

In [None]:
all_dev_labels = pd.DataFrame({k: v.label.values for k, v in dev_or.items()})
all_dev_labels['Ensemble'] = dev_out.label.values
all_dev_labels.set_index(dev_idx, inplace=True)
all_dev_labels

In [None]:
compare = mean_test_probas.copy()
compare['EnsProb'] = test_out.proba.values
compare['EnsHard'] = test_out.label.values
compare['Correct'] = gt_test.label.values
full_idx = gt_test.img.apply(lambda x: x[4:-4]).values
compare.set_index(full_idx, inplace=True)
compare

In [None]:
diff = compare['Correct'] - compare['EnsHard']
fns = compare[diff == 1]
fps = compare[diff == -1] 
print(f'{len(fns)/len(diff)} false negatives, {len(fps)/len(diff)} false positives')

In [None]:
roc_auc_score(gt_test.label, test_out.proba)

In [None]:
(len(compare[compare['Correct'] == 1]) - len(fns))/len(diff)

In [None]:
91/640

In [None]:
fps

In [None]:
fns

In [None]:
img_id = '78156'
img = Image(filename=os.path.join(img_dir, img_id + '.png'))
display(img)

In [None]:
row = fps.loc[img_id]
models = row[[k for k in row.keys() if 'm' in k]]

N = len(models)
theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False)
radii = abs(models.values)
width = 2*np.pi / N
colors = plt.cm.viridis(radii / 10)

ax = plt.subplot(projection='polar')
ax.bar(theta, radii, width=width, color=colors, alpha=0.5, edgecolor='black')

labels = [''.join(k[:-1].split('20')) for k in models.keys()]
xtl = ax.set_xticklabels(labels)

plt.show()