## Imports

In [4]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve

## Utils

In [5]:
def average(data, weights=None):
    N = data.shape[1]
    if weights is None:
        weights = [1/N] * N
    elif np.sum(weights) != 1.:
        weights = weights / np.sum(weights)
    
    # Compute weighted avg
    return data.apply(lambda row: row.multiply(weights).sum(), axis=1)

## Ground Truth

In [13]:
gt_path = 'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\features\\annotations\\gt'
gt_dev = pd.read_json(os.path.join(gt_path, 'dev_all.jsonl'), lines=True)
gt_ts = pd.read_json(os.path.join(gt_path, 'test_seen.jsonl'), lines=True)
gt_tu = pd.read_json(os.path.join(gt_path, 'test_unseen.jsonl'), lines=True)
gt_test = gt_ts.append(gt_tu)

## Load V2 Data

In [10]:
paths = ['C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v2\\98',
        'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v2\\43']

In [11]:
# Load data
dev, ts, tu = {}, {}, {}
experiments = []
for path in tqdm(paths):
    seed = path.split('\\')[-1]
    for csv in sorted(os.listdir(path)):
        if ".csv" in csv and "test_unseen" in csv:
            exp = csv.split('_')[0]
            name = exp + seed
            experiments.append(name)            
            dev[name] = pd.read_csv(os.path.join(path, f'{exp}_dev_all.csv'))
            dev_idx = dev[name].id.values
            tu[name] = pd.read_csv(os.path.join(path, f'{exp}_test_unseen.csv'))
            tu_idx = tu[name].id.values
            ts[name] = pd.read_csv(os.path.join(path, f'{exp}_test_seen.csv'))
            ts_idx = ts[name].id.values

dev_probas = pd.DataFrame({k: v.proba.values for k, v in dev.items()})
ts_probas = pd.DataFrame({k: v.proba.values for k, v in ts.items()})
tu_probas = pd.DataFrame({k: v.proba.values for k, v in tu.items()})

100%|██████████| 2/2 [00:01<00:00,  1.04it/s]


In [12]:
# Average over seeds
seeds = [43, 98]
for model in ['U', 'O', 'D', 'X']:
    for num in [1, 5, 10, 15, 20, 36, 50, 72]:
        for flag in ['', 'a', 'c', 'ac']:
            try:
                subset = dev_probas[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                dev_probas[f'{model}{num}{flag}m'] = mean
                subset = ts_probas[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                ts_probas[f'{model}{num}{flag}m'] = mean
                subset = tu_probas[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                tu_probas[f'{model}{num}{flag}m'] = mean
            except:
                print(f'Missing {[f"{model}{num}{flag}{seed}" for seed in seeds]}')
test_probas = ts_probas.append(tu_probas)

Missing ['U72c43', 'U72c98']
Missing ['U72ac43', 'U72ac98']
Missing ['O72c43', 'O72c98']
Missing ['O72ac43', 'O72ac98']
Missing ['D72c43', 'D72c98']
Missing ['D72ac43', 'D72ac98']
Missing ['X72c43', 'X72c98']
Missing ['X72ac43', 'X72ac98']


In [19]:
for exp in test_probas:
    if exp[-1] == 'm':
        dev_probas[exp].to_csv(f'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v2\\mean\\{exp[:-1]}_dev.csv', index=False)
        #ts_probas[exp].to_csv(f'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v2\\mean\\{exp[:-1]}_ts.csv', index=False)
        #tu_probas[exp].to_csv(f'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v2\\mean\\{exp[:-1]}_tu.csv', index=False)
        test_probas[exp].to_csv(f'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v2\\mean\\{exp[:-1]}_test.csv', index=False)

## Load V3 Data

In [30]:
paths = ['C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v3\\98',
        'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v3\\61',
        'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\v3\\43']

In [31]:
# Load data
dev2, ts2, tu2 = {}, {}, {}
experiments2 = []
for path in tqdm(paths):
    seed = path.split('\\')[-1]
    for csv in sorted(os.listdir(path)):
        if ".csv" in csv and "test_unseen" in csv:
            exp = csv.split('_')[0]
            name = exp + seed
            experiments2.append(name)            
            dev2[name] = pd.read_csv(os.path.join(path, f'{exp}_dev_all.csv'))
            dev_idx2 = dev2[name].id.values
            tu2[name] = pd.read_csv(os.path.join(path, f'{exp}_test_unseen.csv'))
            tu_idx2 = tu2[name].id.values
            ts2[name] = pd.read_csv(os.path.join(path, f'{exp}_test_seen.csv'))
            ts_idx2 = ts2[name].id.values

dev_probas2 = pd.DataFrame({k: v.proba.values for k, v in dev2.items()})
ts_probas2 = pd.DataFrame({k: v.proba.values for k, v in ts2.items()})
tu_probas2 = pd.DataFrame({k: v.proba.values for k, v in tu2.items()})

100%|██████████| 3/3 [00:02<00:00,  1.40it/s]


In [32]:
# Average over seeds
seeds = [43, 61, 98]
for model in ['U', 'O', 'D', 'X']:
    for num in [20]:
        for flag in ['', 'a']:
            try:
                subset = dev_probas2[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                dev_probas2[f'{model}{num}{flag}m'] = mean
                subset = ts_probas2[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                ts_probas2[f'{model}{num}{flag}m'] = mean
                subset = tu_probas2[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                tu_probas2[f'{model}{num}{flag}m'] = mean
            except:
                print(f'Missing {[f"{model}{num}{flag}{seed}" for seed in seeds]}')
test_probas2 = ts_probas2.append(tu_probas2)

## Entities

In [33]:
paths = ['C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\entity\\98',
         'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\entity\\43',
         'C:\\Users\\obarn\\Projects\\F-MT126-1\\vilio\\data\\outputs\\entity\\61',
        ]

In [34]:
# Load data
dev3, ts3, tu3 = {}, {}, {}
experiments3 = []
for path in tqdm(paths):
    seed = path.split('\\')[-1]
    for csv in sorted(os.listdir(path)):
        if ".csv" in csv and "test_unseen" in csv:
            exp = csv.split('_')[0]
            name = exp + seed
            experiments3.append(name)            
            dev3[name] = pd.read_csv(os.path.join(path, f'{exp}_dev_all.csv'))
            dev_idx3 = dev3[name].id.values
            tu3[name] = pd.read_csv(os.path.join(path, f'{exp}_test_unseen.csv'))
            tu_idx3 = tu3[name].id.values
            ts3[name] = pd.read_csv(os.path.join(path, f'{exp}_test_seen.csv'))
            ts_idx3 = ts3[name].id.values

dev_probas3 = pd.DataFrame({k: v.proba.values for k, v in dev3.items()})
ts_probas3 = pd.DataFrame({k: v.proba.values for k, v in ts3.items()})
tu_probas3 = pd.DataFrame({k: v.proba.values for k, v in tu3.items()})

100%|██████████| 3/3 [00:00<00:00, 17.71it/s]


In [35]:
# Average over seeds
seeds = [43, 61, 98]
for model in ['U', 'O', 'D', 'X']:
    for num in [20]:
        for flag in ['', 'a']:
            try:
                subset = dev_probas3[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                dev_probas3[f'{model}{num}{flag}m'] = mean
                subset = ts_probas3[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                ts_probas3[f'{model}{num}{flag}m'] = mean
                subset = tu_probas2[[f'{model}{num}{flag}{seed}' for seed in seeds]]
                mean =  average(subset)
                tu_probas3[f'{model}{num}{flag}m'] = mean
            except:
                print(f'Missing {[f"{model}{num}{flag}{seed}" for seed in seeds]}')
test_probas3 = ts_probas3.append(tu_probas2)

## Hypothesis Test

In [36]:
deltas = []
for model in ['U', 'O', 'D', 'X']:
    for num in [20]:
        for flag in ['', 'a']:
            name = f'{model}{num}{flag}m'
            if name in ('U72cm', 'U72acm', 'O72cm', 'O72acm', 'D72cm', 'D72acm', 'X72cm', 'X72acm'):
                continue
            deltas.append(roc_auc_score(gt_test.label, test_probas3[name]) - roc_auc_score(gt_test.label, test_probas2[name]))

N = len(deltas)
mu_hat = np.mean(deltas)
sigma = np.std(deltas)
sigma = np.sqrt(np.square(sigma)*(N/(N-1)))
print(mu_hat)
print(sigma)

0.006843125687316709
0.0065215556598565184
