# Edgeprobe Aggregate Analysis

This notebook is intended to be run on the output of the [`analyze_runs.py`](analyze_runs.py) script; run that on a folder of experiments to produce a `scores.tsv` file that can be loaded here.

In [1]:
import sys, os, re, json
from importlib import reload
import itertools
import collections

import numpy as np
import pandas as pd

import analysis
reload(analysis)

tasks = analysis.TASKS
exp_types = analysis.EXP_TYPES
palette = analysis.EXP_PALETTE

task_sort_key = analysis.task_sort_key
exp_type_sort_key = analysis.exp_type_sort_key

from scipy.special import logsumexp
from scipy.stats import entropy

def softmax(x, axis=None):
    """Softmax of `x` along `axis`, computed in log space.

    Subtracting the log-sum-exp before exponentiating is what normalizes the
    result. With axis=None the normalization runs over every element of `x`.
    """
    log_normalizer = logsumexp(x, axis=axis, keepdims=True)
    return np.exp(x - log_normalizer)

In [2]:
import bokeh
import bokeh.plotting as bp
bp.output_notebook()

import datetime
import socket
def get_compact_timestamp():
    """Return the current local time formatted as `YYYYMMDD.HHMMSS`."""
    return f"{datetime.datetime.now():%Y%m%d.%H%M%S}"

def _save_figure_to_bucket(fig, name, title=None, export_format="html"):
    now = get_compact_timestamp()
    fname = f"{name}.{now:s}.{export_format}"
    title = title or name
    if fname.endswith('.png'):
        bokeh.io.export_png(p, os.path.join("/tmp", fname))
    else:
        bp.save(p, os.path.join("/tmp", fname), title=title, 
                resources=bokeh.resources.CDN)
    hostname = socket.gethostname()
    GCP_PROJECT="edge-probing"
    !gsutil cp /tmp/$fname gs://$GCP_PROJECT/$hostname/plots/$fname
    !gsutil acl ch -u AllUsers:R gs://$GCP_PROJECT/$hostname/plots/$fname
    url = f"https://storage.googleapis.com/{GCP_PROJECT}/{hostname}/plots/{fname}"
    print(f"Public URL: {url}")
    return url

In [3]:
ID_COLS = ['run', 'task', 'split']

def agg_label_group(df, task_predicate, label_predicate, group_name):
    """Sum the `*_count` columns over a group of labels, per (run, task, split).

    Rows are kept when both `task_predicate(task)` and `label_predicate(label)`
    hold. The summed rows come back with `label` set to `group_name`.
    """
    count_cols = [col for col in df.columns if col.endswith("_count")]
    task_ok = df['task'].map(task_predicate)
    label_ok = df['label'].map(label_predicate)
    summed = (
        df[task_ok & label_ok]
        .groupby(by=ID_COLS)
        .agg(dict.fromkeys(count_cols, "sum"))
        .reset_index()
    )
    return summed.assign(label=group_name)

def agg_stratifier_group(df, stratifier, key_predicate, group_name):
    """Sum the `*_count` columns over selected strata, per (run, task, split).

    A row is kept when its `stratifier` equals the requested one and
    `key_predicate(stratum_key)` holds. The summed rows come back with `label`
    set to `group_name`.
    """
    count_cols = [col for col in df.columns if col.endswith("_count")]

    def _selected(row_stratifier, row_key):
        # `and` short-circuits, so key_predicate is never called on keys that
        # belong to a different stratifier (and may be invalid for it).
        return row_stratifier == stratifier and key_predicate(row_key)

    row_mask = list(map(_selected, df['stratifier'], df['stratum_key']))
    summed = (
        df[row_mask]
        .groupby(by=ID_COLS)
        .agg(dict.fromkeys(count_cols, "sum"))
        .reset_index()
    )
    return summed.assign(label=group_name)

def load_scores_file(filename, tag=None, seed=None):
    """Load one scores TSV and augment it with derived rows and id columns.

    Args:
        filename: path to a tab-separated scores file with a header row.
        tag: optional label; when not None it is stored in a leading `tag` column.
        seed: value stored in the `seed` column (inserted even when None).

    Returns:
        A DataFrame holding the original rows plus extra aggregate rows
        (labels `_core_`, `_non_core_`, `_clean_micro_`, `_pos_`,
        `_nonterminal_`), and the added columns `exp_name`, `exp_type`,
        `layer_num`, `seed` and, optionally, `tag`.
    """
    df = pd.read_csv(filename, sep="\t", header=0)
    # 'Unnamed: 0' is presumably a serialized index column; tolerate files
    # that were written without it instead of raising KeyError.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    df['task'] = df['task'].map(analysis.clean_task_name)
    # Score files without stratified results lack these columns; add
    # placeholders so agg_stratifier_group can still be applied below.
    if "stratifier" not in df.columns:
        df["stratifier"] = None
    if "stratum_key" not in df.columns:
        df["stratum_key"] = 0

    ###
    # Add additional custom aggregations
    _eg = []
    # SRL core, non-core, and cleaned micro F1
    _eg.append(agg_label_group(df, analysis.is_srl_task, analysis.is_core_role, "_core_"))
    _eg.append(agg_label_group(df, analysis.is_srl_task, analysis.is_non_core_role, "_non_core_"))
    _eg.append(agg_label_group(df, analysis.is_srl_task, analysis.is_core_or_noncore, "_clean_micro_"))
    # Constituents: split into POS, nonterminals
    _eg.append(agg_stratifier_group(df, 'info.height', lambda x: int(x) == 1, "_pos_"))
    _eg.append(agg_stratifier_group(df, 'info.height', lambda x: int(x) > 1, "_nonterminal_"))
    # Relations: ignore negative class (no_relation)
    _eg.append(agg_label_group(df, analysis.is_relation_task, analysis.is_positive_relation, "_clean_micro_"))
    df = pd.concat([df] + _eg, ignore_index=True, sort=False)

    # The experiment name is the parent directory of each run path.
    df.insert(0, "exp_name", df['run'].map(lambda p: os.path.basename(os.path.dirname(p.strip("/")))))
    df.insert(1, "exp_type", df['exp_name'].map(analysis.get_exp_type))
    df.insert(1, "layer_num", df['exp_name'].map(analysis.get_layer_num))
    if tag is not None:
        df.insert(0, "tag", tag)
    df.insert(1, "seed", seed)
    return df

## Specify score files and load

In [16]:
# Add (tag, path/to/scores.tsv) tuples here; results will be concatenated.
score_files = [
    ("base", "/nfs/jiant/exp/iftenney/20190721-test-ep-bert/stats.tsv"),
    ("base", "/nfs/jiant/exp/iftenney/20190721-test-ep-bert-medium/stats.tsv"),
]

# Load each file with its tag, then stack everything into one long frame.
dfs = []
for tag, score_file in score_files:
    dfs.append(load_scores_file(score_file, tag=tag))

df = pd.concat(dfs, ignore_index=True, sort=False)
def _format_display_col(exp_type, layer_num, tag):
    ret = exp_type
    if layer_num:
        ret += f"-{layer_num}"
    if tag:
        ret += f" ({tag})"
    return ret

# One header per row, built from that row's experiment type, layer and tag.
df['display_col'] = [
    _format_display_col(*fields)
    for fields in zip(df.exp_type, df.layer_num, df.tag)
]
# Quick look at which tasks and experiment types were loaded.
print(df['task'].unique())
print(df['exp_type'].unique())

['rel-semeval' 'spr2' 'coref-ontonotes' 'ner-ontonotes' 'srl-ontonotes']
['bert-base-uncased-lex' 'bert-base-uncased-mix']


In [17]:
# The return value is discarded, so this is called for its effect on `df`.
# The code below reads 'f1_score' and 'f1_errn95', which are presumably added
# by this call -- TODO confirm against analysis.score_from_confusion_matrix.
analysis.score_from_confusion_matrix(df)

def _get_final_score(row):
    return row['f1_score'], row['f1_errn95']

# Score every row, then transpose the (score, error) pairs into two columns.
df['score'], df['score_errn95'] = zip(
    *map(_get_final_score, (record for _, record in df.iterrows()))
)

For DPR, we need to average across multiple runs to get a good estimate of performance.

In [18]:
# DPR rows that hold real scores and were run with an explicit seed.
mask = (
    (df['task'] == 'dpr')
    & (df['label'] != "__run_info__")
    & df['seed'].notnull()
)
gb_cols = ["tag", "exp_name", "exp_type", "task", "label", "split", "display_col"]
gb = df[mask].groupby(by=gb_cols)

# Collapse each group of seeds into one "_mean_" row.
new_rows = []
for key, idxs in gb.groups.items():
    seed_scores = df.loc[idxs, "score"]
    new_row = dict(zip(gb_cols, key))
    new_row.update({
        "seed": "_mean_",
        "score": seed_scores.mean(),
        # 1.96 standard errors of the mean across seeds.
        "score_errn95": 1.96 * np.sqrt(seed_scores.var()/len(idxs)),
    })
    new_rows.append(new_row)

agg_df = pd.DataFrame.from_records(new_rows)
df = pd.concat([df, agg_df], ignore_index=True, sort=False)

For SemEval 2010 Task 8, the official metric is macro-averaged F1 over non-Other labels. Compute this so we can compare to SOTA.

In [19]:
# rel-semeval rows that have a split and a positive relation label.
mask = (
    (df['task'] == 'rel-semeval')
    & df['split'].notnull()
    & df['label'].map(analysis.is_positive_relation)
)
_id_cols = ['run', 'split']
_agg_cols = ['score']
# Unweighted mean of the per-label scores for each (run, split).
gb = df.loc[mask, _id_cols + _agg_cols].groupby(_id_cols)
afd = gb.agg('mean').reset_index()

csv_args = {"float_format": "%.4f"}
print(afd.to_csv(index=False, **csv_args))

run,split,score
/nfs/jiant/exp/iftenney/20190721-test-ep-bert//bert-base-uncased-lex-edges-rel-semeval/run,test,0.5261
/nfs/jiant/exp/iftenney/20190721-test-ep-bert//bert-base-uncased-lex-edges-rel-semeval/run,val,0.5013
/nfs/jiant/exp/iftenney/20190721-test-ep-bert//bert-base-uncased-mix-edges-rel-semeval/run,test,0.7442
/nfs/jiant/exp/iftenney/20190721-test-ep-bert//bert-base-uncased-mix-edges-rel-semeval/run,val,0.7531



## Compute clean metrics for each task

For most tasks this is just the micro or macro average F1, but we need to ignore the 0 label for coref, and drop references and continuations for SRL.

In [23]:
SPLIT = "test"
# SPLIT = "val"
mask = df['split'] == SPLIT

# For every task, keep only the rows whose label is that task's headline metric.
final_scores = []
for task in df['task'].unique():
    task_scores = df[mask & (df['task'] == task)]
    if analysis.is_coref_task(task):
        keep_labels = ["1"]
        # For GAP coref, have stratified by gender
        if task.startswith("coref-gap"):
            keep_labels += ["_info.pronoun_gender_MASCULINE_1_",
                            "_info.pronoun_gender_FEMININE_1_"]
    elif task == "dpr":
        # DPR additionally requires the seed-averaged rows, so it is
        # selected with its own mask rather than by label alone.
        dpr_mask = task_scores['seed'] == "_mean_"
        dpr_mask &= task_scores['label'] == "_micro_avg_"
        final_scores.append(task_scores[dpr_mask])
        continue
    elif analysis.is_srl_task(task):
        # Use clean version, average only over core or noncore roles.
        keep_labels = ['_core_', '_non_core_', '_clean_micro_']
    elif analysis.is_relation_task(task):
        # Relation tasks include specific "no_relation" label
        keep_labels = ['_clean_micro_']
    elif task == "noun-verb":
        # Noun-verb reports accuracy on VERB class
        keep_labels = ['VERB']
    else:
        keep_labels = ['_micro_avg_']
    final_scores.extend(
        [task_scores[task_scores['label'] == wanted] for wanted in keep_labels]
    )

fdf = pd.concat(final_scores, axis=0, ignore_index=True, sort=False)
# fdf['task_and_metric'] = ["%s-%s" % tl for tl in zip(fdf.task, fdf.label)]
def format_display_row(task, label, seed):
    """Build the row header "<task>-<label>", with ":<seed>" appended if set.

    Args:
        task: task name.
        label: metric/label name.
        seed: seed identifier; treated as missing when it is None, NaN or the
            empty string.

    Returns:
        The formatted header. Seed 0 is kept, and a NaN seed does not produce
        a ":nan" suffix.
    """
    ret = f"{task}-{label}"
    # Plain truthiness would drop a legitimate seed of 0 and accept NaN.
    if not (pd.isna(seed) or seed == ""):
        ret += f":{seed}"
    return ret

# One row header per result, built from its task, label and seed.
fdf['display_row'] = list(map(format_display_row, fdf.task, fdf.label, fdf.seed))
print(len(fdf))
fdf

14


Unnamed: 0,tag,seed,exp_name,layer_num,exp_type,run,task,split,label,fn_count,...,precision,recall,f1_score,accuracy_errn95,precision_errn95,recall_errn95,f1_errn95,score,score_errn95,display_row
0,base,,bert-base-uncased-lex-edges-rel-semeval,,bert-base-uncased-lex,/nfs/jiant/exp/iftenney/20190721-test-ep-bert/...,rel-semeval,test,_clean_micro_,1153,...,0.711538,0.490499,0.580696,0.001578,0.022482,0.020597,0.021498,0.580696,0.021498,rel-semeval-_clean_micro_
1,base,,bert-base-uncased-mix-edges-rel-semeval,,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-test-ep-bert/...,rel-semeval,test,_clean_micro_,494,...,0.864614,0.781706,0.821072,0.001104,0.014825,0.01702,0.015847,0.821072,0.015847,rel-semeval-_clean_micro_
2,base,,bert-base-uncased-lex-edges-spr2,,bert-base-uncased-lex,/nfs/jiant/exp/iftenney/20190721-test-ep-bert/...,spr2,test,_micro_avg_,846,...,0.829507,0.80394,0.816523,0.006187,0.011398,0.011846,0.011618,0.816523,0.011618,spr2-_micro_avg_
3,base,,bert-base-uncased-mix-edges-spr2,,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-test-ep-bert/...,spr2,test,_micro_avg_,750,...,0.84881,0.826188,0.837346,0.005882,0.010834,0.011307,0.011066,0.837346,0.011066,spr2-_micro_avg_
4,base,,bert-base-uncased-lex-edges-coref-ontonotes,,bert-base-uncased-lex,/nfs/jiant/exp/iftenney/20190721-test-ep-bert-...,coref-ontonotes,test,1,1666,...,0.767659,0.72344,0.744894,0.003639,0.010986,0.011296,0.011139,0.744894,0.011139,coref-ontonotes-1
5,base,,bert-base-uncased-mix-edges-coref-ontonotes,,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-test-ep-bert-...,coref-ontonotes,test,1,507,...,0.893441,0.915837,0.9045,0.002355,0.007696,0.007011,0.007338,0.9045,0.007338,coref-ontonotes-1
6,base,,bert-base-uncased-lex-edges-ner-ontonotes,,bert-base-uncased-lex,/nfs/jiant/exp/iftenney/20190721-test-ep-bert-...,ner-ontonotes,test,_micro_avg_,1374,...,0.928761,0.890831,0.909401,0.000407,0.004589,0.005448,0.004982,0.909401,0.004982,ner-ontonotes-_micro_avg_
7,base,,bert-base-uncased-mix-edges-ner-ontonotes,,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-test-ep-bert-...,ner-ontonotes,test,_micro_avg_,550,...,0.968147,0.956301,0.962187,0.000266,0.003087,0.003571,0.003312,0.962187,0.003312,ner-ontonotes-_micro_avg_
8,base,,bert-base-uncased-lex-edges-srl-ontonotes,,bert-base-uncased-lex,/nfs/jiant/exp/iftenney/20190721-test-ep-bert-...,srl-ontonotes,test,_core_,12281,...,0.79527,0.705753,0.747842,0.000625,0.004109,0.004372,0.004237,0.747842,0.004237,srl-ontonotes-_core_
9,base,,bert-base-uncased-mix-edges-srl-ontonotes,,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-test-ep-bert-...,srl-ontonotes,test,_core_,2885,...,0.947286,0.930877,0.93901,0.00032,0.002163,0.002434,0.00229,0.93901,0.00229,srl-ontonotes-_core_


Pivot DataFrame to present each task on a row, and each experiment on a column.

This form is suitable to copy-paste into a spreadsheet.

In [24]:
# Wide form for the spreadsheet: one row per task metric, one column per
# experiment, both put into a (mostly) stable order.
sheet_df = fdf.pivot(index="display_row", columns="display_col", values="score")
sheet_df = sheet_df.reindex(
    index=sorted(sheet_df.index, key=task_sort_key),
    columns=sorted(sheet_df.columns, key=exp_type_sort_key),
)
# Scores are printed as percentages.
print((100*sheet_df).to_csv(**csv_args))

display_row,bert-base-uncased-lex (base),bert-base-uncased-mix (base)
ner-ontonotes-_micro_avg_,90.9401,96.2187
srl-ontonotes-_clean_micro_,75.3051,91.5637
srl-ontonotes-_core_,74.7842,93.9010
srl-ontonotes-_non_core_,76.5765,86.0360
coref-ontonotes-1,74.4894,90.4500
spr2-_micro_avg_,81.6523,83.7346
rel-semeval-_clean_micro_,58.0696,82.1072



Print the same table, but with the 95% confidence interval half-width (the ± margin) of each score in place of the score itself.

In [25]:
# Same wide layout as the score table, filled with the 95% error values.
sheet_df = fdf.pivot(index="display_row", columns="display_col", values="score_errn95")
sheet_df = (
    sheet_df
    .reindex(columns=sorted(sheet_df.columns, key=exp_type_sort_key))
    .reindex(index=sorted(sheet_df.index, key=task_sort_key))
)
# Printed on the same percentage scale as the scores.
print((100*sheet_df).to_csv(**csv_args))

display_row,bert-base-uncased-lex (base),bert-base-uncased-mix (base)
ner-ontonotes-_micro_avg_,0.4982,0.3312
srl-ontonotes-_clean_micro_,0.3499,0.2206
srl-ontonotes-_core_,0.4237,0.2290
srl-ontonotes-_non_core_,0.5961,0.4978
coref-ontonotes-1,1.1139,0.7338
spr2-_micro_avg_,1.1618,1.1066
rel-semeval-_clean_micro_,2.1498,1.5847

