# Edgeprobe Aggregate Analysis

This notebook is intended to be run on the output of the [`analyze_runs.py`](analyze_runs.py) script; run that on a folder of experiments to produce a `scores.tsv` file that can be loaded here.

**Note:** This is the notebook used to produce the plots and tables in the paper. Keep a frozen copy for posterity.

In [1]:
import sys, os, re, json
from importlib import reload
import itertools
import collections

import numpy as np
import pandas as pd

In [2]:
import bokeh
import bokeh.plotting as bp
bp.output_notebook()

In [3]:
import analysis
reload(analysis)

<module 'analysis' from '/nfs/jsalt/home/iftenney/jiant_test/probing/analysis.py'>

In [16]:
# Probing tasks, listed in the order used to sort rows of tables and plots
# (task_sort_key ranks a name by the first entry here that it starts with).
tasks = [
    "pos-ontonotes",
    "constituent-ontonotes",
    "nonterminal-ontonotes",
    "dep-labeling-ewt",
    "ner-ontonotes",
    "srl-conll2012",
    "coref-ontonotes-conll",
    "spr1",
    "spr2",
    "dpr",
]

# exp_types = [
#     "glove",
#     "cove",
#     "elmo-chars",
#     "elmo-ortho",
#     "elmo-full",
#     "openai"
# ]
# Colors are drawn from bokeh's 20-entry Category20 palette, see
# https://bokeh.pydata.org/en/latest/docs/reference/palettes.html
_clist = bokeh.palettes.Category20[20]
# (experiment type, palette index of the darker shade); the lighter shade of
# the same hue is the next palette entry.
exp_types_clist_idx = [
    ("glove", 2), # orange
    ("cove",  6), # deep red
    ("elmo-chars", 18), # aqua
    ("elmo-ortho", 8), # purple
    ("elmo-full", 0), # blue
    ("openai-lex", 16), # olive
    ("openai-cat", 4), # green
    ("openai", 4), # green
    ("openai-bwb", 12), # pink
    ("train-chars", 10), # brown
]
# Base models get the lighter shade; both CNN variants share the darker one.
exp_types_colored = collections.OrderedDict()
for base_name, dark_idx in exp_types_clist_idx:
    exp_types_colored[base_name] = _clist[dark_idx + 1]
    for cnn_suffix in ("-cnn1", "-cnn2"):
        exp_types_colored[base_name + cnn_suffix] = _clist[dark_idx]

def exp_type_sort_key(candidate):
    """Sort key that orders column names by experiment type.

    Args:
        candidate: either a bare experiment type ("elmo-full") or a display
            name of the form "<exp_type> (<tag>)" as stored in `display_col`.

    Returns:
        Tuple of (position of the experiment type in the module-level
        `exp_types`, candidate); the candidate string breaks ties.

    Raises:
        ValueError: if the experiment type is not present in `exp_types`.
    """
    # partition() also accepts names without a " (tag)" suffix; unpacking the
    # result of split(" ", 1) into two names failed for those.
    exp_type = candidate.partition(" ")[0]
    return (exp_types.index(exp_type), candidate)

def task_sort_key(candidate):
    """Sort key that places names in the order of the module-level `tasks`.

    A candidate is ranked by the first entry of `tasks` that it starts with;
    names matching no entry rank after all known tasks. The candidate string
    itself breaks ties.
    """
    rank = next(
        (position for position, prefix in enumerate(tasks)
         if candidate.startswith(prefix)),
        len(tasks),
    )
    return (rank, candidate)

# Parallel tuples of experiment names and their colors, in insertion order.
exp_types = tuple(exp_types_colored.keys())
palette = tuple(exp_types_colored.values())
# Color mapper that looks up each row's 'exp_type' value among exp_types.
fill_cmap = bokeh.transform.factor_cmap(field_name='exp_type',
                                        palette=palette,
                                        factors=exp_types)

exp_types_colored

OrderedDict([('glove', '#ffbb78'),
             ('glove-cnn1', '#ff7f0e'),
             ('glove-cnn2', '#ff7f0e'),
             ('cove', '#ff9896'),
             ('cove-cnn1', '#d62728'),
             ('cove-cnn2', '#d62728'),
             ('elmo-chars', '#9edae5'),
             ('elmo-chars-cnn1', '#17becf'),
             ('elmo-chars-cnn2', '#17becf'),
             ('elmo-ortho', '#c5b0d5'),
             ('elmo-ortho-cnn1', '#9467bd'),
             ('elmo-ortho-cnn2', '#9467bd'),
             ('elmo-full', '#aec7e8'),
             ('elmo-full-cnn1', '#1f77b4'),
             ('elmo-full-cnn2', '#1f77b4'),
             ('openai-lex', '#dbdb8d'),
             ('openai-lex-cnn1', '#bcbd22'),
             ('openai-lex-cnn2', '#bcbd22'),
             ('openai-cat', '#98df8a'),
             ('openai-cat-cnn1', '#2ca02c'),
             ('openai-cat-cnn2', '#2ca02c'),
             ('openai', '#98df8a'),
             ('openai-cnn1', '#2ca02c'),
             ('openai-cnn2', '#2ca02c'),
        

In [17]:
def get_exp_type(exp_name):
    """Extract the experiment-type prefix from an experiment name.

    Names are expected to look like "<exp_type>-edges-<task>", e.g.
    "elmo-chars-edges-srl-conll2012" yields "elmo-chars".

    Args:
        exp_name: experiment name (string).

    Returns:
        The part of the name before "-edges-".

    Raises:
        AssertionError: if the name does not match the expected pattern.
    """
    m = re.match(r"([a-z-]+)-edges-([a-z-]+)", exp_name)
    # The message used to reference an undefined `run_path`, so a malformed
    # name surfaced as a NameError instead of this diagnostic.
    assert m is not None, f"Unable to parse run name: {exp_name}"
    return m.group(1)

def clean_task_name(task_name):
    """Strip a leading "edges-" and a trailing "-openai" from a task name."""
    cleaned = task_name
    for pattern in (r"^edges-", r"-openai$"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

# Columns the custom aggregations below group by when summing count columns.
ID_COLS = ['run', 'task', 'split']

def is_core_role(label):
    """Return True for the labels ARG0-ARG5 and ARGA, False otherwise."""
    return bool(re.match(r"^ARG[0-5A]$", label))

def is_non_core_role(label):
    """Return True for "ARGM" and any "ARGM-<suffix>" label, False otherwise."""
    return bool(re.match(r"^ARGM(-.+)?$", label))

def is_core_or_noncore(label):
    """Return True when the label passes is_core_role or is_non_core_role."""
    return any(check(label) for check in (is_core_role, is_non_core_role))

def is_srl_task(task):
    """Return True for task names that begin with "srl-"."""
    prefix = "srl-"
    return task.startswith(prefix)

def is_coref_task(task):
    """Return True for task names that begin with "coref-"."""
    prefix = "coref-"
    return task.startswith(prefix)

def is_constituent_task(task):
    """Return True for task names that begin with "constituent-"."""
    prefix = "constituent-"
    return task.startswith(prefix)

def agg_label_group(df, task_predicate, label_predicate, group_name):
    """Sum the count columns over a group of labels, per ID_COLS combination.

    Args:
        df: scores frame with 'task' and 'label' columns, the ID_COLS columns,
            and the columns whose names end in "_count".
        task_predicate: callable applied to each task name; rows are kept
            where it is true.
        label_predicate: callable applied to each label; rows are kept where
            it is true.
        group_name: value written to the 'label' column of the result.

    Returns:
        A new frame with one row per ID_COLS combination holding the summed
        "_count" columns, with 'label' set to group_name.
    """
    count_columns = [col for col in df.columns if col.endswith("_count")]
    keep_task = df['task'].map(task_predicate)
    keep_label = df['label'].map(label_predicate)
    pooled = (
        df[keep_task & keep_label]
        .groupby(by=ID_COLS)
        .agg(dict.fromkeys(count_columns, "sum"))
        .reset_index()
    )
    pooled['label'] = group_name
    return pooled

def agg_stratifier_group(df, stratifier, key_predicate, group_name):
    """Sum the count columns over a group of strata, per ID_COLS combination.

    Args:
        df: scores frame with 'stratifier' and 'stratum_key' columns, the
            ID_COLS columns, and the columns whose names end in "_count".
        stratifier: value the 'stratifier' column must equal.
        key_predicate: callable applied to each 'stratum_key'; rows are kept
            where it is true.
        group_name: value written to the 'label' column of the result.

    Returns:
        A new frame with one row per ID_COLS combination holding the summed
        "_count" columns, with 'label' set to group_name.
    """
    count_columns = [col for col in df.columns if col.endswith("_count")]
    right_stratifier = df['stratifier'] == stratifier
    wanted_stratum = df['stratum_key'].map(key_predicate)
    pooled = (
        df[right_stratifier & wanted_stratum]
        .groupby(by=ID_COLS)
        .agg(dict.fromkeys(count_columns, "sum"))
        .reset_index()
    )
    pooled['label'] = group_name
    return pooled

def load_scores_file(filename, tag=None):
    """Load one scores TSV and append the custom aggregate rows.

    Args:
        filename: path (or file-like object) of a tab-separated scores file
            with a header row.
        tag: optional value stored in a leading 'tag' column of the result.

    Returns:
        DataFrame holding the file's rows followed by the extra aggregate
        rows (labels "_core_", "_non_core_", "_clean_micro_", "_pos_",
        "_nonterminal_"), with 'exp_name' and 'exp_type' columns derived from
        the 'run' path.
    """
    df = pd.read_csv(filename, sep="\t", header=0)
    # 'Unnamed: 0' is presumably the row index saved with the table; files
    # written without it no longer fail with a KeyError here.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    # df['task_raw'] = df['task'].copy()
    df['task'] = df['task'].map(clean_task_name)
    # Older score files may lack the stratification columns; add placeholders
    # so the aggregations below can always reference them.
    if "stratifier" not in df.columns:
        df["stratifier"] = None
    if "stratum_key" not in df.columns:
        df["stratum_key"] = 0

    ###
    # Add additional custom aggregations
    _eg = []
    # SRL core, non-core, and cleaned micro F1
    _eg.append(agg_label_group(df, is_srl_task, is_core_role, "_core_"))
    _eg.append(agg_label_group(df, is_srl_task, is_non_core_role, "_non_core_"))
    _eg.append(agg_label_group(df, is_srl_task, is_core_or_noncore, "_clean_micro_"))
    # Constituents: split into POS, nonterminals
    _eg.append(agg_stratifier_group(df, 'info.height', lambda x: x == 1, "_pos_"))
    _eg.append(agg_stratifier_group(df, 'info.height', lambda x: x > 1, "_nonterminal_"))
    df = pd.concat([df] + _eg, ignore_index=True, sort=False)

    # The experiment name is the parent directory of the run directory.
    df.insert(0, "exp_name", df['run'].map(lambda p: os.path.basename(os.path.dirname(p.strip("/")))))
    df.insert(1, "exp_type", df['exp_name'].map(get_exp_type))
    if tag is not None:
        df.insert(0, "tag", tag)
    return df

In [18]:
# Previously used set of score files, kept for reference:
# score_files = [
#     ("base", "/nfs/jsalt/home/iftenney/exp/edges-20180913/scores_cm_newd.tsv"),
# #     ("base", "/nfs/jsalt/home/iftenney/exp/edges-20180921-train/scores_cm.tsv"),
#     ("cnn1", "/nfs/jsalt/home/iftenney/exp/edges-20180913-cnn1/scores_cm_newd.tsv"),
#     ("cnn2", "/nfs/jsalt/home/iftenney/exp/edges-20180914-cnn2/scores_cm_newd.tsv"),
# #     ("bow", "/nfs/jsalt/home/iftenney/exp/edges-20180920-bow/scores_cm.tsv"),
# ]
# Each entry is (tag, path to a scores TSV); every row loaded from the file
# is labeled with that tag.
score_files = [
    ("base", "/nfs/jsalt/home/iftenney/exp/edges-20180921-openai/scores_cm.tsv"),
    ("base", "/nfs/jsalt/home/iftenney/exp/edges-20180922-openai/scores_cm.tsv"),
    ("base", "/nfs/jsalt/home/iftenney/exp/final_20180927/base/scores_cm.tsv"),
    ("cnn1", "/nfs/jsalt/home/iftenney/exp/final_20180927/cnn1/scores_cm.tsv"),
    ("cnn2", "/nfs/jsalt/home/iftenney/exp/final_20180927/cnn2/scores_cm.tsv"),
]
# scores_file = "/nfs/jsalt/home/iftenney/exp/edges-20180921-openai/scores_cm.tsv"
dfs = []
for tag, score_file in score_files:
    dfs.append(load_scores_file(score_file, tag=tag))
df = pd.concat(dfs, ignore_index=True, sort=False)
# "exp_type (tag)" label, used as the column header of the sheets below.
df['display_col'] = [f"{name} ({run_tag})"
                     for name, run_tag in zip(df.exp_type, df.tag)]
print(df['task'].unique())
print(df['exp_type'].unique())

['constituent-ontonotes' 'coref-ontonotes-conll' 'dep-labeling-ewt' 'dpr'
 'ner-ontonotes' 'spr1' 'spr2' 'srl-conll2012' 'nonterminal-ontonotes'
 'pos-ontonotes']
['openai-cat' 'openai-lex' 'openai-bwb' 'cove' 'elmo-chars' 'elmo-full'
 'elmo-ortho' 'glove' 'openai']


In [19]:
def harmonic_mean(a, b):
    """Return the harmonic mean 2ab / (a + b) of the two arguments."""
    numerator = 2 * a * b
    denominator = a + b
    return numerator / denominator

# Precision / recall / F1 for every row, from the raw confusion counts.
predicted_positive = df['tp_count'] + df['fp_count']
actual_positive = df['tp_count'] + df['fn_count']
df['precision'] = df['tp_count'] / predicted_positive
df['recall'] = df['tp_count'] / actual_positive
# Rows whose F1 comes out as NaN (e.g. 0/0) are reported as 0.
df['f1_score'] = harmonic_mean(df['precision'], df['recall']).fillna(0)

# Approximate 95% error intervals using the normal approximation to the
# binomial proportion, see
# https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
z = 1.96 # 95% confidence
df['precision_errn95'] = z * np.sqrt(
    df['precision'] * (1 - df['precision']) / predicted_positive)
df['recall_errn95'] = z * np.sqrt(
    df['recall'] * (1 - df['recall']) / actual_positive)
# This is almost certainly not the right way to combine for F1 score - TODO to figure this out?
df['f1_errn95'] = harmonic_mean(df['precision_errn95'], df['recall_errn95'])

## Compute clean metrics for each task

For most tasks this is just the micro-averaged F1 (`_micro_avg_`). For coref we ignore the `0` label and report only label `1`; for SRL we drop references and continuations by scoring only core (`ARG0`–`ARG5`, `ARGA`) and non-core (`ARGM*`) roles.

In [88]:
SPLIT = "test"
# Restrict to the chosen split and leave out the plain "openai" experiments
# and the combined constituent task.
mask = (
    (df['split'] == SPLIT)
    & (df['exp_type'] != "openai")
    & (df['task'] != "constituent-ontonotes")
)
# mask &= df['exp_type'].map(lambda t: t in {'elmo-chars', 'elmo-full', 'elmo-ortho'})
# mask &= df['tag'] == "base"


def _labels_to_report(task_name):
    """Return the labels whose rows make up the final scores for a task."""
    if is_coref_task(task_name):
        return ["1"]
    if is_srl_task(task_name):
        # Use clean version, average only over core or noncore roles.
        return ['_core_', '_non_core_', '_clean_micro_']
    if is_constituent_task(task_name):
        return ['_pos_', '_nonterminal_', '_micro_avg_']
    return ['_micro_avg_']


final_scores = []
for task_name in df['task'].unique():
    rows = df[mask & (df['task'] == task_name)]
    final_scores.extend(rows[rows['label'] == wanted]
                        for wanted in _labels_to_report(task_name))

fdf = pd.concat(final_scores, axis=0, ignore_index=True, sort=False)
fdf['task_and_metric'] = [f"{task_name}-{label}"
                          for task_name, label in zip(fdf.task, fdf.label)]
print(len(fdf))
fdf

108


Unnamed: 0,tag,exp_name,exp_type,label,num_epochs,num_steps,run,task,split,fn_count,...,stratifier,stratum_key,display_col,precision,recall,f1_score,precision_errn95,recall_errn95,f1_errn95,task_and_metric
0,base,openai-cat-edges-coref-ontonotes-conll,openai-cat,1,,,/nfs/jsalt/home/iftenney/exp/edges-20180921-op...,coref-ontonotes-conll,test,1031.0,...,,,openai-cat (base),0.843982,0.828851,0.836348,0.009247,0.009511,0.009377,coref-ontonotes-conll-1
1,base,openai-lex-edges-coref-ontonotes-conll,openai-lex,1,,,/nfs/jsalt/home/iftenney/exp/edges-20180921-op...,coref-ontonotes-conll,test,1998.0,...,,,openai-lex (base),0.774976,0.668327,0.717711,0.011356,0.011889,0.011617,coref-ontonotes-conll-1
2,base,openai-bwb-edges-coref-ontonotes-conll,openai-bwb,1,,,/nfs/jsalt/home/iftenney/exp/edges-20180922-op...,coref-ontonotes-conll,test,1085.0,...,,,openai-bwb (base),0.837119,0.819887,0.828413,0.009422,0.009704,0.009561,coref-ontonotes-conll-1
3,base,cove-edges-coref-ontonotes-conll,cove,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/ba...,coref-ontonotes-conll,test,1339.0,...,,,cove (base),0.807341,0.777722,0.792255,0.010147,0.010500,0.010320,coref-ontonotes-conll-1
4,base,elmo-chars-edges-coref-ontonotes-conll,elmo-chars,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/ba...,coref-ontonotes-conll,test,1644.0,...,,,elmo-chars (base),0.780610,0.727092,0.752901,0.010828,0.011249,0.011035,coref-ontonotes-conll-1
5,base,elmo-full-edges-coref-ontonotes-conll,elmo-full,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/ba...,coref-ontonotes-conll,test,979.0,...,,,elmo-full (base),0.842237,0.837483,0.839854,0.009231,0.009316,0.009274,coref-ontonotes-conll-1
6,base,elmo-ortho-edges-coref-ontonotes-conll,elmo-ortho,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/ba...,coref-ontonotes-conll,test,1183.0,...,,,elmo-ortho (base),0.792178,0.803619,0.797857,0.010173,0.010032,0.010102,coref-ontonotes-conll-1
7,base,glove-edges-coref-ontonotes-conll,glove,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/ba...,coref-ontonotes-conll,test,1752.0,...,,,glove (base),0.750791,0.709163,0.729384,0.011239,0.011469,0.011353,coref-ontonotes-conll-1
8,cnn1,elmo-chars-edges-coref-ontonotes-conll,elmo-chars,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/cn...,coref-ontonotes-conll,test,1246.0,...,,,elmo-chars (cnn1),0.801275,0.793161,0.797197,0.010128,0.010228,0.010178,coref-ontonotes-conll-1
9,cnn2,elmo-chars-edges-coref-ontonotes-conll,elmo-chars,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/cn...,coref-ontonotes-conll,test,1267.0,...,,,elmo-chars (cnn2),0.807229,0.789675,0.798355,0.010072,0.010292,0.010181,coref-ontonotes-conll-1


In [89]:
# Wide-form F1 table for the spreadsheet: one row per task/metric and one
# column per "exp_type (tag)", both sorted in a (mostly) stable order.
f1_wide = fdf.pivot(index="task_and_metric", columns="display_col", values="f1_score")
sheet_df = f1_wide.reindex(
    index=sorted(f1_wide.index, key=task_sort_key),
    columns=sorted(f1_wide.columns, key=exp_type_sort_key),
)
# sheet_df
print(sheet_df.to_csv())

task_and_metric,glove (base),cove (base),elmo-chars (base),elmo-chars (cnn1),elmo-chars (cnn2),elmo-ortho (base),elmo-full (base),openai-lex (base),openai-cat (base),openai-bwb (base)
pos-ontonotes-_micro_avg_,0.8567657874965399,0.940388747073811,0.904439191714334,0.9574490264924825,0.9607675370507749,0.9105050572487232,0.9671130403533161,0.8819696633264332,0.9489259283825913,
nonterminal-ontonotes-_micro_avg_,0.5614604435661102,0.8157281012456721,0.6914408733347803,0.8398566806599108,0.8474476246680436,0.7153444688679135,0.8458977350702335,0.6514730337561286,0.8133257955096107,
dep-labeling-ewt-_micro_avg_,0.7500632722085452,0.836149346554461,0.8035985306732948,0.9072513342537453,0.9167451960503133,0.8535812291263375,0.9394805668754941,0.776902417188899,0.9213940059371314,0.9215239335721263
ner-ontonotes-_micro_avg_,0.8838085667629695,0.9025351645159951,0.9203461461058564,0.939792052873378,0.944181190265842,0.9287146179135595,0.9557288542291542,0.8862632869991823,0.9285627946331441,0.

In [90]:
# Same layout as the F1 sheet, holding the f1_errn95 error values instead.
err_wide = fdf.pivot(index="task_and_metric", columns="display_col", values="f1_errn95")
sheet_df = err_wide.reindex(
    index=sorted(err_wide.index, key=task_sort_key),
    columns=sorted(err_wide.columns, key=exp_type_sort_key),
)
# sheet_df
print(sheet_df.to_csv())

task_and_metric,glove (base),cove (base),elmo-chars (base),elmo-chars (cnn1),elmo-chars (cnn2),elmo-ortho (base),elmo-full (base),openai-lex (base),openai-cat (base),openai-bwb (base)
pos-ontonotes-_micro_avg_,0.0014754868731930756,0.0009751363394922367,0.0012531322128521058,0.0008568561601975483,0.0008228081203835572,0.0012154220478059544,0.0007578406819400379,0.0013770397008379457,0.000936162002343635,
nonterminal-ontonotes-_micro_avg_,0.002361682155042982,0.001748907174111198,0.0021297572178241206,0.0016560691311704393,0.001621758679342687,0.002062424692633641,0.0016296036096583376,0.0022107668731449264,0.0017635762323095266,
dep-labeling-ewt-_micro_avg_,0.005173870306317738,0.004342170093182933,0.004769309992899786,0.0035051711645642195,0.003351966223218507,0.0042430072190879995,0.0029077202937506995,0.005020391744062097,0.0033020997468528822,0.003294246909749387
ner-ontonotes-_micro_avg_,0.005342857601821733,0.0049238037070870105,0.004719571964636728,0.004104131444831603,0.0039808

In [23]:
# Show the distinct task/metric names in the row order used by the sheets.
sorted(fdf['task_and_metric'].unique(), key=task_sort_key)

['pos-ontonotes-_micro_avg_',
 'nonterminal-ontonotes-_micro_avg_',
 'dep-labeling-ewt-_micro_avg_',
 'ner-ontonotes-_micro_avg_',
 'srl-conll2012-_clean_micro_',
 'srl-conll2012-_core_',
 'srl-conll2012-_non_core_',
 'coref-ontonotes-conll-1',
 'spr1-_micro_avg_',
 'spr2-_micro_avg_',
 'dpr-_micro_avg_']

# Plot "final scores"

Make ELMo baselines figure / table.

In [24]:
# List the columns available in the final-scores frame.
fdf.columns

Index(['tag', 'exp_name', 'exp_type', 'label', 'num_epochs', 'num_steps',
       'run', 'task', 'split', 'fn_count', 'fp_count', 'tn_count', 'tp_count',
       'stratifier', 'stratum_key', 'display_col', 'precision', 'recall',
       'f1_score', 'precision_errn95', 'recall_errn95', 'f1_errn95',
       'task_and_metric'],
      dtype='object')

In [41]:
def _make_display_name(task, label):
    task_to_display_name = {
        "pos-ontonotes": "Part-of-Speech",
        "nonterminal-ontonotes": "Constituents",
        "dep-labeling-ewt": "Dependencies",
        "ner-ontonotes": "Entities",
        "srl-conll2012": "SRL",
        "coref-ontonotes-conll": "OntoNotes Coref.",
        "spr1": "SPR1",
        "spr2": "SPR2",
        "dpr": "Winograd Coref."
    }
    display_task = task_to_display_name[task]
    if label in {"_micro_avg_", "1"}:
        return display_task
    elif label == "_clean_micro_":
        return f"{display_task} (all)"
    elif label == "_core_":
        return f"{display_task} (core)"
    elif label == "_non_core_":
        return f"{display_task} (non-core)"
    else:
        clean_label = label.strip("_")
        return f"{display_task} ({clean_label})"

In [44]:
# Rows to plot: every elmo-chars run (base and CNN variants), plus the base
# runs of elmo-ortho and elmo-full.
mask = (fdf.exp_type == "elmo-chars")
mask |= ((fdf.exp_type == "elmo-ortho") | (fdf.exp_type == "elmo-full")) & (fdf.tag == "base")
plot_df = fdf[mask].copy()

_SCORE_COL="f1_score"
_ERROR_COL="f1_errn95"
plot_df['_sort_key'] = plot_df['task_and_metric'].map(task_sort_key)
plot_df.sort_values(by="_sort_key", axis=0, inplace=True)
plot_df['_display_name'] = [_make_display_name(*tl)
                            for tl in zip(plot_df['task'], plot_df['label'])]
# (exp_type, tag) pair used to select the rows of each plotted series.
plot_df['col_key'] = list(zip(plot_df['exp_type'], plot_df['tag']))
# Scores are printed as whole-number percentages.
plot_df['fmt_score'] = plot_df[_SCORE_COL].map(
    lambda s: "{:.0f}".format(100*s)
)
plot_df['_err_upper'] = plot_df[_SCORE_COL] + plot_df[_ERROR_COL]
plot_df['_err_lower'] = plot_df[_SCORE_COL] - plot_df[_ERROR_COL]
print("Found %s entries" % len(plot_df))
# long_ds = bokeh.models.ColumnDataSource(data=plot_df)

# Reversed so the first task ends up at the top of the y axis; the trailing
# "" factor is an empty row that holds the column headers.
ordered_labels = list(reversed(plot_df['_display_name'].unique())) + [""]
_RP = 0.3
factor_range = bokeh.models.FactorRange(*ordered_labels, range_padding=_RP,
                                        range_padding_units='absolute')
tools = "save,reset"

xstart = 20
xend = 100
_FONT_SIZE = "15pt"
p = bp.figure(y_range=factor_range, x_range=[xstart, xend],
              height=550, width=1200, tools=tools)

label_kw = dict(text_align="center", text_baseline="middle",
                text_font_size=_FONT_SIZE)

# Use the documented palette module, as the color configuration cell does;
# `bokeh.models.palettes` is not part of the documented API.
palette = bokeh.palettes.Category20[20]
# (exp_type, tag, legend text, marker) for each plotted series.
col_keys = [
    ("elmo-chars", "base", "Lex.", "diamond"),
    ("elmo-chars", "cnn1", "CNN1", "triangle"),
    ("elmo-chars", "cnn2", "CNN2", "inverted_triangle"),
    ("elmo-ortho", "base", "Ortho.", "square"),
    ("elmo-full", "base", "Full", "circle"),
]
for i,ck in enumerate(col_keys):
    print(ck)
    ds = plot_df[plot_df.col_key == ck[:2]]
    y = ds["_display_name"]
    x = 100*ds[_SCORE_COL]
    e = 100*ds[_ERROR_COL]
    c = palette[2*i]
    cp = palette[2*i+1]
    # Colored band spanning score +/- error.
    p.hbar(y=y, left=x-e, right=x+e, height=0.90,
           fill_color=c, fill_alpha=0.5,
           line_color=None, line_width=0)
    # Thin black tick at the score itself.
    p.hbar(y=y, left=x-0.05, right=x+0.05, height=0.90,
           fill_color="Black", fill_alpha=1.0,
           line_color=None, line_width=0)
    # Faint gray bar from 0 up to the score.
    p.hbar(y=y, left=0, right=x, height=0.90,
           fill_color="Gray", fill_alpha=0.05,
           line_color=None, line_width=0)
    # NOTE(review): newer bokeh releases replace `legend=` with
    # `legend_label=` - confirm against the installed version.
    p.scatter(y=y, x=x, size=12, fill_color=c, legend=ck[2],
              marker=ck[3])

    # Numeric scores are written in one text column per series, starting at
    # the left edge of the plot.
    dss = bokeh.models.ColumnDataSource(data=ds)
    lpos = xstart+(0.7+i)*(xend-xstart)/20
    score_labels = bokeh.models.LabelSet(y="_display_name", x=lpos,
                                         text="fmt_score", source=dss, **label_kw)
    p.add_layout(score_labels)
    # Column header, placed in the empty top row.
    cat_label = bokeh.models.Label(y=len(ordered_labels)-1+0.6, x=lpos, text=ck[2],
                                   angle=np.pi/6, **label_kw)
    p.add_layout(cat_label)
#     error_bars = bokeh.models.Whisker(base="_display_name", upper="_err_upper", lower="_err_lower",
#                                       source=dss, level="glyph", dimension="width")
#     p.add_layout(error_bars)


# p.xaxis.major_label_orientation = 1
p.xaxis.bounds = (0,100)
p.yaxis.bounds = (-_RP, 2*len(col_keys)+2)
p.yaxis.major_label_text_font_size = "13pt"
p.xaxis.major_label_text_font_size = _FONT_SIZE
p.ygrid.grid_line_alpha = 0
p.xgrid.grid_line_alpha = 0
p.xaxis.axis_label = "F1 Score"
p.xaxis.axis_label_text_font_size = _FONT_SIZE
p.legend.orientation = "horizontal"
p.legend.background_fill_color = None
p.legend.border_line_color = None
p.legend.label_text_font_size = _FONT_SIZE

p.min_border = 0
bp.show(p)

Found 55 entries
('elmo-chars', 'base', 'Lex.', 'diamond')
('elmo-chars', 'cnn1', 'CNN1', 'triangle')
('elmo-chars', 'cnn2', 'CNN2', 'inverted_triangle')
('elmo-ortho', 'base', 'Ortho.', 'square')
('elmo-full', 'base', 'Full', 'circle')


In [45]:
# DEPRECATED
# Grouped vertical bar chart of every final score, one bar per
# (display name, experiment type). Note that `categories`, defined here, is
# read again by the SRL core-role bar chart cell further down.
plot_df = fdf.copy()

_SCORE_COL="f1_score"
_ERROR_COL="f1_errn95"
plot_df['_sort_key'] = plot_df['task_and_metric'].map(task_sort_key)
plot_df.sort_values(by="_sort_key", axis=0, inplace=True)
plot_df['_display_name'] = [_make_display_name(*tl) 
                            for tl in zip(plot_df['task'], plot_df['label'])]
# (display name, exp_type) pair: the categorical x coordinate of each bar.
plot_df['row_key'] = list(zip(plot_df['_display_name'], plot_df['exp_type']))
plot_df['fmt_score'] = plot_df[_SCORE_COL].map(
    lambda s: "{:.02f}".format(s)
)
plot_df['_err_upper'] = plot_df[_SCORE_COL] + plot_df[_ERROR_COL]
plot_df['_err_lower'] = plot_df[_SCORE_COL] - plot_df[_ERROR_COL]
print("Found %s entries" % len(plot_df))
long_ds = bokeh.models.ColumnDataSource(data=plot_df)

# Keep only experiment types that occur in the data, in palette order.
filtered_exp_types = [e for e in exp_types if e in set(plot_df['exp_type'].unique())]
# ordered_labels = sorted(plot_df['_display_name'].unique(), key=task_sort_key)
ordered_labels = plot_df['_display_name'].unique()
categories = list(itertools.product(ordered_labels, filtered_exp_types))
factor_range = bokeh.models.FactorRange(*categories, range_padding=0.5,
                                       range_padding_units='absolute')
tools = "save,reset"

p = bp.figure(x_range=factor_range, y_range=[0,1],
              width=1250, tools=tools)
p.vbar(x='row_key', top=_SCORE_COL, width=0.95,
       fill_color=fill_cmap,
       line_color="Gray", source=long_ds)
# Score text is rotated 90 degrees and anchored at the lower error bound.
label_kw = dict(text_align="right", text_baseline="middle", y_offset=-5,
                text_font_size="11pt", angle=90, angle_units='deg')
score_labels = bokeh.models.LabelSet(x='row_key', y="_err_lower",
                                     text="fmt_score",
                                     source=long_ds, **label_kw)
p.add_layout(score_labels)
error_bars = bokeh.models.Whisker(base="row_key", upper="_err_upper", lower="_err_lower",
                                  source=long_ds, level="glyph")
p.add_layout(error_bars, )
p.xaxis.major_label_orientation = 1
p.yaxis.bounds = (0,1)

bp.show(p)

Found 108 entries


## Plot by constituent height

POS tags are height 1, higher nonterminals are height > 1.

In [46]:
# List the columns available in the full scores frame.
df.columns

Index(['tag', 'exp_name', 'exp_type', 'label', 'num_epochs', 'num_steps',
       'run', 'task', 'split', 'fn_count', 'fp_count', 'tn_count', 'tp_count',
       'stratifier', 'stratum_key', 'display_col', 'precision', 'recall',
       'f1_score', 'precision_errn95', 'recall_errn95', 'f1_errn95'],
      dtype='object')

In [57]:
# Validation scores per constituent height ('info.height' strata of the
# nonterminal task) for all elmo-chars runs plus the base elmo-full and
# elmo-ortho runs. The result is stored in `plot_df`, which the line-plot
# cell further down reads.
is_elmo_base = df['exp_type'].isin(["elmo-full", "elmo-ortho"]) & (df['tag'] == "base")
is_lexical = df['exp_type'] == "elmo-chars"
mask = (
    (df.split == "val")
    & (df.stratifier == "info.height")
    & (df.task == "nonterminal-ontonotes")
    & (is_elmo_base | is_lexical)
)
plot_df = df[mask].copy()

In [67]:
# Validation scores for the stratified rows of the SRL task, for all
# elmo-chars runs plus the base elmo-full and elmo-ortho runs. The result is
# stored in `plot_df`, which the line-plot cell further down reads.
# (An earlier variant of the filter kept only elmo-full (base) and elmo-chars.)
is_elmo_base = df['exp_type'].isin(["elmo-full", "elmo-ortho"]) & (df['tag'] == "base")
is_lexical = df['exp_type'] == "elmo-chars"
mask = (
    (df.split == "val")
    & df.stratifier.notnull()
    & (df.task == "srl-conll2012")
    & (is_elmo_base | is_lexical)
)
plot_df = df[mask].copy()

In [76]:
# Validation scores for the stratified rows of the OntoNotes coref task, for
# all elmo-chars runs plus the base elmo-full and elmo-ortho runs. The result
# is stored in `plot_df`, which the line-plot cell further down reads.
# (An earlier variant of the filter kept only elmo-full (base) and elmo-chars.)
is_elmo_base = df['exp_type'].isin(["elmo-full", "elmo-ortho"]) & (df['tag'] == "base")
is_lexical = df['exp_type'] == "elmo-chars"
mask = (
    (df.split == "val")
    & df.stratifier.notnull()
    & (df.task == "coref-ontonotes-conll")
    & (is_elmo_base | is_lexical)
)
plot_df = df[mask].copy()

In [80]:
# Validation scores for the stratified rows of the dependency labeling task,
# for all elmo-chars runs plus the base elmo-full and elmo-ortho runs. The
# result is stored in `plot_df`, which the line-plot cell further down reads.
# (An earlier variant of the filter kept only elmo-full (base) and elmo-chars.)
is_elmo_base = df['exp_type'].isin(["elmo-full", "elmo-ortho"]) & (df['tag'] == "base")
is_lexical = df['exp_type'] == "elmo-chars"
mask = (
    (df.split == "val")
    & df.stratifier.notnull()
    & (df.task == "dep-labeling-ewt")
    & (is_elmo_base | is_lexical)
)
plot_df = df[mask].copy()

In [81]:
# Check which tags are present in the current selection.
plot_df.tag.unique()

array(['base', 'cnn1', 'cnn2'], dtype=object)

In [98]:
height = 400
width = 900
_FONT_SIZE = "14pt"

# Axis settings for the selection currently held in `plot_df`; the commented
# groups below are the settings for the other selections.
# title = "F1 by height on OntoNotes constituent labeling"
# _X_LABEL = "Constituent Height"
# _X_RANGE = [1.5,20.5]
# _Y_RANGE = [0,1]
# title = "F1 by span distance on OntoNotes SRL"
# _X_LABEL = "Span separation distance (tokens)"
# _X_RANGE = [-0.5, 15.5]
# _Y_RANGE = [0.5,1]
# _X_LABEL = "Span separation distance (tokens)"
# _X_RANGE = [-0.5, 15.5]
# _Y_RANGE = [0.5,1]
# title = "F1 by span distance on dependency labeling"
_X_LABEL = "Span separation distance (tokens)"
_X_RANGE = [-0.5, 15.5]
_Y_RANGE = [0.4,1.05]

# def _make_plot_legend_name(exp_type, tag):
#     if exp_type == "elmo-full":
#         return "Full ELMo"
#     elif exp_type == "elmo-chars":
#         return f"Lex. ({tag})"
#     else:
#         raise ValueError(f"Unrecognized experiment: ({exp_type}, {tag})")

# (exp_type, tag, legend text, marker) for each plotted series.
col_keys = [
    ("elmo-chars", "base", "Lex.", "diamond"),
    ("elmo-chars", "cnn1", "CNN1", "triangle"),
    ("elmo-chars", "cnn2", "CNN2", "inverted_triangle"),
    ("elmo-ortho", "base", "Ortho.", "square"),
    ("elmo-full", "base", "Full", "circle"),
]

plot_df['col_key'] = list(zip(plot_df['exp_type'], plot_df['tag']))

_SCORE_COL="f1_score"
_ERROR_COL="f1_errn95"

tools = "save,reset"

# Use the documented palette module, as the color configuration cell does;
# `bokeh.models.palettes` is not part of the documented API.
palette = bokeh.palettes.Category20[20]
p = bp.figure(x_range=_X_RANGE, y_range=_Y_RANGE,
              width=width, height=height,
              tools=tools)
crs = []
for i, ck in enumerate(col_keys):
    # The unused `et_key` computation was removed: it read `exp_type`, which
    # no cell defines, so the loop failed on a fresh kernel.
    series_df = plot_df[plot_df['col_key'] == ck[:2]]
    x = series_df["stratum_key"]
    y = series_df[_SCORE_COL]
    e = series_df[_ERROR_COL]
    c = palette[2*i]
    p.line(x=x, y=y, color=c, line_width=2)
    # NOTE(review): newer bokeh releases replace `legend=` with
    # `legend_label=` - confirm against the installed version.
    cr = p.scatter(x=x, y=y, color=c, size=12, hover_fill_color="Gray",
                   legend=ck[2], marker=ck[3])
    crs.append(cr)
    # Shaded band spanning score +/- error, drawn underneath the lines.
    band_source = bokeh.models.ColumnDataSource(data=dict(x=x, upper=y+e, lower=y-e))
#     error_bars = bokeh.models.Whisker(base='x', upper='upper', lower='lower',
#                                       line_color=c,
#                                       source=band_source)
#     p.add_layout(error_bars)
    error_band = bokeh.models.Band(base='x', upper='upper', lower='lower',
                                   source=band_source, level='underlay',
                                   fill_alpha=0.4, line_width=1,
                                   line_color=palette[2*i],
                                   fill_color=palette[2*i+1])
    p.add_layout(error_band)

p.yaxis.bounds = (0,1)

##
# Overlay histogram at bottom of the plot (this looks nice)
counts = plot_df['tp_count'] + plot_df['fn_count']
strata = plot_df['stratum_key']
# The secondary range is stretched so the tallest bar reaches hist_height of
# the plot height.
hist_height = 0.3
p.extra_y_ranges = {"hist": bokeh.models.Range1d(min(counts),
                                                 max(counts)/hist_height)}
p.add_layout(bokeh.models.LinearAxis(y_range_name="hist",
                                     bounds=(min(counts), max(counts))), "right")
p.vbar(x=strata, top=counts, width=0.9,
       y_range_name='hist', color="Gray")

# Add fancy hover tool; the x entry reuses the axis label so it stays correct
# for whichever stratifier is plotted.
tooltips = [
    (_X_LABEL, "@x"),
    ("F1 score", "@y{0.00}"),
]
p.add_tools(bokeh.models.HoverTool(tooltips=tooltips, renderers=crs,
#                                    mode='vline',
                                  ))
p.xaxis.axis_label = _X_LABEL
p.yaxis[0].axis_label = "F1 Score"
p.yaxis[1].axis_label = ""
p.yaxis[1].formatter = bokeh.models.NumeralTickFormatter(format="0a")
p.legend.orientation = "horizontal"
# p.legend.location = "bottom_right"
# p.legend.background_fill_alpha = 0.0
# p.legend.border_line_alpha = 0

p.yaxis.major_label_text_font_size = "13pt"
p.xaxis.major_label_text_font_size = _FONT_SIZE
p.xaxis.axis_label_text_font_size = _FONT_SIZE
p.yaxis.axis_label_text_font_size = _FONT_SIZE
# p.legend.orientation = "horizontal"
p.legend.background_fill_color = None
p.legend.border_line_color = None
p.legend.label_text_font_size = _FONT_SIZE

p.min_border = 0

bp.show(p)

In [66]:
# Keep a second reference to the figure built above under the name p1.
p1 = p

In [None]:
# F1 by constituent height on the validation split: one row per height, one
# column per "exp_type (tag)". `mask` is reused by the error sheet that follows.
mask = (
    (df.split == "val")
    & (df.stratifier == "info.height")
    & (df.task == "constituent-ontonotes")
    & df['exp_type'].isin(['elmo-chars', 'elmo-full', 'elmo-ortho', 'openai-cat'])
)
height_f1 = df[mask].pivot(index="stratum_key", columns="display_col", values="f1_score")
sheet_df = height_f1.reindex(
    index=sorted(height_f1.index),
    columns=sorted(height_f1.columns, key=exp_type_sort_key),
)
# sheet_df
print(sheet_df.to_csv())

In [None]:
# Error values (f1_errn95) for the rows selected by the current `mask`, in
# the same layout: one row per stratum key, one column per "exp_type (tag)".
height_err = df[mask].pivot(index="stratum_key", columns="display_col", values="f1_errn95")
sheet_df = height_err.reindex(
    index=sorted(height_err.index),
    columns=sorted(height_err.columns, key=exp_type_sort_key),
)
# sheet_df
print(sheet_df.to_csv())

## Plot by span distance

Absolute distance (in number of tokens under model tokenization) between midpoint of `span1` and midpoint of `span2`.

In [None]:
# List the columns available in the full scores frame.
df.columns

In [None]:
# List the distinct task names present in the scores frame.
df['task'].unique()

In [None]:
# F1 for the stratified SRL rows on the validation split: one row per stratum
# key, one column per "exp_type (tag)".
mask = (
    df.stratifier.notnull()
    & (df.task == "srl-conll2012")
    & (df.split == "val")
    & df['exp_type'].isin(['elmo-chars', 'elmo-full', 'elmo-ortho', 'openai-cat'])
)
distance_f1 = df[mask].pivot(index="stratum_key", columns="display_col", values="f1_score")
sheet_df = distance_f1.reindex(
    index=sorted(distance_f1.index),
    columns=sorted(distance_f1.columns, key=exp_type_sort_key),
)
# sheet_df
print(sheet_df.to_csv())

## Plot by label for SRL core, non-core

In [None]:
# Per-label F1 for SRL core and non-core roles on the validation split.
mask = df.label.map(is_core_or_noncore) & (df.task == "srl-conll2012") & (df.split == "val")
# Pivot on "exp_type (tag)" like the other sheets: exp_type_sort_key expects
# that form, and a bare exp_type column does not distinguish the tags.
# NOTE(review): assumes one row per (label, display_col) in the selection -
# confirm that no stratified rows carry these labels.
sheet_df = df[mask].pivot(index="label", columns="display_col", values="f1_score")
sheet_df = sheet_df.reindex(sorted(sheet_df.columns,
                                   key=exp_type_sort_key), axis=1)
# Role labels match no task prefix, so task_sort_key orders them by name.
sheet_df = sheet_df.reindex(sorted(sheet_df.index,
                                   key=task_sort_key), axis=0)
# sheet_df
print(sheet_df.to_csv())

In [None]:
# Bar chart of the SRL rows with core-role labels.
# The task comparison is parenthesized: `&` binds tighter than `==`, so the
# unparenthesized form compared (label mask & task) against the task name.
plot_df = df[df.label.map(is_core_role) & (df.task == "srl-conll2012")].copy()

_SCORE_COL="f1_score"
# (task, exp_type) pair: the categorical x coordinate of each bar.
plot_df['row_key'] = list(zip(plot_df['task'], plot_df['exp_type']))
plot_df['fmt_score'] = plot_df[_SCORE_COL].map(
    lambda s: "{:.02f}".format(s)
)
print("Found %s entries" % len(plot_df))
long_ds = bokeh.models.ColumnDataSource(data=plot_df)

# NOTE(review): `categories` is not defined in this cell; it is whatever the
# deprecated bar-chart cell left behind, built from display names rather than
# the (task, exp_type) keys used here - confirm the intended x range.
factor_range = bokeh.models.FactorRange(*categories, range_padding=0.5,
                                       range_padding_units='absolute')
tools = "save,reset"

p = bp.figure(x_range=factor_range, y_range=[0,1],
              width=1250, tools=tools)
p.vbar(x='row_key', top=_SCORE_COL, width=0.95,
       fill_color=fill_cmap,
       line_color="Gray", source=long_ds)
# Score text is rotated 90 degrees and anchored at the top of each bar.
label_kw = dict(text_align="right", text_baseline="middle", y_offset=-3,
                text_font_size="11pt", angle=90, angle_units='deg')
score_labels = bokeh.models.LabelSet(x='row_key', y=_SCORE_COL,
                                     text="fmt_score",
                                     source=long_ds, **label_kw)
p.add_layout(score_labels)
p.xaxis.major_label_orientation = 1
p.yaxis.bounds = (0,1)

bp.show(p)

In [None]:
import holoviews as hv
hv.extension("bokeh")

In [None]:
# Wrap a copy of the final scores in a holoviews Table.
plot_df = fdf.copy()
# (column name, display label) pairs for the key dimensions.
key_dimensions = [("exp_type", "Experiment Type"),
                  ("label", "Label"),
                  ("task", "Task")]
# (column name, display label) pair for the plotted value.
value_dimensions = [("f1_score", "F1 Score")]
plot = hv.Table(plot_df, key_dimensions, value_dimensions)

%%opts Bars [xrotation=75 width=900 height=500 show_legend=False tools=['hover']]
%%opts Bars [color=Cycle('Category20')]
# Convert the table to bars keyed by the "Task" and "Experiment Type" labels, with "F1 Score" as the value.
plot.to.bars(["Task", "Experiment Type"], "F1 Score", [])