# Edgeprobe Aggregate Analysis (for ICLR camera-ready)

This is the main analysis notebook for [What do you learn from context? Probing for sentence structure in contextualized word representations](https://openreview.net/forum?id=SJzSgnRcKX), a.k.a. "the edge probing paper."

This notebook is intended to be run on the output of the [`analyze_runs.py`](analyze_runs.py) script; run that on a folder of experiments to produce a `scores.tsv` file that can be loaded here.

In [1]:
import sys, os, re, json
from importlib import reload
import itertools
import collections

import numpy as np
import pandas as pd

In [2]:
import bokeh
import bokeh.plotting as bp
bp.output_notebook()

In [3]:
import analysis
reload(analysis)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


<module 'analysis' from '/nfs/jsalt/home/iftenney/jiant_test/probing/analysis.py'>

In [4]:
# Task names in canonical order. task_sort_key() ranks a candidate by the
# position of the first entry here that it starts with, so the order of this
# list determines row order wherever that key is used for sorting.
tasks = [
    "pos-ontonotes",
    "nonterminal-ontonotes",
    "constituent-ontonotes",
    "dep-labeling-ewt",
    "ner-ontonotes",
    "srl-conll2012",
    "coref-ontonotes-conll",
    "spr1",
    "spr2",
    "dpr",
    "ner-tacred",
    "rel-tacred",
    "rel-semeval",
    "coref-gap",
    "coref-gap-ontonotes",
    "noun-verb",
]

# Colors come from bokeh's 20-entry Category20 palette, which is laid out as
# (dark, light) pairs; each entry below names the index of the dark shade.
# See https://bokeh.pydata.org/en/latest/docs/reference/palettes.html
_clist = bokeh.palettes.Category20[20]
exp_types_clist_idx = [
    ("glove", 2), # orange
    ("cove",  6), # deep red
    ("elmo-chars", 18), # aqua
    ("elmo-ortho", 8), # purple
    ("elmo-full", 0), # blue
    ("openai-lex", 16), # olive
    ("openai-cat", 4), # green
    ("openai-mix", 12), # pink
    ("openai", 4), # green
    ("openai-bwb", 12), # pink
    ("train-chars", 10), # brown
]
# Every BERT variant shares one color per probing mode:
# lex -> olive, cat -> green, mix -> pink.
for bert_name in ['base-uncased', 'base-cased', 'large-uncased', 'large-cased']:
    exp_types_clist_idx.extend(
        (f"bert-{bert_name}-{mode}", color_idx)
        for mode, color_idx in (("lex", 16), ("cat", 4), ("mix", 12))
    )

# Map each experiment type to a hex color: the base model gets the lighter
# shade (index + 1), the two CNN variants share the darker one (index).
exp_types_colored = collections.OrderedDict()
for k, v in exp_types_clist_idx:
    exp_types_colored.update([
        (k, _clist[v + 1]),
        (k + "-cnn1", _clist[v]),
        (k + "-cnn2", _clist[v]),
    ])

def exp_type_sort_key(candidate):
    """Sort key ordering display names by their experiment type.

    The experiment type is the text before the first space in `candidate`;
    its position in the module-level `exp_types` sequence is the primary key
    and the full string breaks ties. Raises ValueError (from `.index`) when
    the experiment type is not in `exp_types`.
    """
    exp_type, _, _ = candidate.partition(" ")
    position = exp_types.index(exp_type)
    return (position, candidate)

def task_sort_key(candidate):
    """Sort key ordering row names by task.

    The primary key is the index of the first entry of the module-level
    `tasks` list that `candidate` starts with; candidates matching no task
    sort last (index `len(tasks)`). The full string breaks ties.
    """
    rank = next(
        (i for i, name in enumerate(tasks) if candidate.startswith(name)),
        len(tasks),
    )
    return (rank, candidate)

# Split the ordered mapping into parallel tuples (names, colors) and build
# the bokeh color mapper keyed on the 'exp_type' field.
exp_types, palette = zip(*list(exp_types_colored.items()))
fill_cmap = bokeh.transform.factor_cmap(
    'exp_type',
    palette,
    exp_types,
)

exp_types_colored

OrderedDict([('glove', '#ffbb78'),
             ('glove-cnn1', '#ff7f0e'),
             ('glove-cnn2', '#ff7f0e'),
             ('cove', '#ff9896'),
             ('cove-cnn1', '#d62728'),
             ('cove-cnn2', '#d62728'),
             ('elmo-chars', '#9edae5'),
             ('elmo-chars-cnn1', '#17becf'),
             ('elmo-chars-cnn2', '#17becf'),
             ('elmo-ortho', '#c5b0d5'),
             ('elmo-ortho-cnn1', '#9467bd'),
             ('elmo-ortho-cnn2', '#9467bd'),
             ('elmo-full', '#aec7e8'),
             ('elmo-full-cnn1', '#1f77b4'),
             ('elmo-full-cnn2', '#1f77b4'),
             ('openai-lex', '#dbdb8d'),
             ('openai-lex-cnn1', '#bcbd22'),
             ('openai-lex-cnn2', '#bcbd22'),
             ('openai-cat', '#98df8a'),
             ('openai-cat-cnn1', '#2ca02c'),
             ('openai-cat-cnn2', '#2ca02c'),
             ('openai-mix', '#f7b6d2'),
             ('openai-mix-cnn1', '#e377c2'),
             ('openai-mix-cnn2', '#e377c2

In [5]:
def get_exp_type(exp_name):
    """Return the experiment-type prefix of an experiment name.

    `exp_name` must look like "<prefix>-edges-<task>", where both parts are
    made of lowercase letters and dashes at the point of the match (e.g.
    "elmo-full-edges-srl-conll2012" -> "elmo-full").

    Raises:
        AssertionError: if `exp_name` does not match that pattern.
    """
    m = re.match(r"([a-z-]+)-edges-([a-z-]+)", exp_name)
    # The message must reference exp_name: the previous code formatted an
    # undefined name, so a bad input surfaced as NameError instead.
    assert m is not None, f"Unable to parse run name: {exp_name}"
    prefix, _task = m.groups()
    return prefix

def clean_task_name(task_name):
    """Strip a leading "edges-" and a trailing "-openai" from a task name."""
    without_prefix = re.sub(r"^edges-", "", task_name)
    return re.sub(r"-openai$", "", without_prefix)

# Columns used as the groupby key by agg_label_group() and
# agg_stratifier_group(): counts are summed within each (run, task, split).
ID_COLS = ['run', 'task', 'split']

def is_core_role(label):
    """True if `label` is exactly ARG0..ARG5 or ARGA."""
    found = re.match(r"^ARG[0-5A]$", label)
    return found is not None

def is_non_core_role(label):
    """True if `label` is ARGM, optionally followed by "-<something>"."""
    found = re.match(r"^ARGM(-.+)?$", label)
    return found is not None

def is_core_or_noncore(label):
    """True if `label` passes is_core_role() or is_non_core_role()."""
    return any(check(label) for check in (is_core_role, is_non_core_role))

def is_srl_task(task):
    """True if the task name starts with "srl-"."""
    prefix = "srl-"
    return task.startswith(prefix)

def is_coref_task(task):
    """True if the task name starts with "coref-"."""
    prefix = "coref-"
    return task.startswith(prefix)

def is_constituent_task(task):
    """True if the task name starts with "constituent-"."""
    prefix = "constituent-"
    return task.startswith(prefix)

def is_relation_task(task):
    """True if the task name starts with "rel-"."""
    prefix = "rel-"
    return task.startswith(prefix)

def is_positive_relation(label):
    """True for labels that name an actual relation.

    Excluded are labels starting with "_" and the two negative-class
    labels "no_relation" and "Other".
    """
    if label.startswith("_"):
        return False
    return label not in ("no_relation", "Other")

def agg_label_group(df, task_predicate, label_predicate, group_name):
    """Sum the `*_count` columns over a subset of labels.

    Rows are kept when `task_predicate(task)` and `label_predicate(label)`
    both hold; the kept rows are grouped by ID_COLS and every column whose
    name ends in "_count" is summed. The returned frame has one row per
    group, with its 'label' column set to `group_name`.
    """
    count_cols = [col for col in df.columns if col.endswith("_count")]
    selected = df['task'].map(task_predicate) & df['label'].map(label_predicate)
    grouped = df[selected].groupby(by=ID_COLS)
    sdf = grouped.agg(dict.fromkeys(count_cols, "sum")).reset_index()
    sdf['label'] = group_name
    return sdf

def agg_stratifier_group(df, stratifier, key_predicate, group_name):
    """Sum the `*_count` columns over selected strata of one stratifier.

    Rows are kept when their 'stratifier' equals `stratifier` and
    `key_predicate(stratum_key)` holds; the kept rows are grouped by ID_COLS
    and every column whose name ends in "_count" is summed. The returned
    frame has one row per group, with 'label' set to `group_name`.
    """
    count_cols = [col for col in df.columns if col.endswith("_count")]
    # Built row by row (not as a vectorized `==` & `.map`) so that `and`
    # short-circuits: key_predicate never sees keys of other stratifiers.
    mask = []
    for s, key in zip(df['stratifier'], df['stratum_key']):
        mask.append(s == stratifier and key_predicate(key))
    grouped = df[mask].groupby(by=ID_COLS)
    sdf = grouped.agg(dict.fromkeys(count_cols, "sum")).reset_index()
    sdf['label'] = group_name
    return sdf

def load_scores_file(filename, tag=None, seed=None):
    """Load one tab-separated scores file and add derived rows and columns.

    Steps, in order:
      * read the TSV and drop its 'Unnamed: 0' column;
      * normalize 'task' with clean_task_name();
      * add 'stratifier' (None) / 'stratum_key' (0) columns when absent;
      * append aggregated rows: SRL "_core_", "_non_core_", "_clean_micro_";
        'info.height' strata "_pos_" (height == 1) and "_nonterminal_"
        (height > 1); relation-task "_clean_micro_" over positive relations;
      * insert 'exp_name' (name of the directory containing each run path),
        'exp_type' (via get_exp_type), 'tag' (only if given) and 'seed'.

    Args:
        filename: path of the scores TSV.
        tag: optional value for a constant 'tag' column.
        seed: value for the constant 'seed' column (None by default).

    Returns:
        The augmented DataFrame.
    """
    df = pd.read_csv(filename, sep="\t", header=0)
    df = df.drop(columns=['Unnamed: 0'])
    df['task'] = df['task'].map(clean_task_name)
    for column, default in (("stratifier", None), ("stratum_key", 0)):
        if column not in df.columns:
            df[column] = default

    # Additional custom aggregations, appended as extra rows.
    extra_groups = [
        # SRL core, non-core, and cleaned micro F1
        agg_label_group(df, is_srl_task, is_core_role, "_core_"),
        agg_label_group(df, is_srl_task, is_non_core_role, "_non_core_"),
        agg_label_group(df, is_srl_task, is_core_or_noncore, "_clean_micro_"),
        # Constituents: split into POS, nonterminals
        agg_stratifier_group(df, 'info.height', lambda x: int(x) == 1, "_pos_"),
        agg_stratifier_group(df, 'info.height', lambda x: int(x) > 1, "_nonterminal_"),
        # Relations: ignore negative class (no_relation)
        agg_label_group(df, is_relation_task, is_positive_relation, "_clean_micro_"),
    ]
    df = pd.concat([df] + extra_groups, ignore_index=True, sort=False)

    exp_names = df['run'].map(
        lambda p: os.path.basename(os.path.dirname(p.strip("/"))))
    df.insert(0, "exp_name", exp_names)
    df.insert(1, "exp_type", df['exp_name'].map(get_exp_type))
    if tag is not None:
        df.insert(0, "tag", tag)
    df.insert(1, "seed", seed)
    return df

In [6]:
def _make_display_name(task, label):
    task_to_display_name = {
        "pos-ontonotes": "Part-of-Speech",
        "nonterminal-ontonotes": "Constituents",
        "dep-labeling-ewt": "Dependencies",
        "ner-ontonotes": "Entities",
        "srl-conll2012": "SRL",
        "coref-ontonotes-conll": "OntoNotes coref.",
        "spr1": "SPR1",
        "spr2": "SPR2",
        "dpr": "Winograd coref.",
        "ner-tacred": "Entities (TACRED)",
        "rel-tacred": "Relations (TACRED)",
        "rel-semeval": "Relations (SemEval)",
        "coref-gap": "GAP coref.",
        "coref-gap-ontonotes": "GAP coref (train OOD)",
        "noun-verb": "Noun-Verb",
    }
    display_task = task_to_display_name[task]
    if label in {"_micro_avg_", "1", None}:
        return display_task
    elif label == "_clean_micro_":
        if task.startswith("srl-"):
            return f"{display_task} (all)"
        else:
            return display_task
    elif label == "_core_":
        return f"{display_task} (core)"
    elif label == "_non_core_":
        return f"{display_task} (non-core)"
    elif label.endswith(":_mean_"):
        return f"{display_task} (mean)"
    else:
        clean_label = label.strip("_")
        return f"{display_task} ({clean_label})"

In [7]:
# (tag, path) for every single-run scores file to load.
score_files = [
    ("base", "/nfs/jsalt/home/iftenney/exp/edges-20180921-openai/scores_cm.tsv"),
    ("base", "/nfs/jsalt/home/iftenney/exp/edges-20180922-openai/scores_cm.tsv"),
    ("base", "/nfs/jsalt/home/iftenney/exp/final_20180927/base/scores_cm.tsv"),
    ("base", "/nfs/jsalt/exp/edges-20190124-bert/scores.tsv"),
    ("base", "/nfs/jsalt/exp/edges-20190125-bert-splitconst/scores.tsv"),
    ("base", "/nfs/jsalt/exp/edges-20190125-bert-large/scores.tsv"),
    ("base", "/nfs/jsalt/exp/edges-20190129-bert-mix/scores.tsv"),
    ("base", "/nfs/jsalt/exp/edges-20190205-tacred/scores.tsv"),
    ("base", "/nfs/jsalt/exp/edges-20190205-semeval/scores.tsv"),
    ("base", "/nfs/jsalt/exp/edges-20190205-noun-verb/scores.tsv"),
    ("base", "/nfs/jsalt/exp/edges-20190207-gap/scores.tsv"),
    ("base", "/nfs/jsalt/exp/edges-20190209-openai-mix/scores.tsv"),
    ("bow", "/nfs/jsalt/exp/edges-20190220-rel-bow/scores.tsv"),
    ("bow", "/nfs/jsalt/exp/edges-20190220-core-bow/scores.tsv"),
    ("cnn1", "/nfs/jsalt/home/iftenney/exp/final_20180927/cnn1/scores_cm.tsv"),
    ("cnn2", "/nfs/jsalt/home/iftenney/exp/final_20180927/cnn2/scores_cm.tsv"),
]

dfs = []
for tag, score_file in score_files:
    dfs.append(load_scores_file(score_file, tag=tag))

# Cross-validation score files: one directory per seed for each of the
# three DPR experiment families, loaded in the listed order for every seed.
_dpr_path_templates = (
    "/nfs/jsalt/exp/edges-20190204-dpr-{seed:d}/scores.tsv",
    "/nfs/jsalt/exp/edges-20190213-dpr-openai-mix-{seed:d}/scores.tsv",
    "/nfs/jsalt/exp/edges-20190220-dpr-bert-cat-{seed:d}/scores.tsv",
)
for seed in [100,101,102,103,104]:
    for _template in _dpr_path_templates:
        dfs.append(load_scores_file(_template.format(seed=seed),
                                    tag="base", seed=seed))

df = pd.concat(dfs, ignore_index=True, sort=False)
# Column key used when pivoting: "<exp_type> (<tag>)".
df['display_col'] = [("%s (%s)" % pair) for pair in zip(df['exp_type'], df['tag'])]
print(df['task'].unique())
print(df['exp_type'].unique())

['constituent-ontonotes' 'coref-ontonotes-conll' 'dep-labeling-ewt' 'dpr'
 'ner-ontonotes' 'spr1' 'spr2' 'srl-conll2012' 'nonterminal-ontonotes'
 'pos-ontonotes' 'rel-tacred' 'rel-semeval' 'noun-verb'
 'coref-gap-ontonotes' 'coref-gap' 'ner-tacred']
['openai-cat' 'openai-lex' 'openai-bwb' 'cove' 'elmo-chars' 'elmo-full'
 'elmo-ortho' 'glove' 'openai' 'bert-base-cased-cat' 'bert-base-cased-lex'
 'bert-base-uncased-cat' 'bert-base-uncased-lex' 'bert-large-cased-cat'
 'bert-large-cased-lex' 'bert-large-uncased-cat' 'bert-large-uncased-lex'
 'bert-base-cased-mix' 'bert-base-uncased-mix' 'bert-large-cased-mix'
 'bert-large-uncased-mix' 'openai-mix']


In [8]:
def harmonic_mean(a, b):
    """Harmonic mean of `a` and `b`, computed as 2ab / (a + b)."""
    numerator = 2 * a * b
    denominator = a + b
    return numerator / denominator

# Support counts derived from the confusion-matrix columns.
df['pred_pos_count'] = df['tp_count'] + df['fp_count']
df['true_pos_count'] = df['tp_count'] + df['fn_count']
df['total_count'] = df['tp_count'] + df['tn_count'] + df['fp_count'] + df['fn_count']

# NOTE: every row is recomputed from its counts, so any _macro_avg_ rows are
# overwritten with micro-averaged values!
df['accuracy'] = (df['tp_count'] + df['tn_count']) / df['total_count']
df['precision'] = df['tp_count'] / df['pred_pos_count']
df['recall'] = df['tp_count'] / df['true_pos_count']
df['f1_score'] = harmonic_mean(df['precision'], df['recall']).fillna(0)

# Approximate 95% error bars with the normal approximation
# z * sqrt(p * (1 - p) / n). Background:
# https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
# (that anchor is the Wilson section; the formula used here is the plain
# normal approximation, not the Wilson interval).
z = 1.96 # 95% confidence
for _metric, _support in (("accuracy", "total_count"),
                          ("precision", "pred_pos_count"),
                          ("recall", "true_pos_count")):
    _p = df[_metric]
    df[_metric + "_errn95"] = z * (_p * (1 - _p) / df[_support]).map(np.sqrt)
# This is probably not the right way to combine for F1 score - TODO to figure this out?
df['f1_errn95'] = harmonic_mean(df['precision_errn95'], df['recall_errn95'])

def _get_final_score(row):
    return row['f1_score'], row['f1_errn95']

# Evaluate _get_final_score on every row, then transpose the (score, error)
# pairs into the two output columns.
_final_pairs = [_get_final_score(row) for _, row in df.iterrows()]
df['score'], df['score_errn95'] = zip(*_final_pairs)

For DPR, we need to average across multiple runs to get a good estimate of performance.

In [9]:
# Select the per-seed DPR rows (excluding run-info rows).
mask = (
    (df['task'] == 'dpr')
    & (df['label'] != "__run_info__")
    & df['seed'].notnull()
)
gb_cols = ["tag", "exp_name", "exp_type", "task", "label", "split", "display_col"]
gb = df[mask].groupby(by=gb_cols)

# One summary row per group: mean score over seeds, with a 95% interval
# from the standard error of that mean. The row is marked seed="_mean_".
new_rows = []
for key, idxs in gb.groups.items():
    fold_scores = df.loc[idxs, "score"]
    new_rows.append({
        **dict(zip(gb_cols, key)),
        "seed": "_mean_",
        "score": fold_scores.mean(),
        "score_errn95": 1.96 * np.sqrt(fold_scores.var()/len(idxs)),
    })

agg_df = pd.DataFrame.from_records(new_rows)
df = pd.concat([df, agg_df], ignore_index=True, sort=False)

For SemEval 2010 Task 8, the official metric is macro-averaged F1 over non-Other labels. Compute this so we can compare to SOTA.

In [10]:
# SemEval rows that belong to a split and carry a positive relation label.
mask = (
    (df['task'] == 'rel-semeval')
    & df['split'].notnull()
    & df['label'].map(is_positive_relation)
)
_id_cols = ['run', 'split']
_agg_cols = ['score']
# Unweighted mean of the per-label scores within each (run, split).
gb = df.loc[mask, _id_cols + _agg_cols].groupby(_id_cols)
afd = gb.mean().reset_index()

csv_args = dict(float_format="%.4f")
print(afd.to_csv(index=False, **csv_args))

run,split,score
/nfs/jsalt/exp/edges-20190205-semeval/bert-base-uncased-cat-edges-rel-semeval/run,test,0.7033
/nfs/jsalt/exp/edges-20190205-semeval/bert-base-uncased-cat-edges-rel-semeval/run,val,0.7056
/nfs/jsalt/exp/edges-20190205-semeval/bert-base-uncased-lex-edges-rel-semeval/run,test,0.5326
/nfs/jsalt/exp/edges-20190205-semeval/bert-base-uncased-lex-edges-rel-semeval/run,val,0.5021
/nfs/jsalt/exp/edges-20190205-semeval/bert-base-uncased-mix-edges-rel-semeval/run,test,0.7441
/nfs/jsalt/exp/edges-20190205-semeval/bert-base-uncased-mix-edges-rel-semeval/run,val,0.7556
/nfs/jsalt/exp/edges-20190205-semeval/bert-large-uncased-cat-edges-rel-semeval/run,test,0.7041
/nfs/jsalt/exp/edges-20190205-semeval/bert-large-uncased-cat-edges-rel-semeval/run,val,0.6980
/nfs/jsalt/exp/edges-20190205-semeval/bert-large-uncased-lex-edges-rel-semeval/run,test,0.5254
/nfs/jsalt/exp/edges-20190205-semeval/bert-large-uncased-lex-edges-rel-semeval/run,val,0.5309
/nfs/jsalt/exp/edges-20190205-semeval/bert-la

## Compute clean metrics for each task

For most tasks this is just the micro or macro average F1, but we need to ignore the 0 label for coref, and drop references and continuations for SRL.

In [11]:
# SPLIT = "test"
SPLIT = "val"

# Tasks left out of the table.
_skipped_tasks = [
    "constituent-ontonotes",
    "ner-tacred",
    "coref-gap",
    "coref-gap-ontonotes",
    "noun-verb",
    # "rel-tacred",
    # "rel-semeval",
]
mask = (
    (df['split'] == SPLIT)
    & (df['exp_type'] != "openai")
    & df['exp_type'].map(lambda s: '-cased-' not in s)  # skip cased BERT for now
    & ((df['task'] != 'dpr') | df['seed'].notnull())  # only use folds for DPR
    & ~df['task'].isin(_skipped_tasks)
)

# For each task, keep only the rows whose label is that task's headline
# metric (plus any breakdown rows shown in the table).
final_scores = []
for task in df['task'].unique():
    task_scores = df[mask & (df['task'] == task)]
    if is_coref_task(task):
        wanted_labels = ["1"]
        # For GAP coref, have stratified by gender
        if task.startswith("coref-gap"):
            wanted_labels += ["_info.pronoun_gender_MASCULINE_1_",
                              "_info.pronoun_gender_FEMININE_1_"]
    elif task == "dpr":
        # DPR uses the across-seed mean row rather than a plain label match.
        dpr_mask = (task_scores['seed'] == "_mean_") & (task_scores['label'] == "_micro_avg_")
        final_scores.append(task_scores[dpr_mask])
        continue
    elif is_srl_task(task):
        # Clean micro average covers only core or non-core roles.
        wanted_labels = ['_core_', '_non_core_', '_clean_micro_']
    elif is_constituent_task(task):
        wanted_labels = ['_pos_', '_nonterminal_', '_micro_avg_']
    elif is_relation_task(task):
        # Relation tasks include specific "no_relation" label
        wanted_labels = ['_clean_micro_']
    elif task == "noun-verb":
        # Noun-verb reports accuracy on VERB class
        wanted_labels = ['VERB']
    else:
        wanted_labels = ['_micro_avg_']
    final_scores.extend(task_scores[task_scores['label'] == wanted]
                        for wanted in wanted_labels)

fdf = pd.concat(final_scores, axis=0, ignore_index=True, sort=False)
# fdf['task_and_metric'] = ["%s-%s" % tl for tl in zip(fdf.task, fdf.label)]
def format_display_row(task, label, seed):
    """Build the row key "<task>-<label>", with ":<seed>" appended if set.

    A seed counts as unset when it is missing (None / NaN / NA) or otherwise
    falsy. NaN has to be tested explicitly because it is truthy in Python:
    a plain `if seed:` would turn a NaN seed into a ":nan" suffix.
    """
    ret = f"{task}-{label}"
    if not pd.isna(seed) and seed:
        ret += f":{seed}"
    return ret

# Row key for pivoting: task, label and (when set) seed.
fdf['display_row'] = list(map(format_display_row, fdf['task'], fdf['label'], fdf['seed']))
print(len(fdf))
fdf

269


Unnamed: 0,tag,seed,exp_name,exp_type,label,num_epochs,num_steps,run,task,split,...,precision,recall,f1_score,accuracy_errn95,precision_errn95,recall_errn95,f1_errn95,score,score_errn95,display_row
0,base,,openai-cat-edges-coref-ontonotes-conll,openai-cat,1,,,/nfs/jsalt/home/iftenney/exp/edges-20180921-op...,coref-ontonotes-conll,val,...,0.855263,0.817258,0.835829,0.003098,0.009258,0.009941,0.009587,0.835829,0.009587,coref-ontonotes-conll-1
1,base,,openai-lex-edges-coref-ontonotes-conll,openai-lex,1,,,/nfs/jsalt/home/iftenney/exp/edges-20180921-op...,coref-ontonotes-conll,val,...,0.808771,0.660696,0.727273,0.003768,0.011192,0.012179,0.011665,0.727273,0.011665,coref-ontonotes-conll-1
2,base,,openai-bwb-edges-coref-ontonotes-conll,openai-bwb,1,,,/nfs/jsalt/home/iftenney/exp/edges-20180922-op...,coref-ontonotes-conll,val,...,0.855844,0.810885,0.832758,0.003118,0.009282,0.010073,0.009661,0.832758,0.009661,coref-ontonotes-conll-1
3,base,,cove-edges-coref-ontonotes-conll,cove,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/ba...,coref-ontonotes-conll,val,...,0.835032,0.761970,0.796830,0.003380,0.009994,0.010955,0.010452,0.796830,0.010452,coref-ontonotes-conll-1
4,base,,elmo-chars-edges-coref-ontonotes-conll,elmo-chars,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/ba...,coref-ontonotes-conll,val,...,0.802593,0.714261,0.755855,0.003651,0.010853,0.011621,0.011224,0.755855,0.011224,coref-ontonotes-conll-1
5,base,,elmo-full-edges-coref-ontonotes-conll,elmo-full,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/ba...,coref-ontonotes-conll,val,...,0.861156,0.834309,0.847520,0.003003,0.009036,0.009564,0.009293,0.847520,0.009293,coref-ontonotes-conll-1
6,base,,elmo-ortho-edges-coref-ontonotes-conll,elmo-ortho,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/ba...,coref-ontonotes-conll,val,...,0.812175,0.781261,0.796418,0.003423,0.010243,0.010634,0.010435,0.796418,0.010435,coref-ontonotes-conll-1
7,base,,glove-edges-coref-ontonotes-conll,glove,1,,,/nfs/jsalt/home/iftenney/exp/final_20180927/ba...,coref-ontonotes-conll,val,...,0.793172,0.684292,0.734720,0.003763,0.011217,0.011956,0.011575,0.734720,0.011575,coref-ontonotes-conll-1
8,base,,bert-base-uncased-cat-edges-coref-ontonotes-conll,bert-base-uncased-cat,1,,,/nfs/jsalt/exp/edges-20190124-bert/bert-base-u...,coref-ontonotes-conll,val,...,0.892236,0.892697,0.892467,0.002567,0.007974,0.007961,0.007968,0.892467,0.007968,coref-ontonotes-conll-1
9,base,,bert-base-uncased-lex-edges-coref-ontonotes-conll,bert-base-uncased-lex,1,,,/nfs/jsalt/exp/edges-20190124-bert/bert-base-u...,coref-ontonotes-conll,val,...,0.797980,0.721151,0.757622,0.003651,0.010864,0.011535,0.011189,0.757622,0.011189,coref-ontonotes-conll-1


## Make Table 2 (model comparison)

This should get us 90% of the way there with formatting.

In [12]:
# scores and confidence intervals
sdf = fdf.pivot(index="display_row", columns="display_col", values="score")
cdf = fdf.pivot(index="display_row", columns="display_col", values="score_errn95")

# Which half of Table 2 to render:
#   "top"    - original table (CoVe / ELMo / Transformer LM)
#   "bottom" - new results (BERT base / large)
# Previously both halves were built back to back and the second silently
# overwrote the first; the switch makes that choice explicit.
TABLE_PART = "bottom"

# NOTE: LaTeX snippets are raw strings. In a normal string literal "\t" is a
# TAB character (so "\texttt" would lose its backslash) and "\D" is an
# invalid escape sequence.
columns = collections.OrderedDict()
if TABLE_PART == "top":
    columns[("CoVe", "Lex.")] = sdf['glove (base)']
    columns[("CoVe", "Full")] = sdf['cove (base)']
    columns[("CoVe", r"Abs. $\Delta$")] = sdf['cove (base)'] - sdf['glove (base)']
    columns[("ELMo", "Lex.")] = sdf['elmo-chars (base)']
    columns[("ELMo", "Full")] = sdf['elmo-full (base)']
    columns[("ELMo", r"Abs. $\Delta$")] = sdf['elmo-full (base)'] - sdf['elmo-chars (base)']
    columns[("Transformer LM", "Lex.")] = sdf['openai-lex (base)']
    columns[("Transformer LM", r"\texttt{cat}")] = sdf['openai-cat (base)']
    # columns[("Transformer LM", r"Abs. $\Delta$")] = sdf['openai-cat (base)'] - sdf['openai-lex (base)']
    columns[("Transformer LM", r"\texttt{mix}")] = sdf['openai-mix (base)']
    # columns[("Transformer LM", r"Abs. $\Delta$")] = sdf['openai-mix (base)'] - sdf['openai-lex (base)']
    COLUMN_FORMAT = "|ccr|ccr|ccc"
elif TABLE_PART == "bottom":
    columns[("BERT (base)", "F1 Score", "Lex.")] = sdf['bert-base-uncased-lex (base)']
    columns[("BERT (base)", "F1 Score", r"\texttt{cat}")] = sdf['bert-base-uncased-cat (base)']
    columns[("BERT (base)", "F1 Score", r"\texttt{mix}")] = sdf['bert-base-uncased-mix (base)']
    # columns[("BERT (base)", r"Abs. $\Delta$", "Lex.")] = sdf['bert-base-uncased-mix (base)'] - sdf['bert-base-uncased-lex (base)']
    columns[("BERT (base)", r"Abs. $\Delta$", "ELMo")] = sdf['bert-base-uncased-mix (base)'] - sdf['elmo-full (base)']
    columns[("BERT (large)", "F1 Score", "Lex.")] = sdf['bert-large-uncased-lex (base)']
    columns[("BERT (large)", "F1 Score", r"\texttt{cat}")] = sdf['bert-large-uncased-cat (base)']
    columns[("BERT (large)", "F1 Score", r"\texttt{mix}")] = sdf['bert-large-uncased-mix (base)']
    # columns[("BERT (large)", r"Abs. $\Delta$", "Lex.")] = sdf['bert-large-uncased-mix (base)'] - sdf['bert-large-uncased-lex (base)']
    columns[("BERT (large)", r"Abs. $\Delta$", "(base)")] = sdf['bert-large-uncased-mix (base)'] - sdf['bert-base-uncased-mix (base)']
    columns[("BERT (large)", r"Abs. $\Delta$", "ELMo")] = sdf['bert-large-uncased-mix (base)'] - sdf['elmo-full (base)']
    COLUMN_FORMAT = "|cccr|cccrr"
else:
    raise ValueError(f"TABLE_PART must be 'top' or 'bottom', got {TABLE_PART!r}")

# Make a DataFrame that looks like the LaTeX table.
# One column per table column, with rows ordered by task_sort_key.
pdf = pd.DataFrame(columns)
_row_order = sorted(pdf.index, key=task_sort_key)
pdf = pdf.reindex(_row_order, axis=0)

# Compute macro average as series, with entries for each column
def _is_core_task(row):
    return not re.match(r'.*-_(non_)?core_$', row)
# Rows that count towards the macro average (breakdown rows are excluded).
core_task_mask = list(pdf.index.map(_is_core_task))
macro_average = pdf[core_task_mask].mean()
macro_average.name = "Macro Average"

# Replace "<task>-<label>" row keys with display names; the label is
# whatever follows the last "-" in the key.
pdf.index = [_make_display_name(*row.rsplit("-", 1)) for row in pdf.index]
# Add the macro average as a final row. DataFrame.append was removed in
# pandas 2.0; concatenating the Series as a one-row frame is equivalent.
pdf = pd.concat([pdf, macro_average.to_frame().T])

# Get row maxima
# Per-row maximum over the score columns only; columns whose second header
# level mentions "Delta" are excluded.
score_cols = [c for c in pdf.columns if "Delta" not in c[1]]
maxima = {row: pdf.loc[row, score_cols].max() for row in pdf.index}

# Format all numbers into strings
# _format_score_col = lambda f: "{:.0f}".format(100*f)
_format_score_col = lambda f: "{:.1f}".format(100*f)
_format_delta_col = lambda f: "{:.1f}".format(100*f)
# def _get_margin_percent(row):
#     if row.startswith("Winograd"):
#         return 3
#     elif row.startswith("SPR"):
#         return 1
#     else:
#         return 0.5
def _get_margin_percent(row):
    if row.startswith("Winograd"):
        return 1.5
    elif row.startswith("SPR"):
        return 0.5
    else:
        return 0.25


def _format_cell(i, row, col, val):
    """Format one numeric table cell as a LaTeX string.

    Args:
        i: positional index of the row in the table.
        row: display name of the row (key into `maxima`).
        col: column key (tuple); col[1] containing "Delta" marks a delta column.
        val: numeric cell value as a fraction (0.5 -> "50.0").

    Returns:
        The value as a percentage string, wrapped in \\textit{} for rows
        excluded from the macro average and in \\textbf{} when the value is
        within the row's margin of the row maximum.
    """
    num_val = val
    if "Delta" in col[1]:
        val = _format_delta_col(val)
    elif row == "Macro Average":
        val = _format_delta_col(val)
    else:
        val = _format_score_col(val)
    # core_task_mask has no entry for rows appended after it was built
    # (the macro-average row), hence the bounds check.
    if i < len(core_task_mask) and not core_task_mask[i]:
        # Raw string: "\t" in a normal literal is a TAB, not backslash-t.
        val = r"\textit{" + val + "}"
#     if row.startswith("Winograd") and col == ("BERT (large)", "F1 Score", r"\texttt{mix}"):
#         val = val + r" $\pm$ 6"
    margin = 0.01 * _get_margin_percent(row)
    if num_val > maxima[row] - margin:
        val = r"\textbf{" + val + "}"
    return val
    
# The cells are about to be overwritten with strings. Cast the float columns
# to object first: assigning a str into a float64 column relies on implicit
# upcasting, which recent pandas versions deprecate.
pdf = pdf.astype(object)
for i, row in enumerate(pdf.index):
    for col in pdf.columns:
        pdf.loc[row, col] = _format_cell(i, row, col, pdf.loc[row, col])
        
# Change some row labels
def _rename_row(row):
    if row == "SRL (core)":
        return "\quad Core roles"
    elif row == "SRL (non-core)":
        return "\quad Non-core roles"
    elif row.endswith(" (mean)"):
        return row[:-7]
    elif row.startswith("Relations "):
        return "Rel. " + row.split(" ", 1)[1]
    elif row == "Macro Average":
        return "\midrule " + row
    else:
        return row
pdf.index = pdf.index.map(_rename_row)

# Show the formatted table before the headers are wrapped in LaTeX markup.
display(pdf)

# Make columns bold. Raw string: "\t" in a normal literal is a TAB, which
# would corrupt the \textbf command.
_make_bold = lambda text: r"\textbf{" + text + "}"
pdf.columns = pdf.columns.map(lambda c: tuple(map(_make_bold, c)))

# escape=False keeps the LaTeX markup in cells and headers intact.
tex = pdf.to_latex(column_format="@{}l"+COLUMN_FORMAT+"@{}", float_format="%.2f",
                   bold_rows=False, escape=False, multicolumn=True, multicolumn_format="c")
print(tex)

Unnamed: 0_level_0,BERT (base),BERT (base),BERT (base),BERT (base),BERT (large),BERT (large),BERT (large),BERT (large),BERT (large)
Unnamed: 0_level_1,F1 Score,F1 Score,F1 Score,Abs. $\Delta$,F1 Score,F1 Score,F1 Score,Abs. $\Delta$,Abs. $\Delta$
Unnamed: 0_level_2,Lex.,\texttt{cat},\texttt{mix},ELMo,Lex.,\texttt{cat},\texttt{mix},(base),ELMo
Part-of-Speech,88.3,\textbf{96.8},96.5,-0.1,88.0,96.3,\textbf{96.6},0.2,0.0
Constituents,66.9,83.5,86.6,1.9,67.8,79.6,\textbf{86.9},0.4,2.2
Dependencies,80.2,93.4,95.4,1.2,80.3,92.1,\textbf{95.6},0.3,1.4
Entities,89.7,95.9,96.0,0.8,90.7,96.0,\textbf{96.5},0.4,1.2
SRL (all),74.0,89.0,90.9,1.1,75.1,87.7,\textbf{92.0},1.0,2.1
\quad Core roles,\textit{73.3},\textit{90.7},\textit{93.0},\textit{0.8},\textit{74.5},\textit{89.2},\textbf{\textit{94.1}},\textit{1.1},\textit{1.9}
\quad Non-core roles,\textit{76.0},\textit{84.6},\textit{85.8},\textit{1.6},\textit{76.4},\textit{83.9},\textbf{\textit{86.8}},\textit{1.0},\textit{2.6}
OntoNotes coref.,75.8,89.2,90.5,5.7,76.0,89.9,\textbf{92.2},1.8,7.5
SPR1,76.7,83.3,\textbf{84.0},1.1,76.7,82.3,\textbf{83.7},-0.3,0.7
SPR2,80.4,\textbf{82.6},\textbf{82.8},0.3,80.5,82.3,\textbf{82.9},0.1,0.4


\begin{tabular}{@{}l|cccr|cccrr@{}}
\toprule
{} & \multicolumn{4}{c}{\textbf{BERT (base)}} & \multicolumn{5}{c}{\textbf{BERT (large)}} \\
{} & \multicolumn{3}{c}{\textbf{F1 Score}} & \textbf{Abs. $\Delta$} & \multicolumn{3}{c}{\textbf{F1 Score}} & \multicolumn{2}{c}{\textbf{Abs. $\Delta$}} \\
{} &        \textbf{Lex.} & \textbf{\texttt{cat}} & \textbf{\texttt{mix}} &          \textbf{ELMo} &         \textbf{Lex.} & \textbf{\texttt{cat}} &   \textbf{\texttt{mix}} &        \textbf{(base)} & \textbf{ELMo} \\
\midrule
Part-of-Speech         &                 88.3 &         \textbf{96.8} &                  96.5 &                   -0.1 &                  88.0 &                  96.3 &           \textbf{96.6} &                    0.2 &           0.0 \\
Constituents           &                 66.9 &                  83.5 &                  86.6 &                    1.9 &                  67.8 &                  79.6 &           \textbf{86.9} &                    0.4 &           2.2 \\
Depend

Pivot DataFrame to present each task on a row, and each experiment on a column.

This form is suitable to copy-paste into a spreadsheet.

In [13]:
# Wide-form table for a spreadsheet: one row per display_row, one column per
# display_col, holding `score`. Rows are ordered by task_sort_key and columns
# by exp_type_sort_key; values are printed scaled by 100.
sheet_df = fdf.pivot(index="display_row", columns="display_col", values="score")
sheet_df = sheet_df.reindex(
    index=sorted(sheet_df.index, key=task_sort_key),
    columns=sorted(sheet_df.columns, key=exp_type_sort_key),
)
print((sheet_df * 100).to_csv(**csv_args))

display_row,glove (base),glove (bow),cove (base),elmo-chars (base),elmo-chars (bow),elmo-chars (cnn1),elmo-chars (cnn2),elmo-ortho (base),elmo-full (base),openai-lex (base),openai-lex (bow),openai-cat (base),openai-mix (base),openai-bwb (base),bert-base-uncased-lex (base),bert-base-uncased-lex (bow),bert-base-uncased-cat (base),bert-base-uncased-mix (base),bert-large-uncased-lex (base),bert-large-uncased-lex (bow),bert-large-uncased-cat (base),bert-large-uncased-mix (base)
pos-ontonotes-_micro_avg_,85.9228,88.4670,93.9147,90.6979,90.8564,95.7660,96.0406,91.2232,96.6235,88.0053,90.2986,94.6398,94.6746,,88.3494,90.6203,96.7583,96.4810,87.9578,90.7934,96.2727,96.6480
nonterminal-ontonotes-_micro_avg_,55.1509,58.9663,81.4808,68.7957,72.6901,83.9149,84.7074,71.0830,84.7232,64.3216,66.9262,80.9122,84.1387,,66.9288,70.6040,83.4807,86.5765,67.7886,71.4839,79.6116,86.9384
dep-labeling-ewt-_micro_avg_,76.9048,79.0899,90.0856,80.6221,80.5477,90.7202,91.7454,85.2772,94.2093,77.7549,79.0374,92.0672

Print the same format, but show the 95% confidence intervals for each score.

In [14]:
# Same wide-form layout as the score table, but holding `score_errn95`.
# Values are printed scaled by 100.
sheet_df = fdf.pivot(index="display_row", columns="display_col", values="score_errn95")
sheet_df = sheet_df.reindex(
    index=sorted(sheet_df.index, key=task_sort_key),
    columns=sorted(sheet_df.columns, key=exp_type_sort_key),
)
print((sheet_df * 100).to_csv(**csv_args))

display_row,glove (base),glove (bow),cove (base),elmo-chars (base),elmo-chars (bow),elmo-chars (cnn1),elmo-chars (cnn2),elmo-ortho (base),elmo-full (base),openai-lex (base),openai-lex (bow),openai-cat (base),openai-mix (base),openai-bwb (base),bert-base-uncased-lex (base),bert-base-uncased-lex (bow),bert-base-uncased-cat (base),bert-base-uncased-mix (base),bert-large-uncased-lex (base),bert-large-uncased-lex (bow),bert-large-uncased-cat (base),bert-large-uncased-mix (base)
pos-ontonotes-_micro_avg_,0.1253,0.1164,0.0850,0.1059,0.1051,0.0732,0.0707,0.1030,0.0656,0.1186,0.1078,0.0819,0.0818,,0.1171,0.1063,0.0644,0.0671,0.1188,0.1054,0.0690,0.0655
nonterminal-ontonotes-_micro_avg_,0.2055,0.2014,0.1514,0.1847,0.1765,0.1434,0.1403,0.1792,0.1403,0.1925,0.1883,0.1538,0.1424,,0.1881,0.1808,0.1450,0.1325,0.1863,0.1787,0.1577,0.1310
dep-labeling-ewt-_micro_avg_,0.4942,0.4834,0.3664,0.4716,0.4709,0.3512,0.3322,0.4253,0.2838,0.4984,0.4984,0.3316,0.2898,0.3295,0.4827,0.4704,0.3026,0.2572,0.4828,0.47

# Plot "final scores"

Make ELMo baselines figure / table.

In [None]:
# Show the column labels of fdf (the frame the plot below is built from).
fdf.columns

In [None]:
# Select the ELMo baselines: every elmo-chars run, plus the "base" runs of
# elmo-ortho and elmo-full.
mask = (fdf.exp_type == "elmo-chars")
mask |= ((fdf.exp_type == "elmo-ortho") | (fdf.exp_type == "elmo-full")) & (fdf.tag == "base")
plot_df = fdf[mask].copy()

_SCORE_COL="f1_score"
_ERROR_COL="f1_errn95"
# NOTE(review): the sort key is computed from `display_col`, while the
# spreadsheet cells apply task_sort_key to `display_row` and
# exp_type_sort_key to `display_col`. Confirm this is the intended key.
plot_df['_sort_key'] = plot_df['display_col'].map(task_sort_key)
plot_df = plot_df.sort_values(by="_sort_key", axis=0)
plot_df['_display_name'] = [_make_display_name(*tl)
                            for tl in zip(plot_df['task'], plot_df['label'])]
# (exp_type, tag) pairs; matched against the first two fields of col_keys.
plot_df['col_key'] = list(zip(plot_df['exp_type'], plot_df['tag']))
plot_df['fmt_score'] = plot_df[_SCORE_COL].map(
    lambda s: "{:.0f}".format(100*s)
)
# Error-bar endpoints; not read by any glyph below, but carried into the
# per-series ColumnDataSource.
plot_df['_err_upper'] = plot_df[_SCORE_COL] + plot_df[_ERROR_COL]
plot_df['_err_lower'] = plot_df[_SCORE_COL] - plot_df[_ERROR_COL]
print("Found %s entries" % len(plot_df))

# Reverse the labels for the y axis; the trailing "" factor adds an empty
# row, near which the rotated column headers are placed.
ordered_labels = list(reversed(plot_df['_display_name'].unique())) + [""]
_RP = 0.3
factor_range = bokeh.models.FactorRange(*ordered_labels, range_padding=_RP,
                                        range_padding_units='absolute')
tools = "save,reset"

xstart = 20
xend = 100
_FONT_SIZE = "15pt"
p = bp.figure(y_range=factor_range, x_range=[xstart, xend],
              height=550, width=1200, tools=tools)

label_kw = dict(text_align="center", text_baseline="middle",
                text_font_size=_FONT_SIZE)

palette = bokeh.palettes.Category20[20]
# (exp_type, tag, legend/column label, scatter marker)
col_keys = [
    ("elmo-chars", "base", "Lex.", "diamond"),
    ("elmo-chars", "cnn1", "CNN1", "triangle"),
    ("elmo-chars", "cnn2", "CNN2", "inverted_triangle"),
    ("elmo-ortho", "base", "Ortho.", "square"),
    ("elmo-full", "base", "Full", "circle"),
]
for i,ck in enumerate(col_keys):
    print(ck)
    ds = plot_df[plot_df.col_key == ck[:2]]
    y = ds["_display_name"]
    x = 100*ds[_SCORE_COL]
    e = 100*ds[_ERROR_COL]
    c = palette[2*i]
    # Translucent band spanning score +/- error.
    p.hbar(y=y, left=x-e, right=x+e, height=0.90,
           fill_color=c, fill_alpha=0.5,
           line_color=None, line_width=0)
    # Thin black tick at the score itself.
    p.hbar(y=y, left=x-0.05, right=x+0.05, height=0.90,
           fill_color="Black", fill_alpha=1.0,
           line_color=None, line_width=0)
    # Faint gray bar from 0 up to the score.
    p.hbar(y=y, left=0, right=x, height=0.90,
           fill_color="Gray", fill_alpha=0.05,
           line_color=None, line_width=0)
    # NOTE(review): Bokeh 1.4+ deprecates `legend=` in favour of
    # `legend_label=`; left unchanged because the Bokeh version in use is
    # not visible here.
    p.scatter(y=y, x=x, size=12, fill_color=c, legend=ck[2],
              marker=ck[3])

    # Numeric score labels, one text column per series at x = lpos.
    dss = bokeh.models.ColumnDataSource(data=ds)
    lpos = xstart+(0.7+i)*(xend-xstart)/20
    score_labels = bokeh.models.LabelSet(y="_display_name", x=lpos,
                                         text="fmt_score", source=dss, **label_kw)
    p.add_layout(score_labels)
    # Rotated header for this text column.
    cat_label = bokeh.models.Label(y=len(ordered_labels)-1+0.6, x=lpos, text=ck[2],
                                   angle=np.pi/6, **label_kw)
    p.add_layout(cat_label)


p.xaxis.bounds = (0,100)
p.yaxis.bounds = (-_RP, 2*len(col_keys)+2)
p.yaxis.major_label_text_font_size = "13pt"
p.xaxis.major_label_text_font_size = _FONT_SIZE
p.ygrid.grid_line_alpha = 0
p.xgrid.grid_line_alpha = 0
p.xaxis.axis_label = "F1 Score"
p.xaxis.axis_label_text_font_size = _FONT_SIZE
p.legend.orientation = "horizontal"
p.legend.background_fill_color = None
p.legend.border_line_color = None
p.legend.label_text_font_size = _FONT_SIZE

p.min_border = 0
bp.show(p)

## Plot by constituent height, span distance, etc.

Run one of the cells below to populate `plot_df`, and uncomment the corresponding title block.

In [None]:
# Show the column labels of df, the frame the selection cells below filter.
df.columns

In [None]:
# Validation rows of nonterminal-ontonotes with stratifier "info.height",
# limited to every elmo-chars run plus the "base" runs of elmo-full and
# elmo-ortho.
mask = (
    (df["split"] == "val")
    & (df["stratifier"] == "info.height")
    & (df["task"] == "nonterminal-ontonotes")
    & (
        (df["exp_type"].isin(["elmo-full", "elmo-ortho"]) & (df["tag"] == "base"))
        | (df["exp_type"] == "elmo-chars")
    )
)
plot_df = df[mask].copy()

In [None]:
# Validation rows of srl-conll2012 that have a stratifier set, limited to
# every elmo-chars run plus the "base" runs of elmo-full and elmo-ortho.
mask = (
    (df["split"] == "val")
    & df["stratifier"].notna()
    & (df["task"] == "srl-conll2012")
    & (
        (df["exp_type"].isin(["elmo-full", "elmo-ortho"]) & (df["tag"] == "base"))
        | (df["exp_type"] == "elmo-chars")
    )
)
plot_df = df[mask].copy()

In [None]:
# Validation rows of coref-ontonotes-conll that have a stratifier set,
# limited to every elmo-chars run plus the "base" runs of elmo-full and
# elmo-ortho.
mask = (
    (df["split"] == "val")
    & df["stratifier"].notna()
    & (df["task"] == "coref-ontonotes-conll")
    & (
        (df["exp_type"].isin(["elmo-full", "elmo-ortho"]) & (df["tag"] == "base"))
        | (df["exp_type"] == "elmo-chars")
    )
)
plot_df = df[mask].copy()

In [15]:
# Validation rows of dep-labeling-ewt that have a stratifier set, limited to
# every elmo-chars run plus the "base" runs of elmo-full and elmo-ortho.
mask = (
    (df["split"] == "val")
    & df["stratifier"].notna()
    & (df["task"] == "dep-labeling-ewt")
    & (
        (df["exp_type"].isin(["elmo-full", "elmo-ortho"]) & (df["tag"] == "base"))
        | (df["exp_type"] == "elmo-chars")
    )
)
plot_df = df[mask].copy()

In [16]:
# Distinct `tag` values present in the current plot_df selection.
plot_df.tag.unique()

array(['base', 'bow', 'cnn1', 'cnn2'], dtype=object)

In [17]:
height = 400
width = 900
_FONT_SIZE = "14pt"

# Title / axis settings: keep exactly one block active, matching the
# selection cell that populated plot_df.
# title = "F1 by height on OntoNotes constituent labeling"
# _X_LABEL = "Constituent Height"
# _X_RANGE = [1.5,20.5]
# _Y_RANGE = [0,1]
# title = "F1 by span distance on OntoNotes SRL"
# _X_LABEL = "Span separation distance (tokens)"
# _X_RANGE = [-0.5, 15.5]
# _Y_RANGE = [0.5,1]
# _X_LABEL = "Span separation distance (tokens)"
# _X_RANGE = [-0.5, 15.5]
# _Y_RANGE = [0.5,1]
# title = "F1 by span distance on dependency labeling"
_X_LABEL = "Span separation distance (tokens)"
_X_RANGE = [-0.5, 15.5]
_Y_RANGE = [0.4,1.05]

# (exp_type, tag, legend label, scatter marker)
col_keys = [
    ("elmo-chars", "base", "Lex.", "diamond"),
    ("elmo-chars", "cnn1", "CNN1", "triangle"),
    ("elmo-chars", "cnn2", "CNN2", "inverted_triangle"),
    ("elmo-ortho", "base", "Ortho.", "square"),
    ("elmo-full", "base", "Full", "circle"),
]

# (exp_type, tag) pairs; matched against the first two fields of col_keys.
plot_df['col_key'] = list(zip(plot_df['exp_type'], plot_df['tag']))

_SCORE_COL="f1_score"
_ERROR_COL="f1_errn95"

tools = "save,reset"

palette = bokeh.palettes.Category20[20]
p = bp.figure(x_range=_X_RANGE, y_range=_Y_RANGE,
              width=width, height=height,
              tools=tools)
crs = []  # scatter renderers, so the hover tool attaches only to the markers
for i, ck in enumerate(col_keys):
    ds = plot_df[plot_df['col_key'] == ck[:2]]
    x = ds["stratum_key"]
    y = ds[_SCORE_COL]
    e = ds[_ERROR_COL]
    c = palette[2*i]
    p.line(x=x, y=y, color=c, line_width=2)
    cr = p.scatter(x=x, y=y, color=c, size=12, hover_fill_color="Gray",
                   legend=ck[2], marker=ck[3])
    crs.append(cr)
    # Shaded band spanning score +/- error, drawn beneath the line.
    ds = bokeh.models.ColumnDataSource(data=dict(x=x, upper=y+e, lower=y-e))
    error_band = bokeh.models.Band(base='x', upper='upper', lower='lower',
                                   source=ds, level='underlay',
                                   fill_alpha=0.4, line_width=1,
                                   line_color=palette[2*i],
                                   fill_color=palette[2*i+1])
    p.add_layout(error_band)

p.yaxis.bounds = (0,1)

##
# Overlay histogram at bottom of the plot (this looks nice)
counts = plot_df['tp_count'] + plot_df['fn_count']
strata = plot_df['stratum_key']
hist_height = 0.3
# Secondary y range stretched so the tallest bar reaches hist_height of the
# plot height.
p.extra_y_ranges = {"hist": bokeh.models.Range1d(min(counts),
                                                 max(counts)/hist_height)}
p.add_layout(bokeh.models.LinearAxis(y_range_name="hist",
                                     bounds=(min(counts), max(counts))), "right")
p.vbar(x=strata, top=counts, width=0.9,
       y_range_name='hist', color="Gray")

# Add fancy hover tool. The x entry is labelled with the active axis label
# rather than a fixed "Height", which was wrong for the distance plots.
tooltips = [
    (_X_LABEL, "@x"),
    ("F1 score", "@y{0.00}"),
]
p.add_tools(bokeh.models.HoverTool(tooltips=tooltips, renderers=crs))
p.xaxis.axis_label = _X_LABEL
p.yaxis[0].axis_label = "F1 Score"
p.yaxis[1].axis_label = ""
p.yaxis[1].formatter = bokeh.models.NumeralTickFormatter(format="0a")
p.legend.orientation = "horizontal"

p.yaxis.major_label_text_font_size = "13pt"
p.xaxis.major_label_text_font_size = _FONT_SIZE
p.xaxis.axis_label_text_font_size = _FONT_SIZE
p.yaxis.axis_label_text_font_size = _FONT_SIZE
p.legend.background_fill_color = None
p.legend.border_line_color = None
p.legend.label_text_font_size = _FONT_SIZE

p.min_border = 0

bp.show(p)

In [23]:
import bokeh
import bokeh.plotting as bp
bp.output_notebook()

import datetime
import socket
def get_compact_timestamp():
    """Return the current local time formatted as YYYYMMDD.HHMMSS."""
    return f"{datetime.datetime.now():%Y%m%d.%H%M%S}"

def _save_figure_to_bucket(fig, name, title=None, export_format="html"):
    now = get_compact_timestamp()
    fname = f"{name}.{now:s}.{export_format}"
    title = title or name
    if fname.endswith('.png'):
        bokeh.io.export_png(p, os.path.join("/tmp", fname))
    else:
        bp.save(p, os.path.join("/tmp", fname), title=title, 
                resources=bokeh.resources.CDN)
    hostname = socket.gethostname()
    !gsutil cp /tmp/$fname gs://edge-probing/$hostname/plots/$fname
    !gsutil acl ch -u AllUsers:R gs://edge-probing/$hostname/plots/$fname
    url = f"https://storage.googleapis.com/edge-probing/{hostname}/plots/{fname}"
    print(f"Public URL: {url}")
    return url

In [24]:
# Export and upload the current figure `p`; the cell result is the URL
# returned by _save_figure_to_bucket.
_save_figure_to_bucket(p, name="scores_by_distance",
                       title="Scores by distance")

Copying file:///tmp/scores_by_distance.20190504.001735.html [Content-Type=text/html]...
/ [1 files][ 44.8 KiB/ 44.8 KiB]                                                
Operation completed over 1 objects/44.8 KiB.                                     
Updated ACL on gs://edge-probing/iftenney/plots/scores_by_distance.20190504.001735.html
Public URL: https://storage.googleapis.com/edge-probing/iftenney/plots/scores_by_distance.20190504.001735.html


'https://storage.googleapis.com/edge-probing/iftenney/plots/scores_by_distance.20190504.001735.html'

In [None]:
# Keep a second reference to the current figure under the name p1.
p1 = p

In [None]:
# Validation rows of constituent-ontonotes with stratifier "info.height",
# for the four listed experiment types. Pivoted to one row per stratum_key
# and one column per display_col, holding f1_score.
mask = (
    (df["split"] == "val")
    & (df["stratifier"] == "info.height")
    & (df["task"] == "constituent-ontonotes")
    & df["exp_type"].isin(["elmo-chars", "elmo-full", "elmo-ortho", "openai-cat"])
)
sheet_df = df[mask].pivot(index="stratum_key", columns="display_col", values="f1_score")
sheet_df = sheet_df.reindex(
    index=sorted(sheet_df.index),
    columns=sorted(sheet_df.columns, key=exp_type_sort_key),
)
print(sheet_df.to_csv())

In [None]:
# Same layout as an f1_score sheet, but holding f1_errn95. This cell reuses
# whatever `mask` an earlier cell left in the namespace.
sheet_df = df[mask].pivot(index="stratum_key", columns="display_col", values="f1_errn95")
sheet_df = sheet_df.reindex(
    index=sorted(sheet_df.index),
    columns=sorted(sheet_df.columns, key=exp_type_sort_key),
)
print(sheet_df.to_csv())

## Plot by span distance

Absolute distance (in number of tokens under model tokenization) between midpoint of `span1` and midpoint of `span2`.

In [None]:
# Show the column labels of df.
df.columns

In [None]:
# Distinct values of the `task` column in df.
df['task'].unique()

In [None]:
# Validation rows of srl-conll2012 that have a stratifier set, for the four
# listed experiment types. Pivoted to one row per stratum_key and one column
# per display_col, holding f1_score.
mask = (
    df["stratifier"].notna()
    & (df["task"] == "srl-conll2012")
    & (df["split"] == "val")
    & df["exp_type"].isin(["elmo-chars", "elmo-full", "elmo-ortho", "openai-cat"])
)
sheet_df = df[mask].pivot(index="stratum_key", columns="display_col", values="f1_score")
sheet_df = sheet_df.reindex(
    index=sorted(sheet_df.index),
    columns=sorted(sheet_df.columns, key=exp_type_sort_key),
)
print(sheet_df.to_csv())

## Plot by label for Relations

In [None]:
# Validation rows of rel-semeval whose label passes is_positive_relation.
# Pivoted to one row per label and one column per exp_type, holding f1_score.
mask = (
    (df["task"] == "rel-semeval")
    & (df["split"] == "val")
    & df["label"].map(is_positive_relation)
)
sheet_df = df[mask].pivot(index="label", columns="exp_type", values="f1_score")
sheet_df = sheet_df.reindex(
    index=sorted(sheet_df.index, key=task_sort_key),
    columns=sorted(sheet_df.columns, key=exp_type_sort_key),
)
print(sheet_df.to_csv())

## Plot by label for SRL core, non-core

In [None]:
# Validation rows of srl-conll2012 whose label passes is_core_or_noncore.
# Pivoted to one row per label and one column per exp_type, holding f1_score.
mask = (
    df["label"].map(is_core_or_noncore)
    & (df["task"] == "srl-conll2012")
    & (df["split"] == "val")
)
sheet_df = df[mask].pivot(index="label", columns="exp_type", values="f1_score")
sheet_df = sheet_df.reindex(
    index=sorted(sheet_df.index, key=task_sort_key),
    columns=sorted(sheet_df.columns, key=exp_type_sort_key),
)
print(sheet_df.to_csv())

In [None]:
# Distinct exp_type values among the rows selected by the current `mask`.
df[mask].exp_type.unique()

In [None]:
_SCORE_COL = "f1_score"

# Rows of srl-conll2012 whose label passes is_core_role.
plot_df = df[df["label"].map(is_core_role) & (df["task"] == "srl-conll2012")].copy()

# One bar per (task, exp_type) pair, labelled with the score to two decimals.
plot_df['row_key'] = list(zip(plot_df['task'], plot_df['exp_type']))
plot_df['fmt_score'] = plot_df[_SCORE_COL].map(lambda s: "{:.02f}".format(s))
print("Found %s entries" % len(plot_df))
long_ds = bokeh.models.ColumnDataSource(data=plot_df)

# `categories` and `fill_cmap` are not defined in this cell; they must
# already exist in the notebook namespace.
factor_range = bokeh.models.FactorRange(
    *categories,
    range_padding=0.5,
    range_padding_units='absolute',
)
tools = "save,reset"

p = bp.figure(x_range=factor_range, y_range=[0,1], width=1250, tools=tools)
p.vbar(
    x='row_key',
    top=_SCORE_COL,
    width=0.95,
    fill_color=fill_cmap,
    line_color="Gray",
    source=long_ds,
)

# Score text for each bar, rotated by 90 degrees.
label_kw = dict(
    text_align="right",
    text_baseline="middle",
    y_offset=-3,
    text_font_size="11pt",
    angle=90,
    angle_units='deg',
)
score_labels = bokeh.models.LabelSet(
    x='row_key',
    y=_SCORE_COL,
    text="fmt_score",
    source=long_ds,
    **label_kw,
)
p.add_layout(score_labels)
p.xaxis.major_label_orientation = 1
p.yaxis.bounds = (0,1)

bp.show(p)