# Edgeprobe Aggregate Analysis

This notebook is intended to be run on the output of the [`analyze_runs.py`](analyze_runs.py) script; run that on a folder of experiments to produce a `scores.tsv` file that can be loaded here.

In [1]:
import sys, os, re, json
from importlib import reload
import itertools
import collections

import numpy as np
import pandas as pd

import analysis
# Re-import the local `analysis` module so edits to it are picked up
# without restarting the kernel.
reload(analysis)

# Short aliases for constants defined in the `analysis` module.
tasks = analysis.TASKS
exp_types = analysis.EXP_TYPES
palette = analysis.EXP_PALETTE

# Sort keys used below to order the rows (tasks) and columns (experiment
# types) of the pivoted spreadsheet tables.
task_sort_key = analysis.task_sort_key
exp_type_sort_key = analysis.exp_type_sort_key

from scipy.special import logsumexp
from scipy.stats import entropy

def softmax(x, axis=None):
    """Softmax of `x` along `axis` (over all elements when `axis` is None).

    Computed in log space: the log-sum-exp normalizer is subtracted from `x`
    before exponentiating, which equals exp(x) / sum(exp(x)).
    """
    log_normalizer = logsumexp(x, axis=axis, keepdims=True)
    return np.exp(x - log_normalizer)

In [2]:
import bokeh
import bokeh.plotting as bp
bp.output_notebook()

import datetime
import socket
def get_compact_timestamp():
    """Return the current local time as a 'YYYYMMDD.HHMMSS' string."""
    return datetime.datetime.now().strftime("%Y%m%d.%H%M%S")

def _save_figure_to_bucket(fig, name, title=None, export_format="html"):
    now = get_compact_timestamp()
    fname = f"{name}.{now:s}.{export_format}"
    title = title or name
    if fname.endswith('.png'):
        bokeh.io.export_png(p, os.path.join("/tmp", fname))
    else:
        bp.save(p, os.path.join("/tmp", fname), title=title, 
                resources=bokeh.resources.CDN)
    hostname = socket.gethostname()
    GCP_PROJECT="edge-probing"
    !gsutil cp /tmp/$fname gs://$GCP_PROJECT/$hostname/plots/$fname
    !gsutil acl ch -u AllUsers:R gs://$GCP_PROJECT/$hostname/plots/$fname
    url = f"https://storage.googleapis.com/{GCP_PROJECT}/{hostname}/plots/{fname}"
    print(f"Public URL: {url}")
    return url

In [3]:
ID_COLS = ['run', 'task', 'split']

def agg_label_group(df, task_predicate, label_predicate, group_name):
    """Sum the `*_count` columns over a group of labels, per (run, task, split).

    A row is included when `task_predicate` holds for its task and
    `label_predicate` holds for its label. The result has one row per
    ID_COLS group, with the `label` column set to `group_name`.
    """
    count_cols = [col for col in df.columns if col.endswith("_count")]
    task_ok = df['task'].map(task_predicate)
    label_ok = df['label'].map(label_predicate)
    selected = df[task_ok & label_ok]
    sdf = (selected
           .groupby(by=ID_COLS)
           .agg(dict.fromkeys(count_cols, "sum"))
           .reset_index())
    sdf['label'] = group_name
    return sdf

def agg_stratifier_group(df, stratifier, key_predicate, group_name):
    """Sum the `*_count` columns over selected strata, per (run, task, split).

    A row is included when its `stratifier` value equals `stratifier` and
    `key_predicate` holds for its `stratum_key`. The result has one row per
    ID_COLS group, with the `label` column set to `group_name`.
    """
    count_cols = [col for col in df.columns if col.endswith("_count")]
    keep = []
    for row_stratifier, row_key in zip(df['stratifier'], df['stratum_key']):
        # `and` short-circuits, so key_predicate is never called on keys
        # that belong to a different stratifier.
        keep.append(row_stratifier == stratifier and key_predicate(row_key))
    selected = df[keep]
    sdf = (selected
           .groupby(by=ID_COLS)
           .agg(dict.fromkeys(count_cols, "sum"))
           .reset_index())
    sdf['label'] = group_name
    return sdf

def load_scores_file(filename, tag=None, seed=None):
    """Load one scores.tsv file and add aggregate rows and experiment metadata.

    Args:
        filename: path to a tab-separated scores file with a header row.
        tag: optional label for this file; when not None it is stored in a
            leading "tag" column.
        seed: value stored in the "seed" column of every row.

    Returns:
        DataFrame holding the rows of the file followed by the custom
        aggregate rows (labels "_core_", "_non_core_", "_clean_micro_",
        "_pos_", "_nonterminal_"), with "exp_name", "layer_num" and
        "exp_type" columns derived from each row's run path.
    """
    df = pd.read_csv(filename, sep="\t", header=0)
    # Drop the unnamed leading column if present (presumably the index saved
    # with the file); tolerate files that were written without it.
    df = df.drop(columns=['Unnamed: 0'], errors="ignore")
    df['task'] = df['task'].map(analysis.clean_task_name)
    # Files without stratified results get placeholder columns, so the
    # stratifier aggregations below can run unconditionally.
    if "stratifier" not in df.columns:
        df["stratifier"] = None
    if "stratum_key" not in df.columns:
        df["stratum_key"] = 0

    ###
    # Add additional custom aggregations
    _eg = []
    # SRL core, non-core, and cleaned micro F1
    _eg.append(agg_label_group(df, analysis.is_srl_task, analysis.is_core_role, "_core_"))
    _eg.append(agg_label_group(df, analysis.is_srl_task, analysis.is_non_core_role, "_non_core_"))
    _eg.append(agg_label_group(df, analysis.is_srl_task, analysis.is_core_or_noncore, "_clean_micro_"))
    # Constituents: split into POS, nonterminals
    _eg.append(agg_stratifier_group(df, 'info.height', lambda x: int(x) == 1, "_pos_"))
    _eg.append(agg_stratifier_group(df, 'info.height', lambda x: int(x) > 1, "_nonterminal_"))
    # Relations: ignore negative class (no_relation)
    _eg.append(agg_label_group(df, analysis.is_relation_task, analysis.is_positive_relation, "_clean_micro_"))
    df = pd.concat([df] + _eg, ignore_index=True, sort=False)

    # The experiment name is the directory that contains the run directory.
    df.insert(0, "exp_name", df['run'].map(lambda p: os.path.basename(os.path.dirname(p.strip("/")))))
    df.insert(1, "exp_type", df['exp_name'].map(analysis.get_exp_type))
    # Inserted at position 1 as well, so "layer_num" ends up before "exp_type".
    df.insert(1, "layer_num", df['exp_name'].map(analysis.get_layer_num))
    if tag is not None:
        df.insert(0, "tag", tag)
    df.insert(1, "seed", seed)
    return df

## Specify score files and load

In [4]:
# Add (tag, path/to/scores.tsv) tuples here; results will be concatenated.
score_files = [
#     ("base", "/nfs/jiant/exp/iftenney/20190721-test-ep-bert/stats.tsv"),
#     ("base", "/nfs/jiant/exp/iftenney/20190721-test-ep-bert-medium/stats.tsv"),
    ("base", "/nfs/jiant/exp/iftenney/20190721-bert-base-layers/scores.tsv"),
]

# Load every file under its tag, then stack them into one long-form frame.
dfs = []
for tag, score_file in score_files:
    dfs.append(load_scores_file(score_file, tag=tag))

df = pd.concat(dfs, ignore_index=True, sort=False)
def _format_display_col(exp_type, layer_num, tag):
    ret = exp_type
    if layer_num:
        ret += f"-{layer_num}"
    if tag:
        ret += f" ({tag})"
    return ret

# One display label per row, built from its experiment type, layer number and tag.
df['display_col'] = [
    _format_display_col(exp_type, layer_num, row_tag)
    for exp_type, layer_num, row_tag in zip(df.exp_type, df.layer_num, df.tag)
]
# Sanity check: which tasks and experiment types were loaded.
print(df['task'].unique())
print(df['exp_type'].unique())

['rel-semeval' 'coref-ontonotes']
['bert-base-uncased-mix']


In [5]:
# Compute the metric columns from the confusion-matrix counts. The return
# value is not used here, so this presumably adds the columns to `df` in
# place — TODO confirm against analysis.score_from_confusion_matrix.
analysis.score_from_confusion_matrix(df)

def _get_final_score(row):
    """Return the (score, 95% error) pair used as the final metric: F1 and its error.

    `row` may be a single row or a whole DataFrame; it is only indexed by
    column name.
    """
    return row['f1_score'], row['f1_errn95']

# Apply the accessor to the whole frame at once instead of iterating over
# rows: the values are the same, it avoids the slow per-row iterrows() loop,
# and an empty frame no longer fails on tuple unpacking.
df['score'], df['score_errn95'] = _get_final_score(df)

In [6]:
df.head()

Unnamed: 0,tag,seed,exp_name,layer_num,exp_type,run,task,split,label,fn_count,...,accuracy,precision,recall,f1_score,accuracy_errn95,precision_errn95,recall_errn95,f1_errn95,score,score_errn95
0,base,,bert-base-uncased-mix_00-edges-rel-semeval,0,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,val,"Cause-Effect(e1,e2)",32,...,0.961706,0.571429,0.333333,0.421053,0.011096,0.183303,0.133361,0.154394,0.421053,0.154394
1,base,,bert-base-uncased-mix_00-edges-rel-semeval,0,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,val,"Cause-Effect(e2,e1)",40,...,0.950392,0.776316,0.59596,0.674286,0.012555,0.093688,0.096663,0.095152,0.674286,0.095152
2,base,,bert-base-uncased-mix_00-edges-rel-semeval,0,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,val,"Component-Whole(e1,e2)",32,...,0.964317,0.808511,0.542857,0.649573,0.010726,0.112492,0.116701,0.114558,0.649573,0.114558
3,base,,bert-base-uncased-mix_00-edges-rel-semeval,0,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,val,"Component-Whole(e2,e1)",37,...,0.956484,0.717391,0.471429,0.568966,0.011797,0.130121,0.116941,0.12318,0.568966,0.12318
4,base,,bert-base-uncased-mix_00-edges-rel-semeval,0,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,val,"Content-Container(e1,e2)",12,...,0.981723,0.756757,0.7,0.727273,0.007745,0.138246,0.142015,0.140106,0.727273,0.140106


For DPR, we need to average across multiple runs to get a good estimate of performance.

In [7]:
# DPR result rows (excluding run-info rows) that carry a seed.
mask = (df['task'] == 'dpr') & (df['label'] != "__run_info__") & df['seed'].notnull()
gb_cols = ["tag", "exp_name", "exp_type", "task", "label", "split", "display_col"]
gb = df[mask].groupby(by=gb_cols)

# One "_mean_" row per group: mean score across seeds, with a 95% interval
# taken from the standard error of the mean.
new_rows = []
for key, idxs in gb.groups.items():
    seed_scores = df.loc[idxs, "score"]
    new_rows.append({
        **dict(zip(gb_cols, key)),
        "seed": "_mean_",
        "score": seed_scores.mean(),
        "score_errn95": 1.96 * np.sqrt(seed_scores.var()/len(idxs)),
    })

agg_df = pd.DataFrame.from_records(new_rows)
df = pd.concat([df, agg_df], ignore_index=True, sort=False)

For SemEval 2010 Task 8, the official metric is macro-averaged F1 over non-Other labels. Compute this so we can compare to SOTA.

In [8]:
# Per-label rows of rel-semeval that have a split and a positive relation label.
mask = (
    (df['task'] == 'rel-semeval')
    & df['split'].notnull()
    & df['label'].map(analysis.is_positive_relation)
)
_id_cols = ['run', 'split']
_agg_cols = ['score']
# Macro average: unweighted mean of the per-label scores for each (run, split).
gb = df.loc[mask, _id_cols + _agg_cols].groupby(_id_cols)
afd = gb.mean().reset_index()

# Shared CSV formatting options, reused by the spreadsheet dumps further down.
csv_args = {"float_format": "%.4f"}
print(afd.to_csv(index=False, **csv_args))

run,split,score
/nfs/jiant/exp/iftenney/20190721-bert-base-layers//bert-base-uncased-mix_00-edges-rel-semeval/run,test,0.5293
/nfs/jiant/exp/iftenney/20190721-bert-base-layers//bert-base-uncased-mix_00-edges-rel-semeval/run,val,0.5096
/nfs/jiant/exp/iftenney/20190721-bert-base-layers//bert-base-uncased-mix_01-edges-rel-semeval/run,test,0.5858
/nfs/jiant/exp/iftenney/20190721-bert-base-layers//bert-base-uncased-mix_01-edges-rel-semeval/run,val,0.5713
/nfs/jiant/exp/iftenney/20190721-bert-base-layers//bert-base-uncased-mix_02-edges-rel-semeval/run,test,0.6075
/nfs/jiant/exp/iftenney/20190721-bert-base-layers//bert-base-uncased-mix_02-edges-rel-semeval/run,val,0.5922
/nfs/jiant/exp/iftenney/20190721-bert-base-layers//bert-base-uncased-mix_03-edges-rel-semeval/run,test,0.6082
/nfs/jiant/exp/iftenney/20190721-bert-base-layers//bert-base-uncased-mix_03-edges-rel-semeval/run,val,0.6006
/nfs/jiant/exp/iftenney/20190721-bert-base-layers//bert-base-uncased-mix_04-edges-rel-semeval/run,test,0.665

## Compute clean metrics for each task

For most tasks this is just the micro or macro average F1, but we need to ignore the 0 label for coref, and drop references and continuations for SRL.

In [9]:
SPLIT = "test"
# SPLIT = "val"
mask = df['split'] == SPLIT

final_scores = []
for task in df['task'].unique():
    task_scores = df[mask & (df['task'] == task)]
    # Labels whose rows are reported for this task, in output order.
    report_labels = []
    if analysis.is_coref_task(task):
        report_labels.append("1")
        # For GAP coref, have stratified by gender
        if task.startswith("coref-gap"):
            report_labels.append("_info.pronoun_gender_MASCULINE_1_")
            report_labels.append("_info.pronoun_gender_FEMININE_1_")
    elif task == "dpr":
        # DPR reports the micro average of the seed-averaged rows only.
        is_seed_mean = task_scores['seed'] == "_mean_"
        is_micro_avg = task_scores['label'] == "_micro_avg_"
        final_scores.append(task_scores[is_seed_mean & is_micro_avg])
    elif analysis.is_srl_task(task):
        # Core and non-core roles, then the clean version that averages
        # only over core or noncore roles.
        report_labels.extend(['_core_', '_non_core_', '_clean_micro_'])
    elif analysis.is_relation_task(task):
        # Relation tasks include specific "no_relation" label
        report_labels.append('_clean_micro_')
    elif task == "noun-verb":
        # Noun-verb reports the row for the VERB class
        report_labels.append('VERB')
    else:
        report_labels.append('_micro_avg_')
    for wanted_label in report_labels:
        final_scores.append(task_scores[task_scores['label'] == wanted_label])

fdf = pd.concat(final_scores, axis=0, ignore_index=True, sort=False)
# fdf['task_and_metric'] = ["%s-%s" % tl for tl in zip(fdf.task, fdf.label)]
def format_display_row(task, label, seed):
    """Build a row label "<task>-<label>", with ":<seed>" appended when `seed` is truthy."""
    base = f"{task}-{label}"
    return f"{base}:{seed}" if seed else base

# One row label per final-score row, built from its task, label and seed.
fdf['display_row'] = list(map(format_display_row, fdf.task, fdf.label, fdf.seed))
print(len(fdf))
fdf

26


Unnamed: 0,tag,seed,exp_name,layer_num,exp_type,run,task,split,label,fn_count,...,precision,recall,f1_score,accuracy_errn95,precision_errn95,recall_errn95,f1_errn95,score,score_errn95,display_row
0,base,,bert-base-uncased-mix_00-edges-rel-semeval,0,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,test,_clean_micro_,1152,...,0.721897,0.490941,0.584429,0.001567,0.022386,0.020597,0.021455,0.584429,0.021455,rel-semeval-_clean_micro_
1,base,,bert-base-uncased-mix_01-edges-rel-semeval,1,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,test,_clean_micro_,987,...,0.755477,0.563853,0.645749,0.001478,0.020498,0.020432,0.020465,0.645749,0.020465,rel-semeval-_clean_micro_
2,base,,bert-base-uncased-mix_02-edges-rel-semeval,2,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,test,_clean_micro_,933,...,0.786052,0.587715,0.672566,0.001423,0.01954,0.020281,0.019904,0.672566,0.019904,rel-semeval-_clean_micro_
3,base,,bert-base-uncased-mix_03-edges-rel-semeval,3,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,test,_clean_micro_,880,...,0.799422,0.611136,0.692712,0.001386,0.01887,0.020085,0.019459,0.692712,0.019459,rel-semeval-_clean_micro_
4,base,,bert-base-uncased-mix_04-edges-rel-semeval,4,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,test,_clean_micro_,747,...,0.795383,0.669907,0.727273,0.001336,0.018111,0.019375,0.018722,0.727273,0.018722,rel-semeval-_clean_micro_
5,base,,bert-base-uncased-mix_05-edges-rel-semeval,5,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,test,_clean_micro_,674,...,0.813204,0.702165,0.753616,0.001278,0.017281,0.018842,0.018028,0.753616,0.018028,rel-semeval-_clean_micro_
6,base,,bert-base-uncased-mix_06-edges-rel-semeval,6,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,test,_clean_micro_,693,...,0.825013,0.693769,0.753721,0.00127,0.017071,0.018991,0.01798,0.753721,0.01798,rel-semeval-_clean_micro_
7,base,,bert-base-uncased-mix_07-edges-rel-semeval,7,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,test,_clean_micro_,626,...,0.833928,0.723376,0.774728,0.001224,0.016463,0.018431,0.017391,0.774728,0.017391,rel-semeval-_clean_micro_
8,base,,bert-base-uncased-mix_08-edges-rel-semeval,8,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,test,_clean_micro_,548,...,0.841924,0.757844,0.797674,0.001172,0.015843,0.01765,0.016698,0.797674,0.016698,rel-semeval-_clean_micro_
9,base,,bert-base-uncased-mix_09-edges-rel-semeval,9,bert-base-uncased-mix,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,rel-semeval,test,_clean_micro_,541,...,0.85544,0.760937,0.805426,0.001146,0.015362,0.017573,0.016393,0.805426,0.016393,rel-semeval-_clean_micro_


Pivot DataFrame to present each task on a row, and each experiment on a column.

This form is suitable to copy-paste into a spreadsheet.

In [10]:
# Pivot to wide-form for spreadsheet, and sort in (mostly) stable order.
sheet_df = fdf.pivot(index="display_row", columns="display_col", values="score")
sheet_df = sheet_df.reindex(sorted(sheet_df.columns, 
                                   key=exp_type_sort_key), axis=1)
sheet_df = sheet_df.reindex(sorted(sheet_df.index,
                                   key=task_sort_key), axis=0)
# sheet_df
print((100*sheet_df).to_csv(**csv_args))

display_row,bert-base-uncased-mix-00 (base),bert-base-uncased-mix-01 (base),bert-base-uncased-mix-02 (base),bert-base-uncased-mix-03 (base),bert-base-uncased-mix-04 (base),bert-base-uncased-mix-05 (base),bert-base-uncased-mix-06 (base),bert-base-uncased-mix-07 (base),bert-base-uncased-mix-08 (base),bert-base-uncased-mix-09 (base),bert-base-uncased-mix-10 (base),bert-base-uncased-mix-11 (base),bert-base-uncased-mix-12 (base)
coref-ontonotes-1,75.7571,82.6482,84.3835,85.4951,86.6596,87.6038,88.4244,89.2798,89.8663,90.1471,90.4958,90.4056,90.2150
rel-semeval-_clean_micro_,58.4429,64.5749,67.2566,69.2712,72.7273,75.3616,75.3721,77.4728,79.7674,80.5426,81.3395,81.6535,81.4419



Print the same format, but show the 95% confidence intervals for each score.

In [11]:
# Same wide layout as the score table, but holding the 95% error of each score.
sheet_df = fdf.pivot(index="display_row", columns="display_col", values="score_errn95")
# Put rows and columns into a (mostly) stable order.
sheet_df = sheet_df.reindex(
    index=sorted(sheet_df.index, key=task_sort_key),
    columns=sorted(sheet_df.columns, key=exp_type_sort_key),
)
# sheet_df
print((100*sheet_df).to_csv(**csv_args))

display_row,bert-base-uncased-mix-00 (base),bert-base-uncased-mix-01 (base),bert-base-uncased-mix-02 (base),bert-base-uncased-mix-03 (base),bert-base-uncased-mix-04 (base),bert-base-uncased-mix-05 (base),bert-base-uncased-mix-06 (base),bert-base-uncased-mix-07 (base),bert-base-uncased-mix-08 (base),bert-base-uncased-mix-09 (base),bert-base-uncased-mix-10 (base),bert-base-uncased-mix-11 (base),bert-base-uncased-mix-12 (base)
coref-ontonotes-1,1.0862,0.9583,0.9109,0.8787,0.8463,0.8272,0.8057,0.7788,0.7548,0.7507,0.7362,0.7348,0.7503
rel-semeval-_clean_micro_,2.1455,2.0465,1.9904,1.9459,1.8722,1.8028,1.7980,1.7391,1.6698,1.6393,1.6164,1.6019,1.6091



## Load scalar mixing weights

In [12]:
# (tag, path/to/scalars.tsv) tuples; each file is loaded with its tag and
# the results are concatenated.
scalar_files = [
    ("base", "/nfs/jiant/exp/iftenney/20190721-bert-base-layers/scalars.tsv"),
]

def load_scalars_file(filename, tag=None):
    """Load one scalars.tsv file and add experiment metadata columns.

    Args:
        filename: path to a tab-separated scalars file with a header row.
        tag: optional label for this file; when not None it is stored in a
            leading "tag" column.

    Returns:
        DataFrame with the rows of the file plus "exp_name", "exp_type" and
        "task" columns derived from each row's run path.
    """
    df = pd.read_csv(filename, sep="\t", header=0)
    # Drop the unnamed leading column if present (presumably the index saved
    # with the file); tolerate files that were written without it.
    df = df.drop(columns=['Unnamed: 0'], errors="ignore")

    # The experiment name is the directory that contains the run directory.
    df.insert(0, "exp_name", df['run'].map(lambda p: os.path.basename(os.path.dirname(p.strip("/")))))
    df.insert(1, "exp_type", df['exp_name'].map(analysis.get_exp_type))
    # The task is whatever follows "-edges-" in the experiment name.
    df.insert(2, "task", df['exp_name'].map(lambda name: analysis.clean_task_name(name.split("-edges-")[1])))
    if tag is not None:
        df.insert(0, "tag", tag)

    return df

# Load every scalars file under its tag and stack them into one frame.
dfs = []
for tag, scalar_file in scalar_files:
    dfs.append(load_scalars_file(scalar_file, tag=tag))
scalar_df = pd.concat(dfs, ignore_index=True, sort=False)
scalar_df['display_col'] = [
    "%s (%s)" % (exp_type, row_tag)
    for exp_type, row_tag in zip(scalar_df.exp_type, scalar_df.tag)
]
# ELMo models also have 'scalar_mix_0.', which is for pretraining and not used by edge probing.
_probing_suffixes = ("scalar_mix.", "scalar_mix_1.")
mask = scalar_df['scalar_set'].map(lambda s: s.endswith(_probing_suffixes))
scalar_df = scalar_df[mask].copy()
print(scalar_df['task'].unique())
print(scalar_df['exp_type'].unique())
print(len(scalar_df))
print("Scalar sets:", scalar_df['scalar_set'].unique())

['coref-ontonotes' 'rel-semeval']
['bert-base-uncased-mix']
26
Scalar sets: ['sent_encoder._text_field_embedder.scalar_mix.']


In [13]:
# Count total scalar columns
scalar_columns = collections.OrderedDict(sorted(
    [(int(m.group(1)), m.group(0)) for m in 
     (re.match("^scalar_parameters\.(\d+)$", str(name)) for name in scalar_df.columns)
     if m]
))

# Fill NaN with -inf for scalar columns
for name in scalar_columns.values():
    scalar_df[name].fillna(value=-np.inf, inplace=True)

# Pre-fill number columns
for number in scalar_columns.keys():
    scalar_df[number] = None
scalar_df["weight_entropy"] = None
    
# Softmax over parameters in each row
num_scalars = max(scalar_columns.keys()) + 1
scalars = {}
masks = {}
for i, row in scalar_df.iterrows():
    arr = np.zeros(num_scalars, dtype=np.float32)
    for j, col in scalar_columns.items():
        arr[j] = float(row[col])
        if np.isnan(arr[j]):
            arr[j] = -np.inf
    # Softmax over row
    scalars[i] = softmax(arr)
    masks[i] = np.isfinite(arr)

# Add softmax weights back to DataFrame, with numeric column names.
# This way, we can convert to long-form for easy plotting.
for i in scalar_df.index:
    for j in scalar_columns.keys():
        scalar_df.loc[i, j] = scalars[i][j]
    # Compute entropy
    scalar_df.loc[i, "weight_entropy"] = entropy(scalars[i], base=2)
    scalar_df.loc[i, "weight_kl_unif"] = entropy(scalars[i], qk=masks[i], base=2)
    # Compute expectation
    weighted_layers = scalars[i] * np.arange(len(scalars[i])) * masks[i]
    scalar_df.loc[i, "weight_exp_layer"] = np.sum(weighted_layers)
    scalar_df.loc[i, "weight_exp_layer_oneplus"] = np.sum(weighted_layers[1:]) / np.sum(scalars[i][1:] * masks[i][1:])

scalar_df.head()



Unnamed: 0,tag,exp_name,exp_type,task,checkpoint,gamma,label,run,scalar_parameters.0,scalar_set,...,7,8,9,10,11,12,weight_entropy,weight_kl_unif,weight_exp_layer,weight_exp_layer_oneplus
0,base,bert-base-uncased-mix_00-edges-coref-ontonotes,bert-base-uncased-mix,coref-ontonotes,/edges-coref-ontonotes/model_state_target_trai...,1.731095,__scalar_mix__,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,0.0,sent_encoder._text_field_embedder.scalar_mix.,...,0,0,0,0,0,0,0.0,0.0,0.0,
1,base,bert-base-uncased-mix_00-edges-rel-semeval,bert-base-uncased-mix,rel-semeval,/edges-rel-semeval/model_state_target_train_va...,1.099132,__scalar_mix__,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,0.0,sent_encoder._text_field_embedder.scalar_mix.,...,0,0,0,0,0,0,0.0,0.0,0.0,
2,base,bert-base-uncased-mix_01-edges-coref-ontonotes,bert-base-uncased-mix,coref-ontonotes,/edges-coref-ontonotes/model_state_target_trai...,1.835713,__scalar_mix__,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,-0.709879,sent_encoder._text_field_embedder.scalar_mix.,...,0,0,0,0,0,0,0.7112,0.2888,0.8053,1.0
3,base,bert-base-uncased-mix_01-edges-rel-semeval,bert-base-uncased-mix,rel-semeval,/edges-rel-semeval/model_state_target_train_va...,1.097378,__scalar_mix__,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,-0.178499,sent_encoder._text_field_embedder.scalar_mix.,...,0,0,0,0,0,0,0.977378,0.022622,0.588313,1.0
4,base,bert-base-uncased-mix_02-edges-coref-ontonotes,bert-base-uncased-mix,coref-ontonotes,/edges-coref-ontonotes/model_state_target_trai...,1.777263,__scalar_mix__,/nfs/jiant/exp/iftenney/20190721-bert-base-lay...,-0.585102,sent_encoder._text_field_embedder.scalar_mix.,...,0,0,0,0,0,0,1.19747,0.387494,1.550549,1.805902


Print the scalar mixing weights for the experiments that mix all layers up to the top one (experiment names containing `_12`), in spreadsheet-friendly form:

In [14]:
# Keep only the experiments whose name contains the marker below.
matcher = "_12"
has_marker = [matcher in exp_name for exp_name in scalar_df.exp_name]
sheet_df = scalar_df[has_marker]
print(sheet_df.to_csv(**csv_args))

,tag,exp_name,exp_type,task,checkpoint,gamma,label,run,scalar_parameters.0,scalar_set,scalar_parameters.1,scalar_parameters.2,scalar_parameters.3,scalar_parameters.4,scalar_parameters.5,scalar_parameters.6,scalar_parameters.7,scalar_parameters.8,scalar_parameters.9,scalar_parameters.10,scalar_parameters.11,scalar_parameters.12,display_col,0,1,2,3,4,5,6,7,8,9,10,11,12,weight_entropy,weight_kl_unif,weight_exp_layer,weight_exp_layer_oneplus
24,base,bert-base-uncased-mix_12-edges-coref-ontonotes,bert-base-uncased-mix,coref-ontonotes,/edges-coref-ontonotes/model_state_target_train_val_73.best.th,1.6825,__scalar_mix__,/nfs/jiant/exp/iftenney/20190721-bert-base-layers//bert-base-uncased-mix_12-edges-coref-ontonotes/run,-0.4338,sent_encoder._text_field_embedder.scalar_mix.,-0.6981,-0.6875,-0.7092,-0.6423,-0.3593,0.0113,0.2223,0.8736,1.3477,0.9775,0.4410,-0.0540,bert-base-uncased-mix (base),0.03805815,0.029217781,0.029529793,0.02889403,0.030893512,0.0410006,0.05939578,0.073342934,0.14067322,0.2