### Comparing loss/validation curves and final scores across experiments

Goal:
    x- see the loss/validation curves for one dataset across a set of experiments, with respect to the optimization step (roughly proportional to epochs)
    x- including when the task is part of a Multi-Task Learning (MTL) setup
    
Works by collecting the experiment logs and merging them into a single dataframe.

TODO: 
    - test with multiple models

In [2]:
import pandas as pds
import os.path
from glob import glob
import altair as alt
import json

In [3]:
#!ls -ltrh runs

In [4]:
# Notebook-wide configuration.
# Root of the local jiant checkout; experiment logs are read from its "runs/" subfolder.
JIANT_DIR = "~/Devel/jiant/"

# Short names of the tasks whose experiments can be collected.
tasks = {
    "annodis", "gum", "pdtb", "rstdt", "stac",
    "pcc", "nldt", "prstc", "rststb", "cdtb",
    "sctb", "cstn", "tdb", "spasctb", "rrt", "ert",
}
# Subset of the tasks above that is grouped under the "pdtb" label.
pdtb_tasks = {"pdtb", "tdb", "cdtb"}

# Short model alias -> full model identifier used in the run directory names.
models = dict(
    bert="bert-base-multilingual-uncased",
    bertC="bert-base-multilingual-cased",
    roberta="xlm-roberta-base",
)

# "exp_dir" is expanded so that the leading "~" becomes the user's home directory.
config = dict(
    exp_dir=os.path.expanduser(os.path.join(JIANT_DIR, "runs/")),
    tasks=tasks,
    models=models,
)

### Learning curves (loss+validation metrics)

In [5]:
def retrieve_expe_dir(taskname,modelname,config):
    """Build the path of the directory where an experiment was logged.

    taskname: one short task name, or several separated by whitespace for a
        multi-task experiment; each is prefixed with "disrpt21_" and the
        prefixed names are joined with a single space.
    modelname: key into config["models"], giving the full model identifier.
    config: dict providing "models" and the root log directory "exp_dir".

    Returns <exp_dir>/run_<prefixed tasks>_<model identifier>.
    Raises KeyError if modelname is not a key of config["models"].
    """
    prefixed_names = ["disrpt21_" + name for name in taskname.split()]
    task_label = " ".join(prefixed_names)
    model_id = config["models"][modelname]
    run_dirname = "run_{}_{}".format(task_label, model_id)
    return os.path.join(config["exp_dir"], run_dirname)

# Sanity check: resolve the log directory of the single-task annodis experiment run with the roberta model.
testdir = retrieve_expe_dir("annodis","roberta",config)
testdir

'/home/muller/Devel/jiant/runs/run_disrpt21_annodis_xlm-roberta-base'

In [6]:
def get_last_run(path):
    """Return the path of the most recent run directory stored under path.

    Run directories are the entries of path whose name is made only of
    decimal digits (a number built from date+time), so the largest number
    is the latest run. Names are compared by their integer value rather
    than as strings, so that e.g. "1000" is correctly ranked after "999".

    Returns None when path contains no such entry (or does not exist).
    """
    runs = [
        entry for entry in glob(os.path.join(path, "*"))
        # isdecimal (not isdigit) guarantees that int() below accepts the name
        if os.path.basename(entry).isdecimal()
    ]
    if not runs:
        return None
    return max(runs, key=lambda entry: int(os.path.basename(entry)))

# Sanity check: latest numbered run directory found under testdir (None if there is none).
path = get_last_run(testdir)
path

'/home/muller/Devel/jiant/runs/run_disrpt21_annodis_xlm-roberta-base/1662972526'

In [7]:
# TODO: refactor for modularity, to have separate functions for loss and metrics, then join in a common table 

def get_log(path,logtype):
    """
    Load one of the training logs of an experiment run as a dataframe.

    path: directory of a single run (inside an experiment config dir)
    logtype: "loss" reads loss_train.zlog, "metrics" reads train_val.zlog;
             any other value raises KeyError

    For "loss" the JSON-lines file is returned as parsed, one row per record.
    For "metrics" each record is flattened into one row per task it evaluates,
    with columns task, f1, precision, recall (the task's "minor" scores),
    task_step and global_step (read from the record's "train_state").
    """
    filenames = {"loss": "loss_train.zlog", "metrics": "train_val.zlog"}
    raw = pds.read_json(os.path.join(path, filenames[logtype]), lines=True)
    if logtype == "loss":
        return raw

    # should be factored out in another function
    column_names = ("task", "f1", "precision", "recall", "task_step", "global_step")
    columns = {name: [] for name in column_names}
    for _, record in raw.iterrows():
        evaluations = record["metrics"]
        for task_name in evaluations:
            columns["task"].append(task_name)
            for score in ("f1", "recall", "precision"):
                columns[score].append(float(evaluations[task_name]["minor"][score]))
            # steps are shared by every task evaluated in this record
            columns["global_step"].append(int(record["train_state"]["global_steps"]))
            columns["task_step"].append(int(record["train_state"]["task_steps"][task_name]))
    return pds.DataFrame(columns)
        

# Sanity check: validation metrics logged during training for the run located above.
log = get_log(path,"metrics")
log

Unnamed: 0,task,f1,precision,recall,task_step,global_step
0,disrpt21_annodis,0.84143,0.857944,0.82554,99,99
1,disrpt21_annodis,0.886099,0.883721,0.888489,199,199
2,disrpt21_annodis,0.897046,0.893048,0.901079,299,299
3,disrpt21_annodis,0.899635,0.912963,0.886691,399,399
4,disrpt21_annodis,0.903285,0.916667,0.890288,499,499
5,disrpt21_annodis,0.899548,0.905282,0.893885,599,599
6,disrpt21_annodis,0.89863,0.912801,0.884892,640,640
7,disrpt21_annodis,0.89863,0.912801,0.884892,640,640


In [8]:
def collect_log(tasks,models,logtype,config):
    """
    collects a set of experiments logs in one table, for either loss or validation metrics

    For every (task, model) pair the log of the latest run is loaded and tagged
    with two extra columns: "setup" ("single", or "MTL: <task names>" when the
    task string contains a space) and "model" (the model alias).
    Pairs whose data cannot be loaded are reported on stdout and skipped.

    eg 
        collect_log(["annodis","rststb","annodis rststb"],["roberta"],"metrics",config)
        collect_log(["annodis","rststb","annodis rststb"],["roberta"],"loss",config)

    Raises ValueError (from pandas concat) if no log at all could be loaded.
    """
    merged = []
    for task in tasks:
        # a space in the task string marks a multi-task (MTL) experiment
        setup = "MTL: "+task if " " in task else "single"
        for m in models:
            try:
                path = get_last_run(retrieve_expe_dir(task,m,config))
                logs = get_log(path,logtype)
            except (OSError, KeyError, TypeError, ValueError):
                # unknown model, no run directory (path is None), missing or unreadable log file
                print("could not find data for",task,"+",m)
                # skip this pair: nothing was loaded, so nothing must be appended
                continue
            logs["setup"] = setup
            logs["model"] = m
            merged.append(logs)
    return pds.concat(merged)


def collect_expe_results(tasks,models,config):
    """Collect the validation-metric and loss curves and outer-merge them into one table.

    Both tables are gathered with collect_log for the same tasks and models;
    the merge keeps every row of either table.
    """
    metric_curves = collect_log(tasks, models, "metrics", config)
    loss_curves = collect_log(tasks, models, "loss", config)
    return pds.merge(metric_curves, loss_curves, how="outer")


In [9]:

# Merge metric and loss curves of the annodis task for two models, then count the rows belonging to roberta.
df = collect_expe_results(["annodis"],["roberta","bertC"],config)
len(df[df["model"]=="roberta"])

641

In [10]:
# Inspect 10 randomly drawn rows of the merged table.
df.sample(10)

Unnamed: 0,task,f1,precision,recall,task_step,global_step,setup,model,loss_val,TIMESTAMP
711,disrpt21_annodis,,,,63,63,single,bertC,0.110189,2022-09-15 14:03:24.629147291
1266,disrpt21_annodis,,,,624,624,single,bertC,0.000557,2022-09-15 14:13:42.466043949
1053,disrpt21_annodis,,,,409,409,single,bertC,0.002852,2022-09-15 14:09:50.795456886
824,disrpt21_annodis,,,,177,177,single,bertC,0.021488,2022-09-15 14:05:30.350619316
956,disrpt21_annodis,,,,311,311,single,bertC,0.006452,2022-09-15 14:08:00.822422743
1007,disrpt21_annodis,,,,362,362,single,bertC,0.008107,2022-09-15 14:08:51.957426548
24,disrpt21_annodis,,,,9,9,single,roberta,0.525225,2022-09-12 08:49:04.936621666
210,disrpt21_annodis,,,,196,196,single,roberta,0.034421,2022-09-12 08:52:04.843295813
870,disrpt21_annodis,,,,224,224,single,bertC,0.006375,2022-09-15 14:06:29.200415373
403,disrpt21_annodis,,,,391,391,single,roberta,0.015017,2022-09-12 08:55:26.772369862


In [11]:
# Example visualizations of loss/validation curves

# Boolean row mask keeping only the rows of the annodis task.
# NOTE(review): this name shadows the builtin `filter` for the rest of the session.
filter = (df["task"]=="disrpt21_annodis")


# Raw training loss per step, one line per model.
# Built here but not part of the chart displayed at the end of this cell.
raw_loss = alt.Chart(df[filter]).mark_line().encode(
  x='task_step',# or global_step
  y='loss_val',
  color='model:N' # setup or model
)

# Validation f1 as points; only rows that carry an f1 value are kept
# (rows coming from the loss log have no f1).
raw_f1 = alt.Chart(df[filter & (df["f1"].notnull())]).mark_point().encode(
  x='task_step',# global_step
  y='f1',
  color='model:N'
)

# Same data as raw_f1, drawn as text: the f1 value formatted as a percentage
# with one decimal, offset vertically (dy=-20).
f1_labels = raw_f1.mark_text(
    align="center",
    dy= -20
).encode(text=alt.Text("f1:Q",format=".1%"))

# rolling average 
# The window frame [-10,0] spans the current row and the 10 preceding ones.
# NOTE(review): the fixed mark color "green" is presumably overridden by the
# color encoding on "model" below — confirm.
avg_loss = alt.Chart(df[filter]).mark_line(color="green"
).transform_window(
    rolling_mean='mean(loss_val)',
    frame=[-10,0]
).encode(
    x='task_step',# global_step
    y='rolling_mean:Q',
    color="model:N"
    #color="task:N"
)

#alt.layer(avg_loss + raw_f1).interactive() #+ f1_labels
# Displayed result: smoothed loss next to the f1 points layered with their labels.
avg_loss | (raw_f1 + f1_labels)

### Reading final scores for best models

In [12]:
# CHECK: unclear whether the best model's scores are always in val_metrics.json
#    -> look into best_model.metadata.json
#       (slightly different format, though)

def read_final_metrics(path,dataset="val",best_metadata=False):
    """Read the final scores stored in an experiment directory.

    path: directory holding the metrics file.
    dataset: prefix of the file to read, "<dataset>_metrics.json"
        (ignored when best_metadata is True).
    best_metadata: if True, read "best_model.metadata.json" instead, whose
        scores are nested under ["val_state"]["metrics"] and have no
        per-task "metrics" level.

    Returns a dataframe with one row per task (the "aggregated" entry is
    skipped) and columns task, precision, recall, f1. The task column holds
    the second "_"-separated field of the task key
    (e.g. "disrpt21_annodis" -> "annodis").

    Raises OSError if the file cannot be opened, ValueError if it is not
    valid JSON, KeyError/IndexError if it does not have the expected layout.
    """
    if best_metadata:
        filename = "best_model.metadata.json"
    else:
        filename = "%s_metrics.json"%dataset
    # context manager so the file handle is closed even if parsing fails
    with open(os.path.join(path,filename), encoding="utf-8") as handle:
        data = json.load(handle)
    if best_metadata:
        data = data["val_state"]["metrics"]

    score_names = ("precision","recall","f1")
    results = []
    for task, entry in data.items():
        if task == "aggregated":
            continue
        # the two file formats differ only by the extra "metrics" level
        minor = (entry if best_metadata else entry["metrics"])["minor"]
        results.append([task.split("_")[1]] + [minor[name] for name in score_names])
    return pds.DataFrame(results,columns=["task","precision","recall","f1"])


In [13]:
# Sanity check: final validation scores stored at the top level of the annodis/roberta experiment directory.
path = retrieve_expe_dir("annodis","roberta",config)
print(path)
read_final_metrics(path)

/home/muller/Devel/jiant/runs/run_disrpt21_annodis_xlm-roberta-base


Unnamed: 0,task,precision,recall,f1
0,annodis,0.916667,0.890288,0.903285


In [14]:
def collect_final_result(tasks,models,config,dataset="val",best_metadata=False):
    """collect final results for best model for latest experiment on a collection of (task*models)

    For every (task, model) pair the final scores are read with
    read_final_metrics (dataset and best_metadata are passed through) and
    tagged with two extra columns: "setup" ("single", or "MTL: <task names>"
    when the task string contains a space) and "model" (the model alias).
    Pairs whose data cannot be loaded are reported on stdout and skipped.

    Raises ValueError (from pandas concat) if no result at all could be loaded.
    """
    merged = []
    for task in tasks:
        # a space in the task string marks a multi-task (MTL) experiment
        setup = "MTL: "+task if " " in task else "single"
        for m in models:
            try:
                path = retrieve_expe_dir(task,m,config)
                scores = read_final_metrics(path,dataset=dataset,best_metadata=best_metadata)
            except (OSError, KeyError, IndexError, TypeError, ValueError):
                # unknown model, missing/unreadable file, or unexpected file layout
                print("could not load data for",task,"+",m)
                continue
            scores["setup"] = setup
            scores["model"] = m
            merged.append(scores)

    return pds.concat(merged)


val_scores = collect_final_result(config["tasks"],["bert","bertC","roberta"],config,best_metadata=True)
#config["tasks"]

# Per-model mean and standard deviation of f1, first over the pdtb-style
# tasks, then over all the other tasks.
# The f1 column is selected *before* aggregating: aggregating the whole frame
# would also try to average the string columns (task, setup), which raises a
# TypeError on pandas >= 2.0.
is_pdtb = val_scores["task"].isin(("pdtb","tdb","cdtb"))
for title, subset in (
    ("===== Scores pdtb ========", val_scores[is_pdtb]),
    ("===== Scores segmentation ========", val_scores[~is_pdtb]),
):
    f1_by_model = subset.groupby("model")["f1"]
    print(title)
    print("---mean=",f1_by_model.mean())
    print("---std dev=",f1_by_model.std())

could not load data for sctb + bertC
could not load data for stac + bertC
could not load data for rstdt + bertC
could not load data for nldt + bertC
could not load data for tdb + bertC
could not load data for pdtb + bertC
could not load data for pcc + bertC
could not load data for cstn + bertC
could not load data for rststb + bertC
could not load data for prstc + bertC
could not load data for spasctb + bertC
could not load data for cdtb + bertC
could not load data for rrt + bertC
could not load data for ert + bertC
could not load data for gum + bertC
---mean= model
bert       0.875804
roberta    0.903032
Name: f1, dtype: float64
---std dev= model
bert       0.049528
roberta    0.046819
Name: f1, dtype: float64
---mean= model
bert       0.918460
bertC      0.884956
roberta    0.922438
Name: f1, dtype: float64
---std dev= model
bert       0.042227
bertC           NaN
roberta    0.040585
Name: f1, dtype: float64


In [15]:
# Compare multiple models across multiple datasets, with confidence-interval error bars.
# TODO: better if averaged on multiple runs
# inspired from: https://altair-viz.github.io/gallery/grouped_bar_chart_with_error_bars.html
# The three charts below are created without data; the data (val_scores) is
# supplied once, when they are layered at the end of the cell.

# One bar per model, its height being the f1 score.
scores = alt.Chart().mark_bar().encode(
    x="model",
    y="f1",
    color="model"
)

# Error bars over f1 for each model, using the 'ci' extent.
error_bars = alt.Chart().mark_errorbar(extent='ci').encode(
    x='model',
    y='f1'
)
# Same encoding as the bars, drawn as text: f1 as a percentage with one decimal,
# shifted by (dx=-10, dy=-7).
f1_labels = scores.mark_text(
    align="left",
    dy= -7, dx=-10
).encode(text=alt.Text("f1:Q",format=".1%"))


# Layer bars, error bars and labels, then draw one facet column per task.
alt.layer(scores, error_bars, f1_labels,data=val_scores).facet(
    column='task:N'
).interactive()