In [None]:
# Misc procedures during development

In [None]:
# Convert the TSA conll data to DatasetDict
import pandas as pd
import os
from datasets import Dataset, DatasetDict
# Input (CoNLL) and output (Arrow) folders, both located under the data root
root_folder = "data"
tsa_folder, arrow_folder = (
    os.path.join(root_folder, sub_folder)
    for sub_folder in ("tsa_conll", "tsa_arrow_2")
)

def parse_conll(raw:str, sep="\t"):
    """Parse norec-fine CoNLL text (token<sep>tag lines plus a sentence id) into dicts.

    Sentences are separated by blank lines. Within a sentence, a line that
    starts with "#" and contains "=" is the sentence-id line: everything after
    its last "=" is kept verbatim as the id. Every other non-blank line must
    hold exactly two `sep`-separated fields, the token and its tag.

    Args:
        raw: Full text of a CoNLL file.
        sep: Separator between token and tag on a line (tab by default).

    Returns:
        A list with one dict per sentence, in input order, each of the form
        {"idx": str, "tokens": list of str, "tsa_tags": list of str}.
        Empty or whitespace-only input returns an empty list.

    Raises:
        AssertionError: If a token line does not split into exactly two
            fields, or a sentence has no sentence-id line.
    """
    doc_parsed = [] # One dict per sentence. meta, tokens and tags
    for sent in raw.strip().split("\n\n"):
        if not sent.strip():
            # Empty input, or a longer run of blank lines between two sentences
            continue
        meta = ""
        tokens, tags = [], []
        for line in sent.split("\n"):
            if not line.strip():
                # Leftover blank line when sentences are separated by an odd
                # number of newlines (e.g. "\n\n\n")
                continue
            if line.startswith("#") and "=" in line:
                meta = line.split("=")[-1]
            else:
                elems = line.strip().split(sep)
                assert len(elems) == 2, f"Expected 2 fields, got {len(elems)} in line {line!r}"
                tokens.append(elems[0])
                tags.append(elems[1])
        assert len(meta) > 0, f"Sentence without a sentence-id line: {sent!r}"
        doc_parsed.append({"idx": meta, "tokens":tokens, "tsa_tags":tags})

    return doc_parsed


# File-name stem of each split on disk -> split name used in the saved DatasetDict
splits = {"train": "train", "dev": "validation", "test": "test"} # "validation" for HF naming convention
d_sets = {}
for split, hf_split in splits.items():
    path = os.path.join(tsa_folder, split+".conll")
    # Explicit encoding: open() otherwise falls back to the platform/locale default.
    # NOTE(review): assumes the .conll files are stored as UTF-8 — confirm.
    with open(path, encoding="utf8") as rf:
        conll_txt = rf.read()
    # Quick sanity print: number of blank-line separated chunks in the raw text
    print(len(conll_txt.split("\n\n")))
    sents = parse_conll(conll_txt)
    # for sent in sents:
        # sent["labels"] = [label_mapping[tag] for tag in sent["tsa_tags"]]
    d_sets[hf_split] = Dataset.from_pandas(pd.DataFrame(sents))

# Write all splits to the Arrow output folder in one go
DatasetDict(d_sets).save_to_disk(arrow_folder)
    # sentences = parse(conll_txt)
    # sentences[0]


# Scratch literals kept from development. They are bare expressions: none of
# them is bound to a name, so nothing later in the notebook can refer to them.

# All 26 labels.
["NorBERT_3_x-small", "NorBERT_3_small", "NorBERT_1", "NorBERT_2", "NB-BERT_base", "ScandiBERT", "mBERT", "XLM-R_base", "NorBERT_3_base", "XLM-R_large", "NB-BERT_large", "NorBERT_3_large", "Lenge-NorBERT_x-small", "Lenge-NorBERT_small", "Lenge-NorBERT_base", "NorT5_x-small", "NorT5_small", "NorT5_base", "NorT5_large", "Combined", "Oversampled", "NAK", "NCC", "mC4", "Wiki", "NBDigital"]

# The 19 labels from the full list that appear as keys in the dict below.
["NorBERT_3_x-small", "NorBERT_3_small", "NorBERT_1", "NorBERT_2", "NB-BERT_base", "ScandiBERT", "mBERT", "XLM-R_base", "NorBERT_3_base", "XLM-R_large", "NB-BERT_large", "NorBERT_3_large",  "Combined", "Oversampled", "NAK", "NCC", "mC4", "Wiki", "NBDigital"]

# The remaining 7 labels (Lenge-NorBERT_* and NorT5_*), which have no key in the dict below.
["Lenge-NorBERT_x-small", "Lenge-NorBERT_small", "Lenge-NorBERT_base", "NorT5_x-small", "NorT5_small", "NorT5_base", "NorT5_large"]

# Label -> identifier string for the 19 labels listed above.
# NOTE(review): the values look like Hugging Face hub ids — confirm.
# "NorBERT_3_base" and "Combined" both point at "ltg/norbert3-base".
{"NorBERT_3_x-small": "ltg/norbert3-xs", 
"NorBERT_3_small": "ltg/norbert3-small", 
"NorBERT_1": "ltg/norbert" , 
"NorBERT_2":"ltg/norbert2",
"NB-BERT_base": "NbAiLab/nb-bert-base",
"ScandiBERT": "vesteinn/ScandiBERT", 
"mBERT": "bert-base-multilingual-cased", 
"XLM-R_base": "xlm-roberta-base",  
"NorBERT_3_base":"ltg/norbert3-base",
"XLM-R_large": "xlm-roberta-large",
"NB-BERT_large": "NbAiLab/nb-bert-large", 
"NorBERT_3_large": "ltg/norbert3-large",  
"Combined": "ltg/norbert3-base", 
"Oversampled":"ltg/norbert3-oversampled-base", 
"NAK": "ltg/norbert3-nak-base", 
"NCC":"ltg/norbert3-ncc-base",
"mC4":"ltg/norbert3-c4-base",
"Wiki": "ltg/norbert3-wiki-base",
"NBDigital": "ltg/norbert3-nb-base"}

In [None]:
import json
import os
# If you want a list of models with where to get them from
# Display label -> model identifier, dumped to a JSON config file below.
# NOTE(review): the values look like Hugging Face hub ids — confirm.
models = {
    "NorBERT_3_x-small": "ltg/norbert3-xs",
    "NorBERT_3_small": "ltg/norbert3-small",
    "NorBERT_1": "ltg/norbert",
    "NorBERT_2": "ltg/norbert2",
    "NB-BERT_base": "NbAiLab/nb-bert-base",
    "ScandiBERT": "vesteinn/ScandiBERT",
    "mBERT": "bert-base-multilingual-cased",
    "XLM-R_base": "xlm-roberta-base",
    "NorBERT_3_base": "ltg/norbert3-base",
    "XLM-R_large": "xlm-roberta-large",
    "NB-BERT_large": "NbAiLab/nb-bert-large",
    "NorBERT_3_large": "ltg/norbert3-large",
}

models_path = "configs/models_name_addr1.json"
# Create the target folder first: open(..., "w") does not create missing
# parent directories and would fail in a fresh checkout.
os.makedirs(os.path.dirname(models_path), exist_ok=True)
with open(models_path, "w", encoding="utf8") as wf:
    json.dump(models, wf)

## Below: analysis of the JSON log files

In [1]:
from pathlib import Path
import json
# Load json files with details on each epoch for each experiment. Get the best epoch
# One (experiment id, parsed log) pair per file in logs/jsons.
# The id is the file stem with its last 4 characters cut off.
jsons = []
for p in Path("logs/jsons").iterdir():
    jsons.append((p.stem[:-4], json.loads(p.read_text())))

best_epochs = []
for idx, log_entries in jsons:
    # Only the entries that carry "eval_f1" are ranked, highest score first
    ranked = sorted(
        (entry for entry in log_entries if "eval_f1" in entry),
        key=lambda entry: entry["eval_f1"],
        reverse=True,
    )
    top = ranked[0]
    summary = {"idx": idx, "epoch": int(top["epoch"]), "eval_f1": top["eval_f1"]}
    best_epochs.append(summary)
    print(summary)


{'idx': '04201157_tsa_NorBERT_3_base', 'epoch': 2, 'eval_f1': 0.5235955056179775}
{'idx': '04201157_tsa_NB-BERT_base', 'epoch': 6, 'eval_f1': 0.5252873563218391}
{'idx': '04201157_tsa_XLM-R_base', 'epoch': 8, 'eval_f1': 0.4937192790824686}


In [4]:
import pandas as pd
import os, sys, json
from pathlib import Path

records = [] # List of dicts that have the model name injected
for log_path in Path("logs/jsons").iterdir():
    # Stem layout used here: <timestamp>_<task>_<model name><4 trailing characters>,
    # e.g. the logged id "04201157_tsa_NorBERT_3_base" plus its 4-character tail.
    p_stem = log_path.stem
    stem_segments = p_stem.split("_")
    ts, task = stem_segments[0], stem_segments[1]
    # The model name starts right after "<timestamp>_<task>_". The offset is
    # computed from the two segments instead of a fixed 13, so it stays in
    # step with `ts` and `task` when their lengths are not 8 and 3.
    m_name = p_stem[len(ts) + len(task) + 2:-4]
    log = json.loads(log_path.read_text())

    # Keep the entries that carry "eval_f1", highest score first
    epoch_eval = [ee for ee in log if "eval_f1" in ee]
    epoch_eval = sorted(epoch_eval, key=lambda ee: ee["eval_f1"], reverse=True)
    for i, epoch_log in enumerate(epoch_eval):
        # Rank 0 is the best epoch of this run, rank 1 the runner-up
        epoch_log.update({"timestamp":ts,"model":m_name, "task":task,"best_epoch": i==0, "second_best": i==1})
        records.append(epoch_log)

df_all = pd.DataFrame.from_records(records)

# Keep only the best and second-best epoch of every run
report_cols = ["model", "task", "epoch", "eval_f1", "best_epoch", "second_best"]
df = df_all[df_all["best_epoch"] | df_all["second_best"]].sort_values(["task", "eval_f1"], ascending=False)
# NOTE: to_clipboard() needs a system clipboard and fails on headless machines
df[report_cols].to_clipboard()
# df.to_csv("output/dev_evals.csv", index=False) # Write this for reporting and analysis


# Displayed result: the same rows, ordered by model and then score (both descending)
df[report_cols].sort_values(["model", "eval_f1"], ascending=False)
# df

Unnamed: 0,model,task,epoch,eval_f1,best_epoch,second_best
20,XLM-R_base,tsa,8.0,0.493719,True,False
21,XLM-R_base,tsa,7.0,0.491877,False,True
0,NorBERT_3_base,tsa,2.0,0.523596,True,False
1,NorBERT_3_base,tsa,3.0,0.523167,False,True
10,NB-BERT_base,tsa,6.0,0.525287,True,False
11,NB-BERT_base,tsa,7.0,0.519016,False,True


The script below is redundant now that the best epoch is saved in the config JSON.

In [5]:
# Create setup dict structure for the final testing with their best epochs from dev testing
# With this you can train new models with the best epochs from dev testing for a number of times to get mean and std.
# You can write a script based on seq_label.py to load the information in the json and train each model with different seeds, and test on the test split

# One entry per record flagged as the best epoch of its run
test_setup = [
    {
        "model": rec["model"],
        "task": rec["task"],
        "epochs": int(rec["epoch"]),
        "eval_f1": rec["eval_f1"],
        "tested": False,
    }
    for rec in records
    if rec["best_epoch"]
]
# Make sure the output folder exists (the write itself is currently disabled)
Path("configs/evals").mkdir(exist_ok=True, parents=True)
# Path("configs/evals/test_setup.json").write_text(json.dumps(test_setup))
test_setup

[{'model': 'NorBERT_3_base',
  'task': 'tsa',
  'epochs': 2,
  'eval_f1': 0.5235955056179775,
  'tested': False},
 {'model': 'NB-BERT_base',
  'task': 'tsa',
  'epochs': 6,
  'eval_f1': 0.5252873563218391,
  'tested': False},
 {'model': 'XLM-R_base',
  'task': 'tsa',
  'epochs': 8,
  'eval_f1': 0.4937192790824686,
  'tested': False}]

In [None]:
# Just another view of the dataframe
# Hide every column whose name contains one of these fragments
hidden_fragments = ("second", "step", "runtime")
cols = [
    c for c in df_all.columns
    if not any(fragment in c for fragment in hidden_fragments)
]
# Rows of the tsa task for the NorBERT_3_base model, lowest eval_f1 first
is_tsa_norbert3_base = (df_all.task == "tsa") & (df_all.model == "NorBERT_3_base")
df_all[cols][is_tsa_norbert3_base].sort_values("eval_f1")