# Prepare Tables for Thesis from BenCzechMark (BCM) Submissions

In [1]:
# Location of the submission JSON that the rest of the notebook reads.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
submission_path = (
    "/home/mlynatom/master-thesis-repository-tomas-mlynar/bcm/submissions/"
    "b->cp_(cs+en)+it_(cs+en-alpaca+dolly)_submission.json"
)

In [2]:
import json

# Parse the submission file into `data`. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open(submission_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Metric keys used to pick the reported value out of each task's result entry.
EM = "exact_match"
AUC = "avg_mcauroc"
ACC = "acc"
PPL = "word_perplexity"

# Task name (without the "benczechmark_" prefix) -> metric key reported for it.
benchmark2metric = {
    "cs_triviaQA": EM,
    "propaganda_rusko": AUC,
    "propaganda_argumentace": AUC,
    "propaganda_emoce": AUC,
    "propaganda_zanr": AUC,
    "propaganda_zamereni": AUC,
    "ctkfacts_nli": AUC,
    "klokan_qa": ACC,
    "propaganda_nazor": AUC,
    "histcorpus": PPL,
    "propaganda_nalepkovani": AUC,
    "cs_sqad32": EM,
    "belebele": ACC,
    "propaganda_demonizace": AUC,
    "correspondence": PPL,
    "propaganda_fabulace": AUC,
    "propaganda_vina": AUC,
    "sentiment_mall": AUC,
    "hellaswag": ACC,
    "agree": AUC,
    "sentiment_fb": AUC,
    "grammarerrorcorrection": AUC,
    "cermat_czmath_open": EM,
    "cermat_czech_open": EM,
    "subjectivity": AUC,
    "cermat_czech_mc": ACC,
    "cermat_czech_tf": AUC,
    "cermat_czmath_mc": ACC,
    "propaganda_strach": AUC,
    "essay": PPL,
    "propaganda_lokace": AUC,
    "snli": AUC,
    "cs_court_decisions_ner": EM,
    "fiction": PPL,
    "sentiment_csfd": AUC,
    "umimeto_biology": ACC,
    "umimeto_chemistry": ACC,
    "umimeto_czech": ACC,
    "umimeto_history": ACC,
    "umimeto_informatics": ACC,
    "umimeto_math": ACC,
    "umimeto_physics": ACC,
    "cs_ner": EM,
    "spoken": PPL,
    "propaganda_relativizace": AUC,
    "cs_naturalquestions": EM,
    "havlicek": PPL,
    "csfever_nli": AUC,
    "history_ir": ACC,
    "dialect": PPL,
}

In [4]:
# Table title -> ordered list of task names; the list order is the column order
# of the printed row.
# NOTE(review): "propaganda_vina" has a metric in benchmark2metric but is not
# listed in any table here; confirm whether that omission is intentional.
table2tasks = {
    "czech language understanding": [
        "agree",
        "cermat_czech_tf",
        "grammarerrorcorrection",
        "cermat_czech_mc",
        "umimeto_czech",
        "cermat_czech_open",
    ],
    "czech math reasoning": [
        "umimeto_math",
        "klokan_qa",
        "cermat_czmath_mc",
        "cermat_czmath_open",
    ],
    "factual knowledge": [
        "cs_naturalquestions",
        "cs_triviaQA",
        "umimeto_biology",
        "umimeto_chemistry",
        "umimeto_history",
        "umimeto_informatics",
        "umimeto_physics",
    ],
    "language modelling": [
        "dialect",
        "essay",
        "fiction",
        "havlicek",
        "correspondence",
        "spoken",
        "histcorpus",
        "hellaswag",
    ],
    "NER": [
        "cs_court_decisions_ner",
        "cs_ner",
    ],
    "NLI": [
        "csfever_nli",
        "ctkfacts_nli",
        "propaganda_argumentace",
        "propaganda_demonizace",
        "propaganda_emoce",
        "propaganda_fabulace",
        "propaganda_nalepkovani",
        "propaganda_lokace",
        "propaganda_zamereni",
        "propaganda_nazor",
        "propaganda_relativizace",
        "propaganda_rusko",
        "propaganda_strach",
        "propaganda_zanr",
        "snli",
    ],
    "reading comprehension": [
        "belebele",
        "history_ir",
        "cs_sqad32",
    ],
    "sentiment": [
        "sentiment_csfd",
        "sentiment_fb",
        "sentiment_mall",
        "subjectivity",
    ],
}

In [5]:
# Label placed in the first cell of every printed row.
# NOTE(review): looks like Typst math markup; confirm against the thesis source.
model_name = (
    '$"B"->"CP"_("cs"+"en")'
    '+"IT"_("cs"+"en"-"dolly, alpaca")$'
)

In [6]:
# Cell emitted in place of a task without a result, so that every row keeps
# exactly one cell per task column.
MISSING_CELL = "[n/a],"

# For each table: print its title, a blank line, one row made of the model label
# cell followed by one bracketed cell per task, then a trailing blank line.
for table, tasks in table2tasks.items():
    cells = [f"[{model_name}],"]
    missing_tasks = []
    for task in tasks:
        bcm_name = "benczechmark_" + task
        if task in benchmark2metric and bcm_name in data["results"]:
            metric = benchmark2metric[task]
            metric_value = data["results"][bcm_name][metric]
            # Perplexity is reported as-is; every other metric is scaled by 100.
            if metric != PPL:
                metric_value = metric_value * 100
            # The format spec already rounds to two decimals, so no round() call.
            cells.append(f"[{metric_value:.2f}],")
        else:
            # Keep the row intact and report the gap after it, instead of
            # breaking the row with a message and a newline mid-line.
            cells.append(MISSING_CELL)
            missing_tasks.append(task)

    print(table)
    print()
    print("".join(cells))
    for task in missing_tasks:
        print(f"{task}: Not found")
    print()

czech language understanding

[$"B"->"CP"_("cs"+"en")+"IT"_("cs"+"en"-"dolly, alpaca")$],[78.17],[69.74],[61.11],[38.67],[61.00],[6.09],

czech math reasoning

[$"B"->"CP"_("cs"+"en")+"IT"_("cs"+"en"-"dolly, alpaca")$],[68.00],[19.45],[23.02],[5.97],

factual knowledge

[$"B"->"CP"_("cs"+"en")+"IT"_("cs"+"en"-"dolly, alpaca")$],[9.06],[25.75],[78.00],[72.00],[80.00],[82.00],[76.00],

language modelling

[$"B"->"CP"_("cs"+"en")+"IT"_("cs"+"en"-"dolly, alpaca")$],[393.45],[2694.34],[6078.44],[38456.99],[15511.94],[252.61],[20569.26],[50.75],

NER

[$"B"->"CP"_("cs"+"en")+"IT"_("cs"+"en"-"dolly, alpaca")$],[51.10],[57.45],

NLI

[$"B"->"CP"_("cs"+"en")+"IT"_("cs"+"en"-"dolly, alpaca")$],[80.20],[80.74],[51.03],[82.91],[58.17],[60.81],[62.98],[82.16],[83.82],[72.24],[61.41],[51.17],[65.72],[91.27],[74.53],

reading comprehension

[$"B"->"CP"_("cs"+"en")+"IT"_("cs"+"en"-"dolly, alpaca")$],[76.48],[32.50],[48.24],

sentiment

[$"B"->"CP"_("cs"+"en")+"IT"_("cs"+"en"-"dolly, alpaca")$],[89.17]