In [79]:
import os
import re
import glob
import json
import shutil
import collections
import pandas as pd
import numpy as np

# pubmed general stats table

In [107]:
pubmed_stats_dict = json.load(open("../data/pubmed2024_stats.json"))

In [108]:
df = pd.DataFrame(pubmed_stats_dict)[["sjr_percentiles","h-index_percentiles"]]
df = df.rename(columns={"sjr_percentiles":"sjr","h-index_percentiles":"h-index"})
df.index.name = "percentiles"
df = df.reset_index()
df["percentiles"] = df["percentiles"].apply(lambda x: x.replace("%","\\%"))
df

Unnamed: 0,percentiles,sjr,h-index
0,12.5\%,0.0,21.0
1,25\%,0.0,53.0
2,37.5\%,0.462,77.0
3,50\%,0.759,103.0
4,62.5\%,0.984,142.0
5,75\%,1.312,190.0
6,87.5\%,2.148,276.0


In [109]:
# format columns for rotation latex 
#tdf.columns = [ "\\rotatebox{90}{" + c + "}" for c in tdf.columns]
# get latex string
latex_percentiles = df.to_latex(
    caption="Percentiles Measured on PubMed Baseline dataset",
    label="tab:percentiles",
    float_format="%.2f",
    index=False,
    position="!ht",
    multicolumn_format='c',
)
# convert rules as hlines
for s in ["top","mid","bottom"]:
    to_replace = '\\' + f'{s}rule'
    if s != "top" :
        latex_percentiles = latex_percentiles.replace(to_replace , '\hline')
    else :
        latex_percentiles = latex_percentiles.replace(to_replace , '')

open("tables/percentiles.tex",'w').write(latex_percentiles)

367

# experiments description table 

In [1]:
bounds = {
    "none":[
        (None,None,"all"),
    ],
    "random":[
        (0.0,0.5,"50%"),
        (0.0,0.25,"25%")
    ],
    "h-index":[
        (103,1400,"top50%"),
        (53,190,"mid50%"),
        (190,1400,"top25%"),
        (77,142,"mid25%"),
    ],
    "sjr":[
        (1.312,100.0,"top25%"),
        (0.462,0.984,"mid25%"),
        (0.759,100.0,"top50%"),
    ]
}

In [2]:
exp_list = []
for metric, exps in bounds.items():
    for low, up, name in exps :
        if '%' in name : 
            percent = name[-3:]
            criteria = name[:-3]
        elif name == "all":
            criteria = ""
            percent = "100%"
        exp_list.append({
        "metric":metric,
        "criteria":criteria,
        "remaining":percent,
        })
exp_df = pd.DataFrame(exp_list)
exp_df

NameError: name 'pd' is not defined

In [4]:
# format columns for rotation latex 
#tdf.columns = [ "\\rotatebox{90}{" + c + "}" for c in tdf.columns]
# get latex string
latex_exp = exp_df.to_latex(
    caption="Description of experiences realised",
    label="tab:experiences_desc",
    float_format="%.2f",
    index=False,
    position="!ht",
)
# convert rules as hlines
for s in ["top","mid","bottom"]:
    to_replace = '\\' + f'{s}rule'
    if s != "top" :
        latex_exp = latex_exp.replace(to_replace , '\hline')
    else :
        latex_exp = latex_exp.replace(to_replace , '')

open("tables/experiences_desc.tex",'w').write(latex_exp)

394

# training losses plot

In [None]:
import matplotlib.pyplot as plt

In [None]:
x_step_dict = {}
y_train_loss_dict = {}
for trainer_state in glob.glob("../pretraining/*/trainer_state.json"):
    model = trainer_state.split('/')[-2]
    x_step_dict[model] = []
    y_train_loss_dict[model] = []
    for log_step_dict in json.load(open(trainer_state))["log_history"]:
        if 'loss' in log_step_dict:
            y_train_loss_dict[model].append(log_step_dict['loss'])
        elif 'train_loss' in log_step_dict:
            y_train_loss_dict[model].append(log_step_dict['train_loss'])
        else:
            print("no loss on step : ",log_step_dict)
        x_step_dict[model].append(log_step_dict['step'])

In [None]:
for model in y_train_loss_dict :
    plt.plot(x_step_dict[model], y_train_loss_dict[model], label='model')
plt.legend(loc='best')
plt.show()

# general blurb results table

In [84]:
exp_results = []
for pred_path in glob.glob("out/*/predict_results.json"):
    if "debug" in pred_path: continue
    metrics_results = json.load(open(pred_path))
    metrics_results = {k.replace("predict_","").replace("overall_",""):v for k,v in metrics_results.items()}
    # renaming
    pred_path = re.sub("ncbi_disease","ncbi-disease",pred_path)
    pred_path = re.sub('blurb-','',pred_path)
    pred_path = re.sub('_ckpt-[0-9]*','',pred_path)
    pred_path = re.sub('%','\\%',pred_path)
    if "bert-base" not in pred_path:
        pred_path = pred_path.replace("_","-",1)
    # parsing
    model, corpus, seed = pred_path.split('/')[-2].split('_')
    if model == "none-all": continue
    config = {"model":model,"corpus":corpus,"seed":int(seed.replace('seed',''))}
    exp_results.append(config|metrics_results)
df = pd.DataFrame(exp_results).sort_values(by=["model","corpus","seed"])
df = df[["model","corpus","seed","accuracy","f1","pearsonr"]]

In [85]:
metric_dataset = {
    "f1":["bc5chem","bc5disease","jnlpba","ncbi-disease","bc2gm","hoc","ddi-corpus","chemprot","gad"],
    "accuracy":["pubmed-qa","bioasq-task-b",],
    "pearsonr":["biosses"],
}
def get_perf(row):
    for m,d in metric_dataset.items():
        if row["corpus"] in d:
            return row[m]
# Group by model and corpus and calculate mean and std
_FORMAT_WITH_STD = "${:.2f}_{{\pm{:.2f}}}$"
_FORMAT_WITHOUT_STD = "{:.2f}"
str_mean_and_std = lambda x : _FORMAT_WITHOUT_STD.format(np.mean(x)*100,np.std(x)*100)
agg_df = df.groupby(["model","corpus"]).agg(str_mean_and_std)
agg_df = agg_df.reset_index()
agg_df["performance"] = agg_df.apply(get_perf,axis=1)
agg_df = agg_df[["model","corpus","performance"]]

In [86]:
tasks_datasets = {
    "ner" : ["bc5chem","bc5disease","bc2gm","jnlpba","ncbi-disease",],
    "document_classification" : ["hoc",],
    "pico":["ebm_pico",],
    "sentence_similarity" : ["biosses",],
    "relation_extraction" : ["chemprot","ddi-corpus","gad"],
    "qa" : ["bioasq-task-b","pubmed-qa",]
}
# we did not evaluate pico and sentence similiarity
tasks_datasets.pop("pico")
tasks_datasets.pop("sentence_similarity")
# pivot df
pivoted_df = agg_df.pivot(index='model', columns='corpus', values='performance')
pivoted_df.columns = [col for col in pivoted_df.columns]
pivoted_df = pivoted_df.drop(columns=["biosses"])
pivoted_df.index.name = None
# reorder columns grouping each task together
new_order = [data for task,data_l in tasks_datasets.items() for data in data_l]
pivoted_df = pivoted_df[new_order]
pivoted_df = pivoted_df.fillna("")

In [None]:
# Optional : to add task 
columns_tuples = [(task,dataset) for task,dataset_l in tasks_datasets.items() for dataset in dataset_l]
pivoted_df.columns = pd.MultiIndex.from_tuples(columns_tuples)

In [88]:
# Optional : 
def dataset_avg(row):
    float_values = [float(v.split('_')[0].replace('$','')) for v in row.values if v]
    micro_avg = sum(float_values)/len(row)
    return round(micro_avg,2)
    
def task_avg_multicol(row):
    N_tasks = len(tasks_datasets)
    macro_avg = 0
    for k, v in row.items():
        task = k[0] # multi index
        perf = float(v.split('_')[0].replace('$',''))
        macro_avg += perf/(N_tasks*len(tasks_datasets[task]))
    return round(macro_avg,2)

def task_avg(row):
    N_tasks = len(tasks_datasets)
    macro_avg = 0
    for dataset, perf_s in row.items():
        if not perf_s : continue
        # find task
        current_task = ""
        for task,dataset_list in tasks_datasets.items():
            if dataset in dataset_list:
                current_task = task
                break
        
        perf = float(perf_s.split('_')[0].replace('$',''))
        macro_avg += perf/(N_tasks*len(tasks_datasets[current_task]))
    return round(macro_avg,2)
        
dataset_avg_tmp = pivoted_df.apply(dataset_avg, axis=1)
task_avg_tmp = pivoted_df.apply(task_avg, axis=1)
# only add averages after calculating both ( so that there is not the first avg column that annoys
pivoted_df["Dataset avg."]=dataset_avg_tmp
pivoted_df["Task avg."]=task_avg_tmp

In [92]:
tdf = pivoted_df.transpose()

In [93]:
# OPTIONAL ( TO SEPARE MODEL NAMES WITH SAME METRIC, AND REORDER THEM )
c_tuples = []
for c in tdf.columns:
    if 'bert' in c: 
        c_tuples.append(('\multirow{2}{*}{base}','','0\%'))
        continue
    for metric in ['h-index','random','sjr','none']:
        if metric in c and metric == "random":
            c_tuples.append(('\multirow{2}{*}{'+metric+'}',"",c.replace(metric+'-','')))
            break
        elif metric in c and metric == "none":
            c_tuples.append(('\multirow{2}{*}{'+metric+'}',"","100\%"))
            break
        for criteria in ["mid","top"]:
            if metric in c and criteria in c:
                c_tuples.append((metric,criteria,c.replace(metric+'-','').replace(criteria,'')))
                break
tdf.columns = pd.MultiIndex.from_tuples(c_tuples)
order_dict = {c: i for i, c in enumerate(["\multirow{2}{*}{base}","\multirow{2}{*}{none}","\multirow{2}{*}{random}","h-index","sjr"])}
new_col_order = sorted(tdf.columns, key=lambda x: order_dict[x[0]])
tdf = tdf[new_col_order]

In [94]:
# OPTION 2 : IF SINGLE INDEX (DATASET) rename it
renamed_ind = {"pubmed-qa":"PubMed QA","bioasq-task-b":"BioASQ","ddi-corpus":"DDI","ncbi-disease":"NCBI-disease","chemprot":"ChemProt","bc5chem":"BC5-chem","bc5disease":"BC5-disease","hoc":"HoC"}
renamed_ind = renamed_ind | {d:d.upper() for d in tdf.index if d not in renamed_ind}
tdf.index =  [renamed_ind[i] for i in tdf.index]
tdf

Unnamed: 0_level_0,\multirow{2}{*}{base},\multirow{2}{*}{random},\multirow{2}{*}{random},h-index,h-index,h-index,h-index,sjr,sjr,sjr
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mid,mid,top,top,mid,top,top
Unnamed: 0_level_2,0\%,25\%,50\%,25\%,50\%,25\%,50\%,25\%,25\%,50\%
BC5-chem,87.31,90.03,90.24,89.4,89.93,89.51,89.52,89.72,89.61,89.89
BC5-disease,77.09,81.09,80.72,81.05,80.73,80.38,80.68,81.0,80.76,80.6
BC2GM,75.32,79.17,79.01,79.51,79.51,79.52,79.41,78.74,79.01,79.87
JNLPBA,76.77,78.02,77.85,77.51,77.95,78.13,78.41,78.13,78.28,77.9
NCBI-disease,81.59,84.89,84.45,85.09,84.84,84.63,84.71,84.97,84.3,84.98
HoC,79.22,84.41,84.74,84.72,84.56,84.83,84.71,84.54,85.07,84.76
ChemProt,77.07,79.25,78.83,78.94,79.72,79.0,79.92,78.77,79.62,78.96
DDI,89.11,87.54,87.7,87.91,88.27,86.46,86.8,87.05,85.92,87.76
GAD,76.82,78.09,78.24,78.31,77.38,77.34,78.39,77.42,78.35,77.0
BioASQ,72.19,75.93,75.63,75.63,75.24,74.84,76.07,75.85,75.5,75.22


In [95]:
# format max and second max
def reformat_max(row):
    floats = [float(s) for s in row.values]
    first_max = np.argmax(floats)
    floats.pop(first_max)
    second_max = np.argmax(floats)
    row.iloc[first_max] = "\\textbf{" + str(row.iloc[first_max]) + "}"
    row.iloc[second_max] = "\\underline{" + str(row.iloc[second_max]) + "}"
    return row

tdf = tdf.apply(reformat_max, axis=1)
tdf

Unnamed: 0_level_0,\multirow{2}{*}{base},\multirow{2}{*}{random},\multirow{2}{*}{random},h-index,h-index,h-index,h-index,sjr,sjr,sjr
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mid,mid,top,top,mid,top,top
Unnamed: 0_level_2,0\%,25\%,50\%,25\%,50\%,25\%,50\%,25\%,25\%,50\%
BC5-chem,87.31,\underline{90.03},\textbf{90.24},89.40,89.93,89.51,89.52,89.72,89.61,89.89
BC5-disease,77.09,\textbf{81.09},\underline{80.72},81.05,80.73,80.38,80.68,81.00,80.76,80.60
BC2GM,75.32,79.17,79.01,79.51,79.51,\underline{79.52},79.41,78.74,79.01,\textbf{79.87}
JNLPBA,76.77,78.02,77.85,77.51,77.95,78.13,\textbf{78.41},\underline{78.13},78.28,77.90
NCBI-disease,81.59,84.89,84.45,\textbf{85.09},84.84,84.63,84.71,84.97,\underline{84.30},84.98
HoC,79.22,84.41,84.74,84.72,84.56,\underline{84.83},84.71,84.54,\textbf{85.07},84.76
ChemProt,77.07,79.25,78.83,78.94,\underline{79.72},79.00,\textbf{79.92},78.77,79.62,78.96
DDI,\textbf{89.11},87.54,87.70,\underline{87.91},88.27,86.46,86.80,87.05,85.92,87.76
GAD,76.82,78.09,78.24,78.31,77.38,77.34,\textbf{78.39},\underline{77.42},78.35,77.00
BioASQ,72.19,\underline{75.93},75.63,75.63,75.24,74.84,\textbf{76.07},75.85,75.50,75.22


In [96]:
# format columns for rotation latex 
#tdf.columns = [ "\\rotatebox{90}{" + c + "}" for c in tdf.columns]
caption = """Performance of our different pretrained models on the different evaluation tasks from the BLURB benchmark \cite{guDomainSpecificLanguageModel2022}. \textit{'base'} model is the BERT$_{BASE}$ model \cite{devlinBERTPretrainingDeep2019} from which we continue the pretraining. For the task average, we average the datasets that come from the same task and then do the average of the performance on each task."""
# get latex string
latex_table = tdf.to_latex(
    caption=caption,
    label="tab:blurb_results",
    float_format="%.2f",
    position="!ht",
    multicolumn_format='c',
)
# replace table and tabular
latex_table = latex_table.replace("\\begin{table}","\\begin{table*}")
latex_table = latex_table.replace("\\end{table}","\\end{table*}")
latex_table = latex_table.replace("\\end{tabular}","\\end{NiceTabular}")
# insert lines 
lines = latex_table.split('\n')
lines = (
    lines[0:3] + 
    ['\\centering','\\begin{NiceTabular}{p{1.3cm}ccccccccccc}'] + # modifier ici l|c etc
    lines[4:6] + 
    [" \cmidrule(lr){6-9}\cmidrule(lr){10-12}"] +
    [lines[6]] + 
    ["\cmidrule(lr){2-2}\cmidrule(lr){3-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7}\cmidrule(lr){8-9}\cmidrule(lr){10-10}\cmidrule(lr){11-12}"] +
    lines[7:]
)
# add hlines
for i,l in enumerate(lines):
    if l.split(' & ')[0] in ["NCBI-disease","HoC","GAD"]:
        lines[i] = lines[i] + "\\hline"
    elif l.startswith("PubMed QA") :
        lines[i] = lines[i] + "\\hline\\hline"
lines

['\\begin{table*}[!ht]',
 "\\caption{Performance of our different pretrained models on the different evaluation tasks from the BLURB benchmark \\cite{guDomainSpecificLanguageModel2022}. \textit{'base'} model is the BERT$_{BASE}$ model \\cite{devlinBERTPretrainingDeep2019} from which we continue the pretraining. For the task average, we average the datasets that come from the same task and then do the average of the performance on each task.}",
 '\\label{tab:blurb_results}',
 '\\centering',
 '\\begin{NiceTabular}{p{1.3cm}ccccccccccc}',
 '\\toprule',
 ' & \\multirow{2}{*}{base} & \\multicolumn{2}{c}{\\multirow{2}{*}{random}} & \\multicolumn{4}{c}{h-index} & \\multicolumn{3}{c}{sjr} \\\\',
 ' \\cmidrule(lr){6-9}\\cmidrule(lr){10-12}',
 ' &  & \\multicolumn{2}{c}{} & \\multicolumn{2}{c}{mid} & \\multicolumn{2}{c}{top} & mid & \\multicolumn{2}{c}{top} \\\\',
 '\\cmidrule(lr){2-2}\\cmidrule(lr){3-3}\\cmidrule(lr){4-5}\\cmidrule(lr){6-7}\\cmidrule(lr){8-9}\\cmidrule(lr){10-10}\\cmidrule(lr){1

In [97]:
open("tables/blurb_results.tex",'w').write("\n".join(lines))

2552

['a', 'b']
