# Results extraction for paper

In [1]:
import os
import glob
import json
import shutil
import pandas as pd
import numpy as np

## General BLURB results table

In [2]:
def parse_run_dir(run_dir):
    """Split an experiment directory name into its model, corpus and seed.

    The name is expected to hold three "_"-separated fields once two known
    irregularities are normalised:

    * the corpus name "ncbi_disease" itself contains the separator, so it is
      rewritten to "ncbi-disease" first;
    * for every name that does not contain "bert-base", the first remaining
      "_" is turned into "-" (presumably because those model names contain
      one underscore of their own -- confirm against the training script).

    Parameters
    ----------
    run_dir : str
        Bare directory name (no parent path), e.g.
        "bert-base-uncased_biosses_seed0".

    Returns
    -------
    dict
        {"model": str, "corpus": str, "seed": int}

    Raises
    ------
    ValueError
        If the normalised name does not split into exactly three fields, or
        the seed field is not "seed<int>".
    """
    if "ncbi_disease" in run_dir:
        run_dir = run_dir.replace("ncbi_disease", "ncbi-disease")
    if "bert-base" not in run_dir:
        run_dir = run_dir.replace("_", "-", 1)
    model, corpus, seed = run_dir.split('_')
    return {"model": model, "corpus": corpus, "seed": int(seed.replace('seed', ''))}


# Collect one record per finished run: parsed run identity + its metrics.
exp_results = []
for pred_path in glob.glob("out/*/predict_results.json"):
    # Skip debugging runs; the check is on the whole path, as before.
    if "debug" in pred_path:
        continue
    # Context manager so the file handle is closed deterministically.
    with open(pred_path) as pred_file:
        metrics_results = json.load(pred_file)
    # Strip the "predict_" / "overall_" substrings so metric names line up
    # across tasks (e.g. "predict_overall_f1" -> "f1").
    metrics_results = {k.replace("predict_","").replace("overall_",""):v for k,v in metrics_results.items()}
    # os.path handles whichever separator glob returned on this platform.
    run_dir = os.path.basename(os.path.dirname(pred_path))
    exp_results.append(parse_run_dir(run_dir)|metrics_results)

In [3]:
# Turn the collected run records into a frame ordered by (model, corpus, seed),
# keeping only the run identity and the three metric columns used below.
report_columns = ["model","corpus","seed","accuracy","f1","pearsonr"]
df = (
    pd.DataFrame(exp_results)
    .sort_values(by=["model","corpus","seed"])
    [report_columns]
)
df.head(50)

Unnamed: 0,model,corpus,seed,accuracy,f1,pearsonr
7,bert-base-uncased,bioasq-task-b,0,0.754083,,
97,bert-base-uncased,bioasq-task-b,1,0.754083,,
180,bert-base-uncased,bioasq-task-b,2,0.757925,,
215,bert-base-uncased,bioasq-task-b,3,0.556196,,
233,bert-base-uncased,bioasq-task-b,4,0.761768,,
201,bert-base-uncased,biosses,0,,,0.866163
268,bert-base-uncased,biosses,1,,,0.864847
101,bert-base-uncased,biosses,2,,,0.866266
140,bert-base-uncased,biosses,3,,,0.866764
231,bert-base-uncased,biosses,4,,,0.865847


In [4]:
# Maps each metric column to the corpora whose score is read from that column.
metric_dataset = {
    "f1": ["blurb-bc5chem", "blurb-bc5disease", "blurb-jnlpba", "blurb-ncbi-disease", "blurb-bc2gm", "hoc"],
    "accuracy": ["pubmed-qa", "bioasq-task-b"],
    "pearsonr": ["biosses"],
}


def get_perf(row):
    """Return the value of the metric column that applies to the row's corpus.

    Looks up row["corpus"] in ``metric_dataset`` (first matching metric wins)
    and returns row[<that metric>]. Returns None when the corpus is not listed
    under any metric.
    """
    matching_metric = next(
        (metric for metric, corpora in metric_dataset.items() if row["corpus"] in corpora),
        None,
    )
    if matching_metric is None:
        return None
    return row[matching_metric]

In [5]:
def str_mean_and_std(x):
    """Format values as "<mean>%+/-<std>", both scaled by 100, two decimals.

    Mean and spread come from np.mean / np.std called with their defaults.
    """
    mean_pct = np.mean(x)*100
    std_pct = np.std(x)*100
    return "{:.2f}%+/-{:.2f}".format(mean_pct, std_pct)

# Collapse the seeds: one formatted "mean%+/-std" string per (model, corpus)
# for every remaining column.
agg_df = (
    df.groupby(["model","corpus"])
    .agg(str_mean_and_std)
    .reset_index()
)
# Pick, per row, the formatted metric that applies to that corpus.
agg_df["performance"] = agg_df.apply(get_perf,axis=1)
agg_df = agg_df[["model","corpus","performance"]]
agg_df

Unnamed: 0,model,corpus,performance
0,bert-base-uncased,bioasq-task-b,71.68%+/-8.04
1,bert-base-uncased,biosses,86.52%+/-0.11
2,bert-base-uncased,blurb-bc2gm,75.32%+/-0.79
3,bert-base-uncased,blurb-bc5chem,87.31%+/-0.57
4,bert-base-uncased,blurb-bc5disease,77.09%+/-1.01
5,bert-base-uncased,blurb-jnlpba,76.77%+/-0.86
6,bert-base-uncased,blurb-ncbi-disease,81.59%+/-1.03
7,bert-base-uncased,hoc,79.22%+/-1.14
8,bert-base-uncased,pubmed-qa,55.40%+/-0.25
9,h-index-mid25%,bioasq-task-b,75.85%+/-0.83


In [49]:
# Wide layout: one row per model, one column per corpus, cells holding the
# formatted performance string.
pivoted_df = agg_df.pivot(index='model', columns='corpus', values='performance')
# Swap the pivot's column Index for a plain list of the same labels.
pivoted_df.columns = list(pivoted_df.columns)
# biosses is left out of this table.
pivoted_df = pivoted_df.drop(columns=["biosses"])
pivoted_df

Unnamed: 0_level_0,bioasq-task-b,blurb-bc2gm,blurb-bc5chem,blurb-bc5disease,blurb-jnlpba,blurb-ncbi-disease,hoc,pubmed-qa
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bert-base-uncased,71.68%+/-8.04,75.32%+/-0.79,87.31%+/-0.57,77.09%+/-1.01,76.77%+/-0.86,81.59%+/-1.03,79.22%+/-1.14,55.40%+/-0.25
h-index-mid25%,75.85%+/-0.83,79.51%+/-0.85,89.40%+/-0.37,81.05%+/-0.88,77.51%+/-0.45,85.09%+/-0.93,84.72%+/-0.34,55.12%+/-0.16
h-index-mid50%,75.79%+/-2.89,79.51%+/-0.55,89.93%+/-0.68,80.73%+/-0.74,77.95%+/-0.63,84.84%+/-0.61,84.48%+/-0.28,55.04%+/-0.43
h-index-top25%,76.23%+/-1.93,79.52%+/-0.60,89.51%+/-0.32,80.38%+/-1.13,78.13%+/-0.53,84.63%+/-0.80,84.74%+/-0.19,54.40%+/-1.41
random-25%,75.79%+/-1.85,79.17%+/-0.42,90.03%+/-0.42,81.09%+/-1.09,78.02%+/-0.53,84.89%+/-0.69,84.41%+/-0.33,55.20%+/-0.00
sjr-mid25%,75.10%+/-2.26,78.74%+/-0.62,89.72%+/-0.40,81.00%+/-1.13,78.13%+/-0.62,84.97%+/-0.51,84.54%+/-0.57,55.20%+/-0.00
sjr-top25%,75.93%+/-1.96,79.01%+/-1.35,89.61%+/-0.74,80.76%+/-1.34,78.28%+/-0.75,84.30%+/-1.10,85.07%+/-0.51,55.24%+/-0.08


In [50]:
def dataset_avg(row):
    """Average the leading percentages of every cell in ``row``.

    Each cell is expected to be a string such as "71.68%+/-8.04"; only the
    text before the first '%' is parsed as a float. The result is the plain
    mean of those numbers (i.e. of the already-rounded values in the strings).
    """
    leading_numbers = [float(cell.split('%')[0]) for cell in row.values]
    scores = np.array(leading_numbers)
    return np.mean(scores)
# Average only over the per-corpus score columns. The derived float columns
# ("dataset_avg", and "task_avg" once it has been added) are excluded so that
# re-running this cell does not pass non-string values to dataset_avg, which
# would fail on `.split`. On a first run the result is unchanged.
corpus_columns = [col for col in pivoted_df.columns if col not in ("dataset_avg", "task_avg")]
pivoted_df["dataset_avg"] = pivoted_df[corpus_columns].apply(dataset_avg, axis=1)
pivoted_df

Unnamed: 0_level_0,bioasq-task-b,blurb-bc2gm,blurb-bc5chem,blurb-bc5disease,blurb-jnlpba,blurb-ncbi-disease,hoc,pubmed-qa,dataset_avg
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bert-base-uncased,71.68%+/-8.04,75.32%+/-0.79,87.31%+/-0.57,77.09%+/-1.01,76.77%+/-0.86,81.59%+/-1.03,79.22%+/-1.14,55.40%+/-0.25,75.5475
h-index-mid25%,75.85%+/-0.83,79.51%+/-0.85,89.40%+/-0.37,81.05%+/-0.88,77.51%+/-0.45,85.09%+/-0.93,84.72%+/-0.34,55.12%+/-0.16,78.53125
h-index-mid50%,75.79%+/-2.89,79.51%+/-0.55,89.93%+/-0.68,80.73%+/-0.74,77.95%+/-0.63,84.84%+/-0.61,84.48%+/-0.28,55.04%+/-0.43,78.53375
h-index-top25%,76.23%+/-1.93,79.52%+/-0.60,89.51%+/-0.32,80.38%+/-1.13,78.13%+/-0.53,84.63%+/-0.80,84.74%+/-0.19,54.40%+/-1.41,78.4425
random-25%,75.79%+/-1.85,79.17%+/-0.42,90.03%+/-0.42,81.09%+/-1.09,78.02%+/-0.53,84.89%+/-0.69,84.41%+/-0.33,55.20%+/-0.00,78.575
sjr-mid25%,75.10%+/-2.26,78.74%+/-0.62,89.72%+/-0.40,81.00%+/-1.13,78.13%+/-0.62,84.97%+/-0.51,84.54%+/-0.57,55.20%+/-0.00,78.425
sjr-top25%,75.93%+/-1.96,79.01%+/-1.35,89.61%+/-0.74,80.76%+/-1.34,78.28%+/-0.75,84.30%+/-1.10,85.07%+/-0.51,55.24%+/-0.08,78.525


In [None]:
# NOTE(review): this cell is only an inert string literal holding three task
# groupings ("pico", "relation_extraction", "sentence_similarity"). None of
# these keys appear in the tasks_datasets mapping used for the task average.
# Presumably they are tasks not (yet) included in that average -- confirm, then
# either fold them into tasks_datasets or delete this cell.
"""    "pico":[
        "ebm_pico",
    ],

    "relation_extraction" : [
        "chemprot",
        "ddi_corpus",
        "gad",
    ],

    "sentence_similarity" : [
        "biosses",
    ],"""

In [51]:
# Corpora grouped by task type; the task average weights each task equally.
tasks_datasets = {
    "ner": ["blurb-bc5chem", "blurb-bc5disease", "blurb-bc2gm", "blurb-jnlpba", "blurb-ncbi-disease"],
    "document_classification": ["hoc"],
    "qa": ["bioasq-task-b", "pubmed-qa"],
}


def task_avg(row):
    """Macro-average a model's scores: mean over tasks of the per-task mean.

    For every task in ``tasks_datasets`` the leading percentage of each of its
    corpora (the text before the first '%' in row[corpus]) is averaged; the
    return value is the mean of those per-task averages.
    """
    per_task_means = [
        np.mean([float(row[corpus].split('%')[0]) for corpus in corpora])
        for corpora in tasks_datasets.values()
    ]
    return np.mean(per_task_means)
# Per-model macro average over tasks, stored alongside the per-corpus scores.
blurb_scores = pivoted_df.apply(task_avg, axis=1)
pivoted_df["task_avg"] = blurb_scores # blurb score
pivoted_df

Unnamed: 0_level_0,bioasq-task-b,blurb-bc2gm,blurb-bc5chem,blurb-bc5disease,blurb-jnlpba,blurb-ncbi-disease,hoc,pubmed-qa,dataset_avg,task_avg
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
bert-base-uncased,71.68%+/-8.04,75.32%+/-0.79,87.31%+/-0.57,77.09%+/-1.01,76.77%+/-0.86,81.59%+/-1.03,79.22%+/-1.14,55.40%+/-0.25,75.5475,74.125333
h-index-mid25%,75.85%+/-0.83,79.51%+/-0.85,89.40%+/-0.37,81.05%+/-0.88,77.51%+/-0.45,85.09%+/-0.93,84.72%+/-0.34,55.12%+/-0.16,78.53125,77.572333
h-index-mid50%,75.79%+/-2.89,79.51%+/-0.55,89.93%+/-0.68,80.73%+/-0.74,77.95%+/-0.63,84.84%+/-0.61,84.48%+/-0.28,55.04%+/-0.43,78.53375,77.495667
h-index-top25%,76.23%+/-1.93,79.52%+/-0.60,89.51%+/-0.32,80.38%+/-1.13,78.13%+/-0.53,84.63%+/-0.80,84.74%+/-0.19,54.40%+/-1.41,78.4425,77.496333
random-25%,75.79%+/-1.85,79.17%+/-0.42,90.03%+/-0.42,81.09%+/-1.09,78.02%+/-0.53,84.89%+/-0.69,84.41%+/-0.33,55.20%+/-0.00,78.575,77.515
sjr-mid25%,75.10%+/-2.26,78.74%+/-0.62,89.72%+/-0.40,81.00%+/-1.13,78.13%+/-0.62,84.97%+/-0.51,84.54%+/-0.57,55.20%+/-0.00,78.425,77.400667
sjr-top25%,75.93%+/-1.96,79.01%+/-1.35,89.61%+/-0.74,80.76%+/-1.34,78.28%+/-0.75,84.30%+/-1.10,85.07%+/-0.51,55.24%+/-0.08,78.525,77.682333
