In [1]:
import pickle
import pandas as pd
import math
from utils import *

config = load_config()

In [2]:
# PROJECT_PATH = config.project_path
PROJECT_PATH = config.file_path
DATA_PATH = PROJECT_PATH.joinpath("data/processed")

def load_data( name ) :
    with open(DATA_PATH.joinpath(name), 'rb') as f :
        data = pickle.load(f)
    return data


In [3]:
def print_results(df, baseline=True) :
    if baseline :
        df = df.drop(columns = ['cv', 'topN']).copy()
        df = df.groupby(['model']).agg({

            'f1': lambda x : "{mean:.3f} ({hi:.3f}/{lo:.3f})".format(mean = x.mean(), hi=x.mean() + 1.96*x.std()/math.sqrt(5),lo=x.mean() - 1.96*x.std()/math.sqrt(5)),
            'precision':lambda x : "{mean:.3f} ({hi:.3f}/{lo:.3f})".format(mean = x.mean(), hi=x.mean() + 1.96*x.std()/math.sqrt(5),lo=x.mean() - 1.96*x.std()/math.sqrt(5)),
            'recall': lambda x : "{mean:.3f} ({hi:.3f}/{lo:.3f})".format(mean = x.mean(), hi=x.mean() + 1.96*x.std()/math.sqrt(5),lo=x.mean() - 1.96*x.std()/math.sqrt(5)),
            'mrr': lambda x : "{mean:.3f} ({hi:.3f}/{lo:.3f})".format(mean = x.mean(), hi=x.mean() + 1.96*x.std()/math.sqrt(5),lo=x.mean() - 1.96*x.std()/math.sqrt(5)),
        })
        return df.sort_values(by=['model'], ascending=False)
        
    else :
        df = df.drop(columns = ['cv', 'topN']).copy()
        df = df.groupby(['model', 'shots']).agg({
                                            'f1': lambda x : "{mean:.3f} ({hi:.3f}/{lo:.3f})".format(mean = x.mean(), hi=x.mean() + 1.96*x.std()/math.sqrt(5),lo=x.mean() - 1.96*x.std()/math.sqrt(5)),
                                            'precision':lambda x : "{mean:.3f} ({hi:.3f}/{lo:.3f})".format(mean = x.mean(), hi=x.mean() + 1.96*x.std()/math.sqrt(5),lo=x.mean() - 1.96*x.std()/math.sqrt(5)),
                                            'recall': lambda x : "{mean:.3f} ({hi:.3f}/{lo:.3f})".format(mean = x.mean(), hi=x.mean() + 1.96*x.std()/math.sqrt(5),lo=x.mean() - 1.96*x.std()/math.sqrt(5)),
                                            'mrr': lambda x : "{mean:.3f} ({hi:.3f}/{lo:.3f})".format(mean = x.mean(), hi=x.mean() + 1.96*x.std()/math.sqrt(5),lo=x.mean() - 1.96*x.std()/math.sqrt(5)),
                                            })
        return df.sort_values(by=['model','shots'], ascending=False)

In [4]:
def show_results(topn, model_names : list, baseline=True) :
    if baseline :
        eval_results = []
        for model_name in model_names :
            eval_results.append(load_data(f"{topn}_{model_name}_evaluation.pkl"))
            eval_results.append(load_data(f"{topn}_{model_name}_evaluation.pkl"))
        results = []
        for evaluation in eval_results :
            for record in evaluation :
                results.append(record)
        myResults = pd.DataFrame(results)

        df = print_results(myResults)
        return df
    else :
        eval_results = []
        for model_name in model_names :
            eval_results.append(load_data(f"{topn}_{model_name}_zeroshot_evaluation.pkl"))
            eval_results.append(load_data(f"{topn}_{model_name}_fewshot_evaluation.pkl"))
        results = []
        for evaluation in eval_results :
            for record in evaluation :
                results.append(record)
        myResults = pd.DataFrame(results)

        df = print_results(myResults,baseline=False)
        return df

In [5]:
# baseline
show_results("top10", ['gpt', 'biogpt'])

Unnamed: 0_level_0,f1,precision,recall,mrr
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gpt_finetuned,0.099 (0.116/0.081),0.095 (0.113/0.076),0.105 (0.124/0.085),0.170 (0.197/0.142)
biogpt_finetuned,0.174 (0.202/0.146),0.414 (0.464/0.363),0.110 (0.130/0.090),0.166 (0.219/0.112)


In [6]:
r = show_results("top10", ["biomistral7b_avigon_modified_finetune"], baseline=False)
r[['f1','mrr','precision', 'recall']].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,mrr,precision,recall
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
biomistral7b_avigon_finetuned,fewshot,0.311 (0.322/0.300),0.468 (0.488/0.447),0.317 (0.337/0.298),0.306 (0.316/0.295)
biomistral7b_avigon_finetuned,zeroshot,0.165 (0.205/0.126),0.231 (0.283/0.179),0.167 (0.209/0.125),0.164 (0.202/0.126)


In [35]:
r = show_results("top10", ["biomistral7b_avigon_modified_mimic1000", "biomistral7b_avigon_mimic1000"], baseline=False)
r[['precision','recall']].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1
biomistral7b_avigon_mimic1000,fewshot,0.314 (0.323/0.304),0.257 (0.260/0.253)
biomistral7b_avigon_mimic1000,zeroshot,0.404 (0.415/0.394),0.306 (0.314/0.298)
biomistral7b_avigon_modified_mimic1000,fewshot,0.351 (0.364/0.339),0.344 (0.354/0.333)
biomistral7b_avigon_modified_mimic1000,zeroshot,0.367 (0.377/0.358),0.343 (0.355/0.332)


In [17]:
r = show_results("top10", ["biomistral7b_avigon_mimic10", "biomistral7b_avigon_mimic100"], baseline=False)
r[['precision','recall']].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1
biomistral7b_avigon_mimic10,fewshot,0.296 (0.308/0.284),0.204 (0.210/0.198)
biomistral7b_avigon_mimic10,zeroshot,0.298 (0.302/0.294),0.304 (0.311/0.297)
biomistral7b_avigon_mimic100,fewshot,0.349 (0.355/0.343),0.323 (0.328/0.317)
biomistral7b_avigon_mimic100,zeroshot,0.403 (0.411/0.394),0.326 (0.334/0.317)


In [7]:
r = show_results("top3", ["biomistral7b_avigon_modified_mimic1000"], baseline=False)
r[['precision','recall']].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1
biomistral7b_avigon_modified_mimic1000,fewshot,0.310 (0.318/0.303),0.296 (0.314/0.278)
biomistral7b_avigon_modified_mimic1000,zeroshot,0.419 (0.433/0.404),0.215 (0.226/0.205)


In [14]:
r = show_results("top5", ["biomistral7b_avigon_modified_mimic10", "biomistral7b_avigon_modified_mimic100"], baseline=False)
r[['f1','mrr']].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,mrr
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1
biomistral7b_avigon_modified_mimic10,fewshot,0.257 (0.264/0.250),0.488 (0.512/0.464)
biomistral7b_avigon_modified_mimic10,zeroshot,0.295 (0.302/0.288),0.453 (0.479/0.426)
biomistral7b_avigon_modified_mimic100,fewshot,0.305 (0.313/0.297),0.506 (0.529/0.484)
biomistral7b_avigon_modified_mimic100,zeroshot,0.299 (0.307/0.291),0.522 (0.554/0.491)


In [16]:
r = show_results("top10", ["biomistral7b_avigon_modified_mimic10", "biomistral7b_avigon_modified_mimic100"], baseline=False)
r[['f1','mrr']].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,mrr
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1
biomistral7b_avigon_modified_mimic10,fewshot,0.276 (0.283/0.269),0.464 (0.484/0.443)
biomistral7b_avigon_modified_mimic10,zeroshot,0.321 (0.324/0.319),0.464 (0.494/0.434)
biomistral7b_avigon_modified_mimic100,fewshot,0.342 (0.349/0.336),0.495 (0.521/0.468)
biomistral7b_avigon_modified_mimic100,zeroshot,0.358 (0.367/0.348),0.525 (0.556/0.494)


In [13]:
r = show_results("top5", ["biomistral7b_avigon_mimic10", "biomistral7b_avigon_mimic100", "biomistral7b_avigon_mimic1000"], baseline=False)
r[['f1','mrr']].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,mrr
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1
biomistral7b_avigon_mimic10,fewshot,0.237 (0.248/0.226),0.445 (0.468/0.421)
biomistral7b_avigon_mimic10,zeroshot,0.302 (0.311/0.294),0.535 (0.571/0.499)
biomistral7b_avigon_mimic100,fewshot,0.303 (0.310/0.296),0.501 (0.519/0.482)
biomistral7b_avigon_mimic100,zeroshot,0.339 (0.341/0.337),0.535 (0.569/0.502)
biomistral7b_avigon_mimic1000,fewshot,0.254 (0.263/0.245),0.471 (0.498/0.444)
biomistral7b_avigon_mimic1000,zeroshot,0.295 (0.308/0.282),0.516 (0.544/0.487)


In [16]:
r = show_results("top10", ["biomistral7b_avigon_mimic10", "biomistral7b_avigon_mimic100", "biomistral7b_avigon_mimic1000"], baseline=False)
r[['f1','mrr']].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,mrr
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1
biomistral7b_avigon_mimic10,fewshot,0.241 (0.249/0.233),0.427 (0.452/0.403)
biomistral7b_avigon_mimic10,zeroshot,0.301 (0.304/0.298),0.517 (0.554/0.480)
biomistral7b_avigon_mimic100,fewshot,0.335 (0.338/0.332),0.531 (0.552/0.510)
biomistral7b_avigon_mimic100,zeroshot,0.360 (0.369/0.352),0.558 (0.600/0.516)
biomistral7b_avigon_mimic1000,fewshot,0.282 (0.288/0.276),0.496 (0.522/0.471)
biomistral7b_avigon_mimic1000,zeroshot,0.348 (0.357/0.339),0.528 (0.563/0.494)


In [15]:
# top 5
show_results("top10", ["mistral7b", "biomistral7b_avigon"])

FileNotFoundError: [Errno 2] No such file or directory: '/data/home_beta/wjang/medicalnote_annotation/data/processed/top10_mistral7b_evaluation.pkl'

In [10]:
# top 10
show_results("top10", ["mistral7b", "biomistral7b_avigon_finetuned"])

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,precision,recall,mrr
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mistral7b,zeroshot,0.276 (0.284/0.268),0.197 (0.204/0.190),0.460 (0.468/0.452),0.539 (0.561/0.517)
mistral7b,fewshot,0.402 (0.408/0.395),0.301 (0.306/0.296),0.604 (0.612/0.597),0.694 (0.713/0.675)
biomistral7b_avigon_mimic_finetuned,zeroshot,0.339 (0.351/0.327),0.391 (0.401/0.381),0.300 (0.313/0.287),0.523 (0.560/0.486)
biomistral7b_avigon_mimic_finetuned,fewshot,0.335 (0.344/0.327),0.343 (0.352/0.334),0.328 (0.338/0.319),0.534 (0.548/0.520)
biomistral7b_avigon_finetuned,zeroshot,0.362 (0.381/0.344),0.337 (0.357/0.318),0.395 (0.431/0.358),0.601 (0.644/0.558)
biomistral7b_avigon_finetuned,fewshot,0.296 (0.327/0.265),0.287 (0.323/0.252),0.309 (0.348/0.269),0.506 (0.562/0.450)


In [8]:
# top10

mistral7b_finetuned_zeroshot = load_data("top10_mistral7b_finetuned_zeroshot_evaluation.pkl")
mistral7b_finetuned_fewshot = load_data("top10_mistral7b_finetuned_fewshot_evaluation.pkl")
# biomistral7b_finetuned_zeroshot = load_data("top10_biomistral7b_avigon_zeroshot_evaluation.pkl")
# biomistral7b_finetuned_fewshot = load_data("top10_biomistral7b_avigon_fewshot_evaluation.pkl")
biomistral7b_avigon_finetuned_zeroshot = load_data("top10_biomistral7b_finetuned_zeroshot_evaluation.pkl")
biomistral7b_avigon_finetuned_fewshot = load_data("top10_biomistral7b_finetuned_fewshot_evaluation.pkl")

evaluation_results = [ mistral7b_finetuned_zeroshot, mistral7b_finetuned_fewshot,
                      # biomistral7b_finetuned_zeroshot, biomistral7b_finetuned_fewshot]
                      biomistral7b_avigon_finetuned_zeroshot, biomistral7b_avigon_finetuned_fewshot]

# mistral7b_finetuned_zeroshot = load_data("top5_mistral7b_modified_zeroshot_evaluation.pkl")
# mistral7b_finetuned_fewshot = load_data("top5_mistral7b_modified_fewshot_evaluation.pkl")
# biomistral7b_finetuned_zeroshot = load_data("top5_biomistral7b_avigon_modified_zeroshot_evaluation.pkl")
# biomistral7b_finetuned_fewshot = load_data("top5_biomistral7b_avigon_modified_fewshot_evaluation.pkl")
# biomistral7b_avigon_finetuned_zeroshot = load_data("top5_biomistral7b_finetuned_zeroshot_evaluation.pkl")
# biomistral7b_avigon_finetuned_fewshot = load_data("top5_biomistral7b_finetuned_fewshot_evaluation.pkl")

# evaluation_results = [mistral7b_finetuned_zeroshot, mistral7b_finetuned_fewshot,
#                       biomistral7b_finetuned_zeroshot, biomistral7b_finetuned_fewshot,]
                      # biomistral7b_avigon_finetuned_zeroshot, biomistral7b_avigon_finetuned_fewshot]
results = []
for evaluation in evaluation_results :
    for record in evaluation :
        results.append(record)

myResults = pd.DataFrame(results)

In [9]:
t = print_results(myResults)
t

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,precision,recall,mrr
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mistral7b_finetuned,zeroshot,0.368 (0.392/0.343),0.342 (0.378/0.306),0.403 (0.436/0.370),0.576 (0.591/0.561)
mistral7b_finetuned,fewshot,0.386 (0.399/0.374),0.335 (0.343/0.327),0.458 (0.492/0.424),0.680 (0.701/0.658)
biomistral7b_finetuned,zeroshot,0.333 (0.354/0.311),0.320 (0.333/0.306),0.349 (0.385/0.312),0.589 (0.620/0.557)
biomistral7b_finetuned,fewshot,0.280 (0.318/0.242),0.313 (0.358/0.268),0.257 (0.301/0.213),0.435 (0.499/0.372)


These are just left for insurance. I may need them someday

In [13]:
import math
stats = myResults.groupby(['model','shots'])[['precision', 'recall', 'f1', 'mrr']].agg(['mean', 'count', 'std'])

for score in ['precision','recall', 'f1', 'mrr'] : 
    ci95_hi = []
    ci95_lo = []
    for i in stats.index:
        m, c, s = stats.loc[i][score]
        ci95_hi.append(m + 1.96*s/math.sqrt(c))
        ci95_lo.append(m - 1.96*s/math.sqrt(c))

    # stats[score]['ci95_hi'] = ci95_hi
    # stats[score]['ci95_lo'] = ci95_lo
    stats.loc[:,(score,'ci95_hi')] = ci95_hi
    stats.loc[:,(score,'ci95_lo')] = ci95_lo

In [10]:
# precision
stats['precision']

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count,std,ci95_hi,ci95_lo
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mistral7b_mimic_finetuned,fewshot,0.1376,5,0.002074,0.139418,0.135782
mistral7b_mimic_finetuned,zeroshot,0.144,5,0.005568,0.14888,0.13912


In [11]:
# recall
stats['recall']

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count,std,ci95_hi,ci95_lo
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mistral7b_mimic_finetuned,fewshot,0.1426,5,0.006693,0.148467,0.136733
mistral7b_mimic_finetuned,zeroshot,0.1424,5,0.004775,0.146585,0.138215


In [12]:
# f1
stats['f1']

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count,std,ci95_hi,ci95_lo
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mistral7b_mimic_finetuned,fewshot,0.1398,5,0.004087,0.143382,0.136218
mistral7b_mimic_finetuned,zeroshot,0.1432,5,0.004712,0.14733,0.13907


In [13]:
# mrr
stats['mrr']

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count,std,ci95_hi,ci95_lo
model,shots,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mistral7b_mimic_finetuned,fewshot,0.1822,5,0.017754,0.197762,0.166638
mistral7b_mimic_finetuned,zeroshot,0.0918,5,0.011692,0.102048,0.081552
