In [10]:
import os
import pickle
import pandas as pd
from collections import defaultdict
from bert_score import score as bert_score_fn

import re
import string
import evaluate

In [11]:
from sklearn.metrics import f1_score, accuracy_score
def calculate_f1(pred,truth,point=False,average='macro'):
    pred_clean={}
    truth_clean={}
    for key,real in truth.items():
        if key in pred:
            
            predicted=pred[key] 
            if point==True:
                predicted=predicted.split(".")[0]+"."
            elif point=="mm":
                predicted=predicted.split("mm")[0]+"mm."
                
            pred_clean[key]=predicted.lower()
            truth_clean[key]=str(real).lower()
    f1=f1_score(list(truth_clean.values()),list(pred_clean.values()),average=average)
    return f1
def calculate_acc(pred,truth,point=False):
    acc=0
    total=0
    for key,real in truth.items():
        if key in pred:
            total+=1
            predicted=pred[key]    
            if point==True:

                predicted=predicted.split(".")[0]+"."

            elif point=="mm":
                predicted=predicted.split("mm")[0]+"mm."
            if predicted.lower()==str(real).lower():
                acc+=1
            else:
                print(key,real,predicted)
        # else:
        #     print(key)
    print(acc)
    print(total)
    return acc/total


In [12]:


rouge = evaluate.load("rouge")
exact_match = evaluate.load("exact_match")

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        return text.translate(str.maketrans('', '', string.punctuation))
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = set(prediction_tokens) & set(ground_truth_tokens)
    num_same = len(common)
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(prediction_tokens == ground_truth_tokens)
    if num_same == 0:
        return 0
    precision = num_same / len(prediction_tokens)
    recall = num_same / len(ground_truth_tokens)
    return 2 * precision * recall / (precision + recall)

def calculate_metrics(pred, truth, tipo):
    preds, refs = [], []
    for key, real in truth.items():
        if key not in pred:
            continue
        predicted = pred[key]    
            
        preds.append(predicted+".")
        refs.append(real+".")

    # Compute metrics
    f1s, ems = [], []
    for p, r in zip(preds, refs):
        f1s.append(f1_score(p, r))
        ems.append(int(normalize_answer(p) == normalize_answer(r)))

    rouge_scores = rouge.compute(predictions=preds, references=refs)
    rouge_l = rouge_scores["rougeL"]

    # BERTScore

    P, R, F1 = bert_score_fn(preds, refs, lang="en", verbose=False, rescale_with_baseline=True)
    bert_f1 = F1.mean().item()
    bert_R = R.mean().item()

    print(f"Examples: {len(preds)}")
    print(f"Token F1: {sum(f1s)/len(f1s):.4f}")
    print(f"Exact Match: {sum(ems)/len(ems):.4f}")
    print(f"ROUGE-L: {rouge_l:.4f}")
    print(f"BERTScore F1: {bert_f1:.4f}")
    print(f"BERTScore R: {bert_R:.4f}")

    return {
        "token_f1": sum(f1s)/len(f1s),
        "exact_match": sum(ems)/len(ems),
        "rouge_l": rouge_l,
        "bert_score_f1": bert_f1,
        "bert_score_R": bert_R,
        "list_bert": F1,
    }
    

In [16]:
accuracies=defaultdict(list)
f1s=defaultdict(list)
questions=["age","history","parenchymal_distortion","nodules_echo_size"]
average="macro"

models=["biomedbert","biobert","bluebert"]

for model_name in models:
    f1_total=[]
    print(model_name)
    model_name_str=model_name
    accuracies["model"].append(model_name_str)
    f1s["model"].append(model_name_str)
    for tipo in questions:
        print(tipo)
        with open(f"test_results/results_dic_{tipo}/{model_name_str}truth.pkl", "rb") as file:
                truth=pickle.load(file)
        print(truth)
        truth["547-205-006-20220425-132446_parenchymal_distortion"]="post-surgical changes in right breast."
        with open(f"test_results/results_dic_{tipo}/{model_name_str}2.pkl", "rb") as file:
                output=pickle.load(file)
        acc=calculate_acc(output,truth)
        # f1=calculate_f1(output,truth,average=average)
        result=calculate_metrics(output, truth, tipo)
        f1_bio=result["list_bert"]
        if tipo!="age":
            f1_total+=f1_bio
        
        accuracies[tipo].append(acc)
    print([round(v.item(),4) for v in f1_total])
        # f1s[tipo].append(f1)

biomedbert
age
{'212-130-033-20230313-153938_age': 'no response', '221-558-569-20220530-123819_age': 'no response', '221-558-569-20220530-130006_age': 'no response', '238-837-440-20220530-122842_age': 'no response', '255-300-954-20220913-121148_age': 'no response', '256-707-902-20230327-115117.781000_age': 'no response', '259-466-829-20230717-131725.078000_age': 'no response', '267-457-903-20231116-182149_age': 'no response', '275-897-737-20240125-115942_age': 'no response', '296-672-601-20221219-121616_age': 'no response', '300-309-885-20230313-133504.250000_age': 'no response', '326-065-308-20220620-124632_age': 'no response', '344-335-189-20240611-161331_age': 'no response', '352-797-481-20230327-131733_age': 'no response', '357-136-099-20220425-133933_age': 'no response', '368-796-678-20231106-173406_age': '50', '397-214-822-20230327-130347_age': 'no response', '398-860-253-20231102-182443_age': 'no response', '424-677-160-20220620-121757_age': 'no response', '436-206-523-20211213-

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 70
Token F1: 1.0000
Exact Match: 1.0000
ROUGE-L: 1.0000
BERTScore F1: 1.0000
BERTScore R: 1.0000
history
{'212-130-033-20230313-153938_history': 'no response', '221-558-569-20220530-123819_history': 'no response', '221-558-569-20220530-130006_history': 'no response', '238-837-440-20220530-122842_history': 'no response', '255-300-954-20220913-121148_history': 'no response', '256-707-902-20230327-115117.781000_history': 'no response', '259-466-829-20230717-131725.078000_history': 'no response', '267-457-903-20231116-182149_history': 'no response', '275-897-737-20240125-115942_history': 'no response', '296-672-601-20221219-121616_history': 'no response', '300-309-885-20230313-133504.250000_history': 'no response', '326-065-308-20220620-124632_history': 'no response', '344-335-189-20240611-161331_history': 'no response', '352-797-481-20230327-131733_history': 'no response', '357-136-099-20220425-133933_history': 'no response', '368-796-678-20231106-173406_history': 'no response',

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 64
Token F1: 0.9417
Exact Match: 0.9375
ROUGE-L: 0.9531
BERTScore F1: 0.9520
BERTScore R: 0.9395
parenchymal_distortion
{'212-130-033-20230313-153938_parenchymal_distortion': 'no response', '221-558-569-20220530-123819_parenchymal_distortion': 'no response', '221-558-569-20220530-130006_parenchymal_distortion': 'no response', '238-837-440-20220530-122842_parenchymal_distortion': 'no response', '255-300-954-20220913-121148_parenchymal_distortion': 'no response', '256-707-902-20230327-115117.781000_parenchymal_distortion': 'no response', '259-466-829-20230717-131725.078000_parenchymal_distortion': 'no response', '267-457-903-20231116-182149_parenchymal_distortion': 'no response', '275-897-737-20240125-115942_parenchymal_distortion': 'no response', '296-672-601-20221219-121616_parenchymal_distortion': 'no response', '300-309-885-20230313-133504.250000_parenchymal_distortion': 'no response', '326-065-308-20220620-124632_parenchymal_distortion': 'asymmetry of the density in the lo

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 64
Token F1: 0.9486
Exact Match: 0.9219
ROUGE-L: 0.9586
BERTScore F1: 0.9565
BERTScore R: 0.9466
nodules_echo_size
{'267-457-903-20231116-182149_nodules_echo_size_1': '10 x 5 mm', '275-897-737-20240125-115942_nodules_echo_size_1': 'no response', '300-309-885-20230313-133504.250000_nodules_echo_size_1': '14 mm', '344-335-189-20240611-161331_nodules_echo_size_1': '19 x 6 mm', '352-797-481-20230327-131733_nodules_echo_size_1': '19 mm', '368-796-678-20231106-173406_nodules_echo_size_1': '24 x 8 mm', '398-860-253-20231102-182443_nodules_echo_size_1': '11mm', '424-677-160-20220620-121757_nodules_echo_size_1': '8 - 9. 2 mm', '451-370-668-20221107-121538_nodules_echo_size_1': '18. 49 - 20 mm', '466-867-194-20221107-131038_nodules_echo_size_1': '32 mm', '493-946-873-20240201-105209_nodules_echo_size_1': '25x14 mm', '580-225-690-20240605-154056.482_nodules_echo_size_1': '13 mm', '612-418-370-20231102-192916_nodules_echo_size_1': '1cm', '621-280-972-20240124-112541_nodules_echo_size_1':

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 25
Token F1: 0.8600
Exact Match: 0.8400
ROUGE-L: 0.8667
BERTScore F1: 0.9517
BERTScore R: 0.9351
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7013, 1.0, 1.0, 1.0, 0.1352, 0.1352, 1.0, -0.0418, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0964, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7602, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4887, 1.0, 0.1503, 1.0, 1.0, 1.0, 1.0, 0.7231, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8368, 1.0, 0.3967, 1.0, 1.0, 0.991, 1.0, 1.0, 0.5678, 1.0, 1.0, 1.0, 1.0]
biobert
age
{'212-130-033-20230313-153938_age': 'no response', '221-558-569-20220530-123

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 70
Token F1: 1.0000
Exact Match: 1.0000
ROUGE-L: 1.0000
BERTScore F1: 1.0000
BERTScore R: 1.0000
history
{'212-130-033-20230313-153938_history': 'no response', '221-558-569-20220530-123819_history': 'no response', '221-558-569-20220530-130006_history': 'no response', '238-837-440-20220530-122842_history': 'no response', '255-300-954-20220913-121148_history': 'no response', '256-707-902-20230327-115117.781000_history': 'no response', '259-466-829-20230717-131725.078000_history': 'no response', '267-457-903-20231116-182149_history': 'no response', '275-897-737-20240125-115942_history': 'no response', '296-672-601-20221219-121616_history': 'no response', '300-309-885-20230313-133504.250000_history': 'no response', '326-065-308-20220620-124632_history': 'no response', '344-335-189-20240611-161331_history': 'no response', '352-797-481-20230327-131733_history': 'no response', '357-136-099-20220425-133933_history': 'no response', '368-796-678-20231106-173406_history': 'no response',

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 64
Token F1: 0.9278
Exact Match: 0.9219
ROUGE-L: 0.9392
BERTScore F1: 0.9426
BERTScore R: 0.9294
parenchymal_distortion
{'212-130-033-20230313-153938_parenchymal_distortion': 'no response', '221-558-569-20220530-123819_parenchymal_distortion': 'no response', '221-558-569-20220530-130006_parenchymal_distortion': 'no response', '238-837-440-20220530-122842_parenchymal_distortion': 'no response', '255-300-954-20220913-121148_parenchymal_distortion': 'no response', '256-707-902-20230327-115117.781000_parenchymal_distortion': 'no response', '259-466-829-20230717-131725.078000_parenchymal_distortion': 'no response', '267-457-903-20231116-182149_parenchymal_distortion': 'no response', '275-897-737-20240125-115942_parenchymal_distortion': 'no response', '296-672-601-20221219-121616_parenchymal_distortion': 'no response', '300-309-885-20230313-133504.250000_parenchymal_distortion': 'no response', '326-065-308-20220620-124632_parenchymal_distortion': 'asymmetry of the density in the lo

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 64
Token F1: 0.9413
Exact Match: 0.9219
ROUGE-L: 0.9493
BERTScore F1: 0.9472
BERTScore R: 0.9412
nodules_echo_size
{'267-457-903-20231116-182149_nodules_echo_size_1': '10 x 5 mm', '275-897-737-20240125-115942_nodules_echo_size_1': 'no response', '300-309-885-20230313-133504.250000_nodules_echo_size_1': '14 mm', '344-335-189-20240611-161331_nodules_echo_size_1': '19 x 6 mm', '352-797-481-20230327-131733_nodules_echo_size_1': '19 mm', '368-796-678-20231106-173406_nodules_echo_size_1': '24 x 8 mm', '398-860-253-20231102-182443_nodules_echo_size_1': '11mm', '424-677-160-20220620-121757_nodules_echo_size_1': '8 - 9. 2 mm', '451-370-668-20221107-121538_nodules_echo_size_1': '18. 49 - 20 mm', '466-867-194-20221107-131038_nodules_echo_size_1': '32 mm', '493-946-873-20240201-105209_nodules_echo_size_1': '25x14 mm', '580-225-690-20240605-154056.482_nodules_echo_size_1': '13 mm', '612-418-370-20231102-192916_nodules_echo_size_1': '1cm', '621-280-972-20240124-112541_nodules_echo_size_1':

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 25
Token F1: 0.7711
Exact Match: 0.7200
ROUGE-L: 0.7828
BERTScore F1: 0.8808
BERTScore R: 0.8977
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.259, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7013, 1.0, 1.0, 1.0, 0.2646, 0.1352, 1.0, -0.0359, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0964, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7602, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4887, 1.0, 0.1503, 1.0, 1.0, 1.0, 1.0, 0.1237, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.156, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7812, 1.0, 0.8368, 1.0, 0.3967, 1.0, 1.0, 0.991, 1.0, 1.0, 0.5678, 1.0, 1.0, 0.2911, 1.0]
bluebert
age
{'212-130-033-20230313-153938_age': 'no response', '221-558-569-2

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 70
Token F1: 0.9714
Exact Match: 0.9714
ROUGE-L: 0.9714
BERTScore F1: 0.9853
BERTScore R: 0.9816
history
{'212-130-033-20230313-153938_history': 'no response', '221-558-569-20220530-123819_history': 'no response', '221-558-569-20220530-130006_history': 'no response', '238-837-440-20220530-122842_history': 'no response', '255-300-954-20220913-121148_history': 'no response', '256-707-902-20230327-115117.781000_history': 'no response', '259-466-829-20230717-131725.078000_history': 'no response', '267-457-903-20231116-182149_history': 'no response', '275-897-737-20240125-115942_history': 'no response', '296-672-601-20221219-121616_history': 'no response', '300-309-885-20230313-133504.250000_history': 'no response', '326-065-308-20220620-124632_history': 'no response', '344-335-189-20240611-161331_history': 'no response', '352-797-481-20230327-131733_history': 'no response', '357-136-099-20220425-133933_history': 'no response', '368-796-678-20231106-173406_history': 'no response',

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 64
Token F1: 0.9281
Exact Match: 0.8594
ROUGE-L: 0.9391
BERTScore F1: 0.9386
BERTScore R: 0.9158
parenchymal_distortion
{'212-130-033-20230313-153938_parenchymal_distortion': 'no response', '221-558-569-20220530-123819_parenchymal_distortion': 'no response', '221-558-569-20220530-130006_parenchymal_distortion': 'no response', '238-837-440-20220530-122842_parenchymal_distortion': 'no response', '255-300-954-20220913-121148_parenchymal_distortion': 'no response', '256-707-902-20230327-115117.781000_parenchymal_distortion': 'no response', '259-466-829-20230717-131725.078000_parenchymal_distortion': 'no response', '267-457-903-20231116-182149_parenchymal_distortion': 'no response', '275-897-737-20240125-115942_parenchymal_distortion': 'no response', '296-672-601-20221219-121616_parenchymal_distortion': 'no response', '300-309-885-20230313-133504.250000_parenchymal_distortion': 'no response', '326-065-308-20220620-124632_parenchymal_distortion': 'asymmetry of the density in the lo

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 64
Token F1: 0.8750
Exact Match: 0.8750
ROUGE-L: 0.8750
BERTScore F1: 0.8900
BERTScore R: 0.8737
nodules_echo_size
{'267-457-903-20231116-182149_nodules_echo_size_1': '10 x 5 mm', '275-897-737-20240125-115942_nodules_echo_size_1': 'no response', '300-309-885-20230313-133504.250000_nodules_echo_size_1': '14 mm', '344-335-189-20240611-161331_nodules_echo_size_1': '19 x 6 mm', '352-797-481-20230327-131733_nodules_echo_size_1': '19 mm', '368-796-678-20231106-173406_nodules_echo_size_1': '24 x 8 mm', '398-860-253-20231102-182443_nodules_echo_size_1': '11mm', '424-677-160-20220620-121757_nodules_echo_size_1': '8 - 9. 2 mm', '451-370-668-20221107-121538_nodules_echo_size_1': '18. 49 - 20 mm', '466-867-194-20221107-131038_nodules_echo_size_1': '32 mm', '493-946-873-20240201-105209_nodules_echo_size_1': '25x14 mm', '580-225-690-20240605-154056.482_nodules_echo_size_1': '13 mm', '612-418-370-20231102-192916_nodules_echo_size_1': '1cm', '621-280-972-20240124-112541_nodules_echo_size_1':

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 25
Token F1: 0.5252
Exact Match: 0.3200
ROUGE-L: 0.5280
BERTScore F1: 0.6373
BERTScore R: 0.6207
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6223, 1.0, 0.7745, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8959, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8577, 1.0, 1.0, 1.0, 1.0, 0.7013, 0.485, 1.0, 1.0, 0.6345, 0.1352, 1.0, -0.0359, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.059, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0964, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2094, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2266, 1.0, 1.0, 1.0, 1.0, -0.0196, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2438, 1.0, 0.1503, 1.0, 1.0, 1.0, 1.0, -0.0067, 1.0, 1.0, 0.4949, 1.0, 1.0, 0.1409, 0.9963, 0.5712, 1.0, 1.0, 0.7557, 0.4147, 1.0, 1.0, 0.8368, 0.3215, 1.0, 0.215, 1.0, 0.3622, -0.0121, 0.3703, 0.5678, 0.5509, 0.771, 0.1981, 0.3765]


In [6]:
data=pd.DataFrame(accuracies)
data=data.set_index("model")
data['average'] = data.mean(axis=1)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
data

Unnamed: 0_level_0,average
model,Unnamed: 1_level_1
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract,


In [5]:
dmis-lab/biobert-base-cased-v1.10.0001416batch

SyntaxError: invalid decimal literal (1324788414.py, line 1)

In [None]:
bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-120.0001716batch	

In [None]:
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001816batch	0.995283	0.951613	0.930108	0.926829	0.950958

In [6]:
data=pd.DataFrame(f1s)
data=data.set_index("model")
data['average'] = data.mean(axis=1)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
data

Unnamed: 0_level_0,age,history,parenchymal_distortion,nodules_echo_size,average
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001316batch,0.990385,0.433239,0.23131,0.858403,0.628334
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05316batch,0.990385,0.331735,0.072143,0.919633,0.578474
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract2e-05316batch,0.990385,0.042459,0.049641,0.805861,0.472086
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract1e-05316batch,0.024224,0.042459,0.049641,0.432706,0.137257
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-06316batch,0.024224,0.042459,0.049641,0.001176,0.029375
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001416batch,0.990385,0.559396,0.347054,0.855095,0.687982
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05416batch,0.990385,0.383694,0.26074,0.889645,0.631116
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract2e-05416batch,0.990385,0.164039,0.049641,0.903226,0.526823
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract1e-05416batch,0.024224,0.042459,0.049641,0.78557,0.225473
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-06416batch,0.024224,0.042459,0.049641,0.001176,0.029375


In [7]:
accuracies=defaultdict(list)
f1s=defaultdict(list)
questions=["age","history","parenchymal_distortion","nodules_echo_size"]
average="macro"
epochs=[3,4,5,6,7,8]
lrs=[1e-4,5e-5,2e-5,1e-5,5e-6]
models=["microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract","dmis-lab/biobert-base-cased-v1.1","bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"]

BioMedBERT="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001816batch"
BlueBERT="bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-120.0001716batch"
BioBERT="dmis-lab/biobert-base-cased-v1.10.0001416batch"
models=[BioMedBERT,BlueBERT, BioBERT]
for model_name in models:
            model_name_str=model_name
            accuracies["model"].append(model_name_str)
            f1s["model"].append(model_name_str)
            for tipo in questions:
                print(tipo)
                with open(f"Generativos/results_dic_{tipo}/{model_name_str.split("/")[-1]}truth.pkl", "rb") as file:
                        truth=pickle.load(file)
                with open(f"Generativos/results_dic_{tipo}/{model_name_str.split("/")[-1]}.pkl", "rb") as file:
                        output=pickle.load(file)
                acc=calculate_acc(output,truth)
                f1=calculate_f1(output,truth,average=average)
                accuracies[tipo].append(acc)
                f1s[tipo].append(f1)

age
181-536-521-20221217-101331_age 46 no response
211
212
history
144-007-361-20230619-165847_history reduction mammoplasty currently in treatment of a glioblastoma
154-571-251-20230417-111004_history history of bilateral ca, right breast with la in 2001 and left breast in 2012 no response
154-571-251-20230928-113422_history bilateral breast cancer, right breast with la in 2001 and left breast in 2012 no response
227-053-436-20230714-113137_history operated twice in 2022 for left intraductal papillomas no response
396-539-313-20230614-100316_history no response 41
400-124-635-20221219-134655.203_history history of follicular lymphoma in oct. 2021 no response
591-887-183-20230612-122010.531_history history of biopsy of microcalcifications in the left breast on two occasions history of biopsy of microcalcifications in the left breast
646-504-461-20240506-082527_history bilateral mastectomy and reconstruction with prosthesis due to bilateral breast cancer no response
678-939-808-20220404