In [15]:
import os
import pickle
import pandas as pd
from collections import defaultdict
from bert_score import score as bert_score_fn

import re
import string
import evaluate

In [16]:
from sklearn.metrics import f1_score, accuracy_score
def calculate_f1(pred,truth,point=False,average='macro'):
    pred_clean={}
    truth_clean={}
    for key,real in truth.items():
        if key in pred:
            
            predicted=pred[key] 
            if point==True:
                predicted=predicted.split(".")[0]+"."
            elif point=="mm":
                predicted=predicted.split("mm")[0]+"mm."
                
            pred_clean[key]=predicted.lower()
            truth_clean[key]=str(real).lower()
    f1=f1_score(list(truth_clean.values()),list(pred_clean.values()),average=average)
    return f1
def calculate_acc(pred,truth,point=False):
    acc=0
    total=0
    for key,real in truth.items():
        if key in pred:
            total+=1
            predicted=pred[key]    
            if point==True:

                predicted=predicted.split(".")[0]+"."

            elif point=="mm":
                predicted=predicted.split("mm")[0]+"mm."
            if predicted.lower()==str(real).lower():
                acc+=1
            else:
                print(key,"Real: ",real,"Predicted: ",predicted)
        # else:
        #     print(key)
    print(acc)
    print(total)
    return acc/total


In [None]:
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05616batch	
dmis-lab/biobert-base-cased-v1.15e-05616batch
bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-120.0001716batch

In [17]:
rouge = evaluate.load("rouge")
exact_match = evaluate.load("exact_match")

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        return text.translate(str.maketrans('', '', string.punctuation))
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = set(prediction_tokens) & set(ground_truth_tokens)
    num_same = len(common)
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(prediction_tokens == ground_truth_tokens)
    if num_same == 0:
        return 0
    precision = num_same / len(prediction_tokens)
    recall = num_same / len(ground_truth_tokens)
    return 2 * precision * recall / (precision + recall)

def calculate_metrics(pred, truth, tipo):
    preds, refs = [], []
    for key, real in truth.items():
        if key not in pred:
            continue
        predicted = pred[key]    
            
        preds.append(predicted+".")
        refs.append(real+".")

    # Compute metrics
    f1s, ems = [], []
    for p, r in zip(preds, refs):
        f1s.append(f1_score(p, r))
        ems.append(int(normalize_answer(p) == normalize_answer(r)))

    rouge_scores = rouge.compute(predictions=preds, references=refs)
    rouge_l = rouge_scores["rougeL"]

    # BERTScore

    P, R, F1 = bert_score_fn(preds, refs, lang="en", verbose=False, rescale_with_baseline=True)
    bert_f1 = F1.mean().item()
    bert_R = R.mean().item()

    print(f"Examples: {len(preds)}")
    print(f"Token F1: {sum(f1s)/len(f1s):.4f}")
    print(f"Exact Match: {sum(ems)/len(ems):.4f}")
    print(f"ROUGE-L: {rouge_l:.4f}")
    print(f"BERTScore F1: {bert_f1:.4f}")
    print(f"BERTScore R: {bert_R:.4f}")

    return {
        "token_f1": sum(f1s)/len(f1s),
        "exact_match": sum(ems)/len(ems),
        "rouge_l": rouge_l,
        "bert_score_f1": bert_f1,
        "bert_score_R": bert_R,
        "list_bert": F1,
    }
    

In [5]:
accuracies=defaultdict(list)
f1s=defaultdict(list)
questions=["age","history","parenchymal_distortion","nodules_echo_size"]
average="macro"
epochs=[3,4,5,6,7,8]
lrs=[1e-4,5e-5,2e-5,1e-5,5e-6]
models=["microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract","dmis-lab/biobert-base-cased-v1.1","bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"]
        
for model_name in models:
    for epoch in epochs:
        for lr in lrs:
            
            model_name_str=model_name+str(lr)+str(epoch)+"16batch"
            accuracies["model"].append(model_name_str)
            f1s["model"].append(model_name_str)
            for tipo in questions:
                print(tipo)
                with open(f"Generativos/results_dic_{tipo}/{model_name_str.split("/")[-1]}truth.pkl", "rb") as file:
                        truth=pickle.load(file)
                with open(f"Generativos/results_dic_{tipo}/{model_name_str.split("/")[-1]}.pkl", "rb") as file:
                        output=pickle.load(file)
                acc=calculate_acc(output,truth)
                f1=calculate_f1(output,truth,average=average)
                accuracies[tipo].append(acc)
                f1s[tipo].append(f1)
            

age
211
212
history
173
186
parenchymal_distortion
170
186
nodules_echo_size
73
82
age
211
212
history
171
186
parenchymal_distortion
166
186
nodules_echo_size
75
82
age
211
212
history
163
186
parenchymal_distortion
166
186
nodules_echo_size
69
82
age
156
212
history
163
186
parenchymal_distortion
166
186
nodules_echo_size
50
82
age
156
212
history
163
186
parenchymal_distortion
166
186
nodules_echo_size
3
82
age
211
212
history
178
186
parenchymal_distortion
175
186
nodules_echo_size
72
82
age
211
212
history
174
186
parenchymal_distortion
172
186
nodules_echo_size
75
82
age
211
212
history
166
186
parenchymal_distortion
166
186
nodules_echo_size
75
82
age
156
212
history
163
186
parenchymal_distortion
166
186
nodules_echo_size
69
82
age
156
212
history
163
186
parenchymal_distortion
166
186
nodules_echo_size
3
82
age
210
212
history
173
186
parenchymal_distortion
174
186
nodules_echo_size
74
82
age
210
212
history
175
186
parenchymal_distortion
173
186
nodules_echo_size
76
82
age
21

In [8]:
data=pd.DataFrame(accuracies)
data=data.set_index("model")
data['average'] = data.mean(axis=1)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
data

Unnamed: 0_level_0,age,history,parenchymal_distortion,nodules_echo_size,average
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001316batch,0.995283,0.930108,0.913978,0.890244,0.932403
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05316batch,0.995283,0.919355,0.892473,0.914634,0.930436
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract2e-05316batch,0.995283,0.876344,0.892473,0.841463,0.901391
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract1e-05316batch,0.735849,0.876344,0.892473,0.609756,0.778606
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-06316batch,0.735849,0.876344,0.892473,0.036585,0.635313
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001416batch,0.995283,0.956989,0.94086,0.878049,0.942795
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05416batch,0.995283,0.935484,0.924731,0.914634,0.942533
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract2e-05416batch,0.995283,0.892473,0.892473,0.914634,0.923716
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract1e-05416batch,0.735849,0.876344,0.892473,0.841463,0.836532
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-06416batch,0.735849,0.876344,0.892473,0.036585,0.635313


In [None]:
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05616batch	
dmis-lab/biobert-base-cased-v1.15e-05616batch
bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-120.0001716batch

In [5]:
dmis-lab/biobert-base-cased-v1.10.0001416batch

SyntaxError: invalid decimal literal (1324788414.py, line 1)

In [None]:
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001816batch	0.995283	0.951613	0.930108	0.926829	0.950958

In [9]:
data=pd.DataFrame(f1s)
data=data.set_index("model")
data['average'] = data.mean(axis=1)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
data

Unnamed: 0_level_0,age,history,parenchymal_distortion,nodules_echo_size,average
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001316batch,0.990385,0.433239,0.23131,0.858403,0.628334
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05316batch,0.990385,0.331735,0.072143,0.919633,0.578474
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract2e-05316batch,0.990385,0.042459,0.049641,0.805861,0.472086
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract1e-05316batch,0.024224,0.042459,0.049641,0.432706,0.137257
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-06316batch,0.024224,0.042459,0.049641,0.001176,0.029375
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001416batch,0.990385,0.559396,0.347054,0.855095,0.687982
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05416batch,0.990385,0.383694,0.26074,0.889645,0.631116
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract2e-05416batch,0.990385,0.164039,0.049641,0.903226,0.526823
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract1e-05416batch,0.024224,0.042459,0.049641,0.78557,0.225473
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-06416batch,0.024224,0.042459,0.049641,0.001176,0.029375


In [18]:
rouge = evaluate.load("rouge")
exact_match = evaluate.load("exact_match")

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        return text.translate(str.maketrans('', '', string.punctuation))
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = set(prediction_tokens) & set(ground_truth_tokens)
    num_same = len(common)
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(prediction_tokens == ground_truth_tokens)
    if num_same == 0:
        return 0
    precision = num_same / len(prediction_tokens)
    recall = num_same / len(ground_truth_tokens)
    return 2 * precision * recall / (precision + recall)

def calculate_metrics(pred, truth, tipo):
    preds, refs = [], []
    for key, real in truth.items():
        if key not in pred:
            continue
        predicted = pred[key]    
            
        preds.append(predicted+".")
        refs.append(real+".")

    # Compute metrics
    f1s, ems = [], []
    for p, r in zip(preds, refs):
        f1s.append(f1_score(p, r))
        ems.append(int(normalize_answer(p) == normalize_answer(r)))

    rouge_scores = rouge.compute(predictions=preds, references=refs)
    rouge_l = rouge_scores["rougeL"]

    # BERTScore

    P, R, F1 = bert_score_fn(preds, refs, lang="en", verbose=False, rescale_with_baseline=True)
    bert_f1 = F1.mean().item()
    bert_R = R.mean().item()

    print(f"Examples: {len(preds)}")
    print(f"Token F1: {sum(f1s)/len(f1s):.4f}")
    print(f"Exact Match: {sum(ems)/len(ems):.4f}")
    print(f"ROUGE-L: {rouge_l:.4f}")
    print(f"BERTScore F1: {bert_f1:.4f}")
    print(f"BERTScore R: {bert_R:.4f}")

    return {
        "token_f1": sum(f1s)/len(f1s),
        "exact_match": sum(ems)/len(ems),
        "rouge_l": rouge_l,
        "bert_score_f1": bert_f1,
        "bert_score_R": bert_R,
        "list_bert": F1,
    }
    

In [20]:
accuracies=defaultdict(list)
f1s=defaultdict(list)
questions=["age","history","parenchymal_distortion","nodules_echo_size"]
average="macro"
epochs=[3,4,5,6,7,8]
lrs=[1e-4,5e-5,2e-5,1e-5,5e-6]
models=["microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract","dmis-lab/biobert-base-cased-v1.1","bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"]

BioMedBERT="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001816batch"
BlueBERT="bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-120.0001716batch"
BioBERT="dmis-lab/biobert-base-cased-v1.10.0001416batch"

BioMedBERT="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05616batch"
BioBERT="dmis-lab/biobert-base-cased-v1.15e-05616batch"
BlueBERT="bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-120.0001716batch"
models=[BioMedBERT, BioBERT,BlueBERT]
for model_name in models:
    print(model_name)
    f1_total=[]
    model_name_str=model_name
    accuracies["model"].append(model_name_str)
    f1s["model"].append(model_name_str)
    for tipo in questions:
        print(tipo)
        with open(f"Generativos/results_dic_{tipo}/{model_name_str.split("/")[-1]}truth.pkl", "rb") as file:
            truth=pickle.load(file)
        with open(f"Generativos/results_dic_{tipo}/{model_name_str.split("/")[-1]}.pkl", "rb") as file:
            output=pickle.load(file)
        acc=calculate_acc(output,truth)
        # f1=calculate_f1(output,truth,average=average)
        result=calculate_metrics(output, truth, tipo)
        f1_bio=result["list_bert"]
        
        f1_total+=f1_bio
        
        accuracies[tipo].append(acc)
    print([round(v.item(),4) for v in f1_total])
        # f1s[tipo].append(f1)
# for model_name in models:
#     print(model_name)
#     model_name_str=model_name
#     accuracies["model"].append(model_name_str)
#     f1s["model"].append(model_name_str)
#     for tipo in questions:
#         print(tipo)
#         with open(f"Generativos/results_dic_{tipo}/{"microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001816batch".split("/")[-1]}truth.pkl", "rb") as file:
#                 truth=pickle.load(file)
#         with open(f"Generativos/results_dic_{tipo}/{model_name_str.split("/")[-1]}.pkl", "rb") as file:
#                 output=pickle.load(file)
#         acc=calculate_acc(output,truth)
#         f1=calculate_f1(output,truth,average=average)
#         accuracies[tipo].append(acc)
#         f1s[tipo].append(f1)

microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05616batch
age
181-536-521-20221217-101331_age Real:  46 Predicted:  no response
211
212


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 212
Token F1: 0.9953
Exact Match: 0.9953
ROUGE-L: 0.9953
BERTScore F1: 0.9976
BERTScore R: 0.9982
history
144-007-361-20230619-165847_history Real:  reduction mammoplasty Predicted:  no response
154-571-251-20230417-111004_history Real:  history of bilateral ca, right breast with la in 2001 and left breast in 2012 Predicted:  no response
154-571-251-20230928-113422_history Real:  bilateral breast cancer, right breast with la in 2001 and left breast in 2012 Predicted:  no response
227-053-436-20230714-113137_history Real:  operated twice in 2022 for left intraductal papillomas Predicted:  no response
342-835-546-20230522-124057.921_history Real:  history of previous image - guided biopsy in right breast performed in another center with a benign pathological anatomical result Predicted:  no response
400-124-635-20221219-134655.203_history Real:  history of follicular lymphoma in oct. 2021 Predicted:  no response
646-504-461-20240506-082527_history Real:  bilateral mastectomy an

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 186
Token F1: 0.9508
Exact Match: 0.9462
ROUGE-L: 0.9558
BERTScore F1: 0.9628
BERTScore R: 0.9558
parenchymal_distortion
043-372-637-20220620-135102_parenchymal_distortion Real:  focal asymmetry in the upper outer quadrant of the left breast, which corresponds to an accumulation of fibroglandular tissue that remains stable with respect to previous controls since 2020 Predicted:  focal asymmetry in the upper outer quadrant of the left breast
058-197-862-20230511-152451_parenchymal_distortion Real:  focal asymmetry persists in the upper quadrant - upper outer quadrant unit of the left breast, currently there is a slight associated parenchymal distortion Predicted:  focal asymmetry persists in the upper quadrant - upper outer quadrant unit of the left breast
095-672-078-20231010-170614_parenchymal_distortion Real:  parenchymal distortion in upper interquadrant - upper inner quadrant left breast, with associated pleomorphic microcalcifications Predicted:  parenchymal distortion i

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 186
Token F1: 0.9533
Exact Match: 0.9355
ROUGE-L: 0.9588
BERTScore F1: 0.9623
BERTScore R: 0.9521
nodules_echo_size
008-536-472-20211122-103547_nodules_echo_size_1 Real:  23 mm Predicted:  no response
145-861-121-20220621-123317.718_nodules_echo_size_1 Real:  no response Predicted:  6 - 7 mm
236-259-310-20230522-091930_nodules_echo_size_1 Real:  subcentimeter Predicted:  no response
446-217-126-20230612-114316.187_nodules_echo_size_1 Real:  7. 7 - 9. 7 mm Predicted:  no response
534-723-539-20231013-165228.835_nodules_echo_size_1 Real:  8 mm Predicted:  13 mm
646-504-461-20240506-082527_nodules_echo_size_1 Real:  no response Predicted:  7, 6 and 5 mm
766-258-279-20230831-132728.863_nodules_echo_size_1 Real:  21x8mm Predicted:  26 mm
75
82


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 82
Token F1: 0.9187
Exact Match: 0.9146
ROUGE-L: 0.9207
BERTScore F1: 0.9477
BERTScore R: 0.9466
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4909, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 212
Token F1: 0.9953
Exact Match: 0.9953
ROUGE-L: 0.9953
BERTScore F1: 0.9976
BERTScore R: 0.9982
history
008-536-472-20211122-103547_history Real:  history of left breast fibroadenoma excision that had grown in 2017 Predicted:  history
144-007-361-20230619-165847_history Real:  reduction mammoplasty Predicted:  currently in treatment of a glioblastoma
154-571-251-20230417-111004_history Real:  history of bilateral ca, right breast with la in 2001 and left breast in 2012 Predicted:  no response
154-571-251-20230928-113422_history Real:  bilateral breast cancer, right breast with la in 2001 and left breast in 2012 Predicted:  no response
183-814-707-20220425-122932_history Real:  history of bilateral breast reduction surgery Predicted:  no response
227-053-436-20230714-113137_history Real:  operated twice in 2022 for left intraductal papillomas Predicted:  no response
400-124-635-20221219-134655.203_history Real:  history of follicular lymphoma in oct. 2021 Predicted:  no resp

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 186
Token F1: 0.9455
Exact Match: 0.9355
ROUGE-L: 0.9502
BERTScore F1: 0.9576
BERTScore R: 0.9515
parenchymal_distortion
043-372-637-20220620-135102_parenchymal_distortion Real:  focal asymmetry in the upper outer quadrant of the left breast, which corresponds to an accumulation of fibroglandular tissue that remains stable with respect to previous controls since 2020 Predicted:  focal asymmetry in the upper outer quadrant of the left breast
058-197-862-20230511-152451_parenchymal_distortion Real:  focal asymmetry persists in the upper quadrant - upper outer quadrant unit of the left breast, currently there is a slight associated parenchymal distortion Predicted:  focal asymmetry persists in the upper quadrant - upper outer quadrant unit of the left breast
095-672-078-20231010-170614_parenchymal_distortion Real:  parenchymal distortion in upper interquadrant - upper inner quadrant left breast, with associated pleomorphic microcalcifications Predicted:  no response
153-234-011-

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 186
Token F1: 0.9547
Exact Match: 0.9355
ROUGE-L: 0.9584
BERTScore F1: 0.9598
BERTScore R: 0.9500
nodules_echo_size
001-689-096-20230529-131355_nodules_echo_size_1 Real:  8 - 9 mm Predicted:  no response
145-861-121-20220621-123317.718_nodules_echo_size_1 Real:  no response Predicted:  6 - 7 mm
236-259-310-20230522-091930_nodules_echo_size_1 Real:  subcentimeter Predicted:  no response
261-477-582-20230602-103817_nodules_echo_size_1 Real:  7 mm Predicted:  6x3 mm
290-754-149-20230823-105957_nodules_echo_size_1 Real:  9x5mm Predicted:  no response
446-217-126-20230612-114316.187_nodules_echo_size_1 Real:  7. 7 - 9. 7 mm Predicted:  no response
534-723-539-20231013-165228.835_nodules_echo_size_1 Real:  8 mm Predicted:  no response
646-504-461-20240506-082527_nodules_echo_size_1 Real:  no response Predicted:  7, 6 and 5 mm
678-939-808-20220404-130020.625_nodules_echo_size_1 Real:  6 mm Predicted:  no response
766-258-279-20230831-132728.863_nodules_echo_size_1 Real:  21x8mm Pred

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 82
Token F1: 0.8821
Exact Match: 0.8780
ROUGE-L: 0.8841
BERTScore F1: 0.9105
BERTScore R: 0.9084
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4909, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 212
Token F1: 0.9953
Exact Match: 0.9953
ROUGE-L: 0.9953
BERTScore F1: 0.9976
BERTScore R: 0.9982
history
008-536-472-20211122-103547_history Real:  history of left breast fibroadenoma excision that had grown in 2017 Predicted:  history of left breast fibroadenoma excision that had grown in 2017. a comparative study is performed with respect to previous scans. comments : mammography : craniocaudal and oblique projection of both breasts with tomosynthesis and synthesized image ( c - view ). high density and heterogeneous breasts ( acr c ) which significantly reduces the sensitivity for the detection of underlying lesions. however, no obvious parenchymal distortions or suspicious calcifications are observed. benign calcifications in the right breast. circumscribed, oval and hypodense bilateral nodules, stable with respect to previous studies
046-630-291-20230619-112356_history Real:  no response Predicted:  reduction mammoplasty
144-007-361-20230619-165847_history Real:  reduct

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 186
Token F1: 0.9435
Exact Match: 0.9247
ROUGE-L: 0.9480
BERTScore F1: 0.9564
BERTScore R: 0.9525
parenchymal_distortion
043-372-637-20220620-135102_parenchymal_distortion Real:  focal asymmetry in the upper outer quadrant of the left breast, which corresponds to an accumulation of fibroglandular tissue that remains stable with respect to previous controls since 2020 Predicted:  focal asymmetry in the upper outer quadrant of the left breast
058-197-862-20230511-152451_parenchymal_distortion Real:  focal asymmetry persists in the upper quadrant - upper outer quadrant unit of the left breast, currently there is a slight associated parenchymal distortion Predicted:  focal asymmetry persists in the upper quadrant - upper outer quadrant unit of the left breast
095-672-078-20231010-170614_parenchymal_distortion Real:  parenchymal distortion in upper interquadrant - upper inner quadrant left breast, with associated pleomorphic microcalcifications Predicted:  parenchymal distortion i

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 186
Token F1: 0.9518
Exact Match: 0.9409
ROUGE-L: 0.9559
BERTScore F1: 0.9637
BERTScore R: 0.9547
nodules_echo_size
095-672-078-20231010-170614_nodules_echo_size_1 Real:  7x10mm Predicted:  no response
145-861-121-20220621-123317.718_nodules_echo_size_1 Real:  no response Predicted:  6 - 7 mm
236-259-310-20230522-091930_nodules_echo_size_1 Real:  subcentimeter Predicted:  no response
261-477-582-20230602-103817_nodules_echo_size_1 Real:  7 mm Predicted:  6x3 mm
300-089-605-20230622-174812_nodules_echo_size_1 Real:  33 x. 18 mm Predicted:  33 x
520-049-440-20230417-114625_nodules_echo_size_1 Real:  9. 7 mm Predicted:  no response
534-723-539-20231013-165228.835_nodules_echo_size_1 Real:  8 mm Predicted:  13 mm
646-504-461-20240506-082527_nodules_echo_size_1 Real:  no response Predicted:  7, 6 and 5 mm
678-939-808-20220404-130020.625_nodules_echo_size_1 Real:  6 mm Predicted:  no response
747-999-147-20230404-185900_nodules_echo_size_1 Real:  36mm Predicted:  no response
949-17

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Examples: 82
Token F1: 0.8854
Exact Match: 0.8659
ROUGE-L: 0.8923
BERTScore F1: 0.9232
BERTScore R: 0.9256
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4909, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,