In [None]:
import os
import pickle
import pandas as pd
from collections import defaultdict
from bert_score import score as bert_score_fn

import re
import string
import evaluate

In [None]:
from sklearn.metrics import f1_score, accuracy_score
def calculate_f1(pred,truth,point=False,average='macro'):
    """Classification F1 between predicted and reference answers.

    Only keys present in both ``truth`` and ``pred`` are scored. Both sides are
    lower-cased before comparison; reference values are coerced with ``str``.

    Args:
        pred: mapping key -> predicted answer string.
        truth: mapping key -> reference answer (any type; converted with ``str``).
        point: post-processing applied to each prediction before scoring:
            ``True`` keeps the text up to the first "." and re-appends ".",
            ``"mm"`` keeps the text up to the first "mm" and appends "mm.",
            any other value leaves the prediction untouched.
        average: forwarded to ``sklearn.metrics.f1_score``.

    Returns:
        The value returned by ``sklearn.metrics.f1_score``.
    """
    # Imported here under an alias on purpose: a later cell defines a
    # token-level ``f1_score(prediction, ground_truth)`` that shadows the
    # module-level sklearn import, which made this call fail with a TypeError
    # (unexpected keyword ``average``) once that cell had been executed.
    from sklearn.metrics import f1_score as sklearn_f1_score

    y_true, y_pred = [], []
    for key, real in truth.items():
        if key not in pred:
            continue

        predicted = pred[key]
        # Equality (not identity) is kept so existing callers behave the same.
        if point == True:  # noqa: E712
            predicted = predicted.split(".")[0] + "."
        elif point == "mm":
            predicted = predicted.split("mm")[0] + "mm."

        y_pred.append(predicted.lower())
        y_true.append(str(real).lower())

    return sklearn_f1_score(y_true, y_pred, average=average)
def calculate_acc(pred,truth,point=False):
    """Case-insensitive exact-match accuracy over the keys shared by both dicts.

    Every mismatch is printed (key, reference, prediction), followed by the
    number of correct answers and the number of scored examples.

    Args:
        pred: mapping key -> predicted answer string.
        truth: mapping key -> reference answer (any type; converted with ``str``).
        point: post-processing applied to each prediction before comparison:
            ``True`` keeps the text up to the first "." and re-appends ".",
            ``"mm"`` keeps the text up to the first "mm" and appends "mm.",
            any other value leaves the prediction untouched.

    Returns:
        ``correct / total`` as a float, or NaN when ``pred`` and ``truth`` share
        no keys (previously a ZeroDivisionError). NaN is used rather than 0 so
        that an unscored run is not mistaken for a run that got everything wrong.
    """
    correct = 0
    total = 0
    for key, real in truth.items():
        if key not in pred:
            continue

        total += 1
        predicted = pred[key]
        # Equality (not identity) is kept so existing callers behave the same.
        if point == True:  # noqa: E712
            predicted = predicted.split(".")[0] + "."
        elif point == "mm":
            predicted = predicted.split("mm")[0] + "mm."

        if predicted.lower() == str(real).lower():
            correct += 1
        else:
            print(key,"Real: ",real,"Predicted: ",predicted)

    print(correct)
    print(total)
    if total == 0:
        return float("nan")
    return correct/total


In [None]:
# Metric handles from the Hugging Face `evaluate` library, loaded in this order.
# `rouge` is used by calculate_metrics; `exact_match` is not referenced by the
# cells visible in this notebook.
rouge, exact_match = (
    evaluate.load(metric_id) for metric_id in ("rouge", "exact_match")
)

def normalize_answer(s):
    """Canonicalise an answer string for lenient comparison.

    Steps, applied in this order: lower-case, strip every character in
    ``string.punctuation``, blank out the standalone words "a", "an" and "the",
    then collapse all runs of whitespace to single spaces (trimming the ends).
    """
    lowered = s.lower()
    punctuation_free = lowered.translate(str.maketrans('', '', string.punctuation))
    article_free = re.sub(r'\b(a|an|the)\b', ' ', punctuation_free)
    return ' '.join(article_free.split())

def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are passed through ``normalize_answer`` and split on
    whitespace. Overlap is counted as a multiset intersection, so repeated
    tokens are matched one-for-one; the previous set-based count scored
    identical answers containing a repeated token below 1.0.

    NOTE: this name shadows ``sklearn.metrics.f1_score`` imported in an earlier
    cell; code that needs the sklearn function must import it under an alias.

    Args:
        prediction: predicted answer text.
        ground_truth: reference answer text.

    Returns:
        ``1``/``0`` (int) when either side has no tokens after normalisation
        (1 only if both are empty), ``0`` when no token is shared, otherwise
        the harmonic mean of token precision and recall as a float.
    """
    from collections import Counter

    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(prediction_tokens == ground_truth_tokens)

    # Counter & Counter keeps the minimum count per token (multiset overlap).
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0

    precision = num_same / len(prediction_tokens)
    recall = num_same / len(ground_truth_tokens)
    return 2 * precision * recall / (precision + recall)

def calculate_metrics(pred, truth, tipo):
    """Score generated answers against references with several text metrics.

    Only keys present in both ``truth`` and ``pred`` are scored. A trailing "."
    is appended to every prediction and reference before scoring. A summary of
    all metrics is printed.

    Args:
        pred: mapping key -> predicted answer.
        truth: mapping key -> reference answer.
        tipo: question type; accepted for call-site compatibility but not used
            by the computation.

    Returns:
        dict with ``token_f1`` and ``exact_match`` (means over examples),
        ``rouge_l`` (the "rougeL" entry of the ROUGE result), ``bert_score_f1``
        and ``bert_score_R`` (means of the BERTScore F1 / recall outputs) and
        ``list_bert`` (the per-example BERTScore F1 object as returned by
        ``bert_score_fn``).

    Raises:
        ValueError: if ``pred`` and ``truth`` share no keys, since every mean
            below would otherwise divide by zero.
    """
    preds, refs = [], []
    for key, real in truth.items():
        if key not in pred:
            continue
        # str() mirrors calculate_f1 / calculate_acc, which also coerce the
        # reference; without it a non-string answer breaks the concatenation.
        preds.append(str(pred[key]) + ".")
        refs.append(str(real) + ".")

    if not preds:
        raise ValueError("calculate_metrics: `pred` and `truth` share no keys; nothing to score")

    # Per-example token F1 and normalised exact match.
    token_f1s = [f1_score(p, r) for p, r in zip(preds, refs)]
    ems = [int(normalize_answer(p) == normalize_answer(r)) for p, r in zip(preds, refs)]
    token_f1 = sum(token_f1s)/len(token_f1s)
    em = sum(ems)/len(ems)

    rouge_scores = rouge.compute(predictions=preds, references=refs)
    rouge_l = rouge_scores["rougeL"]

    # BERTScore (precision is returned by the library but not reported here).
    _precision, R, F1 = bert_score_fn(preds, refs, lang="en", verbose=False, rescale_with_baseline=True)
    bert_f1 = F1.mean().item()
    bert_R = R.mean().item()

    print(f"Examples: {len(preds)}")
    print(f"Token F1: {token_f1:.4f}")
    print(f"Exact Match: {em:.4f}")
    print(f"ROUGE-L: {rouge_l:.4f}")
    print(f"BERTScore F1: {bert_f1:.4f}")
    print(f"BERTScore R: {bert_R:.4f}")

    return {
        "token_f1": token_f1,
        "exact_match": em,
        "rouge_l": rouge_l,
        "bert_score_f1": bert_f1,
        "bert_score_R": bert_R,
        "list_bert": F1,
    }
    

In [None]:
# Hyper-parameter sweep: score every (model, epochs, learning-rate) run on each
# question type and collect accuracy / F1 per run.
accuracies=defaultdict(list)
f1s=defaultdict(list)
questions=["age","history","parenchymal_distortion","nodules_echo_size"]
average="macro"
epochs=[3,4,5,6,7,8]
lrs=[1e-4,5e-5,2e-5,1e-5,5e-6]
models=["microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract","dmis-lab/biobert-base-cased-v1.1","bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"]

for model_name in models:
    for epoch in epochs:
        for lr in lrs:
            # Run id: "<model id><lr><epochs>16batch" (presumably batch size 16 - confirm).
            model_name_str=model_name+str(lr)+str(epoch)+"16batch"
            # Result files are named after the part following the last "/".
            # Computed outside the f-strings because nesting the same quote
            # type inside an f-string only parses on Python 3.12+.
            file_stem=model_name_str.split("/")[-1]
            accuracies["model"].append(model_name_str)
            f1s["model"].append(model_name_str)
            for tipo in questions:
                print(tipo)
                results_dir=f"Generativos/results_dic_{tipo}"
                # NOTE(security): pickle.load can execute arbitrary code; only
                # load result files produced by a trusted run.
                with open(f"{results_dir}/{file_stem}truth.pkl", "rb") as file:
                    truth=pickle.load(file)
                with open(f"{results_dir}/{file_stem}.pkl", "rb") as file:
                    output=pickle.load(file)
                acc=calculate_acc(output,truth)
                f1=calculate_f1(output,truth,average=average)
                accuracies[tipo].append(acc)
                f1s[tipo].append(f1)
            

In [None]:
# Show the full table instead of pandas' truncated view.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Accuracy per run (rows, indexed by run id) with a row-wise mean column.
data = (
    pd.DataFrame(accuracies)
    .set_index("model")
    .assign(average=lambda table: table.mean(axis=1))
)
data

In [None]:
# Show the full table instead of pandas' truncated view.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# F1 per run (rows, indexed by run id) with a row-wise mean column.
data = (
    pd.DataFrame(f1s)
    .set_index("model")
    .assign(average=lambda table: table.mean(axis=1))
)
data

In [None]:
# NOTE(review): these handles are already loaded by an identical earlier cell.
# Metric handles from the Hugging Face `evaluate` library, loaded in this order.
rouge, exact_match = (
    evaluate.load(metric_id) for metric_id in ("rouge", "exact_match")
)

def normalize_answer(s):
    """Canonicalise an answer string for lenient comparison.

    Steps, applied in this order: lower-case, strip every character in
    ``string.punctuation``, blank out the standalone words "a", "an" and "the",
    then collapse all runs of whitespace to single spaces (trimming the ends).

    NOTE(review): an identical definition exists in an earlier cell; this copy
    silently replaces it when the cell runs.
    """
    text = s.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    tokens = text.split()
    return ' '.join(tokens)

def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are passed through ``normalize_answer`` and split on
    whitespace. Overlap is counted as a multiset intersection, so repeated
    tokens are matched one-for-one; the previous set-based count scored
    identical answers containing a repeated token below 1.0.

    NOTE: this name shadows ``sklearn.metrics.f1_score`` imported in an earlier
    cell; code that needs the sklearn function must import it under an alias.
    NOTE(review): an earlier cell defines a function with this same name; this
    copy replaces it when the cell runs.

    Args:
        prediction: predicted answer text.
        ground_truth: reference answer text.

    Returns:
        ``1``/``0`` (int) when either side has no tokens after normalisation
        (1 only if both are empty), ``0`` when no token is shared, otherwise
        the harmonic mean of token precision and recall as a float.
    """
    from collections import Counter

    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(prediction_tokens == ground_truth_tokens)

    # Counter & Counter keeps the minimum count per token (multiset overlap).
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0

    precision = num_same / len(prediction_tokens)
    recall = num_same / len(ground_truth_tokens)
    return 2 * precision * recall / (precision + recall)

def calculate_metrics(pred, truth, tipo):
    """Score generated answers against references with several text metrics.

    Only keys present in both ``truth`` and ``pred`` are scored. A trailing "."
    is appended to every prediction and reference before scoring. A summary of
    all metrics is printed.

    NOTE(review): an earlier cell defines a function with this same name; this
    copy replaces it when the cell runs.

    Args:
        pred: mapping key -> predicted answer.
        truth: mapping key -> reference answer.
        tipo: question type; accepted for call-site compatibility but not used
            by the computation.

    Returns:
        dict with ``token_f1`` and ``exact_match`` (means over examples),
        ``rouge_l`` (the "rougeL" entry of the ROUGE result), ``bert_score_f1``
        and ``bert_score_R`` (means of the BERTScore F1 / recall outputs) and
        ``list_bert`` (the per-example BERTScore F1 object as returned by
        ``bert_score_fn``).

    Raises:
        ValueError: if ``pred`` and ``truth`` share no keys, since every mean
            below would otherwise divide by zero.
    """
    preds, refs = [], []
    for key, real in truth.items():
        if key not in pred:
            continue
        # str() mirrors calculate_f1 / calculate_acc, which also coerce the
        # reference; without it a non-string answer breaks the concatenation.
        preds.append(str(pred[key]) + ".")
        refs.append(str(real) + ".")

    if not preds:
        raise ValueError("calculate_metrics: `pred` and `truth` share no keys; nothing to score")

    # Per-example token F1 and normalised exact match.
    token_f1s = [f1_score(p, r) for p, r in zip(preds, refs)]
    ems = [int(normalize_answer(p) == normalize_answer(r)) for p, r in zip(preds, refs)]
    token_f1 = sum(token_f1s)/len(token_f1s)
    em = sum(ems)/len(ems)

    rouge_scores = rouge.compute(predictions=preds, references=refs)
    rouge_l = rouge_scores["rougeL"]

    # BERTScore (precision is returned by the library but not reported here).
    _precision, R, F1 = bert_score_fn(preds, refs, lang="en", verbose=False, rescale_with_baseline=True)
    bert_f1 = F1.mean().item()
    bert_R = R.mean().item()

    print(f"Examples: {len(preds)}")
    print(f"Token F1: {token_f1:.4f}")
    print(f"Exact Match: {em:.4f}")
    print(f"ROUGE-L: {rouge_l:.4f}")
    print(f"BERTScore F1: {bert_f1:.4f}")
    print(f"BERTScore R: {bert_R:.4f}")

    return {
        "token_f1": token_f1,
        "exact_match": em,
        "rouge_l": rouge_l,
        "bert_score_f1": bert_f1,
        "bert_score_R": bert_R,
        "list_bert": F1,
    }
    

In [None]:
# Detailed evaluation of one selected run per model family: accuracy per
# question type plus the text-generation metrics from calculate_metrics.
accuracies=defaultdict(list)
f1s=defaultdict(list)
questions=["age","history","parenchymal_distortion","nodules_echo_size"]
average="macro"
epochs=[3,4,5,6,7,8]
lrs=[1e-4,5e-5,2e-5,1e-5,5e-6]

# Selected run ids, in the "<model id><lr><epochs>16batch" format built by the
# sweep cell. An earlier selection assigned here was overwritten before use
# and has been dropped.
BioMedBERT="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05616batch"
BioBERT="dmis-lab/biobert-base-cased-v1.15e-05616batch"
BlueBERT="bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-120.0001716batch"
models=[BioMedBERT, BioBERT,BlueBERT]

for model_name in models:
    print(model_name)
    # Per-example BERTScore F1 values, concatenated across all question types.
    f1_total=[]
    model_name_str=model_name
    # Result files are named after the part following the last "/". Computed
    # outside the f-strings because nesting the same quote type inside an
    # f-string only parses on Python 3.12+.
    file_stem=model_name_str.split("/")[-1]
    accuracies["model"].append(model_name_str)
    # Only the run id is recorded in f1s here; no per-question F1 is collected.
    f1s["model"].append(model_name_str)
    for tipo in questions:
        print(tipo)
        results_dir=f"Generativos/results_dic_{tipo}"
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # result files produced by a trusted run.
        with open(f"{results_dir}/{file_stem}truth.pkl", "rb") as file:
            truth=pickle.load(file)
        with open(f"{results_dir}/{file_stem}.pkl", "rb") as file:
            output=pickle.load(file)
        acc=calculate_acc(output,truth)
        result=calculate_metrics(output, truth, tipo)
        f1_bio=result["list_bert"]

        # Explicit form of the former `list += tensor`: appends each element.
        f1_total.extend(f1_bio)

        accuracies[tipo].append(acc)
    print([round(v.item(),4) for v in f1_total])