In [None]:
import os

# Identifier of the GPU this kernel may use; valid values are 0, 1, 2 or 3.
gpu_device = 0
# Expose only that GPU to CUDA-based libraries loaded afterwards.
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_device}"

from numba import cuda

# With a single device exposed, the selected index is 0 regardless of
# the value of gpu_device.
cuda.select_device(0)

In [None]:
import pandas as pd
import re
import random
import numpy as np
import math
import pickle
from sklearn.metrics import classification_report, accuracy_score
from datasets import Dataset,concatenate_datasets,load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch

In [None]:
import pandas as pd
import pickle

# Annotated ground truth, indexed by its "Report" column.
ground_truth = pd.read_excel("data/data_corrected_english.xlsx", index_col="Report")
# Drop the rows flagged with "Yes" in the "Eliminar" column.
ground_truth = ground_truth.loc[ground_truth["Eliminar"].ne("Yes")]

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open("data/report_data_q_a_ingles_v2.pkl", mode="rb") as file:
    report_data = pickle.load(file)

def fix_brackets_spaces(texto):
    '''
        Pad brackets and Spanish/English question and exclamation marks
        with spaces so that they are separated from the neighbouring words.

        A space is inserted before every opening symbol ("(", "[", "¿", "¡")
        and after every closing symbol (")", "]", "?", "!"). According to the
        original note, this improves Spacy tokenization.

        Args:
            texto (str): Text to pad.

        Returns:
            str: The padded text. Existing spaces are kept, so a symbol that
            was already separated ends up with two spaces next to it.
    '''
    # Opening symbols get a leading space. "¡" belongs here (it opens an
    # exclamation); it was previously swapped with the closing "!".
    texto = re.sub(r'([(\[¿¡])', r' \1', texto)
    # Closing symbols get a trailing space.
    texto = re.sub(r'([)\]?!])', r'\1 ', texto)

    return texto

def preprocess_text(text):
    """Apply the report clean-up steps to ``text`` and return the result.

    Currently the only step is ``fix_brackets_spaces``.
    """
    return fix_brackets_spaces(text)

def visualize_errors(valid_dataset,valid_targets,validation_predictions,keys):
    """Print every example whose predicted label differs from its true label.

    Args:
        valid_dataset: Iterable with the original texts.
        valid_targets: Iterable with the true labels.
        validation_predictions: Iterable with the predicted labels.
        keys: Iterable with the identifier of each example.

    Returns:
        None. For each mismatch the key, the text, the predicted label and
        the true label are written to standard output.
    """
    # One row per example so that the mismatches can be filtered at once.
    table = pd.DataFrame({
        'key': list(keys),
        'Text': list(valid_dataset),
        'True Label': list(valid_targets),
        'Predicted Label': list(validation_predictions),
    })

    # Keep only the rows where the model was wrong.
    wrong = table[table['True Label'] != table['Predicted Label']]

    # (heading printed, column shown under it), in output order.
    sections = (
        ("EJEMPLO", "Text"),
        ("PREDICTED", "Predicted Label"),
        ("TRUE", "True Label"),
    )
    for _, failure in wrong.iterrows():
        print(failure["key"])
        for heading, column in sections:
            print(heading)
            print(failure[column])


def train_clean(X,Y):
    """Return a shuffled copy of ``X`` with the matching ``label`` column of ``Y``.

    Args:
        X: DataFrame with the training examples.
        Y: DataFrame with a "label" column, addressable by the index of ``X``.

    Returns:
        A new DataFrame with the rows of ``X`` in shuffled order plus a
        "label" column taken from ``Y``. ``X`` itself is not modified.
    """
    # Also reset the global ``random`` generator, as callers may rely on it.
    random.seed(1)
    # Fixed random_state so that the shuffle is reproducible.
    shuffled = X.sample(frac=1, random_state=1)
    labels = Y.loc[shuffled.index]
    shuffled["label"] = labels["label"]
    return shuffled

In [None]:
# Question identifiers. The nodule-specific ones are listed without the
# "_<n>" position suffix.
questions = [
    "age",
    "tipo",
    "tecnica",
    "family",
    "history",
    "symtomatic",
    "prosthesis",
    "birads",
    "density_mammo",
    "calcifications_benign",
    "density_echo",
    "ganglio_mamo",
    "lymph_benign",
    "lymph_suspicious",
    "parenchymal_distortion",
    "simple_cyst",
    "ductal_ectasia",
    "nodules_echo_num",
    "nodules_echo_description",
    "nodules_echo_shape",
    "nodules_echo_margin",
    "nodules_echo_echogenicity",
    "nodules_echo_location",
    "nodules_echo_size",
    "nodules_echo_known",
    "nodules_echo_stable",
]

# Label vocabularies.
# NOTE(review): presumably the closed answer sets of the matching questions;
# in the code visible in this section only DENSITY_ECHO is read - confirm.

# Report-level labels
TIPO = [
    "biopsy report",
    "nodal staging ultrasound report",
    "normal control or revision report",
]
TECNICA = [
    "only ultrasound study",
    "only mammography study",
    "mammography and ultrasound",
]
FAMILY = ["no family history", "first degree", "second degree"]
PROSTHESIS = ["no prosthesis", "yes prosthesis"]
BIRADS = [
    "BI-RADS 0",
    "BI-RADS 1",
    "BI-RADS 2",
    "BI-RADS 3",
    "BI-RADS 4A",
    "BI-RADS 4B",
    "BI-RADS 4C",
    "BI-RADS 5",
    "BI-RADS 6",
]

# Mammography labels
DENSITY_MAMMO = ["ACR A", "ACR B", "ACR C", "ACR D", "unknown density mammo"]
CALCIFICATIONS_BENIGN = ["no calcifications", "yes calcifications"]
GANGLIO_MAMO = ["no ganglio", "yes ganglio"]

# Ultrasound labels
DENSITY_ECHO = [
    "homogeneous fibroglandular",
    "heterogeneous fibroglandular",
    "fibroglandular and fat",
    "homogeneous fatty",
    "unknown density echo",
]
LYMPH_BENIGN = ["no lymph benign", "yes lymph benign"]
LYMPH_SUSPICIOUS = ["no lymph suspicious", "yes lymph suspicious"]
SIMPLE_CYST = ["no cyst", "yes cyst"]
DUCTAL_ECTASIA = ["no ectasia", "yes ectasia"]

# Ultrasound nodule labels
NODULES_ECHO = ["no nodules", "yes nodules"]
NODULES_SHAPE = ["oval", "round", "lobulated", "irregular", "unknown shape"]
NODULES_MARGIN = [
    "circumscribed",
    "spiculated",
    "indistinct",
    "not circumscribed",
    "unknown margin",
]
NODULES_ECHOGENICITY = [
    "hypoechoic",
    "isoechoic",
    "heterogeneous",
    "complex cystic and solid",
    "unknown echogenicity",
]
NODULES_KNOWN = ["no known", "yes known"]
NODULES_STABLE = ["grown stable", "shrunk stable", "yes stable"]


import gc
from torch.nn.utils.rnn import pad_sequence
def _build_prompt_catalog():
    """Build the per-question prompt texts.

    Returns:
        tuple: ``(questions, hints, options)``, three dicts keyed by question
        identifier (nodule questions carry a ``_<position>`` suffix).
        ``questions`` holds the question sentence, ``hints`` the guidance on
        where the answer is usually found and ``options`` the expected answer
        format. Only ``questions`` is inserted into the prompts built by
        ``flatten_and_filter_dataset``; the other two are kept for reference.
    """
    questions = {}
    hints = {}
    options = {}
    yes_no = "answer with one of the following options: 'yes' or 'no'."

    questions["age"] = "does the patient's age appear in the following breast medical report?"
    hints["age"] = "search for numbers, but do not mistake it with the age of a familiar. If a number appears without any context between two dots it is surely the age."
    options["age"] = "answer only the age of the patient."

    questions["tipo"] = 'is the following breast medical report a biopsy report or a nodal staging ultrasound report?'
    hints["tipo"] = "biopsy reports are normally Image-Guided Biopsy and is normally said that they are referred to the hospital for biopsy. Nodal staging ultrasound reports can also be written as 'axilla ultrasound'. If it is any of these it will be written in the beginning of the report, normally in the used technique. These kind of reports are only ultrasound. In these reports no final BI-RADS is given."
    options["tipo"] = "answer with one of the following options: 'biopsy report', 'nodal staging ultrasound report' or 'normal control or revision report'."

    questions["tecnica"] = 'what diagnostic technique was used in the following breast medical report?'
    hints["tecnica"] = "biopsy reports, simple cysts and analysis of lymph or axillary nodes are only seen on ultrasound. On the other hand, if the ACR density is given or parenchymal distortions are analysed, the technique will be a mammogram. Tomosyntesis is a mammography type. The report may include an ultrasound examination, a mammography examination or both."
    options["tecnica"] = "answer with one of the following options: 'only ultrasound study', 'only mammography study' or 'mammography and ultrasound'."

    questions["family"] = "does the patient have any family history in the following breast medical report?"
    hints["family"] = "family history of breast cancer is categorized based on the degree of relatives affected: First-degree relatives: Parents, siblings, or children. Second-degree relatives: Grandparents, aunts, uncles, nieces, nephews, or half-siblings. Third-degree relatives: Great-grandparents, great-aunts/uncles, or first cousins."
    options["family"] = "answer with one of the following options: 'first degree', 'second degree', 'third degree' or 'no family history'."

    questions["history"] = "does the patient have any non-familiar history in the following breast medical report?"
    hints["history"] = "check for the history at the beginning of the report. Normally it is a previous biopsy result, mastectomy or cancer. It normally starts with 'history of ...'."
    options["history"] = "answer retrieving the information directly from the report or with 'no history was found'."

    questions["symtomatic"] = "is the reason for the consultation that the patient is symptomatic in the following breast medical report?"
    hints["symtomatic"] = "the answer is at the beginning of the report, in the reason for consultation. It is normally a palpable lump, lumpectomy or nodule, sometimes painful."
    options["symtomatic"] = "answer retrieving the information directly from the report or with 'non-symptomatic consultation'."

    questions["prosthesis"] = "does the patient have a prosthesis in the following breast medical report?"
    hints["prosthesis"] = "it is normally clearly indicated at the beginning of the report. Sometimes it is written as implants instead of prosthesis."
    options["prosthesis"] = yes_no

    questions["birads"] = "what is the final BI-RADS classification given to the patient in the following breast medical report?"
    hints["birads"] = "the final BI-RADS of the patient is given in the conclusions of the report, normally at the end."
    options["birads"] = "answer with one of the following options: 'BI-RADS 0', 'BI-RADS 1', 'BI-RADS 2', 'BI-RADS 3', 'BI-RADS 4A', 'BI-RADS 4B', 'BI-RADS 4C', 'BI-RADS 5' or 'unknown'."

    questions["density_mammo"] = "what is the breast density found in the mammography study of the following breast medical report?"
    hints["density_mammo"] = "breast density in mammography is classified into four categories: ACR A (= Almost entirely fatty), ACR B (= Scattered areas of fibroglandular density), ACR C (= Heterogeneously dense), ACR D (= Extremely or very dense breasts). Sometimes it is written as 'density type x'. It can also be written with their real meaning (very dense breasts = C) and not with the A, B, C, D classification. Focus only on density."
    options["density_mammo"] = "answer with one of the following options: 'ACR A', 'ACR B', 'ACR C', ACR D' or 'unknown'."

    questions["density_echo"] = "what is the breast density found in the ultrasound study of the following breast medical report?"
    hints["density_echo"] = "breast composition in ultrasound is classified into three categories: fibroglandular and fat (mixed distribution of fibroglandular and adipose tissue), heterogeneous fibroglandular (predominantly fibroglandular tissue with varying echogenicity and scattered fat areas), homogeneous fatty (uniform fatty tissue with consistent echogenicity and minimal fibroglandular content), and homogeneous fibroglandular (uniform fibroglandular tissue with consistent echogenicity and minimal fat content)."
    options["density_echo"] = "answer with one of the following options: 'fibroglandular and fat', 'heterogeneous fibroglandular', 'homogeneous fibroglandular', 'homogeneous fatty' or 'unknown'."

    questions["calcifications_benign"] = "does the following breast medical report mention the appearence of benign calcifications in the mammography exam?"
    hints["calcifications_benign"] = "Consider only benign calcifications in the mammography."
    options["calcifications_benign"] = yes_no

    questions["ganglio_mamo"] = "does the following breast medical report mention any lymph nodes in the mammography exam?"
    hints["ganglio_mamo"] = "Consider only lymph nodes that appear in the mammography."
    options["ganglio_mamo"] = yes_no

    questions["parenchymal_distortion"] = "does the following breast medical report mention any parenchymal distortion or asymmetry in the mammography exam?"
    hints["parenchymal_distortion"] = "If it has any it will appear in the results of the mammography exam using the words distortion, asymmetry or sometimes it can also be surgical changes."
    options["parenchymal_distortion"] = "answer retrieving the information directly from the report or with 'no'"

    questions["lymph_suspicious"] = "does the following breast medical report mention any suspicious axillary lymph nodes in the ultrasound exam?"
    hints["lymph_suspicious"] = "if a lymph node is suspicious the report will recomend a biopsy or Fine Needle Aspiration. A lymph node is considered suspicious when it has eccentric cortical thickening ≥ 3 mm, Short axis >10 mm, round shape, loss of fatty hilum, abnormal vascularity, or irregular margins—especially when associated with known malignancy or progressive enlargement. They can also be classified as UN3, UN4 or UN5. An exam may have both suspicious and benign lymph nodes, answer 'yes' in this case."
    options["lymph_suspicious"] = yes_no

    questions["lymph_benign"] = "does the following breast medical report mention any benign or not suspicious axillary lymph nodes in the ultrasound exam?"
    hints["lymph_benign"] = "if a lymph node is suspicious the report will recomend a biopsy or Fine Needle Aspiration. A lymph node is considered benign when it has uniform cortex < 3 mm, preserved fatty hilum, oval shape, no abnormal vascularity, no irregular margins and homogeneous internal echo pattern. Benign axillary nodes can be classified as UN1 or UN2. A reactive axillary node is not suspicious. An exam may have both suspicious and benign lymph nodes, answer 'yes' in this case."
    options["lymph_benign"] = yes_no

    questions["simple_cyst"] = "does the following breast medical report mention any simple cysts or microcysts in the ultrasound exam?"
    hints["simple_cyst"] = "The words symple cysts or microcysts will appear only in the ultrasound exam. Sometimes they can say that some of the cysts have echogenic content, but we still will consider them simple cysts and not nodules."
    options["simple_cyst"] = yes_no

    questions["ductal_ectasia"] = "does the following breast medical report mention any ductal ectasia in the ultrasound exam?"
    hints["ductal_ectasia"] = "The word ductal ectasia will appear only in the ultrasound exam."
    options["ductal_ectasia"] = yes_no

    questions["nodules_echo_num"] = "is there any nodule described in the ultrasound exam of the following breast medical report?"
    hints["nodules_echo_num"] = "do not consider if a nodule is described in the mammography exam or if it is in the axilla. The localization, echogenicity and size of the nodules are normally said."
    options["nodules_echo_num"] = yes_no

    # Text fragments shared by the nodule-specific hints.
    skip_mammo_and_axilla = "do not consider if a nodule is described in the mammography exam or if it is in the axilla. "
    skip_mammo = "do not consider if a nodule is described in the mammography exam. "
    grouped_nodules = "More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."

    ordinals = (
        "first", "second", "third", "fourth", "fifth",
        "sixth", "seventh", "eighth", "ninth", "tenth",
        "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
    )
    # Only the first described nodule gets questions; widen the range to
    # generate prompts for further nodules.
    for position in range(1, 2):
        ordinal = ordinals[position - 1]

        name = f"nodules_echo_description_{position}"
        questions[name] = f"which is the {ordinal} nodule described in the ultrasound exam of the following breast medical report?"
        hints[name] = skip_mammo_and_axilla + grouped_nodules
        options[name] = "answer retrieving the information directly from the report."

        name = f"nodules_echo_shape_{position}"
        questions[name] = f"what is the shape of the {ordinal} nodule described in the ultrasound exam of the following breast medical report?"
        hints[name] = skip_mammo_and_axilla + "Shapes can be 'oval', 'round', 'lobulated' and 'irregular'. Sometimes irregular is also used for the margin, but in this case it is written as 'irregular margin' or 'irregular borders'. " + grouped_nodules
        options[name] = "answer with one of the following options: 'oval', 'round', 'lobulated', 'irregular' or 'unknown'."

        name = f"nodules_echo_margin_{position}"
        questions[name] = f"what is the margin of the {ordinal} nodule described in the ultrasound exam of the following breast medical report?"
        hints[name] = skip_mammo_and_axilla + "Margin can be 'circumscribed' and 'not circumscribed'. Inside the not circumscribed we have 'spiculated', 'angulated', 'microlobulated' or 'indistinc' ('not defined') margins. Sometimes irregular is also used for the margin, but in this case it is written as 'irregular margin' or 'irregular borders', in this case classify it as 'not circumscribed'. " + grouped_nodules
        options[name] = "answer with one of the following options: 'circumscribed', 'not circumscribed', 'indefined', 'spiculated', 'angulated', 'microlobulated' or 'unknown'."

        name = f"nodules_echo_echogenicity_{position}"
        questions[name] = f"what is the echogenicity of the {ordinal} nodule described in the ultrasound exam of the following breast medical report?"
        hints[name] = skip_mammo_and_axilla + "Echogenicity can be 'anechoic', 'hypoechoic', 'heterogeneous' and 'complex cystic and solid'. " + grouped_nodules
        options[name] = "answer with one of the following options: 'hypoechoic', 'heterogeneous', 'anechoic', 'hyperecoic', 'isoechoic', 'complex cystic and solid' or 'unknown'."

        name = f"nodules_echo_location_{position}"
        questions[name] = f"In which location is the {ordinal} nodule described in the ultrasound exam of the following breast medical report?"
        hints[name] = skip_mammo_and_axilla + "If the nodule is mentioned previously in the mammography, the location can be found also there. sometimes the breast location of the tumour may be written in a different part than the quadrant. " + grouped_nodules
        options[name] = "answer retrieving the information directly from the report or with 'unknown'."

        name = f"nodules_echo_size_{position}"
        questions[name] = f"what is the size of the {ordinal} nodule described in the ultrasound exam of the following breast medical report?"
        hints[name] = skip_mammo + "The localization, echogenicity and size of the nodules are normally said. " + grouped_nodules
        options[name] = "answer retrieving the information directly from the report (stop after 'mm') or with 'unknown'."

        name = f"nodules_echo_known_{position}"
        questions[name] = f"is the {ordinal} nodule described in the ultrasound exam of the following breast medical report previously known?"
        hints[name] = skip_mammo + "If the nodule is known from before the report, it will say if it it is stable or if it has grown or shrink. " + grouped_nodules
        options[name] = yes_no

        name = f"nodules_echo_stable_{position}"
        questions[name] = f"is the {ordinal} known nodule described in the ultrasound exam stable in the following breast medical report?"
        hints[name] = skip_mammo + "If the nodule is known from before the examination, it will be analysed to see if it is stable or if it got bigger or smaller. " + grouped_nodules
        options[name] = "answer with one of the following options: 'yes', 'grown' or 'shrunk."

    return questions, hints, options


def _labelled_answer(value, fallback, lowercase=False, negatives=()):
    """Turn one ground-truth cell into an answer string terminated by ".".

    Args:
        value: Raw cell value. Anything that is not a ``str`` (e.g. the NaN
            of an empty cell) counts as missing.
        fallback: Answer text used when the cell is missing or negative.
        lowercase: Lower-case the cell text before using it as the answer.
        negatives: Cell texts that are treated like a missing cell.

    Returns:
        str: ``fallback + "."`` or the (optionally lower-cased) text + ".".
    """
    if not isinstance(value, str) or value in negatives:
        return fallback + "."
    return (value.lower() if lowercase else value) + "."


def _collect_answers(row, key, report):
    """Build the ``{question id: answer}`` mapping for one annotated report.

    Args:
        row: Ground-truth row of the report (column name -> cell value).
        key: Report identifier; only used in the missing-technique message.
        report: Raw report text; only used in the missing-technique message.

    Returns:
        dict: Answers in question order, each ending with ".". Questions
        without a usable annotation are left out (see inline comments).
    """
    answers = {}

    # Age.
    # NOTE(review): a float cell such as 45.0 becomes "45.0", which is not
    # all digits and is therefore answered "no." - confirm the column dtype.
    age = str(row["Age"])
    answers["age"] = (age if age.isdigit() else "no") + "."

    # Report type.
    if row["Biopsy_report"] == "Yes":
        answers["tipo"] = "biopsy report."
    elif row["Ganglio_report"] == "Yes":
        answers["tipo"] = "nodal staging ultrasound report."
    else:
        answers["tipo"] = "normal control or revision report."

    # Technique: no answer is produced when the annotation is missing; the
    # affected report is printed so it can be reviewed.
    technique = row["Technique"]
    if technique == "ultrasound":
        answers["tecnica"] = "only ultrasound study."
    elif technique == "mammography":
        answers["tecnica"] = "only mammography study."
    elif not pd.isna(technique):
        answers["tecnica"] = technique + "."
    else:
        print(key, report)

    # Biopsy and nodal-staging reports only get the three questions above.
    if answers["tipo"] != "normal control or revision report.":
        return answers

    # Clinical context.
    answers["history"] = _labelled_answer(
        row["Other_history"], "no history was found", negatives=("No",))
    answers["family"] = _labelled_answer(
        row["Family_history"], "no family history", negatives=("No",))
    answers["symtomatic"] = _labelled_answer(
        row["Syntomatic"], "Non-symptomatic consultation",
        negatives=("No", "No estoy seguro"))
    prosthesis = row["Prosthesis"]
    has_prosthesis = isinstance(prosthesis, str) and prosthesis != "No"
    answers["prosthesis"] = "yes." if has_prosthesis else "no."
    answers["birads"] = _labelled_answer(row["BI-RADS"], "unknown")

    # Mammography findings.
    answers["density_mammo"] = _labelled_answer(row["Density_mamo"], "unknown")
    for question, column in (
        ("ganglio_mamo", "Ganglio_mamo"),
        ("calcifications_benign", "Calcifications_benign_mamo"),
        ("parenchymal_distortion", "parenchymal_distortions_asymmetry"),
    ):
        answers[question] = _labelled_answer(row[column], "no", lowercase=True)

    # Ultrasound density: annotations outside DENSITY_ECHO produce no example.
    density_echo = row["Density_eco"]
    if not isinstance(density_echo, str):
        answers["density_echo"] = "unknown."
    elif density_echo in DENSITY_ECHO:
        answers["density_echo"] = density_echo + "."

    # Other ultrasound findings: simple cysts, lymph nodes, ductal ectasia.
    for question, column in (
        ("simple_cyst", "simple_cyst_eco"),
        ("lymph_suspicious", "Ganglio_suspicious_eco"),
        ("lymph_benign", "Ganglio_benign_eco"),
        ("ductal_ectasia", "Ductal_ectasia_eco"),
    ):
        answers[question] = _labelled_answer(row[column], "no", lowercase=True)

    # Ultrasound nodules: present when the cell is an int or a text other
    # than "No".
    # NOTE(review): NumPy integers and floats are not instances of ``int`` and
    # are answered "no." - confirm how the "Nodules_eco" column is typed.
    nodules = row["Nodules_eco"]
    has_nodules = isinstance(nodules, int) or (
        isinstance(nodules, str) and nodules != "No")
    answers["nodules_echo_num"] = "yes." if has_nodules else "no."
    if not has_nodules:
        return answers

    # Details of the first nodule. A missing description used to raise
    # TypeError (NaN + "."); that example is now skipped instead.
    description = row["Description_eco_1"]
    if isinstance(description, str):
        answers["nodules_echo_description_1"] = description + "."
    for feature, column in (
        ("shape", "Shape_eco_1"),
        ("margin", "Margin_eco_1"),
        ("echogenicity", "Echogenicity_eco_1"),
        ("location", "Location_eco_1"),
    ):
        answers[f"nodules_echo_{feature}_1"] = _labelled_answer(
            row[column], "unknown", lowercase=True)
    answers["nodules_echo_size_1"] = _labelled_answer(row["size_eco_1"], "unknown")

    # The sheet stores whether the nodule is NEW, the question asks whether
    # it is KNOWN, hence the inversion.
    is_new = row["new_eco_1"]
    if not isinstance(is_new, str):
        known = "unknown."
    elif is_new == "No":
        known = "yes."
    else:
        known = "no."
    answers["nodules_echo_known_1"] = known

    # Stability is only asked for nodules that were already known.
    if known == "yes.":
        answers["nodules_echo_stable_1"] = _labelled_answer(
            row["Stable_eco_1"], "unknown", lowercase=True)

    return answers


def flatten_and_filter_dataset(ground_truth,reports):
    """Flatten the annotated reports into one question/answer example per entry.

    Every report whose key is present in ``ground_truth`` yields one example
    per answerable question. Reports without ground truth are skipped.

    NOTE(review): a second function with this same name is defined further
    down in the notebook and replaces this one when the cells run in order.

    Args:
        ground_truth: DataFrame indexed by report key with the annotations.
        reports: Mapping with the parallel sequences ``"informes_ingles"``
            (report texts) and ``"keys"`` (report keys).

    Returns:
        tuple: Four dicts keyed by ``"<report key>_<question id>"``:
            flattened_examples: full prompt, "Question: ... Context: ...
                Answer: <answer>".
            targets: the answer alone.
            val_data: the same prompt with an empty answer.
            examples_raw: the raw (unpreprocessed) report text.
    """
    questions, _hints, _options = _build_prompt_catalog()

    flattened_examples = {}
    examples_raw = {}
    targets = {}
    val_data = {}

    for i, report in enumerate(reports["informes_ingles"]):
        informe = preprocess_text(report)
        key = reports["keys"][i]

        if key not in ground_truth.index:
            continue

        answers = _collect_answers(ground_truth.loc[key], key, report)

        for tipo, answer in answers.items():
            key_tipo = key + "_" + tipo
            # If a report key appears more than once, the first one wins.
            if key_tipo in flattened_examples:
                continue
            prompt = "Question: " + questions[tipo] + " Context: " + informe + " Answer: "
            examples_raw[key_tipo] = report
            flattened_examples[key_tipo] = prompt + str(answer)
            targets[key_tipo] = str(answer)
            val_data[key_tipo] = prompt

    return flattened_examples,targets,val_data,examples_raw

def flatten_and_filter_dataset(ground_truth,reports):
    """
    Build flat question/context/answer prompts for the diagnostic-technique question.

    Every report whose key appears in ``ground_truth.index`` and whose
    ``Technique`` value is not missing yields one entry in each returned dict.
    Reports with a missing technique are printed and produce no entry.

    Arguments:
        ground_truth: pandas object indexed by report key; the ``Technique``
            value of the matching row is read.
        reports: mapping holding the parallel sequences ``"informes_ingles"``
            (report texts) and ``"keys"`` (report keys).

    Returns:
        A tuple ``(flattened_examples, targets, val_data, examples_raw)`` of
        dicts, all keyed by ``key + "_" + tipo``:
            flattened_examples: full training prompt, answer included.
            targets: the answer string alone.
            val_data: the same prompt cut right after ``"Answer: "``.
            examples_raw: the original, unpreprocessed report text.
    """
    flattened_examples = {}
    examples_raw={}
    targets={}
    val_data={}
    question_tipo={
        "tecnica": 'what diagnostic technique was used in the following breast medical report?',
    }

    for i, report in enumerate(reports["informes_ingles"]):
        key=reports["keys"][i]

        # Reports without a ground-truth row cannot be labelled.
        if key not in ground_truth.index:
            continue

        # Preprocess only the reports that are actually kept.
        informe=preprocess_text(report)
        row=ground_truth.loc[key]
        answer_tipo={}

        # TECHNIQUE: the two single-modality labels are mapped to their long
        # form; any other non-missing value is used as written.
        tecnica=row["Technique"]
        if tecnica=="ultrasound":
            answer_tipo["tecnica"]="only ultrasound study"+"."
        elif tecnica=="mammography":
            answer_tipo["tecnica"]="only mammography study"+"."
        elif not pd.isna(tecnica):
            answer_tipo["tecnica"]=tecnica+"."
        else:
            # Missing technique: show the report so it can be reviewed by hand.
            print(key,report)

        for tipo in answer_tipo:
            key_tipo=key+"_"+tipo
            # A key seen before keeps its first example; later duplicates are skipped.
            if key_tipo in flattened_examples:
                continue

            answer=str(answer_tipo[tipo])
            prompt="Question: " + question_tipo[tipo] + " Context: " + informe + " Answer: "

            examples_raw[key_tipo]=report
            flattened_examples[key_tipo]=prompt+answer
            targets[key_tipo]=answer
            val_data[key_tipo]=prompt
    return flattened_examples,targets,val_data,examples_raw

    
def tokenize_function(inputs):
    """
    Tokenize a batch of prompts and build labels that only supervise the answer.

    Uses the module-level ``tokenizer``. Every text in ``inputs["text"]`` is
    truncated / padded to 1024 tokens. Labels start as a copy of the input ids;
    everything up to and including the first ``"Answer:"`` marker is set to
    -100, as is every position equal to the pad token id. A row where the
    marker is not found is masked entirely.

    Arguments:
        inputs: batch mapping with a ``"text"`` entry.

    Returns:
        Dict with ``"input_ids"`` and ``"labels"`` tensors.
    """
    encoded = tokenizer(
        inputs["text"],
        max_length=1024,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    token_ids = encoded["input_ids"]
    labels = token_ids.clone()

    # Ids of the "Answer:" marker. The first id returned by the tokenizer is
    # dropped (presumably a special token it prepends; verify for other models).
    marker = tokenizer("Answer:").input_ids[1:]
    marker_len = len(marker)

    for row, sequence in enumerate(token_ids.tolist()):
        # Position right after the first occurrence of the marker, or -1.
        answer_start = next(
            (
                pos + marker_len
                for pos in range(len(sequence) - marker_len + 1)
                if sequence[pos : pos + marker_len] == marker
            ),
            -1,
        )

        if answer_start == -1:
            # No marker found: nothing in this row contributes to the loss.
            labels[row, :] = -100
        else:
            # Hide the question, the context and the marker itself.
            labels[row, :answer_start] = -100

    # Padding positions must not contribute to the loss either.
    labels[labels == tokenizer.pad_token_id] = -100

    return {"input_ids": token_ids, "labels": labels}

def train_save(X,Y,training=True,testing=False,low_beams=False):
    """
    Fine-tune the causal language model on the prepared prompts and save it.

    Relies on the module-level ``model_name``, ``training_args``,
    ``tokenize_function`` and ``train_clean``.

    Arguments:
        X: first argument handed to ``train_clean``.
        Y: second argument handed to ``train_clean``.
        training: when True, ``trainer.train()`` is run before saving.
        testing: not used by this function; kept for call compatibility.
        low_beams: not used by this function; kept for call compatibility.

    Returns:
        None. The model is written under ``results/``.
    """
    random.seed(1)
    np.random.seed(1)

    # presumably a DataFrame with "text", "label" and "val_data" columns -- confirm in train_clean
    train = train_clean(X,Y)
    # The inference-only prompts are not part of the training set.
    del train["val_data"]
    print(len(train))

    model=AutoModelForCausalLM.from_pretrained(model_name)

    train_data = Dataset.from_pandas(train)
    train_data = train_data.map(tokenize_function, batched=True)
    # Only the tokenized fields are kept for the Trainer.
    train_data = train_data.remove_columns(["text","label"])
    train_data.set_format("torch")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data)
    if training:
        trainer.train()
    # NOTE(review): the model is saved even when training=False, i.e. the
    # pretrained weights are written out unchanged -- confirm this is intended.
    trainer.save_model(f"results/{model_name}_second_stage_model_final_tecnica_no_info")
    

In [None]:
# Base checkpoint and its tokenizer (the tokenizer is read by tokenize_function).
model_name = "microsoft/biogpt"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prompts, answers, inference prompts and raw reports, keyed by example id.
inputs, targets, val_data, examples_raw = flatten_and_filter_dataset(
    ground_truth,
    report_data,
)

training_args = TrainingArguments(
    output_dir="./results",
    # Schedule
    num_train_epochs=7,  # Shorter fine-tuning stage
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Optimisation
    learning_rate=1e-5,  # Lower for fine-tuning without losing generalization
    weight_decay=0.01,  # Lower weight decay to preserve learned features
    fp16=True,
)

In [None]:
# Each dict becomes a one-column frame indexed by example key.
targets = pd.DataFrame.from_dict(targets, orient="index").set_axis(["label"], axis=1)
val_data = pd.DataFrame.from_dict(val_data, orient="index").set_axis(["val_data"], axis=1)
dataset_final = pd.DataFrame.from_dict(inputs, orient="index").set_axis(["text"], axis=1)

# Place the inference prompt next to its training prompt (aligned on the index).
dataset_final = pd.concat([dataset_final, val_data], axis=1)

In [None]:
# Fine-tune on the technique prompts and write the model to disk.
train_save(
    X=dataset_final,
    Y=targets,
    training=True,
    testing=False,
    low_beams=False,
)

In [None]:
# Question identifiers, in the order they are listed for the notebook.
questions = [
    # General report information
    "age",
    "tipo",
    "tecnica",
    "family",
    "history",
    "symtomatic",
    # Report-level findings
    "prosthesis",
    "birads",
    "density_mammo",
    "calcifications_benign",
    "density_echo",
    "ganglio_mamo",
    "lymph_benign",
    "lymph_suspicious",
    "parenchymal_distortion",
    "simple_cyst",
    "ductal_ectasia",
    # Ultrasound nodules
    "nodules_echo_num",
    "nodules_echo_description",
    "nodules_echo_shape",
    "nodules_echo_margin",
    "nodules_echo_echogenicity",
    "nodules_echo_location",
    "nodules_echo_size",
    "nodules_echo_known",
    "nodules_echo_stable",
]

# Label vocabularies, one list per categorical question.
TIPO = [
    "biopsy report",
    "nodal staging ultrasound report",
    "normal control or revision report",
]
TECNICA = [
    "only ultrasound study",
    "only mammography study",
    "mammography and ultrasound",
]
FAMILY = [
    "no family history",
    "first degree",
    "second degree",
]
PROSTHESIS = [
    "no prosthesis",
    "yes prosthesis",
]
BIRADS = [
    "BI-RADS 0",
    "BI-RADS 1",
    "BI-RADS 2",
    "BI-RADS 3",
    "BI-RADS 4A",
    "BI-RADS 4B",
    "BI-RADS 4C",
    "BI-RADS 5",
    "BI-RADS 6",
]
DENSITY_MAMMO = [
    "ACR A",
    "ACR B",
    "ACR C",
    "ACR D",
    "unknown density mammo",
]
CALCIFICATIONS_BENIGN = [
    "no calcifications",
    "yes calcifications",
]
GANGLIO_MAMO = [
    "no ganglio",
    "yes ganglio",
]
DENSITY_ECHO = [
    "homogeneous fibroglandular",
    "heterogeneous fibroglandular",
    "fibroglandular and fat",
    "homogeneous fatty",
    "unknown density echo",
]
LYMPH_BENIGN = [
    "no lymph benign",
    "yes lymph benign",
]
LYMPH_SUSPICIOUS = [
    "no lymph suspicious",
    "yes lymph suspicious",
]
SIMPLE_CYST = [
    "no cyst",
    "yes cyst",
]
DUCTAL_ECTASIA = [
    "no ectasia",
    "yes ectasia",
]
NODULES_ECHO = [
    "no nodules",
    "yes nodules",
]
NODULES_SHAPE = [
    "oval",
    "round",
    "lobulated",
    "irregular",
    "unknown shape",
]
NODULES_MARGIN = [
    "circumscribed",
    "spiculated",
    "indistinct",
    "not circumscribed",
    "unknown margin",
]
NODULES_ECHOGENICITY = [
    "hypoechoic",
    "isoechoic",
    "heterogeneous",
    "complex cystic and solid",
    "unknown echogenicity",
]
NODULES_KNOWN = [
    "no known",
    "yes known",
]
NODULES_STABLE = [
    "grown stable",
    "shrunk stable",
    "yes stable",
]


import gc
from torch.nn.utils.rnn import pad_sequence
def flatten_and_filter_dataset(ground_truth,reports):
    """
    Esta función toma un conjunto de datos en el formato original (con estructura jerárquica)
    y devuelve un conjunto de datos plano, donde cada entrada tiene un solo `context`, `question` y `answer`.
    
    Argumentos:
        dataset: Un conjunto de datos en formato original (puede ser train, validation, test).
    
    Retorno:
        Un conjunto de datos de Hugging Face en formato plano, con solo ejemplos completos.
    """
    # Lista para almacenar ejemplos en formato plano
    flattened_examples = {}
    examples_raw={}
    targets={}
    val_data={}
    question_tipo={}
    previous_message_answer_tipo={}
    options_tipo={}
    answers_tipo={}
    j=0
    
    question_tipo["age"]= "does the patient's age appear in the following breast medical report?"
    previous_message_answer_tipo["age"]="search for numbers, but do not mistake it with the age of a familiar. If a number appears without any context between two dots it is surely the age."
    options_tipo["age"]="answer only the age of the patient."

    question_tipo["tipo"]= 'is the following breast medical report a biopsy report or a nodal staging ultrasound report?'
    previous_message_answer_tipo["tipo"]="biopsy reports are normally Image-Guided Biopsy and is normally said that they are referred to the hospital for biopsy. Nodal staging ultrasound reports can also be written as 'axilla ultrasound'. If it is any of these it will be written in the beginning of the report, normally in the used technique. These kind of reports are only ultrasound. In these reports no final BI-RADS is given."
    options_tipo["tipo"]="answer with one of the following options: 'biopsy report', 'nodal staging ultrasound report' or 'normal control or revision report'."

    question_tipo["tecnica"]= 'what diagnostic technique was used in the following breast medical report?'
    previous_message_answer_tipo["tecnica"]="biopsy reports, simple cysts and analysis of lymph or axillary nodes are only seen on ultrasound. On the other hand, if the ACR density is given or parenchymal distortions are analysed, the technique will be a mammogram. Tomosyntesis is a mammography type. The report may include an ultrasound examination, a mammography examination or both."
    options_tipo["tecnica"]="answer with one of the following options: 'only ultrasound study', 'only mammography study' or 'mammography and ultrasound'."

    question_tipo["family"]= "does the patient have any family history in the following breast medical report?"
    previous_message_answer_tipo["family"]="family history of breast cancer is categorized based on the degree of relatives affected: First-degree relatives: Parents, siblings, or children. Second-degree relatives: Grandparents, aunts, uncles, nieces, nephews, or half-siblings. Third-degree relatives: Great-grandparents, great-aunts/uncles, or first cousins."
    options_tipo["family"]="answer with one of the following options: 'first degree', 'second degree', 'third degree' or 'no family history'."
    
    question_tipo["history"]= "does the patient have any non-familiar history in the following breast medical report?"
    previous_message_answer_tipo["history"]="check for the history at the beginning of the report. Normally it is a previous biopsy result, mastectomy or cancer. It normally starts with 'history of ...'."
    options_tipo["history"]="answer retrieving the information directly from the report or with 'no history was found'."
    
    question_tipo["symtomatic"]= "is the reason for the consultation that the patient is symptomatic in the following breast medical report?"
    previous_message_answer_tipo["symtomatic"]="the answer is at the beginning of the report, in the reason for consultation. It is normally a palpable lump, lumpectomy or nodule, sometimes painful."
    options_tipo["symtomatic"]="answer retrieving the information directly from the report or with 'non-symptomatic consultation'."
    
    question_tipo["prosthesis"]= "does the patient have a prosthesis in the following breast medical report?"
    previous_message_answer_tipo["prosthesis"]="it is normally clearly indicated at the beginning of the report. Sometimes it is written as implants instead of prosthesis."
    options_tipo["prosthesis"]="answer with one of the following options: 'yes' or 'no'."
    
    question_tipo["birads"]= "what is the final BI-RADS classification given to the patient in the following breast medical report?"
    previous_message_answer_tipo["birads"]="the final BI-RADS of the patient is given in the conclusions of the report, normally at the end."
    options_tipo["birads"]="answer with one of the following options: 'BI-RADS 0', 'BI-RADS 1', 'BI-RADS 2', 'BI-RADS 3', 'BI-RADS 4A', 'BI-RADS 4B', 'BI-RADS 4C', 'BI-RADS 5' or 'unknown'."

    question_tipo["density_mammo"]= "what is the breast density found in the mammography study of the following breast medical report?"
    previous_message_answer_tipo["density_mammo"]="breast density in mammography is classified into four categories: ACR A (= Almost entirely fatty), ACR B (= Scattered areas of fibroglandular density), ACR C (= Heterogeneously dense), ACR D (= Extremely or very dense breasts). Sometimes it is written as 'density type x'. It can also be written with their real meaning (very dense breasts = C) and not with the A, B, C, D classification. Focus only on density."
    options_tipo["density_mammo"]="answer with one of the following options: 'ACR A', 'ACR B', 'ACR C', ACR D' or 'unknown'."
   
    question_tipo["density_echo"]= "what is the breast density found in the ultrasound study of the following breast medical report?"
    previous_message_answer_tipo["density_echo"]="breast composition in ultrasound is classified into three categories: fibroglandular and fat (mixed distribution of fibroglandular and adipose tissue), heterogeneous fibroglandular (predominantly fibroglandular tissue with varying echogenicity and scattered fat areas), homogeneous fatty (uniform fatty tissue with consistent echogenicity and minimal fibroglandular content), and homogeneous fibroglandular (uniform fibroglandular tissue with consistent echogenicity and minimal fat content)."
    options_tipo["density_echo"]="answer with one of the following options: 'fibroglandular and fat', 'heterogeneous fibroglandular', 'homogeneous fibroglandular', 'homogeneous fatty' or 'unknown'."

    question_tipo["calcifications_benign"]= "does the following breast medical report mention the appearence of benign calcifications in the mammography exam?"
    previous_message_answer_tipo["calcifications_benign"]="Consider only benign calcifications in the mammography."
    options_tipo["calcifications_benign"]="answer with one of the following options: 'yes' or 'no'."
    
    question_tipo["ganglio_mamo"]= "does the following breast medical report mention any lymph nodes in the mammography exam?"
    previous_message_answer_tipo["ganglio_mamo"]="Consider only lymph nodes that appear in the mammography."
    options_tipo["ganglio_mamo"]="answer with one of the following options: 'yes' or 'no'."

    question_tipo["parenchymal_distortion"]= "does the following breast medical report mention any parenchymal distortion or asymmetry in the mammography exam?"
    previous_message_answer_tipo["parenchymal_distortion"]="If it has any it will appear in the results of the mammography exam using the words distortion, asymmetry or sometimes it can also be surgical changes."
    options_tipo["parenchymal_distortion"]="answer retrieving the information directly from the report or with 'no'"
    
    question_tipo["lymph_suspicious"]= "does the following breast medical report mention any suspicious axillary lymph nodes in the ultrasound exam?"
    previous_message_answer_tipo["lymph_suspicious"]="if a lymph node is suspicious the report will recomend a biopsy or Fine Needle Aspiration. A lymph node is considered suspicious when it has eccentric cortical thickening ≥ 3 mm, Short axis >10 mm, round shape, loss of fatty hilum, abnormal vascularity, or irregular margins—especially when associated with known malignancy or progressive enlargement. They can also be classified as UN3, UN4 or UN5. An exam may have both suspicious and benign lymph nodes, answer 'yes' in this case."
    options_tipo["lymph_suspicious"]="answer with one of the following options: 'yes' or 'no'."

    question_tipo["lymph_benign"]= "does the following breast medical report mention any benign or not suspicious axillary lymph nodes in the ultrasound exam?"
    previous_message_answer_tipo["lymph_benign"]="if a lymph node is suspicious the report will recomend a biopsy or Fine Needle Aspiration. A lymph node is considered benign when it has uniform cortex < 3 mm, preserved fatty hilum, oval shape, no abnormal vascularity, no irregular margins and homogeneous internal echo pattern. Benign axillary nodes can be classified as UN1 or UN2. A reactive axillary node is not suspicious. An exam may have both suspicious and benign lymph nodes, answer 'yes' in this case."
    options_tipo["lymph_benign"]="answer with one of the following options: 'yes' or 'no'."

    question_tipo["simple_cyst"]= "does the following breast medical report mention any simple cysts or microcysts in the ultrasound exam?"
    previous_message_answer_tipo["simple_cyst"]="The words symple cysts or microcysts will appear only in the ultrasound exam. Sometimes they can say that some of the cysts have echogenic content, but we still will consider them simple cysts and not nodules."
    options_tipo["simple_cyst"]="answer with one of the following options: 'yes' or 'no'."

    question_tipo["ductal_ectasia"]= "does the following breast medical report mention any ductal ectasia in the ultrasound exam?"
    previous_message_answer_tipo["ductal_ectasia"]="The word ductal ectasia will appear only in the ultrasound exam."
    options_tipo["ductal_ectasia"]="answer with one of the following options: 'yes' or 'no'."
    
    question_tipo["nodules_echo_num"]="is there any nodule described in the ultrasound exam of the following breast medical report?"
    previous_message_answer_tipo["nodules_echo_num"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. The localization, echogenicity and size of the nodules are normally said."
    options_tipo["nodules_echo_num"]="answer with one of the following options: 'yes' or 'no'."

    
    dic_order = {
        1: "first",
        2: "second",
        3: "third",
        4: "fourth",
        5: "fifth",
        6: "sixth",
        7: "seventh",
        8: "eighth",
        9: "ninth",
        10: "tenth",
        11: "eleventh",
        12: "twelfth",
        13: "thirteenth",
        14: "fourteenth",
        15: "fifteenth"
    }
    for i in range(1,2):
        question_tipo[f"nodules_echo_description_{i}"]= f"which is the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_description_{i}"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_description_{i}"]="answer retrieving the information directly from the report."
        
        question_tipo[f"nodules_echo_shape_{i}"]= f"what is the shape of the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_shape_{i}"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. Shapes can be 'oval', 'round', 'lobulated' and 'irregular'. Sometimes irregular is also used for the margin, but in this case it is written as 'irregular margin' or 'irregular borders'. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_shape_{i}"]="answer with one of the following options: 'oval', 'round', 'lobulated', 'irregular' or 'unknown'."
        
        question_tipo[f"nodules_echo_margin_{i}"]= f"what is the margin of the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_margin_{i}"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. Margin can be 'circumscribed' and 'not circumscribed'. Inside the not circumscribed we have 'spiculated', 'angulated', 'microlobulated' or 'indistinc' ('not defined') margins. Sometimes irregular is also used for the margin, but in this case it is written as 'irregular margin' or 'irregular borders', in this case classify it as 'not circumscribed'. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_margin_{i}"]="answer with one of the following options: 'circumscribed', 'not circumscribed', 'indefined', 'spiculated', 'angulated', 'microlobulated' or 'unknown'."
        
        question_tipo[f"nodules_echo_echogenicity_{i}"]= f"what is the echogenicity of the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_echogenicity_{i}"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. Echogenicity can be 'anechoic', 'hypoechoic', 'heterogeneous' and 'complex cystic and solid'. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour." 
        options_tipo[f"nodules_echo_echogenicity_{i}"]="answer with one of the following options: 'hypoechoic', 'heterogeneous', 'anechoic', 'hyperecoic', 'isoechoic', 'complex cystic and solid' or 'unknown'."
        
        question_tipo[f"nodules_echo_location_{i}"]= f"In which location is the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_location_{i}"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. If the nodule is mentioned previously in the mammography, the location can be found also there. sometimes the breast location of the tumour may be written in a different part than the quadrant. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_location_{i}"]="answer retrieving the information directly from the report or with 'unknown'."
        
        question_tipo[f"nodules_echo_size_{i}"]= f"what is the size of the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_size_{i}"]="do not consider if a nodule is described in the mammography exam. The localization, echogenicity and size of the nodules are normally said. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_size_{i}"]="answer retrieving the information directly from the report (stop after 'mm') or with 'unknown'."
        
        question_tipo[f"nodules_echo_known_{i}"]= f"is the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report previously known?"
        previous_message_answer_tipo[f"nodules_echo_known_{i}"]="do not consider if a nodule is described in the mammography exam. If the nodule is known from before the report, it will say if it it is stable or if it has grown or shrink. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_known_{i}"]="answer with one of the following options: 'yes' or 'no'."
        
        question_tipo[f"nodules_echo_stable_{i}"]= f"is the {dic_order[i]} known nodule described in the ultrasound exam stable in the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_stable_{i}"]="do not consider if a nodule is described in the mammography exam. If the nodule is known from before the examination, it will be analysed to see if it is stable or if it got bigger or smaller. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_stable_{i}"]="answer with one of the following options: 'yes', 'grown' or 'shrunk."
    for i, report in enumerate(reports["informes_ingles"]):
        informe=preprocess_text(report)
        key=reports["keys"][i]
        
        if key not in ground_truth.index:
            continue
        
        row=ground_truth.loc[key]

        #AGE
        age=str(row["Age"])
        answer_tipo={}
        # Verificar si el ejemplo tiene preguntas
        if age.isdigit():
            answer_tipo["age"]=age+"."
            
        else:
            answer_tipo["age"]="no"+"."
        
        #TIPO
        if row["Biopsy_report"]=="Yes":
            answer_tipo["tipo"]="biopsy report"+"."
            
        elif row["Ganglio_report"]=="Yes":
            answer_tipo["tipo"]="nodal staging ultrasound report"+"."
        else:
            answer_tipo["tipo"]="normal control or revision report"+"."
        
        #TECHNIQUE
        tecnica=row["Technique"]
        # Verificar si el ejemplo tiene preguntas
        if tecnica=="ultrasound":
            answer_tipo["tecnica"]="only ultrasound study"+"."            
        elif tecnica=="mammography":
            answer_tipo["tecnica"]="only mammography study"+"."
        elif not pd.isna(tecnica):
            answer_tipo["tecnica"]=tecnica+"."
        else:
            print(key,report)
        
        # 
        # HISTORY
        #No consideramos las biopsias o las ecografías de estadificación ganglionar.
        if answer_tipo["tipo"]=="normal control or revision report"+".":
            history=row["Other_history"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(history,str) or history=="No":
                answer_tipo["history"]="no history was found"+"."            
            else:
                answer_tipo["history"]=history+"." 
    
            # FAMILY
            family=row["Family_history"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(family,str) or family=="No":
                answer_tipo["family"]="no family history"+"."            
            else:
                answer_tipo["family"]=family+"." 
    
            # SYMTOMATIC
            symtomatic=row["Syntomatic"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(symtomatic,str) or symtomatic=="No" or symtomatic=="No estoy seguro":
                answer_tipo["symtomatic"]="Non-symptomatic consultation"+"."            
            else:
                answer_tipo["symtomatic"]=symtomatic+"." 
    
            # PROSTHESIS
            prosthesis=row["Prosthesis"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(prosthesis,str) or prosthesis=="No":
                answer_tipo["prosthesis"]="no"+"."            
            else:
                answer_tipo["prosthesis"]="yes"+"."
    
            #BIRADS
            birads=row["BI-RADS"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(birads,str):
                answer_tipo["birads"]="unknown"+"."            
            else:
                answer_tipo["birads"]=birads+"."
    
            #Density mammo
            density_mammo=row["Density_mamo"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(density_mammo,str):
                answer_tipo["density_mammo"]="unknown"+"."            
            else:
                answer_tipo["density_mammo"]=density_mammo+"."

            #Lymp nodes mammo
            ganglio_mamo=row["Ganglio_mamo"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(ganglio_mamo,str):
                answer_tipo["ganglio_mamo"]="no"+"."            
            else:
                answer_tipo["ganglio_mamo"]=ganglio_mamo.lower()+"."

            #Calcifications benign
            calcifications_benign=row["Calcifications_benign_mamo"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(calcifications_benign,str):
                answer_tipo["calcifications_benign"]="no"+"."            
            else:
                answer_tipo["calcifications_benign"]=calcifications_benign.lower()+"."

            #Parenchymal distortion
            parenchymal_distortion=row["parenchymal_distortions_asymmetry"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(parenchymal_distortion,str):
                answer_tipo["parenchymal_distortion"]="no"+"."            
            else:
                answer_tipo["parenchymal_distortion"]=parenchymal_distortion.lower()+"."
    
            #Density echo
            density_echo=row["Density_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(density_echo,str):
                answer_tipo["density_echo"]="unknown"+"."            
            else:
                if density_echo in DENSITY_ECHO:
                    answer_tipo["density_echo"]=density_echo+"."

            #Benign lymph nodes
            simple_cyst=row["simple_cyst_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(simple_cyst,str):
                answer_tipo["simple_cyst"]="no"+"."            
            else:
                answer_tipo["simple_cyst"]=simple_cyst.lower()+"."
            #Suspicious lymph nodes
            lymph_suspicious=row["Ganglio_suspicious_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(lymph_suspicious,str):
                answer_tipo["lymph_suspicious"]="no"+"."            
            else:
                answer_tipo["lymph_suspicious"]=lymph_suspicious.lower()+"."

            #Benign lymph nodes
            lymph_benign=row["Ganglio_benign_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(lymph_benign,str):
                answer_tipo["lymph_benign"]="no"+"."            
            else:
                answer_tipo["lymph_benign"]=lymph_benign.lower()+"."

            #Ductal ectasia
            ductal_ectasia=row["Ductal_ectasia_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(ductal_ectasia,str):
                answer_tipo["ductal_ectasia"]="no"+"."            
            else:
                answer_tipo["ductal_ectasia"]=ductal_ectasia.lower()+"."
    
            #Nodules echo
            nodules_echo_num=row["Nodules_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(nodules_echo_num,str) and not isinstance(nodules_echo_num,int):
                answer_tipo["nodules_echo_num"]="no"+"."    
            elif isinstance(nodules_echo_num,str) and nodules_echo_num=="No":
                answer_tipo["nodules_echo_num"]="no"+"."  
            else:
                answer_tipo["nodules_echo_num"]="yes"+"."
    
            #Si existen nódulos se hace las preguntas correspondientes
            if answer_tipo["nodules_echo_num"]!="no.":
                nodules_echo_description=row["Description_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                
                answer_tipo["nodules_echo_description_1"]=nodules_echo_description+"."
                    
                nodules_echo_shape=row[f"Shape_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_shape,str):
                    answer_tipo["nodules_echo_shape_1"]="unknown"+"."
                else:
                    answer_tipo["nodules_echo_shape_1"]=nodules_echo_shape.lower()+"."

                nodules_echo_margin=row["Margin_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_margin,str):
                    answer_tipo["nodules_echo_margin_1"]="unknown"+"."
                else:
                    answer_tipo["nodules_echo_margin_1"]=nodules_echo_margin.lower()+"."

                nodules_echo_echogenicity=row["Echogenicity_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_echogenicity,str):
                    answer_tipo["nodules_echo_echogenicity_1"]="unknown"+"."
                else:
                    answer_tipo["nodules_echo_echogenicity_1"]=nodules_echo_echogenicity.lower()+"."

                nodules_echo_location=row[f"Location_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_location,str):
                    answer_tipo["nodules_echo_location_1"]="unknown"+"."
                else:
                    answer_tipo["nodules_echo_location_1"]=nodules_echo_location.lower()+"."

                
                #Nodules echo size
                nodules_echo_size=row["size_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_size,str):
                    answer_tipo["nodules_echo_size_1"]="unknown"+"."            
                else:
                    answer_tipo["nodules_echo_size_1"]=nodules_echo_size+"."
        
                #Nodules echo known
                nodules_echo_known=row["new_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_known,str):
                    answer_tipo["nodules_echo_known_1"]="unknown"+"."
                elif nodules_echo_known=="No":
                    answer_tipo["nodules_echo_known_1"]="yes"+"."            
                else:
                    answer_tipo["nodules_echo_known_1"]="no"+"."
    
                if answer_tipo["nodules_echo_known_1"]=="yes.":
                    #Nodules echo stable
                    nodules_echo_stable=row["Stable_eco_1"]
                    # Verificar si el ejemplo tiene preguntas
                    if not isinstance(nodules_echo_stable,str):
                        answer_tipo["nodules_echo_stable_1"]="unknown"+"."
                    else:
                        answer_tipo["nodules_echo_stable_1"]=nodules_echo_stable.lower()+"."
            
        for tipo in answer_tipo:
            #Si está el tipo en las respuestas que hemos recogido lo metemos a la base de datos.
            key_tipo=key+"_"+tipo
            if key_tipo in flattened_examples:
                continue
                key_tipo=key_tipo+"_copy"
            examples_raw[key_tipo]=report
            answer=str(answer_tipo[tipo])
            
    
            inputs_tipo = "Question: " + question_tipo[tipo] + " Context: " + informe + " Answer: "+ str(answer_tipo[tipo])
            flattened_examples[key_tipo]=inputs_tipo
            targets[key_tipo]=answer
            
            val_data[key_tipo]="Question: " + question_tipo[tipo] + " Context: " + informe+ " Answer: "
    return flattened_examples,targets,val_data,examples_raw

def tokenize_function(inputs):
    """Tokenize a batch of prompts and build answer-only labels for causal-LM training.

    Each prompt is expected to contain the literal marker "Answer:" followed by
    the target answer. Only the tokens after the first occurrence of that marker
    contribute to the loss; everything else (question, context, marker, padding)
    is set to -100 so the loss ignores it.

    Args:
        inputs: batch mapping with a "text" entry holding the list of prompt
            strings (the shape `datasets.Dataset.map(..., batched=True)` passes).

    Returns:
        dict with:
            "input_ids": tensor of token ids, padded/truncated to 512 tokens.
            "attention_mask": tensor marking real tokens (1) versus padding (0).
            "labels": copy of "input_ids" with every non-answer position set
                to -100.

    Notes:
        * Relies on the module-level `tokenizer`.
        * If the marker is not found in a row (for instance because truncation
          to 512 tokens cut it off), the whole row is masked and contributes
          nothing to the loss.
    """
    # Tokenize the whole batch at once, padding every row to a fixed length.
    model_inputs = tokenizer(
        inputs["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]

    # Labels start as a copy of the inputs and are masked below.
    labels = input_ids.clone()

    # Token ids of the "Answer:" marker. Asking for no special tokens avoids
    # assuming that the tokenizer prepends exactly one of them.
    marker_ids = list(tokenizer("Answer:", add_special_tokens=False).input_ids)
    marker_len = len(marker_ids)

    for row in range(labels.shape[0]):
        row_ids = input_ids[row].tolist()
        answer_start = -1

        # Locate the first exact occurrence of the marker in this row.
        for pos in range(len(row_ids) - marker_len + 1):
            if row_ids[pos:pos + marker_len] == marker_ids:
                answer_start = pos + marker_len  # first token of the answer
                break

        if answer_start != -1:
            labels[row, :answer_start] = -100  # hide prompt and marker
        else:
            labels[row, :] = -100  # marker missing: ignore the whole row

    # Hide padding using the attention mask rather than the pad token id, so
    # the result does not depend on the pad id being defined or unique.
    labels[attention_mask == 0] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

def train_save(X,Y,training=True,testing=False,low_beams=False):
    """Fine-tune the causal language model on the prompt dataset and save it.

    Args:
        X: DataFrame indexed by example key with a "text" column (full prompt
            including the answer) and a "val_data" column; "val_data" is
            dropped before tokenization.
        Y: DataFrame with the same index as X and a "label" column.
        training: when True, run `trainer.train()` before saving. When False
            the model loaded from `model_name` is saved as-is.
        testing: accepted for call compatibility; not used by this function.
        low_beams: accepted for call compatibility; not used by this function.

    Returns:
        None. The model is written under
        "results/{model_name}_second_stage_model_final_no_info".

    Notes:
        Relies on the module-level `model_name`, `training_args`,
        `tokenize_function` and `train_clean`.
    """
    random.seed(1)
    np.random.seed(1)

    # Shuffle the examples and attach their labels.
    train = train_clean(X, Y)
    # The answer-free prompt is not needed for training.
    del train["val_data"]
    print(len(train))

    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Tokenize, then drop the raw string columns so only tensors remain.
    train_data = Dataset.from_pandas(train)
    train_data = train_data.map(tokenize_function, batched=True)
    train_data = train_data.remove_columns(["text", "label"])
    train_data.set_format("torch")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )
    if training:
        trainer.train()
    # Saved unconditionally, i.e. also when `training` is False.
    trainer.save_model(f"results/{model_name}_second_stage_model_final_no_info")
    

In [None]:
# BioGPT fine-tuning run.
# `model_name`, `tokenizer` and `training_args` are notebook-level globals that
# `tokenize_function` and `train_save` read directly, so they must be defined
# before `train_save` is called.
model_name = "microsoft/biogpt"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build one example per (report, question type). Each returned dict is keyed by
# "<report key>_<question type>":
#   inputs       -> full prompt including the answer (training text)
#   targets      -> the answer string alone
#   val_data     -> the same prompt without the answer
#   examples_raw -> the original report text
inputs,targets,val_data,examples_raw = flatten_and_filter_dataset(ground_truth,report_data)  
# Turn each dict into a single-column DataFrame indexed by example key.
# NOTE: `targets` and `val_data` are rebound from dicts to DataFrames here.
dataset_final=pd.DataFrame.from_dict(inputs,orient='index')
targets=pd.DataFrame.from_dict(targets,orient='index')
val_data=pd.DataFrame.from_dict(val_data,orient='index')
dataset_final.columns=["text"]
targets.columns=["label"]
val_data.columns=["val_data"]
# Keep the answer-free prompt next to the training text; `train_save` deletes
# the "val_data" column again before tokenizing.
dataset_final=pd.concat([dataset_final, val_data],axis=1)

# Batch of 8 per device with 2 gradient-accumulation steps, i.e. 16 sequences
# per optimizer step per device.
# NOTE(review): fp16=True presumably requires a CUDA device — confirm for the
# target environment.
training_args = TrainingArguments(
    output_dir="./results",
    # evaluation_strategy="epoch",
    learning_rate=1e-5,  # Lower for fine-tuning without losing generalization
    per_device_train_batch_size=8,
    fp16=True,
    gradient_accumulation_steps=2,
    
    num_train_epochs=7,  # Shorter fine-tuning stage
    weight_decay=0.01,  # Lower weight decay to preserve learned features
    
)
# Train and save; `testing` and `low_beams` are not used inside `train_save`.
train_save(dataset_final,targets,training=True,testing=False,low_beams=False)

## CLINICALT5

In [None]:
import pandas as pd
import re
import random
import numpy as np
import math
import pickle
from sklearn.metrics import classification_report, accuracy_score
from datasets import Dataset,concatenate_datasets,load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# Identifiers of the question types. The spelling (e.g. "symtomatic") matches
# the dictionary keys used in `flatten_and_filter_dataset` and must not be
# "corrected" here alone.
questions = [
    # report-level questions
    "age",
    "tipo",
    "tecnica",
    "family",
    "history",
    "symtomatic",
    # findings
    "prosthesis",
    "birads",
    "density_mammo",
    "calcifications_benign",
    "density_echo",
    "ganglio_mamo",
    "lymph_benign",
    "lymph_suspicious",
    "parenchymal_distortion",
    "simple_cyst",
    "ductal_ectasia",
    # ultrasound nodule questions
    "nodules_echo_num",
    "nodules_echo_description",
    "nodules_echo_shape",
    "nodules_echo_margin",
    "nodules_echo_echogenicity",
    "nodules_echo_location",
    "nodules_echo_size",
    "nodules_echo_known",
    "nodules_echo_stable",
]

# Label vocabularies, one list per closed question.
# NOTE(review): presumably the allowed answer strings for each question type;
# where they are consumed is not visible from this section — verify.

# Report type and technique
TIPO = [
    "biopsy report",
    "nodal staging ultrasound report",
    "normal control or revision report",
]
TECNICA = [
    "only ultrasound study",
    "only mammography study",
    "mammography and ultrasound",
]

# Patient background
FAMILY = ["no family history", "first degree", "second degree"]
PROSTHESIS = ["no prosthesis", "yes prosthesis"]

# Mammography
BIRADS = [f"BI-RADS {grade}" for grade in ("0", "1", "2", "3", "4A", "4B", "4C", "5", "6")]
DENSITY_MAMMO = ["ACR A", "ACR B", "ACR C", "ACR D", "unknown density mammo"]
CALCIFICATIONS_BENIGN = ["no calcifications", "yes calcifications"]
GANGLIO_MAMO = ["no ganglio", "yes ganglio"]

# Ultrasound
DENSITY_ECHO = [
    "homogeneous fibroglandular",
    "heterogeneous fibroglandular",
    "fibroglandular and fat",
    "homogeneous fatty",
    "unknown density echo",
]
LYMPH_BENIGN = ["no lymph benign", "yes lymph benign"]
LYMPH_SUSPICIOUS = ["no lymph suspicious", "yes lymph suspicious"]
SIMPLE_CYST = ["no cyst", "yes cyst"]
DUCTAL_ECTASIA = ["no ectasia", "yes ectasia"]

# Ultrasound nodules
NODULES_ECHO = ["no nodules", "yes nodules"]
NODULES_SHAPE = ["oval", "round", "lobulated", "irregular", "unknown shape"]
NODULES_MARGIN = [
    "circumscribed",
    "spiculated",
    "indistinct",
    "not circumscribed",
    "unknown margin",
]
NODULES_ECHOGENICITY = [
    "hypoechoic",
    "isoechoic",
    "heterogeneous",
    "complex cystic and solid",
    "unknown echogenicity",
]
NODULES_KNOWN = ["no known", "yes known"]
NODULES_STABLE = ["grown stable", "shrunk stable", "yes stable"]


import gc
from torch.nn.utils.rnn import pad_sequence

def train_clean(X,Y):
    """Return a shuffled copy of X with the matching labels attached.

    Args:
        X: DataFrame of examples, indexed by example key.
        Y: DataFrame with a "label" column, indexed by the same keys as X.

    Returns:
        A new DataFrame holding the rows of X in a fixed shuffled order
        (random_state=1) plus a "label" column taken from Y. X itself is not
        modified.

    Side effects:
        Re-seeds Python's global `random` generator with 1.
    """
    random.seed(1)
    # Shuffle the training rows deterministically to avoid ordering bias.
    shuffled = X.sample(frac=1, random_state=1)
    # Look the labels up in shuffled order and attach them as a new column.
    return shuffled.assign(label=Y.loc[shuffled.index]["label"])
    
def flatten_and_filter_dataset(ground_truth,reports):
    """
    Esta función toma un conjunto de datos en el formato original (con estructura jerárquica)
    y devuelve un conjunto de datos plano, donde cada entrada tiene un solo `context`, `question` y `answer`.
    
    Argumentos:
        dataset: Un conjunto de datos en formato original (puede ser train, validation, test).
    
    Retorno:
        Un conjunto de datos de Hugging Face en formato plano, con solo ejemplos completos.
    """
    # Lista para almacenar ejemplos en formato plano
    flattened_examples = {}
    examples_raw={}
    targets={}
    val_data={}
    question_tipo={}
    previous_message_answer_tipo={}
    options_tipo={}
    answers_tipo={}
    j=0
    
    question_tipo["age"]= "does the patient's age appear in the following breast medical report?"
    previous_message_answer_tipo["age"]="search for numbers, but do not mistake it with the age of a familiar. If a number appears without any context between two dots it is surely the age."
    options_tipo["age"]="answer only the age of the patient."

    question_tipo["tipo"]= 'is the following breast medical report a biopsy report or a nodal staging ultrasound report?'
    previous_message_answer_tipo["tipo"]="biopsy reports are normally Image-Guided Biopsy and is normally said that they are referred to the hospital for biopsy. Nodal staging ultrasound reports can also be written as 'axilla ultrasound'. If it is any of these it will be written in the beginning of the report, normally in the used technique. These kind of reports are only ultrasound. In these reports no final BI-RADS is given."
    options_tipo["tipo"]="answer with one of the following options: 'biopsy report', 'nodal staging ultrasound report' or 'normal control or revision report'."

    question_tipo["tecnica"]= 'what diagnostic technique was used in the following breast medical report?'
    previous_message_answer_tipo["tecnica"]="biopsy reports, simple cysts and analysis of lymph or axillary nodes are only seen on ultrasound. On the other hand, if the ACR density is given or parenchymal distortions are analysed, the technique will be a mammogram. Tomosyntesis is a mammography type. The report may include an ultrasound examination, a mammography examination or both."
    options_tipo["tecnica"]="answer with one of the following options: 'only ultrasound study', 'only mammography study' or 'mammography and ultrasound'."

    question_tipo["family"]= "does the patient have any family history in the following breast medical report?"
    previous_message_answer_tipo["family"]="family history of breast cancer is categorized based on the degree of relatives affected: First-degree relatives: Parents, siblings, or children. Second-degree relatives: Grandparents, aunts, uncles, nieces, nephews, or half-siblings. Third-degree relatives: Great-grandparents, great-aunts/uncles, or first cousins."
    options_tipo["family"]="answer with one of the following options: 'first degree', 'second degree', 'third degree' or 'no family history'."
    
    question_tipo["history"]= "does the patient have any non-familiar history in the following breast medical report?"
    previous_message_answer_tipo["history"]="check for the history at the beginning of the report. Normally it is a previous biopsy result, mastectomy or cancer. It normally starts with 'history of ...'."
    options_tipo["history"]="answer retrieving the information directly from the report or with 'no history was found'."
    
    question_tipo["symtomatic"]= "is the reason for the consultation that the patient is symptomatic in the following breast medical report?"
    previous_message_answer_tipo["symtomatic"]="the answer is at the beginning of the report, in the reason for consultation. It is normally a palpable lump, lumpectomy or nodule, sometimes painful."
    options_tipo["symtomatic"]="answer retrieving the information directly from the report or with 'non-symptomatic consultation'."
    
    question_tipo["prosthesis"]= "does the patient have a prosthesis in the following breast medical report?"
    previous_message_answer_tipo["prosthesis"]="it is normally clearly indicated at the beginning of the report. Sometimes it is written as implants instead of prosthesis."
    options_tipo["prosthesis"]="answer with one of the following options: 'yes' or 'no'."
    
    question_tipo["birads"]= "what is the final BI-RADS classification given to the patient in the following breast medical report?"
    previous_message_answer_tipo["birads"]="the final BI-RADS of the patient is given in the conclusions of the report, normally at the end."
    options_tipo["birads"]="answer with one of the following options: 'BI-RADS 0', 'BI-RADS 1', 'BI-RADS 2', 'BI-RADS 3', 'BI-RADS 4A', 'BI-RADS 4B', 'BI-RADS 4C', 'BI-RADS 5' or 'unknown'."

    question_tipo["density_mammo"]= "what is the breast density found in the mammography study of the following breast medical report?"
    previous_message_answer_tipo["density_mammo"]="breast density in mammography is classified into four categories: ACR A (= Almost entirely fatty), ACR B (= Scattered areas of fibroglandular density), ACR C (= Heterogeneously dense), ACR D (= Extremely or very dense breasts). Sometimes it is written as 'density type x'. It can also be written with their real meaning (very dense breasts = C) and not with the A, B, C, D classification. Focus only on density."
    options_tipo["density_mammo"]="answer with one of the following options: 'ACR A', 'ACR B', 'ACR C', ACR D' or 'unknown'."
   
    question_tipo["density_echo"]= "what is the breast density found in the ultrasound study of the following breast medical report?"
    previous_message_answer_tipo["density_echo"]="breast composition in ultrasound is classified into three categories: fibroglandular and fat (mixed distribution of fibroglandular and adipose tissue), heterogeneous fibroglandular (predominantly fibroglandular tissue with varying echogenicity and scattered fat areas), homogeneous fatty (uniform fatty tissue with consistent echogenicity and minimal fibroglandular content), and homogeneous fibroglandular (uniform fibroglandular tissue with consistent echogenicity and minimal fat content)."
    options_tipo["density_echo"]="answer with one of the following options: 'fibroglandular and fat', 'heterogeneous fibroglandular', 'homogeneous fibroglandular', 'homogeneous fatty' or 'unknown'."

    question_tipo["calcifications_benign"]= "does the following breast medical report mention the appearence of benign calcifications in the mammography exam?"
    previous_message_answer_tipo["calcifications_benign"]="Consider only benign calcifications in the mammography."
    options_tipo["calcifications_benign"]="answer with one of the following options: 'yes' or 'no'."
    
    question_tipo["ganglio_mamo"]= "does the following breast medical report mention any lymph nodes in the mammography exam?"
    previous_message_answer_tipo["ganglio_mamo"]="Consider only lymph nodes that appear in the mammography."
    options_tipo["ganglio_mamo"]="answer with one of the following options: 'yes' or 'no'."

    question_tipo["parenchymal_distortion"]= "does the following breast medical report mention any parenchymal distortion or asymmetry in the mammography exam?"
    previous_message_answer_tipo["parenchymal_distortion"]="If it has any it will appear in the results of the mammography exam using the words distortion, asymmetry or sometimes it can also be surgical changes."
    options_tipo["parenchymal_distortion"]="answer retrieving the information directly from the report or with 'no'"
    
    question_tipo["lymph_suspicious"]= "does the following breast medical report mention any suspicious axillary lymph nodes in the ultrasound exam?"
    previous_message_answer_tipo["lymph_suspicious"]="if a lymph node is suspicious the report will recomend a biopsy or Fine Needle Aspiration. A lymph node is considered suspicious when it has eccentric cortical thickening ≥ 3 mm, Short axis >10 mm, round shape, loss of fatty hilum, abnormal vascularity, or irregular margins—especially when associated with known malignancy or progressive enlargement. They can also be classified as UN3, UN4 or UN5. An exam may have both suspicious and benign lymph nodes, answer 'yes' in this case."
    options_tipo["lymph_suspicious"]="answer with one of the following options: 'yes' or 'no'."

    question_tipo["lymph_benign"]= "does the following breast medical report mention any benign or not suspicious axillary lymph nodes in the ultrasound exam?"
    previous_message_answer_tipo["lymph_benign"]="if a lymph node is suspicious the report will recomend a biopsy or Fine Needle Aspiration. A lymph node is considered benign when it has uniform cortex < 3 mm, preserved fatty hilum, oval shape, no abnormal vascularity, no irregular margins and homogeneous internal echo pattern. Benign axillary nodes can be classified as UN1 or UN2. A reactive axillary node is not suspicious. An exam may have both suspicious and benign lymph nodes, answer 'yes' in this case."
    options_tipo["lymph_benign"]="answer with one of the following options: 'yes' or 'no'."

    question_tipo["simple_cyst"]= "does the following breast medical report mention any simple cysts or microcysts in the ultrasound exam?"
    previous_message_answer_tipo["simple_cyst"]="The words symple cysts or microcysts will appear only in the ultrasound exam. Sometimes they can say that some of the cysts have echogenic content, but we still will consider them simple cysts and not nodules."
    options_tipo["simple_cyst"]="answer with one of the following options: 'yes' or 'no'."

    question_tipo["ductal_ectasia"]= "does the following breast medical report mention any ductal ectasia in the ultrasound exam?"
    previous_message_answer_tipo["ductal_ectasia"]="The word ductal ectasia will appear only in the ultrasound exam."
    options_tipo["ductal_ectasia"]="answer with one of the following options: 'yes' or 'no'."
    
    question_tipo["nodules_echo_num"]="is there any nodule described in the ultrasound exam of the following breast medical report?"
    previous_message_answer_tipo["nodules_echo_num"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. The localization, echogenicity and size of the nodules are normally said."
    options_tipo["nodules_echo_num"]="answer with one of the following options: 'yes' or 'no'."

    
    dic_order = {
        1: "first",
        2: "second",
        3: "third",
        4: "fourth",
        5: "fifth",
        6: "sixth",
        7: "seventh",
        8: "eighth",
        9: "ninth",
        10: "tenth",
        11: "eleventh",
        12: "twelfth",
        13: "thirteenth",
        14: "fourteenth",
        15: "fifteenth"
    }
    for i in range(1,2):
        question_tipo[f"nodules_echo_description_{i}"]= f"which is the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_description_{i}"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_description_{i}"]="answer retrieving the information directly from the report."
        
        question_tipo[f"nodules_echo_shape_{i}"]= f"what is the shape of the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_shape_{i}"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. Shapes can be 'oval', 'round', 'lobulated' and 'irregular'. Sometimes irregular is also used for the margin, but in this case it is written as 'irregular margin' or 'irregular borders'. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_shape_{i}"]="answer with one of the following options: 'oval', 'round', 'lobulated', 'irregular' or 'unknown'."
        
        question_tipo[f"nodules_echo_margin_{i}"]= f"what is the margin of the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_margin_{i}"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. Margin can be 'circumscribed' and 'not circumscribed'. Inside the not circumscribed we have 'spiculated', 'angulated', 'microlobulated' or 'indistinc' ('not defined') margins. Sometimes irregular is also used for the margin, but in this case it is written as 'irregular margin' or 'irregular borders', in this case classify it as 'not circumscribed'. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_margin_{i}"]="answer with one of the following options: 'circumscribed', 'not circumscribed', 'indefined', 'spiculated', 'angulated', 'microlobulated' or 'unknown'."
        
        question_tipo[f"nodules_echo_echogenicity_{i}"]= f"what is the echogenicity of the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_echogenicity_{i}"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. Echogenicity can be 'anechoic', 'hypoechoic', 'heterogeneous' and 'complex cystic and solid'. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour." 
        options_tipo[f"nodules_echo_echogenicity_{i}"]="answer with one of the following options: 'hypoechoic', 'heterogeneous', 'anechoic', 'hyperecoic', 'isoechoic', 'complex cystic and solid' or 'unknown'."
        
        question_tipo[f"nodules_echo_location_{i}"]= f"In which location is the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_location_{i}"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. If the nodule is mentioned previously in the mammography, the location can be found also there. sometimes the breast location of the tumour may be written in a different part than the quadrant. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_location_{i}"]="answer retrieving the information directly from the report or with 'unknown'."
        
        question_tipo[f"nodules_echo_size_{i}"]= f"what is the size of the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_size_{i}"]="do not consider if a nodule is described in the mammography exam. The localization, echogenicity and size of the nodules are normally said. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_size_{i}"]="answer retrieving the information directly from the report (stop after 'mm') or with 'unknown'."
        
        question_tipo[f"nodules_echo_known_{i}"]= f"is the {dic_order[i]} nodule described in the ultrasound exam of the following breast medical report previously known?"
        previous_message_answer_tipo[f"nodules_echo_known_{i}"]="do not consider if a nodule is described in the mammography exam. If the nodule is known from before the report, it will say if it it is stable or if it has grown or shrink. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_known_{i}"]="answer with one of the following options: 'yes' or 'no'."
        
        question_tipo[f"nodules_echo_stable_{i}"]= f"is the {dic_order[i]} known nodule described in the ultrasound exam stable in the following breast medical report?"
        previous_message_answer_tipo[f"nodules_echo_stable_{i}"]="do not consider if a nodule is described in the mammography exam. If the nodule is known from before the examination, it will be analysed to see if it is stable or if it got bigger or smaller. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."
        options_tipo[f"nodules_echo_stable_{i}"]="answer with one of the following options: 'yes', 'grown' or 'shrunk."
    for i, report in enumerate(reports["informes_ingles"]):
        informe=preprocess_text(report)
        key=reports["keys"][i]
        
        if key not in ground_truth.index:
            continue
        
        row=ground_truth.loc[key]

        #AGE
        age=str(row["Age"])
        answer_tipo={}
        # Verificar si el ejemplo tiene preguntas
        if age.isdigit():
            answer_tipo["age"]=age+"."
            
        else:
            answer_tipo["age"]="no"+"."
        
        #TIPO
        if row["Biopsy_report"]=="Yes":
            answer_tipo["tipo"]="biopsy report"+"."
            
        elif row["Ganglio_report"]=="Yes":
            answer_tipo["tipo"]="nodal staging ultrasound report"+"."
        else:
            answer_tipo["tipo"]="normal control or revision report"+"."
        
        #TECHNIQUE
        tecnica=row["Technique"]
        # Verificar si el ejemplo tiene preguntas
        if tecnica=="ultrasound":
            answer_tipo["tecnica"]="only ultrasound study"+"."            
        elif tecnica=="mammography":
            answer_tipo["tecnica"]="only mammography study"+"."
        elif not pd.isna(tecnica):
            answer_tipo["tecnica"]=tecnica+"."
        else:
            print(key,report)
        
        # 
        # HISTORY
        #No consideramos las biopsias o las ecografías de estadificación ganglionar.
        if answer_tipo["tipo"]=="normal control or revision report"+".":
            history=row["Other_history"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(history,str) or history=="No":
                answer_tipo["history"]="no history was found"+"."            
            else:
                answer_tipo["history"]=history+"." 
    
            # FAMILY
            family=row["Family_history"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(family,str) or family=="No":
                answer_tipo["family"]="no family history"+"."            
            else:
                answer_tipo["family"]=family+"." 
    
            # SYMTOMATIC
            symtomatic=row["Syntomatic"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(symtomatic,str) or symtomatic=="No" or symtomatic=="No estoy seguro":
                answer_tipo["symtomatic"]="Non-symptomatic consultation"+"."            
            else:
                answer_tipo["symtomatic"]=symtomatic+"." 
    
            # PROSTHESIS
            prosthesis=row["Prosthesis"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(prosthesis,str) or prosthesis=="No":
                answer_tipo["prosthesis"]="no"+"."            
            else:
                answer_tipo["prosthesis"]="yes"+"."
    
            #BIRADS
            birads=row["BI-RADS"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(birads,str):
                answer_tipo["birads"]="unknown"+"."            
            else:
                answer_tipo["birads"]=birads+"."
    
            #Density mammo
            density_mammo=row["Density_mamo"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(density_mammo,str):
                answer_tipo["density_mammo"]="unknown"+"."            
            else:
                answer_tipo["density_mammo"]=density_mammo+"."

            #Lymp nodes mammo
            ganglio_mamo=row["Ganglio_mamo"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(ganglio_mamo,str):
                answer_tipo["ganglio_mamo"]="no"+"."            
            else:
                answer_tipo["ganglio_mamo"]=ganglio_mamo.lower()+"."

            #Calcifications benign
            calcifications_benign=row["Calcifications_benign_mamo"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(calcifications_benign,str):
                answer_tipo["calcifications_benign"]="no"+"."            
            else:
                answer_tipo["calcifications_benign"]=calcifications_benign.lower()+"."

            #Parenchymal distortion
            parenchymal_distortion=row["parenchymal_distortions_asymmetry"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(parenchymal_distortion,str):
                answer_tipo["parenchymal_distortion"]="no"+"."            
            else:
                answer_tipo["parenchymal_distortion"]=parenchymal_distortion.lower()+"."
    
            #Density echo
            density_echo=row["Density_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(density_echo,str):
                answer_tipo["density_echo"]="unknown"+"."            
            else:
                if density_echo in DENSITY_ECHO:
                    answer_tipo["density_echo"]=density_echo+"."

            #Benign lymph nodes
            simple_cyst=row["simple_cyst_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(simple_cyst,str):
                answer_tipo["simple_cyst"]="no"+"."            
            else:
                answer_tipo["simple_cyst"]=simple_cyst.lower()+"."
            #Suspicious lymph nodes
            lymph_suspicious=row["Ganglio_suspicious_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(lymph_suspicious,str):
                answer_tipo["lymph_suspicious"]="no"+"."            
            else:
                answer_tipo["lymph_suspicious"]=lymph_suspicious.lower()+"."

            #Benign lymph nodes
            lymph_benign=row["Ganglio_benign_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(lymph_benign,str):
                answer_tipo["lymph_benign"]="no"+"."            
            else:
                answer_tipo["lymph_benign"]=lymph_benign.lower()+"."

            #Ductal ectasia
            ductal_ectasia=row["Ductal_ectasia_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(ductal_ectasia,str):
                answer_tipo["ductal_ectasia"]="no"+"."            
            else:
                answer_tipo["ductal_ectasia"]=ductal_ectasia.lower()+"."
    
            #Nodules echo
            nodules_echo_num=row["Nodules_eco"]
            # Verificar si el ejemplo tiene preguntas
            if not isinstance(nodules_echo_num,str) and not isinstance(nodules_echo_num,int):
                answer_tipo["nodules_echo_num"]="no"+"."    
            elif isinstance(nodules_echo_num,str) and nodules_echo_num=="No":
                answer_tipo["nodules_echo_num"]="no"+"."  
            else:
                answer_tipo["nodules_echo_num"]="yes"+"."
    
            #Si existen nódulos se hace las preguntas correspondientes
            if answer_tipo["nodules_echo_num"]!="no.":
                nodules_echo_description=row["Description_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                
                answer_tipo["nodules_echo_description_1"]=nodules_echo_description+"."
                    
                nodules_echo_shape=row[f"Shape_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_shape,str):
                    answer_tipo["nodules_echo_shape_1"]="unknown"+"."
                else:
                    answer_tipo["nodules_echo_shape_1"]=nodules_echo_shape.lower()+"."

                nodules_echo_margin=row["Margin_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_margin,str):
                    answer_tipo["nodules_echo_margin_1"]="unknown"+"."
                else:
                    answer_tipo["nodules_echo_margin_1"]=nodules_echo_margin.lower()+"."

                nodules_echo_echogenicity=row["Echogenicity_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_echogenicity,str):
                    answer_tipo["nodules_echo_echogenicity_1"]="unknown"+"."
                else:
                    answer_tipo["nodules_echo_echogenicity_1"]=nodules_echo_echogenicity.lower()+"."

                nodules_echo_location=row[f"Location_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_location,str):
                    answer_tipo["nodules_echo_location_1"]="unknown"+"."
                else:
                    answer_tipo["nodules_echo_location_1"]=nodules_echo_location.lower()+"."

                
                #Nodules echo size
                nodules_echo_size=row["size_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_size,str):
                    answer_tipo["nodules_echo_size_1"]="unknown"+"."            
                else:
                    answer_tipo["nodules_echo_size_1"]=nodules_echo_size+"."
        
                #Nodules echo known
                nodules_echo_known=row["new_eco_1"]
                # Verificar si el ejemplo tiene preguntas
                if not isinstance(nodules_echo_known,str):
                    answer_tipo["nodules_echo_known_1"]="unknown"+"."
                elif nodules_echo_known=="No":
                    answer_tipo["nodules_echo_known_1"]="yes"+"."            
                else:
                    answer_tipo["nodules_echo_known_1"]="no"+"."
    
                if answer_tipo["nodules_echo_known_1"]=="yes.":
                    #Nodules echo stable
                    nodules_echo_stable=row["Stable_eco_1"]
                    # Verificar si el ejemplo tiene preguntas
                    if not isinstance(nodules_echo_stable,str):
                        answer_tipo["nodules_echo_stable_1"]="unknown"+"."
                    else:
                        answer_tipo["nodules_echo_stable_1"]=nodules_echo_stable.lower()+"."
            
        for tipo in answer_tipo:
            
            #Si está el tipo en las respuestas que hemos recogido lo metemos a la base de datos.
            key_tipo=key+"_"+tipo
            if key_tipo in flattened_examples:
                continue
                key_tipo=key_tipo+"_copy"
            examples_raw[key_tipo]=report
            answer=str(answer_tipo[tipo])
            
    
            inputs_tipo = "Question: " + question_tipo[tipo] +" Additional information: "+ previous_message_answer_tipo[tipo]+ " Context: " + informe + "Options:"+ options_tipo[tipo]
            # inputs_tipo = "question: " + question_tipo[tipo] +" context: " + informe
            flattened_examples[key_tipo]=inputs_tipo
            targets[key_tipo]=answer
            
    return flattened_examples,targets


def tokenize_function(inputs):
    """
    Tokenize a batch of seq2seq examples for training.

    Args:
        inputs: batch mapping with a "text" entry (source prompts) and a
            "label" entry (target answers).

    Returns:
        The source encoding, extended with:
          * "labels": target token ids where every padding id is replaced by
            -100 so that padded positions are ignored by the loss;
          * "decoder_attention_mask": the attention mask of the target encoding.

    Relies on the notebook-level `tokenizer`.
    """
    # Sources are padded/truncated to 1024 tokens, targets to 45 tokens.
    source_options = dict(max_length=1024, truncation=True, padding="max_length")
    target_options = dict(max_length=45, truncation=True, padding="max_length")

    model_inputs = tokenizer(inputs["text"], **source_options)

    # NOTE(review): as_target_tokenizer() is reported as deprecated in recent
    # transformers releases (text_target= is the replacement) - confirm against
    # the installed version before upgrading.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(inputs["label"], **target_options)

    pad_id = tokenizer.pad_token_id
    masked_label_ids = []
    for label_seq in labels["input_ids"]:
        masked_label_ids.append(
            [token if token != pad_id else -100 for token in label_seq]
        )
    labels["input_ids"] = masked_label_ids

    model_inputs["labels"] = labels["input_ids"]
    model_inputs["decoder_attention_mask"] = labels["attention_mask"]
    return model_inputs


def generate_output(model,test_data, test_attention, tipo, tokens, beams):
    """
    Generate one answer per example of a question type and pad them into a batch.

    Args:
        model: object exposing `generate(...)`.
        test_data: mapping question type -> sequence of input-id tensors.
        test_attention: mapping question type -> sequence of attention masks,
            parallel to `test_data[tipo]`.
        tipo: question type to decode.
        tokens: value passed as `max_new_tokens`.
        beams: value passed as `num_beams`.

    Returns:
        The generated sequences padded with `tokenizer.pad_token_id` via
        `pad_sequence(..., batch_first=True)`.

    Examples are decoded one at a time (a leading batch axis is added with
    `unsqueeze(0)` and removed again with `squeeze(0)`); each result is moved
    to the CPU before padding. Relies on the notebook-level `tokenizer` and
    `pad_sequence`.
    """
    def _generate_single(position, input_ids):
        # Single-example generation; the attention mask is looked up by position.
        generated = model.generate(
            input_ids.unsqueeze(0),
            max_new_tokens=tokens,
            attention_mask=test_attention[tipo][position].unsqueeze(0),
            num_beams=beams,
            num_return_sequences=1,
            early_stopping=True,
        )
        return generated.squeeze(0).cpu()

    sequences = [
        _generate_single(position, input_ids)
        for position, input_ids in enumerate(test_data[tipo])
    ]

    # Pad every sequence to the length of the longest one.
    return pad_sequence(sequences, batch_first=True, padding_value=tokenizer.pad_token_id)
    
def train_save(X,Y,training=True,low_beams=False,testing=False):
    """
    Fine-tune the T5 model on the full dataset and save it under `results/`.

    Args:
        X: DataFrame with a "text" column holding the model inputs.
        Y: DataFrame with a "label" column holding the target answers,
            indexed like `X`.
        training, low_beams, testing: not used by this function; kept in the
            signature so existing calls keep working.

    Returns:
        None. The trained model is written to
        `results/{model_name}_second_stage_model_final`.

    Relies on the notebook-level `model_name`, `training_args`,
    `tokenize_function` and `train_clean`.
    """
    random.seed(1)
    np.random.seed(1)

    # Shuffle the examples and attach the "label" column.
    train = train_clean(X,Y)

    # NOTE(review): from_flax=True presumably means the checkpoint only ships
    # Flax weights - confirm for `model_name`.
    model = T5ForConditionalGeneration.from_pretrained(model_name, from_flax=True)

    train_data = Dataset.from_pandas(train)
    train_data = train_data.map(tokenize_function, batched=True)
    # The raw strings are no longer needed once tokenized.
    train_data = train_data.remove_columns(["text","label"])
    train_data.set_format("torch")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )

    trainer.train()
    trainer.save_model(f"results/{model_name}_second_stage_model_final")
        
       

In [None]:
# Load the model name and its tokenizer. Both are read as globals by
# tokenize_function / train_save.
model_name = "luqh/ClinicalT5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the flat {key: prompt} / {key: answer} dictionaries from the annotations.
inputs,targets = flatten_and_filter_dataset(ground_truth,report_data)  


# Note from an earlier run: this was previously set to 1e-5 and 7 epochs.
# NOTE(review): no output_dir is passed here, unlike the TrainingArguments used
# for the BERT retrieval stage - confirm the installed transformers version
# accepts that.
training_args = TrainingArguments(
    
    # evaluation_strategy="steps",
    # eval_steps=10,
    
    learning_rate=7e-5,  # Lower for fine-tuning without losing generalization
    fp16=True,
    # 4 examples per step x 4 accumulation steps = 16 examples per optimizer update (per device)
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    
    num_train_epochs=7,  # Shorter fine-tuning stage
    weight_decay=0.05,  # Lower weight decay to preserve learned features
    # save_total_limit=2,
)
# Turn both dictionaries into single-column DataFrames indexed by example key,
# named as train_save / tokenize_function expect ("text" and "label").
dataset_final=pd.DataFrame.from_dict(inputs,orient='index')
targets=pd.DataFrame.from_dict(targets,orient='index')
dataset_final.columns=["text"]
targets.columns=["label"]
train_save(dataset_final,targets)

## BERT RETRIEVAL

In [None]:
# Base checkpoints iterated by the retrieval (extractive QA) training loop.
models_r=["microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract","dmis-lab/biobert-base-cased-v1.1","bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"]

# NOTE(review): these look like names of previously trained runs, built as
# base checkpoint + learning rate + epochs + "16batch" - confirm against the
# code that saved them.
BioMedBERT="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract0.0001816batch"
BlueBERT="bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-120.0001716batch"
BioBERT="dmis-lab/biobert-base-cased-v1.10.0001416batch"
models= [BioMedBERT, BlueBERT, BioBERT]
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import pandas as pd
import random
import math
from sklearn.metrics import classification_report, accuracy_score
from collections import defaultdict
import re
import numpy as np
import gc
import torch
# Release cached GPU memory and run the garbage collector before this stage starts.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Question types of the extractive-QA (retrieval) stage.
# NOTE(review): flatten_and_filter_dataset below emits the key
# "nodules_echo_size_1", not "nodules_echo_size" - confirm which one is intended.
questions=["age","history","parenchymal_distortion","nodules_echo_size"]


def fix_brackets_spaces(texto):
    """
    Pad brackets and question/exclamation marks with spaces.

    A space is inserted before every `(`, `[`, `¿` and `!`, and after every
    `)`, `]`, `?` and `¡`. According to the original author this improves
    spaCy tokenization.

    Args:
        texto: text to process.

    Returns:
        The text with the extra spaces inserted.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r'([(\[¿!])', r' \1'),
        (r'([)\]?¡])', r'\1 '),
    )
    for pattern, replacement in substitutions:
        texto = re.sub(pattern, replacement, texto)
    return texto
    
def preprocess_text(text):
    """Return `text` after bracket/punctuation spacing (see `fix_brackets_spaces`)."""
    return fix_brackets_spaces(text)


def flatten_and_filter_dataset(ground_truth,reports):
    """
    Build one extractive-QA example per (report, question type).

    Args:
        ground_truth: DataFrame indexed by report key. The columns read are
            "Age", "Biopsy_report", "Ganglio_report", "Other_history",
            "parenchymal_distortions_asymmetry", "Nodules_eco" and "size_eco_1".
        reports: mapping with two parallel sequences, "informes_ingles"
            (report texts) and "keys" (report keys).

    Returns:
        A tuple of four dicts, all keyed by "<report key>_<question type>":
          * flattened_examples: " Additional information: <hint> Context: <preprocessed report>"
          * questions_examples: the question text
          * targets: the expected answer, stripped ("no response" when absent)
          * examples_raw: the unprocessed report text

    Reports whose key is not in `ground_truth.index` are skipped. The age
    question is produced for every report; the other questions only for
    reports that are neither biopsy nor nodal-staging reports. If the same
    "<key>_<question type>" appears again, the first example is kept.
    """
    flattened_examples = {}
    examples_raw={}
    targets={}
    questions_examples={}
    question_tipo={}
    previous_message_answer_tipo={}

    question_tipo["age"]= "does the patient's age appear in the following breast medical report?"
    previous_message_answer_tipo["age"]="search for numbers, but do not mistake it with the age of a familiar. If a number appears without any context between two dots it is surely the age."

    question_tipo["history"]= "does the patient have any non-familiar history in the following breast medical report?"
    previous_message_answer_tipo["history"]="check for the history at the beginning of the report. Normally it is a previous biopsy result, mastectomy or cancer. It normally starts with 'history of ...'."

    question_tipo["parenchymal_distortion"]= "does the following breast medical report mention any parenchymal distortion or asymmetry in the mammography exam?"
    previous_message_answer_tipo["parenchymal_distortion"]="If it has any it will appear in the results of the mammography exam using the words distortion, asymmetry or sometimes it can also be surgical changes."

    # The nodule-location question is currently disabled.
    # question_tipo["nodules_echo_location_1"]= "In which location is the first nodule described in the ultrasound exam of the following breast medical report?"
    # previous_message_answer_tipo["nodules_echo_location_1"]="do not consider if a nodule is described in the mammography exam or if it is in the axilla. If the nodule is mentioned previously in the mammography, the location can be found also there. sometimes the breast location of the tumour may be written in a different part than the quadrant. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."

    question_tipo["nodules_echo_size_1"]= "what is the size of the first nodule described in the ultrasound exam of the following breast medical report?"
    previous_message_answer_tipo["nodules_echo_size_1"]="do not consider if a nodule is described in the mammography exam. The localization, echogenicity and size of the nodules are normally said. More than one nodule can be described simultaneously ('several', 'two', 'three', etc.). If the number is unspecified, only the ones with size will be considered. Sometimes it can also say 'similar characteristics' or 'similar to the previous', consider in this case the answer to the previous tumour."

    # Walk over every report of the original dataset.
    for i, report in enumerate(reports["informes_ingles"]):
        informe=preprocess_text(report)
        key=reports["keys"][i]

        if key not in ground_truth.index:
            continue
        answer_tipo={}
        row=ground_truth.loc[key]

        # AGE
        # pandas stores an integer column that contains blanks as float, so an
        # age can arrive as 45.0; normalise it to "45" so isdigit() accepts it.
        age_value=row["Age"]
        if isinstance(age_value,float) and age_value.is_integer():
            age_value=int(age_value)
        age=str(age_value)
        if age.isdigit():
            answer_tipo["age"]=age
        else:
            answer_tipo["age"]="no response"

        # The remaining questions are only asked for regular control/revision
        # reports (neither biopsy nor nodal-staging reports).
        if row["Biopsy_report"]!="Yes" and row["Ganglio_report"]!="Yes":
            # HISTORY
            history=row["Other_history"]
            if not isinstance(history,str) or history=="No":
                answer_tipo["history"]="no response"
            else:
                answer_tipo["history"]=history

            # PARENCHYMAL DISTORTION
            parenchymal_distortion=row["parenchymal_distortions_asymmetry"]
            if not isinstance(parenchymal_distortion,str) or parenchymal_distortion.lower()=="no":
                answer_tipo["parenchymal_distortion"]="no response"
            else:
                answer_tipo["parenchymal_distortion"]=parenchymal_distortion.lower()

            # NODULES (ultrasound): present when the cell is an int, or a
            # string other than "No".
            nodules_echo_num=row["Nodules_eco"]
            nodules=isinstance(nodules_echo_num,(str,int)) and nodules_echo_num!="No"
            if nodules:
                # nodules_echo_location=row["Location_eco_1"]
                # if not isinstance(nodules_echo_location,str):
                #     answer_tipo["nodules_echo_location_1"]="no response"
                # else:
                #     answer_tipo["nodules_echo_location_1"]=nodules_echo_location.lower()

                # Size of the first nodule
                nodules_echo_size=row["size_eco_1"]
                if not isinstance(nodules_echo_size,str):
                    answer_tipo["nodules_echo_size_1"]="no response"
                else:
                    answer_tipo["nodules_echo_size_1"]=nodules_echo_size

        for tipo in answer_tipo:
            key_tipo=key+"_"+tipo
            if key_tipo in flattened_examples:
                # Duplicate report key: keep the first example.
                continue
            examples_raw[key_tipo]=report
            questions_examples[key_tipo]=question_tipo[tipo]
            flattened_examples[key_tipo]=" Additional information: "+ previous_message_answer_tipo[tipo]+ " Context: " + informe
            targets[key_tipo]=str(answer_tipo[tipo]).strip()

    return flattened_examples,questions_examples,targets,examples_raw






def visualize_errors(valid_dataset,valid_targets,validation_predictions,keys):
    """
    Print every example whose prediction differs from its true label.

    Args:
        valid_dataset: original texts.
        valid_targets: true labels.
        validation_predictions: predicted labels.
        keys: example identifiers.

    All four arguments are converted with `list(...)` and used as columns of
    one DataFrame. For each mismatching row the key, the text, the prediction
    and the true label are printed, one item per line.
    """
    table = pd.DataFrame({
        'key':list(keys),
        'Text': list(valid_dataset),
        'True Label': list(valid_targets),
        'Predicted Label': list(validation_predictions)
    })

    # Keep only the rows where the model was wrong.
    is_wrong = table['True Label'] != table['Predicted Label']

    for _, wrong in table[is_wrong].iterrows():
        print(
            wrong["key"],
            "EJEMPLO",
            wrong["Text"],
            "PREDICTED",
            wrong["Predicted Label"],
            "TRUE",
            wrong["True Label"],
            sep="\n",
        )

def tokenize_function(examples):
    """
    Tokenize (question, text) pairs and compute answer-span labels for extractive QA.

    Args:
        examples: batch mapping with "question", "text" (the context) and
            "label" (the answer string, or "no response") entries.

    Returns:
        The tokenizer output (without "offset_mapping") extended with
        "start_positions" and "end_positions": the token indexes of the first
        and last token of the answer inside the context.

    Both positions fall back to the CLS token index when the answer is
    "no response"/empty, when it cannot be found in the context, or when it
    cannot be aligned to tokens (for instance after truncation to 512 tokens).

    The tokenizer is loaded from the notebook-level `model_name` on every call.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized_examples = tokenizer(
        examples["question"],
        examples["text"],
        truncation=True,
        max_length=512,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = tokenized_examples.pop("offset_mapping")
    start_positions = []
    end_positions = []

    context_texts = examples["text"]
    answer_texts = examples["label"]

    # flatten_and_filter_dataset builds the text as
    # " Additional information: <hint> Context: <report>". The hint often
    # contains the answer word itself, so the report part is searched first.
    report_marker = " Context: "

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # 0 for question tokens, 1 for context tokens, None for special/padding tokens.
        sequence_ids = tokenized_examples.sequence_ids(i)

        context = context_texts[i]
        answer = answer_texts[i]

        # Default to CLS for no response
        if answer == "no response" or answer.strip() == "":
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Lowercase match to avoid case mismatch
        lowered_context = context.lower()
        lowered_answer = answer.lower()
        start_char = -1
        marker_at = context.find(report_marker)
        if marker_at != -1:
            start_char = lowered_context.find(lowered_answer, marker_at + len(report_marker))
        if start_char == -1:
            # No marker, or answer absent from the report part: search the whole text.
            start_char = lowered_context.find(lowered_answer)
        if start_char == -1:
            print(i)
            print(f"[WARNING] Could not find answer: '{answer}' in context:\n{context}")
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        end_char = start_char + len(answer)

        # Map the character span to token positions. Offsets are relative to
        # each sequence, so only context tokens may be matched; otherwise a
        # question token covering the same character range would be picked.
        start_pos = None
        end_pos = None
        for idx, (start, end) in enumerate(offsets):
            if sequence_ids[idx] != 1:
                continue
            if start <= start_char < end and start_pos is None:
                start_pos = idx
            if start < end_char <= end:
                end_pos = idx
                break

        if start_pos is None or end_pos is None:
            # Fallback if something failed
            print(f"[WARNING] Failed to align answer '{answer}' in context")
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_positions.append(start_pos)
            end_positions.append(end_pos)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples
    
def train_clean(X,Y):
    """
    Shuffle the training examples and attach their labels.

    Args:
        X: DataFrame of examples.
        Y: DataFrame with a "label" column, indexed like `X`.

    Returns:
        A shuffled copy of `X` (fixed `random_state=1`) with an extra "label"
        column taken from `Y`; `X` itself is not modified.

    Also reseeds Python's `random` module with 1.
    """
    random.seed(1)

    # Shuffle the rows to avoid ordering bias.
    shuffled = X.sample(frac=1, random_state=1)

    # Labels are looked up by index so they follow the shuffled order.
    return shuffled.assign(label=Y.loc[shuffled.index]["label"])


def predict_indexes(pred):
    """
    Pick the most likely start and end token index for every example.

    Args:
        pred: object whose `predictions` attribute unpacks into
            (start_logits, end_logits).

    Returns:
        Tuple (start_indexes, end_indexes): the argmax of each logits array
        along axis 1.
    """
    start_logits, end_logits = pred.predictions
    return np.argmax(start_logits, axis=1), np.argmax(end_logits, axis=1)

def extract_answer_from_tokens(tokenized_inputs, start_index, end_index):
    """
    Turn a predicted (start, end) token span back into text.

    Args:
        tokenized_inputs: one tokenized example; its "input_ids" are read.
        start_index: predicted start position.
        end_index: predicted end position (inclusive).

    Returns:
        The decoded text of tokens start_index..end_index, or "no response"
        when either index is 0 (the position treated as the CLS token) or the
        start lies after the end.

    Relies on the notebook-level `tokenizer`.
    """
    # Convert token ids back to token strings.
    all_tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"])

    span_is_usable = start_index != 0 and end_index != 0 and start_index <= end_index
    if not span_is_usable:
        return "no response"

    return tokenizer.convert_tokens_to_string(all_tokens[start_index:end_index+1])


def train_save(X, Y):
    """
    Fine-tune an extractive-QA model on the full dataset and save it under `results/`.

    Args:
        X: DataFrame with "text" and "question" columns.
        Y: DataFrame with a "label" column (answer strings), indexed like `X`.

    Returns:
        None. Prints the number of training examples and writes the trained
        model to `results/{model_name}_model_final2`.

    Relies on the notebook-level `model_name`, `training_args`,
    `tokenize_function` and `train_clean`.
    """
    from transformers import AutoModelForQuestionAnswering, Trainer
    from datasets import Dataset

    random.seed(1)
    np.random.seed(1)

    # Shuffle the examples and attach the "label" column.
    train = train_clean(X, Y)
    print(len(train))

    model = AutoModelForQuestionAnswering.from_pretrained(
        model_name
    )

    train_data = Dataset.from_pandas(train)

    # Tokenize and compute start/end positions.
    train_data= train_data.map(tokenize_function, batched=True)
    # NOTE(review): "labels" still holds the raw answer strings here; the QA
    # training signal comes from start_positions/end_positions, so this column
    # is presumably dropped by Trainer's unused-column removal - confirm.
    train_data = train_data.rename_column("label", "labels")
    train_data = train_data.remove_columns(["text"])
    train_data.set_format("torch")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data
    )

    trainer.train()
    trainer.save_model(f"results/{model_name}_model_final2")
    

In [None]:


# BioMedBERT="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract5e-05616batch"
# BioBERT="dmis-lab/biobert-base-cased-v1.15e-05616batch"
# BlueBERT="bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-120.0001716batch"
# models_r=["microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract","dmis-lab/biobert-base-cased-v1.1","bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"]
# Fine-tune and save one extractive-QA model per base checkpoint in models_r.
# `model_name` and `training_args` are set as globals here because
# tokenize_function and train_save read them.
# NOTE(review): `i` is not used, and the dataset is rebuilt on every iteration
# although it does not depend on the checkpoint.
for i,model_name in enumerate(models_r):
    
    # Per-checkpoint hyperparameters.
    # NOTE(review): there is no fallback branch; a checkpoint name outside
    # these three leaves epochs/lr at whatever value they had before.
    if model_name=="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract":
        epochs=6
        lr=5e-05
    if model_name=="dmis-lab/biobert-base-cased-v1.1":
        epochs=6
        lr=5e-05
    if model_name=="bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
        epochs=7
        lr=0.0001

    # Build the flat dictionaries and turn them into DataFrames indexed by example key.
    inputs,questions_examples,targets,examples_raw = flatten_and_filter_dataset(ground_truth,report_data)  
    dataset_final=pd.DataFrame.from_dict(inputs,orient='index')
    questions_examples=pd.DataFrame.from_dict(questions_examples,orient='index')
    
    # Sanity check: the three sizes are printed so they can be compared.
    print(len(dataset_final),len(questions_examples),len(targets))
    
    targets=pd.DataFrame.from_dict(targets,orient='index')
    dataset_final.columns=["text"]
    
    # Attach the question text next to each context.
    dataset_final["question"]=questions_examples
    
    targets.columns=["label"]
        
    training_args= TrainingArguments(
        output_dir='./results',          # Folder where the model is saved
        num_train_epochs=epochs,             # Number of epochs
        per_device_train_batch_size=8,  # Batch size
        gradient_accumulation_steps=2,
        learning_rate=lr, 
        weight_decay=0.05,              # Weight decay
        logging_dir='./logs',           # Folder for the logs
        logging_steps=10,
    )
    train_save(dataset_final,targets)
    # Free GPU memory before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()

## BERT CLASSIFICATION

In [None]:
# Question types handled by the classification stage.
questions=["tipo","tecnica","family",
           "prosthesis","birads","density_mammo","calcifications_benign","ganglio_mamo","density_echo","lymph_benign","lymph_suspicious","simple_cyst","ductal_ectasia"]

# Closed label vocabularies, one list per question type. The strings are the
# class names used as targets, so they must not be edited casually.
TIPO=["biopsy report", "nodal staging ultrasound report", "normal control or revision report"]
TECNICA=["only ultrasound study", "only mammography study", "mammography and ultrasound"]
FAMILY=["no family history", "first degree", "second degree"]
PROSTHESIS=["no prosthesis","yes prosthesis"]
BIRADS=["BI-RADS 0","BI-RADS 1","BI-RADS 2","BI-RADS 3","BI-RADS 4A","BI-RADS 4B","BI-RADS 4C","BI-RADS 5","BI-RADS 6"]
DENSITY_MAMMO=["ACR A","ACR B","ACR C","ACR D","unknown density mammo"]
CALCIFICATIONS_BENIGN=["no calcifications","yes calcifications"]
GANGLIO_MAMO=["no ganglio","yes ganglio"]
DENSITY_ECHO=["homogeneous fibroglandular","heterogeneous fibroglandular","fibroglandular and fat","homogeneous fatty","unknown density echo"]
LYMPH_BENIGN=["no lymph benign","yes lymph benign"]
LYMPH_SUSPICIOUS=["no lymph suspicious","yes lymph suspicious"]
SIMPLE_CYST=["no cyst","yes cyst"]
DUCTAL_ECTASIA=["no ectasia","yes ectasia"]
# Nodule-related vocabularies (not listed in `questions` above).
NODULES_ECHO=["no nodules", "yes nodules"]
NODULES_SHAPE=["oval","round","lobulated","irregular","unknown shape"]
NODULES_MARGIN=["circumscribed","spiculated","indistinct","not circumscribed","unknown margin"]
NODULES_ECHOGENICITY=["hypoechoic", "isoechoic", "heterogeneous","complex cystic and solid","unknown echogenicity"]
NODULES_KNOWN=["no known", "yes known"]
NODULES_STABLE=["grown stable","shrunk stable", "yes stable"]


# Sort every vocabulary in place so that class indexes do not depend on the
# order in which the labels were typed above.
TIPO.sort()
TECNICA.sort()
FAMILY.sort()
PROSTHESIS.sort()
BIRADS.sort()
DENSITY_MAMMO.sort()
CALCIFICATIONS_BENIGN.sort()
GANGLIO_MAMO.sort()
DENSITY_ECHO.sort()
LYMPH_BENIGN.sort()
# Previously missing; the list is already in order, so indexes do not change.
LYMPH_SUSPICIOUS.sort()
SIMPLE_CYST.sort()
DUCTAL_ECTASIA.sort()
NODULES_ECHO.sort()
NODULES_SHAPE.sort()
NODULES_MARGIN.sort()
NODULES_ECHOGENICITY.sort()
NODULES_KNOWN.sort()
NODULES_STABLE.sort()


# Per-question lookup tables: label string -> class index and class index ->
# label string, following the (sorted) order of each vocabulary.
word_to_idx_tipo={word:idx for idx,word in enumerate(TIPO)}
idx_to_word_tipo={idx:word for idx,word in enumerate(TIPO)}

word_to_idx_tecnica={word:idx for idx,word in enumerate(TECNICA)}
idx_to_word_tecnica={idx:word for idx,word in enumerate(TECNICA)}

word_to_idx_family={word:idx for idx,word in enumerate(FAMILY)}
idx_to_word_family={idx:word for idx,word in enumerate(FAMILY)}

word_to_idx_prosthesis={word:idx for idx,word in enumerate(PROSTHESIS)}
idx_to_word_prosthesis={idx:word for idx,word in enumerate(PROSTHESIS)}

word_to_idx_birads={word:idx for idx,word in enumerate(BIRADS)}
idx_to_word_birads={idx:word for idx,word in enumerate(BIRADS)}

word_to_idx_density_mammo={word:idx for idx,word in enumerate(DENSITY_MAMMO)}
idx_to_word_density_mammo={idx:word for idx,word in enumerate(DENSITY_MAMMO)}

word_to_idx_calcifications_benign={word:idx for idx,word in enumerate(CALCIFICATIONS_BENIGN)}
idx_to_word_calcifications_benign={idx:word for idx,word in enumerate(CALCIFICATIONS_BENIGN)}

word_to_idx_ganglio_mamo={word:idx for idx,word in enumerate(GANGLIO_MAMO)}
idx_to_word_ganglio_mamo={idx:word for idx,word in enumerate(GANGLIO_MAMO)}

word_to_idx_density_echo={word:idx for idx,word in enumerate(DENSITY_ECHO)}
idx_to_word_density_echo={idx:word for idx,word in enumerate(DENSITY_ECHO)}

word_to_idx_lymph_benign={word:idx for idx,word in enumerate(LYMPH_BENIGN)}
idx_to_word_lymph_benign={idx:word for idx,word in enumerate(LYMPH_BENIGN)}

word_to_idx_lymph_suspicious={word:idx for idx,word in enumerate(LYMPH_SUSPICIOUS)}
idx_to_word_lymph_suspicious={idx:word for idx,word in enumerate(LYMPH_SUSPICIOUS)}

word_to_idx_simple_cyst={word:idx for idx,word in enumerate(SIMPLE_CYST)}
idx_to_word_simple_cyst={idx:word for idx,word in enumerate(SIMPLE_CYST)}

word_to_idx_ductal_ectasia={word:idx for idx,word in enumerate(DUCTAL_ECTASIA)}
idx_to_word_ductal_ectasia={idx:word for idx,word in enumerate(DUCTAL_ECTASIA)}
# Question type -> vocabulary, including the nodule vocabularies.
DICTIONARY={"tipo":TIPO,"tecnica":TECNICA,"family":FAMILY,"prosthesis":PROSTHESIS,"birads":BIRADS,"density_mammo":DENSITY_MAMMO,"calcifications_benign":CALCIFICATIONS_BENIGN,
            "ganglio_mamo":GANGLIO_MAMO,"density_echo":DENSITY_ECHO,"lymph_benign":LYMPH_BENIGN,"lymph_suspicious":LYMPH_SUSPICIOUS,"simple_cyst":SIMPLE_CYST,"ductal_ectasia":DUCTAL_ECTASIA,
           "nodules_echo": NODULES_ECHO,"nodules_shape":NODULES_SHAPE,"nodules_margin":NODULES_MARGIN, "nodules_echogenicity":NODULES_ECHOGENICITY, "nodules_known":NODULES_KNOWN, "nodules_stable":NODULES_STABLE}


# Concatenate every vocabulary, in DICTIONARY order, into one global label list.
outputs=[]
for tipo in DICTIONARY.values():
    outputs+=tipo
print(outputs)

# Global lookup tables over the concatenated label list.
word_to_idx_out={word:idx for idx,word in enumerate(outputs)}
idx_to_word_out={idx:word for idx,word in enumerate(outputs)}
import gc

import nltk
import re
import numpy as np



def fix_brackets_spaces(texto):
    '''
        Pad brackets and question/exclamation marks with spaces so that they
        are separated from the neighbouring words before tokenization.

        A space is inserted before each of "(", "[", "¿", "!" and after each
        of ")", "]", "?", "¡". Returns the padded string.
    '''
    # NOTE(review): "!" is padded like an opening mark and "¡" like a closing
    # one, which looks swapped with respect to "¿"/"?" -- confirm intent.
    substitutions = (
        (r'([(\[¿!])', r' \1'),   # space before opening marks
        (r'([)\]?¡])', r'\1 '),   # space after closing marks
    )
    for pattern, replacement in substitutions:
        texto = re.sub(pattern, replacement, texto)
    return texto
    
def preprocess_text(text):
    """Normalize a report before it is placed in a prompt.

    The only step applied is the bracket/punctuation spacing performed by
    ``fix_brackets_spaces``; its result is returned as is.
    """
    return fix_brackets_spaces(text)

    
def flatten_and_filter_dataset(ground_truth,reports):
    """
    Turn annotated reports into flat (prompt, label) classification examples.

    For every report whose key is present in ``ground_truth.index`` one example
    is produced per applicable question:

    * every report gets the "tipo" question, and the "tecnica" question when
      the technique is annotated;
    * reports that are neither biopsy nor nodal-staging reports additionally
      get the history, mammography and ultrasound questions;
    * the nodule shape/margin/echogenicity/known questions are only asked when
      a nodule is annotated, and the stability question only when that nodule
      is annotated as previously known.

    Args:
        ground_truth: DataFrame indexed by report key with the annotation
            columns read below ("Biopsy_report", "Technique", "BI-RADS", ...).
        reports: mapping with two parallel sequences: ``"informes_ingles"``
            (report texts) and ``"keys"`` (report keys).

    Returns:
        ``(flattened_examples, targets)``: two dicts keyed by
        ``"<report key>_<question>"`` holding the prompt text and the integer
        class index taken from the module-level ``word_to_idx_out``.

    Raises:
        KeyError: if an annotation produces a class name that is not in
            ``word_to_idx_out`` or a required column is missing.
    """
    flattened_examples = {}
    targets={}
    question_tipo={}
    previous_message_answer_tipo={}

    # Question text and background hint per question. These strings are part
    # of the model input, so they are kept verbatim.
    question_tipo["tipo"]= 'is the following breast medical report a biopsy report or a nodal staging ultrasound report?'
    previous_message_answer_tipo["tipo"]="biopsy reports are Image-Guided Biopsy or Fine needle aspiration and is normally said that they are referred to the hospital for biopsy. Nodal staging ultrasound reports can also be written as 'axilla ultrasound'. If it is any of these it will be written in the beginning of the report, normally in the used technique. These kind of reports are only ultrasound."

    question_tipo["tecnica"]= 'what diagnostic technique was used in the following breast medical report?'
    previous_message_answer_tipo["tecnica"]="biopsy reports, simple cysts and analysis of lymph or axillary nodes are only seen on ultrasound. On the other hand, if the ACR density is given or parenchymal distortions are analysed, the technique will be a mammogram. Tomosyntesis is a mammography type. The report may include an ultrasound examination, a mammography examination or both."

    question_tipo["family"]= "does the patient have any family history in the following breast medical report?"
    previous_message_answer_tipo["family"]="family history of breast cancer is categorized based on the degree of relatives affected: First-degree relatives: Parents, siblings, or children. Second-degree relatives: Grandparents, aunts, uncles, nieces, nephews, or half-siblings. Third-degree relatives: Great-grandparents, great-aunts/uncles, or first cousins."

    question_tipo["prosthesis"]= "does the patient have a prosthesis in the following breast medical report?"
    previous_message_answer_tipo["prosthesis"]="it is normally clearly indicated at the beginning of the report. Sometimes it is written as implants instead of prosthesis."

    question_tipo["birads"]= "what is the final BI-RADS classification given to the patient in the following breast medical report?"
    previous_message_answer_tipo["birads"]="the final BI-RADS of the patient is given in the conclusions of the report, normally at the end."

    question_tipo["density_mammo"]= "what is the breast density found in the mammography study of the following breast medical report?"
    previous_message_answer_tipo["density_mammo"]="breast density in mammography is classified into four categories: ACR A (= Almost entirely fatty), ACR B (= Scattered areas of fibroglandular density), ACR C (= Heterogeneously dense), ACR D (= Extremely or very dense breasts). Sometimes it is written as 'density type x' or with their real meaning (very dense breasts = C)."

    question_tipo["density_echo"]= "what is the breast density found in the ultrasound study of the following breast medical report?"
    previous_message_answer_tipo["density_echo"]="breast composition in ultrasound is classified into four categories: fibroglandular and fat (mixed distribution of fibroglandular and adipose tissue), heterogeneous fibroglandular (predominantly fibroglandular tissue with varying echogenicity and scattered fat areas), homogeneous fatty (uniform fatty tissue with consistent echogenicity and minimal fibroglandular content), and homogeneous fibroglandular (uniform fibroglandular tissue with consistent echogenicity and minimal fat content)."

    question_tipo["calcifications_benign"]= "does the following breast medical report mention the appearence of benign calcifications in the mammography exam?"
    previous_message_answer_tipo["calcifications_benign"]="Consider only benign calcifications in the mammography."

    question_tipo["ganglio_mamo"]= "does the following breast medical report mention any lymph nodes in the mammography exam?"
    previous_message_answer_tipo["ganglio_mamo"]="Consider only lymph nodes that appear in the mammography."

    question_tipo["lymph_suspicious"]= "does the following breast medical report mention any suspicious axillary lymph nodes in the ultrasound exam?"
    previous_message_answer_tipo["lymph_suspicious"]="if a lymph node is suspicious the report will recomend a biopsy or Fine Needle Aspiration. A lymph node is considered suspicious when it has eccentric cortical thickening ≥ 3 mm, Short axis >10 mm, round shape, loss of fatty hilum, abnormal vascularity, or irregular margins—especially when associated with known malignancy or progressive enlargement. They can also be classified as UN3, UN4 or UN5. An exam may have both suspicious and benign lymph nodes."

    question_tipo["lymph_benign"]= "does the following breast medical report mention any benign or not suspicious axillary lymph nodes in the ultrasound exam?"
    previous_message_answer_tipo["lymph_benign"]="if a lymph node is suspicious the report will recomend a biopsy or Fine Needle Aspiration. A lymph node is considered benign when it has uniform cortex < 3 mm, preserved fatty hilum, oval shape, no abnormal vascularity, no irregular margins and homogeneous internal echo pattern. Benign axillary nodes can be classified as UN1 or UN2. A reactive axillary node is not suspicious. An exam may have both suspicious and benign lymph nodes."

    question_tipo["simple_cyst"]= "does the following breast medical report mention any simple cysts or microcysts in the ultrasound exam?"
    previous_message_answer_tipo["simple_cyst"]="The words symple cysts or microcysts will appear only in the ultrasound exam. Sometimes they can say that some of the cysts have echogenic content, but we still will consider them simple cysts and not nodules."

    question_tipo["ductal_ectasia"]= "does the following breast medical report mention any ductal ectasia in the ultrasound exam?"
    previous_message_answer_tipo["ductal_ectasia"]="The word ductal ectasia will appear only in the ultrasound exam."

    question_tipo["nodules_echo"]= "is there any nodule described in the ultrasound exam of the following breast medical report?"
    previous_message_answer_tipo["nodules_echo"]="The localization, echogenicity and size of the nodules are normally said."

    question_tipo["nodules_shape"]= "what is the shape of the first nodule described in the ultrasound exam of the following breast medical report?"
    previous_message_answer_tipo["nodules_shape"]="Shapes can be 'oval', 'round', 'lobulated' and 'irregular'. Sometimes irregular is also used for the margin, but in this case it is written as 'irregular margin' or 'irregular borders'. "

    question_tipo["nodules_margin"]= "what is the margin of the first nodule described in the ultrasound exam of the following breast medical report?"
    previous_message_answer_tipo["nodules_margin"]="Margin can be 'circumscribed' and 'not circumscribed'. Inside the not circumscribed we have 'spiculated', 'angulated', 'microlobulated' or 'indistinc' ('not defined') margins. Sometimes irregular is also used for the margin, but in this case it is written as 'irregular margin' or 'irregular borders', in this case classify it as 'not circumscribed'."

    question_tipo["nodules_echogenicity"]= "what is the echogenicity of the first nodule described in the ultrasound exam of the following breast medical report?"
    previous_message_answer_tipo["nodules_echogenicity"]="Echogenicity can be 'anechoic', 'hypoechoic', 'heterogeneous' and 'complex cystic and solid'."

    question_tipo["nodules_known"]= "is the first nodule described in the ultrasound exam of the following breast medical report previously known?"
    previous_message_answer_tipo["nodules_known"]="If the nodule is known from before the report, it will say if it it is stable or if it has grown or shrink."

    question_tipo["nodules_stable"]= "is the first known nodule described in the ultrasound exam stable in the following breast medical report?"
    previous_message_answer_tipo["nodules_stable"]="If the nodule is known from before the examination, it will be analysed to see if it is stable or if it got bigger or smaller. "

    def _flag_label(value,suffix,missing="no"):
        # Text cell -> "<lower-cased value> <suffix>"; non-text (empty) cell -> "<missing> <suffix>".
        if not isinstance(value,str):
            return missing+" "+suffix
        return value.lower()+" "+suffix

    def _choice_label(value,allowed,unknown):
        # Keep the annotation only when it is one of the known class names.
        if isinstance(value,str) and value in allowed:
            return value
        return unknown

    seen_keys=set()
    for i, report in enumerate(reports["informes_ingles"]):
        key=reports["keys"][i]

        # Skip reports without annotation and repeated report keys.
        if key not in ground_truth.index or key in seen_keys:
            continue
        seen_keys.add(key)

        informe=preprocess_text(report)
        row=ground_truth.loc[key]

        # question -> class name, in the order the examples are emitted.
        labels={}

        # Report type
        normal_control=False
        if row["Biopsy_report"]=="Yes":
            labels["tipo"]="biopsy report"
        elif row["Ganglio_report"]=="Yes":
            labels["tipo"]="nodal staging ultrasound report"
        else:
            normal_control=True
            labels["tipo"]="normal control or revision report"

        # Technique
        tecnica=row["Technique"]
        if tecnica=="ultrasound":
            labels["tecnica"]="only ultrasound study"
        elif tecnica=="mammography":
            labels["tecnica"]="only mammography study"
        elif not pd.isna(tecnica):
            labels["tecnica"]=tecnica
        else:
            # No technique annotated: report it and emit no "tecnica" example.
            # (Previously an all-zero one-hot vector was arg-maxed, which
            # silently labelled the example with global class 0.)
            print(key,report)

        # The remaining questions are not asked for biopsy or nodal staging reports.
        if normal_control:
            # Family history
            family=row["Family_history"]
            if not isinstance(family,str) or family=="No":
                labels["family"]="no family history"
            else:
                labels["family"]=family

            # Prosthesis
            prosthesis=row["Prosthesis"]
            if not isinstance(prosthesis,str) or prosthesis=="No":
                labels["prosthesis"]="no prosthesis"
            else:
                labels["prosthesis"]="yes prosthesis"

            # BI-RADS
            birads=row["BI-RADS"]
            labels["birads"]=birads if isinstance(birads,str) else "unknown BI-RADS"

            # Mammography findings
            labels["density_mammo"]=_choice_label(row["Density_mamo"],DENSITY_MAMMO,"unknown density mammo")
            labels["ganglio_mamo"]=_flag_label(row["Ganglio_mamo"],"ganglio")
            labels["calcifications_benign"]=_flag_label(row["Calcifications_benign_mamo"],"calcifications")

            # Ultrasound findings
            labels["density_echo"]=_choice_label(row["Density_eco"],DENSITY_ECHO,"unknown density echo")
            labels["simple_cyst"]=_flag_label(row["simple_cyst_eco"],"cyst")
            labels["lymph_suspicious"]=_flag_label(row["Ganglio_suspicious_eco"],"lymph suspicious")
            labels["lymph_benign"]=_flag_label(row["Ganglio_benign_eco"],"lymph benign")
            labels["ductal_ectasia"]=_flag_label(row["Ductal_ectasia_eco"],"ectasia")

            # Nodules: any text other than "No", or any int, counts as "yes".
            # NOTE(review): float cells (e.g. 2.0) are treated as "no nodules",
            # exactly as before -- confirm the column never holds floats.
            nodules_echo=row["Nodules_eco"]
            nodules_bool=isinstance(nodules_echo,(str,int)) and nodules_echo!="No"
            labels["nodules_echo"]="yes nodules" if nodules_bool else "no nodules"

            if nodules_bool:
                # Descriptors of the first nodule
                labels["nodules_shape"]=_choice_label(row["Shape_eco_1"],NODULES_SHAPE,"unknown shape")
                labels["nodules_margin"]=_choice_label(row["Margin_eco_1"],NODULES_MARGIN,"unknown margin")
                labels["nodules_echogenicity"]=_choice_label(row["Echogenicity_eco_1"],NODULES_ECHOGENICITY,"unknown echogenicity")

                # "new_eco_1" == "No" means the nodule is not new, i.e. already known.
                nodules_known=row["new_eco_1"]
                known_bool=False
                if not isinstance(nodules_known,str):
                    labels["nodules_known"]="unknown known"
                elif nodules_known=="No":
                    known_bool=True
                    labels["nodules_known"]="yes known"
                else:
                    labels["nodules_known"]="no known"

                if known_bool:
                    # Stability is only asked for previously known nodules.
                    labels["nodules_stable"]=_flag_label(row["Stable_eco_1"],"stable",missing="unknown")

        # Emit one flat example per collected question.
        for tipo,label in labels.items():
            key_tipo=key+"_"+tipo
            if key_tipo in flattened_examples:
                continue
            target=int(word_to_idx_out[label])
            flattened_examples[key_tipo]="Question: " + question_tipo[tipo] +" Extra information: "+ previous_message_answer_tipo[tipo]+ " Context: " + informe
            targets[key_tipo]=target
    return flattened_examples,targets




def visualize_errors(valid_dataset,valid_targets,validation_predictions,keys):
    """Print every misclassified example together with its predicted and true class.

    Args:
        valid_dataset: iterable with the example texts.
        valid_targets: iterable with the true label indices.
        validation_predictions: iterable with the predicted label indices.
        keys: iterable with one identifier per example.

    Label indices are translated to class names through the module-level
    ``idx_to_word_out`` mapping. Nothing is returned.
    """
    comparison = pd.DataFrame({
        'key': list(keys),
        'Text': list(valid_dataset),
        'True Label': list(valid_targets),
        'Predicted Label': list(validation_predictions),
    })

    # Keep only the rows where prediction and ground truth disagree.
    is_wrong = comparison['True Label'] != comparison['Predicted Label']
    mistakes = comparison[is_wrong]

    for _, wrong in mistakes.iterrows():
        print(wrong["key"], "EJEMPLO", wrong["Text"], "PREDICTED", sep="\n")
        print(idx_to_word_out[wrong["Predicted Label"]], "TRUE", sep="\n")
        print(idx_to_word_out[wrong["True Label"]])

def tokenize_function(examples, max_length=512):
    """Tokenize a batch of examples for the model named by the global ``model_name``.

    Args:
        examples: mapping with a ``"text"`` entry holding the batch of strings
            (the shape ``Dataset.map(..., batched=True)`` passes in).
        max_length: sequence length every example is padded/truncated to.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        The tokenizer output for the batch, padded and truncated to
        ``max_length``.

    Side effects:
        Prints a warning for every text whose untruncated encoding is longer
        than ``max_length``.
    """
    # The tokenizer is re-created on every call, as before.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    texts = examples["text"]

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

    # Truncation check: encode the whole batch once without truncation
    # (instead of one tokenizer call per text) and compare the lengths.
    batch = [texts] if isinstance(texts, str) else texts
    untruncated = tokenizer(
        batch,
        truncation=False,
        add_special_tokens=True
    )
    for text, input_ids in zip(batch, untruncated["input_ids"]):
        if len(input_ids) > max_length:
            print("⚠️ Truncation occurred!")
            print(f"Original length: {len(input_ids)}, Truncated to: {max_length}")
            print("Sample text:", text[:200], "...\n")

    return encoded
def train_clean(X, Y):
    """Shuffle the examples and attach their labels.

    Args:
        X: DataFrame of examples, indexed by example key.
        Y: DataFrame with a ``"label"`` column, indexed by the same keys.

    Returns:
        A new DataFrame with the rows of ``X`` in shuffled order
        (``random_state=1``), a ``"label"`` column taken from ``Y`` and a
        fresh 0..n-1 index.

    Side effects:
        Seeds Python's ``random`` module with 1 and prints the shuffled
        examples and their labels.
    """
    random.seed(1)

    # Shuffle the training rows to avoid ordering bias.
    shuffled = X.sample(frac=1, random_state=1)
    print(shuffled)

    # Labels re-ordered to follow the shuffled examples.
    shuffled_labels = Y.loc[shuffled.index]
    print(shuffled_labels)

    labelled = shuffled.assign(label=shuffled_labels["label"])
    return labelled.reset_index(drop=True)

def evaluate_per_question(predicted, tested, DICTIONARY):
    """
    Evaluate model predictions per question type.

    Every question owns a contiguous range of global label indices, laid out
    in the iteration order of ``DICTIONARY``. Each sample is assigned to the
    question its *true* label belongs to, then a classification report and
    per-class / overall accuracies are printed for every question.

    Parameters:
    - predicted: array of predicted label indices (flattened from all folds)
    - tested: array of true label indices (same shape as predicted)
    - DICTIONARY: dict mapping each question to its list of class names

    Returns:
        None. All results are printed.

    Raises:
        KeyError: if a true label index is outside the ranges defined by
            ``DICTIONARY``.
    """

    # Step 1: Build global index → (question, class_name) mapping
    idx_to_question_value = {}
    question_offsets = {}
    offset = 0
    for question, class_list in DICTIONARY.items():
        question_offsets[question] = offset
        for i, label in enumerate(class_list):
            idx_to_question_value[offset + i] = (question, label)
        offset += len(class_list)

    # Step 2: Group predictions by the question of the true label.
    # Plain dicts are used so the function does not depend on
    # collections.defaultdict being imported somewhere else in the notebook.
    per_question_true = {}
    per_question_pred = {}

    for true_idx, pred_idx in zip(tested, predicted):
        q_true, _ = idx_to_question_value[true_idx]
        per_question_true.setdefault(q_true, []).append(true_idx)
        per_question_pred.setdefault(q_true, []).append(pred_idx)

    # Step 3: Classification reports
    print("\n🔍 Per-question classification reports:\n")
    for question, true_labels in per_question_true.items():
        pred_labels = per_question_pred[question]
        label_names = DICTIONARY[question]
        start = question_offsets[question]
        end = start + len(label_names)
        question_label_ids = list(range(start, end))

        print(f"\n📘 Question: {question}")
        try:
            print(classification_report(true_labels, pred_labels, labels=question_label_ids, target_names=label_names))
        except Exception as e:
            # Best effort: a failing report must not abort the other questions.
            print(f"⚠️ Could not generate report for '{question}': {e}")

    # Step 4: Accuracy per class and per question
    print("\n📊 Accuracy per class and per question:\n")
    for question in DICTIONARY:
        y_true = np.array(per_question_true.get(question, []))
        y_pred = np.array(per_question_pred.get(question, []))
        class_list = DICTIONARY[question]
        start = question_offsets[question]

        if len(y_true) == 0:
            print(f"\n❌ {question}: [No data]")
            continue

        print(f"\n✅ Accuracy for: {question}")
        # Per-class accuracy: restricted to the samples whose true label is that class
        for i, class_name in enumerate(class_list):
            global_idx = start + i
            mask = y_true == global_idx
            if mask.sum() == 0:
                print(f"  {class_name}: [No samples]")
                continue
            acc = accuracy_score(y_true[mask], y_pred[mask])
            print(f"  {class_name}: {acc:.4f}")

        # Overall accuracy for the question
        overall_acc = accuracy_score(y_true, y_pred)
        print(f"🎯 Overall accuracy: {overall_acc:.4f}")



def train_save(X,Y,num_labels=64):
    """Fine-tune a sequence classifier on all examples in two stages and save it.

    Stage 1 trains with the base model frozen using the global
    ``training_args``; stage 2 unfreezes the base model and continues with the
    global ``training_args_all``. The checkpoint and tokenizer are selected by
    the global ``model_name``.

    Args:
        X: DataFrame with a ``"text"`` column, indexed by example key.
        Y: DataFrame with a ``"label"`` column, indexed by the same keys.
        num_labels: size of the classification head. Defaults to 64, the
            value that used to be hard-coded.
            NOTE(review): presumably this should equal ``len(outputs)`` -- confirm.

    Side effects:
        Seeds ``random`` and ``numpy.random`` with 1, prints progress
        information and writes the final model to
        ``results/<model_name>_model_final_classification``.
    """
    # Imported locally: the import cell visible at the top of the notebook
    # only brings in AutoModelForCausalLM.
    from transformers import AutoModelForSequenceClassification

    random.seed(1)
    np.random.seed(1)

    train =train_clean(X,Y)
    print(len(train))

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # Stage 1: freeze the base model.
    for param in model.base_model.parameters():
        param.requires_grad = False

    train_data = Dataset.from_pandas(train)
    train_data = train_data.map(tokenize_function, batched=True)
    train_data = train_data.rename_column("label", "labels")
    train_data = train_data.remove_columns(["text"])
    train_data.set_format("torch")

    print(train_data)

    # Membership has to be tested against the column names; `in train_data`
    # compares the string with the dataset's rows and is never true.
    if 'token_type_ids' in train_data.column_names:
        train_data.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )
    trainer.train()

    # Stage 2: unfreeze the base model and continue training.
    for param in model.base_model.parameters():
        param.requires_grad = True
    trainer = Trainer(
        model=model,
        args=training_args_all,
        train_dataset=train_data,
    )
    trainer.train()
    trainer.save_model(f"results/{model_name}_model_final_classification")

In [None]:
# Build the flat example tables: one row per "<report key>_<question>" example.
inputs, targets = flatten_and_filter_dataset(ground_truth, report_data)

# Prompt texts, indexed by example key.
dataset_final = pd.DataFrame.from_dict(inputs, orient='index')
dataset_final.columns = ["text"]

# Integer class labels, indexed by the same example keys.
targets = pd.DataFrame.from_dict(targets, orient='index')
targets.columns = ["label"]

In [None]:
# Checkpoint read by tokenize_function and train_save.
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

# Flat example tables: prompt texts and integer labels indexed by example key.
inputs, targets = flatten_and_filter_dataset(ground_truth, report_data)
dataset_final = pd.DataFrame.from_dict(inputs, orient='index')
dataset_final.columns = ["text"]
targets = pd.DataFrame.from_dict(targets, orient='index')
targets.columns = ["label"]

# Stage 1 (base model frozen in train_save).
training_args = TrainingArguments(
    learning_rate=0.01,
    weight_decay=0.05,
    per_device_train_batch_size=16,
    num_train_epochs=5,
)

# Stage 2 (base model unfrozen in train_save).
training_args_all = TrainingArguments(
    output_dir='./results',
    learning_rate=0.00005,
    weight_decay=0.05,
    per_device_train_batch_size=16,
    num_train_epochs=8,
)

In [None]:
# Run the two-stage fine-tuning with the configuration defined in the previous cell.
train_save(dataset_final, targets)

In [None]:
# Checkpoint read by tokenize_function and train_save.
model_name = "dmis-lab/biobert-base-cased-v1.1"

# Flat example tables: prompt texts and integer labels indexed by example key.
inputs, targets = flatten_and_filter_dataset(ground_truth, report_data)
dataset_final = pd.DataFrame.from_dict(inputs, orient='index')
dataset_final.columns = ["text"]
targets = pd.DataFrame.from_dict(targets, orient='index')
targets.columns = ["label"]

# Stage 1 (base model frozen in train_save).
training_args = TrainingArguments(
    learning_rate=0.01,
    weight_decay=0.05,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    logging_dir='./logs',
    logging_steps=10,
)

# Stage 2 (base model unfrozen in train_save).
training_args_all = TrainingArguments(
    learning_rate=0.00005,
    weight_decay=0.05,
    per_device_train_batch_size=16,
    num_train_epochs=5,
)

train_save(dataset_final, targets)

In [None]:
# Checkpoint read by tokenize_function and train_save.
model_name = "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"

# Flat example tables: prompt texts and integer labels indexed by example key.
inputs, targets = flatten_and_filter_dataset(ground_truth, report_data)
dataset_final = pd.DataFrame.from_dict(inputs, orient='index')
dataset_final.columns = ["text"]
targets = pd.DataFrame.from_dict(targets, orient='index')
targets.columns = ["label"]

# Stage 1 (base model frozen in train_save).
training_args = TrainingArguments(
    learning_rate=0.01,
    weight_decay=0.05,
    per_device_train_batch_size=16,
    num_train_epochs=5,
)

# Stage 2 (base model unfrozen in train_save).
training_args_all = TrainingArguments(
    learning_rate=0.00005,
    weight_decay=0.05,
    per_device_train_batch_size=16,
    num_train_epochs=8,
)

train_save(dataset_final, targets)