# Evaluation dataset

In [46]:
import pandas as pd
import os
from dotenv import load_dotenv
import deepl
import json

In [113]:
# Load variables from the local .env file; override=True lets values from the file
# replace variables that are already set in the process environment.
load_dotenv(override=True)
# The DeepL key is read from the environment so it never appears in the notebook.
DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")
# DeepL client used by the translation cells below.
translator = deepl.Translator(DEEPL_API_KEY)

GSC+ Translation

In [32]:
# Folder that holds the GSC+ annotation files (path relative to this notebook).
annotations_folder = "../../datasets/GSC+/Annotations"

In [33]:
# Collect the set of HPO codes that appear in the annotation files.
evaluation_hpo_codes = set()
# os.listdir() returns entries in arbitrary order, so slicing off "the first entry"
# drops an unpredictable file. Skip hidden entries and non-files explicitly instead.
annotation_files = sorted(
    name
    for name in os.listdir(annotations_folder)
    if not name.startswith(".") and os.path.isfile(os.path.join(annotations_folder, name))
)
for file in annotation_files:
    # Each line is split on a tab or a "|" into: positions, HPO code, HPO name.
    df = pd.read_csv(os.path.join(annotations_folder, file), sep=r"[\t|]", header=None, names=["positions", "hpo_code", "hpo_name"], engine='python')
    # Remove the padding left around the "|" separator so that codes compare cleanly.
    evaluation_hpo_codes.update(df.hpo_code.dropna().astype(str).str.strip().unique())

In [92]:
# Read every GSC+ text file into memory, keyed by file name.
text_folder = "../../datasets/GSC+/Text"
text_content = {}
for file_name in os.listdir(text_folder):
    source_path = os.path.join(text_folder, file_name)
    # Undecodable bytes are dropped (errors="ignore") rather than raising.
    with open(source_path, "r", encoding="utf-8", errors="ignore") as handle:
        text_content[file_name] = handle.read()

# Total number of characters across all documents.
total_chars = sum(len(body) for body in text_content.values())

In [106]:
# Split the dictionary keys (document ids) into chunks of 50 by default
def chunk_dict_keys(d, chunk_size=50):
    """Split the keys of ``d`` into consecutive lists of at most ``chunk_size`` keys.

    Args:
        d: Mapping whose keys are chunked, in iteration order.
        chunk_size: Maximum number of keys per chunk.

    Returns:
        A list of lists of keys; only the last list may be shorter than
        ``chunk_size``. An empty mapping yields an empty list.
    """
    keys = [*d.keys()]
    starts = range(0, len(keys), chunk_size)
    return [keys[start:start + chunk_size] for start in starts]

# Group the text file names into batches of 20 (overrides the helper's default of 50).
chunks = chunk_dict_keys(text_content, chunk_size=20)

In [107]:
def save_chunks_to_json(chunks, filename="chunks.json"):
    """Write ``chunks`` to ``filename`` as UTF-8 JSON indented with 4 spaces.

    Args:
        chunks: JSON-serialisable object (here, a list of lists of file names).
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        json.dump(chunks, out, indent=4)

def load_chunks_from_json(filename="chunks.json"):
    """Read a UTF-8 JSON file and return the decoded Python object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded content (a list of lists when the file holds chunks).
    """
    with open(filename, encoding="utf-8") as src:
        return json.load(src)

# Persist the chunking to disk and immediately reload it, so that `chunks` holds
# exactly what is stored in the JSON file.
save_chunks_to_json(chunks, "../../resources/evalGCS+_chunks.json")
chunks = load_chunks_from_json("../../resources/evalGCS+_chunks.json")

In [53]:
# Create the folder for the translated texts. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs("../../datasets/GCS+_ESP/Text", exist_ok=True)

In [36]:
# Report the corpus size in characters (total_chars is computed when the text files are read).
print(f"There eval dataset has {total_chars} characters in total")

There eval dataset has 226191 characters in total


In [62]:
def save_to_txt(ids, traducciones, dir):
    """Write each translation to its own text file inside ``dir``.

    Args:
        ids: File names, one per translation; each is joined onto ``dir``.
        traducciones: Translated texts, in the same order as ``ids``.
        dir: Existing directory that receives the files.

    Files are always written as UTF-8 so that accented Spanish characters are
    stored the same way on every platform, instead of depending on the locale's
    default encoding.
    """
    for file_name, traduccion in zip(ids, traducciones):
        with open(os.path.join(dir, file_name), "w", encoding="utf-8") as fp:
            fp.write(traduccion)

In [None]:
from tqdm import tqdm

# Folder where the translated files are stored; also used to detect finished chunks.
translated_dir = "../../datasets/GCS+_ESP/Text"
idioma_destino = "ES"  # Target language code (ES = Spanish, EN = English, etc.)
# Number of pending chunks translated per execution of this cell.
chunks_per_run = 1

for _ in tqdm(range(chunks_per_run), desc="Traduciendo"):
    processed_codes = set(os.listdir(translated_dir))
    # First chunk that still has at least one file without a translation on disk.
    # The previous index-based search compared the loop index with len(chunks), which
    # can never be equal, so a fully translated corpus re-sent the last chunk to the API.
    pending_chunk = next(
        (chunk for chunk in chunks if not all(s in processed_codes for s in chunk)),
        None,
    )
    if pending_chunk is None:
        # Everything is already translated: nothing left to send.
        break

    texto_original = [text_content[j] for j in pending_chunk]
    traduccion = translator.translate_text(texto_original, target_lang=idioma_destino)
    texto_traduccion = [t.text for t in traduccion]

    # One output file per source document, named like the original file.
    save_to_txt(pending_chunk, texto_traduccion, translated_dir)

In [105]:
# Check that every original file has a translated counterpart:
# the displayed set lists original file names with no translation.
og_files = os.listdir("../../datasets/GSC+/Text")
esp_files = os.listdir("../../datasets/GCS+_ESP/Text")
set(og_files).difference(esp_files)

set()

RAG-HPO Dataset

In [110]:
# Load the RAG-HPO test cases and give the three columns short names.
df = (
    pd.read_excel("../../datasets/Test_Cases.xlsx")
    .set_axis(["id", "eng", "esp"], axis="columns")
)
df.head(3)

Unnamed: 0,id,eng,esp
0,1,A 44-year- old super-morbidly- obese man body ...,Un hombre de 44 años con obesidad mórbida e ín...
1,2,A 32-year-old man presented to a regional gene...,Un hombre de 32 años acudió a una unidad regio...
2,3,"In December 1990, a 24-year-old female was ref...","En diciembre de 1990, una mujer de 24 años fue..."


In [111]:
# Rows whose Spanish text is still missing, and how many characters they add up to.
untranslated = df[df.esp.isna()]
print(f"Documentos no traducidos: {len(untranslated)}")
print(f"Caracteres a traducir: {sum(map(len, untranslated.eng))}")

Documentos no traducidos: 82
Caracteres a traducir: 155294


In [132]:
# Translate the rows that still lack a Spanish text, at most 10 per request,
# until no missing value is left in the "esp" column.
while df.esp.isna().any():
    pending = df[df.esp.isna()].iloc[:10]
    idioma_destino = "ES"  # Target language code (ES = Spanish, EN = English, etc.)
    traduccion = translator.translate_text(pending.eng.to_list(), target_lang=idioma_destino)
    df.loc[pending.index, "esp"] = [t.text for t in traduccion]

In [None]:
# Drop the last character of every id and convert the remainder to int.
# NOTE(review): presumably the raw ids carry one trailing non-digit character - confirm.
# The cell is not idempotent: on a second run the ids are ints and slicing them fails.
df["id"] = df["id"].map(lambda raw_id: int(raw_id[:-1]))

In [None]:
# Save the translated cases. The DataFrame index is written too (no index=False),
# which appears as an "Unnamed: 0" column when the file is read back.
df.to_csv("../../datasets/RAG-HPO/Test_Cases.csv")

In [186]:
# Reload the saved file to verify its size.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the session.
eval = pd.read_csv("../../datasets/RAG-HPO/Test_Cases.csv")
eval.shape

(116, 4)

Lectura y asociación de los textos con sus anotaciones

In [180]:
# Read the analysis workbook without a header row (columns are numbered 0, 1, 2, ...)
# and keep only the first three columns.
df = pd.read_excel(
    "../../datasets/RAG-HPO/RAG-HPO_Tests_and_Data_Analysis.xlsx",
    header=None,
).loc[:, [0, 1, 2]]

In [171]:
import numpy as np
# Spot-check: is the cell at row 3099, column 0 empty?
pd.isna(df.loc[3099, 0])

True

In [217]:
# Walk the sheet row by row and build `annotations`: case id -> list of the strings
# found in column 2 while a "Manually Assigned HPO Terms" section is being read.
# NOTE(review): `id` shadows the built-in id() for the rest of the session.
annotations = {}
id = None      # id of the case whose section is currently being read
read = False   # True while rows belong to a manually-annotated section
for i, row in df.iterrows():
    # An empty cell in column 2 ends the current section...
    if pd.isna(row[2]):
        read=False
    # ...unless this same row is a section header, which (re)starts reading below.
    if row[1] == "Manually Assigned HPO Terms":
        # Store the previous case before starting a new one. If no id was found for
        # the previous section, its collected terms are discarded here.
        if id is not None:
            annotations[id] = annot_list
        id = None
        annot_list = []
        read=True
        # The header row itself may carry the case number in column 0.
        # NOTE(review): the +96 offset presumably maps a second group of cases that is
        # numbered from 1 onto ids 97 and up - confirm against the spreadsheet.
        if not pd.isna(row[0]) and row[0]!="Case":
            id = row[0] + 96

    # Otherwise the id is taken from the "Phenotype name" row: either from the row
    # right below a "Case" label, or directly from column 0 of this row.
    if read and row[1] == "Phenotype name" and id is None:
        if row[0] == "Case":
            id = df.loc[i+1, 0]
        else:
            id = row[0]

    # Collect every string in column 2 except the 'HPO ID' header and 'none' placeholders.
    if read and isinstance(row[2], str) and row[2] not in ['none', 'HPO ID']:
        annot_list.append(row[2])
# Store the last case, which has no following header to trigger the save.
# `annot_list` only exists once a section header has been seen, and `id` may still be None here.
annotations[id] = annot_list

In [196]:
# Number of cases for which annotations were collected.
len(annotations)

116

In [218]:
# Inspect the HPO ids collected for case 104.
annotations[104]

['HP:0000832',
 'HP:0000988',
 'HP:0001000',
 'HP:0001880',
 'HP:0001974',
 'HP:0002728',
 'HP:0002925',
 'HP:0004322',
 'HP:0005407',
 'HP:0010280',
 'HP:0010783',
 'HP:0025092',
 'HP:0031392',
 'HP:0031446',
 'HP:0031507',
 'HP:0032210',
 'HP:0033078',
 'HP:0033425',
 'HP:0040189',
 'HP:0045080',
 'HP:0100643',
 'HP:0100827',
 'HP:0200041',
 'HP:0025697']

In [199]:
# Load the Spanish HPO ontology. The encoding is given explicitly so that accented
# characters are decoded the same way on every platform, not via the locale default.
with open(os.path.join("../../resources", "hpo_es.json"), "r", encoding="utf-8") as fp:
    hpo = json.load(fp)

In [201]:
# Index the ontology by term id, keeping only the fields of interest.
# A field that is missing from an entry is simply left out of that entry's dict.
fields = ["esp_name", "esp_def", "is_a"]
hpo_dict = {
    term["id"]: {name: term[name] for name in fields if name in term}
    for term in hpo
}

In [219]:
# Spot-check one ontology entry.
hpo_dict['HP:0000832']

{'esp_name': 'Hipotiroidismo primario',
 'esp_def': 'Tipo de hipotiroidismo que resulta de un defecto en la glándula tiroides.',
 'is_a': 'HP:0000821 ! Hypothyroidism'}

In [225]:
# Reload the saved test cases and show the row with index label 103.
test = pd.read_csv("../../datasets/RAG-HPO/Test_Cases.csv")
print(test.loc[103])

Unnamed: 0                                                  103
id                                                          104
eng            A young girl in her early childhood was refer...
esp            Una niña de corta edad fue remitida a nuestra...
Name: 103, dtype: object


In [None]:
# Remove the index column that was saved with the CSV. errors="ignore" keeps the cell
# re-runnable: once the file has been rewritten below without the index, a fresh
# read no longer contains "Unnamed: 0" and a plain drop would raise KeyError.
test = test.drop(columns="Unnamed: 0", errors="ignore")
# Attach the HPO ids of each case. `annotations` must still be the id -> list dict here
# (a later cell rebinds the name to a list); an unknown id raises KeyError on purpose.
test["annotations"] = test.id.apply(lambda x: annotations[x])
# Overwrite the file, this time without the index column.
test.to_csv("../../datasets/RAG-HPO/Test_Cases.csv", index=False)

Lectura y asociación de términos del GSC

In [267]:
# Pair every translated GSC+ text with the HPO codes of its annotation file.
# Note: this rebinds `annotations`, which earlier held the RAG-HPO id -> terms dict.
texts = []
annotations = []
for file in os.listdir("../../datasets/GCS+_ESP/Text"):
    # Read explicitly as UTF-8 so that accented characters do not depend on the
    # platform's default encoding.
    with open(os.path.join("../../datasets/GCS+_ESP/Text", file), "r", encoding="utf-8") as fp:
        texts.append(fp.read())
    # The annotation file shares its name with the text file; column 1 holds
    # "<code> | <name>", of which only the code is kept.
    annots = pd.read_csv(os.path.join("../../datasets/GSC+/Annotations", file), header=None, sep="\t")
    annots[1] = annots[1].apply(lambda x: x.split("|")[0].strip())
    annotations.append(annots[1].to_list())

# `test2` is used by the following cells but was not defined anywhere in the notebook,
# so a fresh kernel failed with NameError. It is rebuilt here with the same two
# columns as `final_test` so that both frames can be concatenated.
test2 = pd.DataFrame({"texts": texts, "annotations": annotations})

Concatenación de ambos

In [288]:
# Sizes of the two evaluation sets before concatenation
# (test: RAG-HPO cases; test2: presumably the GSC+ documents - confirm).
test.shape, test2.shape

((116, 4), (228, 2))

In [276]:
# Keep only the Spanish text and its annotations, exposing the text column as "texts".
final_test = test[["esp", "annotations"]].rename(columns={"esp": "texts"})

In [290]:
# Stack both evaluation sets. ignore_index=True renumbers the rows so the result has
# no duplicated index labels (both inputs start at 0).
# This cell appends to `final_test`, so re-running it alone adds `test2` again.
final_test = pd.concat([final_test, test2], ignore_index=True)

In [291]:
# Write the combined evaluation set to disk without the index column.
final_test.to_csv("../../datasets/TFM_test.csv", index=False)