# Traducir y crear los datasets de evaluación

In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
import deepl
import json
import re

In [113]:
# Read settings from the local .env file; override=True lets values from .env
# replace variables that are already present in the process environment.
load_dotenv(override=True)

# The DeepL key is looked up in the environment (None when it is not defined).
DEEPL_API_KEY = os.environ.get("DEEPL_API_KEY")
translator = deepl.Translator(DEEPL_API_KEY)

## Traducción del dataset GSC

In [32]:
# Directory with the GSC+ annotation files (relative to the notebook's working directory).
annotations_folder = "../../datasets/GSC+/Annotations"

In [33]:
# Gather every distinct HPO code that appears in the GSC+ annotation files.
# Each file is parsed with either a tab or a pipe as field separator, giving
# three columns: positions, HPO code and HPO name.
# NOTE(review): the [1:] slice discards whichever entry os.listdir() returns
# first; listing order is not guaranteed, so confirm which file is meant to be
# skipped (a hidden/system file, presumably).
annotation_columns = ["positions", "hpo_code", "hpo_name"]
evaluation_hpo_codes = set()
for file in os.listdir(annotations_folder)[1:]:
    annotation_path = os.path.join(annotations_folder, file)
    df = pd.read_csv(
        annotation_path,
        sep=r"[\t|]",
        header=None,
        names=annotation_columns,
        engine="python",
    )
    evaluation_hpo_codes |= set(df["hpo_code"].unique())

In [92]:
# Load every GSC+ source document into memory, keyed by its file name, and
# measure the corpus size in characters (used to estimate translation cost).
# Bytes that are not valid UTF-8 are silently dropped (errors="ignore").
text_folder = "../../datasets/GSC+/Text"
text_content = {}
for file_name in os.listdir(text_folder):
    source_path = os.path.join(text_folder, file_name)
    with open(source_path, "r", encoding="utf-8", errors="ignore") as file:
        content = file.read()
    text_content[file_name] = content

# File names are unique, so summing over the dict equals a running total.
total_chars = sum(len(document) for document in text_content.values())

In [106]:
# Split the ids (dictionary keys) into chunks of 50 by default
def chunk_dict_keys(d, chunk_size=50):
    """Group the keys of ``d`` into consecutive lists of at most ``chunk_size`` keys.

    Args:
        d: Mapping whose keys are chunked, in the mapping's iteration order.
        chunk_size: Maximum number of keys per chunk.

    Returns:
        A list of lists of keys; the last chunk may be shorter. An empty
        mapping yields an empty list.
    """
    keys = [*d.keys()]
    chunked = []
    for start in range(0, len(keys), chunk_size):
        chunked.append(keys[start:start + chunk_size])
    return chunked

# Batch the document names in groups of 20 (overrides the helper's default of 50).
chunks = chunk_dict_keys(text_content, chunk_size=20)

In [107]:
def save_chunks_to_json(chunks, filename="chunks.json"):
    """Serialize ``chunks`` as pretty-printed JSON (4-space indent) into ``filename``.

    Args:
        chunks: JSON-serializable object, here the list of id chunks.
        filename: Destination path; the file is created or overwritten as UTF-8.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.write(json.dumps(chunks, indent=4))

def load_chunks_from_json(filename="chunks.json"):
    """Read back the chunk list stored as JSON in ``filename``.

    Args:
        filename: Path of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON content (a list of lists for the chunk files).
    """
    with open(filename, "r", encoding="utf-8") as source:
        return json.load(source)

# Persist the chunk layout and read it straight back, so `chunks` holds the
# value exactly as stored on disk (later runs can resume from the same file).
save_chunks_to_json(chunks, "../../resources/evalGCS+_chunks.json")
chunks = load_chunks_from_json("../../resources/evalGCS+_chunks.json")

In [53]:
# Make sure the output folder for the Spanish translations exists.
# exist_ok=True keeps the cell idempotent on re-runs and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("../../datasets/GCS+_ESP/Text", exist_ok=True)

In [36]:
# Report the size of the source corpus in characters (accumulated while the text files were read).
# NOTE(review): the message reads "There eval dataset" — probably meant "The eval dataset".
print(f"There eval dataset has {total_chars} characters in total")

There eval dataset has 226191 characters in total


In [62]:
def save_to_txt(ids, traducciones, dir):
    """Write each translation to its own file inside ``dir``.

    Args:
        ids: File names, one per translation.
        traducciones: Translated texts, paired positionally with ``ids``.
        dir: Existing directory where the files are written.

    Pairs are formed with ``zip``, so surplus items in the longer sequence
    are ignored. Existing files are overwritten.
    """
    # NOTE(review): files are written with the platform's default encoding;
    # whatever reads them back must use the same one.
    for file_name, text in zip(ids, traducciones):
        target_path = os.path.join(dir, file_name)
        with open(target_path, "w") as out:
            out.write(text)

In [None]:
from tqdm import tqdm

# Translate the GSC+ documents chunk by chunk, resuming from what is already on
# disk. Each pass of the outer loop sends ONE pending chunk to DeepL; raise the
# range() value to translate more chunks per execution of this cell.
translated_dir = "../../datasets/GCS+_ESP/Text"
for _ in tqdm(range(1), desc="Traduciendo"):
    processed_codes = os.listdir(translated_dir)
    already_translated = set(processed_codes)

    # Index of the first chunk that still has at least one untranslated file,
    # or None when every chunk is complete (also covers an empty `chunks`).
    i = next(
        (
            idx
            for idx, chunk in enumerate(chunks)
            if any(name not in already_translated for name in chunk)
        ),
        None,
    )
    if i is None:
        # Everything is translated: stop here so no finished chunk is sent to
        # the API again (that would only burn translation quota).
        break

    texto_original = [text_content[j] for j in chunks[i]]
    idioma_destino = "ES"  # DeepL target language code (ES = Spanish, EN = English, ...)

    traduccion = translator.translate_text(texto_original, target_lang=idioma_destino)
    texto_traduccion = [t.text for t in traduccion]

    # One output file per document, named like the source file.
    save_to_txt(chunks[i], texto_traduccion, translated_dir)

In [None]:
# Check that every source document has been translated: the cell displays the
# file names present in the original folder but missing from the Spanish one
# (an empty set means the translation is complete).
og_files = os.listdir("../../datasets/GSC+/Text")
esp_files = os.listdir("../../datasets/GCS+_ESP/Text")
set(og_files).difference(esp_files)

## Traducción del dataset RAG-HPO 

In [None]:
# Load the RAG-HPO test cases and relabel the columns as: case id, English text, Spanish text.
# NOTE(review): assumes the sheet has exactly three columns in that order — confirm against the workbook.
df = pd.read_excel("../../datasets/Test_Cases.xlsx")
df.columns  = ["id", "eng", "esp"]
df.head(3)

In [None]:
# Report how much text still lacks a Spanish translation: rows whose `esp`
# cell is empty, and the total length of their English text.
pending = df[df.esp.isna()]
print(f"Documentos no traducidos: {len(pending)}")
print(f"Caracteres a traducir: {sum(map(len, pending.eng))}")

In [132]:
# Fill in the missing Spanish texts in batches of up to 10 rows per DeepL
# request, until no row is left without a translation.
while df.esp.isna().any():
    batch = df[df.esp.isna()].iloc[:10]
    texto_original = batch.eng.to_list()
    idxs = batch.index
    idioma_destino = "ES"  # DeepL target language code (ES = Spanish, EN = English, ...)
    traduccion = translator.translate_text(texto_original, target_lang=idioma_destino)
    texto_traduccion = [t.text for t in traduccion]
    # Write the translations back onto the same rows they came from.
    df.loc[idxs, "esp"] = texto_traduccion

In [None]:
# Turn the case ids into integers by dropping their final character first.
# NOTE(review): presumably the ids are strings with a one-character suffix —
# confirm. The step is not idempotent: a second run fails because integers
# cannot be sliced.
df["id"] = [int(raw_id[:-1]) for raw_id in df["id"]]

In [None]:
# Persist the translated test cases. The index is written as well (no index=False),
# so the file gains an unnamed leading column when it is read back.
df.to_csv("../../datasets/RAG-HPO/Test_Cases.csv")

In [186]:
# Reload the saved test cases to sanity-check the CSV round trip.
# The frame is bound to `eval_df` instead of `eval` so the builtin eval() is
# not shadowed for the rest of the kernel session.
eval_df = pd.read_csv("../../datasets/RAG-HPO/Test_Cases.csv")
eval_df.shape

(116, 4)

Lectura y asociación de los textos con sus anotaciones

In [180]:
# Load the RAG-HPO analysis workbook without a header row (columns get integer
# labels) and keep only the first three columns: 0, 1 and 2.
raw_sheet = pd.read_excel("../../datasets/RAG-HPO/RAG-HPO_Tests_and_Data_Analysis.xlsx", header=None)
df = raw_sheet.loc[:, [0, 1, 2]]

In [217]:
# Walk the workbook rows (columns 0-2 of `df`) and build `annotations`:
# a dict mapping a case id to the list of strings found in column 2 while
# inside a "Manually Assigned HPO Terms" section.
# NOTE(review): `id` shadows the builtin id() for the rest of the session.
annotations = {}
id = None      # id of the case currently being collected (None until resolved)
read = False   # True while rows belong to a manually-assigned-terms section
for i, row in df.iterrows():
    # An empty column 2 ends the current collection run.
    if pd.isna(row[2]):
        read=False
    # Section header: flush the previous case and start a new one.
    if row[1] == "Manually Assigned HPO Terms":
        # A previous section whose id was never resolved is dropped here.
        if id is not None:
            annotations[id] = annot_list
        id = None
        annot_list = []
        read=True
        # Header rows carrying a value in column 0 define the id directly.
        # NOTE(review): the +96 offset is undocumented — presumably it maps a
        # second numbering range onto the ids of the test cases; confirm.
        if not pd.isna(row[0]) and row[0]!="Case":
            id = row[0] + 96

    # Otherwise the id is taken from the "Phenotype name" row: either from the
    # next row's column 0 (when this row holds the literal "Case") or from
    # this row's column 0.
    if read and row[1] == "Phenotype name" and id is None:
        if row[0] == "Case":
            id = df.loc[i+1, 0]
        else:
            id = row[0]

    # Collect every string in column 2 except the placeholders/headers.
    if read and isinstance(row[2], str) and row[2] not in ['none', 'HPO ID']:
        annot_list.append(row[2])
# Flush the last case. This stores under the key None if the id was never
# resolved, and raises NameError if no section header row was ever seen
# (annot_list is only created on a header row).
annotations[id] = annot_list

In [None]:
# Attach the manually assigned HPO annotations to every RAG-HPO test case and
# persist the result.
# Only well-formed HPO identifiers ("HP:" followed by exactly 7 digits) are kept.
hpo_code_pattern = re.compile(r"^HP:\d{7}$")

# Read from the same location the file is written to (every other cell uses
# the "../../datasets" prefix).
test = pd.read_csv("../../datasets/RAG-HPO/Test_Cases.csv")
# The index column only exists on the first pass over the file;
# errors="ignore" keeps the cell re-runnable after it has been removed.
test = test.drop(columns="Unnamed: 0", errors="ignore")

# `annotations` maps case id -> list of raw strings, so the values are already
# lists and need no eval(); a missing id raises KeyError on purpose.
test["annotations"] = test.id.apply(lambda x: annotations[x])
test["annotations"] = test["annotations"].apply(
    lambda codes: [
        code
        for code in (raw.strip() for raw in codes)
        if hpo_code_pattern.match(code)
    ]
)
test.to_csv("../../datasets/RAG-HPO/Test_Cases.csv", index=False)

Lectura y asociación de términos del GSC

In [267]:
# Pair every translated GSC+ document with the HPO codes of its annotation
# file: texts[k] is the Spanish text and annotations[k] the list of codes of
# the k-th file in the translated folder.
# NOTE(review): the texts are read with the platform's default encoding, which
# has to match the one used when they were written.
esp_text_dir = "../../datasets/GCS+_ESP/Text"
gsc_annotation_dir = "../../datasets/GSC+/Annotations"

texts = []
annotations = []
for file in os.listdir(esp_text_dir):
    with open(os.path.join(esp_text_dir, file), "r") as fp:
        texts.append(fp.read())

    # The annotation file has the same name as the text file; column 1 holds
    # "code | name", of which only the code part is kept.
    annots = pd.read_csv(os.path.join(gsc_annotation_dir, file), header=None, sep="\t")
    annots[1] = [entry.split("|")[0].strip() for entry in annots[1]]
    annotations.append(annots[1].to_list())