# HPO ontology translation

In [73]:
import requests
import os
from dotenv import load_dotenv
import deepl
import re
import pandas as pd
from tqdm import tqdm

In [2]:
load_dotenv(override=True)
DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")
translator = deepl.Translator(DEEPL_API_KEY)

## Retrieving ontology by API

In [74]:
def download_request(url, file_name):
    response = requests.get(url)

    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
        print(f"Archivo '{file_name}' descargado con éxito.")
    else:
        print(f"Error: {response.status_code} - {response.json().get('message', 'Error desconocido')}")

In [4]:
def github_download_request(url, file_name):
    # Hacer la solicitud GET
    response = requests.get(url)

    if response.status_code == 200:
        file_data = response.json()
        download_url = file_data["download_url"]  # URL de descarga del archivo
        # Descargar el archivo
        file_response = requests.get(download_url)

        if file_response.status_code == 200:
            with open(file_name, "wb") as file:
                file.write(file_response.content)
            print(f"Archivo '{file_name}' descargado con éxito.")
        else:
            print("Error al descargar el archivo.")
    else:
        print(f"Error: {response.status_code} - {response.json().get('message', 'Error desconocido')}")

Archivo de traducción oficial

In [5]:
# Repository configuration
esp_file_path = "src/translations/hp-es-preprocessed.babelon.tsv"  
RESOURCE_FOLDER = "../../resources"

url = f"https://api.github.com/repos/obophenotype/human-phenotype-ontology/contents/{esp_file_path}"
esp_file_name = os.path.join(RESOURCE_FOLDER, esp_file_path.split("/")[-1])

# github_download_request(url, esp_file_name)

Archivo en inglés con descripciones

In [6]:
RESOURCE_FOLDER = "../../resources"
obo_file_path = "hp.obo"

url = f"http://purl.obolibrary.org/obo/{obo_file_path}"
eng_file_name = os.path.join(RESOURCE_FOLDER, obo_file_path)

# download_request(url, eng_file_name)

Convertir archivo .obo a diccionario

In [75]:
def str_to_dict(line):
    new_dict = {}
    for element in line:
        if ": " in element:
            k,v = element.split(": ", 1)
            if k in new_dict:
                if isinstance(new_dict[k], str):
                    new_dict[k] = [new_dict[k]]
                new_dict[k].append(v)
            else:
                new_dict[k] = v
    return new_dict

def find_translation(id, translation_df, col="translation_value"):
    if id in translation_df.index:
        return {"esp_name": translation_df.loc[id, col]}
    return []

Cargar el archivo de traducción

In [8]:
esp_translation = pd.read_csv(esp_file_name, sep="\t")
esp_translation.set_index("subject_id", inplace=True)
esp_translation.translation_status.value_counts()

translation_status
OFFICIAL     17533
CANDIDATE     1428
Name: count, dtype: int64

Cargar el archivo de la ontología

In [76]:
with open(eng_file_name, "r") as fp:
    lines = fp.read()
lines = re.split(r"(\[Term\]|\[Typedef\])", lines)
lines = [lines[i+1] for i in range(len(lines)) if lines[i] == '[Term]']
lines = [line.split("\n") for line in lines] #separar cada propiedad
lines = [str_to_dict(line) for line in lines] #convertir cada elemento en un diccionario
print(f"Total HPO terms: {len(lines)}")

Total HPO terms: 19533


Añadir la traducción

In [10]:
_ = [line.update(find_translation(line["id"], esp_translation)) for line in lines]
lines[0]

{'id': 'HP:0000001',
 'name': 'All',
 'comment': 'Root of all terms in the Human Phenotype Ontology.',
 'xref': 'UMLS:C0444868',
 'esp_name': 'Todos'}

In [11]:
not_mapped_names = [s for s in lines if "esp_name" not in s]
sum([len(name) for name in not_mapped_names])

3189

In [12]:
def clean_synonym(s):
        match = re.search(r'"(.*)"(.*)', s)
        if match:
            s= match.group(1)
        return s.strip(' ')

In [13]:
def process_synonyms(synonyms):
    if isinstance(synonyms, list):
        for i, s in enumerate(synonyms):
            synonyms[i] = clean_synonym(s)
    else:
        synonyms = clean_synonym(synonyms)
    return {"synonym": synonyms}

In [14]:
_ = [line.update(process_synonyms(line["synonym"])) for line in lines if "synonym" in line]
lines[4]

{'id': 'HP:0000006',
 'name': 'Autosomal dominant inheritance',
 'alt_id': ['HP:0001415',
  'HP:0001447',
  'HP:0001448',
  'HP:0001451',
  'HP:0001452',
  'HP:0001455',
  'HP:0001456',
  'HP:0001463'],
 'def': '"A mode of inheritance that is observed for traits related to a gene encoded on one of the autosomes (i.e., the human chromosomes 1-22) in which a trait manifests in heterozygotes. In the context of medical genetics, an autosomal dominant disorder is caused when a single copy of the mutant allele is present. Males and females are affected equally, and can both transmit the disorder with a risk of 50% for each child of inheriting the mutant allele." [https://orcid.org/0000-0002-0736-9199]',
 'synonym': ['Autosomal dominant',
  'Autosomal dominant form',
  'Autosomal dominant type',
  'monoallelic_autosomal'],
 'xref': ['SNOMEDCT_US:263681008', 'UMLS:C0443147'],
 'is_a': 'HP:0034345 ! Mendelian inheritance',
 'esp_name': 'Herencia autosómica dominante'}

In [15]:
def process_synonyms(synonyms):
    if isinstance(synonyms, list):
        total_length = 0
        for i, s in enumerate(synonyms):
            total_length += len(s)
    else:
        total_length = len(synonyms)
    return total_length

In [16]:
sum([process_synonyms(line["synonym"]) for line in lines if "synonym" in line])

670583

In [17]:
def process_definitions(defi):
    defi = re.sub(r'\[http.*?\]'  , '', defi)
    defi = re.search(r'"(.*)"', defi).group(1)
    defi = defi.strip(' ')
    return defi

In [18]:
definitions = {line["id"]:process_definitions(line["def"]) for line in lines if "def" in line}

In [19]:
len(definitions.keys())

16509

In [20]:
# Dividir el número de ids en chunks de 50
def chunk_dict_keys(d, chunk_size=50):
    values = list(d.keys())  
    return [values[i:i + chunk_size] for i in range(0, len(values), chunk_size)]

chunks = chunk_dict_keys(definitions)

In [21]:
#guardar el resultado en un archivo para asegurar que el split siempre sea igual
import json

def save_chunks_to_json(chunks, filename="chunks.json"):
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(chunks, file, indent=4)

def load_chunks_from_json(filename="chunks.json"):
    with open(filename, "r", encoding="utf-8") as file:
        chunks = json.load(file)  # Load JSON into a Python list
    return chunks

# save_chunks_to_json(chunks, "../../resources/def_chunks.json")
chunks = load_chunks_from_json("../../resources/def_chunks.json")

In [22]:
sum([len(s) for s in chunks])

16509

Traducción con DeepL

In [23]:
def save_to_txt(lista1, lista2, filename="output.txt"):
    with open(filename, "a", encoding="utf-8") as file:  # "a" = append mode
        for val1, val2 in zip(lista1, lista2):
            file.write(f"{val1}\t{val2}\n")  # Tab-separated

In [None]:
for _ in tqdm(range(50), desc="Traduciendo"):
    df = pd.read_csv("../../resources/traduccion_definiciones.txt", delimiter="\t", header=None)  
    processed_codes = df.iloc[:, 0].tolist()
    for i in range(len(chunks)):
        if all([s in processed_codes for s in chunks[i]]):
            continue
        break
    if i == len(chunks):
        break
    texto_original = [definitions[j] for j in chunks[i]]
    idioma_destino = "ES"  # Código de idioma (ES = español, EN = inglés, etc.)

    traduccion = translator.translate_text(texto_original, target_lang=idioma_destino)
    texto_traduccion = [t.text for t in traduccion]

    save_to_txt(chunks[i], texto_traduccion, "../../resources/traduccion_definiciones.txt")  # Guarda en "output.txt"

Asociar las definiciones en español

In [71]:
esp_definitions = pd.read_csv("../../resources/traduccion_definiciones.txt", delimiter="\t", header=None)  
esp_definitions.columns=["code", "definition"]
esp_definitions.set_index("code", inplace=True)

_ = [hpo.update({"esp_def":esp_definitions.loc[hpo["id"], "definition"]}) for hpo in lines if hpo["id"] in esp_definitions.index]

Traducción de nombres

In [47]:
names = {i["id"]:i["name"] for i in lines if "esp_name" not in i}
sum([len(s) for s in names.values()])

19065

In [48]:
# chunks = chunk_dict_keys(names)
# save_chunks_to_json(chunks, "../../resources/name_chunks.json")
chunks = load_chunks_from_json("../../resources/name_chunks.json")

In [49]:
ARCHIVO_TRADUCCION_NOMBRES = "../../resources/traduccion_nombres.txt"

In [50]:
for _ in tqdm(range(1), desc="Traduciendo"):
    if os.path.exists(ARCHIVO_TRADUCCION_NOMBRES):
        df = pd.read_csv(ARCHIVO_TRADUCCION_NOMBRES, delimiter="\t", header=None)
        processed_codes = df.iloc[:, 0].tolist()
        for i in range(len(chunks)):
            if all([s in processed_codes for s in chunks[i]]):
                continue
            break
    else:
        i=0

    if i == len(chunks):
        break
    texto_original = [names[j] for j in chunks[i]]
    idioma_destino = "ES"  # Código de idioma (ES = español, EN = inglés, etc.)

    traduccion = translator.translate_text(texto_original, target_lang=idioma_destino)
    texto_traduccion = [t.text for t in traduccion]

    save_to_txt(chunks[i], texto_traduccion, ARCHIVO_TRADUCCION_NOMBRES)  # Guarda en "output.txt"

Traduciendo: 100%|██████████| 1/1 [00:00<00:00, 243.53it/s]

heyy





Asociar con los nombres

In [51]:
esp_names = pd.read_csv(ARCHIVO_TRADUCCION_NOMBRES, delimiter="\t", header=None)  
esp_names.columns=["code", "name"]
esp_names.set_index("code", inplace=True)

_ = [hpo.update({"esp_name":esp_names.loc[hpo["id"], "name"]}) for hpo in lines if hpo["id"] in esp_names.index]

Guardar el diccionario creado

In [72]:
with open("../../resources/hpo_es.json", "w", encoding="utf-8") as fp:
    json.dump(lines, fp, ensure_ascii=False)