# HPO ontology translation

In [90]:
import requests
import os
from dotenv import load_dotenv
import deepl
import re
import pandas as pd
from tqdm import tqdm

In [91]:
load_dotenv(override=True)
DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")
translator = deepl.Translator(DEEPL_API_KEY)

## Retrieving ontology by API

In [92]:
def download_request(url, file_name):
    response = requests.get(url)

    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
        print(f"Archivo '{file_name}' descargado con éxito.")
    else:
        print(f"Error: {response.status_code} - {response.json().get('message', 'Error desconocido')}")

In [93]:
def github_download_request(url, file_name):
    # Hacer la solicitud GET
    response = requests.get(url)

    if response.status_code == 200:
        file_data = response.json()
        download_url = file_data["download_url"]  # URL de descarga del archivo
        # Descargar el archivo
        file_response = requests.get(download_url)

        if file_response.status_code == 200:
            with open(file_name, "wb") as file:
                file.write(file_response.content)
            print(f"Archivo '{file_name}' descargado con éxito.")
        else:
            print("Error al descargar el archivo.")
    else:
        print(f"Error: {response.status_code} - {response.json().get('message', 'Error desconocido')}")

Archivo de traducción oficial

In [94]:
# Repository configuration
esp_file_path = "src/translations/hp-es-preprocessed.babelon.tsv"  
RESOURCE_FOLDER = "../../resources"

url = f"https://api.github.com/repos/obophenotype/human-phenotype-ontology/contents/{esp_file_path}"
esp_file_name = os.path.join(RESOURCE_FOLDER, esp_file_path.split("/")[-1])

# github_download_request(url, esp_file_name)

Archivo en inglés con descripciones

In [95]:
RESOURCE_FOLDER = "../../resources"

oboa_file_path = "hp/phenotype.hpoa"
url = f"http://purl.obolibrary.org/obo/{oboa_file_path}"
eng_file_name = os.path.join(RESOURCE_FOLDER, "phenotype.hpoa")

# download_request(url, eng_file_name)

obo_file_path = "hp.obo"

url = f"http://purl.obolibrary.org/obo/{obo_file_path}"
eng_file_name = os.path.join(RESOURCE_FOLDER, "hp.obo")

Convertir archivo .obo a diccionario

In [96]:
def str_to_dict(line):
    new_dict = {}
    for element in line:
        if ": " in element:
            k,v = element.split(": ", 1)
            if k in new_dict:
                if isinstance(new_dict[k], str):
                    new_dict[k] = [new_dict[k]]
                new_dict[k].append(v)
            else:
                new_dict[k] = v
    return new_dict

def find_translation(id, translation_df, col="translation_value"):
    if id in translation_df.index:
        return {"esp_name": translation_df.loc[id, col]}
    return []

Cargar el archivo de traducción

In [97]:
esp_translation = pd.read_csv(esp_file_name, sep="\t")
esp_translation.set_index("subject_id", inplace=True)
esp_translation.translation_status.value_counts()

OFFICIAL     17533
CANDIDATE     1428
Name: translation_status, dtype: int64

Cargar el archivo de la ontología

In [98]:
with open(eng_file_name, "r") as fp:
    lines = fp.read()
lines = re.split(r"(\[Term\]|\[Typedef\])", lines)
lines = [lines[i+1] for i in range(len(lines)) if lines[i] == '[Term]']
lines = [line.split("\n") for line in lines] #separar cada propiedad
lines = [str_to_dict(line) for line in lines] #convertir cada elemento en un diccionario
print(f"Total HPO terms: {len(lines)}")

Total HPO terms: 19533


In [99]:
lines = [line for line in lines if not "is_obsolete" in line]
print(f"Total non-obsolete HPO terms: {len(lines)}")

Total non-obsolete HPO terms: 19077


Añadir la traducción

In [143]:
_ = [line.update(find_translation(line["id"], esp_translation)) for line in lines]
lines[0]

{'id': 'HP:0000001',
 'name': 'All',
 'comment': 'Root of all terms in the Human Phenotype Ontology.',
 'xref': 'UMLS:C0444868',
 'esp_name': 'Todos'}

In [101]:
print(f"Total HPO terms with spanish translation: {len([line for line in lines if 'esp_name' in line])}")

Total HPO terms with spanish translation: 18506


In [102]:
not_mapped_names = [s for s in lines if "esp_name" not in s]
sum([len(name) for name in not_mapped_names])

3184

In [130]:
def clean_synonym(s):
        match = re.search(r'"(.*)"(.*)', s)
        if match:
            s= match.group(1)
        return s.strip(' ')

In [131]:
def process_synonyms(synonyms):
    if isinstance(synonyms, list):
        for i, s in enumerate(synonyms):
            synonyms[i] = clean_synonym(s)
    else:
        synonyms = clean_synonym(synonyms)
    return {"synonym": synonyms}

In [132]:
_ = [line.update(process_synonyms(line["synonym"])) for line in lines if "synonym" in line]
lines[4]

{'id': 'HP:0000006',
 'name': 'Autosomal dominant inheritance',
 'alt_id': ['HP:0001415',
  'HP:0001447',
  'HP:0001448',
  'HP:0001451',
  'HP:0001452',
  'HP:0001455',
  'HP:0001456',
  'HP:0001463'],
 'def': '"A mode of inheritance that is observed for traits related to a gene encoded on one of the autosomes (i.e., the human chromosomes 1-22) in which a trait manifests in heterozygotes. In the context of medical genetics, an autosomal dominant disorder is caused when a single copy of the mutant allele is present. Males and females are affected equally, and can both transmit the disorder with a risk of 50% for each child of inheriting the mutant allele." [https://orcid.org/0000-0002-0736-9199]',
 'synonym': ['Autosomal dominant',
  'Autosomal dominant form',
  'Autosomal dominant type',
  'monoallelic_autosomal'],
 'xref': ['SNOMEDCT_US:263681008', 'UMLS:C0443147'],
 'is_a': 'HP:0034345 ! Mendelian inheritance',
 'esp_name': 'Herencia autosómica dominante'}

In [106]:
def process_synonyms(synonyms):
    if isinstance(synonyms, list):
        total_length = 0
        for i, s in enumerate(synonyms):
            total_length += len(s)
    else:
        total_length = len(synonyms)
    return total_length

In [107]:
sum([process_synonyms(line["synonym"]) for line in lines if "synonym" in line])

670352

In [108]:
def process_definitions(defi):
    defi = re.sub(r'\[http.*?\]'  , '', defi)
    defi = re.search(r'"(.*)"', defi).group(1)
    defi = defi.strip(' ')
    return defi

In [109]:
definitions = {line["id"]:process_definitions(line["def"]) for line in lines if "def" in line}

In [110]:
len(definitions.keys())

16504

In [111]:
# Dividir el número de ids en chunks de 50
def chunk_dict_keys(d, chunk_size=50):
    values = list(d.keys())  
    return [values[i:i + chunk_size] for i in range(0, len(values), chunk_size)]

# chunks = chunk_dict_keys(definitions)

In [112]:
#guardar el resultado en un archivo para asegurar que el split siempre sea igual
import json

def save_chunks_to_json(chunks, filename="chunks.json"):
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(chunks, file, indent=4)

def load_chunks_from_json(filename="chunks.json"):
    with open(filename, "r", encoding="utf-8") as file:
        chunks = json.load(file)  # Load JSON into a Python list
    return chunks

# save_chunks_to_json(chunks, "../../resources/def_chunks.json")
# chunks = load_chunks_from_json("../../resources/def_chunks.json")

In [113]:
sum([len(s) for s in chunks])

3079

Traducción con DeepL

In [114]:
def save_to_txt(lista1, lista2, filename="output.txt"):
    with open(filename, "a", encoding="utf-8") as file:  # "a" = append mode
        for val1, val2 in zip(lista1, lista2):
            file.write(f"{val1}\t{val2}\n")  # Tab-separated

In [115]:
for _ in tqdm(range(50), desc="Traduciendo"):
    df = pd.read_csv("../../resources/traduccion_definiciones.txt", delimiter="\t", header=None)  
    processed_codes = df.iloc[:, 0].tolist()
    for i in range(len(chunks)):
        if all([s in processed_codes for s in chunks[i]]):
            continue
        break
    if i == len(chunks):
        break
    texto_original = [definitions[j] for j in chunks[i]]
    idioma_destino = "ES"  # Código de idioma (ES = español, EN = inglés, etc.)

    traduccion = translator.translate_text(texto_original, target_lang=idioma_destino)
    texto_traduccion = [t.text for t in traduccion]

    save_to_txt(chunks[i], texto_traduccion, "../../resources/traduccion_definiciones.txt")  # Guarda en "output.txt"

Traduciendo:   0%|          | 0/50 [00:00<?, ?it/s]


KeyError: 0

Asociar las definiciones en español

In [133]:
esp_definitions = pd.read_csv("../../resources/traduccion_definiciones.txt", delimiter="\t", header=None)  
esp_definitions.columns=["code", "definition"]
esp_definitions.set_index("code", inplace=True)

_ = [hpo.update({"esp_def":esp_definitions.loc[hpo["id"], "definition"]}) for hpo in lines if hpo["id"] in esp_definitions.index]

Traducción de nombres

In [134]:
names = {i["id"]:i["name"] for i in lines if "esp_name" not in i}
sum([len(s) for s in names.values()])

19026

In [None]:
# chunks = chunk_dict_keys(names)
# save_chunks_to_json(chunks, "../../resources/name_chunks.json")
chunks = load_chunks_from_json("../../resources/name_chunks.json")

In [136]:
ARCHIVO_TRADUCCION_NOMBRES = "../../resources/traduccion_nombres.txt"

In [None]:
for _ in tqdm(range(1), desc="Traduciendo"):
    if os.path.exists(ARCHIVO_TRADUCCION_NOMBRES):
        df = pd.read_csv(ARCHIVO_TRADUCCION_NOMBRES, delimiter="\t", header=None)
        processed_codes = df.iloc[:, 0].tolist()
        for i in range(len(chunks)):
            if all([s in processed_codes for s in chunks[i]]):
                continue
            break
    else:
        i=0

    if i == len(chunks):
        break
    texto_original = [names[j] for j in chunks[i]]
    idioma_destino = "ES"  # Código de idioma (ES = español, EN = inglés, etc.)

    traduccion = translator.translate_text(texto_original, target_lang=idioma_destino)
    texto_traduccion = [t.text for t in traduccion]

    save_to_txt(chunks[i], texto_traduccion, ARCHIVO_TRADUCCION_NOMBRES)  # Guarda en "output.txt"

Traduciendo: 100%|██████████| 1/1 [00:00<00:00, 243.53it/s]

heyy





Asociar con los nombres

In [137]:
esp_names = pd.read_csv(ARCHIVO_TRADUCCION_NOMBRES, delimiter="\t", header=None)  
esp_names.columns=["code", "name"]
esp_names.set_index("code", inplace=True)

_ = [hpo.update({"esp_name":esp_names.loc[hpo["id"], "name"]}) for hpo in lines if hpo["id"] in esp_names.index]

Traducción de sinónimos

Términos con un solo sinónimo

In [None]:
single_synonym = {line['id']: line["synonym"] for line in lines if "synonym" in line if isinstance(line["synonym"], str)}
chunks = chunk_dict_keys(single_synonym, chunk_size=100)
# save_chunks_to_json(chunks, "../../resources/synonym_chunks.json")
chunks = load_chunks_from_json("../../resources/synonym_chunks.json")

In [138]:
ARCHIVO_TRADUCCION_SINONIMOS = "../../resources/traduccion_sinonimos.txt"

In [None]:
for _ in tqdm(range(15), desc="Traduciendo"):
    if os.path.exists(ARCHIVO_TRADUCCION_SINONIMOS):
        df = pd.read_csv(ARCHIVO_TRADUCCION_SINONIMOS, delimiter="\t", header=None)
        processed_codes = df.iloc[:, 0].tolist()
        for i in range(len(chunks)):
            if all([s in processed_codes for s in chunks[i]]):
                continue
            break
    else:
        i=0

    if i == len(chunks):
        break
    texto_original = [single_synonym[j] for j in chunks[i]]
    idioma_destino = "ES"  # Código de idioma (ES = español, EN = inglés, etc.)

    traduccion = translator.translate_text(texto_original, target_lang=idioma_destino)
    texto_traduccion = [t.text for t in traduccion]

    save_to_txt(chunks[i], texto_traduccion, ARCHIVO_TRADUCCION_SINONIMOS)  # Guarda en "output.txt"

Traduciendo: 100%|██████████| 15/15 [00:06<00:00,  2.29it/s]


In [None]:
len([line for line in lines if "synonym" in line])

10859

Términos con más de un sinónimo

In [None]:
multiple_synonym = {line['id']: line["synonym"] for line in lines if "synonym" in line if isinstance(line["synonym"], list)}
len(multiple_synonym)

5272

In [None]:
chunks = chunk_dict_keys(multiple_synonym, chunk_size=100)
# save_chunks_to_json(chunks, "../../resources/mult_synonym_chunks.json")
chunks = load_chunks_from_json("../../resources/mult_synonym_chunks.json")

In [None]:
def save_to_txt2(lista1, lista2, filename="output.txt"):
    with open(filename, "a", encoding="utf-8") as file:  # "a" = append mode
        for hpo_code, positions in lista1.items():
            file.write(f"{hpo_code}\t{lista2[positions[0]: positions[1]]}\n")  # Tab-separated

In [None]:
for _ in tqdm(range(1), desc="Traduciendo"):
    if os.path.exists(ARCHIVO_TRADUCCION_SINONIMOS):
        df = pd.read_csv(ARCHIVO_TRADUCCION_SINONIMOS, delimiter="\t", header=None)
        processed_codes = df.iloc[:, 0].tolist()
        for i in range(len(chunks)):
            if all([s in processed_codes for s in chunks[i]]):
                continue
            break
    else:
        i=0

    if i == len(chunks):
        break
    list_pos = 0 
    texto_original = []
    posiciones_originales = {}
    for j in chunks[i]:
        for synonym in multiple_synonym[j]:
            texto_original.append(synonym)
        posiciones_originales[j] = [list_pos, list_pos + len(multiple_synonym[j])]
        list_pos += len(multiple_synonym[j])
    idioma_destino = "ES"  # Código de idioma (ES = español, EN = inglés, etc.)

    traduccion = translator.translate_text(texto_original, target_lang=idioma_destino)
    texto_traduccion = [t.text for t in traduccion]
    
    save_to_txt2(posiciones_originales, texto_traduccion, ARCHIVO_TRADUCCION_SINONIMOS)  # Guarda en "output.txt"

Traduciendo: 100%|██████████| 1/1 [00:00<00:00,  1.54it/s]


Asociar con sinónimos

In [139]:
df = pd.read_csv(ARCHIVO_TRADUCCION_SINONIMOS, delimiter="\t", header=None)  
df.drop_duplicates(0).shape

(10859, 2)

In [140]:
def convert_to_list(synonyms):
    try: 
        return eval(synonyms)
    except:
        return synonyms

In [141]:
esp_synonyms = pd.read_csv(ARCHIVO_TRADUCCION_SINONIMOS, delimiter="\t", header=None)  
esp_synonyms.columns=["code", "synonyms"]
esp_synonyms.set_index("code", inplace=True)

_ = [hpo.update({"esp_synonyms":convert_to_list(esp_synonyms.loc[hpo["id"], "synonyms"])})
                 for hpo in lines if hpo["id"] in esp_synonyms.index]

In [142]:
len(lines)

19077

Addons

In [None]:
hpo_addons = pd.read_csv(os.path.join("../../resources/", "HPO_addons.csv"))
hpo_addons = hpo_addons['info'].to_dict()

In [None]:
chunks = chunk_dict_keys(hpo_addons, chunk_size=400)
save_chunks_to_json(chunks, "../../resources/addons_chunks.json")
chunks = load_chunks_from_json("../../resources/addons_chunks.json")

In [None]:
ARCHIVO_TRADUCCION_ADDONS = "../../resources/traduccion_addons.txt"

In [None]:
for _ in tqdm(range(2), desc="Traduciendo"):
    if os.path.exists(ARCHIVO_TRADUCCION_ADDONS):
        df = pd.read_csv(ARCHIVO_TRADUCCION_ADDONS, delimiter="\t", header=None)
        processed_codes = df.iloc[:, 0].tolist()
        for i in range(len(chunks)):
            if all([s in processed_codes for s in chunks[i]]):
                continue
            break
    else:
        i=0

    if i == len(chunks):
        break
    texto_original = [hpo_addons[j] for j in chunks[i]]
    idioma_destino = "ES"  # Código de idioma (ES = español, EN = inglés, etc.)

    traduccion = translator.translate_text(texto_original, target_lang=idioma_destino)
    texto_traduccion = [t.text for t in traduccion]

    save_to_txt(chunks[i], texto_traduccion, ARCHIVO_TRADUCCION_ADDONS)  # Guarda en "output.txt"

Traduciendo: 100%|██████████| 2/2 [00:03<00:00,  1.60s/it]


In [121]:
addons = pd.read_csv(ARCHIVO_TRADUCCION_ADDONS, delimiter="\t", header=None)  
addons.columns = ["HPO_ID", "description"]
# df[["HPO_ID", 1]].to_csv("../../resources/traduccion_addons.txt", sep="\t", header=False, index=False)

In [122]:
addons = addons.groupby("HPO_ID").agg({'description':list})

In [128]:
_ = [hpo.update({"esp_addons":addons.loc[hpo["id"], "description"]})
                 for hpo in lines if hpo["id"] in addons.index]

Guardar el diccionario creado

In [144]:
with open("../../resources/hpo_es.json", "w", encoding="utf-8") as fp:
    json.dump(lines, fp, ensure_ascii=False)