In [1]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.18.0.tar.gz (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.0/198.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: neo4j
  Building wheel for neo4j (pyproject.toml) ... [?25ldone
[?25h  Created wheel for neo4j: filename=neo4j-5.18.0-py3-none-any.whl size=273862 sha256=6155d453b77625e75c03b87225260d3388a1f8bbf2fcbff49f826c5bd81e85c4
  Stored in directory: /Users/matentzn/Library/Caches/pip/wheels/54/5c/8d/646324601710739f27a4b46c615dc12a6149f193d895997857
Successfully built neo4j
Installing collected packages: neo4j
Successfully installed neo4j-5.18.0


In [7]:
from neo4j import GraphDatabase
import pandas as pd

# Cypher query: for every UPHENO grouping class, find the species-specific
# phenotypes that are subclasses of it, the genes annotated with those
# phenotypes, and the human (NCBITaxon:9606) orthologues of those genes.
# The CASE/REDUCE expression joins the gene's `in_taxon` list into a single
# "|"-separated string (empty string when the list is missing or empty).
query = """
MATCH
(upheno:`biolink:PhenotypicFeature` WHERE upheno.id STARTS WITH "UPHENO:")<-[:`biolink:subclass_of`]-(phenotype:`biolink:PhenotypicFeature`)<-[gena:`biolink:has_phenotype`]-(gene:`biolink:Gene`)-[:`biolink:orthologous_to`]-(human_gene:`biolink:Gene` WHERE "NCBITaxon:9606" IN human_gene.in_taxon)
RETURN 
    upheno.id, 
    phenotype.id, 
    gene.id, 
    gena.negated,
    CASE WHEN gene.in_taxon IS NOT NULL AND size(gene.in_taxon) > 0 
         THEN REDUCE(s = "", x IN gene.in_taxon | s + x + CASE WHEN x <> gene.in_taxon[size(gene.in_taxon)-1] THEN "|" ELSE "" END) 
         ELSE "" END AS gene_in_taxon, 
    human_gene.id,
    gena.primary_knowledge_source,
    gena.publications
"""

# Run the query and collect every record.
# Both the driver and the session are used as context managers so that the
# underlying connections are closed even if the query raises; previously the
# driver was created at module level and never closed.
with GraphDatabase.driver("bolt://neo4j-bolt.monarchinitiative.org:7687") as driver:
    with driver.session() as session:
        # Materialise inside the session: the result stream is only readable
        # while the session is open.
        data = list(session.run(query))

# The column names are assigned positionally, so they must stay in the same
# order as the expressions in the RETURN clause above.
df = pd.DataFrame(data, columns=["upheno_grouping", "phenotype", "gene", "negated", "taxon", "human_orthologue", "source", "publications"])
df

Unnamed: 0,upheno_grouping,phenotype,gene,negated,taxon,human_orthologue,source,publications
0,UPHENO:0000508,ZP:0000606,ZFIN:ZDB-GENE-040426-1675,,NCBITaxon:7955,HGNC:9721,infores:zfin,[ZFIN:ZDB-PUB-170311-8]
1,UPHENO:0000508,ZP:0000606,ZFIN:ZDB-GENE-040426-1675,,NCBITaxon:7955,HGNC:30262,infores:zfin,[ZFIN:ZDB-PUB-170311-8]
2,UPHENO:0000508,WBPhenotype:0000848,WB:WBGene00044068,,NCBITaxon:6239,HGNC:12927,infores:wormbase,[PMID:16803962]
3,UPHENO:0000508,WBPhenotype:0000848,WB:WBGene00009178,,NCBITaxon:6239,HGNC:15664,infores:wormbase,[PMID:22073243]
4,UPHENO:0000508,WBPhenotype:0000848,WB:WBGene00009178,,NCBITaxon:6239,HGNC:15663,infores:wormbase,[PMID:22073243]
...,...,...,...,...,...,...,...,...
195803,UPHENO:0088438,MP:0003849,MGI:94865,,NCBITaxon:10090,HGNC:2690,infores:mgi,[PMID:24316079]
195804,UPHENO:0088439,MP:0001197,MGI:2141879,,NCBITaxon:10090,HGNC:18324,infores:mgi,[PMID:17601774]
195805,UPHENO:0088439,MP:0001197,MGI:106091,,NCBITaxon:10090,HGNC:2651,infores:mgi,[PMID:8663429]
195806,UPHENO:0088440,MP:0003902,MGI:1915751,,NCBITaxon:10090,HGNC:26513,infores:mgi,[PMID:26443207]


In [8]:
# Persist the query result as a tab-separated file, without the row index.
df.to_csv(
    "upheno_gene_human_orthologues.tsv",
    sep="\t",
    index=False,
)

## Concentration phenotypes

In [13]:
# Published Google Sheet (TSV export) listing the concentration phenotypes
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT597OxlO_uml2xJY6ztzBEOCf1CR6sdZSn9tmyulfHMLHIh7j8HHmfQ0f4aZnoY5bKtMUX3E5JeKOO/pub?gid=0&single=true&output=tsv'

# Download the sheet and parse it as tab-separated values
df_concentration = pd.read_csv(
    url,
    sep='\t',
)
df_concentration

Unnamed: 0,hpo_id,hpo_label,definition,parents
0,HP:0040147,L-2-hydroxyglutaric acidemia,,Dicarboxylic acidemia
1,HP:0410179,Decreased glucose-6-phosphate dehydrogenase le...,A decrease in the level of glucose-6-phosphate...,Abnormal glucose-6-phosphate dehydrogenase lev...
2,HP:6000292,Elevated tissue sphingomyelin concentration,Concentration of sphoingomyelin in tissues abo...,Abnormal muscle tissue metabolite concentration
3,HP:0410247,Increased anti-animal dander IgE antibody level,Increased level of IgE antibody against animal...,Increased anti-animal protein IgE antibody level
4,HP:0033502,Abnormal esterified to free carnitine ratio,Any deviation from the normal ratio of acylcar...,Abnormal circulating acylcarnitine concentration
...,...,...,...,...
1744,HP:0500239,Increased CSF albumin concentration@en,,Abnormal CSF albumin concentration
1745,HP:0008288,Nonketotic hyperglycinemia,,Hyperglycinemia
1746,HP:0410318,Decreased urinary 3-methylhistidine,Decreased concentration of 3-methylhistidine i...,Abnormal urinary 3-methylhistidine level
1747,HP:0032473,Decreased urine urobilinogen,An abnormally reduced concentration of urobili...,Abnormal urine urobilinogen level


In [33]:
import pandas as pd
import os

def merge_tsv_files(directory, df_concentration):
    """Left-join every ``*.tsv`` file in ``directory`` onto ``df_concentration``.

    A ``defined_class`` key column is derived from ``hpo_id`` and each TSV is
    joined on it. Before joining, the ``http://purl.obolibrary.org/obo/HP_``
    prefix in the TSV's ``defined_class`` values is rewritten to ``HP:`` and a
    ``pattern`` column holding the TSV's file name is added.

    For columns present on both sides, the TSV value takes precedence wherever
    it is non-null. Files are processed in sorted name order, so when several
    files match the same row the alphabetically last file wins.

    Args:
        directory: Path of the directory to scan (non-recursive) for TSV files.
        df_concentration: DataFrame with an ``hpo_id`` column. It is not
            modified; the merge is performed on a copy.

    Returns:
        A new DataFrame containing the original columns, ``defined_class``,
        and every column contributed by the TSV files. Because this is a
        left join, a TSV with several rows for the same ``defined_class``
        yields several output rows for that term.

    Raises:
        KeyError: If ``df_concentration`` has no ``hpo_id`` column.
    """
    if 'hpo_id' not in df_concentration.columns:
        raise KeyError("'hpo_id' column not found in df_concentration")

    # Copy first so the caller's DataFrame does not silently gain a column.
    merged = df_concentration.copy()
    merged['defined_class'] = merged['hpo_id']

    # os.listdir returns entries in arbitrary order; sort so that the
    # "later file overrides earlier file" precedence is reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.tsv'):
            continue

        df_tsv = pd.read_csv(os.path.join(directory, filename), sep='\t')

        # Skip files that cannot be joined. This check has to come before any
        # access to df_tsv['defined_class'], otherwise it can never trigger.
        if 'defined_class' not in df_tsv.columns:
            continue

        df_tsv['pattern'] = filename

        # Literal (non-regex) replacement of the IRI prefix with the CURIE prefix.
        df_tsv['defined_class'] = df_tsv['defined_class'].str.replace(
            "http://purl.obolibrary.org/obo/HP_", "HP:", regex=False
        )

        # Columns that exist on both sides (other than the join key) come out
        # of the merge twice: `col` from the left, `col_tsv` from the right.
        common_columns = sorted(
            set(merged.columns).intersection(df_tsv.columns) - {'defined_class'}
        )
        merged = pd.merge(merged, df_tsv, on='defined_class', how='left', suffixes=('', '_tsv'))

        # Prefer the TSV value where present, otherwise keep the existing one.
        for col in common_columns:
            merged[col] = merged[col + '_tsv'].combine_first(merged[col])

        # Drop the temporary *_tsv columns.
        merged = merged.drop(columns=[col + '_tsv' for col in common_columns])

    return merged

# Join the pattern-match TSVs from ../patterns/data/matches/ onto the
# concentration phenotypes, then discard columns that contain no values at all.
df_concentration_with_patterns = (
    merge_tsv_files('../patterns/data/matches/', df_concentration)
    .dropna(axis=1, how='all')
)
df_concentration_with_patterns

Unnamed: 0,hpo_id,hpo_label,definition,parents,defined_class,defined_class_label,anatomical_entity,anatomical_entity_label,pattern,location,location_label,biological_process,biological_process_label,chemical_entity,chemical_entity_label,role,role_label
0,HP:0040147,L-2-hydroxyglutaric acidemia,,Dicarboxylic acidemia,HP:0040147,,,,,,,,,,,,
1,HP:0410179,Decreased glucose-6-phosphate dehydrogenase le...,A decrease in the level of glucose-6-phosphate...,Abnormal glucose-6-phosphate dehydrogenase lev...,HP:0410179,Decreased glucose-6-phosphate dehydrogenase le...,,,abnormallyDecreasedLevelOfChemicalEntityInLoca...,http://purl.obolibrary.org/obo/UBERON_0000178,blood,,,http://purl.obolibrary.org/obo/PR_000007749,glucose-6-phosphate 1-dehydrogenase,,
2,HP:6000292,Elevated tissue sphingomyelin concentration,Concentration of sphoingomyelin in tissues abo...,Abnormal muscle tissue metabolite concentration,HP:6000292,,,,,,,,,,,,
3,HP:0410247,Increased anti-animal dander IgE antibody level,Increased level of IgE antibody against animal...,Increased anti-animal protein IgE antibody level,HP:0410247,,,,,,,,,,,,
4,HP:0033502,Abnormal esterified to free carnitine ratio,Any deviation from the normal ratio of acylcar...,Abnormal circulating acylcarnitine concentration,HP:0033502,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1744,HP:0500239,Increased CSF albumin concentration@en,,Abnormal CSF albumin concentration,HP:0500239,Increased CSF albumin concentration,,,abnormallyIncreasedLevelOfChemicalEntityInLoca...,http://purl.obolibrary.org/obo/UBERON_0001359,cerebrospinal fluid,,,http://purl.obolibrary.org/obo/PR_000003918,albumin,,
1745,HP:0008288,Nonketotic hyperglycinemia,,Hyperglycinemia,HP:0008288,,,,,,,,,,,,
1746,HP:0410318,Decreased urinary 3-methylhistidine,Decreased concentration of 3-methylhistidine i...,Abnormal urinary 3-methylhistidine level,HP:0410318,Decreased urinary 3-methylhistidine,,,abnormallyDecreasedLevelOfChemicalEntityInLoca...,http://purl.obolibrary.org/obo/UBERON_0001088,urine,,,http://purl.obolibrary.org/obo/CHEBI_70959,3-methylhistidine,,
1747,HP:0032473,Decreased urine urobilinogen,An abnormally reduced concentration of urobili...,Abnormal urine urobilinogen level,HP:0032473,,,,,,,,,,,,


In [34]:
# Write the merged table as a tab-separated file, without the row index.
df_concentration_with_patterns.to_csv(
    "concentration_with_patterns.tsv",
    sep="\t",
    index=False,
)

# Translate layperson synonyms

In [7]:
import pandas as pd
# Published Google Sheet (TSV export) holding the layperson synonyms
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSjsojX2-iZ25YPyXoI5_QycBilfkg5A-BBD30gQwmJAcoK_uWOGHROa-KjQAd_6LKoUhZsPnh0Kiy4/pub?gid=0&single=true&output=tsv'

# Download the sheet and parse it as tab-separated values
df_layperson = pd.read_csv(
    url,
    sep='\t',
)

In [10]:
import deepl

import os

# SECURITY: never commit the DeepL key to the notebook. It is read from the
# DEEPL_AUTH_KEY environment variable instead. The key that was previously
# hard-coded here has been exposed and should be revoked in the DeepL account.
api_key = os.environ.get("DEEPL_AUTH_KEY")
if not api_key:
    raise RuntimeError(
        "DEEPL_AUTH_KEY is not set; export your DeepL API key before running this cell."
    )
translator = deepl.Translator(api_key)

def translate_text(text_to_translate, language_code):
    """
    Translate a piece of text with the module-level DeepL ``translator``.

    ``language_code`` is upper-cased before being passed as the target
    language. Returns the translated text, or an empty string when the
    translator gives back a falsy result.
    """
    translation = translator.translate_text(
        text_to_translate,
        target_lang=language_code.upper(),
    )
    return translation.text if translation else ""

# Translate layperson synonyms to German.
# Each synonym is embedded in an English carrier sentence before translation
# (presumably to give the translator context - confirm); the stored value is
# the whole translated sentence, not just the translated synonym.
def _translate_in_patient_sentence(synonym):
    return translate_text(f"The patient reports she is suffering from {synonym}", 'de')

df_layperson['layperson_synonym_de'] = df_layperson['layperson'].apply(_translate_in_patient_sentence)
df_layperson

Unnamed: 0,term,term_label,layperson,layperson_synonym_de
0,http://purl.obolibrary.org/obo/HP_0004396,Poor appetite,Poor appetite,"Die Patientin berichtet, dass sie unter Appeti..."
1,http://purl.obolibrary.org/obo/HP_0008402,Ridged fingernail,Ridged fingernail,"Die Patientin berichtet, dass sie unter einem ..."
2,http://purl.obolibrary.org/obo/HP_0000464,Abnormality of the neck,Abnormality of the neck,"Die Patientin berichtet, sie leide an einer Ab..."
3,http://purl.obolibrary.org/obo/HP_0012088,Abnormal urinary odor,Abnormal urine smell,"Die Patientin berichtet, sie leide unter abnor..."
4,http://purl.obolibrary.org/obo/HP_0000419,Abnormal nasal septum morphology,Abnormality of the nasal septum,"Die Patientin berichtet, sie leide an einer An..."
...,...,...,...,...
7179,http://purl.obolibrary.org/obo/HP_0002757,Recurrent fractures,Increased fracture rate,"Die Patientin berichtet, sie leide unter einer..."
7180,http://purl.obolibrary.org/obo/HP_0005104,Hypoplastic nasal septum,Decreased size of septum of nose,"Die Patientin berichtet, sie leide unter einer..."
7181,http://purl.obolibrary.org/obo/HP_0100755,Abnormality of salivation,Abnormality of salivation,"Die Patientin berichtet, sie leide unter abnor..."
7182,http://purl.obolibrary.org/obo/HP_0012726,Episodic hypokalemia,Recurrent low potassium,"Die Patientin berichtet, sie leide unter rezid..."


In [11]:
# Write the translated layperson table as a tab-separated file, without the row index.
df_layperson.to_csv(
    "layperson_translated.tsv",
    sep="\t",
    index=False,
)

In [4]:
!pip install deepl

Collecting deepl
  Downloading deepl-1.22.0-py3-none-any.whl.metadata (35 kB)
Downloading deepl-1.22.0-py3-none-any.whl (43 kB)
Installing collected packages: deepl
Successfully installed deepl-1.22.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
