In [1]:
import pandas as pd
from rdflib import Graph, RDF, Namespace

In [2]:
# Load the tab-separated defect mapping table; the first column becomes the row index.
defects_map = pd.read_csv(
    'defects_hpo_mappings.tsv',
    sep='\t',
    index_col=0,
)

In [7]:
# URI prefixes used to mint the terms stored in the graph.
gene_symbol = Namespace("https://identifiers.org/hgnc.symbol/")  # HGNC Gene Symbol namespace
disease = Namespace("https://www.orpha.net/ORDO/")  # Orphanet Rare Disease Ontology namespace
phenotype = Namespace("https://hpo.jax.org/app/browse/term/")  # Human Phenotype Ontology namespace
relationship = Namespace("https://semanticscience.org/resource/")  # Semanticscience Integrated Ontology namespace

# Fresh, empty RDF graph.
g = Graph()

In [8]:
# Bind each prefix string to its namespace on the graph (same order as before).
for prefix, namespace in (
    ('gene_symbol', gene_symbol),
    ('disease', disease),
    ('relationship', relationship),
    ('phenotype', phenotype),
):
    g.bind(prefix, namespace)

In [9]:
# Populate the graph from the Geneshot GMT file. Each line is tab-separated:
# field 0 is the birth-defect term (looked up in defects_map), field 1 is
# skipped (presumably the GMT description column), and every remaining field
# is a gene symbol.
with open('CDC-birth-defect-genes-Geneshot.gmt', 'r') as f:
    for line in f:  # stream the file instead of materialising every line first
        fields = line.split('\t')
        term = fields[0].strip()
        if not term:
            # Blank line: there is no term to look up, and .loc would raise KeyError.
            continue

        # Drop empty fields (e.g. produced by a trailing tab) so that no triple
        # ends up pointing at the bare gene-symbol namespace URI.
        genes = [field.strip() for field in fields[2:] if field.strip()]

        hp_id = defects_map.loc[term, 'hp']
        if hp_id.startswith('ORPHA'):
            # ORPHA diseases technically belong to a separate namespace than HPO
            # phenotypes; rewrite e.g. 'ORPHA:123' to 'Orphanet_123' for the URI.
            subject = disease[hp_id.replace(':', '_').replace('ORPHA', 'Orphanet')]
            subject_type = relationship['SIO_010299']  # disease type
        else:
            subject = phenotype[hp_id]
            subject_type = relationship['SIO_010056']  # phenotype type

        g.add((subject, RDF.type, subject_type))
        for gene in genes:
            g.add((subject, relationship['SIO_000983'], gene_symbol[gene]))  # gene-disease association

In [11]:
# Write the graph to disk in Turtle format.
g.serialize(
    destination="Geneshot_BirthDefects_Gene_Associations.ttl",
    format="turtle",
)

<Graph identifier=Ne12a419597f044bf8ff0120972d36d73 (<class 'rdflib.graph.Graph'>)>