In [1]:
import rdflib

# Load the OWL file
file_path = "./ORDO_en_4.5.owl"
g = rdflib.Graph()
g.parse(file_path, format="xml")

# Define the namespaces (prefixes) used in the OWL file
namespace = {
    "ORDO": "http://www.orpha.net/ORDO/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "owl": "http://www.w3.org/2002/07/owl#",
    "efo": "http://www.ebi.ac.uk/efo/",
    "oboInOwl": "http://www.geneontology.org/formats/oboInOwl#",
}

# Query to get all classes with their labels
label_query = """
    SELECT ?class ?label WHERE {
        ?class rdf:type owl:Class .
        ?class rdfs:label ?label .
    }
"""

# Execute the query to get top-level categories
label_results = g.query(label_query, initNs=namespace)
base_uris = {str(label): class_uri for class_uri, label in label_results if class_uri.startswith("http://www.orpha.net/ORDO/Orphanet_C")}

In [2]:
# Function to get efo:definition for a given entity
def get_definition(g, entity_uri):
    query = f"""
        SELECT ?definition WHERE {{
            <{entity_uri}> efo:definition ?definition .
        }}
    """
    results = g.query(query, initNs=namespace)
    for row in results:
        return str(row.definition)
    return None


base_uris_map = {
    str(class_uri): {"label": label, "definition": get_definition(g, class_uri)}
    for label, class_uri in base_uris.items()
}

In [3]:
base_uris_map

{'http://www.orpha.net/ORDO/Orphanet_C001': {'label': 'clinical entity',
  'definition': 'A generic term used to describe the clinical items included in the Orphanet nomenclature of rare diseases.'},
 'http://www.orpha.net/ORDO/Orphanet_C010': {'label': 'genetic material',
  'definition': 'DNA or RNA sequence (gene with protein product, non-coding RNA and disorder-associated locus).'},
 'http://www.orpha.net/ORDO/Orphanet_C041': {'label': 'inactive clinical entity',
  'definition': 'A clinical entity that has been excluded from the Orphanet nomenclature. This includes obsolete entities, deprecated entities, and entities that have been inactivated because they are not rare in Europe.'},
 'http://www.orpha.net/ORDO/Orphanet_C042': {'label': 'deprecated clinical entity',
  'definition': 'A clinical entity that was initially considered as an independent diagnosis, but is now considered as part of another diagnosis as a result of the evolution of knowledge, and is therefore removed from the

In [4]:
# SPARQL query to get all entities with their subclasses
query = """
    SELECT ?entity ?subclass WHERE {
        ?entity rdf:type owl:Class .
        OPTIONAL { ?subclass rdfs:subClassOf ?entity . }
    }
"""

# Execute the query
results = g.query(query, initNs=namespace)

# Dictionaries to hold entities and their subclasses
entities_with_subclasses = set()
entities = set()

for row in results:
    entity = row.entity
    subclass = row.subclass
    entities.add(entity)
    if subclass:
        entities_with_subclasses.add(subclass)

# Bottom entities are those that are not in entities_with_subclasses
bottom_entities = [
    entity
    for entity in entities - entities_with_subclasses
    if entity.startswith("http://www.orpha.net/ORDO/Orphanet_C")
]


# Function to recursively build the graph structure
def build_graph_structure(entity, graph_structure):
    # SPARQL query to get all entities with their subclasses
    query = """
        SELECT ?entity ?parent WHERE {
            ?entity rdf:type owl:Class .
            ?entity rdfs:subClassOf ?parent .
        }
    """

    # Execute the query
    results = g.query(query, initNs=namespace)
    for row in results:
        entity = str(row.entity)
        parent = str(row.parent)
        if entity not in graph_structure:
            graph_structure[entity] = []
        if parent.startswith("http://www.orpha.net/ORDO/Orphanet_"):
            graph_structure[entity].append(parent)
            graph_structure[entity] = list(set(graph_structure[entity]))


# Dictionary to store the hierarchical structure
graph_structure = {}

# Build the graph structure starting from each bottom entity
for entity in bottom_entities:
    build_graph_structure(entity, graph_structure)


# Function to find all ancestors of a given node
def find_all_ancestors(node, graph_structure):
    ancestors = list()
    if node in graph_structure:
        parents = graph_structure[node]
        for parent in parents:
            ancestors.append(parent)
            ancestors.extend(find_all_ancestors(parent, graph_structure))
    return ancestors


# Dictionary to store each node and its ancestors
node_ancestors = {
    node: list(find_all_ancestors(node, graph_structure)) for node in graph_structure
}

In [5]:
node_ancestors

{'http://www.orpha.net/ORDO/Orphanet_10': ['http://www.orpha.net/ORDO/Orphanet_377789',
  'http://www.orpha.net/ORDO/Orphanet_557493',
  'http://www.orpha.net/ORDO/Orphanet_C001',
  'http://www.orpha.net/ORDO/Orphanet_557493',
  'http://www.orpha.net/ORDO/Orphanet_C001'],
 'http://www.orpha.net/ORDO/Orphanet_100': ['http://www.orpha.net/ORDO/Orphanet_557493',
  'http://www.orpha.net/ORDO/Orphanet_C001',
  'http://www.orpha.net/ORDO/Orphanet_377788',
  'http://www.orpha.net/ORDO/Orphanet_557493',
  'http://www.orpha.net/ORDO/Orphanet_C001'],
 'http://www.orpha.net/ORDO/Orphanet_1000': ['http://www.orpha.net/ORDO/Orphanet_557493',
  'http://www.orpha.net/ORDO/Orphanet_C001',
  'http://www.orpha.net/ORDO/Orphanet_377788',
  'http://www.orpha.net/ORDO/Orphanet_557493',
  'http://www.orpha.net/ORDO/Orphanet_C001'],
 'http://www.orpha.net/ORDO/Orphanet_100000': ['http://www.orpha.net/ORDO/Orphanet_557494',
  'http://www.orpha.net/ORDO/Orphanet_C001'],
 'http://www.orpha.net/ORDO/Orphanet_100

In [6]:
# Function to get efo:definition for a given entity
def get_name(g, entity_uri):
    query = f"""
        SELECT ?label WHERE {{
            <{entity_uri}> rdfs:label ?label .
        }}
    """
    results = g.query(query, initNs=namespace)
    for row in results:
        return str(row.label)
    return None

def get_metadata(g, entity_uri, id_suffix):
    query = f"""
        SELECT ?Orphanet_{id_suffix} WHERE {{
            <{entity_uri}> ORDO:Orphanet_{id_suffix} ?Orphanet_{id_suffix} .
        }}
    """
    results = g.query(query, initNs=namespace)
    m = [str(row[f"Orphanet_{id_suffix}"]) for row in results]
    return m

def get_db_xrefs(g, entity_uri):
    mapping = {
        "MeSH": "MESH",
        "MedDRA": "MEDDRA",
        "ICD-10": "ICD-10",
        "ICD-11": "ICD-11",
        "UMLS": "UMLS",
    }
    query = f"""
        SELECT ?db_xrefs WHERE {{
            <{entity_uri}> oboInOwl:hasDbXref ?db_xrefs .
        }}
    """
    results = g.query(query, initNs=namespace)
    db_xrefs = [str(row.db_xrefs) for row in results]
    xrefs = [f"{mapping.get(x.split(':')[0], x.split(':')[0])}:{x.split(':')[1]}" for x in db_xrefs]
    return xrefs

nodes = []
for node, ancestors in node_ancestors.items():
    node_data = {}
    print(f"Node: {node}")
    print(f"Ancestors: {ancestors}")

    node_data["id"] = f"Orphanet:{node.split('_')[-1]}"
    node_data["raw_id"] = node
    parent = ancestors[-1] if ancestors else None
    if parent and parent.endswith("C001"):
        node_data["label"] = "Disease"
    else:
        # We don't care about the ancestors that are not diseases
        continue
    node_data["resource"] = "Orphanet"
    node_data["name"] = get_name(g, node)
    # http://www.orpha.net/ORDO/Orphanet_C016
    # <AnnotationProperty rdf:about="http://www.orpha.net/ORDO/Orphanet_C016">
    #     <efo:definition xml:lang="en">Relationship between a clinical entity and modes of inheritance.</efo:definition>
    #     <rdfs:label>has_inheritance</rdfs:label>
    # </AnnotationProperty>
    node_data["has_inheritance"] = get_metadata(g, node, "C016")
    # http://www.orpha.net/ORDO/Orphanet_C017
    # <AnnotationProperty rdf:about="http://www.orpha.net/ORDO/Orphanet_C017">
    #     <efo:definition xml:lang="en">Relationship between clinical entity and age of onset.</efo:definition>
    #     <rdfs:label>has_age_of_onset</rdfs:label>
    # </AnnotationProperty>
    node_data["has_age_of_onset"] = get_metadata(g, node, "C017")
    # http://www.orpha.net/ORDO/Orphanet_C022
    # <AnnotationProperty rdf:about="http://www.orpha.net/ORDO/Orphanet_C022">
    #     <efo:definition xml:lang="en">Relationship between a clinical entity and the geographical area for which epidemiological data (Epidemiology) is available.</efo:definition>
    #     <rdfs:label>present_in</rdfs:label>
    # </AnnotationProperty>
    node_data["present_in"] = get_metadata(g, node, "C022")
    node_data["description"] = get_definition(g, node)
    node_data["xrefs"] = get_db_xrefs(g, node)
    node_data["pmids"] = None
    node_data["synonyms"] = None

    nodes.append(node_data)

nodes

Node: http://www.orpha.net/ORDO/Orphanet_10
Ancestors: ['http://www.orpha.net/ORDO/Orphanet_377789', 'http://www.orpha.net/ORDO/Orphanet_557493', 'http://www.orpha.net/ORDO/Orphanet_C001', 'http://www.orpha.net/ORDO/Orphanet_557493', 'http://www.orpha.net/ORDO/Orphanet_C001']
Node: http://www.orpha.net/ORDO/Orphanet_100
Ancestors: ['http://www.orpha.net/ORDO/Orphanet_557493', 'http://www.orpha.net/ORDO/Orphanet_C001', 'http://www.orpha.net/ORDO/Orphanet_377788', 'http://www.orpha.net/ORDO/Orphanet_557493', 'http://www.orpha.net/ORDO/Orphanet_C001']
Node: http://www.orpha.net/ORDO/Orphanet_1000
Ancestors: ['http://www.orpha.net/ORDO/Orphanet_557493', 'http://www.orpha.net/ORDO/Orphanet_C001', 'http://www.orpha.net/ORDO/Orphanet_377788', 'http://www.orpha.net/ORDO/Orphanet_557493', 'http://www.orpha.net/ORDO/Orphanet_C001']
Node: http://www.orpha.net/ORDO/Orphanet_100000
Ancestors: ['http://www.orpha.net/ORDO/Orphanet_557494', 'http://www.orpha.net/ORDO/Orphanet_C001']
Node: http://www.o

[{'id': 'Orphanet:10',
  'raw_id': 'http://www.orpha.net/ORDO/Orphanet_10',
  'label': 'Disease',
  'resource': 'Orphanet',
  'name': '48,XXYY syndrome',
  'has_inheritance': ['Not applicable', 'Unknown'],
  'has_age_of_onset': ['Adolescent', 'Childhood', 'Infancy', 'Neonatal'],
  'present_in': ['Europe AND has_birth_prevalence_average_value : 1.9 AND has_birth_prevalence_range : 1-9 / 100 000',
   'Worldwide AND has_point_prevalence_range : Unknown'],
  'description': 'A rare sex chromosome number anomaly disorder characterized, genetically, by the presence of an extra X and Y chromosome in males and, clinically, by tall stature, dysfunctional testes associated with infertility and insufficient testosterone production, cognitive, affective and social functioning impairments, global developmental delay, and an increased risk of congenital malformations.',
  'xrefs': ['ICD-10:Q98.8',
   'ICD-11:LD50.3Y',
   'MESH:D007713',
   'MedDRA:10048230',
   'UMLS:C2936741'],
  'pmids': None,
  's

In [None]:
import pandas as pd

df = pd.DataFrame(nodes)
df.to_csv("orphanet.csv", index=False)