In [1]:
from neo4j import GraphDatabase
import networkx as nx
import pandas as pd
from sklearn import metrics
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# Bolt endpoint of the Neo4j instance at disease.ncats.io; the login uses the
# "neo4j" user with an empty password.
uri = "bolt://disease.ncats.io:80"
neo4j_auth = ("neo4j", "")
driver = GraphDatabase.driver(uri, auth=neo4j_auth)


# Free exploration

In [3]:
# Fetch up to 100 paths linking the Orphanet "RETINITIS PIGMENTOSA" node to
# any neighbour through an I_CODE relationship (either direction).
query = (
    "MATCH p=(n:S_ORDO_ORPHANET {N_Name:'RETINITIS PIGMENTOSA'})"
    "-[:I_CODE]-(m) RETURN p LIMIT 100"
)

session = driver.session()
results = session.run(query)

In [4]:
# Take the first node of the result graph as a sample to inspect.
# NOTE(review): `_nodes` is a private attribute of the driver's graph object.
sample_nodes = list(results.graph()._nodes.values())
node = sample_nodes[0]

In [5]:
# Labels of the sampled node, looked up by its id in the result graph.
results.graph()._nodes[node.id].labels

frozenset({'Class', 'ENTITY', 'S_ORDO_ORPHANET', 'S_PHENOTYPE'})

In [6]:
# Identifier (`id`) of the sampled node.
node.id

1317850

In [7]:
# Property names available on the sampled node.
node.keys()

dict_keys(['parent', 'R_closeMatch', 'N_Name', 'kind', 'created', 'R_subClassOf', 'source', 'R_hasPhenotype', 'I_CODE', 'lastUpdated', '_N_Name', 'R_equivalentClass', 'R_exactMatch', 'rank', 'R_rel', '_I_CODE'])

In [29]:
# Summarise every node of the result graph as [labels, names, property keys].
# The graph is fetched once instead of on every iteration, and a comprehension
# is used so the `node` sample inspected earlier is not rebound.
# `.get` yields None for a node that has no N_Name property instead of raising.
# NOTE(review): `_nodes` is a private attribute of the driver's graph object.
graph_nodes = results.graph()._nodes
query_data = [
    [entity.labels, entity.get('N_Name'), entity.keys()]
    for entity in graph_nodes.values()
]

In [30]:
# Keep only the summaries whose label set contains S_OMIM.
[entry for entry in query_data if 'S_OMIM' in entry[0]]

[[frozenset({'Class', 'ENTITY', 'S_OMIM', 'S_PHENOTYPE', 'T047'}),
  ['IDDRP', 'INTELLECTUAL DEVELOPMENTAL DISORDER AND RETINITIS PIGMENTOSA'],
  dict_keys(['parent', 'R_closeMatch', 'N_Name', 'kind', 'created', 'I_GENE', 'source', 'R_hasPhenotype', 'I_CODE', 'lastUpdated', '_N_Name', '_I_GENE', 'rank', 'R_rel', '_I_CODE'])],
 [frozenset({'Class', 'ENTITY', 'S_OMIM', 'S_PHENOTYPE', 'T047'}),
  ['RP83', 'RETINITIS PIGMENTOSA 83'],
  dict_keys(['parent', 'R_closeMatch', 'N_Name', 'kind', 'created', 'I_GENE', 'source', 'R_hasPhenotype', 'I_CODE', 'lastUpdated', '_N_Name', '_I_GENE', 'rank', 'R_rel', '_I_CODE'])],
 [frozenset({'Class', 'ENTITY', 'S_OMIM', 'S_PHENOTYPE', 'T047'}),
  ['RP80', 'RETINITIS PIGMENTOSA 80'],
  dict_keys(['parent', 'R_closeMatch', 'N_Name', 'kind', 'I_GENE', 'created', 'source', 'R_hasPhenotype', 'I_CODE', 'lastUpdated', '_N_Name', 'R_equivalentClass', '_I_GENE', 'rank', 'R_rel', '_I_CODE'])],
 [frozenset({'Class', 'ENTITY', 'S_OMIM', 'S_PHENOTYPE', 'T047'}),
  ['

In [26]:
# OMIM entries that are not also labelled S_PHENOTYPE.
omim_set = [
    entry
    for entry in query_data
    if 'S_OMIM' in entry[0] and 'S_PHENOTYPE' not in entry[0]
]

In [27]:
# Show the OMIM entries that lack the S_PHENOTYPE label.
omim_set

[[frozenset({'Class', 'ENTITY', 'S_OMIM', 'T047'}),
  'TAPETORETINAL DEGENERATION',
  dict_keys(['lastUpdated', 'parent', '_N_Name', 'N_Name', 'kind', 'created', 'rank', 'R_subClassOf', 'source', 'R_rel', '_I_CODE', 'I_CODE'])],
 [frozenset({'Class', 'ENTITY', 'S_OMIM', 'T047'}),
  'RETINITIS PIGMENTOSA',
  dict_keys(['lastUpdated', 'parent', '_N_Name', 'N_Name', 'kind', 'created', 'rank', 'R_subClassOf', 'source', 'R_rel', '_I_CODE', 'I_CODE'])]]

# DOID

In [15]:
# Sample up to 25 two-hop paths (any relationship type, either direction)
# between an S_MONDO node and an S_GARD node.
query = """
MATCH p=(n:S_MONDO)-[*2]-(g:S_GARD) RETURN p LIMIT 25
"""

session = driver.session()
results = session.run(query)

In [16]:
# Mapping of node id -> node for everything on the sampled paths.
# NOTE(review): `_nodes` is a private attribute of the driver's graph object.
nodes = results.graph()._nodes

In [26]:
# Fetch the DATA payload node attached to each node found on the sampled paths.
# The ids are sent as a query parameter instead of being formatted into the
# Cypher text, so the statement stays short and constant regardless of how
# many ids are looked up.
payload_query = "MATCH (s)<-[:PAYLOAD]-(d:DATA) WHERE ID(s) IN $node_ids RETURN s,d"
results = driver.session().run(payload_query, node_ids=list(nodes.keys()))

In [27]:
# Mapping of node id -> node for the payload query result (matched nodes and
# their DATA nodes).
nodes_payload = results.graph()._nodes

In [37]:
# Dump the full id -> node mapping of the payload result for inspection.
results.graph()._nodes

{5883165: <Node id=5883165 labels=frozenset({'ENTITY', 'TRANSIENT', 'S_MONDO', 'Class'}) properties={'lastUpdated': 1607785652125, 'parent': 5756, 'kind': 'ncats.stitcher.Entity', 'created': 1607785652125, 'R_equivalentClass': ['http://purl.obolibrary.org/obo/MONDO_0001083'], 'rank': 1, 'source': '73d8af1b3', '_I_CODE': 'MESH:D005198', 'I_CODE': 'MESH:D005198'}>,
 5883166: <Node id=5883166 labels=frozenset({'DATA'}) properties={'id': 'http://purl.obolibrary.org/obo/MESH_D005198', 'type': ['Class'], 'uri': 'http://purl.obolibrary.org/obo/MESH_D005198', 'created': 1607785652128, 'notation': 'MESH:D005198'}>,
 127722: <Node id=127722 labels=frozenset({'S_PHENOTYPE', 'Class', 'human_phenotype', 'ENTITY', 'S_HP'}) properties={'parent': 5756, 'N_Name': ["'DE TONI-FANCONI-DEBRE' SYNDROME", 'RENAL TUBULAR FANCONI SYNDROME', 'RENAL FANCONI SYNDROME'], 'kind': 'ncats.stitcher.Entity', 'created': 1607577110442, 'R_subClassOf': ['http://purl.obolibrary.org/obo/HP_0011038', 'UMLS:C0341703'], 'sourc

In [38]:
# Look up the node whose id is 1 in the payload result.
results.graph()._nodes[1]

<Node id=1 labels=frozenset({'S_PHENOTYPE', 'S_GARD', 'Congenital and Genetic Diseases', 'Digestive Diseases', 'Metabolic disorders', 'ENTITY'}) properties={'parent': 5756, 'N_Name': ['GRACILE SYNDROME', 'FELLMAN DISEASE', 'FELLMAN SYNDROME', 'FINNISH LACTIC ACIDOSIS WITH HEPATIC HEMOSIDEROSIS', 'FINNISH LETHAL NEONATAL METABOLIC SYNDROME', 'FLNMS', 'GROWTH DELAY-AMINOACIDURIA-CHOLESTASIS-IRON OVERLOAD-LACTIC ACIDOSIS-EARLY DEATH SYNDROME', 'GROWTH RESTRICTION-AMINOACIDURIA-CHOLESTASIS-IRON OVERLOAD-LACTIC ACIDOSIS-EARLY DEATH SYNDROME', 'GROWTH RETARDATION, AMINOACIDURIA, CHOLESTASIS, IRON OVERLOAD, LACTIC ACIDOSIS AND EARLY DEATH'], 'kind': 'ncats.stitcher.Entity', 'created': 1604466914527, 'source': 'ae83c1975', 'R_hasPhenotype': ['HP:0003355', 'HP:0001511', 'HP:0004925', 'HP:0001396', 'HP:0003281', 'HP:0003452', 'HP:0003542', 'HP:0001319', 'HP:0000365', 'HP:0001394', 'HP:0001397', 'HP:0001994', 'HP:0003128', 'HP:0012464', 'HP:0012465', 'HP:0100613'], 'I_CODE': ['GARD:0000001', 'OMI

# Disease mapping matrix

In [39]:
# Per-source settings read by disease_matrix():
#   'labels' - extra node labels appended after the source label in the pattern
#   'cons'   - condition template; {X} is replaced by the DATA node alias
# Key order matters: sources are paired in the order they are listed here.
data_sources = {
    'S_GARD': {'cons': '{X}.is_rare=true'},
    'S_DOID': {},
    'S_ORDO_ORPHANET': {
        'cons': 'not exists({X}.symbol) and not exists({X}.reason_for_obsolescence)',
    },
    'S_GHR': {},
    'S_HP': {},
    'S_ICD10CM': {},
    'S_MEDGEN': {'labels': ['T047']},
    'S_MEDLINEPLUS': {'labels': ['T047']},
    'S_MESH': {'labels': ['T047']},
    'S_MONDO': {'cons': 'exists({X}.label)'},
    'S_NORD': {},
    'S_OMIM': {'labels': ['T047']},
    'S_THESAURUS': {'labels': ['Disease or Syndrome']},
}

def _label_pattern(source, spec):
    """Return the back-quoted Cypher label chain for one data source.

    The source name comes first, followed by any extra labels listed under
    the optional 'labels' key of ``spec``.
    """
    labels = [source] + list(spec.get('labels', []))
    return ''.join(':`%s`' % label for label in labels)


def disease_matrix(session, sources=None):
    """Print one overlap-count Cypher query for every pair of data sources.

    For each unordered pair (s1, s2), taken in the iteration order of
    ``sources``, a query is built that matches nodes of s1 linked to nodes of
    s2 through an N_Name or I_CODE relationship and returns the distinct node
    counts on each side. Each query is printed prefixed with 'executing ==> '.

    Parameters
    ----------
    session
        Currently unused: the queries are only printed, never run, so any
        placeholder value is accepted.
    sources : dict, optional
        Mapping of source label -> settings dict. Recognised settings are
        'labels' (extra node labels) and 'cons' (a condition template in which
        ``{X}`` is replaced by the DATA node alias, 'a' for the first source
        and 'b' for the second). Defaults to the module-level ``data_sources``.

    Returns
    -------
    None
    """
    if sources is None:
        sources = data_sources
    names = list(sources)
    for i, s1 in enumerate(names):
        ds1 = sources[s1]
        # Left-hand side of the pattern is shared by every pair starting at s1.
        prefix = ('match (a:DATA)-->(n%s)-[:N_Name|:I_CODE*1]-'
                  % _label_pattern(s1, ds1))
        for s2 in names[i + 1:]:
            ds2 = sources[s2]
            q = prefix + '(m%s)<--(b:DATA)' % _label_pattern(s2, ds2)
            # Constraints apply to the DATA payload nodes: a for s1, b for s2.
            conditions = []
            if 'cons' in ds1:
                conditions.append(ds1['cons'].format(X='a'))
            if 'cons' in ds2:
                conditions.append(ds2['cons'].format(X='b'))
            if conditions:
                q += ' where ' + ' and '.join(conditions)
            q += (' return count(distinct n) as `%s`, count(distinct m) as `%s`'
                  % (s1, s2))
            print('executing ==> %s' % q)
            
# disease_matrix does not use its session argument, so a placeholder (0) is
# passed; the call only prints the generated queries.
disease_matrix(0)

executing ==> match (a:DATA)-->(n:`S_GARD`)-[:N_Name|:I_CODE*1]-(m:`S_DOID`)<--(b:DATA) where a.is_rare=true return count(distinct n) as `S_GARD`, count(distinct m) as `S_DOID`
executing ==> match (a:DATA)-->(n:`S_GARD`)-[:N_Name|:I_CODE*1]-(m:`S_ORDO_ORPHANET`)<--(b:DATA) where a.is_rare=true and not exists(b.symbol) and not exists(b.reason_for_obsolescence) return count(distinct n) as `S_GARD`, count(distinct m) as `S_ORDO_ORPHANET`
executing ==> match (a:DATA)-->(n:`S_GARD`)-[:N_Name|:I_CODE*1]-(m:`S_GHR`)<--(b:DATA) where a.is_rare=true return count(distinct n) as `S_GARD`, count(distinct m) as `S_GHR`
executing ==> match (a:DATA)-->(n:`S_GARD`)-[:N_Name|:I_CODE*1]-(m:`S_HP`)<--(b:DATA) where a.is_rare=true return count(distinct n) as `S_GARD`, count(distinct m) as `S_HP`
executing ==> match (a:DATA)-->(n:`S_GARD`)-[:N_Name|:I_CODE*1]-(m:`S_ICD10CM`)<--(b:DATA) where a.is_rare=true return count(distinct n) as `S_GARD`, count(distinct m) as `S_ICD10CM`
executing ==> match (a:DATA)--

# Disease Similarity

In [58]:
query = """
match P = (n:S_GARD) where any (x in n.I_CODE where x=~ "ORPHA.*") return distinct n.I_CODE
"""

results = driver.session().run(query)

gard2other = []
for mapping in results.data():
    map_list = mapping['n.I_CODE']
    for i in map_list[1:]:
        if 'ORPHA:' in i:
            gard2other.append((map_list[0],i))

In [59]:
query = """
match P = (n:S_GARD) where any (x in n.I_CODE where x=~ "OMIM.*") return distinct n.I_CODE
"""

results = driver.session().run(query)

for mapping in results.data():
    map_list = mapping['n.I_CODE']
    for i in map_list[1:]:
        if 'OMIM:' in i:
            gard2other.append((map_list[0],i))

In [60]:
query = """
match P = (n:S_GARD) where any (x in n.I_CODE where x=~ "UMLS.*") return distinct n.I_CODE
"""

results = driver.session().run(query)

for mapping in results.data():
    map_list = mapping['n.I_CODE']
    for i in map_list[1:]:
        if 'UMLS:' in i:
            gard2other.append((map_list[0],i))

In [62]:
# Long-format table with one row per collected (GARD, MAPPING) pair.
gard_map_df = pd.DataFrame(gard2other,columns=['GARD','MAPPING'])

In [113]:
# One indicator column per mapped code (no column-name prefix), summed per
# GARD id so that each row describes one GARD entry.
gard_map_onehot = (
    pd.get_dummies(gard_map_df, columns=['MAPPING'], prefix='', prefix_sep='')
    .groupby('GARD')
    .sum()
)
gard_map_onehot.head()

Unnamed: 0_level_0,OMIM:100070,OMIM:100100,OMIM:100300,OMIM:100600,OMIM:100678,OMIM:100700,OMIM:100800,OMIM:100820,OMIM:101000,OMIM:101200,...,UMLS:C2936859,UMLS:C2936860,UMLS:C2936861,UMLS:C2936862,UMLS:C2936863,UMLS:C2936864,UMLS:C2960310,UMLS:C3203653,UMLS:C3896969,UMLS:TEST
GARD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GARD:0000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GARD:0000003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GARD:0000005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GARD:0000007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GARD:0000011,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [114]:
# Row totals give the number of mapped codes per GARD id; value_counts shows
# how many GARD ids have each total.
gard_map_onehot.sum(axis=1).value_counts()

2    1936
1    1860
3    1369
4      22
5       7
6       2
7       2
dtype: int64

In [119]:
# Pairwise cosine similarity between the rows of the one-hot mapping table.
# NOTE(review): no index/columns are passed, so the frame is labelled with
# default integer positions rather than the GARD ids in gard_map_onehot.index.
gard_mapping_cosim = pd.DataFrame(metrics.pairwise.cosine_similarity(gard_map_onehot))

In [136]:
# Preview the upper triangle of the similarity matrix; k=1 zeroes the diagonal
# and everything below it.
np.triu(gard_mapping_cosim,1)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [139]:
# Keep each unordered pair once: mask everything except the strict upper
# triangle, then flatten to one row per (row label, column label, value).
upper_mask = np.triu(np.ones(gard_mapping_cosim.shape), 1).astype(bool)
df = gard_mapping_cosim.where(upper_mask).stack().reset_index()

In [141]:
# Name the flattened columns: the two labels of the pair and their similarity.
df.columns = ['GARD_1','GARD_2','cosim']

In [150]:
# Pairs whose cosine similarity exceeds 0.5.
df[df.cosim>.5]

Unnamed: 0,GARD_1,GARD_2,cosim
1369437,270,2833,0.516398
1684342,334,4490,0.57735
2219632,446,1452,0.666667
2526082,511,1232,1.0
3787064,788,2695,1.0
4074011,854,859,0.707107
4590049,974,2997,0.57735
4980822,1068,1273,0.816497
5236386,1130,2792,0.57735
5428174,1177,4559,0.707107


In [151]:
# Translate two row positions of the one-hot table back to their GARD ids.
# NOTE(review): 4759 and 4760 are hard-coded; presumably a pair picked from
# the similarity table - confirm.
gard_map_onehot.index[[4759,4760]]

Index(['GARD:0012354', 'GARD:0012355'], dtype='object', name='GARD')

# Phenotype and Genotype ontology reconstructions

In [5]:
# Sample up to 25 paths between directly connected S_HP nodes and materialise
# the rows as a list of dicts.
query = """
MATCH p=(n:S_HP)--(m:S_HP) RETURN p LIMIT 25
"""
hp_result = driver.session().run(query)
results = hp_result.data()


In [6]:
# Inspect the fetched path records.
results

[{'p': [{'parent': 5756,
    'lastUpdated': 1607576877325,
    '_N_Name': 'ABDOMINAL INGUINAL RING|ANNULUS INGUINALIS PROFUNDUS|INTERNAL ABDOMINAL RING|INTERNAL INGUINAL RING|INTERNAL RING|DEEP INGUINAL RING',
    'N_Name': ['ABDOMINAL INGUINAL RING',
     'ANNULUS INGUINALIS PROFUNDUS',
     'INTERNAL ABDOMINAL RING',
     'INTERNAL INGUINAL RING',
     'INTERNAL RING',
     'DEEP INGUINAL RING'],
    'kind': 'ncats.stitcher.Entity',
    'created': 1607576877322,
    'R_equivalentClass': ['649d2103-b84b-4a7c-bf07-c3ad275d8234',
     'b7ec7a3b-abda-4326-bd2e-150614eeb5dc',
     'd7e84242-d005-4872-b51a-51d0747895cc'],
    'rank': 4,
    'R_subClassOf': ['http://purl.obolibrary.org/obo/UBERON_0006204',
     'http://purl.obolibrary.org/obo/UBERON_0006674',
     'http://purl.obolibrary.org/obo/UBERON_0013721'],
    'source': 'd7a8dc9fd',
    '_I_CODE': 'UBERON:0013721',
    'I_CODE': 'UBERON:0013721'},
   'R_subClassOf',
   {'parent': 5756,
    'lastUpdated': 1607577221101,
    '_N_Name':

In [8]:
# For S_GARD nodes whose DATA payload has is_rare=true, follow R_hasPhenotype
# to an S_HP node and then 1 to 3 R_rel / R_subClassOf hops (either direction)
# to further S_HP nodes; at most 100 rows are returned.
query = """
MATCH (d:DATA)-[:PAYLOAD]->(n:S_GARD)-[:R_hasPhenotype]->(p:S_HP)-[:R_rel|:R_subClassOf*1..3]-(p2:S_HP)
WHERE d.is_rare=true
RETURN n, p, p2 LIMIT 100
"""

In [9]:
# Run the query defined in the previous cell and materialise all rows.
# NOTE(review): the recorded run of this cell ended in ServiceUnavailable
# (connection to the server was lost), so no result is shown.
results = driver.session().run(query).data()


No data
Failed to read from defunct connection IPv4Address(('disease.ncats.io', 80)) (IPv4Address(('54.205.210.57', 80)))


ServiceUnavailable: Failed to read from defunct connection IPv4Address(('disease.ncats.io', 80)) (IPv4Address(('54.205.210.57', 80)))