# Find which compounds and diseases are reachable by metapaths of length up to 4

In [1]:
import pandas as pd
import json
import py2neo
from tqdm import tqdm

In [2]:
# Read the server descriptions; every entry is used below through its
# 'name' and 'port' keys.
with open('../neo/servers.json') as read_file:
    instances = json.load(read_file)

# Build one py2neo Graph per server description, keyed by the server name.
name_to_neo = {}
for instance in instances:
    http_port = instance['port']
    # The bolt port is 7690 plus the last decimal digit of the HTTP port.
    bolt_port = 7690 + http_port % 10
    uri = 'http://localhost:{}/db/data/'.format(http_port)
    graph = py2neo.database.Graph(uri, bolt=True, bolt_port=bolt_port)
    name_to_neo[instance['name']] = graph

In [3]:
# Cypher query for every (compound, disease) pair joined by a single
# relationship of any type, in either direction.
# DISTINCT makes the result a set of unique pairs, as in query2-query4;
# without it a pair linked by several relationships is returned once per
# relationship, which inflates the row count of the resulting table.
query = """
MATCH (n0:Compound)-[e1]-(n1:Disease)
RETURN DISTINCT
n0.identifier AS compound_id,
n1.identifier AS disease_id
"""

In [4]:
# Select the graph registered under the name 'wikidata-v1.0'; the query cells
# below run against whatever `neo` currently points to.
neo = name_to_neo['wikidata-v1.0']
# Bare expression: shows the selected connection as the cell output.
neo

<Graph uri='http://localhost:7500/db/data/'>

In [5]:
# Run the single-relationship query on the selected graph and tabulate the
# returned records, then report the row count and preview three rows.
direct = pd.DataFrame(data=neo.data(query))
print(len(direct.index))
direct.head(n=3)

2651


Unnamed: 0,compound_id,disease_id
0,Q900898,Q18558279
1,Q900898,Q133087
2,Q900898,Q101896


In [6]:
# Number of distinct diseases that appear in at least one direct
# compound-disease pair.
direct['disease_id'].nunique()

572

In [7]:
# Cypher query for the distinct (compound, disease) pairs joined by a path of
# two relationships through one intermediate node n1 of any label.
# The WHERE clause requires n1 to differ from both endpoints; the two
# endpoints themselves (n0, n2) are not compared with each other.
query2 = """
MATCH (n0:Compound)-[e1]-(n1)-[e2]-(n2:Disease)
WHERE
n0 <> n1 AND n1 <> n2
RETURN DISTINCT
n0.identifier AS compound_id,
n2.identifier AS disease_id
"""

In [8]:
# Run the two-relationship query on the selected graph and tabulate the
# returned records, then report the row count and preview three rows.
len2 = pd.DataFrame(data=neo.data(query2))
print(len(len2.index))
len2.head(n=3)

15069


Unnamed: 0,compound_id,disease_id
0,Q10354103,Q168403
1,Q4746466,Q12174
2,Q408557,Q12174


In [9]:
# Number of distinct diseases found by the length-1 and length-2 queries
# combined.
pd.concat([direct, len2])['disease_id'].nunique()

2081

In [10]:
# Cypher query for the distinct (compound, disease) pairs joined by a path of
# three relationships through two intermediate nodes of any label.
# The WHERE clause requires every pair of nodes on the path to be different,
# except for the two endpoints (n0, n3), which are not compared.
query3 = """
MATCH (n0:Compound)-[e1]-(n1)-[e2]-(n2)-[e3]-(n3:Disease)
WHERE
n0 <> n1 AND n0 <> n2 AND
n1 <> n2 AND n1 <> n3 AND 
n2 <> n3
RETURN DISTINCT
n0.identifier AS compound_id,
n3.identifier AS disease_id
"""

In [11]:
# Run the three-relationship query on the selected graph and tabulate the
# returned records, then report the row count and preview three rows.
len3 = pd.DataFrame(data=neo.data(query3))
print(len(len3.index))
len3.head(n=3)

126331


Unnamed: 0,compound_id,disease_id
0,Q187695,Q12192
1,Q187695,Q47912
2,Q187695,Q180664


In [12]:
# Number of distinct diseases found by the length-1, length-2 and length-3
# queries combined.
pd.concat([direct, len2, len3])['disease_id'].nunique()

3754

In [13]:
# Cypher query for the distinct (compound, disease) pairs joined by a path of
# four relationships through three intermediate nodes of any label.
# The WHERE clause requires every pair of nodes on the path to be different,
# except for the two endpoints (n0, n4), which are not compared.
query4 = """
MATCH (n0:Compound)-[e1]-(n1)-[e2]-(n2)-[e3]-(n3)-[e4]-(n4:Disease)
WHERE
n0 <> n1 AND n0 <> n2 AND n0 <> n3 AND
n1 <> n2 AND n1 <> n3 AND n1 <> n4 AND
n2 <> n3 AND n2 <> n4 AND
n3 <> n4
RETURN DISTINCT
n0.identifier AS compound_id,
n4.identifier AS disease_id
"""

In [14]:
# Run the four-relationship query on the selected graph and tabulate the
# returned records, then report the row count and preview three rows.
len4 = pd.DataFrame(data=neo.data(query4))
print(len(len4.index))
len4.head(n=3)

783893


Unnamed: 0,compound_id,disease_id
0,Q894611,Q168403
1,Q2823280,Q168403
2,Q6523413,Q168403


In [15]:
# Number of distinct diseases found by the four queries (lengths 1 to 4)
# combined.
pd.concat([direct, len2, len3, len4])['disease_id'].nunique()

5072

In [16]:
# Stack the results of all four path lengths and keep each
# (compound_id, disease_id) row only once.
tables_by_length = [direct, len2, len3, len4]
reach_df = pd.concat(tables_by_length)
reach_df = reach_df.drop_duplicates()

In [17]:
# Report how many unique pairs were found, then save them; index=False keeps
# the row index out of the CSV file.
print(len(reach_df))
reach_df.to_csv('data/reach.csv', index=False)

838714


In [18]:
# Number of distinct compounds that appear in at least one pair of reach_df.
reach_df['compound_id'].nunique()

2491

In [27]:
# Pairs found by the length-2, length-3 and length-4 queries only; the direct
# (length-1) pairs are deliberately left out.
m_path = pd.concat([len2, len3, len4])

In [28]:
# Set of disease identifiers that occur in any pair of m_path.
m_path_set = set(m_path['disease_id'])

In [29]:
# Diseases that occur in a direct pair but in none of the pairs collected in
# m_path_set.
set(direct['disease_id']) - m_path_set

{'Q1149042',
 'Q1648484',
 'Q1755568',
 'Q18557017',
 'Q201180',
 'Q3547899',
 'Q6822340'}

In [19]:
# Repeat the four reachability queries on every instance after the first one
# and keep one de-duplicated (compound_id, disease_id) table per instance.
# The loop binds the connection to its own name, `permuted_neo`, so that `neo`
# keeps pointing at the graph selected earlier; re-running an earlier query
# cell after this one therefore does not silently query the last instance
# visited here.
df_list = []

for instance in tqdm(instances[1:]):
    permuted_neo = name_to_neo[instance['name']]

    # Same queries, same order: lengths 1, 2, 3 and 4.
    permuted_frames = [
        pd.DataFrame(permuted_neo.data(cypher))
        for cypher in (query, query2, query3, query4)
    ]

    df_list.append(pd.concat(permuted_frames).drop_duplicates())
    

100%|██████████| 5/5 [14:03<00:00, 169.81s/it]


In [20]:
# Put the table computed from `neo` first, followed by the tables computed in
# the loop over instances[1:], in loop order.
new_dfs = [reach_df] + df_list

In [21]:
# Sanity check: number of tables collected (reach_df plus one per looped
# instance).
len(new_dfs)

6

In [22]:
# One set of disease identifiers per table, in the same order as new_dfs.
dis_sets = [set(table['disease_id']) for table in new_dfs]

In [23]:
# Print the number of distinct diseases in each table, in new_dfs order.
for ds in dis_sets:
    print(len(ds))

5072
4731
4710
4752
4754
4745


In [24]:
# For each table, print how many diseases of the first table (dis_sets[0]) are
# absent from it; the first line compares the first table with itself.
for ds in dis_sets:
    print(len(dis_sets[0] - ds))

0
341
362
320
318
327


In [25]:
# For each table, print how many of its diseases are absent from the first
# table (dis_sets[0]).
for ds in dis_sets:
    print(len(ds - dis_sets[0]))

0
0
0
0
0
0


In [30]:
# Tag every table, in place, with the name of the instance it is paired with.
# NOTE(review): the pairing is purely positional, so new_dfs[0] (reach_df) is
# labelled with instances[0]['name'], whereas reach_df was queried from the
# graph selected by the name 'wikidata-v1.0'. This assumes that graph is the
# first entry of servers.json -- confirm.
# The column is added to reach_df and to the frames in df_list themselves, so
# those objects carry a 'network' column from here on.
for df, instance in zip(new_dfs, instances):
    df['network'] = instance['name']

In [31]:
# Stack all per-network tables into a single table.
reach = pd.concat(new_dfs)

In [32]:
# Save the combined table; index=False keeps the row index out of the CSV file.
reach.to_csv('data/reach_all_perms.csv', index=False)