# Compute DWPCs for hetnet-compound-disease pairs

In [1]:
import json
import threading
import concurrent.futures
import bz2
import csv
import time

import pandas
import py2neo
from tqdm import tqdm

## Connect to neo4j server

In [2]:
# Override the default py2neo timeout
# NOTE(review): 1e8 seconds is effectively "never time out"; presumably chosen so
# that slow path-count queries are not cut off by the HTTP client — confirm.
py2neo.packages.httpstream.http.socket_timeout = 1e8

In [3]:
# Load the neo4j instance descriptions; each entry provides a 'name' and a 'port'
with open('servers.json') as read_file:
    instances = json.load(read_file)

# Build a lookup from instance name to a py2neo Graph connected to that server
name_to_neo = {}
for instance in instances:
    port = instance['port']
    uri = 'http://localhost:{}/db/data/'.format(port)
    # The bolt port is derived from the last digit of the HTTP port
    bolt_port = (port % 10) + 7690
    name_to_neo[instance['name']] = py2neo.database.Graph(uri, bolt=True, bolt_port=bolt_port)

## Read metapaths

In [4]:
# Load the metapath definitions, ordered by their first join-complexity value
with open('data/metapaths.json') as read_file:
    metapaths = sorted(
        json.load(read_file),
        key=lambda metapath: metapath['join_complexities'][0],
    )

len(metapaths)

295

## Read hetnet-compound-disease pairs

In [5]:
# Read the table of hetnet-compound-disease rows
part_df = pandas.read_table('data/partitions.tsv')
# Attach to every row the Graph connection registered under its hetnet name
part_df['neo'] = part_df['hetnet'].map(name_to_neo)
# Keep the rows as namedtuples so fields can be read by attribute later on
parts = [row for row in part_df.itertuples()]
print(part_df.shape[0])
part_df.head(2)

161327


Unnamed: 0,hetnet,compound_id,disease_id,status,primary,neo
0,wikidata-v0.1,Q10354103,Q1004647,0,1,<Graph uri='http://localhost:7500/db/data/'>
1,wikidata-v0.1_perm-1,Q10354103,Q1004647,0,0,<Graph uri='http://localhost:7501/db/data/'>


In [6]:
# Number of distinct diseases in the partition table
part_df.disease_id.nunique()

626

In [7]:
# Number of distinct compounds in the partition table
part_df.compound_id.nunique()

1019

## Set up queries

In [8]:
# Total number of queries
# One query is issued per (metapath, partition row) combination
total_queries = len(part_df) * len(metapaths)
format(total_queries, ',')

'47,591,465'

In [9]:
# Rows of part_df as namedtuples, consumed by generate_parameters
parts = list(part_df.itertuples())

def generate_parameters(max_elems=None):
    """Generate compound, disease, metapath combinations.

    Walks the module-level ``metapaths`` list (outer loop) and the
    module-level ``parts`` rows (inner loop) and yields one dict per
    combination. The dict keys match the keyword arguments of
    ``compute_dwpc``: neo, hetnet, compound_id, disease_id, metapath,
    query and w (fixed at 0.4).

    Parameters
    ----------
    max_elems : int or None
        Stop after yielding this many dicts. ``None`` (default) yields
        every combination.
    """
    n = 0
    for metapath_dict in metapaths:
        metapath = metapath_dict['abbreviation']
        query = metapath_dict['dwpc_query']
        for part_info in parts:
            if max_elems is not None and n == max_elems:
                # End the whole generator. A `break` would only leave the
                # inner loop, so every remaining metapath would still be
                # visited and `parts` re-scanned for nothing.
                return
            yield {
                'neo': part_info.neo,
                'hetnet': part_info.hetnet,
                'compound_id': part_info.compound_id,
                'disease_id': part_info.disease_id,
                'metapath': metapath,
                'query': query,
                'w': 0.4,
            }
            n += 1

In [10]:
def compute_dwpc(neo, hetnet, query, metapath, compound_id, disease_id, w):
    """Run one DWPC query and append the outcome as a row to the shared writer.

    The query is executed through ``neo.data`` with ``source``, ``target``
    and ``w`` as parameters; only the first returned record is used. The
    written row is: hetnet, compound_id, disease_id, metapath, PC, w,
    DWPC (6 significant digits) and elapsed seconds (4 significant digits).

    Relies on the module-level ``writer`` and ``writer_lock``; the lock
    serialises writes coming from several threads.
    """
    began = time.time()
    first_record = neo.data(query, source=compound_id, target=disease_id, w=w)[0]
    elapsed = time.time() - began
    row = (
        hetnet,
        compound_id,
        disease_id,
        metapath,
        first_record['PC'],
        w,
        format(first_record['DWPC'], '.6g'),
        format(elapsed, '.4g'),
    )
    with writer_lock:
        writer.writerow(row)

## Execute queries

In [11]:
def test(neo, hetnet, query, metapath, compound_id, disease_id, w):
    print(neo, hetnet, query, metapath, compound_id, disease_id, w)

In [41]:
# NOTE(review): scratch cell. `params`, `writer` and `writer_lock` are only
# assigned by cells further down, so this call fails on a fresh top-to-bottom run.
compute_dwpc(**params)

In [29]:
# Ad-hoc check: for Compound nodes, count neighbours reached over
# `drug-used-for-treatment_DduftC` relationships (first 4 rows only)
query = 'MATCH (a:Compound)-[:`drug-used-for-treatment_DduftC`]-(other) RETURN a.name, count(other) LIMIT 4'

In [30]:
# NOTE(review): `g` is only assigned inside the spot-check loop further down,
# so this cell depends on kernel state from an out-of-order run.
g.data(query)

[{'a.name': 'metirosine', 'count(other)': 1},
 {'a.name': 'Urofollitropin', 'count(other)': 2},
 {'a.name': 'molindone', 'count(other)': 2},
 {'a.name': 'cabazitaxel', 'count(other)': 1}]

In [15]:
# NOTE(review): scratch cell. `params` is only assigned by cells further down,
# so this fails on a fresh top-to-bottom run.
test(**params)

<Graph uri='http://localhost:7504/db/data/'> wikidata-v0.1_perm-4 MATCH path = (n0:Compound)<-[:`instance-of_CioC`]-(n1)<-[:`instance-of_CioC`]-(n2)<-[:`instance-of_CioC`]-(n3)-[:`drug-used-for-treatment_DduftC`]-(n4:Disease)
USING JOIN ON n2
WHERE n0.identifier = { source }
AND n4.identifier = { target }
AND n0 <> n1 AND n0 <> n2 AND n0 <> n3 AND n1 <> n2 AND n1 <> n3 AND n2 <> n3
WITH
[
size((n0)<-[:`instance-of_CioC`]-()),
size(()<-[:`instance-of_CioC`]-(n1)),
size((n1)<-[:`instance-of_CioC`]-()),
size(()<-[:`instance-of_CioC`]-(n2)),
size((n2)<-[:`instance-of_CioC`]-()),
size(()<-[:`instance-of_CioC`]-(n3)),
size((n3)-[:`drug-used-for-treatment_DduftC`]-()),
size(()-[:`drug-used-for-treatment_DduftC`]-(n4))
] AS degrees, path
RETURN
count(path) AS PC,
sum(reduce(pdp = 1.0, d in degrees| pdp * d ^ -{ w })) AS DWPC C<ioC<ioC<ioCduftD Q370244 Q18557998 0.4


In [44]:
# NOTE(review): `metapath_dict` is a loop variable left behind by the
# `for metapath_dict in metapaths` cell below; undefined on a fresh kernel.
metapath_dict['abbreviation']

'CsdiCsdiCduftD<soD'

In [50]:
# Spot check: run every metapath query for one fixed partition row (index 1854)
# and show only the results whose DWPC is non-zero.
for metapath_dict in metapaths:
    metapath = metapath_dict['abbreviation']
    query = metapath_dict['dwpc_query']
    part_info = parts[1854]
    params = dict(
        neo=part_info.neo,
        hetnet=part_info.hetnet,
        compound_id=part_info.compound_id,
        disease_id=part_info.disease_id,
        metapath=metapath,
        query=query,
        w=0.4,
    )

    # Unpack the pieces needed for a direct call to the graph
    g, query = params['neo'], params['query']
    source, target = params['compound_id'], params['disease_id']
    w = params['w']

    res = g.data(query, source=source, target=target, w=w)
    if res[0]['DWPC'] != 0:
        print(res)


[{'DWPC': 0.0018540439162014872, 'PC': 8}]
[{'DWPC': 0.03477346681446545, 'PC': 6}]
[{'DWPC': 0.020715739227722747, 'PC': 4}]
[{'DWPC': 0.0015621473793640199, 'PC': 1}]
[{'DWPC': 0.0037495345405597004, 'PC': 1}]
[{'DWPC': 0.0016320797031912603, 'PC': 1}]
[{'DWPC': 0.002937409003502495, 'PC': 1}]
[{'DWPC': 0.010060728514355851, 'PC': 2}]


In [None]:
%%time

# Parameters
workers = 4
max_elems = None

# Prepare writer
print('preparing writer...')
path = 'data/dwpc.tsv.bz2'
write_file = bz2.open(path, 'wt')
writer = csv.writer(write_file, delimiter='\t')
writer.writerow(['hetnet', 'compound_id', 'disease_id', 'metapath', 'PC', 'w', 'DWPC', 'seconds'])

# Create ThreadPoolExecutor
print('creating executer...')
executor = concurrent.futures.ThreadPoolExecutor(max_workers=workers)
writer_lock = threading.Lock()

# Submit jobs
print('submitting first job...')
n_queries = 0
for params in tqdm(generate_parameters(max_elems)):
    executor.submit(compute_dwpc, **params)
    n_queries += 1

# Shutdown and close
executor.shutdown()
write_file.close()

10146it [00:00, 51275.88it/s]

preparing writer...
creating executer...
submitting first job...


875805it [00:25, 48284.00it/s]

In [16]:
# Number of jobs submitted by the execution cell above.
# NOTE(review): the saved output appears to come from an earlier run with a
# different `max_elems` (execution count 16 vs. the unexecuted cell above) — confirm.
n_queries

1000000