# Assign positives and negatives 

In [1]:
import itertools
import random
import sys
import json

import py2neo
import pandas

## Startup neo4j and connections

In [2]:
with open('servers.json') as read_file:
    instances = json.load(read_file)

for instance in instances:
    uri = 'http://localhost:{}/db/data/'.format(instance['port'])

    uri = 'http://localhost:{}/db/data/'.format(instance['port'])
    bolt_port = 7690 + (instance['port'] % 10)
    instance['py2neo'] = py2neo.database.Graph(uri, bolt = True, bolt_port = bolt_port)
    
    if instance['name'] == 'wikidata-v0.1':
        neo_unperm = instance['py2neo']

hetnets = [x['name'] for x in instances]
hetnets

['wikidata-v0.1',
 'wikidata-v0.1_perm-1',
 'wikidata-v0.1_perm-2',
 'wikidata-v0.1_perm-3',
 'wikidata-v0.1_perm-4',
 'wikidata-v0.1_perm-5']

## Create partitions

In [3]:
def to_df(record_list):
    """Convert a py2neo RecordList to a dataframe"""
    return pandas.DataFrame(record_list)

In [4]:
compound_df = pandas.read_table('../summary/compounds.tsv')
disease_df = pandas.read_table('../summary/diseases.tsv')

In [5]:
'{} compounds × {} diseases = {} pairs'.format(
    len(disease_df), len(compound_df), len(disease_df) * len(compound_df))

'8264 compounds × 4374 diseases = 36146736 pairs'

In [6]:
nonzero_prior_pairs = set(itertools.product(
    compound_df.query("treats > 0").compound_id,
    disease_df.query("treats > 0").disease_id)
)

In [7]:
indication_query = '''
MATCH (compound:Compound)<-[treatment:`drug-used-for-treatment_DduftC`]-(disease:Disease)
RETURN
  compound.identifier AS compound_id,
  disease.identifier AS disease_id
ORDER BY
  compound_id, disease_id
'''

def partition(neo):
    """
    Extract negative and positive compound-disease pairs from a py2neo.Graph.
    """
    # Use TREATS_CtD as positives
    indication_df = to_df(neo.data(indication_query))
    positives = set(zip(indication_df.compound_id, indication_df.disease_id))
    # Use nonzero-prior pairs excluding non-negatives as negatives
    negatives = nonzero_prior_pairs - positives
    negatives = random.sample(negatives, k=(len(positives) * 4))
    rows = list()
    for status, pairs in (0, negatives), (1, positives):
        for drug, disease in pairs:
            rows.append((drug, disease, status))
    df = pandas.DataFrame(rows, columns=['compound_id', 'disease_id', 'status'])
    df = df.sort_values(['disease_id', 'status', 'compound_id'])
    return df

In [8]:
compound_df.query('treats > 0')

Unnamed: 0,compound_id,compound_name,treats,palliates,total_edges
5,Q413147,(+)-phenylpropanolamine,3,0,3
8,Q3322838,(+/-)-benzphetamine,1,0,6
14,Q407418,(-)-menthol,1,0,3
23,Q474880,(21R)-argatroban anhydrous,1,0,1
37,Q3231623,(E)-cefprozil anhydrous,5,0,5
38,Q2439845,(E)-crotamiton,1,0,1
45,Q417227,(R)-etiracetam,1,0,1
65,Q422416,17β-estradiol,6,0,11
93,Q1117888,4-[1-hydroxy-2-(1-phenoxypropan-2-ylamino)prop...,3,0,3
122,Q2697833,Abatacept,1,0,1


In [9]:
part_dfs = list()
for seed, instance in enumerate(instances):
    random.seed(seed, version=2)
    part_df = partition(instance['py2neo'])
    part_df.insert(0, 'hetnet', instance['name'])
    part_dfs.append(part_df)
part_df = pandas.concat(part_dfs)

In [10]:
part_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status
9850,wikidata-v0.1,Q10354103,Q1004647,0
6932,wikidata-v0.1,Q19856779,Q1004647,0


In [11]:
## Create an even-matrix for unpermuted observations
template_df = pandas.DataFrame(list(itertools.product(hetnets, compound_df.compound_id)), columns=['hetnet', 'compound_id'])
unperm_pair_df = part_df.query("hetnet == 'wikidata-v0.1'")[['compound_id', 'disease_id']]
unperm_pair_df = template_df.merge(unperm_pair_df)
unperm_pair_df = unperm_pair_df.merge(part_df, how='left')
unperm_pair_df = unperm_pair_df[unperm_pair_df.status.isnull()]
unperm_pair_df.status = unperm_pair_df.status.fillna(0).astype(int)
unperm_pair_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status
18,wikidata-v0.1_perm-1,Q413147,Q114085,0
19,wikidata-v0.1_perm-1,Q413147,Q117121,0


In [12]:
part_df['primary'] = 1
unperm_pair_df['primary'] = 0
full_part_df = pandas.concat([part_df, unperm_pair_df])
full_part_df = full_part_df.sort_values(['compound_id', 'disease_id', 'hetnet'])
full_part_df.head(5)

Unnamed: 0,hetnet,compound_id,disease_id,status,primary
9850,wikidata-v0.1,Q10354103,Q1004647,0,1
72540,wikidata-v0.1_perm-1,Q10354103,Q1004647,0,0
72552,wikidata-v0.1_perm-2,Q10354103,Q1004647,0,0
72564,wikidata-v0.1_perm-3,Q10354103,Q1004647,0,0
72576,wikidata-v0.1_perm-4,Q10354103,Q1004647,0,0


In [13]:
full_part_df.to_csv('data/partitions.tsv', sep='\t', index=False)

In [14]:
# Number of hetnet-compound-disease pairs
len(full_part_df)

161327

In [15]:
# Number of positives and negatives per purpose
pandas.crosstab(full_part_df.primary, full_part_df.status)

status,0,1
primary,Unnamed: 1_level_1,Unnamed: 2_level_1
0,72257,0
1,71256,17814


In [16]:
# Number of positives and negatives per hetnet
pandas.crosstab(full_part_df.hetnet, full_part_df.status)

status,0,1
hetnet,Unnamed: 1_level_1,Unnamed: 2_level_1
wikidata-v0.1,11876,2969
wikidata-v0.1_perm-1,26326,2969
wikidata-v0.1_perm-2,26321,2969
wikidata-v0.1_perm-3,26301,2969
wikidata-v0.1_perm-4,26359,2969
wikidata-v0.1_perm-5,26330,2969
