# Prepare the selected metapaths for feature computation for all compound-disease pairs

In [1]:
import json
import itertools
import bz2
import configparser

import pandas
from statsmodels.sandbox.stats.multicomp import multipletests

## Read node info

In [2]:
# Read compound and disease degrees
compound_df = pandas.read_table('../summary/compounds.tsv')
disease_df = pandas.read_table('../summary/diseases.tsv')
len(compound_df), len(disease_df)

(4374, 8264)

In [3]:
indication_df = pandas.read_table('../summary/indications.tsv')
indication_df['category'] = 'DM'
indication_df = indication_df[['compound_id', 'disease_id', 'category']]
indication_df.head(2)

Unnamed: 0,compound_id,disease_id,category
0,Q413147,Q12125,DM
1,Q413147,Q12174,DM


In [7]:
reach_df = pandas.read_csv('../all-features/data/reach.csv', index_col=0).reset_index(drop=True)
reach_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id
0,wikidata-v0.1,Q1002165,Q1004647
1,wikidata-v0.1,Q10354103,Q1004647


In [4]:
rows = list()
for c, d in itertools.product(compound_df.itertuples(), disease_df.itertuples()):
    row = c.compound_id, c.compound_name, d.disease_id, d.disease_name
    rows.append(row)
pair_df = pandas.DataFrame(rows, columns=['compound_id', 'compound_name', 'disease_id', 'disease_name'])
pair_df = pair_df.merge(indication_df, how='left')
pair_df['status'] = (pair_df.category == 'DM').astype(int)
pair_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status
0,Q161459,"(+)-1,2-diaminocyclohexane",Q4596888,2-hydroxyglutaric aciduria,,0
1,Q161459,"(+)-1,2-diaminocyclohexane",Q3335660,3-M syndrome,,0


In [5]:
pair_df['status'].value_counts()

0    36143767
1        2969
Name: status, dtype: int64

In [6]:
len(pair_df)

36146736

In [7]:
with bz2.open('features/compound-disease-pairs.tsv.bz2', 'wt') as write_file:
    pair_df.to_csv(write_file, sep='\t', index=False)

## Select metapaths

In [8]:
auroc_df = pandas.read_table('../all-features/data/feature-performance/auroc.tsv')
reject, pvals_corrected, alphacSidak, alphacBonf = multipletests(auroc_df.pval_delta_auroc, method='fdr_bh')
auroc_df['fdr_delta_auroc'] = pvals_corrected
auroc_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc,fdr_delta_auroc
0,C<hpCduftD,0.50034,0.49996,0.50038,0.000135,0.5,0.000341,9.2617e-08,3.601772e-07
1,C<hpCduftDduftCduftD,0.50029,0.5,0.50029,0.000202,0.50002,0.000273,0.00021131,0.0003287044


In [23]:
# standard is 0.55
whitelist_df = auroc_df.query(
    "rdwpc_auroc > 0.54"
    " and delta_auroc > 0"
    " and fdr_delta_auroc < 0.05"
    " and pdwpc_primary_auroc > 0.5"
).copy()
whitelist_df['feature'] = 'dwpc_' + whitelist_df['metapath']
whitelist_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc,fdr_delta_auroc,feature
15,CduftDduftCduftD,0.82804,0.76269,0.79249,0.18814,0.5907,0.23734,8.42e-09,3.929333e-08,dwpc_CduftDduftCduftD
24,CduftDso>D<soD,0.61272,0.5182,0.60223,0.058942,0.50151,0.11121,6.3751e-09,3.432746e-08,dwpc_CduftDso>D<soD


In [24]:
metapaths = set(whitelist_df.metapath)
len(metapaths)

7

In [25]:
metapaths

{'CduftDduftCduftD',
 'CduftDso>D<soD',
 'CduftDso>D<soD<soD',
 'CduftDso>DduftCduftD',
 'CduftDso>Dso>D<soD',
 'CpiwPpiwCduftD',
 'CsdiCsdiCduftD'}

## Metaedges in chosen metapaths

In [26]:
m2m_df = pandas.read_table('../all-features/data/metaedge-in-metapath.tsv')
m2m_df = m2m_df.query("metapath in @metapaths")

In [27]:
# Counting multiple occurrences in the same metapath
m2m_df.metaedge.value_counts().reset_index()

Unnamed: 0,index,metaedge
0,Disease - drug-used-for-treatment - Compound,11
1,Disease > subclass-of > Disease,9
2,Protein - physically-interacts-with - Compound,2
3,Compound - significant-drug-interaction - Comp...,2


In [28]:
# Counting only one metaedge occurrence per metapath
m2m_df.drop_duplicates().metaedge.value_counts().reset_index()

Unnamed: 0,index,metaedge
0,Disease - drug-used-for-treatment - Compound,11
1,Disease > subclass-of > Disease,9
2,Protein - physically-interacts-with - Compound,2
3,Compound - significant-drug-interaction - Comp...,2


In [29]:
# Number of included metaedges
m2m_df.metaedge.nunique()

4

In [30]:
with open('../all-features/data/metapaths.json') as read_file:
    metapath_obj = json.load(read_file)
metapath_obj = [x for x in metapath_obj if x['abbreviation'] in metapaths]
with open('features/metapaths.json', 'wt') as write_file:
    metapath_obj = json.dump(metapath_obj, write_file, indent=2, sort_keys=True)

In [31]:
total_queries = len(metapaths) * len(pair_df)
print('{:,} total queries'.format(total_queries))

253,027,152 total queries
