# Extracting Walks from Unpermuted and Permuted Networks

In [1]:
import sys
import pandas as pd
sys.path.append('../src')
from extractor import MatrixFormattedGraph

In [2]:
# Load the unpermuted network from its node and edge files. Per the log printed
# below, construction generates the adjacency matrices and then weights them by
# degree with a dampening factor of 0.4.
unperm_graph = MatrixFormattedGraph('nodes.csv', 'edges.csv')

Reading file information...
Processing node and edge data...
Initializing metagraph...
Generating adjacency matrices...


100%|██████████| 24/24 [00:59<00:00,  4.69s/it]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 24/24 [00:30<00:00,  1.26s/it]


In [3]:
# Extract DWWC features from Compound start nodes to Disease end nodes.
# n_jobs presumably sets the number of parallel workers -- confirm in extractor.
unperm_walks = unperm_graph.extract_dwwc(
    start_nodes='Compound',
    end_nodes='Disease',
    n_jobs=16,
)

Preparing function arguments...
Calculating DWWCs...


100%|██████████| 1206/1206 [00:39<00:00, 30.88it/s]



Formatting results to series...


100%|██████████| 1206/1206 [00:04<00:00, 283.71it/s]



Concatenating series to DataFrame...


In [4]:
# Label every row with the hetnet it came from so the unpermuted results stay
# distinguishable once they are concatenated with the permuted ones.
unperm_walks['hetnet'] = 'rephetio-v2.0'

In [5]:
# Preview the first rows: one row per (compound_id, disease_id) pair, one column per feature.
unperm_walks.head()

Unnamed: 0,compound_id,disease_id,CbG<rG<rGaD,CbG<rG<rGdD,CbG<rG<rGuD,CbG<rGaD,CbG<rGaDrD,CbG<rGbCpD,CbG<rGbCtD,CbG<rGcGaD,...,CuGuDpCtD,CuGuDpSpD,CuGuDrD,CuGuDrDrD,CuGuDtCpD,CuGuDtCtD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD,hetnet
0,DB00014,DOID:0050156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002885,0.0,0.000216,0.000662,0.001791,rephetio-v2.0
1,DB00014,DOID:0050425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rephetio-v2.0
2,DB00014,DOID:0050741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000413,0.0,0.0,0.0,0.0,0.0,0.000211,0.00116,rephetio-v2.0
3,DB00014,DOID:0050742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00805,0.0,0.0,0.000816,0.001002,rephetio-v2.0
4,DB00014,DOID:0060073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rephetio-v2.0


## Handling the permuted networks

In [6]:
%%time
permuted_walks = []

for i in range(1, 6):
    edge_file = 'hetnet_perm-{}.csv'.format(i)
    
    permuted_graph = MatrixFormattedGraph('nodes.csv', edge_file)
    
    res = permuted_graph.extract_dwwc(start_nodes='Compound', end_nodes='Disease', n_jobs=4)
    res['hetnet'] = 'rephetio-v2.0_perm-{}'.format(i)
    
    permuted_walks.append(res)

Reading file information...
Processing node and edge data...
Initializing metagraph...
Generating adjacency matrices...


100%|██████████| 24/24 [01:00<00:00,  5.12it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 24/24 [00:32<00:00,  1.34s/it]


Preparing function arguments...
Calculating DWWCs...


100%|██████████| 1206/1206 [00:40<00:00, 29.84it/s]



Formatting results to series...


100%|██████████| 1206/1206 [00:04<00:00, 279.34it/s]



Concatenating series to DataFrame...
Reading file information...
Processing node and edge data...
Initializing metagraph...
Generating adjacency matrices...


100%|██████████| 24/24 [01:00<00:00,  4.58it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 24/24 [00:32<00:00,  1.34s/it]


Preparing function arguments...
Calculating DWWCs...


100%|██████████| 1206/1206 [00:41<00:00, 29.04it/s]



Formatting results to series...


100%|██████████| 1206/1206 [00:04<00:00, 286.70it/s]



Concatenating series to DataFrame...
Reading file information...
Processing node and edge data...
Initializing metagraph...
Generating adjacency matrices...


100%|██████████| 24/24 [01:00<00:00,  4.44it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 24/24 [00:32<00:00,  1.36s/it]


Preparing function arguments...
Calculating DWWCs...


100%|██████████| 1206/1206 [00:40<00:00, 29.50it/s]



Formatting results to series...


100%|██████████| 1206/1206 [00:04<00:00, 284.29it/s]



Concatenating series to DataFrame...
Reading file information...
Processing node and edge data...
Initializing metagraph...
Generating adjacency matrices...


100%|██████████| 24/24 [00:59<00:00,  4.98it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 24/24 [00:32<00:00,  1.36s/it]


Preparing function arguments...
Calculating DWWCs...


100%|██████████| 1206/1206 [00:40<00:00, 29.96it/s]



Formatting results to series...


100%|██████████| 1206/1206 [00:04<00:00, 286.54it/s]



Concatenating series to DataFrame...
Reading file information...
Processing node and edge data...
Initializing metagraph...
Generating adjacency matrices...


100%|██████████| 24/24 [01:00<00:00,  3.01it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 24/24 [00:32<00:00,  1.36s/it]


Preparing function arguments...
Calculating DWWCs...


100%|██████████| 1206/1206 [00:40<00:00, 29.53it/s]



Formatting results to series...


100%|██████████| 1206/1206 [00:04<00:00, 270.22it/s]



Concatenating series to DataFrame...
CPU times: user 8min 47s, sys: 13.6 s, total: 9min
Wall time: 12min 22s


In [7]:
# Stack the unpermuted and permuted feature tables into one frame. Each input
# frame carries its own row labels, so without ignore_index the combined frame
# would hold six copies of every label; ignore_index=True gives it a fresh,
# unique RangeIndex instead.
result = pd.concat([unperm_walks] + permuted_walks, ignore_index=True)

In [8]:
import bz2

In [9]:
# Persist the combined feature table as a bz2-compressed TSV; floats are written
# with four significant digits and the row index is omitted.
with bz2.open('full-dwwc-features.tsv.bz2', mode='wt') as out_handle:
    result.to_csv(
        out_handle,
        index=False,
        sep='\t',
        float_format='%.4g',
    )

## Merge with partitions.tsv

Merging the features into `partitions.tsv` means that only a small subset of Compound-Disease pairs is examined, which requires much less memory.

In [13]:
# Load the tab-separated partition table listing the (hetnet, compound, disease)
# pairs to keep.
full_part_df = pd.read_csv(
    '../../../store/projects/learn-1/all-features/data/partitions.tsv',
    sep='\t',
)

In [14]:
# Preview the partition table; hetnet, compound_id and disease_id are the merge keys used below.
full_part_df.head()

Unnamed: 0,hetnet,compound_id,disease_id,status,primary
0,rephetio-v2.0_perm-2,DB00014,DOID:0050742,0,1
1,rephetio-v2.0_perm-5,DB00014,DOID:0050742,0,1
2,rephetio-v2.0_perm-5,DB00014,DOID:0060073,0,1
3,rephetio-v2.0,DB00014,DOID:10283,1,1
4,rephetio-v2.0_perm-1,DB00014,DOID:10283,0,0


In [15]:
# Attach the DWWC features to each partition row. A left join keeps every
# partition row, including any without a matching feature row.
dwwc_parts = full_part_df.merge(
    result,
    how='left',
    on=['hetnet', 'compound_id', 'disease_id'],
)

In [16]:
# Preview the merged table: partition columns followed by the DWWC feature columns.
dwwc_parts.head()

Unnamed: 0,hetnet,compound_id,disease_id,status,primary,CbG<rG<rGaD,CbG<rG<rGdD,CbG<rG<rGuD,CbG<rGaD,CbG<rGaDrD,...,CuGuDpCpD,CuGuDpCtD,CuGuDpSpD,CuGuDrD,CuGuDrDrD,CuGuDtCpD,CuGuDtCtD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD
0,rephetio-v2.0_perm-2,DB00014,DOID:0050742,0,1,0.0,0.00014,9.4e-05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,rephetio-v2.0_perm-5,DB00014,DOID:0050742,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003477,0.002161,0.001438
2,rephetio-v2.0_perm-5,DB00014,DOID:0060073,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,rephetio-v2.0,DB00014,DOID:10283,1,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000736,0.0,0.0,0.002448,0.000273,0.001891,0.0,0.000196
4,rephetio-v2.0_perm-1,DB00014,DOID:10283,0,0,0.0,0.0,0.0,0.0,0.0,...,0.00609,0.0,0.001531,0.0,0.0,0.0,0.0,0.001825,0.001091,0.00118


In [17]:
# Check the merged table's dimensions (rows, columns).
dwwc_parts.shape

(30137, 1211)

In [18]:
# Persist the partition-restricted feature table as a bz2-compressed TSV; floats
# are written with four significant digits and the row index is omitted.
with bz2.open('partitioned-dwwc-features.tsv.bz2', mode='wt') as out_handle:
    dwwc_parts.to_csv(
        out_handle,
        index=False,
        sep='\t',
        float_format='%.4g',
    )