# Edge Extraction

objectives:

- Extract basenodes and edges between them for import to Neo4j

In [7]:
import sys
sys.path.append('../py')
import processing as pt

import pandas as pd
from tqdm import tqdm

In [8]:
comp = pd.read_hdf('data/compound_edges.h5')
gene = pd.read_hdf('data/h_genes_edges.h5')
prot = pd.read_hdf('data/h_protein_edges.h5')
dis = pd.read_hdf('data/disease_edges.h5')

In [9]:
comp['type'] = 'Compound'
gene['type'] = 'Gene'
prot['type'] = 'Protein'
dis['type'] = 'Disease'

In [10]:
base = pd.concat([comp, gene, prot, dis])

## Strategy

We will classify eveything as either compound, gene, disease or protein, and then extract relationships between only those 4 node types.  

# Basenodes into CSV Format

Converting the basenodes into csv format for Neo4j impoort. Just use the base classes: Compound, Gene, Disease, and Protein.  We will also need the edges, but only where an object is also a basenode.

In [11]:
node_types = pt.get_node_type_dict(base, 'type')

In [12]:
base_filt = pt.filter_untyped_nodes(base)

In [13]:
base_filt['e_type'] = pt.get_edge_types(base_filt, node_types)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [14]:
base_filt1 = pt.remove_low_count_edges(base_filt)

In [15]:
pair_list = pt.get_pair_lists(base_filt1)

In [16]:
reciprocal_types = pt.find_reciprocal_relations(pair_list)

In [17]:
base_filt2 = pt.remove_reciprocals(base_filt1, reciprocal_types)

In [18]:
len(base), len(base_filt), len(base_filt1), len(base_filt2)

(1007063, 86490, 86024, 49825)

In [19]:
base_filt1['e_type'].value_counts()

encodes_GeP                           27164
encoded-by_PebG                       27004
subclass-of_DsoD                      10218
physically-interacts-with_PpiwC        3657
physically-interacts-with_CpiwP        3596
drug-used-for-treatment_DduftC         2912
medical-condition-treated_CmctD        2822
has-part_ChpC                          2378
genetic-association_GgaD               2201
genetic-association_DgaG               2181
significant-drug-interaction_CsdiC     1798
instance-of_CioC                         93
Name: e_type, dtype: int64

In [20]:
base_filt2['e_type'].value_counts()

encodes_GeP                           27178
subclass-of_DsoD                      10218
physically-interacts-with_PpiwC        3670
drug-used-for-treatment_DduftC         2969
has-part_ChpC                          2378
genetic-association_GgaD               2223
significant-drug-interaction_CsdiC     1096
instance-of_CioC                         93
Name: e_type, dtype: int64

In [21]:
reciprocal_types

[['drug-used-for-treatment_DduftC', 'medical-condition-treated_CmctD'],
 ['encodes_GeP', 'encoded-by_PebG'],
 ['physically-interacts-with_PpiwC', 'physically-interacts-with_CpiwP'],
 ['genetic-association_GgaD', 'genetic-association_DgaG'],
 ['significant-drug-interaction_CsdiC', 'significant-drug-interaction_CsdiC']]

In [22]:
filtered_edges = pt.prep_for_export(base)

In [23]:
node_neo = pt.format_nodes_neo(filtered_edges, node_types)

In [24]:
edge_neo = pt.format_edges_neo(filtered_edges)

In [25]:
len(node_neo), len(edge_neo)

(61011, 49825)

## Files needed for 'learn' pipline

Files include:

 - metagraph.json <- from metagraph
     - Made from:
         - metaedge Tuples
         - abbrevation dict
 - degrees.xlsx  <- from graph
 - metaedge_style.tsv <- from metagraph
 - 

In [30]:
metaedge_tuples = pt.get_metaedge_tuples(filtered_edges, node_types, reciprocal_types)
metaedge_tuples

[('Disease', 'Compound', 'drug-used-for-treatment', 'both'),
 ('Protein', 'Compound', 'physically-interacts-with', 'both'),
 ('Compound', 'Compound', 'has-part', 'forward'),
 ('Compound', 'Compound', 'instance-of', 'forward'),
 ('Gene', 'Protein', 'encodes', 'both'),
 ('Gene', 'Disease', 'genetic-association', 'both'),
 ('Disease', 'Disease', 'subclass-of', 'forward'),
 ('Compound', 'Compound', 'significant-drug-interaction', 'both')]

In [50]:
dir_dict = {x[2]: x[3] for x in metaedge_tuples}

In [33]:
abbrev_dict = pt.get_abbrev_dict(filtered_edges)
abbrev_dict

{'Compound': 'C',
 'Disease': 'D',
 'Gene': 'G',
 'Protein': 'P',
 'drug-used-for-treatment': 'duft',
 'encoded-by': 'eb',
 'encodes': 'e',
 'genetic-association': 'ga',
 'has-part': 'hp',
 'instance-of': 'io',
 'medical-condition-treated': 'mct',
 'physically-interacts-with': 'piw',
 'significant-drug-interaction': 'sdi',
 'subclass-of': 'so'}

In [40]:
from hetio.hetnet import MetaGraph
from hetio.readwrite import write_metagraph
metagraph = MetaGraph.from_edge_tuples(metaedge_tuples, abbrev_dict)
write_metagraph(metagraph, 'data/basenodes_metagraph.json')

In [43]:
from hetio.hetnet import Graph
graph = Graph(metagraph)

In [45]:
def add_node_from_row(row):
    graph.add_node(kind = row[':LABEL'], identifier=row[':ID'], name=row['name:String'])

In [70]:
import sparql_tools as qt

node_types1 = {qt.id_from_uri(k):v for k,v in node_types.items()}

In [71]:
def add_edge_from_row(row):
    start_id = (node_types1[row[':START_ID']],row[':START_ID'])
    end_id = (node_types1[row[':END_ID']],row[':END_ID'])
    kind = row[':TYPE'].split('_')[0]
    graph.add_edge(start_id, end_id, kind, dir_dict[kind])


In [52]:
node_neo.apply(add_node_from_row, axis=1)

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
12       None
13       None
14       None
15       None
16       None
17       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
30       None
31       None
32       None
         ... 
79135    None
79164    None
79502    None
79766    None
80135    None
80625    None
80885    None
80980    None
81243    None
81301    None
81303    None
81306    None
81458    None
81490    None
81982    None
82214    None
82515    None
82758    None
82759    None
82765    None
82951    None
83274    None
83288    None
83960    None
84094    None
84226    None
84254    None
84263    None
84304    None
84373    None
dtype: object

In [72]:
edge_neo.apply(add_edge_from_row, axis = 1)

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
85292    None
85293    None
85294    None
85295    None
85296    None
85297    None
85298    None
85299    None
85300    None
85301    None
85302    None
85303    None
85304    None
85305    None
85306    None
85307    None
85308    None
85309    None
85310    None
85311    None
85312    None
85313    None
85314    None
85315    None
85316    None
85317    None
85318    None
85319    None
85320    None
85321    None
dtype: object

In [61]:
len(list(graph.get_nodes()))

61011

In [73]:
len(list(graph.get_edges()))

49825

In [75]:
len(node_neo), len(edge_neo)

(61011, 49825)

In [80]:
from hetio.stats import degrees_to_excel
degrees_to_excel(graph, 'data/degrees.xlsx')

In [81]:
from hetio.stats import get_metaedge_style_df
metaedge_style_df = get_metaedge_style_df(metagraph)
metaedge_style_df.to_csv('data/metaedge-styles.tsv', sep='\t', index=False)

## Save Neo4j Import Files

In [34]:
node_neo.to_csv('data/hetnet_basenodes.csv', index=False)

In [35]:
node_neo[':LABEL'].value_counts()

Protein     26705
Gene        21668
Disease      8264
Compound     4374
Name: :LABEL, dtype: int64

In [36]:
edge_neo.to_csv('data/hetnet_baseedges.csv', index = False)

In [37]:
# How many edge types are there?
edge_neo[':TYPE'].nunique()

8

In [38]:
# How many of each type of edges are there?
edge_neo[':TYPE'].value_counts()

encodes_GeP                           27178
subclass-of_DsoD                      10218
physically-interacts-with_PpiwC        3670
drug-used-for-treatment_DduftC         2969
has-part_ChpC                          2378
genetic-association_GgaD               2223
significant-drug-interaction_CsdiC     1096
instance-of_CioC                         93
Name: :TYPE, dtype: int64