# Edge Extraction

objectives:

- Extract basenodes and edges between them for import to Neo4j

In [1]:
import sys
sys.path.append('../py')
import sparql_tools as qt

import pandas as pd
from tqdm import tqdm

In [2]:
# Read the four saved edge tables; the order of the paths matches the order
# of the names being unpacked.
comp, gene, prot, dis = [
    pd.read_hdf(h5_path)
    for h5_path in (
        'data/compound_edges.h5',
        'data/h_genes_edges.h5',
        'data/h_protein_edges.h5',
        'data/disease_edges.h5',
    )
]

In [3]:
# Tag every row with the label of the table it was loaded from
for edge_table, node_label in (
    (comp, 'Compound'),
    (gene, 'Gene'),
    (prot, 'Protein'),
    (dis, 'Disease'),
):
    edge_table['type'] = node_label

In [4]:
# Stack the four labelled tables into one edge list. concat keeps each
# frame's own index labels here (ignore_index is not set).
base = pd.concat([comp, gene, prot, dis])

## Strategy

We will classify everything as either compound, gene, disease or protein, and then extract relationships between only those 4 node types.

# Basenodes into CSV Format

Converting the basenodes into csv format for Neo4j import. Just use the base classes: Compound, Gene, Disease, and Protein.  We will also need the edges, but only where an object is also a basenode.

In [5]:
# Lookup of subject URI -> node type. A dict holds one value per key, so a
# URI that occurs in `base` with more than one type keeps only one of them.
type_dict = base.set_index('s')['type'].to_dict()

CPU times: user 380 ms, sys: 24 ms, total: 404 ms
Wall time: 400 ms


In [20]:
# Spot check: all rows of the protein table whose subject is Q248215
prot.query('s == "http://www.wikidata.org/entity/Q248215"')

Unnamed: 0,sLabel,pLabel,oLabel,s,p,o,type
122589,DYNC1I1,subclass of,gene,http://www.wikidata.org/entity/Q248215,http://www.wikidata.org/prop/direct/P279,http://www.wikidata.org/entity/Q7187,Protein
122590,DYNC1I1,genetic association,Obesity,http://www.wikidata.org/entity/Q248215,http://www.wikidata.org/prop/direct/P2293,http://www.wikidata.org/entity/Q12174,Protein
122591,DYNC1I1,genetic association,smallpox,http://www.wikidata.org/entity/Q248215,http://www.wikidata.org/prop/direct/P2293,http://www.wikidata.org/entity/Q12214,Protein
122592,DYNC1I1,encoded by,DYNC1I1,http://www.wikidata.org/entity/Q248215,http://www.wikidata.org/prop/direct/P702,http://www.wikidata.org/entity/Q248215,Protein
122593,DYNC1I1,chromosome,Homo sapiens chromosome 7,http://www.wikidata.org/entity/Q248215,http://www.wikidata.org/prop/direct/P1057,http://www.wikidata.org/entity/Q657319,Protein
122594,DYNC1I1,found in taxon,Homo sapiens,http://www.wikidata.org/entity/Q248215,http://www.wikidata.org/prop/direct/P703,http://www.wikidata.org/entity/Q15978631,Protein
122595,DYNC1I1,ortholog,Dync1i1,http://www.wikidata.org/entity/Q248215,http://www.wikidata.org/prop/direct/P684,http://www.wikidata.org/entity/Q18250190,Protein
122596,DYNC1I1,subclass of,protein-coding gene,http://www.wikidata.org/entity/Q248215,http://www.wikidata.org/prop/direct/P279,http://www.wikidata.org/entity/Q20747295,Protein
122597,DYNC1I1,encodes,Dynein cytoplasmic 1 intermediate chain 1,http://www.wikidata.org/entity/Q248215,http://www.wikidata.org/prop/direct/P688,http://www.wikidata.org/entity/Q21110010,Protein
122598,DYNC1I1,strand orientation,Forward Strand,http://www.wikidata.org/entity/Q248215,http://www.wikidata.org/prop/direct/P2548,http://www.wikidata.org/entity/Q22809680,Protein


In [9]:
print(len(base))
# Every URI that occurs as a subject counts as a basenode
base_uri = list(set(base['s']))
# Filter for edges that point to basenodes. A boolean `isin` mask selects the
# same rows as a query string built by repr()-ing the whole URI list, without
# having to generate and parse that very large expression.
base_filt = base[base['o'].isin(base_uri)]
len(base_filt)

1007063
CPU times: user 14.7 s, sys: 80 ms, total: 14.8 s
Wall time: 14.8 s


86490

In [10]:
# Take the first type recorded for each distinct subject, then tally the types
first_type_per_subject = base_filt.groupby('s')['type'].first()
first_type_per_subject.value_counts()

CPU times: user 272 ms, sys: 4 ms, total: 276 ms
Wall time: 275 ms


Protein     26706
Gene        21707
Disease      8232
Compound     4297
Name: type, dtype: int64

In [11]:
# One row per distinct subject URI (first entry of every column within the
# group); after the groupby the URI is the index, not a column.
base_nodes = base_filt.groupby('s').first()

In [12]:
# Need the basenodes that are object (target) nodes, not just the subject (source) nodes.
# After the groupby the object URI is the index, so keep only the index labels
# that never occur as a subject. An `isin` mask replaces a query string built
# from the repr of the full subject list.
base_obj = base_filt.groupby('o').first()
subject_uris = set(base_filt['s'])
base_obj = base_obj[~base_obj.index.isin(subject_uris)]

In [10]:
# Move the object URI out of the index and back into a regular column
base_obj = base_obj.reset_index()
len(base_obj)

262

In [11]:
# Create a dataframe for csv export
base_out = pd.DataFrame()

# Convert each subject URI to its ID once and reuse the result for both ID
# columns (assumes qt.id_from_uri is a pure URI -> ID conversion).
subject_ids = base_filt['s'].apply(qt.id_from_uri)
base_out[':ID'] = subject_ids
base_out['identifier:String'] = subject_ids
base_out['name:String'] = base_filt['sLabel']
base_out[':LABEL'] = base_filt['type']

base_out = base_out.reset_index(drop=True)
# The 'type' carried over by groupby('o').first() comes from a row where this
# URI was the object, i.e. it is the subject's type; overwrite it with the
# object's own type from the lookup.
base_obj['type'] = base_obj['o'].apply(lambda k: type_dict[k])

In [12]:
# Do the same with the object-only nodes
base_out1 = pd.DataFrame()
# Convert each object URI to its ID once and reuse the result for both ID
# columns (assumes qt.id_from_uri is a pure URI -> ID conversion).
object_ids = base_obj['o'].apply(qt.id_from_uri)
base_out1[':ID'] = object_ids
base_out1['identifier:String'] = object_ids
base_out1['name:String'] = base_obj['oLabel']
base_out1[':LABEL'] = base_obj['type']

In [13]:
# Concatenate subject nodes and object-only nodes, then drop exact duplicate rows.
# NOTE(review): drop_duplicates compares whole rows, so one :ID could remain
# more than once if its name or label differs between rows — confirm IDs are
# unique before importing.
base_out_f = pd.concat([base_out, base_out1]).drop_duplicates()
print(len(base_out_f), 'Nodes')

# The node export is currently disabled (left commented out).
#base_out_f.to_csv('data/hetnet_basenodes.csv', index=False)

61205 Nodes


In [18]:
# Dump the ID of every node labelled Protein, one per line
protein_ids = base_out_f.loc[base_out_f[':LABEL'] == 'Protein', ':ID']
with open('prots.txt', 'w') as fout:
    fout.writelines(node_id + '\n' for node_id in protein_ids)

In [14]:
# Number of exported nodes per label
base_out_f[':LABEL'].value_counts()

Protein     26707
Gene        21710
Disease      8312
Compound     4476
Name: :LABEL, dtype: int64

# Basenode Edge export

In [14]:
def get_edge_type(row, node_types=None):
    """Build the relationship type string for one edge row.

    The result is ``<predicate>_<S><p><E>``: the predicate label with spaces
    replaced by hyphens, an underscore, then a three-character abbreviation
    made of the first character of the start node type, of the predicate
    label and of the end node type (e.g. ``encodes_GeP``).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'s'`` (subject URI), ``'o'`` (object URI) and
        ``'pLabel'`` (predicate label).
    node_types : mapping, optional
        URI -> node type lookup. When omitted, the notebook-level
        ``type_dict`` is used, so existing calls such as
        ``df.apply(get_edge_type, axis=1)`` keep working.

    Returns
    -------
    str
        The relationship type string.

    Raises
    ------
    KeyError
        If the subject or object URI is missing from the lookup.
    IndexError
        If the predicate label or a node type is an empty string.
    """
    if node_types is None:
        # Resolved at call time so the default follows the current global lookup
        node_types = type_dict

    predicate = row['pLabel']
    start_type = node_types[row['s']]
    end_type = node_types[row['o']]

    abbreviation = start_type[0] + predicate[0] + end_type[0]
    return predicate.replace(' ', '-') + "_" + abbreviation

In [15]:
# Create a dataframe for csv export of the edges
edge_out = pd.DataFrame()

# Start/end IDs come from the subject and object URIs respectively
for id_column, uri_column in ((':START_ID', 's'), (':END_ID', 'o')):
    edge_out[id_column] = base_filt[uri_column].apply(qt.id_from_uri)
edge_out[':TYPE'] = base_filt.apply(get_edge_type, axis=1)

# Drop duplicates and export
edge_out = edge_out.drop_duplicates()
edge_out.to_csv('data/hetnet_baseedges.csv', index=False)

In [16]:
# How many distinct edge types are there?
edge_out[':TYPE'].nunique()

76

In [17]:
# How many edges are there of each type?
edge_out[':TYPE'].value_counts()

encodes_GeP                                            27164
encoded-by_PeG                                         27004
subclass-of_DsD                                        10218
physically-interacts-with_PpC                           3657
physically-interacts-with_CpP                           3596
drug-used-for-treatment_DdC                             2912
medical-condition-treated_CmD                           2822
has-part_ChC                                            2378
genetic-association_GgD                                 2201
genetic-association_DgG                                 2181
significant-drug-interaction_CsC                        1798
instance-of_CiC                                           93
symptoms_DsD                                              81
has-cause_DhD                                             55
subclass-of_CsC                                           40
subclass-of_GsC                                           39
different-from_CdC      