# Making N1 connections for Neo4j Import

In [2]:
import sys
sys.path.append('../py')
import sparql_tools as qt
import processing as pt

import pandas as pd
from tqdm import tqdm

In [3]:
# Load the four base edge tables (compound, gene, protein, disease) from HDF5.
# Paths are relative to the notebook's working directory.
comp = pd.read_hdf('data/compound_edges.h5')
gene = pd.read_hdf('data/h_genes_edges.h5')
prot = pd.read_hdf('data/h_protein_edges.h5')
dis = pd.read_hdf('data/disease_edges.h5')

In [4]:
# Tag every row of each table with the node type of its source table.
# gene['type'] is only a placeholder here; it is overwritten with finer
# gene categories further down.
comp['type'] = 'Compound'
gene['type'] = 'Gene'
prot['type'] = 'Protein'
dis['type'] = 'Disease'

## Split Genes into multiple types

Rationale:

- Using a subclass of link for all these subtypes would add more nodes and edges to the graph
- Since learning performance is dependent on path length, having this information encoded in the node_type will
reduce the path length
- Almost all genes have 2 types (56,500 / 60,000) so it makes sense to separate these types out.

In [5]:
# Build one combined 'subclass of' label per gene, with multiple classes joined by '; '.
# NOTE(review): exact output schema comes from processing.combine_multiclass — confirm there.
gene_types = pt.combine_multiclass(gene, edge_type='subclass of', sep='; ')

In [6]:
gene_types['label'].value_counts()

gene; protein-coding gene                    20700
gene; non-coding RNA                         18064
gene; pseudogene                             16861
gene                                          3447
gene; transfer RNA                             553
gene; small nucleolar RNA                      431
gene; small nuclear RNA                         63
gene; ribosomal RNA                             39
gene; non-coding RNA; protein-coding gene        2
gene; protein-coding gene; pseudogene            1
Name: label, dtype: int64

Let's use 4 main categories:

 - Gene
 - Protein-coding gene
 - Non-coding RNA
 - Pseudogene
 
tRNA and small nucleolar RNA may be large enough to try, too, but at first pass we're going to stick with these 4
 
In terms of hierarchy, protein-coding > Non-coding > pseudogene > Gene

In [7]:
def categorize_func(item):
    """Map a combined gene label onto one of the four gene categories.

    The substrings are tested in priority order (protein-coding gene, then
    non-coding RNA, then pseudogene); the first one found in ``item`` decides
    the category. Anything without a match is returned as plain 'Gene'.
    """
    priority = (
        ('protein-coding gene', 'Protein-coding Gene'),
        ('non-coding RNA', 'Non-coding RNA'),
        ('pseudogene', 'Pseudogene'),
    )
    for needle, category in priority:
        if needle in item:
            return category
    return 'Gene'

In [8]:
# Collapse the combined labels into the four categories.
# NOTE: this overwrites 'label' in place and is not idempotent. The category
# names differ in case from the substrings categorize_func searches for, so
# running this cell a second time would relabel every row as 'Gene'.
# Rebuild gene_types with combine_multiclass before re-running.
gene_types['label'] = gene_types['label'].apply(categorize_func)

In [9]:
gene_types['label'].value_counts()

Protein-coding Gene    20703
Non-coding RNA         18064
Pseudogene             16861
Gene                    4533
Name: label, dtype: int64

In [10]:
# Look up each gene subject's category and store it as the row's type.
# The plain dict indexing raises KeyError if a subject in `gene` has no
# entry in gene_type_dict.
gene_type_dict = pt.get_node_type_dict(gene_types)
gene['type'] = gene['s'].apply(lambda s: gene_type_dict[s])

# Stack all base tables into one frame and show how many rows each type has.
base = pd.concat([comp, gene, prot, dis])
base['type'].value_counts()

Protein                495113
Compound               190386
Protein-coding Gene    162902
Non-coding RNA          65856
Pseudogene              52178
Disease                 29447
Gene                    11181
Name: type, dtype: int64

## Filter out Low Count Edges

## Look at N1 Nodes

In [11]:
n1 = pd.read_hdf('data/n1_edges.h5')

In [12]:
n1_filt = pt.filter_bad(n1)

In [13]:
# Edge labels treated as "descriptive" edges; the same labels are the keys of de_map.
# NOTE(review): `de` is not referenced again in the visible notebook — confirm it is still needed.
de = ['biological process', 'cell component', 'molecular function', 'found in taxon', 'chromosome', 'strand orientation']

### Assign types with descriptive edges

In [14]:
# Descriptive edge label -> node type given to the objects of that edge.
de_map = {
    'biological process': 'GO Term',
    'cell component': 'GO Term',
    'molecular function': 'GO Term',
    'found in taxon': 'Taxon',
    'chromosome': 'Chromosome',
    'strand orientation': 'Strand Orientation'
   }

# Object URI -> node type, collected over every descriptive edge in `base`.
# Labels are processed in de_map order; a URI reached through several labels
# keeps the type of the last one.
mapping_dict = {}
for edge_label, node_type in de_map.items():
    matches = base.query('pLabel == {!r}'.format(edge_label))
    mapping_dict.update(dict.fromkeys(matches['o'], node_type))

### Assign types to remaining nodes

- Start with 'instance of'
- next do 'subclass of' and 'found in taxon'
    - Doing both will separate genes and proteins by taxon

In [15]:
# First pass: combined labels using combine_multiclass's default edge type.
# NOTE(review): presumably 'instance of', per the notes above — confirm in processing.py.
n1_types = pt.combine_multiclass(n1_filt)
n1_dict = pt.get_node_type_dict(n1_types)

In [16]:
# Second pass: combined labels built from both 'subclass of' and 'found in taxon' edges.
n1_types1 = pt.combine_multiclass(n1_filt, edge_type=['subclass of', 'found in taxon'])
n1_dict1 = pt.get_node_type_dict(n1_types1)

In [17]:
# Merge in place: for keys present in both, the n1_dict value replaces the n1_dict1 value.
n1_dict1.update(n1_dict)

In [18]:
# Merge in place: types from mapping_dict replace any existing entry for the same key.
n1_dict1.update(mapping_dict)

In [19]:
# Type every N1 subject from the merged dictionary; subjects without an entry become 'Other'.
n1_filt['type'] = n1_filt['s'].apply(lambda uri: n1_dict1.get(uri, 'Other'))

In [20]:
n1_filt['type'].value_counts()[:15]

mature microRNA                                              292164
Mus musculus; gene; protein-coding gene                      136320
brown rat; gene; protein-coding gene                         121899
GO Term                                                       44584
chemical hazard                                               15328
protein family                                                13970
protein domain                                                13351
Saccharomyces cerevisiae S288c; gene; protein-coding gene     11915
human                                                          3258
Supersecondary structure                                       2023
Other                                                          1408
taxon                                                           563
active site                                                     414
encyclopedia                                                    408
Structural motif                                

## Combine Base and N1 node edges

In [21]:
# Apply the same filter_bad step to the base edges that was applied to the N1
# edges, then stack both into one edge list with a fresh 0..n-1 index.
base_filt = pt.filter_bad(base)
full_list = pd.concat([base_filt, n1_filt]).reset_index(drop=True)

In [22]:
neo_nodes = pt.nodes_neo_export(full_list)

In [23]:
neo_edges = pt.edges_neo_export(full_list)

In [24]:
len(neo_nodes)

323086

In [25]:
len(neo_edges)

1578644

In [26]:
neo_nodes[':LABEL'].nunique()

1098

In [27]:
neo_nodes[':LABEL'].value_counts()[:30]

Compound                                                     156692
Protein                                                       26706
Protein-coding Gene                                           20703
Non-coding RNA                                                18064
GO Term                                                       17281
Mus musculus; gene; protein-coding gene                       17096
Pseudogene                                                    16861
brown rat; gene; protein-coding gene                          16564
Disease                                                        8785
protein family                                                 7187
Gene                                                           4533
protein domain                                                 4245
mature microRNA                                                2285
Saccharomyces cerevisiae S288c; gene; protein-coding gene      1332
Other                                           

In [28]:
neo_edges[':TYPE'].nunique()

4442

In [29]:
neo_edges[':TYPE'].value_counts()[:100]

regulates-(molecular-biology)_MMrPG    278713
instance-of_CiCS                       156788
biological-process_PbGT                153189
cell-component_PcGT                     96891
molecular-function_PmGT                 81543
has-part_PhPD                           49916
found-in-taxon_PfT                      43568
subclass-of_PsBN                        25042
encodes_PGeP                            24668
encoded-by_PePG                         24519
subclass-of_GTsGT                       20744
subclass-of_PGsG                        20702
found-in-taxon_PGfT                     20702
subclass-of_PGsNAS                      20702
chromosome_PGcC                         19293
strand-orientation_PGsSO                19256
subclass-of_NRsR                        18069
subclass-of_NRsNAS                      18064
found-in-taxon_NRfT                     18064
ortholog_PGoMMGPG                       17218
ortholog_MMGPGoPG                       17143
found-in-taxon_MMGPGfT            

In [30]:
# Edge types that each account for more than 0.1% of all exported edges,
# in descending order of frequency.
min_edge_count = .001*len(neo_edges)
edge_type_counts = neo_edges[':TYPE'].value_counts().to_dict()
edges_to_keep = [
    edge_type
    for edge_type, n_edges in edge_type_counts.items()
    if n_edges > min_edge_count
]

In [31]:
len(edges_to_keep)

72

In [34]:
neo_nodes[':LABEL'].value_counts().to_csv('data/node_types_counts.csv')

In [36]:
neo_edges[':TYPE'].value_counts().to_csv('data/edge_types_counts.csv')

In [26]:
neo_nodes.to_csv('data/neo_nodes_n1.csv', index=False)

In [27]:
neo_edges.to_csv('data/neo_edges_n1.csv', index=False)

## Get types for all S and O uris

In [41]:
# Filter out o's that aren't in the S set.
# Series.isin performs the same membership test as query('o in [...]') but
# works directly on the set, instead of rendering every subject URI into one
# enormous expression string that pandas then has to parse.
s_set = set(full_list['s'])
full_list1 = full_list[full_list['o'].isin(s_set)]

In [72]:
# Get the types already assigned to 's' values in a dict.
# Zipping the two columns builds the same mapping as iterating row by row
# (later rows still win for a repeated subject) without constructing a Series
# for every edge.
test_dict = dict(zip(full_list1['s'], full_list1['type']))
# Types from n1_dict1 take precedence over the per-row types.
test_dict.update(n1_dict1)

#### Make a DataFrame of source node type, edge, and target node type

In [93]:
def edge_to_edge_type(row):
    """Describe one edge row as [source node type, edge label, target node type].

    Node types are looked up in the module-level ``test_dict`` using the row's
    's' and 'o' values; a URI without an entry is typed 'Other'. The edge label
    is the row's 'pLabel' value.
    """
    source_type, target_type = (
        test_dict.get(row[column], 'Other') for column in ('s', 'o')
    )
    return [source_type, row['pLabel'], target_type]

In [142]:
type_df = pd.DataFrame()

In [95]:
%%time
# Row-wise apply: every edge row of full_list1 is passed to edge_to_edge_type.
# This is the slow step of the section (see the timing printed below).
type_series = full_list1.apply(edge_to_edge_type, axis = 1)

CPU times: user 1min 24s, sys: 52 ms, total: 1min 24s
Wall time: 1min 24s


In [97]:
# Expand the per-edge results into three columns of type_df.
type_df[['start', 'edge', 'end']] = pd.DataFrame(type_series.tolist())

In [144]:
len(type_df)

1578644

In [209]:
# One row per node label, with the number of nodes carrying it in a 'count' column.
node_info = neo_nodes[':LABEL'].value_counts().to_frame('count')

In [210]:
# Number of edge endpoints per node type: appearances as a start type plus
# appearances as an end type.
start_counts = type_df['start'].value_counts()
end_counts = type_df['end'].value_counts()
# fill_value=0 keeps the count of a type that occurs on only one side of the
# edges. A plain `+` aligns on the index and yields NaN for such a type, which
# a later fillna(0) would silently turn into zero.
endpoint_counts = start_counts.add(end_counts, fill_value=0)
# Types that never appear in type_df get an explicit 0.
node_info['edge_count'] = endpoint_counts.reindex(node_info.index, fill_value=0).astype(int)
node_info['edge_per_node'] = node_info['edge_count'].astype(float) / node_info['count'].astype(float)

In [184]:
gene_types_list = list(set(gene_types['label']))

In [177]:
def count_type(row, e_type):
    """Count edges in ``type_df`` that link this row's node type with ``e_type``.

    Parameters
    ----------
    row : a row of ``node_info``; its ``name`` (the index label) is the node type.
    e_type : node type at the other end of the edge.

    Returns
    -------
    int
        Number of rows of ``type_df`` whose (start, end) pair is
        (row type, ``e_type``) or (``e_type``, row type). An edge matching
        both orientations (row type equal to ``e_type``) is counted once.
    """
    idx = row.name
    starts = type_df['start']
    ends = type_df['end']
    # Boolean masks rather than a formatted query string: a label containing
    # a double quote can no longer break the expression, and nothing has to
    # be parsed on each call.
    forward = (starts == idx) & (ends == e_type)
    backward = (starts == e_type) & (ends == idx)
    return int((forward | backward).sum())

In [190]:
def count_gene_links(row):
    """Count edges in ``type_df`` that link this row's node type with any gene type.

    Parameters
    ----------
    row : a row of ``node_info``; its ``name`` (the index label) is the node type.

    Returns
    -------
    int
        Number of rows of ``type_df`` where one end is the row's type and the
        other end is one of the labels in the module-level ``gene_types_list``.
        Each matching edge is counted once.
    """
    idx = row.name
    starts = type_df['start']
    ends = type_df['end']
    # isin() on the columns replaces the formatted query string, so labels
    # containing quote characters cannot break the expression.
    forward = (starts == idx) & ends.isin(gene_types_list)
    backward = starts.isin(gene_types_list) & (ends == idx)
    return int((forward | backward).sum())

In [211]:
%%time
node_info['comp_links'] = node_info.apply(lambda row: count_type(row, 'Compound'), axis = 1)
node_info['gene_links'] = node_info.apply(count_gene_links, axis = 1)
node_info['prot_links'] = node_info.apply(lambda row: count_type(row, 'Protein'), axis = 1)
node_info['dis_links'] = node_info.apply(lambda row: count_type(row, 'Disease'), axis = 1)

CPU times: user 25min 2s, sys: 248 ms, total: 25min 2s
Wall time: 24min 46s


In [212]:
node_info.head()

Unnamed: 0,count,edge_count,edge_per_node,comp_links,gene_links,prot_links,dis_links
Compound,156692,203118,1.296288,4368,43,7258,5750
Protein,26706,528464,19.788212,7258,54172,16,17
Protein-coding Gene,20703,503434,24.316959,4,0,49191,4343
Non-coding RNA,18064,68349,3.783713,0,0,3733,41
GO Term,17281,399548,23.120653,56,0,331603,22


In [213]:
# Total links to any base node kind, then that total expressed as a fraction
# of the type's edge count and as links per node of the type.
base_links = (
    node_info['comp_links']
    + node_info['gene_links']
    + node_info['prot_links']
    + node_info['dis_links']
)
node_info['basenode_links'] = base_links
node_info['basenode_fract'] = base_links.astype(float) / node_info['edge_count'].astype(float)
node_info['basenode_prop'] = base_links.astype(float) / node_info['count'].astype(float)

In [214]:
node_info.head(20)

Unnamed: 0,count,edge_count,edge_per_node,comp_links,gene_links,prot_links,dis_links,basenode_links,basenode_fract,basenode_prop
Compound,156692,203118,1.296288,4368,43,7258,5750,17419,0.085758,0.111167
Protein,26706,528464,19.788212,7258,54172,16,17,61463,0.116305,2.301468
Protein-coding Gene,20703,503434,24.316959,4,0,49191,4343,53538,0.106346,2.586002
Non-coding RNA,18064,68349,3.783713,0,0,3733,41,3774,0.055217,0.208924
GO Term,17281,399548,23.120653,56,0,331603,22,331681,0.830141,19.193392
Mus musculus; gene; protein-coding gene,17096,139489,8.15916,0,34372,46,0,34418,0.246743,2.013219
Pseudogene,16861,53542,3.175494,0,0,896,16,912,0.017033,0.054089
brown rat; gene; protein-coding gene,16564,134971,8.148454,0,33341,3,0,33344,0.247046,2.01304
Disease,8785,46817,5.329197,5750,4402,17,10440,20609,0.440203,2.345931
protein family,7187,32936,4.582719,1,0,14296,0,14297,0.434084,1.989286


In [215]:
node_info.to_csv('data/node_info_full.csv')

In [219]:
node_info.query('basenode_prop > 1 and basenode_fract > .8')

Unnamed: 0,count,edge_count,edge_per_node,comp_links,gene_links,prot_links,dis_links,basenode_links,basenode_fract,basenode_prop
GO Term,17281,399548,23.120653,56,0,331603,22,331681,0.830141,19.193392
protein domain,4245,61553,14.500118,5,0,49930,0,49935,0.811252,11.763251
mature microRNA,2285,292309,127.925164,0,280347,4934,0,285281,0.975957,124.849453
Other,981,19026,19.394495,708,16867,11,767,18353,0.964627,18.708461
taxon,129,340,2.635659,23,0,0,258,281,0.826471,2.178295
Structural motif,115,2857,24.843478,0,0,2466,0,2466,0.863143,21.443478
medical specialty,50,2397,47.940000,1,0,0,2031,2032,0.847726,40.640000
chemical substance,42,157089,3740.214286,156872,0,0,3,156875,0.998638,3735.119048
chemical element,37,7246,195.837838,6942,6,13,2,6963,0.960944,188.189189
Chromosome,25,29856,1194.240000,0,25671,1671,0,27342,0.915796,1093.680000
