# Format the resulting edges for import to neo4j and get files needed for learn pipeline

In [1]:
import sys

import pandas as pd

# Make the project-local helper modules in ../py importable
sys.path.append('../py')

from hetio.hetnet import MetaGraph
import processing as pt

In [2]:
# Load the unfiltered edge table stored under the 'edges' key of the HDF5 file
edges = pd.read_hdf('data/all_edges_unfiltered.h5', key='edges')

## Examine the effect of different cutoff values for removing low count edge types

In [3]:
# Baseline run with the low-count cutoff (third argument) set to 0, i.e. no filtering.
# Node types should return the same every time, so `nt` is simply overwritten by each
# later run instead of keeping separate copies.
# NOTE(review): the second argument (0.5) is held fixed across every run; its meaning
# is defined in processing.prep_for_export -- confirm there.
p0, nt, r0 = pt.prep_for_export(edges, 0.5, 0)

In [4]:
# Same export prep with the low-count edge-type cutoff (third argument) at 0.01;
# `nt` is overwritten because node types are expected to be identical across runs.
p01, nt, r01 = pt.prep_for_export(edges, 0.5, 0.01)

In [5]:
# Same export prep with the low-count edge-type cutoff (third argument) at 0.05;
# `nt` is overwritten because node types are expected to be identical across runs.
p05, nt, r05 = pt.prep_for_export(edges, 0.5, 0.05)

In [6]:
# Same export prep with the low-count edge-type cutoff (third argument) at 0.1;
# `nt` is overwritten because node types are expected to be identical across runs.
p1, nt, r1 = pt.prep_for_export(edges, 0.5, 0.1)

In [7]:
# Same export prep with the low-count edge-type cutoff (third argument) at 0.15;
# `nt` is overwritten because node types are expected to be identical across runs.
p15, nt, r15 = pt.prep_for_export(edges, 0.5, 0.15)

In [8]:
# Same export prep with the low-count edge-type cutoff (third argument) at 0.2;
# `nt` is overwritten because node types are expected to be identical across runs.
p2, nt, r2 = pt.prep_for_export(edges, 0.5, 0.2)

In [9]:
# Same export prep with the low-count edge-type cutoff (third argument) at 0.25;
# `nt` is overwritten because node types are expected to be identical across runs.
p25, nt, r25 = pt.prep_for_export(edges, 0.5, 0.25)

In [10]:
# Same export prep with the low-count edge-type cutoff (third argument) at 0.3;
# `nt` is overwritten because node types are expected to be identical across runs.
p3, nt, r3 = pt.prep_for_export(edges, 0.5, 0.3)

In [11]:
# Same export prep with the low-count edge-type cutoff (third argument) at 0.5, the
# strictest value tried; `nt` keeps the node types returned by this final call.
p5, nt, r5 = pt.prep_for_export(edges, 0.5, 0.5)

In [12]:
# Collect the distinct edge types that survive each cutoff
e0, e01, e05, e1, e15, e2, e25, e3, e5 = [
    set(prepped_edges['e_type'])
    for prepped_edges in (p0, p01, p05, p1, p15, p2, p25, p3, p5)
]

# Report how many edge types remain at each cutoff.
# NOTE(review): 'Tyeps' is a typo for 'Types'; the literal is left untouched here so
# the printed output stays exactly as recorded.
cutoffs = [0, .01, .05, .1, .15, .2, .25, .3, .5]
edge_type_sets = [e0, e01, e05, e1, e15, e2, e25, e3, e5]
for cutoff, edge_types in zip(cutoffs, edge_type_sets):
    print('Cutoff: {}\tEdge Tyeps: {}'.format(cutoff, len(edge_types)))

Cutoff: 0	Edge Tyeps: 183
Cutoff: 0.01	Edge Tyeps: 75
Cutoff: 0.05	Edge Tyeps: 57
Cutoff: 0.1	Edge Tyeps: 51
Cutoff: 0.15	Edge Tyeps: 50
Cutoff: 0.2	Edge Tyeps: 48
Cutoff: 0.25	Edge Tyeps: 45
Cutoff: 0.3	Edge Tyeps: 44
Cutoff: 0.5	Edge Tyeps: 39


In [13]:
# Summarise the unfiltered edge set: distinct edge types, and how many of them occur
# more than once / more than ten times
p0_type_counts = p0['e_type'].value_counts()
p0_n_types = len(set(p0['e_type']))
p0_n_over_1 = (p0_type_counts > 1).sum()
p0_n_over_10 = (p0_type_counts > 10).sum()

print('--No Cutoff--')
print('Total Number of Edge Types: {}'.format(p0_n_types))
print('Edge Types with more than 1 instance: {}'.format(p0_n_over_1))
print('Edge Types with more than 10 instances: {}'.format(p0_n_over_10))

--No Cutoff--
Total Number of Edge Types: 183
Edge Types with more than 1 instance: 124
Edge Types with more than 10 instances: 76


Without filtering, 183 edge types is just too many. Many (over 50) edge types have just a single edge, and even more (about another 50) have fewer than 10.

In [14]:
# Summarise the 0.01-cutoff edge set: distinct edge types, and how many of them occur
# more than once / more than ten times
p01_type_counts = p01['e_type'].value_counts()
p01_n_types = len(set(p01['e_type']))
p01_n_over_1 = (p01_type_counts > 1).sum()
p01_n_over_10 = (p01_type_counts > 10).sum()

print('--Cutoff of 0.01--')
print('Total Number of Edge Types: {}'.format(p01_n_types))
print('Edge Types with more than 1 instance: {}'.format(p01_n_over_1))
print('Edge Types with more than 10 instances: {}'.format(p01_n_over_10))

--Cutoff of 0.01--
Total Number of Edge Types: 75
Edge Types with more than 1 instance: 67
Edge Types with more than 10 instances: 62


Filtered with a cutoff of 0.01, there are still a lot of edge types at 75, and several with only 1 or a few instances.

In [15]:
# Summarise the 0.05-cutoff edge set: distinct edge types, and how many of them occur
# more than once / more than ten times
p05_type_counts = p05['e_type'].value_counts()
p05_n_types = len(set(p05['e_type']))
p05_n_over_1 = (p05_type_counts > 1).sum()
p05_n_over_10 = (p05_type_counts > 10).sum()

print('--Cutoff of 0.05--')
print('Total Number of Edge Types: {}'.format(p05_n_types))
print('Edge Types with more than 1 instance: {}'.format(p05_n_over_1))
print('Edge Types with more than 10 instances: {}'.format(p05_n_over_10))

--Cutoff of 0.05--
Total Number of Edge Types: 57
Edge Types with more than 1 instance: 56
Edge Types with more than 10 instances: 54


Still have 1 with only 1 instance, and a couple with fewer than 10.  We can compare to later ones and see what is lost

In [16]:
# Summarise the 0.1-cutoff edge set: distinct edge types, and how many of them occur
# more than once / more than ten times
p1_type_counts = p1['e_type'].value_counts()
p1_n_types = len(set(p1['e_type']))
p1_n_over_1 = (p1_type_counts > 1).sum()
p1_n_over_10 = (p1_type_counts > 10).sum()

print('--Cutoff of 0.1--')
print('Total Number of Edge Types: {}'.format(p1_n_types))
print('Edge Types with more than 1 instance: {}'.format(p1_n_over_1))
print('Edge Types with more than 10 instances: {}'.format(p1_n_over_10))

--Cutoff of 0.1--
Total Number of Edge Types: 51
Edge Types with more than 1 instance: 51
Edge Types with more than 10 instances: 50


This looks ok: none have just 1 instance anymore, but one edge type still has fewer than 10.

Let's compare this to the more forgiving and more strict cutoffs of 0.05 and 0.15.

In [17]:
# Edge types present at a cutoff of 0.05 that are gone at 0.1
e05.difference(e1)

{'chromosome_GcGT',
 'chromosome_MMcGT',
 'chromosome_PGcGT',
 'drug-used-for-treatment_CHduftC',
 'has-part_PCGhpPTM',
 'subclass-of_DsoS'}

In [18]:
# Edge types lost when the cutoff is increased from 0.1 to 0.15
e1.difference(e15)

{'part-of_PDpoPD'}

In [19]:
# Sanity check: raising the cutoff should only remove edge types, so nothing should
# exist at 0.15 that is missing at 0.1 (an empty set means nothing changed inadvertently)
e15.difference(e1)

set()

So the only edge type lost is "Protein Domain is part of Protein Domain" (`part-of_PDpoPD`).

In [20]:
# Summarise the 0.2-cutoff edge set: distinct edge types, and how many of them occur
# more than once / more than ten times
p2_type_counts = p2['e_type'].value_counts()
p2_n_types = len(set(p2['e_type']))
p2_n_over_1 = (p2_type_counts > 1).sum()
p2_n_over_10 = (p2_type_counts > 10).sum()

print('--Cutoff of 0.2--')
print('Total Number of Edge Types: {}'.format(p2_n_types))
print('Edge Types with more than 1 instance: {}'.format(p2_n_over_1))
print('Edge Types with more than 10 instances: {}'.format(p2_n_over_10))

--Cutoff of 0.2--
Total Number of Edge Types: 48
Edge Types with more than 1 instance: 48
Edge Types with more than 10 instances: 47


In [21]:
# Edge types present at a cutoff of 0.1 that are gone at 0.2
e1.difference(e2)

{'part-of_PDpoPD', 'regulates-molecular-biology_GTrmbGT', 'subclass-of_GsoC'}

3 edge types lost, none of which seem very useful. They either link 2 non-basenode concepts, or they just don't make sense (how can a gene be a subclass of a compound?).

In [22]:
# Summarise the 0.25-cutoff edge set: distinct edge types, and how many of them occur
# more than once / more than ten times
p25_type_counts = p25['e_type'].value_counts()
p25_n_types = len(set(p25['e_type']))
p25_n_over_1 = (p25_type_counts > 1).sum()
p25_n_over_10 = (p25_type_counts > 10).sum()

print('--Cutoff of 0.25--')
print('Total Number of Edge Types: {}'.format(p25_n_types))
print('Edge Types with more than 1 instance: {}'.format(p25_n_over_1))
print('Edge Types with more than 10 instances: {}'.format(p25_n_over_10))

--Cutoff of 0.25--
Total Number of Edge Types: 45
Edge Types with more than 1 instance: 45
Edge Types with more than 10 instances: 45


In [23]:
# Edge types lost when the cutoff is increased from 0.2 to 0.25
e2.difference(e25)

{'cause-of_CHcoD', 'cause-of_DcoS', 'part-of_GTpoGT'}

Starting to lose more edges, Chemical Hazards-cause->disease, and Disease-cause->Symptoms

These are a problem because of their connection to the basenode Disease. Let's look at how many edges of each type there are.

In [24]:
# Ten least frequent edge types at the 0.25 cutoff (value_counts sorts in descending
# order by default, so the tail holds the smallest counts).
# NOTE(review): p25 no longer contains the edge types dropped between 0.2 and 0.25,
# so their counts cannot be read from this output; p2 would show them -- confirm
# which frame was intended.
p25['e_type'].value_counts().tail(10)

encodes_GeP                           179
regulates-molecular-biology_MMrmbG    139
drug-used-for-treatment_SduftC        135
part-of_ASpoPD                        129
part-of_BSpoPD                         93
part-of_SMpoPD                         93
symptoms_DsS                           52
part-of_PTMpoPF                        35
cause-of_CHcoS                         31
part-of_PTMpoPD                        16
Name: e_type, dtype: int64

31 and 7, so not a lot, but could be problematic.

Let's look at the next cutoff level, between 0.25 and 0.3.

In [25]:
# Edge types lost when the cutoff is increased from 0.25 to 0.3
e25.difference(e3)

{'genetic-association_PCGgaD'}

In [26]:
# Count, per edge type, the rows labelled 'genetic association' in the 0.1-cutoff set
is_genetic_association = p1['pLabel'] == 'genetic association'
p1.loc[is_genetic_association, 'e_type'].value_counts()

genetic-association_PCGgaD    2093
Name: e_type, dtype: int64

Now this is a huge edge, and a nice bridge in the compound-protein-gene-disease link... We can't afford to lose this, so I won't look in detail at any higher cutoffs.

## Summary of different cutoff values for low count edges

- No Cutoff
    - 183 Edge Types
        - over 50 with only a count of 1
        - Another 50 with fewer than 10 counts


- 0.01
    - 75 edge types
    - As compared to 0.05, contains:
        - Chem Hazard afflicts Protein Fam
        - Chem Hazard cause of death Disease
        - Chem Hazard cause of Symptom
        - compound different from compound
        - nc-RNA genetic association with Disease
        - PG genetic association with Disease
    - Many edge types with count of 1


- 0.05
    - 57 edge types
    - As compared to 0.1, contains:
        - Chromosome edge to X chromosome GO Term (probably an inappropriate type)
            - 'chromosome_GcGT'
            - 'chromosome_MMcGT'
            - 'chromosome_PGcGT'
        - 'drug-used-for-treatment_CHduftC'
            - Drugs that treat chemical hazards
        - 'has-part_PCGhpPTM'
            - Protein coding genes have part post-trans mod <- only 1
        - 'subclass-of_DsoS'
            - Diseases are subclasses of Symptoms


- 0.1
    - 51 edge types
    - Decent coverage of edges
    - No edges with only 1 instance
    - 1 edge with fewer than 10 instances
    
    
- 0.15
    - 50 Edge types
    - As compared to 0.1, lost:
        - Protein Domain is part of Protein Domain


- 0.2
    - 48 Edge Types
    - As compared to 0.1, lost:
        - Protein Domain is part of Protein Domain
        - GO Term -regulates-molecular-biology-> GO Term
        - Gene -subclass-of-> Compound
            - Not sure this one makes any sense anyway...

- 0.25
    - 45 Edge Types
    - Lose 2 potentially important edges:
        - Chemical Hazards -cause-> Disease
        - Disease -cause-> Symptoms


- 0.3
    - 44 Edge Types
    - Lost a critical 'Protein-Coding Gene -genetic-association-with-> Disease' edge
    
    


In [29]:
# Carry the 0.2-cutoff results forward as the working data set.
# `nt` holds whatever the most recent prep_for_export call returned; node types are
# expected to be the same for every cutoff.
prepped = p2
node_types = nt
reciprocals = r2

### Format to neo4j and save the files

In [30]:
# Convert the selected edge set into the node and edge tables that are written out
# as CSV files for the neo4j import.
node_neo = pt.format_nodes_neo(prepped, node_types)
edge_neo = pt.format_edges_neo(prepped)

In [31]:
# How many nodes of each type are represented?
node_label_counts = node_neo[':LABEL'].value_counts()
node_label_counts

Protein                            26907
GO Term                            21120
Protein-coding Gene                19321
Protein Family                     12832
Disease                             7482
Protein Domain                      6177
Compound                            2614
Mature MicroRNA                     2583
Chemical Hazard                      675
Supersecondary Structure             643
Non-coding RNA                       524
Pseudo Gene                          440
Gene                                 151
Structural Motif                     144
Active Site                          121
Binding Site                          71
Symptom                               30
Post-translational Modification       16
Name: :LABEL, dtype: int64

In [32]:
# How many nodes of each type are represented?
# NOTE(review): this cell repeats the node-label count from the cell directly above
# and shows the same output -- confirm whether a different summary was intended here.
node_label_counts = node_neo[':LABEL'].value_counts()
node_label_counts

Protein                            26907
GO Term                            21120
Protein-coding Gene                19321
Protein Family                     12832
Disease                             7482
Protein Domain                      6177
Compound                            2614
Mature MicroRNA                     2583
Chemical Hazard                      675
Supersecondary Structure             643
Non-coding RNA                       524
Pseudo Gene                          440
Gene                                 151
Structural Motif                     144
Active Site                          121
Binding Site                          71
Symptom                               30
Post-translational Modification       16
Name: :LABEL, dtype: int64

In [33]:
# Write the neo4j node and edge tables to CSV, leaving out the DataFrame index
for neo_frame, csv_path in (
    (node_neo, 'data/hetnet_nodes.csv'),
    (edge_neo, 'data/hetnet_edges.csv'),
):
    neo_frame.to_csv(csv_path, index=False)

## Make the metagraph needed for the learn pipeline

#### See how long the metapaths we use should be

In [34]:
# Metaedge tuples and node-type abbreviations: the two inputs handed to
# MetaGraph.from_edge_tuples when the metagraph is built.
met = pt.get_metaedge_tuples(prepped, node_types, reciprocals)
abv = pt.get_abbrev_dict(prepped, node_types)

In [35]:
# MetaGraph is imported together with the other dependencies in the first cell, so
# the notebook's requirements are visible in one place.
# Build the metagraph from the metaedge tuples and the node-type abbreviations.
mg = MetaGraph.from_edge_tuples(met, abv)

In [36]:
# Count the Compound -> Disease metapaths found for each candidate maximum path length
max_lengths = [4, 5, 6]
num4, num5, num6 = [
    len(mg.extract_metapaths('Compound', 'Disease', max_len))
    for max_len in max_lengths
]

for max_len, n_paths in zip(max_lengths, [num4, num5, num6]):
    print('For a max path length of {}, there are {} metapaths'.format(max_len, n_paths))

For a max path length of 4, there are 278 metapaths
For a max path length of 5, there are 1608 metapaths
For a max path length of 6, there are 9214 metapaths


## Metagraph Analysis

Metapaths of length 4:
- 278 pathtypes
- AS, BS, GT, NCR, PTM, PD, PF, PG, SM, SS - All not included
    - Active Site
    - Binding Site
    - GO Term
    - Non-Coding RNA
    - Post-translational Modification
    - Protein Domain
    - Protein Family
    - Pseudo Gene
    - Structural Motif
    - Supersecondary Structure
  
Metapaths of length 5:
- 1608 pathtypes
- SM - Still not represented
- Picked up: All remaining!
    - AS: 2
    - BS: 2
    - GT: 18 (though some might not be valid, probably only about 6-8 valid)
        - protein -mf-> GO TERM -bp-> protein (etc.)
    - MM: 10
    - NCR: 3
    - PD: 2
    - PF: 2
    - PG: 3
    - SS: 2
    - SM: 2
- When you think about it, this is like metapath length 4 in Rephetio
    - CGD gets you there 1 step faster than CPGD
    - Example: Compound **-binds-** Protein **-has-** Structural Motif **-has-** Protein **-encoded-** Gene **-associated-** Disease 


Metapaths of length 6:
- 9,214 pathtypes
- Lots more examples here, would be great if we had the computation

## Make Hetio information

In [37]:
# Prepare the hetio information for the learn pipeline from the selected edge set;
# the helper prints how many nodes and edges it added to the graph.
# NOTE(review): whatever files this writes are decided inside processing.prep_hetio --
# confirm there.
pt.prep_hetio(prepped, node_types, reciprocals)

Added 101851 nodes to graph
Added 842952 edges to graph
