# Format the resulting edges for import into neo4j and generate the files needed for the learn pipeline

In [1]:
import pandas as pd
import sys
sys.path.append('../py')
import processing as pt

In [2]:
# Load the unfiltered edge table stored under the 'edges' key of the HDF5 file.
edges = pd.read_hdf('data/all_edges_unfiltered.h5', key='edges')

## Examine the effect of different cutoff values for removing low count edge types

In [3]:
# Baseline run: the last argument (0.1) is the low-count edge-type cutoff that is
# compared against 0.05 and 0.01 below. `prepped`, `node_types` and `reciprocals`
# from this call are the values reused by the rest of the notebook.
# NOTE(review): the meaning of the second argument (0.5) is defined in
# `processing.prep_for_export` and is not visible here — confirm.
prepped, node_types, reciprocals = pt.prep_for_export(edges, 0.5, 0.1)

In [4]:
# Same export with the cutoff lowered to 0.05. Only `p1` is read by the cells
# shown in this notebook; `nt` and `r` just receive the other two return values.
p1, nt, r = pt.prep_for_export(edges, 0.5, 0.05)

In [5]:
# Same export with the cutoff lowered to 0.01. Only `p2` is read by the cells
# shown in this notebook; `nt` and `r` are rebound and overwrite the previous values.
p2, nt, r = pt.prep_for_export(edges, 0.5, 0.01)

In [6]:
# Distinct edge types ('e_type' column) that survive each cutoff:
# e0 -> 0.1, e1 -> 0.05, e2 -> 0.01.
e0, e1, e2 = (set(frame['e_type']) for frame in (prepped, p1, p2))

# Report how many edge types remain per cutoff.
# NOTE: the 'Tyeps' spelling is kept as-is so the message matches the recorded output.
for edge_types, cutoff in zip((e0, e1, e2), (0.1, 0.05, 0.01)):
    print('Cutoff: {}\tEdge Tyeps: {}'.format(cutoff, len(edge_types)))

Cutoff: 0.1	Edge Tyeps: 51
Cutoff: 0.05	Edge Tyeps: 57
Cutoff: 0.01	Edge Tyeps: 75


In [7]:
# Edge types kept at cutoff 0.05 that were removed at cutoff 0.1.
e1.difference(e0)

{'chromosome_GcGT',
 'chromosome_MMcGT',
 'chromosome_PGcGT',
 'drug-used-for-treatment_CHduftC',
 'has-part_PCGhpPTM',
 'subclass-of_DsoS'}

In [8]:
# Edge types kept at cutoff 0.01 that were removed at cutoff 0.05.
e2.difference(e1)

{'afflicts_CHaPF',
 'cause-of-death_CHcodD',
 'cause-of_ScoC',
 'chromosome_PCGcGT',
 'different-from_CdfC',
 'genetic-association_DgaNCR',
 'genetic-association_PCGgaD',
 'genetic-association_PGgaD',
 'has-part_ChpC',
 'has-part_PCGhpAS',
 'has-part_PCGhpBS',
 'instance-of_CioC',
 'instance-of_SioGT',
 'part-of_BSpoSM',
 'part-of_BSpoSS',
 'part-of_SSpoSM',
 'subclass-of_SsoD',
 'subclass-of_SsoS',
 'symptoms_DsD'}

## Cutoff values for low count edges compared

- 0.1
    - 51 edge types
    
    
- 0.05
    - 57 edge types
    - Chromosome edge to X chromosome GO Term (probably an inappropriate type)
        - 'chromosome_GcGT'
        - 'chromosome_MMcGT'
        - 'chromosome_PGcGT'
    - 'drug-used-for-treatment_CHduftC'
        - Drugs that treat chemical hazards
    - 'has-part_PCGhpPTM'
        - Protein coding genes have part post-trans mod <- only 1
    - 'subclass-of_DsoS'
        - Diseases are subclasses of Symptoms
    
    
- 0.01
    - 75 edge types
        - Chem Hazard afflicts Protein Fam
        - Chem Hazard cause of death Disease
        - Chem Hazard cause of Symptom
        - compound different from compound
        - nc-RNA genetic association with Disease
        - PG genetic association with Disease
    - Many edge types with count of 1

### Format to neo4j and save the files

In [9]:
# Build the node and edge tables for the neo4j import from the 0.1-cutoff
# edges (`prepped`). The column layout is whatever the `processing` helpers
# emit; the node table has a ':LABEL' column (used in the next cell).
node_neo = pt.format_nodes_neo(prepped, node_types)
edge_neo = pt.format_edges_neo(prepped)

In [10]:
# How many nodes of each type are represented? Tally the ':LABEL' column of
# the formatted node table; value_counts() lists the most frequent label first.
node_neo[':LABEL'].value_counts()

Protein                            26907
GO Term                            21148
Protein-coding Gene                19321
Protein Family                     12832
Disease                             7482
Protein Domain                      6321
Compound                            2615
Mature MicroRNA                     2583
Chemical Hazard                      675
Supersecondary Structure             643
Non-coding RNA                       524
Pseudo Gene                          440
Gene                                 189
Structural Motif                     144
Active Site                          121
Binding Site                          71
Symptom                               31
Post-translational Modification       16
Name: :LABEL, dtype: int64

In [11]:
# Write both neo4j tables to CSV under data/; index=False leaves the pandas
# index out of the files.
node_neo.to_csv('data/hetnet_nodes.csv', index=False)
edge_neo.to_csv('data/hetnet_edges.csv', index=False)

## Make the metagraph needed for the learn pipeline

#### Determine the maximum metapath length we should use

In [12]:
# Derive the two inputs for the hetio MetaGraph from the 0.1-cutoff edges:
# `met` (metaedge tuples) and `abv` (abbreviation dict), both passed to
# MetaGraph.from_edge_tuples in the next cell.
# NOTE(review): the exact structure of both values is defined in `processing`
# and is not visible here — confirm before relying on it.
met = pt.get_metaedge_tuples(prepped, node_types, reciprocals)
abv = pt.get_abbrev_dict(prepped, node_types)

In [13]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# first import cell so all dependencies are visible up front.
from hetio.hetnet import MetaGraph

# Build the metagraph from the metaedge tuples (`met`) and abbreviations (`abv`).
mg = MetaGraph.from_edge_tuples(met, abv)

In [14]:
# Count the Compound -> Disease metapaths for each candidate maximum path length.
num4, num5, num6 = (
    len(mg.extract_metapaths('Compound', 'Disease', max_len))
    for max_len in (4, 5, 6)
)

# Print one summary line per maximum length.
for max_len, n_paths in zip((4, 5, 6), (num4, num5, num6)):
    print('For a max path length of {}, there are {} metapaths'.format(max_len, n_paths))

For a max path length of 4, there are 291 metapaths
For a max path length of 5, there are 1699 metapaths
For a max path length of 6, there are 9914 metapaths


## Metagraph Analysis

Metapaths of length 4:
- 291 pathtypes
- AS, BS, GT, NCR, PTM, PD, PF, PG, SM, SS - none of these node types are included
    - Active Site
    - Binding Site
    - GO Term
    - Non-Coding RNA
    - Post-translational Modification
    - Protein Domain
    - Protein Family
    - Pseudo Gene
    - Structural Motif
    - Supersecondary Structure
  
Metapaths of length 5:
- 1699 pathtypes
- SM - Still not represented
- Picked up: All remaining!
    - AS: 2
    - BS: 2
    - GT: 18 (though some might not be valid, probably only about 6-8 valid)
        - protein -mf-> GO TERM -bp-> protein (etc.)
    - MM: 10
    - NCR: 3
    - PD: 2
    - PF: 2
    - PG: 3
    - SS: 2
    - SM: 2
- When you think about it, this is like metapath length 4 in Rephetio
    - CGD gets you there 1 step faster than CPGD
    - Example: Compound **-binds-** Protein **-has-** Structural Motif **-has-** Protein **-encoded-** Gene **-associated-** Disease 


Metapaths of length 6:
- 9,914 pathtypes
- Lots more examples here; it would be great to use them if we had the computational resources

## Make Hetio information

In [15]:
# Prepare the hetio information from the 0.1-cutoff edges. The recorded output
# shows the call reports how many nodes and edges were added to the graph.
# NOTE(review): whether this writes files, and where, is decided inside
# `processing.prep_hetio` — not visible here; confirm.
pt.prep_hetio(prepped, node_types, reciprocals)

Added 102063 nodes to graph
Added 848121 edges to graph
