# Network propagation development

- connecting results to graphs
- applying personalized PageRank (PPR)

In [1]:
import os

import pandas as pd
import numpy as np

from napistu.ingestion import sbml
from napistu import sbml_dfs_core
from napistu import mechanism_matching

In [2]:
# Directory holding the napistu test fixtures. The default is the original
# developer checkout; set the NAPISTU_TEST_DATA environment variable to point
# the notebook at a different location without editing this cell.
PATH_TO_TEST_DATA = os.path.expanduser(
    os.environ.get(
        "NAPISTU_TEST_DATA",
        "~/Desktop/GITHUB/napistu/lib/napistu-py/src/tests/test_data",
    )
)
example_pathway = os.path.join(PATH_TO_TEST_DATA, "reactome_glucose_metabolism.sbml")
# Fail early, naming the path that was checked, if the example SBML file is missing
assert os.path.exists(example_pathway), f"example pathway not found: {example_pathway}"

In [3]:
# Load the example pathway file into an SBML_dfs object
sbml_model = sbml.SBML(example_pathway)
sbml_dfs = sbml_dfs_core.SBML_dfs(sbml_model)

# Keep species identifiers carrying the BQB_IS qualifier and drop those
# whose ontology is reactome
all_species_identifiers = sbml_dfs.get_identifiers("species")
species_identifiers = all_species_identifiers.query(
    "bqb == 'BQB_IS' and ontology != 'reactome'"
)

INFO:napistu.utils:creating an edgelist linking index levels s_id, entry and linking it to levels defined by ontology, identifier
DEBUG:napistu.utils:label is not defined in table_schema; adding a constant (1)


In [4]:
# create a table of faux-measurements keyed by (ontology, identifier),
# keeping at most 10 identifiers per ontology; the row labels are carried
# over from species_identifiers.
# .copy() detaches the subset from species_identifiers so the column
# assignments below do not raise SettingWithCopyWarning.
example_data = (
    species_identifiers
    .groupby("ontology")
    .head(10)[["ontology", "identifier"]]
    .copy()
)

# seeded generator so the faux-measurements are identical on every run
rng = np.random.default_rng(0)
example_data["results_a"] = rng.standard_normal(len(example_data))
example_data["results_b"] = rng.standard_normal(len(example_data))
# add a feature_id column to the example_data which tracks the row of the original data
example_data["feature_id"] = range(0, len(example_data))
example_data

Unnamed: 0,ontology,identifier,results_a,results_b,feature_id
0,chebi,17925,-0.373569,-0.717671,0
5,uniprot,Q9NQR9,1.612987,0.053992,1
13,chebi,58225,0.538856,-0.70991,2
16,chebi,15377,0.256441,0.692296,3
19,chebi,18367,-0.59862,-1.664589,4
23,uniprot,O43826,1.407006,-0.693861,5
46,chebi,57540,0.870453,-0.687041,6
49,chebi,30797,0.303877,-0.321269,7
52,chebi,57945,-0.076696,-1.002339,8
55,chebi,30744,-0.560509,-0.422192,9


In [5]:
# pivot (identifier, ontology) to columns for each ontology
example_data_wide = example_data.pivot(columns = "ontology", values = "identifier", index = ["feature_id", "results_a", "results_b"]).reset_index().rename_axis(None, axis = 1)

example_data_wide

Unnamed: 0,feature_id,results_a,results_b,chebi,uniprot
0,0,-0.373569,-0.717671,17925.0,
1,1,1.612987,0.053992,,Q9NQR9
2,2,0.538856,-0.70991,58225.0,
3,3,0.256441,0.692296,15377.0,
4,4,-0.59862,-1.664589,18367.0,
5,5,1.407006,-0.693861,,O43826
6,6,0.870453,-0.687041,57540.0,
7,7,0.303877,-0.321269,30797.0,
8,8,-0.076696,-1.002339,57945.0,
9,9,-0.560509,-0.422192,30744.0,


In [6]:
# Split the long table into one table per ontology (keyed by ontology name);
# the now-constant "ontology" column is dropped from each piece.
results_tables = {}
for ont in example_data["ontology"].unique():
    in_ontology = example_data["ontology"] == ont
    results_tables[ont] = example_data.loc[in_ontology].drop(columns="ontology")

results_tables

{'chebi':    identifier  results_a  results_b  feature_id
 0       17925  -0.373569  -0.717671           0
 13      58225   0.538856  -0.709910           2
 16      15377   0.256441   0.692296           3
 19      18367  -0.598620  -1.664589           4
 46      57540   0.870453  -0.687041           6
 49      30797   0.303877  -0.321269           7
 52      57945  -0.076696  -1.002339           8
 55      30744  -0.560509  -0.422192           9
 58      15378   0.304291   0.772504          10
 89      16810  -0.432954   0.234474          12,
 'uniprot':     identifier  results_a  results_b  feature_id
 5       Q9NQR9   1.612987   0.053992           1
 23      O43826   1.407006  -0.693861           5
 61      Q9UBX3  -1.467092   0.518391          11
 127     P53007   0.538888  -0.484291          13
 178     P35558   0.035667   0.657693          14
 241     Q16822   1.207939  -0.305081          15
 316     P35575   0.028248   0.589675          16
 399     Q9BUM1  -0.483691  -0.298671   

In [7]:
import utils

# Options for matching features to pathway species
# (numbering kept from the original notes; strategy #2 has no code in this cell)

# 1. match by identifier and a set of ontologies (provided by arg).
long_features = example_data.drop(columns="ontology")
matched_s_ids = mechanism_matching.features_to_pathway_species(
    feature_identifiers=long_features,
    species_identifiers=species_identifiers,
    ontologies={"uniprot", "chebi"},
    feature_identifiers_var="identifier",
)

# 3. format wide identifier sets into a table with a single identifier column and apply strategy #2.
matched_s_ids_from_wide = mechanism_matching.match_features_to_wide_pathway_species(
    example_data_wide,
    species_identifiers,
    ontologies={"uniprot", "chebi"},
    feature_identifiers_var="identifier",
)

# TODO: check for equivalence of the three strategies

# 4. format multiple tables by applying strategy #3 multiple times; storing results as separate `species_data` tables.
## this may depend more on the structure and whether measures are defined over all modalities or only a subset.

DEBUG:napistu.mechanism_matching:Validated ontology columns: {'chebi', 'uniprot'}
INFO:napistu.mechanism_matching:Using columns as results: ['results_a', 'feature_id', 'results_b']
DEBUG:napistu.mechanism_matching:Final long format shape: (20, 5)
DEBUG:napistu.mechanism_matching:Matching 10 features to 38 species for ontology chebi
DEBUG:napistu.mechanism_matching:Matching 10 features to 10 species for ontology uniprot
INFO:napistu.mechanism_matching:Found 20 total matches across 2 ontologies


In [8]:
# inspect the matches produced from the wide-format example table
matched_s_ids_from_wide

Unnamed: 0,results_a,feature_id,results_b,identifier,s_id,entry,ontology,url,bqb,s_name,s_Source
0,-0.373569,0,-0.717671,17925,S00000000,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,Glc,<napistu.source.Source object at 0x1455c9850>
1,0.538856,2,-0.70991,58225,S00000002,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,G6P,<napistu.source.Source object at 0x147881e10>
2,0.256441,3,0.692296,15377,S00000003,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,H2O,<napistu.source.Source object at 0x142557c90>
3,-0.59862,4,-1.664589,18367,S00000004,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,Pi,<napistu.source.Source object at 0x110ba5b50>
4,0.870453,6,-0.687041,57540,S00000007,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,NAD+,<napistu.source.Source object at 0x1439ee950>
5,0.303877,7,-0.321269,30797,S00000008,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,MAL,<napistu.source.Source object at 0x147855bd0>
6,-0.076696,8,-1.002339,57945,S00000009,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,NADH,<napistu.source.Source object at 0x14366e3d0>
7,-0.560509,9,-0.422192,30744,S00000010,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,OAA,<napistu.source.Source object at 0x147857710>
8,0.304291,10,0.772504,15378,S00000011,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,H+,<napistu.source.Source object at 0x147856690>
9,-0.432954,12,0.234474,16810,S00000014,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,2OG,<napistu.source.Source object at 0x147850bd0>


In [9]:
import test_utils
# Run the local test_utils checks one after another, in the original order;
# the first one that raises stops the cell.
for test_name in (
    "test_drop_extra_cols",
    "test_resolve_matches_with_example_data",
    "test_resolve_matches_first_method",
    "test_resolve_matches_invalid_dtypes",
    "test_resolve_matches_deduplicate_feature_id_within_sid",
):
    getattr(test_utils, test_name)()

In [10]:
from napistu import identifiers

from typing import Optional, Union, Set, Dict
import logging

logger = logging.getLogger(__name__)

# match a table containing identifiers from 1+ ontologies plus additional results columns
def bind_wide_results(
    sbml_dfs : sbml_dfs_core.SBML_dfs,
    results_df : pd.DataFrame,
    results_name : str,
    ontologies : Optional[Union[Set[str], Dict[str, str]]] = None,
    dogmatic : bool = False,
    species_identifiers : Optional[pd.DataFrame] = None,
    verbose : bool = False
) -> sbml_dfs_core.SBML_dfs:
    """
    Bind a wide results table to an sbml_dfs object.

    Take a table with molecular species-level attributes tied to systematic
    identifiers, match its rows to the species of an sbml_dfs model, and
    store the matched attributes as a new species_data table.

    Parameters
    ----------
    sbml_dfs : sbml_dfs_core.SBML_dfs
        The sbml_dfs object to bind the results to. `add_species_data` is
        called on this object and the same object is returned.
    results_df : pd.DataFrame
        The wide table containing the results to bind: identifier columns
        (one per ontology) plus results columns.
    results_name : str
        The name under which the results are added to the species data.
    ontologies : Optional[Union[Set[str], Dict[str, str]]]
        The ontologies to use for matching. Passed through unchanged to
        `mechanism_matching.match_features_to_wide_pathway_species`.
    dogmatic : bool
        Whether to respect differences between genes, transcripts, and
        proteins (True) or ignore them (False).
    species_identifiers : Optional[pd.DataFrame]
        Systematic identifiers for the molecular species in `sbml_dfs`.
        If None, these will be generated on-the-fly.
    verbose : bool
        Whether to log cases of 1-to-many and many-to-one mapping and to
        indicate the behavior for resolving degeneracy.

    Returns
    -------
    sbml_dfs : sbml_dfs_core.SBML_dfs
        The sbml_dfs object with the results bound.
    """

    species_identifiers = identifiers._prepare_species_identifiers(
        sbml_dfs,
        dogmatic = dogmatic,
        species_identifiers = species_identifiers
    )

    # match the caller's table (not a notebook-level global) to pathway species
    matched_s_ids_from_wide = mechanism_matching.match_features_to_wide_pathway_species(
        results_df,
        species_identifiers,
        ontologies = ontologies,
        feature_identifiers_var = "identifier",
        verbose = verbose
    )

    # presumably collapses 1-to-many / many-to-one matches -- see utils.resolve_matches
    disambiguated_matches = utils.resolve_matches(
        matched_data = matched_s_ids_from_wide,
    )

    # the original table is passed alongside the matches; presumably it
    # determines which columns are kept -- see utils._drop_extra_cols
    clean_species_data = utils._drop_extra_cols(
        results_df,
        disambiguated_matches
    )

    sbml_dfs.add_species_data(
        results_name,
        clean_species_data
    )

    return sbml_dfs

# Bind the wide example table to the pathway model under the name "results"
bind_wide_results(
    sbml_dfs=sbml_dfs,
    results_df=example_data_wide,
    results_name="results",
    ontologies={"uniprot", "chebi"},
    dogmatic=False,
    species_identifiers=None,
    verbose=True,
)


INFO:napistu.sbml_dfs_utils:Running in non-dogmatic mode - genes, transcripts, and proteins will be merged if possible.
  promiscuous_component_identifiers = pd.Series(
DEBUG:napistu.mechanism_matching:Validated ontology columns: {'chebi', 'uniprot'}
INFO:napistu.mechanism_matching:Using columns as results: ['results_a', 'feature_id', 'results_b']
DEBUG:napistu.mechanism_matching:Final long format shape: (20, 5)
DEBUG:napistu.mechanism_matching:Matching 10 features to 48 species for ontology chebi
DEBUG:napistu.mechanism_matching:Matching 10 features to 98 species for ontology uniprot
INFO:napistu.mechanism_matching:Found 25 total matches across 2 ontologies
INFO:napistu.mechanism_matching:100.0% of feature_ids are present one or more times in the output (20/20)
INFO:napistu.mechanism_matching:2 s_id(s) map to more than one feature_id.
INFO:napistu.mechanism_matching:Examples of s_id mapping to multiple feature_ids (showing up to 3):
s_id       s_name           
S00000056  GCK1:GKRP co

NameError: name 'sbml_dfs_w_data' is not defined

In [11]:
# show the species_data table stored under the "results" key
sbml_dfs.species_data["results"]

Unnamed: 0_level_0,feature_id,results_a,results_b
s_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S00000000,0,-0.373569,-0.717671
S00000001,1,1.612987,0.053992
S00000002,2,0.538856,-0.70991
S00000003,3,0.256441,0.692296
S00000004,4,-0.59862,-1.664589
S00000005,5,1.407006,-0.693861
S00000007,6,0.870453,-0.687041
S00000008,7,0.303877,-0.321269
S00000009,8,-0.076696,-1.002339
S00000010,9,-0.560509,-0.422192
