# Network propagation development

- connecting results to graphs
- applying PPR

In [1]:
import os

import pandas as pd
import numpy as np

from napistu.ingestion import sbml
from napistu import sbml_dfs_core
from napistu import mechanism_matching

In [2]:
# Directory holding the napistu test fixtures. The default is the original
# developer checkout; set the NAPISTU_TEST_DATA environment variable to point
# the notebook at a different location without editing this cell.
PATH_TO_TEST_DATA = os.path.expanduser(
    os.environ.get(
        "NAPISTU_TEST_DATA",
        "~/Desktop/GITHUB/napistu/lib/napistu-py/src/tests/test_data",
    )
)
example_pathway = os.path.join(PATH_TO_TEST_DATA, "reactome_glucose_metabolism.sbml")
# Fail early, with the offending path in the message, if the fixture is missing.
assert os.path.exists(example_pathway), f"SBML test file not found: {example_pathway}"

In [3]:
# Parse the example SBML file and wrap it in an SBML_dfs pathway model.
pathway_sbml = sbml.SBML(example_pathway)
sbml_dfs = sbml_dfs_core.SBML_dfs(pathway_sbml)

# Species identifiers: keep rows whose qualifier is BQB_IS, then drop the reactome ontology.
all_species_identifiers = sbml_dfs.get_identifiers("species")
bqb_is_identifiers = all_species_identifiers.query("bqb == 'BQB_IS'")
species_identifiers = bqb_is_identifiers.query("ontology != 'reactome'")

INFO:napistu.utils:creating an edgelist linking index levels s_id, entry and linking it to levels defined by ontology, identifier
DEBUG:napistu.utils:label is not defined in table_schema; adding a constant (1)


In [4]:
# Build a table of faux measurements: the first 10 identifiers of each ontology
# plus two columns of random values standing in for experimental results.
# The rows keep the index of `species_identifiers`.
SEED = 1234
# A seeded generator makes the faux results identical on every Restart & Run All.
rng = np.random.default_rng(SEED)

selected_identifiers = species_identifiers.groupby("ontology").head(10)[["ontology", "identifier"]]
n_features = len(selected_identifiers)

# `.assign` returns a new frame, which avoids the SettingWithCopyWarning that
# assigning columns onto a selection of another frame can trigger.
example_data = selected_identifiers.assign(
    results_a = rng.standard_normal(n_features),
    results_b = rng.standard_normal(n_features),
    # feature_id tracks the row of the original data
    feature_id = range(n_features),
)
example_data

Unnamed: 0,ontology,identifier,results_a,results_b,feature_id
0,chebi,17925,-2.740863,0.756076,0
5,uniprot,Q9NQR9,-0.841275,-0.133762,1
13,chebi,58225,-0.503649,-1.629732,2
16,chebi,15377,0.040976,-0.878528,3
19,chebi,18367,2.467045,-1.266263,4
23,uniprot,O43826,0.344171,-1.011381,5
46,chebi,57540,0.069725,-0.226334,6
49,chebi,30797,0.043073,0.669256,7
52,chebi,57945,0.230705,0.157361,8
55,chebi,30744,-0.417795,-1.658783,9


In [5]:
# Reshape long -> wide: the values of "ontology" become columns holding each feature's identifier.
result_columns = ["feature_id", "results_a", "results_b"]
pivoted_identifiers = example_data.pivot(index = result_columns, columns = "ontology", values = "identifier")

# Move the result columns back out of the index and clear the leftover name on the columns axis.
example_data_wide = pivoted_identifiers.reset_index().rename_axis(None, axis = 1)

example_data_wide

Unnamed: 0,feature_id,results_a,results_b,chebi,uniprot
0,0,-2.740863,0.756076,17925.0,
1,1,-0.841275,-0.133762,,Q9NQR9
2,2,-0.503649,-1.629732,58225.0,
3,3,0.040976,-0.878528,15377.0,
4,4,2.467045,-1.266263,18367.0,
5,5,0.344171,-1.011381,,O43826
6,6,0.069725,-0.226334,57540.0,
7,7,0.043073,0.669256,30797.0,
8,8,0.230705,0.157361,57945.0,
9,9,-0.417795,-1.658783,30744.0,


In [6]:
# Split the long table into one results table per ontology, keyed by the ontology name.
results_tables = {}
for ont in example_data["ontology"].unique():
    # The query string refers to the loop variable by name (`@ont`), so it must keep this name.
    ontology_rows = example_data.query("ontology == @ont")
    results_tables[ont] = ontology_rows.drop(columns = "ontology")

results_tables

{'chebi':    identifier  results_a  results_b  feature_id
 0       17925  -2.740863   0.756076           0
 13      58225  -0.503649  -1.629732           2
 16      15377   0.040976  -0.878528           3
 19      18367   2.467045  -1.266263           4
 46      57540   0.069725  -0.226334           6
 49      30797   0.043073   0.669256           7
 52      57945   0.230705   0.157361           8
 55      30744  -0.417795  -1.658783           9
 58      15378  -0.605422  -0.619332          10
 89      16810   0.594403   0.127173          12,
 'uniprot':     identifier  results_a  results_b  feature_id
 5       Q9NQR9  -0.841275  -0.133762           1
 23      O43826   0.344171  -1.011381           5
 61      Q9UBX3  -0.083885  -1.236516          11
 127     P53007  -1.190742   0.330880          13
 178     P35558   2.852637   1.383258          14
 241     Q16822  -1.514976   0.474668          15
 316     P35575   0.118134   0.350530          16
 399     Q9BUM1  -0.473053   0.690574   

In [7]:
# NOTE(review): `utils` is imported here rather than with the imports at the top of the notebook;
# it looks like a module local to this notebook's directory — confirm it resolves on a fresh kernel.
import utils

# Options for matching features to pathway species.
# Strategy 1: match a long table on its single "identifier" column, restricted to the
# set of ontologies passed via `ontologies`.
matched_s_ids = mechanism_matching.features_to_pathway_species(
    feature_identifiers = example_data.drop(columns = "ontology"), 
    species_identifiers = species_identifiers,
    ontologies = {"uniprot", "chebi"},
    feature_identifiers_var = "identifier",
)


# Strategy 3: start from the wide table; per the original note the wide identifier sets are
# formatted into a table with a single identifier column and strategy #2 is then applied.
# NOTE(review): strategy #2 is referenced here but is not defined anywhere in this cell.
matched_s_ids_from_wide = mechanism_matching.match_features_to_wide_pathway_species(
    example_data_wide,
    species_identifiers,
    ontologies = {"uniprot", "chebi"},
    feature_identifiers_var = "identifier",
)

# TODO: check for equivalence of the strategies — no comparison between `matched_s_ids`
# and `matched_s_ids_from_wide` is implemented yet.


# Strategy 4 (not implemented): format multiple tables by applying strategy #3 multiple times,
# storing results as separate `species_data` tables.
## This may depend more on the structure and whether measures are defined over all modalities or only a subset.

DEBUG:napistu.mechanism_matching:Validated ontology columns: {'uniprot', 'chebi'}
INFO:napistu.mechanism_matching:Using columns as results: ['results_b', 'feature_id', 'results_a']
DEBUG:napistu.mechanism_matching:Final long format shape: (20, 5)
DEBUG:napistu.mechanism_matching:Matching 10 features to 10 species for ontology uniprot
DEBUG:napistu.mechanism_matching:Matching 10 features to 38 species for ontology chebi
INFO:napistu.mechanism_matching:Found 20 total matches across 2 ontologies


In [8]:
# Display the matches obtained from the wide-format table.
matched_s_ids_from_wide

Unnamed: 0,results_b,feature_id,results_a,identifier,s_id,entry,ontology,url,bqb,s_name,s_Source
0,-0.133762,1,-0.841275,Q9NQR9,S00000001,0,uniprot,https://purl.uniprot.org/uniprot/Q9NQR9,BQB_IS,G6PC2,<napistu.source.Source object at 0x104664790>
1,-1.011381,5,0.344171,O43826,S00000005,0,uniprot,https://purl.uniprot.org/uniprot/O43826,BQB_IS,SLC37A4,<napistu.source.Source object at 0x13ef4b9d0>
2,-1.236516,11,-0.083885,Q9UBX3,S00000012,0,uniprot,https://purl.uniprot.org/uniprot/Q9UBX3,BQB_IS,SLC25A10,<napistu.source.Source object at 0x13ef24050>
3,0.33088,13,-1.190742,P53007,S00000019,0,uniprot,https://purl.uniprot.org/uniprot/P53007,BQB_IS,SLC25A1,<napistu.source.Source object at 0x13ef88150>
4,1.383258,14,2.852637,P35558,S00000028,0,uniprot,https://purl.uniprot.org/uniprot/P35558,BQB_IS,PCK1,<napistu.source.Source object at 0x13ef8a850>
5,0.474668,15,-1.514976,Q16822,S00000036,0,uniprot,https://purl.uniprot.org/uniprot/Q16822,BQB_IS,PCK2,<napistu.source.Source object at 0x13ef88a10>
6,0.35053,16,0.118134,P35575,S00000042,0,uniprot,https://purl.uniprot.org/uniprot/P35575,BQB_IS,G6PC,<napistu.source.Source object at 0x13ef88a90>
7,0.690574,17,-0.473053,Q9BUM1,S00000051,0,uniprot,https://purl.uniprot.org/uniprot/Q9BUM1,BQB_IS,G6PC3,<napistu.source.Source object at 0x13ef89250>
8,0.299916,18,0.146419,P35557,S00000057,0,uniprot,https://purl.uniprot.org/uniprot/P35557,BQB_IS,GCK,<napistu.source.Source object at 0x13ef89110>
9,0.264882,19,2.597283,Q14397,S00000058,0,uniprot,https://purl.uniprot.org/uniprot/Q14397,BQB_IS,GCKR,<napistu.source.Source object at 0x13ef897d0>


In [1]:
# Call the test functions for the match-resolution helpers directly from the notebook.
# NOTE(review): this cell's execution count is out of sequence with the surrounding cells, so it
# was last run in a different order or session — confirm it passes under Restart & Run All.
# NOTE(review): `test_utils` is presumably a module local to this notebook's directory — verify.
import test_utils
test_utils.test_drop_extra_cols()
test_utils.test_resolve_matches_with_example_data()
test_utils.test_resolve_matches_first_method()
test_utils.test_resolve_matches_invalid_dtypes()
test_utils.test_resolve_matches_deduplicate_feature_id_within_sid()

In [10]:
from napistu import identifiers

from typing import Optional, Union, Set, Dict
import logging

logger = logging.getLogger(__name__)

# match a table containing identifiers from 1+ ontologies and additional results
def bind_wide_results(
    sbml_dfs : sbml_dfs_core.SBML_dfs,
    results_df : pd.DataFrame,
    results_name : str,
    ontologies : Optional[Union[Set[str], Dict[str, str]]] = None,
    dogmatic : bool = False,
    species_identifiers : Optional[pd.DataFrame] = None,
    verbose : bool = False
) -> pd.DataFrame:
    """
    Match a wide results table to the molecular species of an sbml_dfs model.

    Take a table with molecular species-level attributes tied to systematic identifiers
    and match them to the species of an sbml_dfs model. Binding the matched attributes
    to `species_data` is not implemented yet (see the TODO at the end of the body), so
    the function currently returns the table of matches.

    Parameters
    ----------
    sbml_dfs : sbml_dfs_core.SBML_dfs
        The sbml_dfs object whose species are matched against.
    results_df : pd.DataFrame
        The wide table containing the identifiers and results to match.
    results_name : str
        The name under which the results will be bound. Currently unused; reserved
        for the `species_data` binding step.
    ontologies : Optional[Union[Set[str], Dict[str, str]]]
        The ontologies to use for matching.
    dogmatic : bool
        Whether to respect differences between genes, transcripts, and proteins (True) or ignore them (False).
    species_identifiers : Optional[pd.DataFrame]
        Systematic identifiers for the molecular species in "sbml_dfs". If None this will be generated on-the-fly.
    verbose : bool
        Whether to log cases of 1-to-many and many-to-one mapping and to indicate the behavior for resolving degeneracy

    Returns
    -------
    matched_s_ids_from_wide : pd.DataFrame
        The unresolved matches between the rows of `results_df` and the model's species,
        as returned by `mechanism_matching.match_features_to_wide_pathway_species`.
        Callers are expected to disambiguate them (e.g. with `utils.resolve_matches`).
    """

    species_identifiers = identifiers._prepare_species_identifiers(
        sbml_dfs,
        dogmatic = dogmatic,
        species_identifiers = species_identifiers
    )

    # Match the table supplied by the caller. The previous version matched the notebook
    # global `example_data_wide` here, silently ignoring the `results_df` argument.
    matched_s_ids_from_wide = mechanism_matching.match_features_to_wide_pathway_species(
        results_df,
        species_identifiers,
        ontologies = ontologies,
        feature_identifiers_var = "identifier",
        verbose = verbose
    )

    # TODO: once match resolution is finalized, bind the resolved matches (indexed by s_id) with
    # sbml_dfs.add_species_data(results_name, ...) and return sbml_dfs instead of the matches.
    return matched_s_ids_from_wide

# Match the wide example table against the pathway's species using both ontologies.
wide_matches = bind_wide_results(
    sbml_dfs,
    example_data_wide,
    "results",
    ontologies = {"uniprot", "chebi"},
    dogmatic = False,
    species_identifiers = None,
    verbose = True
)

# Disambiguate the raw matches with the notebook-local helper.
disambiguated_matches = utils.resolve_matches(matched_data = wide_matches)

# Last expression of the cell, so its result is displayed.
# NOTE(review): presumably trims the resolved table to columns from the original data — confirm in utils.
utils._drop_extra_cols(example_data_wide, disambiguated_matches)


INFO:napistu.sbml_dfs_utils:Running in non-dogmatic mode - genes, transcripts, and proteins will be merged if possible.
  promiscuous_component_identifiers = pd.Series(
DEBUG:napistu.mechanism_matching:Validated ontology columns: {'uniprot', 'chebi'}
INFO:napistu.mechanism_matching:Using columns as results: ['results_b', 'feature_id', 'results_a']
DEBUG:napistu.mechanism_matching:Final long format shape: (20, 5)
DEBUG:napistu.mechanism_matching:Matching 10 features to 98 species for ontology uniprot
DEBUG:napistu.mechanism_matching:Matching 10 features to 48 species for ontology chebi
INFO:napistu.mechanism_matching:Found 25 total matches across 2 ontologies
INFO:napistu.mechanism_matching:100.0% of feature_ids are present one or more times in the output (20/20)
INFO:napistu.mechanism_matching:2 s_id(s) map to more than one feature_id.
INFO:napistu.mechanism_matching:Examples of s_id mapping to multiple feature_ids (showing up to 3):
s_id       s_name           
S00000056  GCK1:GKRP co

Unnamed: 0_level_0,feature_id,results_a,results_b
s_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S00000000,0.0,-2.740863,0.756076
S00000001,1.0,-0.841275,-0.133762
S00000002,2.0,-0.503649,-1.629732
S00000003,3.0,0.040976,-0.878528
S00000004,4.0,2.467045,-1.266263
S00000005,5.0,0.344171,-1.011381
S00000007,6.0,0.069725,-0.226334
S00000008,7.0,0.043073,0.669256
S00000009,8.0,0.230705,0.157361
S00000010,9.0,-0.417795,-1.658783


In [None]:
# Call the aggregation and match-resolution test functions directly from the notebook.
# NOTE(review): this cell has no execution count, i.e. it has not been run in the saved notebook.
# NOTE(review): these test functions are looked up on `utils`, whereas an earlier cell calls
# `test_resolve_matches_with_example_data` on `test_utils` — confirm which module defines them.
import utils
utils.test_aggregate_numeric_basic()
utils.test_aggregate_numeric_weighted_mean()
utils.test_aggregate_numeric_edge_cases()
utils.test_resolve_matches_with_example_data()

next steps

- Specify a feature_id variable mapping back to the original entry
- Support for identifiers which are nested in a single entry with a delimiter (e.g., chebi_a, chebi_b). This should probably go into the backlog.
- Add a verbose flag to log cases of 1-to-many and many-to-one mapping and to indicate the behavior for resolving degeneracy
- use the filtering function to trim down the output prior to binding species_data.

With this, we'll have a good process for tying numeric attributes to species data.

Next, we can specify how to translate the results to graph attributes.