# Network propagation development

- connecting results to graphs
- applying PPR

In [1]:
import os

import pandas as pd
import numpy as np

from napistu.ingestion import sbml
from napistu import sbml_dfs_core
from napistu import mechanism_matching

In [2]:
# Directory holding napistu's test data. Set the NAPISTU_TEST_DATA environment
# variable to point somewhere else; otherwise the original checkout location is used.
PATH_TO_TEST_DATA = os.path.expanduser(
    os.environ.get(
        "NAPISTU_TEST_DATA",
        "~/Desktop/GITHUB/napistu/lib/napistu-py/src/tests/test_data",
    )
)
example_pathway = os.path.join(PATH_TO_TEST_DATA, "reactome_glucose_metabolism.sbml")
# fail early, and say which path was tried, if the example pathway is missing
assert os.path.exists(example_pathway), f"SBML test file not found: {example_pathway}"

In [3]:
# parse the example SBML file and wrap it as an SBML_dfs pathway model
sbml_dfs = sbml_dfs_core.SBML_dfs(
    sbml.SBML(example_pathway)
)

# species identifiers, restricted to "BQB_IS" entries and excluding the "reactome" ontology
species_identifiers = (
    sbml_dfs
    .get_identifiers("species")
    .query("bqb == 'BQB_IS'")
    .query("ontology != 'reactome'")
)

INFO:napistu.utils:creating an edgelist linking index levels s_id, entry and linking it to levels defined by ontology, identifier
DEBUG:napistu.utils:label is not defined in table_schema; adding a constant (1)


In [4]:
# create a table of species identifiers (at most 10 per ontology) with faux-measurements as columns

# seeded generator so the faux-measurements are identical after "Restart Kernel -> Run All"
rng = np.random.default_rng(42)

# .copy() detaches the subset from species_identifiers, so the column assignments
# below operate on an independent frame and cannot raise SettingWithCopyWarning
example_data = species_identifiers.groupby("ontology").head(10)[["ontology", "identifier"]].copy()

example_data["results_a"] = rng.standard_normal(len(example_data))
example_data["results_b"] = rng.standard_normal(len(example_data))

example_data

Unnamed: 0,ontology,identifier,results_a,results_b
0,chebi,17925,-0.95184,-0.374808
5,uniprot,Q9NQR9,1.497391,0.099568
13,chebi,58225,0.653753,1.643016
16,chebi,15377,1.58923,1.330121
19,chebi,18367,-0.173185,0.719055
23,uniprot,O43826,0.111398,1.125249
46,chebi,57540,1.068384,0.764542
49,chebi,30797,-0.01833,0.194962
52,chebi,57945,-0.981172,-1.490892
55,chebi,30744,2.028916,0.937222


In [5]:
# reshape long -> wide: one column per ontology holding that ontology's identifier,
# with each row keyed by its (results_a, results_b) pair
example_data_wide = (
    example_data
    .pivot(
        index = ["results_a", "results_b"],
        columns = "ontology",
        values = "identifier",
    )
    .reset_index()
    # drop the leftover "ontology" label on the column axis
    .rename_axis(None, axis = "columns")
)

example_data_wide

Unnamed: 0,results_a,results_b,chebi,uniprot
0,-0.981172,-1.490892,57945.0,
1,-0.95184,-0.374808,17925.0,
2,-0.885649,0.660444,16810.0,
3,-0.542018,-0.192182,,Q9BUM1
4,-0.325965,-0.486341,,P35557
5,-0.173185,0.719055,18367.0,
6,-0.097432,-1.213259,,Q14397
7,-0.01833,0.194962,30797.0,
8,0.111398,1.125249,,O43826
9,0.270507,0.94812,,P35558


In [6]:
# split the long table into one results table per ontology (keyed by ontology name);
# the now-constant "ontology" column is dropped from each table
results_tables = {}
for ont in example_data["ontology"].unique():
    results_tables[ont] = (
        example_data
        .query("ontology == @ont")
        .drop("ontology", axis = "columns")
    )

results_tables

{'chebi':    identifier  results_a  results_b
 0       17925  -0.951840  -0.374808
 13      58225   0.653753   1.643016
 16      15377   1.589230   1.330121
 19      18367  -0.173185   0.719055
 46      57540   1.068384   0.764542
 49      30797  -0.018330   0.194962
 52      57945  -0.981172  -1.490892
 55      30744   2.028916   0.937222
 58      15378   0.694369   1.973247
 89      16810  -0.885649   0.660444,
 'uniprot':     identifier  results_a  results_b
 5       Q9NQR9   1.497391   0.099568
 23      O43826   0.111398   1.125249
 61      Q9UBX3   0.764440  -0.499913
 127     P53007   1.919061   1.223260
 178     P35558   0.270507   0.948120
 241     Q16822   1.168988  -0.621227
 316     P35575   0.543918   0.105410
 399     Q9BUM1  -0.542018  -0.192182
 469     P35557  -0.325965  -0.486341
 522     Q14397  -0.097432  -1.213259}

In [7]:
import utils

# Options for matching features to pathway species.
#
# Strategy 1: match a long table by its identifier column against a set of
# ontologies (provided by arg).
matched_s_ids = mechanism_matching.features_to_pathway_species(
    feature_identifiers = example_data.drop("ontology", axis = "columns"),
    species_identifiers = species_identifiers,
    ontologies = {"chebi", "uniprot"},
    feature_id_var = "identifier",
)

# Strategy 3: format wide identifier sets (one column per ontology) into a table
# with a single identifier column and apply strategy #2.
# (strategy #2 itself is not exercised in this cell)
matched_s_ids_from_wide = mechanism_matching.match_features_to_wide_pathway_species(
    example_data_wide,
    species_identifiers,
    ontologies = {"chebi", "uniprot"},
    feature_id_var = "identifier",
)

# TODO: check for equivalence of the three strategies (no check is implemented yet)

# Strategy 4 (not implemented yet): format multiple tables by applying strategy #3
# multiple times; storing results as separate `species_data` tables.
## this may depend more on the structure and whether measures are defined over all modalities or only a subset.

DEBUG:napistu.mechanism_matching:Validated ontology columns: {'chebi', 'uniprot'}
INFO:napistu.mechanism_matching:Using columns as results: ['results_b', 'results_a']
DEBUG:napistu.mechanism_matching:Final long format shape: (20, 4)
DEBUG:napistu.mechanism_matching:Matching 10 features to 38 species for ontology chebi
DEBUG:napistu.mechanism_matching:Matching 10 features to 10 species for ontology uniprot
INFO:napistu.mechanism_matching:Found 20 total matches across 2 ontologies


In [2]:
import test_utils

# run the local test functions one after another, in their original order
for run_test in (
    test_utils.test_drop_extra_cols,
    test_utils.test_aggregate_numeric_basic,
    test_utils.test_aggregate_numeric_weighted_mean,
    test_utils.test_aggregate_numeric_edge_cases,
    test_utils.test_resolve_matches_with_example_data,
    test_utils.test_features_to_pathway_species_basic_and_expansion,
):
    run_test()

INFO:utils:Expanding identifiers: 2 delimiters found in 'my_id', will expand to more rows.
INFO:utils:Expanding identifiers: 2 delimiters found in 'my_id', will expand to more rows.


In [None]:
from napistu.mechanism_matching import _check_species_identifiers_table

from typing import Optional, Union, Set, Dict
import logging

logger = logging.getLogger(__name__)

# match a table containing identifiers from 1+ ontologies and additional results
def bind_wide_results(
    sbml_dfs : sbml_dfs_core.SBML_dfs,
    results_df : pd.DataFrame,
    results_name : str,
    ontologies : Optional[Union[Set[str], Dict[str, str]]] = None,
    dogmatic : bool = False,
    species_identifiers : Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """
    Match wide results to the species of a sbml_dfs object.

    Take a table with molecular species-level attributes tied to systematic identifiers
    and match it to the species of an sbml_dfs model. Transferring the matched attributes
    into the model's species_data is the intended final step but is not implemented yet,
    so the matched table is returned instead.

    Parameters
    ----------
    sbml_dfs : sbml_dfs_core.SBML_dfs
        The sbml_dfs object whose species the results are matched to.
    results_df : pd.DataFrame
        The wide table containing the results to match (identifier columns plus results columns).
    results_name : str
        The name the results should be stored under once binding is implemented.
        Currently unused.
    ontologies : Optional[Union[Set[str], Dict[str, str]]]
        The ontologies to use for matching; passed through to
        `mechanism_matching.match_features_to_wide_pathway_species`.
    dogmatic : bool
        Whether to respect differences between genes, transcripts, and proteins (True) or ignore them (False).
    species_identifiers : Optional[pd.DataFrame]
        Systematic identifiers for the molecular species in "sbml_dfs". If None this will be generated on-the-fly.

    Returns
    -------
    pd.DataFrame
        The table of matches returned by
        `mechanism_matching.match_features_to_wide_pathway_species`.
    """

    # NOTE(review): `_prepare_species_identifiers` is neither defined nor imported in the
    # visible notebook cells - confirm where it comes from before relying on a fresh kernel.
    species_identifiers = _prepare_species_identifiers(
        sbml_dfs,
        dogmatic = dogmatic,
        species_identifiers = species_identifiers
        )

    # match the table supplied by the caller (previously the notebook-level
    # `example_data_wide` was used here, silently ignoring `results_df`)
    matched_s_ids_from_wide = mechanism_matching.match_features_to_wide_pathway_species(
        results_df,
        species_identifiers,
        ontologies = ontologies,
        feature_id_var = "identifier",
    )

    # TODO: once matches are resolved to one row per species, bind them with
    #   sbml_dfs.add_species_data(results_name, matched_s_ids_from_wide.set_index("s_id"))
    # and return sbml_dfs instead of the matched table.
    return matched_s_ids_from_wide

# match the wide faux-results against the example pathway's species
x = bind_wide_results(
    sbml_dfs = sbml_dfs,
    results_df = example_data_wide,
    results_name = "results",
    ontologies = {"chebi", "uniprot"},
    dogmatic = False,
    species_identifiers = None,
)

# NOTE: the recorded run of this call raised
#   KeyError: "Column 'feature_id' not found in DataFrame. ..."
# because the matched table has no 'feature_id' column (see "next steps" below).
utils.resolve_matches(matched_data = x)


INFO:napistu.sbml_dfs_utils:Running in non-dogmatic mode - genes, transcripts, and proteins will be merged if possible.
  promiscuous_component_identifiers = pd.Series(
DEBUG:napistu.mechanism_matching:Validated ontology columns: {'chebi', 'uniprot'}
INFO:napistu.mechanism_matching:Using columns as results: ['results_a', 'results_b']
DEBUG:napistu.mechanism_matching:Final long format shape: (20, 4)
DEBUG:napistu.mechanism_matching:Matching 10 features to 48 species for ontology chebi
DEBUG:napistu.mechanism_matching:Matching 10 features to 98 species for ontology uniprot
INFO:napistu.mechanism_matching:Found 25 total matches across 2 ontologies


KeyError: "Column 'feature_id' not found in DataFrame. This column is required when using numeric_agg='weighted_mean'"

next steps

- Specify a feature_id variable mapping back to the original entry
- Support for identifiers which are nested in a single entry with a delimiter (e.g., chebi_a, chebi_b). This should probably go into the backlog
- Add a verbose flag to log cases of 1-to-many and many-to-one mapping and to indicate the behavior for resolving degeneracy
- use the filtering function to trim down the output prior to binding species_data.

With this, we'll have a good process for tying numeric attributes to species data.

Next, we can specify how to translate the results to graph attributes.