# Network propagation development

- connecting results to graphs
- applying personalized PageRank (PPR)

In [1]:
import os

import pandas as pd
import numpy as np

from napistu.ingestion import sbml
from napistu import sbml_dfs_core
from napistu import mechanism_matching

In [2]:
# Directory holding the napistu test data. The default is the original developer-specific
# checkout; set the NAPISTU_TEST_DATA environment variable to use another location
# without editing this cell.
PATH_TO_TEST_DATA = os.path.expanduser(
    os.environ.get(
        "NAPISTU_TEST_DATA",
        "~/Desktop/GITHUB/napistu/lib/napistu-py/src/tests/test_data",
    )
)
example_pathway = os.path.join(PATH_TO_TEST_DATA, "reactome_glucose_metabolism.sbml")
# fail early, and say which path was tried, if the example pathway is missing
assert os.path.exists(example_pathway), f"Example pathway not found: {example_pathway}"

In [3]:
# load the example pathway into an SBML_dfs object
sbml_dfs = sbml_dfs_core.SBML_dfs(sbml.SBML(example_pathway))

# species identifiers with the BQB_IS qualifier, dropping the 'reactome' ontology
species_identifiers = (
    sbml_dfs
    .get_identifiers("species")
    .query("bqb == 'BQB_IS'")
    .query("ontology != 'reactome'")
)

INFO:napistu.utils:creating an edgelist linking index levels s_id, entry and linking it to levels defined by ontology, identifier
DEBUG:napistu.utils:label is not defined in table_schema; adding a constant (1)


In [4]:
# create a table of identifiers (at most 10 per ontology) with two columns of faux-measurements
# the generator is seeded so the random "results" are reproducible across kernel restarts
rng = np.random.default_rng(42)

example_data = (
    species_identifiers
    .groupby("ontology")
    .head(10)[["ontology", "identifier"]]
    # .assign returns a new frame, so the selection above is never mutated in place
    .assign(
        results_a = lambda df: rng.standard_normal(len(df)),
        results_b = lambda df: rng.standard_normal(len(df)),
    )
)

example_data

Unnamed: 0,ontology,identifier,results_a,results_b
0,chebi,17925,1.262732,0.229445
5,uniprot,Q9NQR9,0.279089,1.453349
13,chebi,58225,0.001207,0.092009
16,chebi,15377,1.886222,0.591523
19,chebi,18367,-0.254374,0.047264
23,uniprot,O43826,-1.587021,-0.952682
46,chebi,57540,2.180325,-0.877731
49,chebi,30797,-1.32269,-0.031177
52,chebi,57945,1.185448,-2.568565
55,chebi,30744,0.799275,0.625308


In [5]:
# reshape to wide format: one identifier column per ontology, keyed by the (results_a, results_b) pair
example_data_wide = (
    example_data
    .pivot(index = ["results_a", "results_b"], columns = "ontology", values = "identifier")
    .reset_index()
    .rename_axis(None, axis = 1)
)

example_data_wide

Unnamed: 0,results_a,results_b,chebi,uniprot
0,-1.672518,-0.09877,,P35575
1,-1.587021,-0.952682,,O43826
2,-1.543537,0.120711,,Q16822
3,-1.32269,-0.031177,30797.0,
4,-1.006104,1.299365,15378.0,
5,-0.743016,-2.223224,,P35558
6,-0.673405,0.352925,,P35557
7,-0.254374,0.047264,18367.0,
8,0.001207,0.092009,58225.0,
9,0.091037,-1.394691,,P53007


In [6]:
# one table per ontology (keyed by ontology name) with the now-redundant ontology column removed
results_tables = {
    ont: example_data.loc[example_data["ontology"] == ont].drop(columns = "ontology")
    for ont in example_data["ontology"].unique()
}

results_tables

{'chebi':    identifier  results_a  results_b
 0       17925   1.262732   0.229445
 13      58225   0.001207   0.092009
 16      15377   1.886222   0.591523
 19      18367  -0.254374   0.047264
 46      57540   2.180325  -0.877731
 49      30797  -1.322690  -0.031177
 52      57945   1.185448  -2.568565
 55      30744   0.799275   0.625308
 58      15378  -1.006104   1.299365
 89      16810   0.237641   0.074177,
 'uniprot':     identifier  results_a  results_b
 5       Q9NQR9   0.279089   1.453349
 23      O43826  -1.587021  -0.952682
 61      Q9UBX3   1.851145   0.333437
 127     P53007   0.091037  -1.394691
 178     P35558  -0.743016  -2.223224
 241     Q16822  -1.543537   0.120711
 316     P35575  -1.672518  -0.098770
 399     Q9BUM1   1.287197  -0.294719
 469     P35557  -0.673405   0.352925
 522     Q14397   0.649638   1.087585}

In [7]:
import utils

# Options for matching features to pathway species:
# 1. match by identifier and a set of ontologies (provided by arg).
matched_s_ids = mechanism_matching.features_to_pathway_species(
    feature_identifiers = example_data.drop(columns = "ontology"), 
    species_identifiers = species_identifiers,
    ontologies = {"uniprot", "chebi"},
    feature_id_var = "identifier",
)


# NOTE(review): no strategy #2 is shown in this cell; the numbering jumps from 1 to 3.
# 3. format wide identifier sets into a table with a single identifier column and apply strategy #2.
matched_s_ids_from_wide = mechanism_matching.match_features_to_wide_pathway_species(
    example_data_wide,
    species_identifiers,
    ontologies = {"uniprot", "chebi"},
    feature_id_var = "identifier",
)

# TODO: check for equivalence of the three strategies (no check is implemented yet)


# 4. (not implemented here) format multiple tables by applying strategy #3 multiple times; storing results as separate `species_data` tables.
## this may depend more on the structure and whether measures are defined over all modalities or only a subset.

DEBUG:napistu.mechanism_matching:Validated ontology columns: {'chebi', 'uniprot'}
INFO:napistu.mechanism_matching:Using columns as results: ['results_a', 'results_b']
DEBUG:napistu.mechanism_matching:Final long format shape: (20, 4)
DEBUG:napistu.mechanism_matching:Matching 10 features to 38 species for ontology chebi
DEBUG:napistu.mechanism_matching:Matching 10 features to 10 species for ontology uniprot
INFO:napistu.mechanism_matching:Found 20 total matches across 2 ontologies


In [8]:
# Run selected tests from the test_utils module directly in the notebook.
# The calls are executed in order; the cell shows no output when they all return.
import test_utils
test_utils.test_drop_extra_cols()
test_utils.test_aggregate_numeric_basic()
test_utils.test_aggregate_numeric_weighted_mean()
test_utils.test_aggregate_numeric_edge_cases()
test_utils.test_resolve_matches_with_example_data()

In [9]:
from napistu import sbml_dfs_utils
from napistu.sbml_dfs_utils import _validate_assets_sbml_ids
from napistu.utils import match_pd_vars

from typing import Optional, Union, Set, Dict
import logging

logger = logging.getLogger(__name__)


def _prepare_species_identifiers(
    sbml_dfs : sbml_dfs_core.SBML_dfs,
    dogmatic : bool = False,
    species_identifiers : Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """
    Return a species identifiers table to use with `sbml_dfs`.

    If `species_identifiers` is None, a table is extracted from `sbml_dfs` with
    `sbml_dfs_utils.get_characteristic_species_ids`. Otherwise the provided table is
    checked; if a check raises ValueError a warning is logged and a fresh table is
    extracted instead. A table which passes both checks is returned unchanged.

    Parameters
    ----------
    sbml_dfs : sbml_dfs_core.SBML_dfs
        The pathway model the identifiers should be compatible with.
    dogmatic : bool
        Passed to `get_characteristic_species_ids` whenever a fresh table is extracted.
    species_identifiers : Optional[pd.DataFrame]
        A pre-computed identifiers table, or None to extract one.

    Returns
    -------
    pd.DataFrame
        The validated or freshly extracted species identifiers.
    """

    def _extract_fresh() -> pd.DataFrame:
        # single place where identifiers are pulled from the model
        return sbml_dfs_utils.get_characteristic_species_ids(sbml_dfs, dogmatic = dogmatic)

    if species_identifiers is None:
        return _extract_fresh()

    try:
        # NOTE(review): SPECIES_IDENTIFIERS_REQUIRED_VARS is not defined or imported in the
        # visible part of this notebook; if it is missing, this line raises NameError,
        # which the `except ValueError` below does not catch - confirm where it comes from.
        match_pd_vars(species_identifiers, req_vars = SPECIES_IDENTIFIERS_REQUIRED_VARS).assert_present()
        # quick compatibility check between the model and the provided identifiers
        _validate_assets_sbml_ids(sbml_dfs, species_identifiers)
    except ValueError as e:
        logger.warning(f"The provided identifiers are not compatible with your `sbml_dfs` object. Extracting a fresh species identifier table. {e}")
        return _extract_fresh()

    return species_identifiers

# match a table containing identifiers from 1+ ontologies and additional results
def bind_wide_results(
    sbml_dfs : sbml_dfs_core.SBML_dfs,
    results_df : pd.DataFrame,
    results_name : str,
    ontologies : Optional[Union[Set[str], Dict[str, str]]] = None,
    dogmatic : bool = False,
    species_identifiers : Optional[pd.DataFrame] = None
):
    """
    Match wide results to the molecular species of an sbml_dfs object.

    Takes a table with species-level attributes tied to systematic identifiers and
    matches it to the species of `sbml_dfs`. Binding the matches to `species_data`
    is not implemented yet (see the TODO in the body), so the match table is
    returned and `sbml_dfs` is not modified by this function's own code.

    Parameters
    ----------
    sbml_dfs : sbml_dfs_core.SBML_dfs
        The sbml_dfs object whose species the results are matched to.
    results_df : pd.DataFrame
        The wide table containing identifiers and the results to match.
    results_name : str
        The name to bind the results under. Currently unused; reserved for the
        planned `sbml_dfs.add_species_data` call.
    ontologies : Optional[Union[Set[str], Dict[str, str]]]
        The ontologies to use for matching; passed through to
        `mechanism_matching.match_features_to_wide_pathway_species`.
    dogmatic : bool
        Whether to respect differences between genes, transcripts, and proteins (True) or ignore them (False).
    species_identifiers : Optional[pd.DataFrame]
        Systematic identifiers for the molecular species in `sbml_dfs`. If None, this will be generated on-the-fly.

    Returns
    -------
    The output of `mechanism_matching.match_features_to_wide_pathway_species`
    for `results_df`.
    """

    species_identifiers = _prepare_species_identifiers(
        sbml_dfs,
        dogmatic = dogmatic,
        species_identifiers = species_identifiers
    )

    # match the caller-supplied table (previously the notebook global
    # `example_data_wide` was used here, silently ignoring `results_df`)
    matched_s_ids_from_wide = mechanism_matching.match_features_to_wide_pathway_species(
        results_df,
        species_identifiers,
        ontologies = ontologies,
        feature_id_var = "identifier",
    )

    # TODO: once degenerate matches are resolved, bind the results to the model and
    # return `sbml_dfs` instead:
    #   sbml_dfs.add_species_data(results_name, matched_s_ids_from_wide.set_index("s_id"))

    return matched_s_ids_from_wide

# match the wide example table to the pathway's species
x = bind_wide_results(
    sbml_dfs = sbml_dfs,
    results_df = example_data_wide,
    results_name = "results",
    ontologies = {"uniprot", "chebi"},
    dogmatic = False,
    species_identifiers = None,
)

# NOTE: in the recorded run this call raises
# KeyError: "Column 'feature_id' not found in DataFrame..." - the matched table has no feature_id column
utils.resolve_matches(matched_data = x)


INFO:napistu.sbml_dfs_utils:Running in non-dogmatic mode - genes, transcripts, and proteins will be merged if possible.


  promiscuous_component_identifiers = pd.Series(
DEBUG:napistu.mechanism_matching:Validated ontology columns: {'chebi', 'uniprot'}
INFO:napistu.mechanism_matching:Using columns as results: ['results_a', 'results_b']
DEBUG:napistu.mechanism_matching:Final long format shape: (20, 4)
DEBUG:napistu.mechanism_matching:Matching 10 features to 48 species for ontology chebi
DEBUG:napistu.mechanism_matching:Matching 10 features to 98 species for ontology uniprot
INFO:napistu.mechanism_matching:Found 25 total matches across 2 ontologies


KeyError: "Column 'feature_id' not found in DataFrame. This column is required when using numeric_agg='weighted_mean'"

next steps

- Specify a feature_id variable mapping back to the original entry
- Support for identifiers which are nested in a single entry with a delimiter (e.g., chebi_a, chebi_b). This should probably go into the backlog
- Add a verbose flag to log cases of 1-to-many and many-to-one mapping and to indicate the behavior for resolving degeneracy
- use the filtering function to trim down the output prior to binding species_data.

With this, we'll have a good process for tying numeric attributes to species data.

Next, we can specify how to translate the results to graph attributes.