# Network propagation development

- connecting results to graphs
- applying PPR

In [13]:
import os

import pandas as pd
import numpy as np

from napistu.ingestion import sbml
from napistu import sbml_dfs_core
from napistu import mechanism_matching

In [14]:
PATH_TO_TEST_DATA = os.path.expanduser("~/Desktop/GITHUB/napistu/lib/napistu-py/src/tests/test_data")
example_pathway = os.path.join(PATH_TO_TEST_DATA, "reactome_glucose_metabolism.sbml")
assert os.path.exists(example_pathway)

In [15]:
sbml_dfs = sbml_dfs_core.SBML_dfs(sbml.SBML(example_pathway))

species_identifiers = sbml_dfs.get_identifiers("species").query("bqb == 'BQB_IS'").query("ontology != 'reactome'")

INFO:napistu.utils:creating an edgelist linking index levels s_id, entry and linking it to levels defined by ontology, identifier
DEBUG:napistu.utils:label is not defined in table_schema; adding a constant (1)


In [16]:
# create a table whose index is s_ids and columns are faux-measurements
example_data = species_identifiers.groupby("ontology").head(10)[["ontology", "identifier"]]

example_data["results"] = np.random.randn(len(example_data))

example_data

Unnamed: 0,ontology,identifier,results
0,chebi,17925,-0.093627
5,uniprot,Q9NQR9,-1.767099
13,chebi,58225,0.219614
16,chebi,15377,-1.315513
19,chebi,18367,0.09356
23,uniprot,O43826,2.882925
46,chebi,57540,-0.786413
49,chebi,30797,0.728111
52,chebi,57945,0.301388
55,chebi,30744,-1.08619


In [17]:
# pivot (identifier, ontology) to columns for each ontology
example_data_wide = example_data.pivot(columns = "ontology", values = "identifier", index = "results").reset_index()

example_data_wide

ontology,results,chebi,uniprot
0,-1.767099,,Q9NQR9
1,-1.747846,,P53007
2,-1.433893,,P35558
3,-1.315513,15377.0,
4,-1.162518,16810.0,
5,-1.08619,30744.0,
6,-0.786413,57540.0,
7,-0.467875,15378.0,
8,-0.349612,,Q16822
9,-0.093627,17925.0,


In [18]:
results_tables = dict()
for ont in example_data["ontology"].unique():
    results_tables[ont] = example_data.query("ontology == @ont").drop(columns = "ontology")

results_tables

{'chebi':    identifier   results
 0       17925 -0.093627
 13      58225  0.219614
 16      15377 -1.315513
 19      18367  0.093560
 46      57540 -0.786413
 49      30797  0.728111
 52      57945  0.301388
 55      30744 -1.086190
 58      15378 -0.467875
 89      16810 -1.162518,
 'uniprot':     identifier   results
 5       Q9NQR9 -1.767099
 23      O43826  2.882925
 61      Q9UBX3  0.586654
 127     P53007 -1.747846
 178     P35558 -1.433893
 241     Q16822 -0.349612
 316     P35575  0.045976
 399     Q9BUM1  0.496865
 469     P35557  0.908417
 522     Q14397  0.180233}

In [19]:
from napistu.constants import ONTOLOGIES_LIST

In [30]:

import logging
from typing import Dict, Optional, Set, Union

import pandas as pd

from napistu import mechanism_matching
from napistu.constants import ONTOLOGIES_LIST

logger = logging.getLogger(__name__)

def _validate_wide_ontologies(
    wide_df: pd.DataFrame,
    ontologies: Optional[Union[str, Set[str], Dict[str, str]]] = None
) -> Set[str]:
    """
    Validate ontology specifications against the wide DataFrame and ONTOLOGIES_LIST.
    
    Parameters
    ----------
    wide_df : pd.DataFrame
        DataFrame with one column per ontology and a results column
    ontologies : Optional[Union[str, Set[str], Dict[str, str]]]
        Either:
        - String specifying a single ontology column
        - Set of columns to treat as ontologies
        - Dict mapping wide column names to ontology names
        - None to automatically detect ontology columns based on ONTOLOGIES_LIST
        
    Returns
    -------
    Set[str]
        Set of validated ontology names. For dictionary mappings, returns the target ontology names.
        
    Raises
    ------
    ValueError
        If validation fails for any ontology specification or no valid ontologies are found
    """
    # Convert string input to set
    if isinstance(ontologies, str):
        ontologies = {ontologies}

    # Get the set of ontology columns
    if isinstance(ontologies, dict):
        # Check source columns exist in DataFrame
        missing_cols = set(ontologies.keys()) - set(wide_df.columns)
        if missing_cols:
            raise ValueError(
                f"Source columns not found in DataFrame: {missing_cols}"
            )
        # Validate target ontologies against ONTOLOGIES_LIST
        invalid_onts = set(ontologies.values()) - set(ONTOLOGIES_LIST)
        if invalid_onts:
            raise ValueError(
                f"Invalid ontologies in mapping: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
            )
        # Return target ontology names instead of source column names
        ontology_cols = set(ontologies.values())
        
    elif isinstance(ontologies, set):
        # Check specified columns exist in DataFrame
        missing_cols = ontologies - set(wide_df.columns)
        if missing_cols:
            raise ValueError(
                f"Specified ontology columns not found in DataFrame: {missing_cols}"
            )
        # Validate specified ontologies against ONTOLOGIES_LIST
        invalid_onts = ontologies - set(ONTOLOGIES_LIST)
        if invalid_onts:
            raise ValueError(
                f"Invalid ontologies in set: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
            )
        ontology_cols = ontologies
        
    else:
        # Auto-detect ontology columns by matching against ONTOLOGIES_LIST
        ontology_cols = set(wide_df.columns) & set(ONTOLOGIES_LIST)
        if not ontology_cols:
            raise ValueError(
                f"No valid ontology columns found in DataFrame. Column names must match one of: {ONTOLOGIES_LIST}"
            )
        logger.info(
            f"Auto-detected ontology columns: {ontology_cols}"
        )
    
    logger.debug(f"Validated ontology columns: {ontology_cols}")
    return ontology_cols

In [31]:
# auto-detect
assert _validate_wide_ontologies(example_data_wide) == {"chebi", "uniprot"}

# detect by name
assert _validate_wide_ontologies(example_data_wide, ontologies = "chebi") == {"chebi"}

# rename
assert _validate_wide_ontologies(example_data_wide, ontologies = {"chebi" : "reactome", "uniprot" : "ensembl_gene"}) == {"reactome", "ensembl_gene"}

INFO:__main__:Auto-detected ontology columns: {'chebi', 'uniprot'}
DEBUG:__main__:Validated ontology columns: {'chebi', 'uniprot'}
DEBUG:__main__:Validated ontology columns: {'chebi'}
DEBUG:__main__:Validated ontology columns: {'reactome', 'ensembl_gene'}


In [54]:
# options, for matching
# 1. match by identifier and a set of ontologies (provided by arg).
matched_s_ids = mechanism_matching.features_to_pathway_species(
    feature_identifiers = example_data.drop(columns = "ontology"), 
    species_identifiers = species_identifiers,
    ontologies = {"uniprot", "chebi"},
    feature_id_var = "identifier",
)

# 2. match by identifier and ontology.


# 3. format wide identifier sets into a table with a single identifier column and apply strategy #2.

# 4. format multiple tables by applying strategy #3 multiple times; storing results as separate `species_data` tables.

<function napistu.mechanism_matching.features_to_pathway_species(feature_identifiers: 'pd.DataFrame', species_identifiers: 'pd.DataFrame', ontologies: 'set', feature_id_var: 'str') -> 'pd.DataFrame'>

In [53]:
bound_s_ids.set_index("s_id")

Unnamed: 0_level_0,identifier,results,entry,ontology,url,bqb,s_name,s_Source
s_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
S00000000,17925,-1.103558,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,Glc,<napistu.source.Source object at 0x157695e10>
S00000001,Q9NQR9,2.006847,0,uniprot,https://purl.uniprot.org/uniprot/Q9NQR9,BQB_IS,G6PC2,<napistu.source.Source object at 0x157689010>
S00000002,58225,2.20138,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,G6P,<napistu.source.Source object at 0x15699f9d0>
S00000003,15377,-1.208963,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,H2O,<napistu.source.Source object at 0x15761fb90>
S00000004,18367,1.218393,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,Pi,<napistu.source.Source object at 0x15761fc90>
S00000005,O43826,-1.408536,0,uniprot,https://purl.uniprot.org/uniprot/O43826,BQB_IS,SLC37A4,<napistu.source.Source object at 0x160196b50>
S00000007,57540,-0.319507,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,NAD+,<napistu.source.Source object at 0x156d76fd0>
S00000008,30797,-0.332691,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,MAL,<napistu.source.Source object at 0x16014b3d0>
S00000009,57945,-1.230691,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,NADH,<napistu.source.Source object at 0x160197e50>
S00000010,30744,0.463939,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,OAA,<napistu.source.Source object at 0x160149890>
