# Network propagation development

- connecting results to graphs
- applying PPR

In [1]:
import os

import pandas as pd
import numpy as np

from napistu.ingestion import sbml
from napistu import sbml_dfs_core
from napistu import mechanism_matching

In [2]:
PATH_TO_TEST_DATA = os.path.expanduser("~/Desktop/GITHUB/napistu/lib/napistu-py/src/tests/test_data")
example_pathway = os.path.join(PATH_TO_TEST_DATA, "reactome_glucose_metabolism.sbml")
assert os.path.exists(example_pathway)

In [3]:
sbml_dfs = sbml_dfs_core.SBML_dfs(sbml.SBML(example_pathway))

species_identifiers = sbml_dfs.get_identifiers("species").query("bqb == 'BQB_IS'").query("ontology != 'reactome'")

INFO:napistu.utils:creating an edgelist linking index levels s_id, entry and linking it to levels defined by ontology, identifier
DEBUG:napistu.utils:label is not defined in table_schema; adding a constant (1)


In [4]:
# create a table whose index is s_ids and columns are faux-measurements
example_data = species_identifiers.groupby("ontology").head(10)[["ontology", "identifier"]]

example_data["results_a"] = np.random.randn(len(example_data))
example_data["results_b"] = np.random.randn(len(example_data))

example_data

Unnamed: 0,ontology,identifier,results_a,results_b
0,chebi,17925,-2.006174,0.810605
5,uniprot,Q9NQR9,1.152038,-0.56418
13,chebi,58225,-0.084049,-0.207536
16,chebi,15377,0.728968,0.493883
19,chebi,18367,1.304724,-1.125265
23,uniprot,O43826,0.130064,0.54705
46,chebi,57540,1.26724,-0.555161
49,chebi,30797,-1.767039,-0.163527
52,chebi,57945,-1.538785,-0.090907
55,chebi,30744,-1.702127,1.079919


In [5]:
# pivot (identifier, ontology) to columns for each ontology
example_data_wide = example_data.pivot(columns = "ontology", values = "identifier", index = ["results_a", "results_b"]).reset_index().rename_axis(None, axis = 1)

example_data_wide

Unnamed: 0,results_a,results_b,chebi,uniprot
0,-2.006174,0.810605,17925.0,
1,-1.767039,-0.163527,30797.0,
2,-1.702127,1.079919,30744.0,
3,-1.538785,-0.090907,57945.0,
4,-1.009992,0.175558,,P35575
5,-0.827106,-1.717689,,Q14397
6,-0.727777,0.011431,,Q9BUM1
7,-0.652388,2.037114,,P35558
8,-0.571318,2.034753,,Q16822
9,-0.191392,1.042812,15378.0,


In [6]:
results_tables = dict()
for ont in example_data["ontology"].unique():
    results_tables[ont] = example_data.query("ontology == @ont").drop(columns = "ontology")

results_tables

{'chebi':    identifier  results_a  results_b
 0       17925  -2.006174   0.810605
 13      58225  -0.084049  -0.207536
 16      15377   0.728968   0.493883
 19      18367   1.304724  -1.125265
 46      57540   1.267240  -0.555161
 49      30797  -1.767039  -0.163527
 52      57945  -1.538785  -0.090907
 55      30744  -1.702127   1.079919
 58      15378  -0.191392   1.042812
 89      16810   0.820059  -0.085367,
 'uniprot':     identifier  results_a  results_b
 5       Q9NQR9   1.152038  -0.564180
 23      O43826   0.130064   0.547050
 61      Q9UBX3  -0.000314   0.089213
 127     P53007   1.335060  -0.437013
 178     P35558  -0.652388   2.037114
 241     Q16822  -0.571318   2.034753
 316     P35575  -1.009992   0.175558
 399     Q9BUM1  -0.727777   0.011431
 469     P35557   1.022711   0.177910
 522     Q14397  -0.827106  -1.717689}

In [None]:
import utils

# options, for matching
# 1. match by identifier and a set of ontologies (provided by arg).
matched_s_ids = mechanism_matching.features_to_pathway_species(
    feature_identifiers = example_data.drop(columns = "ontology"), 
    species_identifiers = species_identifiers,
    ontologies = {"uniprot", "chebi"},
    feature_id_var = "identifier",
)

# 2. match by identifier and ontology.
matched_s_ids_w_ontologies = utils.match_by_ontology_and_identifier(
    feature_identifiers = example_data, 
    species_identifiers = species_identifiers,
    ontologies = {"uniprot", "chebi"},
    feature_id_var = "identifier",
)

# 3. format wide identifier sets into a table with a single identifier column and apply strategy #2.
matched_s_ids_from_wide = utils.match_features_to_wide_pathway_species(
    example_data_wide,
    species_identifiers,
    ontologies = {"uniprot", "chebi"},
    feature_id_var = "identifier",
)

# check for equivalence of the three strategies


# 4. format multiple tables by applying strategy #3 multiple times; storing results as separate `species_data` tables.
# this may depend more on the structure and whether measures are defined over all modalities or only a subset.

DEBUG:utils:Matching 10 features to 10 species for ontology uniprot
DEBUG:utils:Matching 10 features to 38 species for ontology chebi
INFO:utils:Found 20 total matches across 2 ontologies
DEBUG:utils:Validated ontology columns: {'uniprot', 'chebi'}
INFO:utils:Using columns as results: ['results_b', 'results_a']
DEBUG:utils:Final long format shape: (20, 4)
DEBUG:utils:Matching 10 features to 10 species for ontology uniprot
DEBUG:utils:Matching 10 features to 38 species for ontology chebi
INFO:utils:Found 20 total matches across 2 ontologies


In [14]:
pd.testing.assert_frame_equal(
    matched_s_ids,
    matched_s_ids_w_ontologies,
    check_like = True,
    check_index = False
    )

TypeError: assert_frame_equal() got an unexpected keyword argument 'check_index'

In [25]:
def compare_frame_contents(df1, df2):
    """
    Compare if two DataFrames have the same content, ignoring index and column ordering.
    
    Parameters
    ----------
    df1 : pd.DataFrame
        First DataFrame to compare
    df2 : pd.DataFrame
        Second DataFrame to compare
        
    Returns
    -------
    None
    """
    df1_sorted = (df1
                    .reindex(columns=sorted(df1.columns))
                    .sort_values(sorted(df1.columns))
                    .reset_index(drop=True))
    
    df2_sorted = (df2
                    .reindex(columns=sorted(df2.columns))
                    .sort_values(sorted(df2.columns))
                    .reset_index(drop=True))
    
    pd.testing.assert_frame_equal(df1_sorted, df2_sorted, check_like=True)

    return None

compare_frame_contents(matched_s_ids.drop(columns = "s_Source"), matched_s_ids_w_ontologies.drop(columns = "s_Source"))
compare_frame_contents(matched_s_ids.drop(columns = "s_Source"), matched_s_ids_from_wide.drop(columns = "s_Source"))


In [19]:
matched_s_ids

Unnamed: 0,identifier,results_a,results_b,s_id,entry,ontology,url,bqb,s_name,s_Source
0,17925,-2.006174,0.810605,S00000000,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,Glc,<napistu.source.Source object at 0x135a67ed0>
1,Q9NQR9,1.152038,-0.56418,S00000001,0,uniprot,https://purl.uniprot.org/uniprot/Q9NQR9,BQB_IS,G6PC2,<napistu.source.Source object at 0x135a62810>
2,58225,-0.084049,-0.207536,S00000002,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,G6P,<napistu.source.Source object at 0x10365e3d0>
3,15377,0.728968,0.493883,S00000003,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,H2O,<napistu.source.Source object at 0x132ff8710>
4,18367,1.304724,-1.125265,S00000004,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,Pi,<napistu.source.Source object at 0x132ff8790>
5,O43826,0.130064,0.54705,S00000005,0,uniprot,https://purl.uniprot.org/uniprot/O43826,BQB_IS,SLC37A4,<napistu.source.Source object at 0x135a29990>
6,57540,1.26724,-0.555161,S00000007,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,NAD+,<napistu.source.Source object at 0x135a51350>
7,30797,-1.767039,-0.163527,S00000008,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,MAL,<napistu.source.Source object at 0x135a51650>
8,57945,-1.538785,-0.090907,S00000009,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,NADH,<napistu.source.Source object at 0x135a77f10>
9,30744,-1.702127,1.079919,S00000010,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,OAA,<napistu.source.Source object at 0x135a513d0>


In [11]:
matched_s_ids

Unnamed: 0,identifier,results_a,results_b,s_id,entry,ontology,url,bqb,s_name,s_Source
0,17925,-2.006174,0.810605,S00000000,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,Glc,<napistu.source.Source object at 0x135a67ed0>
1,Q9NQR9,1.152038,-0.56418,S00000001,0,uniprot,https://purl.uniprot.org/uniprot/Q9NQR9,BQB_IS,G6PC2,<napistu.source.Source object at 0x135a62810>
2,58225,-0.084049,-0.207536,S00000002,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,G6P,<napistu.source.Source object at 0x10365e3d0>
3,15377,0.728968,0.493883,S00000003,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,H2O,<napistu.source.Source object at 0x132ff8710>
4,18367,1.304724,-1.125265,S00000004,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,Pi,<napistu.source.Source object at 0x132ff8790>
5,O43826,0.130064,0.54705,S00000005,0,uniprot,https://purl.uniprot.org/uniprot/O43826,BQB_IS,SLC37A4,<napistu.source.Source object at 0x135a29990>
6,57540,1.26724,-0.555161,S00000007,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,NAD+,<napistu.source.Source object at 0x135a51350>
7,30797,-1.767039,-0.163527,S00000008,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,MAL,<napistu.source.Source object at 0x135a51650>
8,57945,-1.538785,-0.090907,S00000009,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,NADH,<napistu.source.Source object at 0x135a77f10>
9,30744,-1.702127,1.079919,S00000010,0,chebi,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,OAA,<napistu.source.Source object at 0x135a513d0>


In [12]:
species_identifiers

Unnamed: 0,s_id,entry,ontology,identifier,url,bqb,s_name,s_Source
0,S00000000,0,chebi,17925,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,Glc,<napistu.source.Source object at 0x135a67ed0>
5,S00000001,0,uniprot,Q9NQR9,https://purl.uniprot.org/uniprot/Q9NQR9,BQB_IS,G6PC2,<napistu.source.Source object at 0x135a62810>
13,S00000002,0,chebi,58225,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,G6P,<napistu.source.Source object at 0x10365e3d0>
16,S00000003,0,chebi,15377,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,H2O,<napistu.source.Source object at 0x132ff8710>
19,S00000004,0,chebi,18367,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,Pi,<napistu.source.Source object at 0x132ff8790>
23,S00000005,0,uniprot,O43826,https://purl.uniprot.org/uniprot/O43826,BQB_IS,SLC37A4,<napistu.source.Source object at 0x135a29990>
46,S00000007,0,chebi,57540,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,NAD+,<napistu.source.Source object at 0x135a51350>
49,S00000008,0,chebi,30797,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,MAL,<napistu.source.Source object at 0x135a51650>
52,S00000009,0,chebi,57945,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,NADH,<napistu.source.Source object at 0x135a77f10>
55,S00000010,0,chebi,30744,http://www.ebi.ac.uk/chebi/searchId.do?chebiId...,BQB_IS,OAA,<napistu.source.Source object at 0x135a513d0>


In [8]:
import test_utils

test_utils.test_validate_wide_ontologies()
test_utils.test_match_by_ontology_and_identifier()

INFO:utils:Auto-detected ontology columns: {'uniprot', 'chebi'}
DEBUG:utils:Validated ontology columns: {'uniprot', 'chebi'}
DEBUG:utils:Validated ontology columns: {'chebi'}
DEBUG:utils:Validated ontology columns: {'chebi'}
DEBUG:utils:Validated ontology columns: {'uniprot', 'chebi'}
DEBUG:utils:Validated ontology columns: {'reactome', 'ensembl_gene'}
DEBUG:utils:Matching 2 features to 2 species for ontology chebi
INFO:utils:Found 1 total matches across 1 ontologies
DEBUG:utils:Matching 2 features to 2 species for ontology uniprot
DEBUG:utils:Matching 2 features to 2 species for ontology chebi
INFO:utils:Found 2 total matches across 2 ontologies
DEBUG:utils:Matching 2 features to 2 species for ontology uniprot
DEBUG:utils:Matching 2 features to 2 species for ontology chebi
INFO:utils:Found 2 total matches across 2 ontologies
DEBUG:utils:Matching 1 features to 2 species for ontology chebi
DEBUG:utils:Matching 2 features to 2 species for ontology chebi
INFO:utils:Found 1 total matches a