---
title: Tutorial - Downloading and Formatting Pathway Data
author: "Shackett"
date: "May 9th 2025"
---

This notebook provides examples for downloading pathway information from a variety of data sources and formatting the results as `sbml_dfs` objects. Since it is often not possible to download just a subset of a data source for demonstration purposes, this notebook will download and cache raw and intermediate representations of each data source.

In [1]:
#| label: config

import tutorial_utils
# Logger used by the _log / _log_skipped helpers in the next cell.
# Presumably setup_clean_logging() also configures log formatting/levels —
# confirm in tutorial_utils.
logger = tutorial_utils.setup_clean_logging()
# Tutorial settings; the globals cell reads `overwrite`, `species` and the
# `artifacts` path mapping from this object.
# NOTE(review): looks like this loads the "downloading_pathway_data" section
# of config.yaml — confirm against tutorial_utils.NapistuConfig.
config = tutorial_utils.NapistuConfig("config.yaml", "downloading_pathway_data")

In [2]:
#| label: environment

import os
import pickle

import pandas as pd

from napistu import utils
from napistu.ingestion import bigg
from napistu.ingestion import idea_yeast
from napistu.ingestion import intact
from napistu.ingestion import omnipath
from napistu.ingestion import psi_mi
from napistu.ingestion import reactome
from napistu.ingestion import reactome_fi
from napistu.ingestion import string
from napistu.ingestion import trrust

def _log(text: str):
    """Log `text` at INFO level, framed above and below by a banner line."""
    divider = "====================================\n"
    # Leading newline keeps the banner off the logger's own prefix line.
    framed_message = f"\n{divider}{text}\n{divider}"
    logger.info(framed_message)

def _log_skipped(uri: str) -> None:
    """Log that a download/formatting step was skipped.

    Callers skip a step only when the target already exists *and*
    overwriting is disabled, so the message states both conditions.

    Parameters
    ----------
    uri : str
        Path or URI of the artifact whose step was skipped.
    """
    logger.info(f"{uri} exists and overwrite = False")

In [3]:
#| label: globals

# When truthy, every step below is re-run even if its output already exists.
OVERWRITE = config.overwrite
# Passed as `organismal_species` to the ingestion functions that accept it.
SPECIES = config.species

# Artifact locations, looked up by key in config.artifacts.
# *_DIR / *_RAW entries hold downloaded inputs; *_SBML_DFS entries are the
# pickled sbml_dfs models written with utils.save_pickle.

# BiGG
FN_BIGG_SBML_DIR = config.artifacts["bigg_sbml_dir"]
FN_BIGG_SBML_DFS = config.artifacts["bigg_sbml_dfs"]
# IDEA (yeast)
FN_IDEA_YEAST_RAW = config.artifacts["idea_yeast_raw"]
FN_IDEA_YEAST_SBML_DFS = config.artifacts["idea_yeast_sbml_dfs"]
# IntAct
FN_INTACT_DIR = config.artifacts["intact_dir"]
FN_INTACT_SBML_DFS = config.artifacts["intact_sbml_dfs"]
# OmniPath (no raw artifact: data is fetched and formatted in one step)
FN_OMNIPATH_SBML_DFS = config.artifacts["omnipath_sbml_dfs"]
# Reactome
FN_REACTOME_SBML_DIR = config.artifacts["reactome_sbml_dir"]
FN_REACTOME_SBML_DFS = config.artifacts["reactome_sbml_dfs"]
# Reactome Functional Interactions
FN_REACTOME_FI_RAW = config.artifacts["reactome_fi_raw"]
FN_REACTOME_FI_SBML_DFS = config.artifacts["reactome_fi_sbml_dfs"]
# STRING
FN_STRING_ALIASES = config.artifacts["string_aliases"]
FN_STRING_INTERACTIONS = config.artifacts["string_interactions"]
FN_STRING_SBML_DFS = config.artifacts["string_sbml_dfs"]
# TRRUST
FN_TRRUST_RAW = config.artifacts["trrust_raw"]
FN_TRRUST_SBML_DFS = config.artifacts["trrust_sbml_dfs"]

# Species Agnostic Sources

These sources include pathway information which can be broadly applied to a range of species. 

## STRING

In [4]:
#| label: string

# Step 1: raw STRING interactions (skipped when cached and not overwriting).
if utils.path_exists(FN_STRING_INTERACTIONS) and not OVERWRITE:
    _log_skipped(FN_STRING_INTERACTIONS)
else:
    _log(f"Downloading STRING interactions to {FN_STRING_INTERACTIONS}")
    string.download_string(FN_STRING_INTERACTIONS, organismal_species=SPECIES)

# Step 2: STRING aliases, used to attach systematic identifiers.
if utils.path_exists(FN_STRING_ALIASES) and not OVERWRITE:
    _log_skipped(FN_STRING_ALIASES)
else:
    _log(f"Downloading STRING aliases (systematic identifiers) to {FN_STRING_ALIASES}")
    string.download_string_aliases(FN_STRING_ALIASES, organismal_species=SPECIES)

# Step 3: combine both downloads into a validated, pickled sbml_dfs.
if utils.path_exists(FN_STRING_SBML_DFS) and not OVERWRITE:
    _log_skipped(FN_STRING_SBML_DFS)
else:
    _log(f"Combining interactions and aliases to create the STRING sbml_dfs at {FN_STRING_SBML_DFS}")

    sbml_dfs = string.convert_string_to_sbml_dfs(
        FN_STRING_INTERACTIONS, FN_STRING_ALIASES, organismal_species=SPECIES
    )
    # Validate before persisting so an invalid model is never cached.
    sbml_dfs.validate()

    utils.save_pickle(FN_STRING_SBML_DFS, sbml_dfs)

INFO:__main__:napistu_data/string_interactions.txt exists or overwrite = False
INFO:__main__:napistu_data/string_aliases.txt exists or overwrite = False
INFO:__main__:napistu_data/string_sbml_dfs.pickle exists or overwrite = False


## IntAct

In [5]:
#| label: intact

# Step 1: download the IntAct PSI-MI XML files (skipped when cached).
if utils.path_exists(FN_INTACT_DIR) and not OVERWRITE:
    _log_skipped(FN_INTACT_DIR)
else:
    _log(f"Downloading IntAct PSI-MI XMLs to {FN_INTACT_DIR}")
    intact.download_intact_xmls(FN_INTACT_DIR, organismal_species=SPECIES)

# Step 2: format + aggregate the PSI-MI files, then build the sbml_dfs.
if utils.path_exists(FN_INTACT_SBML_DFS) and not OVERWRITE:
    _log_skipped(FN_INTACT_SBML_DFS)
else:
    _log(f"Aggregating PSI-MI interactions and formatting results as an sbml_dfs at {FN_INTACT_SBML_DFS}")

    formatted_psi_mis = psi_mi.format_psi_mis(FN_INTACT_DIR)
    intact_summaries = psi_mi.aggregate_psi_mis(formatted_psi_mis)

    sbml_dfs = intact.intact_to_sbml_dfs(intact_summaries, organismal_species=SPECIES)
    # Validate before persisting so an invalid model is never cached.
    sbml_dfs.validate()

    utils.save_pickle(FN_INTACT_SBML_DFS, sbml_dfs)

INFO:__main__:napistu_data/intact exists or overwrite = False
INFO:__main__:napistu_data/intact_sbml_dfs.pickle exists or overwrite = False


# Species-Biased Sources

These sources cover multiple species, but each one focuses primarily on a single species or a small number of species.

## Reactome

In [6]:
#| label: reactome

# Step 1: download and unpack the Reactome .sbml files.
# Uses utils.path_exists (like every other cell, including the IntAct
# directory check) rather than os.path.isdir, so the existence test is
# consistent across the notebook.
# NOTE(review): presumably utils.path_exists also handles non-local URIs,
# which os.path.isdir would always report as missing — confirm.
if not utils.path_exists(FN_REACTOME_SBML_DIR) or OVERWRITE:
    _log(f"Downloading the Reactome pan-species tar-ball and unpacking to a directory of .sbml files at {FN_REACTOME_SBML_DIR}")
    reactome.reactome_sbml_download(FN_REACTOME_SBML_DIR, overwrite=OVERWRITE)
else:
    _log_skipped(FN_REACTOME_SBML_DIR)

# Step 2: merge the per-pathway .sbml files into a single sbml_dfs model.
if not utils.path_exists(FN_REACTOME_SBML_DFS) or OVERWRITE:
    _log(f"Merging Reactome .sbml files into an sbml_dfs model at {FN_REACTOME_SBML_DFS}")

    # The pathway index is read from the download directory.
    pw_index_uri = os.path.join(FN_REACTOME_SBML_DIR, "pw_index.tsv")
    sbml_dfs = reactome.construct_reactome_consensus(
        pw_index_uri, organismal_species=SPECIES, strict=False
    )
    # Validate before persisting so an invalid model is never cached.
    sbml_dfs.validate()

    utils.save_pickle(FN_REACTOME_SBML_DFS, sbml_dfs)
else:
    _log_skipped(FN_REACTOME_SBML_DFS)
    

INFO:__main__:napistu_data/reactome_sbmls exists or overwrite = False
INFO:__main__:napistu_data/reactome_sbml_dfs.pickle exists or overwrite = False


## BiGG (metabolic models)

In [7]:
#| label: bigg

# Step 1: download the BiGG metabolic models.
# Uses utils.path_exists (like every other cell, including the IntAct
# directory check) rather than os.path.isdir, so the existence test is
# consistent across the notebook.
# NOTE(review): presumably utils.path_exists also handles non-local URIs,
# which os.path.isdir would always report as missing — confirm.
if not utils.path_exists(FN_BIGG_SBML_DIR) or OVERWRITE:
    _log(f"Download multiple BiGG metabolic models to {FN_BIGG_SBML_DIR}")
    bigg.bigg_sbml_download(FN_BIGG_SBML_DIR, overwrite=OVERWRITE)
else:
    _log_skipped(FN_BIGG_SBML_DIR)


# Step 2: build the sbml_dfs model for the configured species.
if not utils.path_exists(FN_BIGG_SBML_DFS) or OVERWRITE:
    _log(f"Formatting a BiGG .sbml model as a sbml_dfs model at {FN_BIGG_SBML_DFS}")

    # The pathway index is read from the download directory.
    pw_index_uri = os.path.join(FN_BIGG_SBML_DIR, "pw_index.tsv")
    sbml_dfs = bigg.construct_bigg_consensus(pw_index_uri, organismal_species=SPECIES)
    # Validate before persisting so an invalid model is never cached.
    sbml_dfs.validate()

    utils.save_pickle(FN_BIGG_SBML_DFS, sbml_dfs)
else:
    _log_skipped(FN_BIGG_SBML_DFS)


INFO:__main__:napistu_data/bigg_sbmls exists or overwrite = False
INFO:__main__:napistu_data/bigg_sbml_dfs.pickle exists or overwrite = False


## OmniPath - human, mouse, rat

In [8]:
#| label: omnipath

# OmniPath has no separate raw artifact: fetch and format happen in one call.
if utils.path_exists(FN_OMNIPATH_SBML_DFS) and not OVERWRITE:
    _log_skipped(FN_OMNIPATH_SBML_DFS)
else:
    _log(f"Downloading OmniPath data and formatting interactions as an sbml_dfs model at {FN_OMNIPATH_SBML_DFS}")

    omnipath_options = dict(
        organismal_species=SPECIES,
        preferred_method="bioconductor",
        allow_fallback=True,
    )
    sbml_dfs = omnipath.format_omnipath_as_sbml_dfs(**omnipath_options)
    # Validate before persisting so an invalid model is never cached.
    sbml_dfs.validate()

    utils.save_pickle(FN_OMNIPATH_SBML_DFS, sbml_dfs)


INFO:__main__:napistu_data/omnipath_sbml_dfs.pickle exists or overwrite = False


# Species-Specific Sources

## TRRUST - human

In [9]:
#| label: trrust 

# Step 1: raw TRRUST download (skipped when cached and not overwriting).
if utils.path_exists(FN_TRRUST_RAW) and not OVERWRITE:
    _log_skipped(FN_TRRUST_RAW)
else:
    _log(f"Downloading TRRUST to {FN_TRRUST_RAW}")
    trrust.download_trrust(FN_TRRUST_RAW)


# Step 2: convert the raw file into a validated, pickled sbml_dfs.
if utils.path_exists(FN_TRRUST_SBML_DFS) and not OVERWRITE:
    _log_skipped(FN_TRRUST_SBML_DFS)
else:
    _log(f"Processing TRRUST as sbml_dfs at {FN_TRRUST_SBML_DFS}")
    sbml_dfs = trrust.convert_trrust_to_sbml_dfs(FN_TRRUST_RAW)
    # Validate before persisting so an invalid model is never cached.
    sbml_dfs.validate()

    utils.save_pickle(FN_TRRUST_SBML_DFS, sbml_dfs)


INFO:__main__:napistu_data/trrust.csv exists or overwrite = False
INFO:__main__:napistu_data/trrust_sbml_dfs.pickle exists or overwrite = False


## Reactome Functional Interactions - human

In [10]:
#| label: reactome_fi 

# Step 1: raw Reactome FI download (skipped when cached and not overwriting).
if utils.path_exists(FN_REACTOME_FI_RAW) and not OVERWRITE:
    _log_skipped(FN_REACTOME_FI_RAW)
else:
    _log(f"Downloading Reactome FI to {FN_REACTOME_FI_RAW}")
    reactome_fi.download_reactome_fi(FN_REACTOME_FI_RAW)

# Step 2: read the tab-separated table and convert it to an sbml_dfs.
if utils.path_exists(FN_REACTOME_FI_SBML_DFS) and not OVERWRITE:
    _log_skipped(FN_REACTOME_FI_SBML_DFS)
else:
    _log(f"Processing Reactome FI as sbml_dfs at {FN_REACTOME_FI_SBML_DFS}")

    interactions = pd.read_csv(FN_REACTOME_FI_RAW, sep="\t")
    sbml_dfs = reactome_fi.convert_reactome_fi_to_sbml_dfs(interactions)
    # Validate before persisting so an invalid model is never cached.
    sbml_dfs.validate()

    utils.save_pickle(FN_REACTOME_FI_SBML_DFS, sbml_dfs)


INFO:__main__:napistu_data/reactome_fi.tsv exists or overwrite = False
INFO:__main__:napistu_data/reactome_fi_sbml_dfs.pickle exists or overwrite = False


## IDEA - yeast

In [11]:
#| label: idea_yeast 

# Step 1: raw IDEA download (skipped when cached and not overwriting).
if utils.path_exists(FN_IDEA_YEAST_RAW) and not OVERWRITE:
    _log_skipped(FN_IDEA_YEAST_RAW)
else:
    _log(f"Downloading the yeast Induction Dynamics Expression Atlas to {FN_IDEA_YEAST_RAW}")
    idea_yeast.download_idea(FN_IDEA_YEAST_RAW)

# Step 2: read the tab-separated kinetics table and convert it to an sbml_dfs.
if utils.path_exists(FN_IDEA_YEAST_SBML_DFS) and not OVERWRITE:
    _log_skipped(FN_IDEA_YEAST_SBML_DFS)
else:
    _log(f"Processing the yeast Induction Dynamics Expression Atlas as sbml_dfs at {FN_IDEA_YEAST_SBML_DFS}")

    idea_kinetics = pd.read_csv(FN_IDEA_YEAST_RAW, sep="\t")
    sbml_dfs = idea_yeast.convert_idea_kinetics_to_sbml_dfs(idea_kinetics)
    # Validate before persisting so an invalid model is never cached.
    sbml_dfs.validate()

    utils.save_pickle(FN_IDEA_YEAST_SBML_DFS, sbml_dfs)


INFO:__main__:napistu_data/idea_yeast.tsv exists or overwrite = False
INFO:__main__:
Processing the yeast Induction Dynamics Expression Atlas as sbml_dfs at napistu_data/idea_yeast_sbml_dfs.pickle

  .apply(_summarize_idea_pairs)
INFO:napistu.sbml_dfs_utils:Saving 6 extra reaction columns as reaction_data: ['v_inter', 'v_final', 't_rise', 't_fall', 'rate', 'directness']
