---
title: Tutorial - Downloading and Formatting Pathway Data
author: "Shackett"
date: "May 9th 2025"
---

This notebook provides examples for downloading pathway information from a variety of data sources and formatting the results as `sbml_dfs` objects. Since it is often not possible to download just a subset of a data source for demonstration purposes, this notebook will download and cache raw and intermediate representations of each data source.

In [3]:
#| label: config
import logging

# Use the root logger and set its threshold to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

import tutorial_utils
# Build the tutorial configuration from "config.yaml". The second argument
# presumably selects this notebook's section of the config — TODO confirm.
# Later cells read `config.overwrite`, `config.species` and `config.artifacts`.
config = tutorial_utils.NapistuConfig("config.yaml", "downloading_pathway_data")

In [4]:
#| label: environment
import os
import pickle

from napistu import utils
from napistu.ingestion import bigg
from napistu.ingestion import psi_mi
from napistu.ingestion import reactome
from napistu.ingestion import string
from napistu.ingestion import trrust

def _log(text: str):
    """Log ``text`` at INFO level with a banner line above and below it."""
    rule = "====================================\n"
    body = f"{text}\n"
    logger.info(f"\n{rule}{body}{rule}")

def _log_skipped(uri: str):
    """Log that the step producing ``uri`` was skipped.

    Callers in this notebook reach this branch only when the artifact is
    already present AND ``OVERWRITE`` is falsy, so the message joins the two
    conditions with "and" (the previous wording said "or").

    Parameters
    ----------
    uri : str
        Path of the cached artifact whose step was skipped.
    """
    logger.info(f"{uri} exists and overwrite = False")

In [5]:
#| label: globals
# Run-wide switches read from the tutorial config.
OVERWRITE = config.overwrite
SPECIES = config.species

# Paths of the cached artifacts, keyed by name in the config.
_artifacts = config.artifacts

# BiGG
FN_BIGG_SBML_DIR = _artifacts["bigg_sbml_dir"]
FN_BIGG_SBML_DFS = _artifacts["bigg_sbml_dfs"]
# IntAct
FN_INTACT_DIR = _artifacts["intact_dir"]
FN_INTACT_SBML_DFS = _artifacts["intact_sbml_dfs"]
# Reactome
FN_REACTOME_SBML_DIR = _artifacts["reactome_sbml_dir"]
FN_REACTOME_SBML_DFS = _artifacts["reactome_sbml_dfs"]
# STRING
FN_STRING_ALIASES = _artifacts["string_aliases"]
FN_STRING_INTERACTIONS = _artifacts["string_interactions"]
FN_STRING_SBML_DFS = _artifacts["string_sbml_dfs"]
# TRRUST
FN_TRRUST_RAW = _artifacts["trrust_raw"]
FN_TRRUST_SBML_DFS = _artifacts["trrust_sbml_dfs"]

# Species Agnostic Sources

These sources include pathway information which can be broadly applied to a range of species. 

## STRING

In [4]:
#| label: string

# Each step is skipped when its output is already cached and OVERWRITE is off.

# Raw STRING interactions.
if utils.path_exists(FN_STRING_INTERACTIONS) and not OVERWRITE:
    _log_skipped(FN_STRING_INTERACTIONS)
else:
    _log(f"Downloading STRING interactions to {FN_STRING_INTERACTIONS}")
    string.download_string(FN_STRING_INTERACTIONS, species=SPECIES)

# STRING aliases (systematic identifiers).
if utils.path_exists(FN_STRING_ALIASES) and not OVERWRITE:
    _log_skipped(FN_STRING_ALIASES)
else:
    _log(f"Downloading STRING aliases (systematic identifiers) to {FN_STRING_ALIASES}")
    string.download_string_aliases(FN_STRING_ALIASES, species=SPECIES)

# Combine both downloads into a validated sbml_dfs and pickle it.
if utils.path_exists(FN_STRING_SBML_DFS) and not OVERWRITE:
    _log_skipped(FN_STRING_SBML_DFS)
else:
    _log(f"Combining interactions and aliases to create the STRING sbml_dfs at {FN_STRING_SBML_DFS}")

    sbml_dfs = string.convert_string_to_sbml_dfs(FN_STRING_INTERACTIONS, FN_STRING_ALIASES)
    sbml_dfs.validate()

    utils.save_pickle(FN_STRING_SBML_DFS, sbml_dfs)

INFO:root:napistu_data/string_interactions.txt exists or overwrite = False
INFO:root:napistu_data/string_aliases.txt exists or overwrite = False
INFO:root:napistu_data/string_sbml_dfs.pickle exists or overwrite = False


# Species-Biased Sources

These sources inform multiple species, but their focal point is a single species, which has additional types of data.

## Reactome

In [5]:
#| label: reactome

# Each step is skipped when its output is already cached and OVERWRITE is off.

# Directory of Reactome .sbml files.
if os.path.isdir(FN_REACTOME_SBML_DIR) and not OVERWRITE:
    _log_skipped(FN_REACTOME_SBML_DIR)
else:
    _log(f"Download the Reactome pan-species tar-ball and unpack to a directory of .sbml files at {FN_REACTOME_SBML_DIR}")
    reactome.reactome_sbml_download(FN_REACTOME_SBML_DIR, overwrite=OVERWRITE)

# Consensus sbml_dfs built from the pathway index in that directory.
if utils.path_exists(FN_REACTOME_SBML_DFS) and not OVERWRITE:
    _log_skipped(FN_REACTOME_SBML_DFS)
else:
    _log(f"Merging Reactome .sbml files into an sbml_dfs model at {FN_REACTOME_SBML_DFS}")

    pw_index_uri = os.path.join(FN_REACTOME_SBML_DIR, "pw_index.tsv")
    sbml_dfs = reactome.construct_reactome_consensus(
        pw_index_uri,
        species=SPECIES,
        strict=False,
    )
    sbml_dfs.validate()

    utils.save_pickle(FN_REACTOME_SBML_DFS, sbml_dfs)

INFO:root:napistu_data/reactome_sbmls exists or overwrite = False
INFO:root:napistu_data/reactome_sbml_dfs.pickle exists or overwrite = False


## BiGG (metabolic models)

In [6]:
#| label: bigg

# Each step is skipped when its output is already cached and OVERWRITE is off.

# Directory of BiGG .sbml models.
if os.path.isdir(FN_BIGG_SBML_DIR) and not OVERWRITE:
    _log_skipped(FN_BIGG_SBML_DIR)
else:
    _log(f"Download multiple BiGG metabolic models to {FN_BIGG_SBML_DIR}")
    bigg.bigg_sbml_download(FN_BIGG_SBML_DIR, overwrite=OVERWRITE)

# Consensus sbml_dfs built from the pathway index in that directory.
if utils.path_exists(FN_BIGG_SBML_DFS) and not OVERWRITE:
    _log_skipped(FN_BIGG_SBML_DFS)
else:
    _log(f"Formatting a BiGG .sbml model as a sbml_dfs model at {FN_BIGG_SBML_DFS}")

    pw_index_uri = os.path.join(FN_BIGG_SBML_DIR, "pw_index.tsv")
    sbml_dfs = bigg.construct_bigg_consensus(pw_index_uri, species=SPECIES)
    sbml_dfs.validate()

    utils.save_pickle(FN_BIGG_SBML_DFS, sbml_dfs)

INFO:root:
Download multiple BiGG metabolic models to napistu_data/bigg_sbmls

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): bigg.ucsd.edu:80
DEBUG:urllib3.connectionpool:http://bigg.ucsd.edu:80 "GET /static/models/Recon3D.xml HTTP/1.1" 200 1240856
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): bigg.ucsd.edu:80
DEBUG:urllib3.connectionpool:http://bigg.ucsd.edu:80 "GET /static/models/iMM1415.xml HTTP/1.1" 200 662517
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): bigg.ucsd.edu:80
DEBUG:urllib3.connectionpool:http://bigg.ucsd.edu:80 "GET /static/models/iMM904.xml HTTP/1.1" 200 310805
INFO:root:
Formatting a BiGG .sbml model as a sbml_dfs model at napistu_data/bigg_sbml_dfs.pickle

  0%|          | 0/1 [00:00<?, ?it/s]INFO:napistu.consensus:processing recon3D
INFO:napistu.utils:creating an edgelist linking index levels s_id, entry and linking it to levels defined by ontology, identifier
DEBUG:napistu.utils:label is not defined in table_schem

# Species-Specific Sources

## TRRUST

In [9]:
#| label: trrust 

# Each step is skipped when its output is already cached and OVERWRITE is off.

# Raw TRRUST download.
if utils.path_exists(FN_TRRUST_RAW) and not OVERWRITE:
    _log_skipped(FN_TRRUST_RAW)
else:
    _log(f"Downloading TRRUST to {FN_TRRUST_RAW}")
    trrust.download_trrust(FN_TRRUST_RAW)

# Convert the raw file into a validated sbml_dfs and pickle it.
if utils.path_exists(FN_TRRUST_SBML_DFS) and not OVERWRITE:
    _log_skipped(FN_TRRUST_SBML_DFS)
else:
    _log(f"Processing TRRUST as sbml_dfs at {FN_TRRUST_SBML_DFS}")
    sbml_dfs = trrust.convert_trrust_to_sbml_dfs(FN_TRRUST_RAW)
    sbml_dfs.validate()

    utils.save_pickle(FN_TRRUST_SBML_DFS, sbml_dfs)

INFO:root:napistu_data/trrust.csv exists or overwrite = False
INFO:root:
Processing TRRUST as sbml_dfs at napistu_data/trrust_sbml_dfs.pickle



1724219e is not a valid pubmed id, it did not match the regex: ^[0-9]+$ returning None


## IDEA