In [1]:
# Install the package in development mode if needed
# !pip install -e '.[mcp]'

import asyncio
import os
import sys
import logging
from pathlib import Path
import json

# Import the MCP components
from napistu.mcp.server import create_server, start_server
from napistu.mcp import documentation, codebase, tutorials, execution

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("napistu")

# Helper function to run async code in Jupyter
async def run_async(coro):
    return await coro

# Create a dummy session context for execution components
session_context = {}
object_registry = {}

In [2]:
from napistu.mcp.profiles import get_profile
# define the types of assets to load
profile = get_profile("full")

In [3]:
# register the relevant components
mcp_server = create_server(profile)
# initialize the relevant components
live_server = start_server(mcp_server)

In [4]:
from napistu.mcp import documentation_utils
from napistu.mcp.constants import READMES
from napistu.mcp.constants import NAPISTU_PY_READTHEDOCS
from napistu.mcp.constants import NAPISTU_PY_READTHEDOCS_API

In [5]:
# readmes
readme = await run_async(documentation_utils.load_readme_content(READMES["napistu"]))
readme

INFO:httpx:HTTP Request: GET https://raw.githubusercontent.com/napistu/napistu/main/README.md "HTTP/1.1 200 OK"


"# Napistu\n\nThe Napistu project is an approach for creating and working with genome-scale mechanistic networks. Pathways of interest can be created from multiple sources (e.g., Reactome, STRING, TRRUST), aggregated across sources, and refined to add additional information. This pathway representation can then be turned into a graphical network to identify molecular neighborhoods, find paths between molecules, and to carryout network propagation.\n\nNapistu is an active project which we hope will be used for both simple analyses (e.g., basically replacing GSEA) as well as more complex analyses (e.g., multimodal data integration). \n\nWith Napistu you can:\n\n- Represent a range of publicly-available data sources using a common data structure, `sbml_dfs`, which is meant to faithfully encode molecular biology and biochemistry.\n- Aggregate complementary sources into a consensus model which allows high-quality but incomplete interactions to be supported by data sources which more compreh

In [6]:
# to scrape read the docs, currently just focusing on the API section
# 1. read the package overview to find the urls of the subpackages
# 2. for each subpackage list modules
# 3. for the main package list its modules dropping the entries for subpackages
# 4. for each module scrape its page



In [38]:
import copy

from bs4 import BeautifulSoup

from napistu.mcp.documentation_utils import _process_rtd_package_toc
from napistu.mcp.documentation_utils import parse_rtd_module_page

from types import SimpleNamespace
PACKAGE_DEFS = SimpleNamespace(
    NAPISTU = "napistu",
)

def _prune_modules_dict(modules_dict: dict) -> dict:

    """Filter the module_dict to remove links to subpackages since they are separately handled."""

    invalid_links = list()
    for k in modules_dict[PACKAGE_DEFS.NAPISTU].keys():

        if f"{PACKAGE_DEFS.NAPISTU}.{k}" in modules_dict.keys():
            invalid_links.append(k)

    refined_modules_dict = copy.deepcopy(modules_dict)
    for k in invalid_links:
        del refined_modules_dict[PACKAGE_DEFS.NAPISTU][k]

    return refined_modules_dict

def read_read_the_docs(package_url: str, api_url: str) -> dict:

    # load top-level package TOC
    packages_dict = await run_async(_process_rtd_package_toc(NAPISTU_PY_READTHEDOCS_API))

    # load subpackage TOCs
    modules_dict = {}
    for package_name, package_url in packages_dict.items():
        package_url = NAPISTU_PY_READTHEDOCS + package_url
        modules_dict[package_name] = await run_async(_process_rtd_package_toc(package_url))

    # drop links for subpackages from the main package's module list
    pruned_models_dict = _prune_modules_dict(modules_dict)

    # loop through module defs

    rtd_docs_dict = dict()
    for package in pruned_models_dict.keys():
        
        rtd_docs_dict[package] = dict()    
        for module in pruned_models_dict[package]:

            url = NAPISTU_PY_READTHEDOCS + "/generated" + pruned_models_dict[package][module]
            page_html = await documentation_utils.load_html_page(url)

            rtd_docs_dict[package][module] = parse_rtd_module_page(page_html, url)

    return rtd_docs_dict



INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/api.html "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.html#module-napistu "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.network.html#module-napistu.network "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.ingestion.html#module-napistu.ingestion "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.modify.html#module-napistu.modify "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.rpy2.html#module-napistu.rpy2 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.gcs.html#module-napistu.gcs "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.consensus.html#module-napistu.consensus "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.constants.html#module-napistu.constants "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.identifiers.html#module-napistu.identifiers "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.indices.html#module-napistu.indices "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.mechanism_matching.html#module-napistu.mechanism_matching "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.sbml_dfs_core.html#module-napistu.sbml_dfs_core "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://napistu.readthedocs.io/en/latest/generated/napistu.sbml_dfs_u

HTTPStatusError: Client error '404 Not Found' for url 'https://napistu.readthedocs.io/en/latest/generated/#napistu.rpy2.report_r_exceptions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404

In [None]:
import copy

from bs4 import BeautifulSoup

from napistu.mcp.documentation_utils import _process_rtd_package_toc
from napistu.mcp.documentation_utils import parse_rtd_module_page

from types import SimpleNamespace
PACKAGE_DEFS = SimpleNamespace(
    NAPISTU = "napistu",
)

def _prune_modules_dict(modules_dict: dict) -> dict:

    """Filter the module_dict to remove links to subpackages since they are separately handled."""

    invalid_links = list()
    for k in modules_dict[PACKAGE_DEFS.NAPISTU].keys():

        if f"{PACKAGE_DEFS.NAPISTU}.{k}" in modules_dict.keys():
            invalid_links.append(k)

    refined_modules_dict = copy.deepcopy(modules_dict)
    for k in invalid_links:
        del refined_modules_dict[PACKAGE_DEFS.NAPISTU][k]

    return refined_modules_dict


In [45]:
rtd_docs_dict

{'napistu': {'consensus': {'module': 'napistu.consensus',
   'url': 'https://napistu.readthedocs.io/en/latest/generated/napistu.consensus.html#module-napistu.consensus',
   'functions': {'_add_consensus_sources': {'name': '_add_consensus_sources',
     'signature': 'napistu.consensus._add_consensus_sources(new_id_table:DataFrame,agg_table_harmonized:DataFrame,lookup_table:Series,table_schema:dict,pw_index:PWIndex|None)→DataFrame\uf0c1',
     'id': 'napistu.consensus._add_consensus_sources',
     'doc': 'Add source information to the consensus table. Parameters: \uf0c1 new_id_table: pd.DataFrame Consensus table without source information agg_table_harmonized: pd.DataFrame Original table with cluster assignments lookup_table: pd.Series Maps old IDs to new consensus IDs table_schema: dict Schema for the table pw_index: indices.PWIndex | None An index of all tables being aggregated Returns: \uf0c1 pd.DataFrame Consensus table with source information added'},
    '_add_entity_data': {'name'

In [151]:
from types import SimpleNamespace
PACKAGE_DEFS = SimpleNamespace(
    NAPISTU = "napistu",
)
package_name = "napistu"
module_name = "sbml_dfs_core"
if package_name is not PACKAGE_DEFS.NAPISTU:
    package_name = f"{PACKAGE_DEFS.NAPISTU}.{package_name}"

url = NAPISTU_PY_READTHEDOCS + "/generated" + modules_dict[package_name][module_name]
css_selector = _create_module_css_selector(package_name="modify", module_name="curation", modules_dict=modules_dict)

In [None]:
page_html = await documentation_utils.load_html_page(url)



In [None]:
from bs4 import BeautifulSoup, Tag

def extract_function_blocks(soup: BeautifulSoup) -> list[Tag]:
    """
    Extract all function definition blocks (<dl class="py function">) from the soup.
    """
    return soup.find_all("dl", class_="py function")

def extract_class_blocks(soup: BeautifulSoup) -> list[Tag]:
    """
    Extract all class definition blocks (<dl class="py class">) from the soup.
    """
    return soup.find_all("dl", class_="py class")

def parse_function_block(dl: Tag) -> dict:
    """
    Parse a <dl class="py function"> block into a dict with name, signature, summary, and docstring.
    """
    result = {}
    dt = dl.find("dt")
    if dt:
        # Function name
        name_span = dt.find("span", class_="sig-name descname")
        result["name"] = name_span.text if name_span else ""
        # Signature
        sig = dt.get_text(" ", strip=True)
        result["signature"] = sig
        # Doc URL (from id or headerlink)
        result["doc_url"] = "#" + dt.get("id", "")
    # Summary/Docstring
    dd = dl.find("dd")
    if dd:
        summary = dd.find("p")
        result["summary"] = summary.text if summary else ""
        # Optionally, get full docstring
        result["docstring"] = dd.get_text(" ", strip=True)
    return result

def parse_class_block(dl: Tag) -> dict:
    """
    Parse a <dl class="py class"> block into a dict with name, summary, and docstring.
    """
    result = {}
    dt = dl.find("dt")
    if dt:
        name_span = dt.find("span", class_="sig-name descname")
        result["name"] = name_span.text if name_span else ""
        sig = dt.get_text(" ", strip=True)
        result["signature"] = sig
        result["doc_url"] = "#" + dt.get("id", "")
    dd = dl.find("dd")
    if dd:
        summary = dd.find("p")
        result["summary"] = summary.text if summary else ""
        result["docstring"] = dd.get_text(" ", strip=True)
    return result

def parse_rtd_module_page(html: str, url: str = None) -> dict:
    soup = BeautifulSoup(html, "html.parser")
    result = {
        "module": None,
        "url": url,
        "functions": [],
        "classes": []
    }

    # Get module name from <h1>
    h1 = soup.find("h1")
    if h1:
        # Remove headerlink icon if present
        module_name = h1.get_text(strip=True).replace("\uf0c1", "").strip()
        result["module"] = module_name

    # Parse top-level functions
    for func_dl in soup.find_all("dl", class_="py function"):
        func = {}
        sig = func_dl.find("dt")
        if sig:
            func["name"] = sig.find("span", class_="sig-name").get_text(strip=True)
            func["signature"] = sig.get_text(strip=True)
            func["id"] = sig.get("id")
        doc = func_dl.find("dd")
        if doc:
            func["doc"] = doc.get_text(" ", strip=True)
        result["functions"].append(func)

    # Parse classes and their methods/attributes
    for class_dl in soup.find_all("dl", class_="py class"):
        cls = {
            "name": None,
            "signature": None,
            "id": None,
            "doc": None,
            "methods": [],
            "attributes": []
        }
        sig = class_dl.find("dt")
        if sig:
            cls["name"] = sig.find("span", class_="sig-name").get_text(strip=True)
            cls["signature"] = sig.get_text(strip=True)
            cls["id"] = sig.get("id")
        doc = class_dl.find("dd")
        if doc:
            cls["doc"] = doc.get_text(" ", strip=True)
            # Methods
            for meth_dl in doc.find_all("dl", class_="py method"):
                meth = {}
                meth_sig = meth_dl.find("dt")
                if meth_sig:
                    meth["name"] = meth_sig.find("span", class_="sig-name").get_text(strip=True)
                    meth["signature"] = meth_sig.get_text(strip=True)
                    meth["id"] = meth_sig.get("id")
                meth_doc = meth_dl.find("dd")
                if meth_doc:
                    meth["doc"] = meth_doc.get_text(" ", strip=True)
                cls["methods"].append(meth)
            # Attributes
            for attr_dl in doc.find_all("dl", class_="py attribute"):
                attr = {}
                attr_sig = attr_dl.find("dt")
                if attr_sig:
                    attr["name"] = attr_sig.find("span", class_="sig-name").get_text(strip=True)
                    attr["signature"] = attr_sig.get_text(strip=True)
                    attr["id"] = attr_sig.get("id")
                attr_doc = attr_dl.find("dd")
                if attr_doc:
                    attr["doc"] = attr_doc.get_text(" ", strip=True)
                cls["attributes"].append(attr)
        result["classes"].append(cls)

    return result

x = parse_rtd_module_page(page_html)
x["classes"]

In [None]:
page_html

In [None]:
selected