In [13]:
import logging
from typing import Iterable, Dict
import pandas as pd
from cap_client.cap import CapClient as Cap  # from cellannotation/cap-python-client

logger = logging.getLogger(__name__)  # logger used by cap_human_cell_labels for progress messages


def cap_human_cell_labels(limit: int = 10000) -> Iterable[str]:
    """Fetch the distinct ontology terms of published CAP cell labels for Homo sapiens.

    Parameters
    ----------
    limit : int, default=10000
        Maximum number of labels to request; forwarded unchanged as the
        ``limit`` argument of ``CapClient.search_cell_labels``.

    Returns
    -------
    iterable of str
        The distinct values of the ``ontology_term`` field of the search
        result, exactly as returned by its ``.unique()`` method. No other
        field of the response is returned.
        NOTE(review): presumably a ``numpy.ndarray`` of term strings if the
        client returns a pandas DataFrame -- confirm against the client.

    Notes
    -----
    Data source
        Cell Annotation Platform (CAP) GraphQL API via `cap-python-client`.
        CAP publishes HCA community annotations with Cell Ontology IDs, parent
        categories, marker-gene evidence, and downloadable AnnData/CAP-JSON
        files. See the CAP docs
        (https://www.celltype.info/docs/downloading-datasets) and the client
        README (https://github.com/cellannotation/cap-python-client).
    """
    cap = Cap()  # uses public endpoints; set CAP_LOGIN/CAP_PWD or CAP_TOKEN if needed
    logger.info("Querying CAP for Homo sapiens cell labels")
    resp = cap.search_cell_labels(organism=["Homo sapiens"], limit=limit)
    # Only the distinct ontology terms are of interest to callers.
    return resp["ontology_term"].unique()

In [14]:
# Distinct ontology terms of the human CAP cell labels (queries CAP with the default limit).
ct = cap_human_cell_labels()

In [19]:
import io
import logging
from typing import List, Optional
import pandas as pd
import requests

# Logger used by load_hgnc_symbols for progress messages.
logger = logging.getLogger(__name__)

# Download URL of the HGNC "complete set" TSV that load_hgnc_symbols reads.
HGNC_TSV = "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt"


def load_hgnc_symbols(locus_types: Optional[Iterable[str]] = None) -> List[str]:
    """Return HGNC approved human gene symbols from the current *hgnc_complete_set*.

    Parameters
    ----------
    locus_types : iterable of str, optional
        Filter by HGNC ``locus_type`` values, e.g. ``["gene with protein product"]``,
        ``["RNA, long non-coding"]``. A single string is accepted and treated
        as one locus type. If ``None`` (or empty), return all approved symbols.

    Returns
    -------
    list of str
        Unique HGNC-approved symbols, in order of first appearance in the file.

    Raises
    ------
    requests.HTTPError
        If the download responds with an HTTP error status.

    Notes
    -----
    Data source
        HGNC "hgnc_complete_set" TSV hosted on Google Cloud Storage.
        Files are updated on Tuesdays and Fridays. See the HGNC downloads page
        (https://www.genenames.org/download/) and the archive/help page
        (https://www.genenames.org/download/archive/).
    """
    # A bare string is itself an iterable of characters; without this guard
    # ``list("gene with protein product")`` would filter on single letters
    # and silently return no symbols.
    if isinstance(locus_types, str):
        locus_types = [locus_types] if locus_types else []

    logger.info("Downloading HGNC complete set from %s", HGNC_TSV)
    r = requests.get(HGNC_TSV, timeout=120)
    r.raise_for_status()
    # Read every column as text so identifiers are never coerced to numbers.
    df = pd.read_csv(io.StringIO(r.text), sep="\t", dtype=str)
    df = df[df["status"] == "Approved"]
    if locus_types:
        df = df[df["locus_type"].isin(list(locus_types))]
    symbols = df["symbol"].dropna().unique().tolist()
    logger.info("Collected %d HGNC symbols", len(symbols))
    return symbols

In [20]:
# All approved HGNC symbols (no locus-type filter); downloads the complete set.
genes = load_hgnc_symbols()

In [25]:
import json
import logging
from typing import List, Optional, Iterable
import pandas as pd

logger = logging.getLogger(__name__)

# OBO Foundry PURL of the MONDO release in OBO-Graphs JSON; default source for load_mondo_table.
MONDO_JSON = "http://purl.obolibrary.org/obo/mondo.json"


def _obo_id_to_curie(identifier: Optional[str]) -> Optional[str]:
    """Turn an OBO PURL such as ``.../obo/MONDO_0005148`` into ``MONDO:0005148``.

    Anything that is not an OBO PURL of the form ``<prefix>_<local id>``
    (including values that already are CURIEs, and ``None``) is returned
    unchanged.
    """
    obo_prefix = "http://purl.obolibrary.org/obo/"
    if not identifier or not identifier.startswith(obo_prefix):
        return identifier
    namespace, sep, local_id = identifier[len(obo_prefix):].partition("_")
    if sep and namespace.isalnum() and local_id.isalnum():
        return f"{namespace}:{local_id}"
    return identifier


def load_mondo_table(url: str = MONDO_JSON) -> pd.DataFrame:
    """Load MONDO OBO-Graphs JSON and return a tidy disease table.

    Parameters
    ----------
    url : str, optional
        URL to an OBO-Graphs JSON MONDO release (default is the PURL).

    Returns
    -------
    pandas.DataFrame
        One row per node of type ``CLASS`` whose identifier is a MONDO term.
        Columns (always present, even when no rows are found):
        ``mondo_id`` (e.g. ``MONDO:0005148``),
        ``label``,
        ``definition`` (text, or ``None`` when absent),
        ``synonyms`` (list[str]),
        ``xrefs`` (list[str]),
        ``parents`` (list[str] of CURIEs taken from subclass edges).

    Raises
    ------
    requests.HTTPError
        If the download responds with an HTTP error status.
    ValueError
        If the document contains no ``graphs`` entry.

    Notes
    -----
    **Data source.** MONDO Disease Ontology release in OBO-Graphs JSON,
    obtained via OBO Foundry PURL. MONDO harmonizes disease terms and
    curates precise cross-ontology mappings (OMIM, Orphanet, DOID, ICD-11,
    NCIt, EFO, etc.). See https://obofoundry.org/ontology/mondo.html,
    https://bioregistry.io/mondo and https://ontobee.org/ontology/mondo.

    **Identifiers.** OBO-Graphs JSON typically identifies nodes and edge
    endpoints by full IRI (``http://purl.obolibrary.org/obo/MONDO_0005148``),
    so identifiers are normalized to CURIEs before filtering; inputs that
    already use CURIEs are handled as well. Only the first graph of the
    document is read.
    """
    logger.info("Downloading MONDO JSON from %s", url)
    r = requests.get(url, timeout=300)
    r.raise_for_status()
    data = r.json()

    graphs = data.get("graphs", [])
    if not graphs:
        raise ValueError("No graphs found in MONDO JSON")
    nodes = graphs[0].get("nodes", []) or []
    edges = graphs[0].get("edges", []) or []

    # Predicates that denote a subclass (is_a) relationship, in the spellings
    # an OBO-Graphs document may use.
    subclass_preds = (
        "is_a",
        "rdfs:subClassOf",
        "http://www.w3.org/2000/01/rdf-schema#subClassOf",
    )

    # Build child -> parents map, keyed by the same normalized CURIEs that the
    # node loop below uses for the lookup.
    parents_map = {}
    for e in edges:
        if e.get("pred") not in subclass_preds:
            continue
        child = _obo_id_to_curie(e.get("sub"))
        parent = _obo_id_to_curie(e.get("obj"))
        if child and parent:
            parents_map.setdefault(child, []).append(parent)

    rows = []
    for n in nodes:
        if n.get("type") != "CLASS":
            continue
        curie = _obo_id_to_curie(n.get("id"))
        if not (curie and curie.startswith("MONDO:")):
            continue
        meta = n.get("meta") or {}

        # The definition is a single object of the form {"val": ..., ...}.
        d_meta = meta.get("definition")
        definition = d_meta.get("val") if isinstance(d_meta, dict) else None

        # Synonyms and xrefs are lists of objects; keep non-empty "val" entries.
        syns = [s.get("val") for s in meta.get("synonyms") or [] if s.get("val")]
        xrefs = [x.get("val") for x in meta.get("xrefs") or [] if x.get("val")]

        rows.append(
            {
                "mondo_id": curie,
                "label": n.get("lbl"),
                "definition": definition,
                "synonyms": syns,
                "xrefs": xrefs,
                "parents": parents_map.get(curie, []),
            }
        )

    # Passing the columns explicitly keeps the schema stable for empty results.
    df = pd.DataFrame(
        rows,
        columns=["mondo_id", "label", "definition", "synonyms", "xrefs", "parents"],
    )
    logger.info("Parsed %d MONDO classes", len(df))
    return df

In [32]:
# MONDO table built from the default PURL download.
df = load_mondo_table()

In [None]:
import logging
from typing import Any, List, Optional
import pandas as pd

# Logger used by fetch_mondo_via_ols for progress and warning messages.
logger = logging.getLogger(__name__)

# Base URL of the EMBL-EBI OLS4 REST API; fetch_mondo_via_ols appends the endpoint path to it.
OLS_BASE = "https://www.ebi.ac.uk/ols4/api"


def fetch_mondo_via_ols(
    size: int = 500, max_pages: Optional[int] = None
) -> pd.DataFrame:
    """Fetch MONDO disease terms from OLS4 with labels, definitions, synonyms, xrefs.

    Parameters
    ----------
    size : int, default=500
        Page size for pagination. OLS4 supports paging via ``page`` and ``size``;
        clients commonly use 500.
    max_pages : int, optional
        Stop after this many pages (for testing). If ``None``, iterate all.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``obo_id`` (first occurrence wins). Columns
        (always present, even when nothing was fetched):
        ``mondo_id`` (e.g. ``MONDO:0005148``), ``label``, ``definition``
        (first description string if present), ``synonyms`` (list[str]),
        ``xrefs`` (list[str]).

    Notes
    -----
    Data source
        EMBL-EBI **Ontology Lookup Service v4 (OLS4)**, ontology ``mondo``.
        OLS4 exposes MONDO with flattened fields:
        - definition text under ``description``
        - synonyms under ``synonyms``
        - db xrefs under ``annotation.hasDbXref``

        See OLS4 help/app note and discussion on field names. Some endpoints
        occasionally return HTTP 500; retry/backoff is recommended.

    Error handling
        This function does not retry. On an HTTP error status it logs a
        warning, stops paging, and returns the terms collected so far, so
        the result may be partial.

    References
    ----------
    OLS4 site and API: https://www.ebi.ac.uk/ols4/
    App note: https://arxiv.org/pdf/2501.13034
    Field-name discussion and intermittent 500 issues:
    https://github.com/EBISPOT/ols4/issues/49,
    https://github.com/EBISPOT/ols4/issues/837,
    https://github.com/EBISPOT/ols4/issues/418
    Python client docs: https://ahida-development.github.io/ols-py/
    """
    url = f"{OLS_BASE}/ontologies/mondo/terms"
    columns = ["mondo_id", "label", "definition", "synonyms", "xrefs"]
    rows: List[dict] = []
    page = 0
    while max_pages is None or page < max_pages:
        params = {"size": size, "page": page}
        try:
            r = requests.get(url, params=params, timeout=120)
            r.raise_for_status()
        except requests.HTTPError as e:
            # Best effort: keep what was fetched so far instead of failing.
            logger.warning("OLS4 request failed on page %d: %s", page, e)
            break
        data = r.json()
        terms = (data.get("_embedded") or {}).get("terms") or []
        for t in terms:
            # ``description`` may be a list of strings or a single value.
            desc = t.get("description")
            if isinstance(desc, list):
                definition = desc[0] if desc else None
            else:
                definition = desc
            # NOTE(review): the recorded notebook output shows empty xrefs for
            # every displayed row -- confirm that ``hasDbXref`` is the right key.
            xrefs = (t.get("annotation") or {}).get("hasDbXref") or []
            rows.append(
                {
                    "mondo_id": t.get("obo_id"),
                    "label": t.get("label"),
                    "definition": definition,
                    "synonyms": t.get("synonyms") or [],
                    "xrefs": xrefs,
                }
            )
        logger.debug("Fetched page %d with %d terms", page, len(terms))
        # Stop on the last page; a missing ``page`` block also ends the loop.
        pinfo = data.get("page") or {}
        if page >= (pinfo.get("totalPages", 0) - 1):
            break
        page += 1

    # Explicit columns keep drop_duplicates from raising KeyError when no rows
    # were collected (first request failed, or max_pages=0).
    df = (
        pd.DataFrame(rows, columns=columns)
        .drop_duplicates(subset=["mondo_id"])
        .reset_index(drop=True)
    )
    logger.info("Collected %d MONDO terms via OLS4", len(df))
    # Return the table promised by the signature and used by the following
    # cells (``df["label"]``), not just the list of labels.
    return df

In [41]:
# Fetch the MONDO terms from OLS4 with the default page size (one HTTP request per page).
df = fetch_mondo_via_ols()

In [44]:
# Sorted distinct term labels; expects df to be a DataFrame with a "label" column.
df["label"].sort_values().unique()

array(['(2R)-2-hydroxy monocarboxylic acid',
       '(2R)-2-hydroxy monocarboxylic acid anion',
       '(2R,3S)-2-aminooctadec-4-ene-1,3-diol', ..., 'zygote stage',
       'zymogen granule', 'zymogen granule membrane'], dtype=object)

In [45]:
df

Unnamed: 0,mondo_id,label,definition,synonyms,xrefs
0,MONDO:0016154,obsolete qualitative or quantitative defects o...,,[],[]
1,MONDO:0016155,qualitative or quantitative defects of protein...,,"[secondary alpha-dystroglycanopathy, secondary...",[]
2,MONDO:0016156,qualitative or quantitative defects of FKRP,,[],[]
3,MONDO:0016157,obsolete qualitative or quantitative defects o...,,[],[]
4,MONDO:0016158,narcolepsy-cataplexy syndrome,A type of narcolepsy characterized by excessiv...,"[Gelineau disease, Gelineau syndrome, Gelineau...",[]
...,...,...,...,...,...
56605,MONDO:0005072,neuroblastoma,"Neuroblastoma (NB) is the most common solid, e...","[neural Crest tumor, malignant, neuroblastoma,...",[]
56606,MONDO:0005073,melanocytic nevus,A neoplasm composed of melanocytes that usuall...,"[melanocytic Nevus, melanotic Nevus, mole, mol...",[]
56607,MONDO:0005074,papillary cystadenocarcinoma,A malignant cystic serous or mucinous epitheli...,"[cystadenocarcinoma, papillary, malignant, pap...",[]
56608,MONDO:0005075,thyroid gland papillary carcinoma,A differentiated adenocarcinoma arising from t...,"[papillary cancer of the thyroid, papillary ca...",[]


In [48]:
import mygene
import pickle

# MyGene.info client object used for the gene lookups in the following cells.
mg = mygene.MyGeneInfo()

In [50]:
# Using gene CD24 as an example: look the symbol up for human.
# The recorded result is a list holding one hit dict with the keys
# query, _id, _score, entrezgene, name, symbol and taxid.
cd_24_name = mg.querymany("CD24", scopes="symbol", species="human")

Input sequence provided is already in string format. No operation performed


In [65]:
cd_24_name

[{'query': 'CD24',
  '_id': '100133941',
  '_score': 19.220335,
  'entrezgene': '100133941',
  'name': 'CD24 molecule',
  'symbol': 'CD24',
  'taxid': 9606}]

In [51]:
# Map each returned gene symbol to the hit's ``_id``.
# NOTE(review): despite the variable name, the stored value is ``_id`` (equal
# to ``entrezgene`` in the recorded CD24 output), not the ``taxid``.
gene_name_to_tax_id = {
    hit["symbol"]: hit["_id"]
    for hit in cd_24_name
    # Require every key that is read, so a hit without "symbol" is skipped
    # instead of raising KeyError.
    if all(key in hit for key in ("_id", "query", "symbol"))
}

In [52]:
gene_name_to_tax_id

{'CD24': '100133941'}

In [55]:
# Load the gene vocabulary JSON; the recorded output shows a mapping from
# gene identifiers to integers (presumably token ids -- confirm).
with open("../resources/vocab.json", "rb") as handle:
    vocab_gene = json.load(handle)

# Iterating a dict yields its keys, so this is the list of gene identifiers.
vocab_gene_list = list(vocab_gene)

In [71]:
vocab_gene

{'RP5-973N23.5': 60693,
 'RP11-182N22.10': 60689,
 'CTB-53D8.3': 60687,
 'RP11-348N17.2': 60685,
 'RP11-205M20.8': 60682,
 'RP11-326C3.17': 60680,
 'RP11-439H13.3': 60675,
 'RP11-413H22.3': 60673,
 'GET1-SH3BGR': 60672,
 'CH17-476P10.1': 60670,
 'LLNLR-271E8.1': 60668,
 'CTD-2527I21.16': 60667,
 'CTC-359D24.5': 60664,
 'RP5-967N21.13': 60661,
 'CTC-264K15.5': 60659,
 'RP11-314O13.2': 60655,
 'RP11-127I20.9': 60652,
 'RP11-473M10.4': 60651,
 'OR7E11P_ENSG00000285537': 60644,
 'RP11-867O8.9': 60643,
 'TMX2-CTNND1': 60642,
 'RP11-732A19.10': 60641,
 'RP11-545E17.13': 60639,
 'RP11-1C8.8': 60638,
 'RP11-346I3.6': 60636,
 'GS1-273L24.6': 60635,
 'AL135749.6': 60634,
 'RP11-641J8.4': 60633,
 'RP11-478H11.3-001': 60631,
 'RP5-1111F22.1': 60629,
 'CTB-161K23.4': 60628,
 'RNF216P1_ENSG00000288620': 60625,
 'RP11-350J20.15': 60623,
 'XXbac-BPG254B15.11': 60622,
 'PDCD6-AHRR': 60618,
 'RP11-115L11.3': 60615,
 'RP11-553D4.3': 60614,
 'RP11-1020A11.3': 60613,
 'RP11-316O14.3': 60611,
 'AC012488.3':

In [56]:
# Empty mapping created here; it is not populated in the cells shown.
gene_name_to_summary_page = {}

In [62]:
gene_name_to_tax_id

{'CD24': '100133941'}

In [72]:
pkl_obj = "../resources/gene_name_id_dict_gc95M.pkl"
# SECURITY: unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
# The with-block closes the file handle, which the bare open() call leaked.
with open(pkl_obj, "rb") as pkl_file:
    obj = pickle.load(pkl_file)