In [1]:
import json
import re
import sys

import pandas as pd
import requests
from linkml_runtime import SchemaView
from oaklib import get_adapter
from pymongo import MongoClient
from tqdm.notebook import tqdm  # Import tqdm.notebook for Jupyter Notebook progress bars

In [2]:
sys.path.append('../..')  # Assuming notebooks/ is at the root level
import core # appears as if there's a problem in PyCharm Jupyter, but it still works


In [3]:
# todo there is probably some duplicative code in here
#   either similar functions for extracting stuff from NCBI Biosamples
#     which maybe should go in the general flatterer
#   or functionality that is already solved in the DuckDB dumper
#     see also external_metadata_awareness/biosamples_mongodb_to_duckdb.py
#   or the for blocks that find canonical labels and detect obsolescence
#     which is not being applied to NCBI records yet
#       we have addressed that in multiple other places :-( with OAK annotation
#         which requires followup filtering/curation

In [4]:
# todo there are some confusing variables in here like emp500_ncbi_biosamples and ncbi_emp500_biosamples

In [5]:
NMDC_RUNTIME_API_BASE_URL = "https://api.microbiomedata.org/nmdcschema/"

In [6]:
nmdc_schema_url = "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/nmdc_schema/nmdc_materialized_patterns.yaml"

In [7]:
ontology_list = ["envo", "pato", "uberon"]


In [8]:
# there are other slots ontolgy terms too

to_label_check = [
    'env_broad_scale.id', 'env_local_scale.id', 'env_medium.id',
    'env_broad_scale.term.id', 'env_local_scale.term.id', 'env_medium.term.id',
    'envoBroadScale.id', 'envoLocalScale.id', 'envoLocalScale.id',
]

# the flattened NCMDC biosamples LOD has fields like "env_broad_scale.term.id"
# the flattened NCMDC biosamples LOD has fields like "env_broad_scale.id"

add checking of other ontologies
build an obsolete term cache

In [9]:
# # see also
# #   notebooks/environmental_context_value_sets/generate_voting_sheet.ipynb
#
# with open(envo_label_cache_json, "r", encoding="utf-8") as file:
#     envo_label_cache = json.load(file)

In [10]:
# Connect to the local MongoDB instance (default connection)
client = MongoClient('mongodb://localhost:27017/')  # Connect to your local MongoDB

In [11]:
submissions_db = client['misc_metadata']

In [12]:
submissions_collection = submissions_db['nmdc_submissions']

In [13]:
# GOLD_METADATA = "gold_metadata_studies_with_samples" # another MongoDB collection to check
GOLD_METADATA = "gold_metadata" # following mongo-ncbi-loadbalancer's name which doesn't really indicate that the collection is a subset
NCBI_METADATA = "ncbi_metadata"
# todo add build notes here

In [14]:

aid_tsv = "alternative_identifiers_cheatsheet.tsv"

biosample_chem_admin_tsv="flattened_biosample_chem_administration.tsv"
biosample_chem_admin_json = "flattened_biosample_chem_administration.json"

flattened_gold_emp500_biosamples_tsv = "flattened_gold_emp500_biosamples.tsv"
flattened_ncbi_emp500_biosamples_with_attributes_tsv= "flattened_ncbi_emp500_biosamples.tsv"
gold_emp500_biosample_contacts_tsv = "gold_emp500_biosample_contacts.tsv"
ncbi_emp500_all_attributes_tsv = "ncbi_emp500_all_attributes.tsv"
ncbi_emp500_biosample_ids_tsv = "ncbi_emp500_biosample_ids.tsv"
ncbi_emp500_biosample_models_tsv = "ncbi_emp500_biosample_models.tsv"

nmdc_submissions_biosamples_tsv = "nmdc_submissions_biosamples.tsv"

scalar_biosamples_tsv = "flattened_biosample.tsv"
scalar_biosamples_json = "flattened_biosample.json"

scalar_studies_tsv = "flattened_study.tsv"
scalar_studies_json = "flattened_study.json"

study_credit_associations_tsv = "flattened_study_has_credit_associations.tsv"
study_credit_associations_json = "flattened_study_has_credit_associations.json"

study_dois_tsv = "flattened_study_associated_dois.tsv"
study_dois_json = "flattened_study_associated_dois.json"

In [15]:
# this code now globally processes NMDC MongDB contents and records from the submissions API

# EMP500 would be a good scope for checking GOLD or NCBI records

# see also notebooks/studies_exploration/emp_500_ng/emp500_ng.ipynb
#   for illustration of how study ids were determined from text

# will probably re-use some code from (see also)
#   notebooks/mixs-slot-ranking/build_mixs_slot_rank_template.ipynb

In [16]:
emp500_ncbi_project_accession = "PRJEB42019"
emp500_gold_study_id = "Gs0154244"

In [17]:
def parse_label_curie(text):
    """
    Parses a string with an optional leading underscore, followed by a label and a CURIE inside square brackets.

    Example input: "________mediterranean savanna biome [ENVO:01000229]"

    :param text: The input string to parse
    :return: A dictionary {'label': <label>, 'curie': <curie>} if successful, else None
    """
    pattern = r"^_*(?P<label>[^\[\]]+)\s*\[(?P<curie>[^\[\]]+)\]$"
    match = re.match(pattern, text.strip())

    if match:
        return {
            "label": match.group("label").strip(),
            "curie": match.group("curie").strip()
        }

    return None



In [18]:
def generate_label_cache(entities, adapter):
    """
    Generates a label cache mapping CURIEs to their labels.

    :param entities: List of ontology entities (CURIEs)
    :param adapter: Ontology adapter to fetch labels
    :return: Dictionary mapping CURIEs to labels
    """
    label_cache = {}

    for curie in entities:
        label = adapter.label(curie)  # Fetch label for CURIE
        if label:  # Only store if a valid label exists
            label_cache[curie] = label

    return label_cache


In [19]:
def build_ontology_adapters(ontology_names):
    """
    Creates ontology adapters for the given ontology names.

    :param ontology_names: List of ontology names (e.g., ["envo", "pato", "uberon"])
    :return: Dictionary mapping ontology names to ontology adapters
    """
    return {ontology: get_adapter(f"sqlite:obo:{ontology}") for ontology in ontology_names}


In [20]:
def load_ontology_labels(ontology_adapters):
    """
    Loads ontology entity labels from multiple ontologies and aggregates their label caches.

    :param ontology_adapters: Dictionary mapping ontology names to ontology adapters
    :return: Aggregated label cache dictionary
    """
    aggregated_label_cache = {}

    for ontology, adapter in ontology_adapters.items():
        entities = sorted(list(adapter.entities()))  # Fetch and sort entities
        label_cache = generate_label_cache(entities, adapter)
        aggregated_label_cache.update(label_cache)  # Merge into a single cache

    return aggregated_label_cache

In [21]:
def find_obsolete_terms(ontology_adapters):
    """
    Identifies obsolete terms from multiple ontologies using their adapters.

    :param ontology_adapters: Dictionary mapping ontology names to ontology adapters
    :return: List of CURIEs for obsolete terms
    """
    obsolete_curies = []

    for adapter in ontology_adapters.values():
        obsolete_curies.extend(adapter.obsoletes())  # Use ontology access kit function

    return obsolete_curies

In [22]:
def stringify(obj):
    """
    Converts a list or dictionary into a string representation.
    - Uses JSON format with compact or pretty formatting based on the depth.
    - Sorts keys in dictionaries for consistency.

    :param obj: Any object (list, dict, or other Python object)
    :return: String representation
    """
    if isinstance(obj, (dict, list)):
        return json.dumps(obj, sort_keys=True, ensure_ascii=False)  # Compact format
    return str(obj)  # Fallback for other types

In [23]:
def stringify_singleton_dict_list(dict_list):
    """
    Processes a list of dictionaries:
    - Removes the 'type' key from each dictionary.
    - Tracks the largest dictionary by key count (excluding 'type').
    - If the largest dictionary has only 1 key, extracts values, sorts them, and returns a pipe-concatenated string.
    - Otherwise, returns an empty string.

    :param dict_list: List of dictionaries to process
    :return: Pipe-concatenated sorted values if all dicts have at most one key (excluding 'type'), else an empty string.
    """
    if not isinstance(dict_list, list) or not all(isinstance(d, dict) for d in dict_list):
        # raise ValueError("Input must be a list of dictionaries")
        return ""

    largest_key_count = 0
    processed_values = []

    for d in dict_list:
        cleaned_dict = {k: v for k, v in d.items() if k != "type"}  # Remove 'type'
        largest_key_count = max(largest_key_count, len(cleaned_dict))
        if len(cleaned_dict) == 1:  # If it only has one key, extract its value
            processed_values.append(next(iter(cleaned_dict.values())))

    # If the largest dictionary has only one key, return sorted pipe-concatenated string
    if largest_key_count == 1:
        return "|".join(map(str, sorted(processed_values)))  # Sort values before joining

    return ""  # Return empty string if dicts contain more than one unique key


In [24]:
def fetch_all_documents(collection_name, page_size=10):
    """
    Fetch all documents from a specified NMDC collection using paging.

    :param collection_name: Name of the NMDC collection (e.g., "study_set", "biosample_set")
    :param page_size: Number of documents per page (default: 99)
    :return: List of documents from the collection
    """
    documents = []
    next_page_token = None
    url = f"{NMDC_RUNTIME_API_BASE_URL}{collection_name}"

    while True:
        params = {"max_page_size": page_size}
        if next_page_token:
            print(f"{next_page_token = }")
            params["page_token"] = next_page_token

        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an error for bad responses
        data = response.json()

        documents.extend(data.get("resources", []))  # Using "resources" as the key for results
        next_page_token = data.get("next_page_token")

        if not next_page_token:  # Stop if there's no more data to fetch
            break

    return documents

In [25]:
def flatten(documents, ctv_slots=None, known_skips=None):
    """
    Extracts scalar fields from documents, concatenating lists of scalars with a pipe ('|') separator.
    If a field contains a dictionary with only scalar values, it is flattened using 'outer_key.inner_key' notation.
    Lists of scalars within dictionaries are also pipe-concatenated.
    Skips 'type' fields.

    Special handling for ControlledIdentifiedTerm range slots like 'env_broad_scale', 'env_local_scale', and 'env_medium' etc.:
      - Extracts 'has_raw_value', 'term.id', and 'term.name' if available.

    Prints the field name if it encounters a complex structure (nested dicts, lists of dicts, etc.).

    :param documents: List of documents (e.g., studies, biosamples)
    :param ctv_slots: Set of slots whose range is a ControlledTermValue or descendant
    :param known_skips: Set of slots that can't be stringified nicely
    :return: List of dictionaries containing only scalar fields, with lists of scalars pipe-concatenated
    """

    if ctv_slots is None:
        ctv_slots = set()

    if known_skips is None:
        known_skips = set()

    scalar_docs = []
    problem_slots = set()

    for doc in documents:
        scalar_doc = {}
        stringified_singletons = ""  # Ensure fresh value per row

        for key, value in doc.items():
            if key == "type":
                continue  # Skip 'type' fields

            if key in known_skips:
                continue

            if isinstance(value, (str, int, float, bool, type(None))):
                scalar_doc[key] = value  # Keep scalars as-is

            elif isinstance(value, list) and all(
                    isinstance(item, (str, int, float, bool, type(None))) for item in value):
                scalar_doc[key] = "|".join(map(str, value))  # Join list elements with '|'

            elif isinstance(value, dict):
                if key in ctv_slots:
                    # Extract values from env_* fields
                    if "has_raw_value" in value:
                        scalar_doc[f"{key}.has_raw_value"] = value["has_raw_value"]

                    if "term" in value and isinstance(value["term"], dict):
                        if "id" in value["term"]:
                            scalar_doc[f"{key}.term.id"] = value["term"]["id"]
                        if "name" in value["term"]:
                            scalar_doc[f"{key}.term.name"] = value["term"]["name"]

                elif all(isinstance(v, (str, int, float, bool, type(None))) or
                         (isinstance(v, list) and all(
                             isinstance(item, (str, int, float, bool, type(None))) for item in v))
                         for v in value.values()):
                    # Flatten scalar-only dicts and pipe-join any scalar lists
                    for sub_key, sub_value in value.items():
                        if sub_key == "type":
                            continue  # Skip 'type' fields inside dicts
                        if isinstance(sub_value, list):
                            scalar_doc[f"{key}.{sub_key}"] = "|".join(map(str, sub_value))
                        else:
                            scalar_doc[f"{key}.{sub_key}"] = sub_value

                else:
                    # print(f"Skipping structured field: {key}")  # Print field name if dict has nested structures
                    problem_slots.add(key)
                    stringified_singletons = stringify_singleton_dict_list(value)
                    if stringified_singletons != "":
                        scalar_doc[f"{key}"] = stringified_singletons
                    else:
                        if key not in known_skips:
                            scalar_doc[f"{key}"] = stringify(value)
            else:
                # print(f"Skipping structured field: {key}")  # Print field name for unhandled complex structures
                problem_slots.add(key)
                stringified_singletons = stringify_singleton_dict_list(value)
                if stringified_singletons != "":
                    scalar_doc[f"{key}"] = stringified_singletons
                else:
                    if key not in known_skips:
                        scalar_doc[f"{key}"] = stringify(value)

        scalar_docs.append(scalar_doc)

    stringifieds = sorted(problem_slots - known_skips)  # Ensure it's sorted before printing

    print(f"stringified: {stringifieds}")

    return scalar_docs


In [26]:
def extract_associated_dois(studies):
    """
    Extracts associated_dois from a list of studies, adding the study's ID to each DOI entry.

    :param studies: List of study documents
    :return: List of dictionaries, each representing a DOI with the study ID added
    """
    doi_entries = []

    for study in studies:
        study_id = study.get("id")  # Get study ID
        associated_dois = study.get("associated_dois", [])

        for doi in associated_dois:
            if isinstance(doi, dict):  # Ensure it's a dictionary
                doi_entry = doi.copy()  # Make a copy to avoid modifying original data
                doi_entry["study_id"] = study_id  # Add study ID
                doi_entry.pop("type", None)  # Remove 'type' field if it exists
                doi_entries.append(doi_entry)

    return doi_entries


In [27]:
def extract_credit_associations(studies):
    """
    Extracts credit associations from a list of studies, flattening PersonValue fields and pipe-concatenating applied roles.

    :param studies: List of study documents
    :return: List of dictionaries, each representing a CreditAssociation with the study ID added
    """
    credit_entries = []

    for study in studies:
        study_id = study.get("id")  # Get study ID
        credit_associations = study.get("has_credit_associations", [])

        for credit in credit_associations:
            if isinstance(credit, dict):  # Ensure it's a dictionary
                credit_entry = {"study_id": study_id}  # Start with the study ID

                # Flatten applied_roles (if multivalued)
                applied_roles = credit.get("applied_roles", [])
                if isinstance(applied_roles, list):
                    applied_roles.sort()
                    credit_entry["applied_roles"] = "|".join(map(str, applied_roles))

                # Flatten applies_to_person (PersonValue structure)
                person = credit.get("applies_to_person", {})
                if isinstance(person, dict):
                    for key in ["name", "orcid", "profile_image_url", "has_raw_value"]:
                        if key in person:
                            credit_entry[f"person.{key}"] = person[key]

                    # Pipe-concatenate websites if it's a list
                    if isinstance(person.get("websites"), list):
                        credit_entry["person.websites"] = "|".join(person["websites"])

                credit_entries.append(credit_entry)

    return credit_entries


In [28]:
def extract_chem_administration(biosamples):
    """
    Extracts chem_administration field from biosample documents,
    creating a separate table with biosample ID, chemical name, chemical ID,
    raw value, and timestamp.

    :param biosamples: List of biosample documents
    :return: List of dictionaries, each representing a chemical administration entry
    """
    chem_entries = []
    pattern = re.compile(r"^(.*?) \[([^\]]+)\];([\d\-T:]+)$")  # Regex for label, CURIE, timestamp

    for sample in biosamples:
        biosample_id = sample.get("id")  # Get biosample ID
        chem_administration = sample.get("chem_administration", [])

        for chem in chem_administration:
            if isinstance(chem, dict):
                entry = {"biosample_id": biosample_id}

                # Extract has_raw_value and parse it
                raw_value = chem.get("has_raw_value", "")
                entry["has_raw_value"] = raw_value

                match = pattern.match(raw_value)
                if match:
                    entry["extracted_label"] = match.group(1)
                    entry["extracted_curie"] = match.group(2)
                    entry["extracted_timestamp"] = match.group(3)
                else:
                    entry["extracted_label"] = ""
                    entry["extracted_curie"] = ""
                    entry["extracted_timestamp"] = ""

                # Extract term details if present
                term = chem.get("term", {})
                if isinstance(term, dict):
                    entry["term_id"] = term.get("id", "")
                    entry["term_name"] = term.get("name", "")

                chem_entries.append(entry)

    return chem_entries


In [29]:
def extract_gold_contacts(records, idcol):
    """
    Extracts contacts from biosample documents, creating a structured table with
    biosample ID, contact name, email, jgiSsoId, and roles.

    :param biosamples: List of biosample documents
    :return: List of dictionaries, each representing a contact with the biosample ID
    """
    contact_entries = []

    for sample in records:
        record_id = sample.get(idcol)  # Get biosample ID
        contacts = sample.get("contacts", [])

        for contact in contacts:
            if isinstance(contact, dict):
                entry = {
                    "id": record_id,
                    "name": contact.get("name", ""),
                    "email": contact.get("email", ""),
                    "jgiSsoId": contact.get("jgiSsoId", ""),
                    "roles": "|".join(sorted(contact.get("roles", [])))  # Pipe-concatenate sorted roles
                }
                contact_entries.append(entry)

    return contact_entries


In [30]:
def extract_harmonized_attributes(biosamples):
    """
    Extracts attributes from biosample documents, keeping only those with a 'harmonized_name' key.
    The result is a structured table where each biosample is represented by its accession and
    selected attributes mapped by harmonized_name.

    :param biosamples: List of biosample documents
    :return: List of dictionaries, each representing a biosample with selected attributes
    """
    extracted_entries = []

    for sample in biosamples:
        biosample_entry = {"accession": sample.get("accession", "")}  # Retain accession

        # Extract attributes list
        attributes_list = sample.get("Attributes", {}).get("Attribute", [])  # Use {} to avoid KeyError

        for attribute in attributes_list:
            if isinstance(attribute, dict):  # Ensure attribute is a dictionary
                harmonized_name = attribute.get("harmonized_name")
                content = attribute.get("content", "")

                if harmonized_name:  # Retain only attributes with harmonized_name
                    biosample_entry[harmonized_name] = content

        extracted_entries.append(biosample_entry)

    return extracted_entries


In [31]:
def extract_all_attributes(biosamples):
    """
    Extracts all attributes from biosample documents and appends the biosample's 'accession' field.
    Each attribute is stored as a separate dictionary entry.

    :param biosamples: List of biosample dictionaries
    :return: List of dictionaries, each representing an attribute with its associated biosample accession
    """
    extracted_entries = []

    for sample in biosamples:
        accession = sample.get("accession", "")  # Retain accession

        # Extract attributes list
        attributes_list = sample.get("Attributes", {}).get("Attribute", [])  # Use {} to avoid KeyError

        for attribute in attributes_list:
            if isinstance(attribute, dict):  # Ensure attribute is a dictionary
                # Copy attribute and add accession field
                attribute_entry = attribute.copy()
                attribute_entry["accession"] = accession
                extracted_entries.append(attribute_entry)

    return extracted_entries


In [32]:
def extract_biosample_ids(biosamples):
    """
    Extracts IDs from biosample documents and appends the biosample's 'accession' field.
    Each ID entry is stored as a separate dictionary.

    :param biosamples: List of biosample dictionaries
    :return: List of dictionaries, each representing an ID with its associated biosample accession
    """
    extracted_entries = []

    for sample in biosamples:
        accession = sample.get("accession", "")  # Retain accession

        # Extract Ids list (assuming it's a dictionary stored as a JSON-like structure)
        ids_data = sample.get("Ids", {})

        # Ensure it's a dictionary and extract the list of Ids
        ids_list = ids_data.get("Id", [])

        if not isinstance(ids_list, list):
            continue  # Skip if Ids is not a list (avoids errors)

        for id_entry in ids_list:
            if isinstance(id_entry, dict):  # Ensure ID entry is a dictionary
                # Copy ID entry and add accession field
                id_entry_copy = id_entry.copy()
                id_entry_copy["accession"] = accession
                extracted_entries.append(id_entry_copy)

    return extracted_entries


In [33]:
def extract_biosample_models_content(biosamples):
    """
    Extracts X from biosample documents and appends the biosample's 'accession' field.
    Each X is stored as a separate dictionary.

    :param biosamples: List of biosample dictionaries
    :return: List of dictionaries, each representing an X with its associated biosample accession
    """

    # todo ignores the moderate possibility that non-EMP500 studies might user other model sub-fields like version
    extracted_entries = []

    for sample in biosamples:
        accession = sample.get("accession", "")  # Retain accession

        # Extract Ids list (assuming it's a dictionary stored as a JSON-like structure)
        x_data = sample.get("Models", {})

        # is this the kind of thing that could be a dict or a list?
        x_inner = x_data.get("Model", [])

        if isinstance(x_inner, dict):
            x_inner["accession"] = accession
            extracted_entries.append(x_inner)

        if isinstance(x_inner, dict):
            for x_entry in x_inner:
                if isinstance(x_entry, dict):  # Ensure ID entry is a dictionary
                    # Copy ID entry and add accession field
                    x_entry_copy = x_entry.copy()
                    x_entry_copy["accession"] = accession
                    extracted_entries.append(x_entry_copy)

    return extracted_entries


In [34]:
def extract_biosample_owner_name_content(biosamples):
    """
    Extracts X from biosample documents and appends the biosample's 'accession' field.
    Each X is stored as a separate dictionary.

    :param biosamples: List of biosample dictionaries
    :return: List of dictionaries, each representing an X with its associated biosample accession
    """

    # todo ignores the strong possibility that non-EMP500 studies might some of the many other owner sub-fields

    extracted_entries = []

    for sample in biosamples:
        accession = sample.get("accession", "")  # Retain accession

        owner = sample.get("Owner", [])
        owner_name = owner.get("Name", [])
        owner_name['accession'] = accession
        extracted_entries.append(owner_name)

    return extracted_entries


In [35]:
def flatten_ncbi_emp500_biosample_descriptions(biosamples):
    """
    Flattens biosample descriptions into a structured format, extracting:
    - Accession (from Title)
    - Taxonomy ID and Name (from Organism)
    - Organism Name (from Organism.OrganismName.content)
    - Description (from Comment.Paragraph.content)

    :param biosamples: List of biosample dictionaries
    :return: List of flattened biosample records
    """
    flattened_entries = []

    for sample in biosamples:
        flattened_entry = {}

        accession = sample.get("accession", "")

        biosample_desc = sample.get("Description", {})

        # Extract accession from Title
        flattened_entry["accession"] = accession

        # Extract taxonomy information from Organism
        organism = biosample_desc.get("Organism", {})
        flattened_entry["taxonomy_id"] = organism.get("taxonomy_id", "")
        flattened_entry["taxonomy_name"] = organism.get("taxonomy_name", "")
        flattened_entry["organism_name"] = organism.get("OrganismName", {}).get("content", "")

        # Extract sample description from Comment
        comment = biosample_desc.get("Comment", {}).get("Paragraph", {})
        flattened_entry["description"] = comment.get("content", "")

        flattened_entries.append(flattened_entry)

    return flattened_entries


In [36]:
def reorder_columns(df, primary_columns, do_sort=True):
    """
    Reorders a DataFrame's columns so that specified columns appear first in the given order,
    followed by all other columns sorted alphabetically.

    :param df: pandas DataFrame
    :param primary_columns: List of columns to move to the first positions in order
    :return: DataFrame with reordered columns
    """
    if not isinstance(primary_columns, list):
        raise ValueError("primary_columns must be a list")

    missing_columns = [col for col in primary_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Columns {missing_columns} not found in DataFrame")

    remaining_columns = [col for col in df.columns if col not in primary_columns]  # Exclude primary columns
    if do_sort:
        remaining_columns = sorted(remaining_columns)  # Sort remaining columns alphabetically
    new_order = primary_columns + remaining_columns  # Place primary columns first in order
    return df[new_order]  # Reorder DataFrame


In [37]:
ever_seen = set() # all values found when checking to_label_check slots

In [38]:
ontology_adapters = build_ontology_adapters(ontology_list)

In [39]:
label_cache = load_ontology_labels(ontology_adapters)

In [40]:
obsolete_terms_list = find_obsolete_terms(ontology_adapters)

In [41]:
# determine what nmdc-schema slots on which classes might contain links to external biosamples, studies or article DOIs

# anything outside of https://microbiomedata.github.io/nmdc-schema/alternative_identifiers/ ->
#   https://microbiomedata.github.io/nmdc-schema/external_database_identifiers/ ?

In [42]:
nmdc_schema_view = SchemaView(nmdc_schema_url)

In [43]:
alternative_identifier_descendants = nmdc_schema_view.slot_descendants("alternative_identifiers")

In [44]:
alternative_identifier_descendants.sort()

In [45]:
aid_rows = []
for aid_name in alternative_identifier_descendants:
    aid = nmdc_schema_view.get_slot(aid_name)
    aid_mixins = aid['mixins']
    aid_is_a = aid.is_a
    aid_classes = nmdc_schema_view.get_classes_by_slot(aid)
    aid_class_descendant_names = set()
    for c in aid_classes:
        c_descendants = nmdc_schema_view.class_descendants(c)
        aid_class_descendant_names.update(c_descendants)
    aid_class_descendant_names = list(aid_class_descendant_names)
    aid_class_descendant_names.sort()
    aid_rows.append({
        'slot': aid_name,
        'parent': aid_is_a,
        'mixins': aid_mixins,
        'classes_using': aid_class_descendant_names,
    })

In [46]:
aid_frame = pd.DataFrame(aid_rows)

In [47]:
aid_frame.to_csv(aid_tsv, sep="\t", index=False)

----

In [48]:
nmdc_schema_usage_index = nmdc_schema_view.usage_index() # collections.defaultdict; Dict[ElementName, List[SchemaUsage]]

In [49]:
ctv_usage = nmdc_schema_usage_index['ControlledTermValue'] # list of linkml_runtime.utils.schemaview.SchemaUsage

# SchemaUsage(used_by='Biosample', slot='chem_administration', metaslot='range', used='ControlledTermValue', inferred=True)

In [50]:
citv_usage = nmdc_schema_usage_index['ControlledIdentifiedTermValue']

In [51]:
ctv_using_slots = set()
for i in ctv_usage:
    ctv_using_slots.add(i.slot)
print(len(ctv_using_slots))

9


In [52]:
for i in citv_usage:
    ctv_using_slots.add(i.slot)
print(len(ctv_using_slots))

15


In [53]:
ctv_using_slots

{'chem_administration',
 'env_broad_scale',
 'env_local_scale',
 'env_medium',
 'experimental_factor',
 'feature_category',
 'growth_facil',
 'host_body_product',
 'host_body_site',
 'host_phenotype',
 'host_taxid',
 'plant_growth_med',
 'plant_struc',
 'samp_mat_process',
 'samp_taxon_id'}

----

In [54]:
# Example usage:
studies = fetch_all_documents("study_set", page_size=20)  # Fetch with a custom page size

next_page_token = 'nmdc:sys0kt4x0h61'
next_page_token = 'nmdc:sys03ktzbm84'


In [55]:
print(f"Retrieved {len(studies)} studies")

Retrieved 44 studies


In [56]:
scalar_studies = flatten(
    studies,
    ctv_slots=ctv_using_slots,
    known_skips={'associated_dois', 'has_credit_associations'}
)

stringified: ['protocol_link', 'study_image']


stringified: {'study_image', 'protocol_link'}

In [57]:
scalar_study_df = pd.DataFrame(scalar_studies)

In [58]:
scalar_study_df = reorder_columns(scalar_study_df, ["id"])

In [59]:
scalar_study_df.to_csv(scalar_studies_tsv, sep="\t", index=False)

In [60]:
scalar_study_df.to_json(scalar_studies_json, orient="records", lines=True)

see also

notebooks/studies_exploration/streams_assesments_llm/streams_prep.ipynb

----

In [61]:
doi_list = extract_associated_dois(studies)

In [62]:
study_doi_frame = pd.DataFrame(doi_list)

In [63]:
study_doi_frame = reorder_columns(study_doi_frame, ["study_id"])

In [64]:
study_doi_frame.to_csv(study_dois_tsv, sep="\t", index=False)

In [65]:
study_doi_frame.to_json(study_dois_tsv, orient="records", lines=True)

----

In [66]:
credit_list = extract_credit_associations(studies)

In [67]:
credit_frame = pd.DataFrame(credit_list)

In [68]:
credit_frame = reorder_columns(credit_frame, ["study_id"])

In [69]:
credit_frame.to_csv(study_credit_associations_tsv, sep="\t", index=False)

In [70]:
credit_frame.to_json(study_credit_associations_json, orient="records", lines=True)

----

In [71]:
biosamples = fetch_all_documents("biosample_set", page_size=1000)  # Fetch with a custom page size


next_page_token = 'nmdc:sys0qgs2em55'
next_page_token = 'nmdc:sys04x4ge633'
next_page_token = 'nmdc:sys056w6eg41'
next_page_token = 'nmdc:sys0fcwtzp50'
next_page_token = 'nmdc:sys0qtd59m94'
next_page_token = 'nmdc:sys09jbgp374'
next_page_token = 'nmdc:sys0z5pkft84'
next_page_token = 'nmdc:sys0456mg139'
next_page_token = 'nmdc:sys0x9mh7965'
next_page_token = 'nmdc:sys0pry5t246'
next_page_token = 'nmdc:sys0fcwyy340'
next_page_token = 'nmdc:sys00jpea547'
next_page_token = 'nmdc:sys08eg5yd36'


In [72]:
print(f"Retrieved {len(biosamples)} biosamples")

Retrieved 13006 biosamples


In [73]:
scalar_biosamples = flatten(biosamples, ctv_slots=ctv_using_slots, known_skips={"chem_administration"})

stringified: ['agrochem_addition', 'fertilizer_regm', 'host_diet', 'misc_param', 'perturbation', 'watering_regm']


stringified: ['fertilizer_regm', 'host_diet', 'perturbation']

In [74]:
# Iterate over each sample dictionary
for sample in scalar_biosamples:
    for key in to_label_check:
        if key in sample:
            ever_seen.add(sample[key])
            if sample[key] in label_cache:  # Check if the value is a CURIE in the cache
                sample[f"{key}.canonical_label"] = label_cache[sample[key]]
            if sample[key] in obsolete_terms_list:
                sample[f"{key}.obsolete"] = True
            else:
                sample[f"{key}.obsolete"] = False

In [75]:
scalar_biosample_frame = pd.DataFrame(scalar_biosamples)

In [76]:
scalar_biosample_frame = reorder_columns(scalar_biosample_frame, ["id"])

In [77]:
scalar_biosample_frame.to_csv(scalar_biosamples_tsv, sep="\t", index=False)

In [78]:
scalar_biosample_frame.to_json(scalar_biosamples_json, orient="records", lines=True)

----

In [79]:
chem_admin_data = extract_chem_administration(biosamples)

In [80]:
chem_admin_df = pd.DataFrame(chem_admin_data)

In [81]:
chem_admin_df.to_csv(biosample_chem_admin_tsv, sep="\t", index=False)

In [82]:
chem_admin_df.to_json(biosample_chem_admin_json, orient="records", lines=True)

----

In [83]:
# get all production submission biosamples
# previously loaded by

In [84]:
# see also
# notebooks/studies_exploration/get_nmdc_submissions.ipynb

In [85]:
submission_biosamples = []

skip_templates = [
    'emsl_data',
    'host_associated_data',
    'jgi_mg_data',
    'jgi_mg_lr_data',
    'jgi_mt_data',
]

# Iterate through each document in the local MongoDB submissions collection
for doc in submissions_collection.find():
    # Ensure metadata_submission and sampleData exist
    print(doc['id'])
    if 'metadata_submission' in doc and 'sampleData' in doc['metadata_submission']:
        sample_data = doc['metadata_submission']['sampleData']

        # Iterate through the keys in sampleData (e.g., "soil_data", "water_data", etc.)
        for key, sample_list in sample_data.items():
            if key in skip_templates:
                continue
            if isinstance(sample_list, list):  # Ensure it's a list of dictionaries
                for sample in sample_list:
                    for k, v in list(sample.items()):  # Iterate over a list copy to avoid modification issues
                        if k in ctv_using_slots:
                            parsed_label_curie = parse_label_curie(v)
                            if parsed_label_curie:
                                sample[f"{k}.id"] = parsed_label_curie['curie']
                                sample[f"{k}.claimed_label"] = parsed_label_curie['label']

                    if isinstance(sample, dict):  # Ensure each sample is a dictionary
                        # Create a copy of the sample dictionary and add extra fields
                        sample_with_id = sample.copy()
                        sample_with_id['sampleData'] = key  # Store the category of sampleData
                        sample_with_id['submission_id'] = doc['id']  # Add the document ID
                        sample_with_id['created'] = doc['created']  # Add the document ID
                        sample_with_id['date_last_modified'] = doc['date_last_modified']  # Add the document ID
                        sample_with_id['status'] = doc['status']  # Add the document ID

                        submission_biosamples.append(sample_with_id)


ec4108bd-61c1-4c11-872c-b154c3f794cb
9728c7ab-59ce-454d-b4f2-ebd77e27a1e5
7ab5c37b-d429-4f31-8970-8e766336921b
e21e3caf-c927-48d0-bbb8-0279f52c3d50
52c53a87-def6-4e1f-9cd8-f3a3d0d4d82b
66c08834-cf09-4c9a-b9a7-6bd8287d4b0a
b7787731-ad52-4a9b-a6cd-f855ad9cd576
41d70a4a-355a-473a-a5f9-5636152b975a
238f8790-a48d-4ae6-aaf6-80f02b6ac6bb
e8d929f2-56fd-49fd-b148-5448b063783f
65010e2f-2788-4eca-b903-852333869e59
ee2141a6-2805-4cf3-81c1-3c81f9998bff
bd7ece1a-8848-4051-9570-1843b2978811
c995ec8f-a43d-46ed-a36f-2d84adc28dbc
063639d1-26c2-4a1f-9b13-e5ec72e5ff88
cd8c3d67-78c7-4375-aec5-163f0c70bbd1
435a61eb-de45-4e6e-a7e0-c34f5a20378a
532f0047-1a7d-4226-a6f6-b0278e969c13
26f70af2-67e3-4713-9717-98c526ba283b
e6d7e259-2e0d-4350-8ae5-cce9adecbe51
a93284d3-fe9d-442f-a6cf-85bdae8b786d
548710b8-e000-4fc7-b7e8-41a787add1d0
721089db-b07d-409e-b2c2-260ee617a144
2d5856be-42c6-449b-834c-c020b7f0f59d
c06c1c49-2e7e-48b5-a4b0-849c878be45f
10b189c9-941b-4dad-b752-0eb099ad77a2
462739ad-f899-4f0b-bdb8-a16a15031bc6
b

In [86]:
print(len(submission_biosamples))

16299


In [87]:
# Iterate over each sample dictionary
for sample in submission_biosamples:
    for key in to_label_check:
        if key in sample:
            ever_seen.add(sample[key])
            if sample[key] in label_cache:  # Check if the value is a CURIE in the cache
                sample[f"{key}.canonical_label"] = label_cache[sample[key]]
            if sample[key] in obsolete_terms_list:
                sample[f"{key}.obsolete"] = True
            else:
                sample[f"{key}.obsolete"] = False

In [88]:
submission_biosamples_frame = pd.DataFrame(submission_biosamples)

In [89]:
submission_biosamples_frame = reorder_columns(submission_biosamples_frame, [
    "submission_id",
    'created',
    "date_last_modified",
    "status",
    "sampleData",
])

In [90]:
submission_biosamples_frame.to_csv(nmdc_submissions_biosamples_tsv, sep="\t", index=False)

In [91]:
ever_seen.intersection(set(obsolete_terms_list))

set()

----

In [92]:
# try flattening gold EMP500 records
#   see also notebooks/studies_exploration/emp_500_ng/emp500_ng.ipynb

In [93]:
client = core.get_mongo_client() # there may be inconsistent MongoDB connection techniques in the notebook

In [94]:
# Search for GOLD studies
emp500_gold_study_cursor = core.fetch_mongodb_records_by_path(
    client,
    GOLD_METADATA,
    "studies",
    "studyGoldId",
    emp500_gold_study_id
)

In [95]:
emp500_gold_study_list = list(emp500_gold_study_cursor)


In [96]:
emp500_gold_biosamples_list = emp500_gold_study_list[0].get("biosamples", [])

In [97]:
emp500_gold_biosamples_cursor = core.fetch_mongodb_records_by_path_in(
    client,
    GOLD_METADATA,
    "biosamples",
    "biosampleGoldId",
    emp500_gold_biosamples_list
)


In [98]:
emp500_gold_biosamples = [
    {key: value for key, value in document.items() if key != '_id'}
    for document in emp500_gold_biosamples_cursor
]

In [99]:
print(len(emp500_gold_biosamples))

1024


In [100]:
flat_gold_emp500_biosamples = flatten(emp500_gold_biosamples, known_skips={'contacts'})

stringified: []


In [101]:
# Iterate over each sample dictionary
for sample in flat_gold_emp500_biosamples:
    for key in to_label_check:
        if key in sample:
            canonical_curie = sample[key].replace('_',':')
            sample[f"{key}.canonical_curie"] = canonical_curie
            ever_seen.add(canonical_curie)
            if canonical_curie in label_cache:  # Check if the value is a CURIE in the cache
                sample[f"{key}.canonical_label"] = label_cache[canonical_curie]
            if canonical_curie in obsolete_terms_list:
                sample[f"{key}.obsolete"] = True
            else:
                sample[f"{key}.obsolete"] = False

In [102]:
flat_gold_emp500_biosamples_frame = pd.DataFrame(flat_gold_emp500_biosamples)

In [103]:
flat_gold_emp500_biosamples_frame = reorder_columns(flat_gold_emp500_biosamples_frame, ['biosampleGoldId'])

In [104]:
flat_gold_emp500_biosamples_frame.to_csv(flattened_gold_emp500_biosamples_tsv, sep="\t", index=False)

In [105]:
gold_emp500_biosample_contacts = extract_gold_contacts(emp500_gold_biosamples, "biosampleGoldId")

In [106]:
gold_emp500_biosample_contacts_frame = pd.DataFrame(gold_emp500_biosample_contacts)

In [107]:
gold_emp500_biosample_contacts_frame.to_csv(gold_emp500_biosample_contacts_tsv, sep="\t", index=False)

----

In [108]:
# NCBI EMP500 Biosamples!

reconstitute sra_biosamples_bioprojects or otherwise work around

```sql
SELECT
  DISTINCT biosample AS biosample_accession,
  bioproject AS bioproject_accession
FROM
  `nih-sra-datastore.sra.metadata`
WHERE
  bioproject= 'PRJEB42019' ;
```

In [109]:
emp500_ncbi_project_accession

'PRJEB42019'

In [110]:
emp500_bioproj_biosample_links_cursor = core.fetch_mongodb_records_by_path(
    client,
    NCBI_METADATA,
    "sra_biosamples_bioprojects",
    "bioproject_accession",
    emp500_ncbi_project_accession
)

In [111]:
emp500_bioproj_biosample_links = [
    {key: value for key, value in document.items() if key != '_id'}
    for document in emp500_bioproj_biosample_links_cursor
]

In [112]:
emp500_biosample_accessions = [doc["biosample_accession"] for doc in emp500_bioproj_biosample_links]

In [113]:
emp500_ncbi_biosamples_cursor = core.fetch_mongodb_records_by_path_in(
    client,
    NCBI_METADATA,
    "biosamples",
    "accession",
    emp500_biosample_accessions
)

In [114]:
emp500_ncbi_biosamples = [
    {key: value for key, value in document.items() if key != '_id'}
    for document in emp500_ncbi_biosamples_cursor
]

In [115]:
print(len(emp500_ncbi_biosamples))

1024


In [116]:
flattened_ncbi_emp500_biosamples = flatten(emp500_ncbi_biosamples, known_skips={
    'Attributes',
    'Ids',
    'Models',
    "Owner",
    "Description"
})

stringified: []


stringified: ['Description']


In [117]:
emp500_ncbi_biosamples

[{'access': 'public',
  'publication_date': '2020-12-17T00:00:00.000',
  'last_update': '2021-08-23T00:26:32.000',
  'submission_date': '2020-12-18T16:06:46.873',
  'id': '17116592',
  'accession': 'SAMEA7723388',
  'Ids': {'Id': [{'content': 'SAMEA7723388',
     'db': 'BioSample',
     'is_primary': '1'},
    {'content': 'ERS5470766', 'db': 'SRA'}]},
  'Description': {'Title': {'content': '13114.angenent.65.s001'},
   'Organism': {'taxonomy_id': '1076179',
    'taxonomy_name': 'bioreactor metagenome',
    'OrganismName': {'content': 'bioreactor metagenome'}},
   'Comment': {'Paragraph': {'content': 'Corn Beer Fermentation Bioreactor Biomass'}}},
  'Owner': {'Name': {'content': 'EBI'}},
  'Models': {'Model': {'content': 'Generic'}},
  'Package': {'content': 'Generic.1.0', 'display_name': 'Generic'},
  'Attributes': {'Attribute': [{'content': '2020-12-17',
     'attribute_name': 'ENA first public'},
    {'content': '2020-12-15', 'attribute_name': 'ENA last update'},
    {'content': 'ERC

In [118]:
ncbi_emp500_biosample_attributes = extract_harmonized_attributes(emp500_ncbi_biosamples)

In [119]:
ncbi_emp500_biosample_attributes

[{'accession': 'SAMEA7723388',
  'elev': '123',
  'env_broad_scale': 'urban biome',
  'env_local_scale': 'anaerobic bioreactor',
  'env_package': 'host-associated',
  'env_medium': 'anaerobic sludge',
  'geo_loc_name': 'USA: State of New York,City of Ithaca',
  'host_subject_id': 'LTA.misc.104',
  'investigation_type': 'metagenome',
  'project_name': 'Corn Beer Fermentation Bioreactor',
  'sample_name': 'qiita_sid_13114:13114.angenent.65.s001',
  'sample_type': 'bioreactor sludge'},
 {'accession': 'SAMEA7723389',
  'elev': '123',
  'env_broad_scale': 'urban biome',
  'env_local_scale': 'anaerobic bioreactor',
  'env_package': 'host-associated',
  'env_medium': 'anaerobic sludge',
  'geo_loc_name': 'USA: State of New York,City of Ithaca',
  'host_subject_id': 'LTA.misc.519',
  'investigation_type': 'metagenome',
  'project_name': 'Corn Beer Fermentation Bioreactor',
  'sample_name': 'qiita_sid_13114:13114.angenent.65.s002',
  'sample_type': 'bioreactor sludge'},
 {'accession': 'SAMEA772

In [120]:
ncbi_emp500_biosample_attributes_frame = pd.DataFrame(ncbi_emp500_biosample_attributes)

In [121]:
flattened_ncbi_emp500_biosamples_frame = pd.DataFrame(flattened_ncbi_emp500_biosamples)

In [122]:
flattened_ncbi_emp500_biosamples_frame = reorder_columns(
    flattened_ncbi_emp500_biosamples_frame, ["accession"], do_sort=True)

In [123]:
biosample_models = extract_biosample_models_content(emp500_ncbi_biosamples)
biosample_models_frame = pd.DataFrame(biosample_models)
biosample_models_frame.rename(columns={"content": "model"}, inplace=True)

In [124]:
merged_1 = flattened_ncbi_emp500_biosamples_frame.merge(
    biosample_models_frame, on="accession")

In [125]:
owner_names = extract_biosample_owner_name_content(emp500_ncbi_biosamples)
owner_names_frame = pd.DataFrame(owner_names)
owner_names_frame.rename(columns={"content": "owner_name"}, inplace=True)

In [126]:
merged_2 = flattened_ncbi_emp500_biosamples_frame.merge(
    owner_names_frame, on="accession")

In [127]:
flattened_ncbi_emp500_biosample_descriptions = flatten_ncbi_emp500_biosample_descriptions(emp500_ncbi_biosamples)
flattened_ncbi_emp500_biosample_descriptions_frame = pd.DataFrame(flattened_ncbi_emp500_biosample_descriptions)
flattened_ncbi_emp500_biosample_descriptions_frame = reorder_columns(flattened_ncbi_emp500_biosample_descriptions_frame, ["accession"])

In [128]:
merged_3 = merged_2.merge(
    flattened_ncbi_emp500_biosample_descriptions_frame, on="accession")

In [129]:
merged_4 = merged_3.merge(
    ncbi_emp500_biosample_attributes_frame, on="accession")
merged_4 = reorder_columns(
    merged_4, ["accession"], do_sort=False)


In [130]:
merged_4.to_csv(
    flattened_ncbi_emp500_biosamples_with_attributes_tsv, sep="\t", index=False)

In [131]:
ncbi_emp500_all_attributes_table = extract_all_attributes(emp500_ncbi_biosamples)
ncbi_emp500_all_attributes_frame = pd.DataFrame(ncbi_emp500_all_attributes_table)
ncbi_emp500_all_attributes_frame = reorder_columns(ncbi_emp500_all_attributes_frame,
                                                   [
                                                       'accession',
                                                       'attribute_name',
                                                       'harmonized_name',
                                                       'display_name',
                                                       'content'
                                                   ])
ncbi_emp500_all_attributes_frame.to_csv(ncbi_emp500_all_attributes_tsv, sep="\t", index=False)

In [132]:
ncbi_emp500_biosample_ids = extract_biosample_ids(emp500_ncbi_biosamples)
ncbi_emp500_biosample_ids_frame = pd.DataFrame(ncbi_emp500_biosample_ids)
ncbi_emp500_biosample_ids_frame = reorder_columns(ncbi_emp500_biosample_ids_frame, ['accession', 'db', 'is_primary', 'content'])
ncbi_emp500_biosample_ids_frame.to_csv(ncbi_emp500_biosample_ids_tsv, sep="\t", index=False)