In [1]:
import pprint
from collections import Counter

from nmdc_schema.get_nmdc_view import ViewGetter
from linkml_runtime.utils.schemaview import OrderedBy

import requests
import pandas as pd


In [2]:
# Root class of interest: slots whose range is this class or one of its
# descendants are collected by the schema scan further down.
target_class_name = "ControlledTermValue"

## Settings for fetching and saving data

In [3]:
# Endpoint prefix; the collection name is appended to it to form the request URL.
BASE_URL = "https://api.microbiomedata.org/nmdcschema"
# Collection whose documents are downloaded.
COLLECTION = "biosample_set"
# Sent to the API as the `max_page_size` query parameter on every request.
MAX_PAGE_SIZE = 10_000
# NOTE: PROJECTION is not hard-coded here; it is assigned from the
# schema-derived slot list in the execution cell.

# When True, log() prints progress messages.
VERBOSE = True

# Upper bound on the number of documents kept by fetch_all_documents().
MAX_DOCUMENTS = 999_999

# Path of the tab-separated file the flattened documents are written to.
output_file = "documents.tsv"

In [4]:
# Obtain the schema view used for every schema query in this notebook.
# NOTE(review): presumably a LinkML SchemaView of the NMDC schema bundled with
# the installed nmdc_schema package -- confirm against ViewGetter's docs.
vg = ViewGetter()
schema_view = vg.get_view()

In [5]:
# Names of the classes that descend from the target class. The scan output
# further down shows slots matched with the target class itself as their range,
# so the target class is part of this collection.
target_class_descendants = schema_view.class_descendants(target_class_name)
# print(target_class_descendants)

In [6]:
# Every class in the schema, keyed by class name, requested in lexical order.
all_classes = schema_view.all_classes(ordered_by=OrderedBy.LEXICAL)


In [7]:
# Accumulator filled by the schema scan:
# {class name: {slot name: range class name}} for slots whose range is the
# target class or one of its descendants.
ctv_slots = {}

In [8]:
# Scan every class's induced slots and record those whose range -- either the
# slot's own (slot_usage-adjusted) range or one of its `any_of` alternatives --
# is the target class or one of its descendants.
#
# Printed line format for direct matches: <class sigil><class>.<slot sigil><slot> -> <range>
#   class sigil: "+" if the class has an identifier slot, "-" otherwise
#   slot sigil:  "+" if the slot is required, "-" otherwise
for ck in all_classes:
    class_sigil = "+" if schema_view.get_identifier_slot(ck) else "-"
    class_induced_attributes = sorted(
        schema_view.class_induced_slots(ck), key=lambda instance: instance.name
    )
    for a in class_induced_attributes:
        a_range = a.range
        if not a_range:
            # Report the gap but keep going: the slot may still declare its
            # permissible ranges through `any_of`, which is checked below.
            print("No range for", a.name)
        a_sigil = "+" if a.required else "-"
        if a_range in target_class_descendants:
            print(f"{class_sigil}{ck}.{a_sigil}{a.name} -> {a_range}")
            ctv_slots.setdefault(ck, {})[a.name] = a_range
        for a_constraint in a.any_of or []:
            constraint_range = a_constraint.range
            if constraint_range and constraint_range in target_class_descendants:
                print(f"{ck}.{a.name} -> {constraint_range}")
                # Record the any_of member that actually matched, not the
                # slot's own range (which need not be a target descendant).
                ctv_slots.setdefault(ck, {})[a.name] = constraint_range
        # Sources of a slot's range considered here:
        #   - the slot's global range: deliberately skipped
        #   - the slot_usage range: should already be reflected in the induced slot
        #   - any_of ranges: checked explicitly above

+Biosample.-chem_administration -> ControlledTermValue
+Biosample.+env_broad_scale -> ControlledIdentifiedTermValue
+Biosample.+env_local_scale -> ControlledIdentifiedTermValue
+Biosample.+env_medium -> ControlledIdentifiedTermValue
+Biosample.-experimental_factor -> ControlledTermValue
+Biosample.-growth_facil -> ControlledTermValue
+Biosample.-host_body_product -> ControlledTermValue
+Biosample.-host_body_site -> ControlledTermValue
+Biosample.-host_phenotype -> ControlledTermValue
+Biosample.-host_taxid -> ControlledIdentifiedTermValue
+Biosample.-plant_growth_med -> ControlledTermValue
+Biosample.-plant_struc -> ControlledTermValue
+Biosample.-samp_mat_process -> ControlledTermValue
+Biosample.-samp_taxon_id -> ControlledIdentifiedTermValue
-FunctionalAnnotation.-feature_category -> ControlledIdentifiedTermValue
-GenomeFeature.-feature_category -> ControlledIdentifiedTermValue


In [9]:
# Pretty-print the collected class -> slot -> range mapping.
pprint.pprint(ctv_slots)

{'Biosample': {'chem_administration': 'ControlledTermValue',
               'env_broad_scale': 'ControlledIdentifiedTermValue',
               'env_local_scale': 'ControlledIdentifiedTermValue',
               'env_medium': 'ControlledIdentifiedTermValue',
               'experimental_factor': 'ControlledTermValue',
               'growth_facil': 'ControlledTermValue',
               'host_body_product': 'ControlledTermValue',
               'host_body_site': 'ControlledTermValue',
               'host_phenotype': 'ControlledTermValue',
               'host_taxid': 'ControlledIdentifiedTermValue',
               'plant_growth_med': 'ControlledTermValue',
               'plant_struc': 'ControlledTermValue',
               'samp_mat_process': 'ControlledTermValue',
               'samp_taxon_id': 'ControlledIdentifiedTermValue'},
 'FunctionalAnnotation': {'feature_category': 'ControlledIdentifiedTermValue'},
 'GenomeFeature': {'feature_category': 'ControlledIdentifiedTermValue'}}


In [10]:
# Alphabetically ordered names of the Biosample slots found by the schema scan;
# these become the projection sent to the API.
to_project = sorted(ctv_slots["Biosample"])

In [11]:
# Show the sorted slot names that will be requested as the projection.
to_project

['chem_administration',
 'env_broad_scale',
 'env_local_scale',
 'env_medium',
 'experimental_factor',
 'growth_facil',
 'host_body_product',
 'host_body_site',
 'host_phenotype',
 'host_taxid',
 'plant_growth_med',
 'plant_struc',
 'samp_mat_process',
 'samp_taxon_id']

## Functions for retrieving documents from MongoDB via the API

In [12]:
def log(message):
    """Print `message`, but only while the module-level VERBOSE flag is truthy."""
    if not VERBOSE:
        return
    print(message)


def fetch_all_documents(base_url, collection, max_page_size, projection, max_documents=None, timeout=120):
    """Download documents from `{base_url}/{collection}`, following page tokens.

    Each request sends `max_page_size` and a comma-joined `projection` as query
    parameters, plus `page_token` once the previous response supplied a
    `next_page_token`. Documents are read from the `resources` key of each JSON
    response.

    Args:
        base_url: URL prefix of the API.
        collection: Collection name appended to `base_url`.
        max_page_size: Value of the `max_page_size` query parameter.
        projection: Iterable of field names, joined with "," for the request.
        max_documents: Optional cap; once at least this many documents have been
            collected the list is truncated to the cap and fetching stops.
            A falsy value (None or 0) disables the cap.
        timeout: Seconds passed to `requests.get` as its `timeout`. Without a
            timeout `requests` waits indefinitely on an unresponsive server;
            pass None to restore that behaviour.

    Returns:
        List of the document dicts collected across all pages.

    Raises:
        requests.HTTPError: If a response has an error status code.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    all_documents = []
    next_page_token = None
    projection_param = ",".join(projection)
    total_fetched = 0

    while True:
        params = {
            "max_page_size": max_page_size,
            "projection": projection_param,
        }
        if next_page_token:
            params["page_token"] = next_page_token

        log(f"Requesting page with params: {params}")

        response = requests.get(
            f"{base_url}/{collection}",
            headers={"accept": "application/json"},
            params=params,
            timeout=timeout,
        )
        response.raise_for_status()  # Raise an exception for HTTP errors

        data = response.json()
        documents = data.get("resources", [])
        fetched_count = len(documents)
        total_fetched += fetched_count

        log(f"Fetched {fetched_count} documents, Total fetched: {total_fetched}")

        all_documents.extend(documents)

        if max_documents and len(all_documents) >= max_documents:
            # The last page may overshoot the cap, so trim the surplus.
            all_documents = all_documents[:max_documents]
            log(f"Reached max documents limit: {max_documents}")
            break

        next_page_token = data.get("next_page_token")
        if not next_page_token:
            log("No more pages available.")
            break

    log(f"Total documents fetched: {len(all_documents)}")
    return all_documents


def normalize_document(doc, projection):
    """Flatten one document into a single-level dict keyed by dotted names.

    The result always carries `id` (None when the document has none). For each
    field in `projection`:

    * missing from the document -> `field: None`
    * dict value -> one entry per sub-key: `field.sub_key` for plain values, or
      `field.sub_key.id` holding the nested dict's `id` (None if absent)
    * list value -> dict items are aggregated into lists under the same dotted
      keys (nested dicts without an `id` contribute nothing); non-dict items
      are kept as a list under `field` instead of being discarded
    * any other value -> `field: value`

    Args:
        doc: Document dict as returned by the API.
        projection: Iterable of top-level field names to extract.

    Returns:
        A flat dict suitable as one row of a DataFrame.
    """
    normalized = {"id": doc.get("id")}
    for field in projection:
        if field not in doc:
            normalized[field] = None
            continue

        value = doc[field]
        if isinstance(value, list):
            aggregated_sub_values = {}
            scalar_items = []
            for item in value:
                if not isinstance(item, dict):
                    # Plain list members have no sub-keys to expand; keep them
                    # so the field does not silently vanish from the output.
                    scalar_items.append(item)
                    continue
                for sub_key, sub_value in item.items():
                    if isinstance(sub_value, dict):
                        # The column is created even when this nested dict has
                        # no 'id', so it exists as soon as a nested dict is seen.
                        ids = aggregated_sub_values.setdefault(f"{field}.{sub_key}.id", [])
                        if 'id' in sub_value:
                            ids.append(sub_value['id'])
                    else:
                        aggregated_sub_values.setdefault(f"{field}.{sub_key}", []).append(sub_value)
            normalized.update(aggregated_sub_values)
            if scalar_items:
                normalized[field] = scalar_items
        elif isinstance(value, dict):
            for sub_key, sub_value in value.items():
                if isinstance(sub_value, dict):
                    normalized[f"{field}.{sub_key}.id"] = sub_value.get('id')
                else:
                    normalized[f"{field}.{sub_key}"] = sub_value
        else:
            normalized[field] = value
    return normalized


def documents_to_dataframe(documents, projection):
    """Flatten `documents` with normalize_document() and build a DataFrame.

    Columns are ordered with 'id' first and all remaining columns
    alphabetically.

    Args:
        documents: Iterable of document dicts.
        projection: Field names handed to normalize_document() for each document.

    Returns:
        A DataFrame with one row per document. For an empty `documents` input an
        empty frame with 'id' plus the sorted projection fields as columns is
        returned (previously this case raised ValueError because no 'id' column
        existed).
    """
    normalized_docs = [normalize_document(doc, projection) for doc in documents]
    if not normalized_docs:
        # No rows means pandas creates no columns at all, so build the header
        # explicitly from what the caller asked for.
        return pd.DataFrame(columns=['id'] + sorted(set(projection) - {'id'}))

    df = pd.DataFrame(normalized_docs)

    # Reorder columns: 'id' first, then all other columns alphabetically
    other_columns = sorted(col for col in df.columns if col != 'id')
    return df[['id'] + other_columns]


def write_dataframe_to_tsv(df, file_path):
    """Write `df` to `file_path` as tab-separated text without the index, then log the path."""
    df.to_csv(path_or_buf=file_path, sep="\t", index=False)
    log(f"Saved DataFrame to {file_path}")


def tabulate_prefixes(df):
    """Count identifier prefixes in every column whose name ends with '.id'.

    A cell may hold a single value or a list of values. For each string value
    containing ':' the text before the first ':' is taken as its prefix; all
    other values (None, NaN, strings without ':') are ignored.

    The input DataFrame is not modified.

    Args:
        df: DataFrame with zero or more columns named '*.id'.

    Returns:
        A DataFrame indexed by column name ('Column') with one column per
        prefix ('Prefix') holding the counts; missing combinations are 0.
        When no prefixes are found an empty DataFrame with those axis names is
        returned instead of raising.
    """
    term_id_columns = [col for col in df.columns if col.endswith('.id')]
    prefix_counts = []

    for col in term_id_columns:
        # Wrap scalars in a list on a derived Series so explode() treats every
        # cell uniformly while the caller's column is left untouched.
        as_lists = df[col].apply(lambda x: x if isinstance(x, list) else [x])
        prefixes = as_lists.explode().apply(
            lambda x: x.split(':')[0] if isinstance(x, str) and ':' in x else None
        ).dropna()
        for prefix, count in Counter(prefixes).items():
            prefix_counts.append({"Column": col, "Prefix": prefix, "Count": count})

    if not prefix_counts:
        # pivot() would fail with KeyError on a frame that has no columns.
        return pd.DataFrame(
            index=pd.Index([], name='Column'),
            columns=pd.Index([], name='Prefix'),
        )

    prefix_df = pd.DataFrame(prefix_counts)
    pivot_df = prefix_df.pivot(index='Column', columns='Prefix', values='Count').fillna(0)
    return pivot_df


## Execute the retrieval and save the results

In [13]:
# Request only the Biosample slots discovered by the schema scan.
PROJECTION = to_project

# Fetch all documents (page by page, capped at MAX_DOCUMENTS)
documents = fetch_all_documents(BASE_URL, COLLECTION, MAX_PAGE_SIZE, PROJECTION, max_documents=MAX_DOCUMENTS)

# Convert documents to DataFrame (one flattened row per document, 'id' first)
df = documents_to_dataframe(documents, PROJECTION)

# Save DataFrame to TSV
write_dataframe_to_tsv(df, output_file)


Requesting page with params: {'max_page_size': 10000, 'projection': 'chem_administration,env_broad_scale,env_local_scale,env_medium,experimental_factor,growth_facil,host_body_product,host_body_site,host_phenotype,host_taxid,plant_growth_med,plant_struc,samp_mat_process,samp_taxon_id'}
Fetched 8182 documents, Total fetched: 8182
No more pages available.
Total documents fetched: 8182
Saved DataFrame to documents.tsv


In [14]:
# Display the flattened biosample DataFrame.
df

Unnamed: 0,id,chem_administration,chem_administration.has_raw_value,chem_administration.term.id,env_broad_scale.has_raw_value,env_broad_scale.term.id,env_local_scale.has_raw_value,env_local_scale.term.id,env_medium.has_raw_value,env_medium.term.id,...,host_phenotype,host_taxid,host_taxid.has_raw_value,host_taxid.term.id,plant_growth_med,plant_struc,samp_mat_process,samp_taxon_id,samp_taxon_id.has_raw_value,samp_taxon_id.term.id
0,nmdc:bsm-13-amrnys72,,,,ENVO:01000253,ENVO:01000253,ENVO:01000621,ENVO:01000621,ENVO:01000017,ENVO:01000017,...,,,,,,,,,,
1,nmdc:bsm-13-4jtymw61,,,,ENVO:01000253,ENVO:01000253,ENVO:01000621,ENVO:01000621,ENVO:01000017,ENVO:01000017,...,,,,,,,,,,
2,nmdc:bsm-13-99ey0251,,,,ENVO:01000253,ENVO:01000253,ENVO:01000621,ENVO:01000621,ENVO:01000017,ENVO:01000017,...,,,,,,,,,,
3,nmdc:bsm-13-efabad96,,,,ENVO:01000253,ENVO:01000253,ENVO:01000621,ENVO:01000621,ENVO:01000017,ENVO:01000017,...,,,,,,,,,,
4,nmdc:bsm-13-w2cwcx50,,,,ENVO:01000253,ENVO:01000253,ENVO:01000621,ENVO:01000621,ENVO:01000017,ENVO:01000017,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8177,nmdc:bsm-12-g4j6b812,,,,ENVO_00000446,ENVO:00000446,ENVO_00000044,ENVO:00000044,ENVO_00005774,ENVO:00005774,...,,,,,,,,,,NCBITaxon:1325974
8178,nmdc:bsm-12-zm5skn77,,,,ENVO_00000446,ENVO:00000446,ENVO_00000044,ENVO:00000044,ENVO_00005774,ENVO:00005774,...,,,,,,,,,,NCBITaxon:1325974
8179,nmdc:bsm-12-kprgwc64,,,,ENVO_00000446,ENVO:00000446,ENVO_00000044,ENVO:00000044,ENVO_00005774,ENVO:00005774,...,,,,,,,,,,NCBITaxon:1325974
8180,nmdc:bsm-12-012qca33,,,,ENVO_00000446,ENVO:00000446,ENVO_00000044,ENVO:00000044,ENVO_00005774,ENVO:00005774,...,,,,,,,,,,NCBITaxon:1325974


In [15]:
# Count, for each '.id' column, how often each prefix (the text before ':')
# occurs among the term identifiers.
prefix_df = tabulate_prefixes(df)

In [16]:
# Display the column-by-prefix count table.
prefix_df

Prefix,CHEBI,ENVO,NCBITaxon,PO,UBERON
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chem_administration.term.id,90.0,0.0,0.0,0.0,0.0
env_broad_scale.term.id,0.0,8182.0,0.0,0.0,0.0
env_local_scale.term.id,0.0,8016.0,0.0,10.0,156.0
env_medium.term.id,0.0,8013.0,0.0,108.0,61.0
growth_facil.term.id,0.0,136.0,0.0,0.0,0.0
host_body_site.term.id,0.0,0.0,0.0,0.0,61.0
host_taxid.term.id,0.0,0.0,401.0,0.0,0.0
samp_taxon_id.term.id,0.0,0.0,1467.0,0.0,0.0
