In [1]:


import csv
import gzip
import os
import re
import shutil
import sqlite3
from typing import Dict, Any
from urllib.parse import urlparse

import duckdb
import pandas as pd
import requests
from linkml_runtime import SchemaView
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer  # from scikit-learn
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Approved prefixes (case-insensitive)
approved_prefixes = ['ENVO']

In [3]:
# make a biomes curie -> label dict?
BIOME_CURIE = 'ENVO:00000428'

In [4]:
ENV_BROAD_SCALE_ENUM = "EnvBroadScaleSoilEnum"

In [5]:
ncbi_query = """
SELECT content, COUNT(1) AS sample_count 
FROM attributes 
WHERE harmonized_name = 'env_broad_scale' AND package_content = 'MIMS.me.soil.6.0' 
GROUP BY content 
ORDER BY COUNT(1) DESC
"""

In [6]:
envo_adapter_string = "sqlite:obo:envo"

In [7]:
# goldterms_adapter_string = "sqlite:obo:envo"

In [8]:
goldterms_semsql_url = "https://s3.amazonaws.com/bbop-sqlite/goldterms.db.gz"

# https://s3.amazonaws.com/bbop-sqlite/
# <Contents>
# <Key>goldterms.db.gz</Key>
# <LastModified>2024-11-03T17:24:56.000Z</LastModified>
# <ETag>"fe8e35b215786cb9fc347b7fadbe055f"</ETag>
# <Size>2935781</Size>
# <StorageClass>STANDARD</StorageClass>
# </Contents>


In [9]:
# todo could this have been done with an OAK query, eliminating the need to explicitly download the file?

goldterms_envo_query = """
SELECT
	*
FROM
	statements s
WHERE
	predicate = 'mixs:env_broad'"""

In [10]:
GOLDTERMS_SOIL = 'GOLDTERMS:4212'

In [11]:
goldterms_soil_subclass_query = f"""
select
	subject
from
	entailed_edge ee
where
	predicate = 'rdfs:subClassOf'
	and object = '{GOLDTERMS_SOIL}'
"""

In [12]:
previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

In [13]:
NMDC_RUNTIME_BASE_URL = 'https://api.microbiomedata.org/nmdcschema/'
STUDY_SET_COLLECTION = 'study_set'
BIOSAMPLE_SET_COLLECTION = 'biosample_set'

In [14]:
env_package_override_file = '../../mam-env-package-overrides.tsv'
override_column = 'mam_inferred_env_package'

In [15]:
# ncbi_duckdb_file = '../../ncbi_biosamples.duckdb'

ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples_2024-09-23.duckdb.gz'

In [16]:
gold_data_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
BIOSAMPLES_SHEET = "Biosample"

In [17]:
# Initialize the cache dictionaries used by get_hierarchy_terms (which predict_from_normalized_env_packages calls)
ancestor_cache = {}
descendant_cache = {}

In [18]:
# todo is filling memory with things like this a good idea? for understandability? or performance?
# todo they should be aggregated somewhere, as specified by the config.yaml
# todo or should we be going straight to data frames? in which case a list of dicts might be preferable
def get_curie_descendants_label_dict(curie, predicates, adapter):
    """
    Map every descendant of ``curie`` to its label.

    :param curie: CURIE of the term whose descendants are wanted.
    :param predicates: Predicates passed through to ``adapter.descendants``.
    :param adapter: Object providing ``descendants(curie, predicates=...)`` and ``label(curie)``.
    :return: Dict of descendant CURIE -> whatever ``adapter.label`` returns for it.
    """
    return {
        term_id: adapter.label(term_id)
        for term_id in adapter.descendants(curie, predicates=predicates)
    }

In [19]:
def curie_descendants_label_dict_to_lod(curie_label_dict):
    """Turn a CURIE -> label dict into a list of ``{'curie': ..., 'label': ...}`` dicts."""
    records = []
    for curie, label in curie_label_dict.items():
        records.append({'curie': curie, 'label': label})
    return records

In [20]:
def curie_descendants_label_lod_to_df(curie_label_lod):
    """Build a pandas DataFrame from a list of dicts (passed straight to ``pd.DataFrame``)."""
    frame = pd.DataFrame(data=curie_label_lod)
    return frame

In [21]:
def get_schemaview_from_source(source):
    """Construct and return a ``SchemaView`` (from ``linkml_runtime``) for ``source``."""
    schema_view = SchemaView(source)
    return schema_view

In [22]:
# def get_schema_from_schemaview(schemaview):
#     return schemaview.schema

In [23]:
def parse_hierarchically_underscored_strings(hierarchically_underscored_string_list):
    """
    Parse strings shaped like ``'__some label [PREFIX:0000001]'`` into label/CURIE records.

    Leading underscores are discarded (presumably they encode hierarchy depth -- confirm
    against the schema the strings come from). The CURIE is taken from the LAST ``' ['``
    in the string, so a label that itself contains ``' ['`` is parsed rather than
    failing on tuple unpacking.

    :param hierarchically_underscored_string_list: Iterable of strings to parse.
    :return: List of ``{'label': ..., 'curie': ...}`` dicts, one per input string, in order.
    :raises ValueError: If a string does not contain the ``' ['`` separator.
    """
    result = []
    for item in hierarchically_underscored_string_list:
        # rpartition splits on the last ' [' only; separator is '' when it is absent
        label, separator, curie = item.lstrip('_').rpartition(' [')
        if not separator:
            raise ValueError(f"Expected '<label> [<curie>]' but got {item!r}")
        # Drop the closing bracket(s) from the CURIE, then any stray whitespace
        result.append({'label': label.strip(), 'curie': curie.rstrip(']').strip()})
    return result

In [24]:
def dedupe_underscoreless_pvs(underscoreless_pvs):
    """
    Group labels by CURIE, keeping each distinct label once.

    :param underscoreless_pvs: Iterable of dicts with 'curie' and 'label' keys.
    :return: Dict of CURIE -> list of unique labels, in first-seen order.
    """
    labels_by_curie = {}
    for pv in underscoreless_pvs:
        # setdefault creates the (empty) label list the first time a CURIE is seen
        seen_labels = labels_by_curie.setdefault(pv['curie'], [])
        if pv['label'] in seen_labels:
            continue
        seen_labels.append(pv['label'])
    return labels_by_curie


In [25]:
def validate_curie_label_list_dict(curie_label_dict, adapter, print_flag=False):
    """
    Check that each CURIE's label from ``adapter`` is among the labels recorded for it.

    :param curie_label_dict: Dict of CURIE -> collection of candidate labels.
    :param adapter: Object providing ``label(curie)``.
    :param print_flag: When true, print a message for every CURIE that fails the check.
    :return: ``{"problems": [curie, ...], "valids": [{"curie": ..., "label": ...}, ...]}``.
    """
    report = {"problems": [], "valids": []}
    for curie, labels in curie_label_dict.items():
        true_label = adapter.label(curie)
        if true_label in labels:
            report["valids"].append({"curie": curie, "label": true_label})
            continue
        report["problems"].append(curie)
        if print_flag:
            print(f"Error: {curie} has true label {true_label} which doesn't appear in {labels}")
    return report

In [26]:
# todo could pre-determine the collection sizes
# todo could report elapsed time

def get_docs_from_nmdc_collection(base_url, collection_name, max_page_size=1000, stop_after=None, timeout=60):
    """
    Fetch all documents from a paginated API. Defaults to fetching a large number of documents per page.
    Optionally stop after a specified number of documents.

    If a page request returns a non-200 status, the status is printed and the documents
    collected so far are returned (i.e. the result may be partial).

    Parameters:
    - base_url: The base URL of the API endpoint (e.g., 'https://api.microbiomedata.org/nmdcschema/').
    - collection_name: The name of the collection to fetch (e.g., 'biosample_set').
    - max_page_size: The maximum number of items to retrieve per page (default 1000).
    - stop_after: Optional parameter to stop fetching after a certain number of documents (default None).
    - timeout: Seconds passed to `requests.get` as its `timeout` for each page request (default 60).
      Without a timeout a stalled connection would block the notebook indefinitely.

    Returns:
    - A list of documents fetched from the API.
    """
    documents = []
    page_token = None
    page_count = 0

    # Construct the full URL with the collection name
    url = f"{base_url}{collection_name}"

    while True:
        page_count += 1
        # Prepare the query parameters
        params = {
            'collection_name': collection_name,
            'max_page_size': max_page_size,  # Set large max_page_size to reduce pagination
        }

        if page_token:
            params['page_token'] = page_token  # Add the page token for pagination

        # Send the request to the API; raises a requests exception if the server stalls past `timeout`
        response = requests.get(url, params=params, timeout=timeout)

        if response.status_code != 200:
            print(f"Error fetching data: {response.status_code}")
            break

        data = response.json()

        # Add the current page of documents to the list
        page_documents = data['resources']
        documents.extend(page_documents)

        # Status reporting
        print(f"Fetched page {page_count} with {len(page_documents)} documents. Total fetched: {len(documents)}")

        # If stop_after is provided, stop fetching after reaching the specified number of documents
        if stop_after and len(documents) >= stop_after:
            documents = documents[:stop_after]  # Trim to the required number
            print(f"Reached stop_after limit of {stop_after} documents.")
            break

        # Check if there is a next page
        page_token = data.get('next_page_token')
        if not page_token:
            print("All documents fetched.")
            break  # Exit the loop if no more pages are available

    return documents

In [27]:

def get_name_or_rawval(env_scale: Dict[str, Any]) -> str:
    """
    Return the 'name' of ``env_scale['term']``, falling back to the term's 'has_raw_value'.

    An empty string is returned when ``env_scale`` is falsy, when it has no truthy 'term',
    or when the term has neither key.
    """
    if not env_scale:
        return ''
    term = env_scale.get('term')
    if not term:
        return ''
    if 'name' in term:
        return term['name']
    return term.get('has_raw_value', '')

In [28]:
def tsv_to_dict_of_dicts(tsv_file, outer_key_column):
    """
    Load a tab-separated file (UTF-8, with a header row) as a dict of row dicts.

    :param tsv_file: Path to the TSV file.
    :param outer_key_column: Header name of the column whose values become the outer keys.
    :return: Dict of outer key -> {other column: value}. When an outer key occurs on
        several rows, the last such row wins.
    """
    with open(tsv_file, newline='', encoding='utf-8') as handle:
        return {
            record[outer_key_column]: {
                column: cell for column, cell in record.items() if column != outer_key_column
            }
            for record in csv.DictReader(handle, delimiter='\t')
        }

In [29]:
# todo only gets authoritative labels from the passed adapter, which is presumably EnvO only
# todo would benefit from caching of labels

def biosamples_lod_context_extractor(biosamples_lod, adapter, env_pacakge_overrides=None):
    """
    Flatten biosample documents into one row (dict) of environmental context per biosample.

    For each of env_broad_scale, env_local_scale and env_medium the row carries the term id,
    the label stored with the biosample (via ``get_name_or_rawval``) and the label that
    ``adapter`` reports for that id. ``adapter.label`` is called once per distinct id and
    the result is reused for later biosamples.

    :param biosamples_lod: Iterable of biosample dicts. Each must have 'id' and, for the
        three env_* slots, a nested ``['term']['id']``; a missing one raises KeyError.
    :param adapter: Object providing ``label(curie)``.
    :param env_pacakge_overrides: Optional dict of biosample id -> dict containing
        'mam_inferred_env_package'. When a biosample id is present, that value replaces the
        row's 'normalized_env_package' and a message is printed. (The parameter name is
        misspelled but kept as-is because callers pass it by keyword.)
    :return: List of row dicts, in input order.
    """
    context_slots = ('env_broad_scale', 'env_local_scale', 'env_medium')
    override_key = 'mam_inferred_env_package'
    label_cache = {}

    def authoritative_label(curie):
        # The same few terms recur across many biosamples; look each one up only once
        if curie not in label_cache:
            label_cache[curie] = adapter.label(curie)
        return label_cache[curie]

    new_lod = []
    for biosample in biosamples_lod:
        biosample_id = biosample['id']
        insdc_identifiers = biosample.get('insdc_biosample_identifiers', [])

        # Optional scalar env_package.has_raw_value
        env_package_has_raw_value = biosample.get('env_package', {}).get('has_raw_value', '')

        row: Dict[str, str] = {
            'id': biosample_id,
            'insdc_biosample_identifiers': '|'.join(insdc_identifiers) if insdc_identifiers else '',
        }

        # Three columns per context slot, in the order: id, mongo label, authoritative label
        for slot in context_slots:
            term_id = biosample[slot]['term']['id']
            row[f'{slot}_id'] = term_id
            row[f'{slot}_mongo_label'] = get_name_or_rawval(biosample.get(slot))
            row[f'{slot}_auth_label'] = authoritative_label(term_id)

        row['env_package_has_raw_value'] = env_package_has_raw_value
        # todo abstract this though label search, or at least providing a lookup structure
        row['normalized_env_package'] = (
            'soil' if env_package_has_raw_value == 'ENVO:00001998' else env_package_has_raw_value.lower()
        )

        # Multivalued associated_studies, joined into one pipe-delimited string
        row['associated_studies'] = '|'.join(biosample.get('associated_studies', []))

        if env_pacakge_overrides and biosample_id in env_pacakge_overrides:
            print(
                f"Overriding env_package for biosample {biosample_id} from {row['normalized_env_package']} to {env_pacakge_overrides[biosample_id][override_key]}")
            row['normalized_env_package'] = env_pacakge_overrides[biosample_id][override_key]

        new_lod.append(row)
    return new_lod


In [30]:
def get_hierarchy_terms(curie: str, adapter) -> dict:
    """
    Return the labels of a term's 'is_a' ancestors and descendants.

    Results are memoised per CURIE in the module-level ``ancestor_cache`` and
    ``descendant_cache``. If a lookup raises, the error is printed and an empty list is
    cached for that CURIE.

    Args:
        curie (str): CURIE identifier for the ontology term.
        adapter: Ontology adapter providing ``ancestors``, ``descendants`` and ``label``.

    Returns:
        dict: ``{'ancestors': [...labels...], 'descendants': [...labels...]}``.
    """
    for relation, cache in (('ancestors', ancestor_cache), ('descendants', descendant_cache)):
        if curie in cache:
            continue
        try:
            # relation names the adapter method to call: adapter.ancestors / adapter.descendants
            related_ids = list(getattr(adapter, relation)(curie, predicates=[IS_A]))
            cache[curie] = [adapter.label(related_id) for related_id in related_ids if related_id]
        except Exception as e:
            print(f"Error retrieving {relation} for {curie}: {e}")
            cache[curie] = []

    return {
        'ancestors': ancestor_cache[curie],
        'descendants': descendant_cache[curie],
    }

In [31]:
def vectorize_terms(df, column):
    """
    Vectorize the term lists held in one column with a fresh ``CountVectorizer``.

    Each cell's terms are joined with spaces into one document (``None`` terms are
    skipped; a ``None`` cell becomes the empty string) before fitting.

    Args:
        df (pd.DataFrame): The input dataframe.
        column (str): The column name to vectorize; each cell is an iterable of terms or None.

    Returns:
        sparse matrix: The vectorized term matrix.
    """

    def terms_to_document(terms):
        if terms is None:
            return ''
        return ' '.join([str(term) for term in terms if term is not None])

    documents = df[column].apply(terms_to_document)
    return CountVectorizer().fit_transform(documents)

In [32]:
def predict_from_normalized_env_packages(df_raw, adapter):
    """
    Train a random forest on the rows with a known ``normalized_env_package`` and predict
    a package for every row.

    Features come from ``vectorize_terms`` applied to the ancestor and descendant labels
    (from ``get_hierarchy_terms``) of each row's env_broad_scale_id, env_local_scale_id and
    env_medium_id. Rows whose ``normalized_env_package`` is null or "" are left out of
    training and evaluation but still receive a prediction. A classification report for a
    30% hold-out split is printed.

    Args:
        df_raw (pd.DataFrame): Must contain 'env_broad_scale_id', 'env_local_scale_id',
            'env_medium_id' and 'normalized_env_package'. It is not modified.
        adapter: Ontology adapter passed through to ``get_hierarchy_terms``.

    Returns:
        The output of ``clf.predict`` for all rows of ``df_raw``, in row order.
    """
    df = df_raw.copy()

    # Build the feature blocks in a fixed order: ancestors then descendants for each column
    feature_blocks = []
    for column in ['env_broad_scale_id', 'env_local_scale_id', 'env_medium_id']:
        df[f'{column}_ancestors'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['ancestors'])
        df[f'{column}_descendants'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['descendants'])
        feature_blocks.append(vectorize_terms(df, f'{column}_ancestors'))
        feature_blocks.append(vectorize_terms(df, f'{column}_descendants'))

    # Combine all feature matrices; CSR is requested explicitly because rows are selected below
    X = hstack(feature_blocks, format='csr')

    # Rows usable for training: non-null, non-empty target
    labelled = df['normalized_env_package'].notnull() & (df['normalized_env_package'] != "")

    # Extract the target variable
    y = df.loc[labelled, 'normalized_env_package']

    # Select the matching rows of X by POSITION, so this stays correct even when the
    # frame's index is not the default 0..n-1 range
    X_filtered = X[labelled.to_numpy().nonzero()[0]]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.3, random_state=42)

    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # # Predict the normalized_env_package for all rows
    # df['predicted_normalized_env_package'] = clf.predict(X)

    # # If you want to add confidence scores for each class
    # class_probabilities = clf.predict_proba(X)
    #
    # # Get the class labels from the model
    # class_labels = clf.classes_
    #
    # # Add a column for each class with the corresponding confidence score
    # for i, class_label in enumerate(class_labels):
    #     df[f'confidence_{class_label}'] = class_probabilities[:, i]
    #
    # return df

    return clf.predict(X)

In [33]:
def parse_curie_label(text, approved_prefixes=('ENVO',)):
    """
    Split free text into a label and a CURIE with one of the approved prefixes.

    A CURIE is recognised as an approved prefix (any letter case), then ':' or '_'
    (optionally surrounded by whitespace), then digits.

    :param text: The string to parse.
    :param approved_prefixes: Sequence of prefixes to recognise; matched case-insensitively.
    :return: ``pd.Series([label, curie])``. When a CURIE is found, ``curie`` is
        ``'<MATCHED PREFIX, UPPERCASED>:<digits>'`` and ``label`` is ``text`` with every such
        CURIE removed and surrounding ``[]()`` and spaces stripped. Otherwise ``curie`` is
        None and ``label`` is ``text`` with colons replaced by spaces.
    """
    # Escape the prefixes so they are matched literally, and capture which one matched
    prefix_alternatives = '|'.join(re.escape(prefix) for prefix in approved_prefixes)
    pattern = re.compile(r'\b(' + prefix_alternatives + r')\s*[:_]\s*(\d+)\b', re.IGNORECASE)
    curie_match = pattern.search(text)

    if curie_match:
        # Standardize to 'PREFIX:ID' using the prefix that actually matched
        curie = f"{curie_match.group(1).upper()}:{curie_match.group(2)}"
        # Use the same compiled (case-insensitive) pattern for removal as for detection,
        # so e.g. 'envo:0000123' does not linger in the label
        label = pattern.sub("", text).strip("[]() ")
        return pd.Series([label, curie])

    # No CURIE found: replace any colons in the label with whitespace
    label = re.sub(r':', ' ', text)
    return pd.Series([label, None])


In [34]:
def get_longest_annotation_curie(text, adapter):
    """
    Annotate ``text`` with ``adapter.annotate_text`` and return the ``object_id`` of the
    annotation spanning the most characters (``subject_end - subject_start``).

    Returns None when there are no annotations.
    """

    def span_length(annotation):
        return annotation.subject_end - annotation.subject_start

    candidates = adapter.annotate_text(text)
    if not candidates:  # catches an empty list; an exhausted iterator is handled below
        return None
    try:
        return max(candidates, key=span_length).object_id
    except ValueError:
        # max() raises ValueError when given an empty iterable
        return None


In [35]:
# Derive the local file names from the last segment of the download URL's path;
# downloaded files are kept two directory levels up
ncbi_compressed_filename = urlparse(ncbi_duckdb_url).path.rsplit('/', 1)[-1]
ncbi_filename = os.path.splitext(ncbi_compressed_filename)[0]  # drop the final extension (.gz)
target_dir = os.path.join(os.pardir, os.pardir)

In [36]:
# Fetch the contents from the URL and save the compressed file in the target directory.
# The body is streamed to disk in chunks rather than held in memory as one bytes object,
# and raise_for_status() stops an HTTP error page from being saved as if it were the database.
ncbi_compressed_file_path = os.path.join(target_dir, ncbi_compressed_filename)
with requests.get(ncbi_duckdb_url, stream=True, timeout=60) as ncbi_response:
    ncbi_response.raise_for_status()
    with open(ncbi_compressed_file_path, "wb") as f:
        for chunk in ncbi_response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

# ~ 2 minutes @ 250 Mbps

In [37]:
# Decompress the gzip archive, writing the extracted file alongside it in the target directory
ncbi_uncompressed_file_path = os.path.join(target_dir, ncbi_filename)
with gzip.open(ncbi_compressed_file_path, "rb") as f_in, open(ncbi_uncompressed_file_path, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

# ~ 1 minute

In [38]:
ncbi_conn = duckdb.connect(database=ncbi_uncompressed_file_path, read_only=True)

In [39]:
envo_adapter = get_adapter(envo_adapter_string)

In [40]:
biome_descendants = get_curie_descendants_label_dict(BIOME_CURIE, [IS_A], envo_adapter)

In [41]:
biome_descendants_lod = curie_descendants_label_dict_to_lod(biome_descendants)

In [42]:
biome_descendants_frame = curie_descendants_label_lod_to_df(biome_descendants_lod)

In [43]:
biome_descendants_frame

Unnamed: 0,curie,label
0,ENVO:01001505,alpine tundra biome
1,ENVO:01000024,marine benthic biome
2,ENVO:01000252,freshwater lake biome
3,ENVO:01000180,tundra biome
4,ENVO:01000123,marine sponge reef biome
...,...,...
123,ENVO:01000858,marine upwelling biome
124,ENVO:01000188,tropical savanna biome
125,ENVO:01000042,neritic epipelagic zone biome
126,ENVO:01000045,epeiric sea biome


**Use `biome_descendants_frame` as an approximation of `local/biome-ids.tsv`**

In [44]:
sv = get_schemaview_from_source(previous_submission_schema_url)

In [45]:
soil_env_broad_scale_enum = sv.get_enum(ENV_BROAD_SCALE_ENUM)
soil_env_broad_scale_pvs_keys = list(soil_env_broad_scale_enum.permissible_values.keys())

In [46]:
initially_parsed_soil_env_broad_scale_pvs = parse_hierarchically_underscored_strings(soil_env_broad_scale_pvs_keys)

In [47]:
deduped_soil_env_broad_scale_pvs = dedupe_underscoreless_pvs(initially_parsed_soil_env_broad_scale_pvs)

In [48]:
pv_validation_results = validate_curie_label_list_dict(deduped_soil_env_broad_scale_pvs, envo_adapter, print_flag=True)

In [49]:
pv_validation_results

{'problems': [],
 'valids': [{'curie': 'ENVO:01001838', 'label': 'arid biome'},
  {'curie': 'ENVO:01001837', 'label': 'subalpine biome'},
  {'curie': 'ENVO:01001836', 'label': 'montane biome'},
  {'curie': 'ENVO:01000223', 'label': 'montane savanna biome'},
  {'curie': 'ENVO:01000216', 'label': 'montane shrubland biome'},
  {'curie': 'ENVO:01001835', 'label': 'alpine biome'},
  {'curie': 'ENVO:01001505', 'label': 'alpine tundra biome'},
  {'curie': 'ENVO:01001834', 'label': 'subpolar biome'},
  {'curie': 'ENVO:01001832', 'label': 'subtropical biome'},
  {'curie': 'ENVO:01001833', 'label': 'mediterranean biome'},
  {'curie': 'ENVO:01000229', 'label': 'mediterranean savanna biome'},
  {'curie': 'ENVO:01000217', 'label': 'mediterranean shrubland biome'},
  {'curie': 'ENVO:01000208', 'label': 'mediterranean woodland biome'},
  {'curie': 'ENVO:01000222', 'label': 'subtropical woodland biome'},
  {'curie': 'ENVO:01000213', 'label': 'subtropical shrubland biome'},
  {'curie': 'ENVO:01000187',

**Use `pv_validation_results['valids']` as an approximation of `local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv`**

In [50]:
all_biosamples = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL,
                                               BIOSAMPLE_SET_COLLECTION)  # no stop_after given: fetch the whole collection

Fetched page 1 with 1000 documents. Total fetched: 1000
Fetched page 2 with 1000 documents. Total fetched: 2000
Fetched page 3 with 1000 documents. Total fetched: 3000
Fetched page 4 with 1000 documents. Total fetched: 4000
Fetched page 5 with 1000 documents. Total fetched: 5000
Fetched page 6 with 1000 documents. Total fetched: 6000
Fetched page 7 with 1000 documents. Total fetched: 7000
Fetched page 8 with 1000 documents. Total fetched: 8000
Fetched page 9 with 320 documents. Total fetched: 8320
All documents fetched.


In [51]:
all_studies = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL, STUDY_SET_COLLECTION)  # no stop_after given: fetch the whole collection

Fetched page 1 with 29 documents. Total fetched: 29
All documents fetched.


In [52]:
env_pacakge_overrides = tsv_to_dict_of_dicts(env_package_override_file, 'id')

In [53]:
# env_pacakge_overrides
# todo or show as frame
# todo include some other columns for context?

In [54]:
biosample_contexts_lod = biosamples_lod_context_extractor(all_biosamples, envo_adapter,
                                                          env_pacakge_overrides=env_pacakge_overrides)

Overriding env_package for biosample nmdc:bsm-11-0k8nkx16 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-19v98823 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-1yvac190 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-28kgw077 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-2hswww54 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-34przm31 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-35m0rm03 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-3636w778 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nffqc45 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nhng665 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3r4g4610 from  to hydrocarbon resources-fluids_swabs
Overriding env_package

In [55]:
nmdc_biosample_contexts_frame = pd.DataFrame(biosample_contexts_lod)

In [56]:
# print a value count for the normalized_env_package column
print("Value counts for normalized_env_package column:")
print(nmdc_biosample_contexts_frame['normalized_env_package'].value_counts(dropna=False))

Value counts for normalized_env_package column:
normalized_env_package
                                                   5838
soil                                               1665
plant-associated                                    401
water                                               192
miscellaneous natural or artificial environment     140
host-associated                                      61
hydrocarbon resources-fluids_swabs                   23
Name: count, dtype: int64


In [57]:
package_predictions = predict_from_normalized_env_packages(nmdc_biosample_contexts_frame, envo_adapter)

                                                 precision    recall  f1-score   support

                                host-associated       1.00      1.00      1.00        21
             hydrocarbon resources-fluids_swabs       1.00      0.83      0.91         6
miscellaneous natural or artificial environment       1.00      1.00      1.00        44
                               plant-associated       1.00      1.00      1.00       132
                                           soil       1.00      1.00      1.00       489
                                          water       0.98      1.00      0.99        53

                                       accuracy                           1.00       745
                                      macro avg       1.00      0.97      0.98       745
                                   weighted avg       1.00      1.00      1.00       745



In [58]:
nmdc_biosample_contexts_frame['predicted_env_package'] = package_predictions

In [59]:
nmdc_biosample_contexts_frame.shape

(8320, 15)

In [60]:
nmdc_soil_biosample_contexts_frame = nmdc_biosample_contexts_frame[
    nmdc_biosample_contexts_frame['predicted_env_package'] == 'soil']

In [61]:
nmdc_soil_biosample_contexts_frame.shape

(5860, 15)

**filter and count, then use `nmdc_soil_biosample_contexts_frame` as an approximation of `local/nmdc-production-biosamples-soil-env_broad_scale.tsv`**

In [62]:
ncbi_frame = ncbi_conn.execute(ncbi_query).fetchdf()

In [63]:
# Number the query rows 1..n. Guarded because DataFrame.insert raises ValueError when the
# column already exists, which would otherwise break this cell on a re-run.
if 'serial_number' not in ncbi_frame.columns:
    ncbi_frame.insert(0, 'serial_number', range(1, len(ncbi_frame) + 1))

In [64]:
# includes env broad scale values with counts of one... useful for discovering drag-down submissions?

In [65]:
ncbi_frame['content_list'] = ncbi_frame['content'].str.split('|')

In [66]:
ncbi_frame['content_count'] = ncbi_frame['content_list'].apply(len)

In [67]:
ncbi_frame.shape

(4427, 5)

In [68]:
ncbi_frame = ncbi_frame.explode('content_list').reset_index(drop=True)

In [69]:
ncbi_frame.shape

(4492, 5)

In [70]:
# how many content_list strings contain envo multiple times now?

In [71]:
ncbi_frame['envo_count'] = ncbi_frame['content_list'].str.lower().str.count("envo")

In [72]:
ncbi_frame['envo_count'].value_counts()

envo_count
0    3582
1     909
2       1
Name: count, dtype: int64

Note: this count doesn't account for multiple label strings delimited with something other than '|'.

In [73]:
ncbi_frame[['extracted_label', 'extracted_curie']] = ncbi_frame['content_list'].apply(parse_curie_label)

In [74]:
parse_failures = ncbi_frame[
    (ncbi_frame['envo_count'] > 0) & (ncbi_frame['extracted_curie'].isna() | (ncbi_frame['extracted_curie'] == ''))]


In [75]:
parse_failures

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie
415,411,"ENVO:Temperate grasslands, savannas, and shrub...",37,"ENVO:Temperate grasslands, savannas, and shrub...",1,1,"ENVO Temperate grasslands, savannas, and shrub...",
423,419,ENVO:subtropical coniferous forest biome,36,ENVO:subtropical coniferous forest biome,1,1,ENVO subtropical coniferous forest biome,
425,421,ENVO:taiga,36,ENVO:taiga,1,1,ENVO taiga,
426,422,ENVO:Agricultural ecosystem,36,ENVO:Agricultural ecosystem,1,1,ENVO Agricultural ecosystem,
439,434,ENVO:Tropical and subtropical moist broadleaf ...,36,ENVO:Tropical and subtropical moist broadleaf ...,1,1,ENVO Tropical and subtropical moist broadleaf ...,
650,638,rhizosphere environment[ENVO01000999],15,rhizosphere environment[ENVO01000999],1,1,rhizosphere environment[ENVO01000999],
785,769,ENVO：00002255,9,ENVO：00002255,1,1,ENVO：00002255,
981,963,tropical biome [ENVO:01001830]|histosol [ENVO:...,4,wetland ecosystem [ENVO01001209],3,1,wetland ecosystem [ENVO01001209],


In [76]:
ncbi_frame['real_label'] = ncbi_frame['extracted_curie'].apply(envo_adapter.label)

In [77]:
# Annotate each 'extracted_label' value and keep the CURIE of its longest annotation
ncbi_frame['longest_annotation_curie'] = ncbi_frame['extracted_label'].apply(
    get_longest_annotation_curie, adapter=envo_adapter)


ERROR:root:Skipping statements(subject=ENVO:00000112,predicate=oio:hasDbXref,object=<http://www.eionet.europa.eu/gemet/concept/8704>,value=None,datatype=None,language=None,); ValueError: <http://www.eionet.europa.eu/gemet/concept/8704> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:00001996,predicate=oio:hasDbXref,object=<https://en.wikipedia.org/wiki/Acid_mine_drainage>,value=None,datatype=None,language=None,); ValueError: <https://en.wikipedia.org/wiki/Acid_mine_drainage> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000225,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests>,value=None,datatype=None,language=None,); ValueError: <https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000227,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical

In [78]:
ncbi_frame['longest_annotation_label'] = ncbi_frame['longest_annotation_curie'].apply(envo_adapter.label)

In [79]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label
0,1,missing,9203,missing,1,0,missing,,,,
1,2,not applicable,3869,not applicable,1,0,not applicable,,,CHEBI:25555,nitrogen atom
2,3,soil,3472,soil,1,0,soil,,,ENVO:00001998,soil
3,4,not collected,3464,not collected,1,0,not collected,,,ENVO:00000084,mountain pass
4,5,forest biome,2674,forest biome,1,0,forest biome,,,ENVO:01000174,forest biome
...,...,...,...,...,...,...,...,...,...,...,...
4487,4423,S_D4,1,S_D4,1,0,S_D4,,,CHEBI:26833,sulfur atom
4488,4424,paddy and upland,1,paddy and upland,1,0,paddy and upland,,,ENVO:00000182,plateau
4489,4425,"bulk soil of Larix decidua in a forest, prokar...",1,"bulk soil of Larix decidua in a forest, prokar...",1,0,"bulk soil of Larix decidua in a forest, prokar...",,,ENVO:00005802,bulk soil
4490,4426,"bulk soil of Larix decidua in a forest, fungal...",1,"bulk soil of Larix decidua in a forest, fungal...",1,0,"bulk soil of Larix decidua in a forest, fungal...",,,ENVO:00005802,bulk soil


**Use `ncbi_frame` as an approximation of `local/ncbi-mims-soil-biosamples-env_broad_scale-annotated.tsv`**

In [80]:
gold_biosamples_frame = pd.read_excel(gold_data_url, sheet_name=BIOSAMPLES_SHEET)
# 2 minutes

  warn("Workbook contains no default style, apply openpyxl's default")


In [81]:
gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].fillna(
    0).astype(int)


In [82]:
# gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].astype(int)

In [83]:
gold_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,Environmental,Aquatic,Thermal springs,Hot (42-90C),Unclassified
1,Gb0035601,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
2,Gb0035602,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
3,Gb0035635,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
4,Gb0035638,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210862,Gb0405291,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
210863,Gb0405292,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
210864,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified
210865,Gb0405294,Freshwater microbial communities from Prairie ...,449393.0,freshwater metagenome,lake water,2023-08-08,"USA: Prairie Lake NEON Field Site, Vashti, Nor...",47.159710,-99.118723,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified


In [84]:
# Derive the local file names from the last segment of the download URL's path.
goldterms_compressed_filename = urlparse(goldterms_semsql_url).path.rpartition('/')[2]
# splitext drops only the final extension of the compressed file name.
goldterms_filename = os.path.splitext(goldterms_compressed_filename)[0]
# Files are stored two directory levels above the working directory.
target_dir = os.path.join(os.pardir, os.pardir)

# Echo the uncompressed file name as a sanity check.
print(goldterms_filename)

goldterms.db


In [85]:
# Fetch the contents from the URL and save the compressed file in the target directory.
# The timeout keeps a stalled connection from hanging the notebook indefinitely.
goldterms_response = requests.get(goldterms_semsql_url, timeout=60)
# Fail fast on HTTP errors: without this check an error page would be written to the
# ".gz" path and only surface later as a confusing gzip failure.
goldterms_response.raise_for_status()
goldterms_compressed_file_path = os.path.join(target_dir, goldterms_compressed_filename)
with open(goldterms_compressed_file_path, "wb") as f:
    f.write(goldterms_response.content)

In [86]:
# Decompress the downloaded gzip archive next to it in the target directory.
goldterms_uncompressed_file_path = os.path.join(target_dir, goldterms_filename)
with gzip.open(goldterms_compressed_file_path, "rb") as compressed_stream, \
        open(goldterms_uncompressed_file_path, "wb") as extracted_stream:
    # Stream the bytes across instead of loading the whole database into memory.
    shutil.copyfileobj(compressed_stream, extracted_stream)

In [87]:
# Open the extracted GOLDTERMS database file with sqlite3.
# NOTE(review): no matching close() is visible in this part of the notebook.
goldterms_conn = sqlite3.connect(database=goldterms_uncompressed_file_path)

In [88]:
# Run the soil-subclass query against the GOLDTERMS database and load the result as a DataFrame.
goldterms_soil_subjects = pd.read_sql_query(
    sql=goldterms_soil_subclass_query,
    con=goldterms_conn,
)

In [89]:
# Capture the digits that directly follow "GOLDTERMS:"; subjects without such digits get NaN.
goldterms_soil_subjects['path_id'] = (
    goldterms_soil_subjects['subject'].str.extract(r'GOLDTERMS:(\d+)', expand=False)
)

In [90]:
# Display the soil subclass subjects together with their extracted path_id (NaN where none was found).
goldterms_soil_subjects

Unnamed: 0,subject,path_id
0,GOLDTERMS:5820,5820
1,GOLDTERMS:5421,5421
2,GOLDTERMS:5617,5617
3,GOLDTERMS:Environmental-Terrestrial-Soil-Natur...,
4,GOLDTERMS:Environmental-Terrestrial-Soil-Pasture,
...,...,...
81,GOLDTERMS:4203,4203
82,GOLDTERMS:Environmental-Terrestrial-Soil-Uncla...,
83,GOLDTERMS:5804,5804
84,GOLDTERMS:4241,4241


In [91]:
# Distinct, non-missing path IDs of the soil subjects, converted from text to int.
soil_path_ids = [
    int(path_id_text)
    for path_id_text in goldterms_soil_subjects['path_id'].dropna().unique()
]


In [92]:
# Keep only the biosamples whose ecosystem path ID is one of the soil path IDs.
# .copy() makes this an independent frame: a later cell assigns to its
# 'BIOSAMPLE ECOSYSTEM PATH ID' column, which on a bare boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
gold_soil_biosamples_frame = gold_biosamples_frame.loc[
    gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].isin(soil_path_ids)
].copy()


In [93]:
# Display the GOLD biosamples restricted to soil ecosystem path IDs.
gold_soil_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
11,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
12,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
13,Gb0050973,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classificaiton Distrophic oxisol,,Soils collected from different sites within th...,-29.539671,-55.107556,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
14,Gb0050974,Soil ecosystem from different sites within th...,410658.0,soil metagenome,"soil classification euic, hyperthermic lithic ...",,Soils collected from different sites within th...,26.663199,-80.628500,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
56,Gb0051017,Mammuthus primigenius fossil ecosystem from Bo...,444079.0,fossil metagenome,"Bolshaya Kolopatkaya river, Russia",,Russia: Sakha Republic,70.000000,151.000000,4418,Environmental,Terrestrial,Soil,Fossil,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210832,Gb0405261,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
210833,Gb0405262,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
210834,Gb0405263,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
210835,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified


In [94]:
# Load every GOLDTERMS statement whose predicate is mixs:env_broad.
goldterms_mixs_broad_frame = pd.read_sql_query(
    sql=goldterms_envo_query,
    con=goldterms_conn,
)

In [95]:
# Look up a label for each statement object through the ENVO adapter.
goldterms_mixs_broad_frame['mixs_broad_label'] = (
    goldterms_mixs_broad_frame['object'].map(envo_adapter.label)
)

In [96]:
# Capture the digits that directly follow "GOLDTERMS:"; name-style subjects get NaN.
goldterms_mixs_broad_frame['path_id'] = (
    goldterms_mixs_broad_frame['subject'].str.extract(r'GOLDTERMS:(\d+)', expand=False)
)

In [97]:
# Display the mixs:env_broad statements with the added label and path_id columns.
goldterms_mixs_broad_frame

Unnamed: 0,stanza,subject,predicate,object,value,datatype,language,graph,mixs_broad_label,path_id
0,GOLDTERMS:Engineered-Artificial-ecosystem,GOLDTERMS:Engineered-Artificial-ecosystem,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
1,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
2,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
3,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
4,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
...,...,...,...,...,...,...,...,...,...,...
643,GOLDTERMS:5841,GOLDTERMS:5841,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5841
644,GOLDTERMS:5843,GOLDTERMS:5843,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5843
645,GOLDTERMS:5846,GOLDTERMS:5846,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5846
646,GOLDTERMS:5849,GOLDTERMS:5849,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5849


In [98]:
# Normalise the join keys on both sides, then attach the GOLDTERMS mixs:env_broad
# statements to each soil biosample. .assign() and .copy() build new frames rather than
# writing a column into a filtered result, which is what raises SettingWithCopyWarning.

# Fill NaN values in 'BIOSAMPLE ECOSYSTEM PATH ID' with 0 and convert to int
gold_soil_biosamples_frame = gold_soil_biosamples_frame.assign(**{
    'BIOSAMPLE ECOSYSTEM PATH ID':
        gold_soil_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].fillna(0).astype(int),
})

# Drop rows with NaN in 'path_id' and keep the result as an independent copy
goldterms_mixs_broad_frame = goldterms_mixs_broad_frame.dropna(subset=['path_id']).copy()

# Convert 'path_id' to int so it has the same type as the biosample path IDs
goldterms_mixs_broad_frame['path_id'] = goldterms_mixs_broad_frame['path_id'].astype(int)

# Perform the left merge: every soil biosample is kept; path IDs without a
# mixs:env_broad statement get NaN in the GOLDTERMS columns.
# NOTE(review): a path_id with several statements would duplicate biosample rows — confirm uniqueness.
gold_soil_biosamples_inferred_broad = gold_soil_biosamples_frame.merge(
    goldterms_mixs_broad_frame,
    left_on='BIOSAMPLE ECOSYSTEM PATH ID',
    right_on='path_id',
    how='left'
)


In [99]:
# Display the soil biosamples joined to their GOLDTERMS mixs:env_broad statements.
gold_soil_biosamples_inferred_broad

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,mixs_broad_label,path_id
0,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
1,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
2,Gb0050973,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classificaiton Distrophic oxisol,,Soils collected from different sites within th...,-29.539671,-55.107556,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
3,Gb0050974,Soil ecosystem from different sites within th...,410658.0,soil metagenome,"soil classification euic, hyperthermic lithic ...",,Soils collected from different sites within th...,26.663199,-80.628500,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
4,Gb0051017,Mammuthus primigenius fossil ecosystem from Bo...,444079.0,fossil metagenome,"Bolshaya Kolopatkaya river, Russia",,Russia: Sakha Republic,70.000000,151.000000,4418,...,GOLDTERMS:4418,GOLDTERMS:4418,mixs:env_broad,ENVO:01000254,,,,,environmental system,4418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15365,Gb0405261,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
15366,Gb0405262,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
15367,Gb0405263,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
15368,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212


**Use `gold_soil_biosamples_inferred_broad` as an approximation of `local/goldData_biosamples-inferred-soil-env_broad_scale-counts.tsv`**

```config/soil-env_broad_scale-evidence-config.yaml``` from the same unmerged branch:

```yaml
- filename: local/biome-ids.tsv # biome_descendants_frame
  output_prefix: all_biomes_oak
  header: false
  data_column_number: 1
- filename: local/EnvBroadScaleSoilEnum-pvs-keys-parsed-unique.csv # pv_validation_results['valids']
  output_prefix: historical_permissible_values
  header: true
  data_column_name: normalized_curie
- filename: local/nmdc-production-biosamples-soil-env_broad_scale.tsv # biosample_contexts_frame
  output_prefix: NMDC_soil
  header: false
  data_column_number: 1
  count_column_number: 2
- filename: local/ncbi-mims-soil-biosamples-env_broad_scale-annotated.tsv # ncbi_frame
  output_prefix: NCBI_mims_soil_trusting_CURIe
  header: true
  data_column_name: normalized_curie
  count_column_name: count
- filename: local/ncbi-mims-soil-biosamples-env_broad_scale-annotated.tsv
  output_prefix: NCBI_mims_soil_trusting_labels
  header: true
  data_column_name: matched_id
  count_column_name: count
- filename: local/goldData_biosamples-inferred-soil-env_broad_scale-counts.tsv # gold_soil_biosamples_inferred_broad
  output_prefix: GOLD_env_terr_soil
  header: false
  data_column_number: 1
  count_column_number: 2
```

In [100]:
# Collects every CURIE contributed by the following cells; a set keeps them distinct.
include_in_rows: set = set()

In [101]:
# Add the CURIEs listed in biome_descendants_frame.
include_in_rows |= set(biome_descendants_frame['curie'])

In [102]:
# Add the CURIE of every entry under pv_validation_results['valids'].
include_in_rows.update(valid_pv['curie'] for valid_pv in pv_validation_results['valids'])

In [103]:
# Add the env_broad_scale IDs found in the NMDC biosample contexts.
# NOTE(review): this reads nmdc_biosample_contexts_frame, while the count cell further down
# reads nmdc_soil_biosample_contexts_frame — confirm which frame is intended here.
include_in_rows |= set(nmdc_biosample_contexts_frame['env_broad_scale_id'])

In [104]:
# Add the CURIEs from ncbi_frame's 'extracted_curie' column.
include_in_rows |= set(ncbi_frame['extracted_curie'])

In [105]:
# Add the CURIEs from ncbi_frame's 'longest_annotation_curie' column.
include_in_rows |= set(ncbi_frame['longest_annotation_curie'])

In [106]:
# Add the mixs:env_broad objects inferred for the GOLD soil biosamples.
include_in_rows |= set(gold_soil_biosamples_inferred_broad['object'])

In [107]:
# List of dicts, one per CURIE, filled by the loop in the next cell.
rows_lod: list = list()

In [108]:
# TODO MOVE THESE UP, because the expressions are already being used above

# Reference CURIE collections used to flag each row. They remain lists under their
# original names so any other cell that uses them is unaffected.
biome_curies = list(biome_descendants_frame['curie'])
legacy_pv_curies = [i['curie'] for i in pv_validation_results['valids']]
terrestrial_biome_curies = list(envo_adapter.descendants('ENVO:00000446', predicates=[IS_A]))
aquatic_biome_curies = list(envo_adapter.descendants('ENVO:00002030', predicates=[IS_A]))
abp_curies = list(envo_adapter.descendants('ENVO:01000813', predicates=[IS_A]))
env_sys_curies = list(envo_adapter.descendants('ENVO:01000254', predicates=[IS_A]))
env_mat_curies = list(envo_adapter.descendants('ENVO:00010483', predicates=[IS_A]))
obsoletes_curies = list(envo_adapter.obsoletes())

# Set copies give constant-time membership tests inside the loop; `in` on a list
# rescans the list for every CURIE and every flag.
biome_curie_set = set(biome_curies)
legacy_pv_curie_set = set(legacy_pv_curies)
terrestrial_biome_curie_set = set(terrestrial_biome_curies)
aquatic_biome_curie_set = set(aquatic_biome_curies)
abp_curie_set = set(abp_curies)
env_sys_curie_set = set(env_sys_curies)
env_mat_curie_set = set(env_mat_curies)
obsolete_curie_set = set(obsoletes_curies)

# Start from an empty list so re-running this cell cannot append duplicate rows.
rows_lod = []

for curie in include_in_rows:
    # Source columns may hold None or NaN for "no CURIE"; pd.isna catches both,
    # whereas an `is None` check lets NaN through to the string operations below.
    if pd.isna(curie):
        continue
    label = envo_adapter.label(curie)
    # partition() tolerates values with no ':' or several ':'; unpacking
    # curie.split(':') into two names raises ValueError for those.
    prefix = curie.partition(':')[0]
    # Key order is preserved because it determines the column order of the resulting frame.
    rows_lod.append({
        'curie': curie,
        'label': label,
        # "Native" means an ENVO-prefixed CURIE for which the adapter returned a label.
        'envo_native': prefix == 'ENVO' and label is not None,
        'obsolete': curie in obsolete_curie_set,
        'legacy_pv': curie in legacy_pv_curie_set,
        'abp': curie in abp_curie_set,
        'env_sys': curie in env_sys_curie_set,
        'biome': curie in biome_curie_set,
        'terrestrial_biome': curie in terrestrial_biome_curie_set,
        'aquatic_biome': curie in aquatic_biome_curie_set,
        'env_mat': curie in env_mat_curie_set,
    })

# todo terrestrial biome, aquatic biome, ABP, environmental material
# NOTE(review): all four of these flags are populated above, so this todo may be stale.


In [109]:
# One row per CURIE; columns follow the key order of the row dicts.
rows_frame = pd.DataFrame(data=rows_lod)

In [110]:
# Count NMDC soil biosamples per env_broad_scale ID, as a two-column frame (curie, nmdc_ebs_count).
nmdc_biosample_ebs_counts = (
    nmdc_soil_biosample_contexts_frame['env_broad_scale_id']
    .value_counts()
    .rename_axis('curie')
    .reset_index(name='nmdc_ebs_count')
)


In [111]:
# Display the per-CURIE NMDC soil biosample counts.
nmdc_biosample_ebs_counts

Unnamed: 0,curie,nmdc_ebs_count
0,ENVO:00000446,5400
1,ENVO:01000174,192
2,ENVO:01000177,113
3,ENVO:01000250,31
4,ENVO:01000221,27
5,ENVO:01000219,22
6,ENVO:01001837,18
7,ENVO:01000249,15
8,ENVO:01000245,9
9,ENVO:01000215,7


In [112]:
# Attach the NMDC counts to the CURIE rows. The left merge keeps every row; CURIEs
# with no NMDC count get NaN.
# Dropping any existing 'nmdc_ebs_count' first makes the cell safe to re-run: merging
# a second time would otherwise create suffixed 'nmdc_ebs_count_x' / '_y' columns.
rows_frame = rows_frame.drop(columns=['nmdc_ebs_count'], errors='ignore').merge(
    nmdc_biosample_ebs_counts,
    on='curie',
    how='left'
)

In [113]:
# Display the GOLD soil biosamples again before counting their mixs:env_broad objects.
gold_soil_biosamples_inferred_broad

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,mixs_broad_label,path_id
0,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
1,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
2,Gb0050973,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classificaiton Distrophic oxisol,,Soils collected from different sites within th...,-29.539671,-55.107556,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
3,Gb0050974,Soil ecosystem from different sites within th...,410658.0,soil metagenome,"soil classification euic, hyperthermic lithic ...",,Soils collected from different sites within th...,26.663199,-80.628500,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
4,Gb0051017,Mammuthus primigenius fossil ecosystem from Bo...,444079.0,fossil metagenome,"Bolshaya Kolopatkaya river, Russia",,Russia: Sakha Republic,70.000000,151.000000,4418,...,GOLDTERMS:4418,GOLDTERMS:4418,mixs:env_broad,ENVO:01000254,,,,,environmental system,4418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15365,Gb0405261,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
15366,Gb0405262,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
15367,Gb0405263,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212
15368,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:01000254,,,,,environmental system,4212


In [114]:
# Count GOLD soil biosamples per inferred mixs:env_broad object, as (curie, gold_ebs_count).
gold_biosample_ebs_counts = (
    gold_soil_biosamples_inferred_broad['object']
    .value_counts()
    .rename_axis('curie')
    .reset_index(name='gold_ebs_count')
)

In [115]:
# Display the per-CURIE GOLD soil biosample counts.
gold_biosample_ebs_counts

Unnamed: 0,curie,gold_ebs_count
0,ENVO:01000254,11867
1,ENVO:00000077,1612
2,ENVO:01001209,1527
3,ENVO:00000078,205
4,ENVO:00000232,159


In [116]:
# Attach the GOLD counts to the CURIE rows. The left merge keeps every row; CURIEs
# with no GOLD count get NaN.
# Dropping any existing 'gold_ebs_count' first makes the cell safe to re-run: merging
# a second time would otherwise create suffixed 'gold_ebs_count_x' / '_y' columns.
rows_frame = rows_frame.drop(columns=['gold_ebs_count'], errors='ignore').merge(
    gold_biosample_ebs_counts,
    on='curie',
    how='left'
)

In [117]:
# Display the CURIE table with its flag columns and the NMDC and GOLD count columns.
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,legacy_pv,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,nmdc_ebs_count,gold_ebs_count
0,ENVO:00002154,radioactive sediment,True,False,False,False,False,False,False,False,True,,
1,ENVO:01000390,temperate mixed needleleaf forest,True,False,False,True,True,False,False,False,False,,
2,ENVO:00002074,,False,False,False,False,False,False,False,False,False,,
3,ENVO:00002250,plinthosol,True,False,False,True,False,False,False,False,True,,
4,ENVO:00000054,saline marsh,True,False,False,False,True,False,False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,BFO:0000029,site,False,False,False,False,False,False,False,False,False,,
988,ENVO:01000430,broadleaf forest,True,False,False,True,True,False,False,False,False,,
989,ENVO:00002294,,False,False,False,False,False,False,False,False,False,,
990,ENVO:00002035,meteorite,True,False,False,False,False,False,False,False,False,,


In [118]:
# GOLD and NCBI counts are slightly trickier:
# - GOLD: may want to include presence or mapping in goldterms in addition to biosample counts
# - NCBI: we have both extracted CURIEs and annotated CURIEs

In [119]:
# todo move this stuff up to immediately after the creation of ncbi_frame


def _distinct_curies(row):
    """Return the row's distinct, non-missing CURIEs with the extracted CURIE first.

    Both None and NaN count as missing. The order is fixed (extracted, then longest
    annotation) so the lists, and the file written from this frame, are reproducible;
    building the list from a set gives an order that can differ between kernel sessions.
    """
    candidates = (row['extracted_curie'], row['longest_annotation_curie'])
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(curie for curie in candidates if pd.notna(curie)))


ncbi_frame['curie_list'] = ncbi_frame.apply(_distinct_curies, axis=1)

# Number of distinct CURIEs per row (0, 1 or 2).
ncbi_frame['unique_curie_count'] = ncbi_frame['curie_list'].apply(len)

In [120]:
# Display ncbi_frame with the new curie_list and unique_curie_count columns.
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count
0,1,missing,9203,missing,1,0,missing,,,,,[],0
1,2,not applicable,3869,not applicable,1,0,not applicable,,,CHEBI:25555,nitrogen atom,[CHEBI:25555],1
2,3,soil,3472,soil,1,0,soil,,,ENVO:00001998,soil,[ENVO:00001998],1
3,4,not collected,3464,not collected,1,0,not collected,,,ENVO:00000084,mountain pass,[ENVO:00000084],1
4,5,forest biome,2674,forest biome,1,0,forest biome,,,ENVO:01000174,forest biome,[ENVO:01000174],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4487,4423,S_D4,1,S_D4,1,0,S_D4,,,CHEBI:26833,sulfur atom,[CHEBI:26833],1
4488,4424,paddy and upland,1,paddy and upland,1,0,paddy and upland,,,ENVO:00000182,plateau,[ENVO:00000182],1
4489,4425,"bulk soil of Larix decidua in a forest, prokar...",1,"bulk soil of Larix decidua in a forest, prokar...",1,0,"bulk soil of Larix decidua in a forest, prokar...",,,ENVO:00005802,bulk soil,[ENVO:00005802],1
4490,4426,"bulk soil of Larix decidua in a forest, fungal...",1,"bulk soil of Larix decidua in a forest, fungal...",1,0,"bulk soil of Larix decidua in a forest, fungal...",,,ENVO:00005802,bulk soil,[ENVO:00005802],1


In [121]:
# Show how many rows have 0, 1 or 2 distinct CURIEs.
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    3204
0    1059
2     229
Name: count, dtype: int64

In [122]:
# Write the annotated NCBI frame to disk without the index.
# NOTE(review): the content is tab-separated although the file name ends in ".csv" —
# confirm whether a ".tsv" name is wanted before changing it, since readers may depend on this path.
ncbi_frame.to_csv(
    path_or_buf='ncbi_soil_samples_annotated.csv',
    sep='\t',
    index=False,
)