In [1]:
import csv
import gzip
import os
import re
import shutil
import sqlite3
from typing import Dict, Any
from urllib.parse import urlparse

import duckdb
import pandas as pd
import requests
from linkml_runtime import SchemaView
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer  # from scikit-learn
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import pprint

In [2]:
# todo deal with circularity in env package prediction -> env triad reporting

In [3]:
# Approved prefixes (case-insensitive)
approved_prefixes = ['ENVO']

In [4]:
MIN_ANNOTATION_LEN = 3

In [5]:
# make a biomes curie -> label dict?
BIOME_CURIE = 'ENVO:00000428'

In [6]:
# ENV_LC = 'soil'
ENV_LC = 'water'

In [7]:
# ENV_BROAD_SCALE_ENUM = "EnvBroadScaleSoilEnum"
ENV_BROAD_SCALE_ENUM = "EnvBroadScaleWaterEnum"

In [8]:
# todo new since soil: why are we only considering MIMS.me for discovering appropriate env triad values?

# NCBI_PACKAGE = 'MIMS.me.soil.6.0'
NCBI_PACKAGE = 'MIMS.me.water.6.0'

In [9]:
ncbi_query = f"""
SELECT content, COUNT(1) AS sample_count 
FROM attributes 
WHERE harmonized_name = 'env_broad_scale' AND package_content = '{NCBI_PACKAGE}'
GROUP BY content
ORDER BY COUNT(1) DESC
"""

In [10]:
envo_adapter_string = "sqlite:obo:envo"

In [11]:
# goldterms_adapter_string = "sqlite:obo:envo"

In [12]:
goldterms_semsql_url = "https://s3.amazonaws.com/bbop-sqlite/goldterms.db.gz"

# https://s3.amazonaws.com/bbop-sqlite/
# <Contents>
# <Key>goldterms.db.gz</Key>
# <LastModified>2024-11-03T17:24:56.000Z</LastModified>
# <ETag>"fe8e35b215786cb9fc347b7fadbe055f"</ETag>
# <Size>2935781</Size>
# <StorageClass>STANDARD</StorageClass>
# </Contents>


In [13]:
# todo could this have been done with a OAK query, eliminating the need to explicitly download the file?

goldterms_envo_query = """
SELECT
	*
FROM
	statements s
WHERE
	predicate = 'mixs:env_broad'"""

In [14]:
GOLDTERMS_SOIL = 'GOLDTERMS:4212'
GOLDTERMS_WATER = 'GOLDTERMS:3984'

In [15]:
# GOLDTERMS_ROOT = GOLDTERMS_SOIL
GOLDTERMS_ROOT = GOLDTERMS_WATER

In [16]:
goldterms_subclass_query = f"""
select
	subject
from
	entailed_edge ee
where
	predicate = 'rdfs:subClassOf'
	and object = '{GOLDTERMS_ROOT}'
"""

In [17]:
previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

In [18]:
NMDC_RUNTIME_BASE_URL = 'https://api.microbiomedata.org/nmdcschema/'
STUDY_SET_COLLECTION = 'study_set'
BIOSAMPLE_SET_COLLECTION = 'biosample_set'

In [19]:
env_package_override_file = '../../mam-env-package-overrides.tsv'
override_column = 'mam_inferred_env_package'

In [20]:
ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples_2024-09-23.duckdb.gz'

In [21]:
gold_data_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
BIOSAMPLES_SHEET = "Biosample"

In [22]:
# Initialize cache dictionaries for predict_from_normalized_env_packages
# todo how to move the definitions for functiosn that use these globals?
ancestor_cache = {}
descendant_cache = {}

In [23]:
# todo is filling memory with things like this a good idea? for understandability? or performance?
# todo they should be aggregated somewhere, as specified by the config.yaml
# todo or should we going straight to data frames? in which case a dlist of dicts might be preferable
def get_curie_descendants_label_dict(curie, predicates, adapter):
    curie_label_dict = {}
    for descendant in adapter.descendants(curie, predicates=predicates):
        curie_label_dict[descendant] = adapter.label(descendant)
    return curie_label_dict

In [24]:
def curie_descendants_label_dict_to_lod(curie_label_dict):
    return [{'curie': k, 'label': v} for k, v in curie_label_dict.items()]

In [25]:
def curie_descendants_label_lod_to_df(curie_label_lod):
    return pd.DataFrame(curie_label_lod)

In [26]:
def get_schemaview_from_source(source):
    return SchemaView(source)

In [28]:
def parse_hierarchically_underscored_strings(hierarchically_underscored_string_list):
    result = []
    for item in hierarchically_underscored_string_list:
        # Remove leading underscores for label, split on '[' to separate curie
        label, curie = item.lstrip('_').split(' [')
        # Remove the trailing ']' from curie
        curie = curie.rstrip(']')
        # Append dictionary with label and curie
        result.append({'label': label.strip(), 'curie': curie.strip()})
    return result

In [29]:
def dedupe_underscoreless_pvs(underscoreless_pvs):
    # Dictionary to store CURIE as key and list of unique labels as values
    curie_to_labels = {}

    for item in underscoreless_pvs:
        curie = item['curie']
        label = item['label']

        # Initialize the list if curie is not yet a key
        if curie not in curie_to_labels:
            curie_to_labels[curie] = []

        # Add label if it is not already in the list for this curie
        if label not in curie_to_labels[curie]:
            curie_to_labels[curie].append(label)
    return curie_to_labels


In [30]:
def validate_curie_label_list_dict(curie_label_dict, adapter, print_flag=False):
    problem_curies = []
    valid_curies = []
    for curie, labels in curie_label_dict.items():
        true_label = adapter.label(curie)
        if true_label not in labels:
            problem_curies.append(curie)
            if print_flag:
                print(f"Error: {curie} has true label {true_label} which doesn't appear in {labels}")
        else:
            valid_curies.append({"curie": curie, "label": true_label})
    return {"problems": problem_curies, "valids": valid_curies}

In [31]:
# todo could pre-determine the collection sizes
# todo could report elapsed time

def get_docs_from_nmdc_collection(base_url, collection_name, max_page_size=1000, stop_after=None):
    """
    Fetch all documents from a paginated API. Defaults to fetching a large number of documents per page.
    Optionally stop after a specified number of documents.

    Parameters:
    - base_url: The base URL of the API endpoint (e.g., 'https://api.microbiomedata.org/nmdcschema/').
    - collection_name: The name of the collection to fetch (e.g., 'biosample_set').
    - max_page_size: The maximum number of items to retrieve per page (default 1000).
    - stop_after: Optional parameter to stop fetching after a certain number of documents (default None).

    Returns:
    - A list of documents fetched from the API.
    """
    documents = []
    page_token = None
    total_documents = 0
    page_count = 0

    # Construct the full URL with the collection name
    url = f"{base_url}{collection_name}"

    while True:
        page_count += 1
        # Prepare the query parameters
        params = {
            'collection_name': collection_name,
            'max_page_size': max_page_size,  # Set large max_page_size to reduce pagination
        }

        if page_token:
            params['page_token'] = page_token  # Add the page token for pagination

        # Send the request to the API
        response = requests.get(url, params=params)

        if response.status_code != 200:
            print(f"Error fetching data: {response.status_code}")
            break

        data = response.json()

        # Add the current page of documents to the list
        num_documents_on_page = len(data['resources'])
        documents.extend(data['resources'])
        total_documents += num_documents_on_page

        # Status reporting
        print(f"Fetched page {page_count} with {num_documents_on_page} documents. Total fetched: {total_documents}")

        # If stop_after is provided, stop fetching after reaching the specified number of documents
        if stop_after and total_documents >= stop_after:
            documents = documents[:stop_after]  # Trim to the required number
            print(f"Reached stop_after limit of {stop_after} documents.")
            break

        # Check if there is a next page
        page_token = data.get('next_page_token')
        if not page_token:
            print("All documents fetched.")
            break  # Exit the loop if no more pages are available

    return documents

In [32]:

def get_name_or_rawval(env_scale: Dict[str, Any]) -> str:
    """Safely extract label from environmental scale data."""
    if env_scale:
        term = env_scale.get('term')
        if term:
            return term.get('name', term.get('has_raw_value', ''))
    return ''

In [33]:
def tsv_to_dict_of_dicts(tsv_file, outer_key_column):
    """
    Reads a TSV file into a dictionary of dictionaries.

    :param tsv_file: Path to the TSV file.
    :param outer_key_column: The column name or index to be used as the key for the outer dictionary.
    :return: A dictionary of dictionaries, with outer keys being the values from the specified column.
    """
    with open(tsv_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')

        result = {}

        for row in reader:
            outer_key = row[outer_key_column]
            result[outer_key] = {key: value for key, value in row.items() if key != outer_key_column}

    return result

In [34]:
# todo only gets authoritative labels from the passed adapter, which is presumably EnvO only
# todo would benefit from caching of labels

def biosamples_lod_context_extractor(biosamples_lod, adapter, env_pacakge_overrides=None):
    new_lod = []
    for biosample in biosamples_lod:
        insdc_identifiers = biosample.get('insdc_biosample_identifiers', [])

        env_broad_scale_label = get_name_or_rawval(biosample.get('env_broad_scale'))
        env_local_scale_label = get_name_or_rawval(biosample.get('env_local_scale'))
        env_medium_label = get_name_or_rawval(biosample.get('env_medium'))

        # Extracting optional scalar env_package.has_raw_value
        env_package_has_raw_value = biosample.get('env_package', {}).get('has_raw_value', '')

        # Extracting required multivalued part_of
        associated_studies = '|'.join(biosample.get('associated_studies', []))  # Assuming part_of is a list of strings

        row: Dict[str, str] = {
            'id': biosample['id'],
            'insdc_biosample_identifiers': '|'.join(insdc_identifiers) if insdc_identifiers else '',

            'env_broad_scale_id': biosample['env_broad_scale']['term']['id'],
            'env_broad_scale_mongo_label': env_broad_scale_label,
            'env_broad_scale_auth_label': adapter.label(biosample['env_broad_scale']['term']['id']),

            'env_local_scale_id': biosample['env_local_scale']['term']['id'],
            'env_local_scale_mongo_label': env_local_scale_label,
            'env_local_scale_auth_label': adapter.label(biosample['env_local_scale']['term']['id']),

            'env_medium_id': biosample['env_medium']['term']['id'],
            'env_medium_mongo_label': env_medium_label,
            'env_medium_auth_label': adapter.label(biosample['env_medium']['term']['id']),

            'env_package_has_raw_value': env_package_has_raw_value,
            'normalized_env_package': 'soil' if env_package_has_raw_value == 'ENVO:00001998' else env_package_has_raw_value.lower(),
            # todo abstract this though label search, or at least providing a lookup structure

            'associated_studies': associated_studies
        }

        if env_pacakge_overrides and biosample['id'] in env_pacakge_overrides:
            print(
                f"Overriding env_package for biosample {biosample['id']} from {row['normalized_env_package']} to {env_pacakge_overrides[biosample['id']]['mam_inferred_env_package']}")
            row['normalized_env_package'] = env_pacakge_overrides[biosample['id']]['mam_inferred_env_package']

        new_lod.append(row)
    return new_lod


In [35]:
def get_hierarchy_terms(curie: str, adapter) -> dict:
    """
    Extract ancestor and descendant terms from the ontology for a given CURIE,
    using caching to improve performance and filtering by 'is_a' relationships.

    Args:
        curie (str): CURIE identifier for the ontology term.
        adapter: Ontology adapter.

    Returns:
        dict: Dictionary containing lists of ancestor and descendant terms.
    """
    if curie in ancestor_cache:
        ancestors = ancestor_cache[curie]
    else:
        try:
            ancestors = list(adapter.ancestors(curie, predicates=[IS_A]))
            ancestor_cache[curie] = [adapter.label(ancestor) for ancestor in ancestors if ancestor]
        except Exception as e:
            print(f"Error retrieving ancestors for {curie}: {e}")
            ancestor_cache[curie] = []

    if curie in descendant_cache:
        descendants = descendant_cache[curie]
    else:
        try:
            descendants = list(adapter.descendants(curie, predicates=[IS_A]))
            descendant_cache[curie] = [adapter.label(descendant) for descendant in descendants if descendant]
        except Exception as e:
            print(f"Error retrieving descendants for {curie}: {e}")
            descendant_cache[curie] = []

    return {
        'ancestors': ancestor_cache[curie],
        'descendants': descendant_cache[curie],
    }

In [36]:
def vectorize_terms(df, column):
    """
    Vectorize the ancestor or descendant terms for a given column.

    Args:
        df (pd.DataFrame): The input dataframe.
        column (str): The column name to vectorize.

    Returns:
        sparse matrix: The vectorized term matrix.
    """
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(
        df[column].apply(lambda x: ' '.join([str(term) for term in x if term is not None]) if x is not None else '')
    )

In [37]:
def predict_from_normalized_env_packages(df_raw, adapter):
    # Apply the function to the relevant columns

    df = df_raw.copy()
    for column in ['env_broad_scale_id', 'env_local_scale_id', 'env_medium_id']:
        df[f'{column}_ancestors'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['ancestors'])
        df[f'{column}_descendants'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['descendants'])

    # Vectorize each set of terms separately
    broad_scale_ancestors = vectorize_terms(df, 'env_broad_scale_id_ancestors')
    broad_scale_descendants = vectorize_terms(df, 'env_broad_scale_id_descendants')

    local_scale_ancestors = vectorize_terms(df, 'env_local_scale_id_ancestors')
    local_scale_descendants = vectorize_terms(df, 'env_local_scale_id_descendants')

    medium_ancestors = vectorize_terms(df, 'env_medium_id_ancestors')
    medium_descendants = vectorize_terms(df, 'env_medium_id_descendants')

    # Combine all feature matrices
    X = hstack([
        broad_scale_ancestors,
        broad_scale_descendants,
        local_scale_ancestors,
        local_scale_descendants,
        medium_ancestors,
        medium_descendants
    ])

    # Filter the DataFrame to only include non-null rows for the target column
    df_filtered = df[df['normalized_env_package'].notnull() & (df['normalized_env_package'] != "")]

    # Extract the target variable
    y = df_filtered['normalized_env_package']

    # Ensure X corresponds to the filtered rows
    X_filtered = X[df_filtered.index]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.3, random_state=42)

    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # not determining confidence for each class nor saving any diagnostics any more

    return clf.predict(X)

In [38]:
def parse_curie_label(text, approved_prefixes=['ENVO']):
    # Case-insensitive pattern for matching approved prefixes followed by an ID
    pattern = r'\b(?:' + '|'.join(approved_prefixes) + r')\s*[:_]\s*(\d+)\b'
    curie_match = re.search(pattern, text, re.IGNORECASE)

    if curie_match:
        curie = f"{approved_prefixes[0].upper()}:{curie_match.group(1)}"  # standardize prefix to 'ENVO:ID'
        label = re.sub(pattern, "", text).strip("[]() ")
        # replace any colons in the label with a whitespace
        return pd.Series([label, curie])
    else:
        label = re.sub(r':', ' ', text)
        return pd.Series([label, None])  # No CURIE found, return original label and None for CURIE


In [39]:
def get_longest_annotation_curie(text, adapter):
    annotations = adapter.annotate_text(text)
    if not annotations:  # Check if annotations list is empty
        return None
    try:
        longest_annotation = max(annotations, key=lambda x: x.subject_end - x.subject_start)

        if longest_annotation.subject_end - longest_annotation.subject_start < MIN_ANNOTATION_LEN:
            return None
        return longest_annotation.object_id
    except ValueError:
        return None  # Return None if there's an unexpected issue with finding the max


In [40]:
# Determine the filenames and target directory
ncbi_compressed_filename = urlparse(ncbi_duckdb_url).path.split('/')[-1]
ncbi_filename = os.path.splitext(ncbi_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

In [41]:
# Fetch the contents from the URL and save compressed file in target directory
ncbi_response = requests.get(ncbi_duckdb_url)
ncbi_compressed_file_path = os.path.join(target_dir, ncbi_compressed_filename)
with open(ncbi_compressed_file_path, "wb") as f:
    f.write(ncbi_response.content)

# ~ 2 minutes @ 250 Mbps

In [42]:
# Unzip the compressed file and save the extracted file in target directory
ncbi_uncompressed_file_path = os.path.join(target_dir, ncbi_filename)
with gzip.open(ncbi_compressed_file_path, "rb") as f_in:
    with open(ncbi_uncompressed_file_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# ~ 2 minutes

In [43]:
ncbi_conn = duckdb.connect(database=ncbi_uncompressed_file_path, read_only=True)

In [44]:
envo_adapter = get_adapter(envo_adapter_string)

In [45]:
biome_descendants = get_curie_descendants_label_dict(BIOME_CURIE, [IS_A], envo_adapter)

In [46]:
biome_descendants_lod = curie_descendants_label_dict_to_lod(biome_descendants)

In [47]:
biome_descendants_frame = curie_descendants_label_lod_to_df(biome_descendants_lod)

In [48]:
biome_descendants_frame

Unnamed: 0,curie,label
0,ENVO:01001505,alpine tundra biome
1,ENVO:01000024,marine benthic biome
2,ENVO:01000252,freshwater lake biome
3,ENVO:01000180,tundra biome
4,ENVO:01000123,marine sponge reef biome
...,...,...
123,ENVO:01000858,marine upwelling biome
124,ENVO:01000188,tropical savanna biome
125,ENVO:01000042,neritic epipelagic zone biome
126,ENVO:01000045,epeiric sea biome


----

In [49]:
sv = get_schemaview_from_source(previous_submission_schema_url)

In [50]:
# todo break out slow steps into its own cell

try:
    env_broad_scale_enum_def = sv.get_enum(ENV_BROAD_SCALE_ENUM)
    env_broad_scale_pvs_keys = list(env_broad_scale_enum_def.permissible_values.keys())
except AttributeError as e:
    # Handle the AttributeError
    print(f"An AttributeError occurred: {e}")
    env_broad_scale_pvs_keys =[]
    

An AttributeError occurred: 'NoneType' object has no attribute 'permissible_values'


In [51]:
initially_parsed_env_broad_scale_pvs = parse_hierarchically_underscored_strings(env_broad_scale_pvs_keys)

In [52]:
deduped_env_broad_scale_pvs = dedupe_underscoreless_pvs(initially_parsed_env_broad_scale_pvs)

In [53]:
pv_validation_results = validate_curie_label_list_dict(deduped_env_broad_scale_pvs, envo_adapter, print_flag=True)

In [54]:
pv_validation_results

{'problems': [], 'valids': []}

----

In [55]:
# todo rename to all_nmdc_samples etc
all_biosamples = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL,
                                               BIOSAMPLE_SET_COLLECTION)  # Example with stop_afte

Fetched page 1 with 1000 documents. Total fetched: 1000
Fetched page 2 with 1000 documents. Total fetched: 2000
Fetched page 3 with 1000 documents. Total fetched: 3000
Fetched page 4 with 1000 documents. Total fetched: 4000
Fetched page 5 with 1000 documents. Total fetched: 5000
Fetched page 6 with 1000 documents. Total fetched: 6000
Fetched page 7 with 1000 documents. Total fetched: 7000
Fetched page 8 with 1000 documents. Total fetched: 8000
Fetched page 9 with 320 documents. Total fetched: 8320
All documents fetched.


In [56]:
# todo I don't think we're actually using this
all_studies = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL, STUDY_SET_COLLECTION)  # Example with stop_after

Fetched page 1 with 29 documents. Total fetched: 29
All documents fetched.


In [57]:
env_pacakge_overrides = tsv_to_dict_of_dicts(env_package_override_file, 'id')

In [58]:
# env_pacakge_overrides
# todo or show as frame
# todo include some other columns for context?

In [59]:
biosample_contexts_lod = biosamples_lod_context_extractor(all_biosamples, envo_adapter,
                                                          env_pacakge_overrides=env_pacakge_overrides)

Overriding env_package for biosample nmdc:bsm-11-0k8nkx16 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-19v98823 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-1yvac190 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-28kgw077 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-2hswww54 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-34przm31 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-35m0rm03 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-3636w778 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nffqc45 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nhng665 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3r4g4610 from  to hydrocarbon resources-fluids_swabs
Overriding env_package

In [60]:
nmdc_biosample_contexts_frame = pd.DataFrame(biosample_contexts_lod)

In [61]:
# print a value count for the normalized_env_package column
print("Value counts for normalized_env_package column:")
print(nmdc_biosample_contexts_frame['normalized_env_package'].value_counts(dropna=False))

Value counts for normalized_env_package column:
normalized_env_package
                                                   5838
soil                                               1665
plant-associated                                    401
water                                               192
miscellaneous natural or artificial environment     140
host-associated                                      61
hydrocarbon resources-fluids_swabs                   23
Name: count, dtype: int64


In [62]:
package_predictions = predict_from_normalized_env_packages(nmdc_biosample_contexts_frame, envo_adapter)

                                                 precision    recall  f1-score   support

                                host-associated       1.00      1.00      1.00        21
             hydrocarbon resources-fluids_swabs       1.00      0.83      0.91         6
miscellaneous natural or artificial environment       1.00      1.00      1.00        44
                               plant-associated       1.00      1.00      1.00       132
                                           soil       1.00      1.00      1.00       489
                                          water       0.98      1.00      0.99        53

                                       accuracy                           1.00       745
                                      macro avg       1.00      0.97      0.98       745
                                   weighted avg       1.00      1.00      1.00       745



In [63]:
nmdc_biosample_contexts_frame['predicted_env_package'] = package_predictions

In [64]:
nmdc_biosample_contexts_frame.shape

(8320, 15)

In [65]:
nmdc_biosample_contexts_frame = nmdc_biosample_contexts_frame[
    nmdc_biosample_contexts_frame['predicted_env_package'] == ENV_LC]

In [66]:
nmdc_biosample_contexts_frame.shape

(1649, 15)

----

In [67]:
ncbi_frame = ncbi_conn.execute(ncbi_query).fetchdf()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [68]:
ncbi_frame.insert(0, 'serial_number', range(1, len(ncbi_frame) + 1))

In [69]:
# includes env broad scale values with counts of one... useful for discovering drag-down submissions?

In [70]:
ncbi_frame['content_list'] = ncbi_frame['content'].str.split('|')

In [71]:
ncbi_frame['content_count'] = ncbi_frame['content_list'].apply(len)

In [72]:
ncbi_frame.shape

(3037, 5)

In [73]:
ncbi_frame = ncbi_frame.explode('content_list').reset_index(drop=True)

In [74]:
ncbi_frame.shape

(3299, 5)

In [75]:
# how many content_list strings contain envo multiple times now?

In [76]:
ncbi_frame['envo_count'] = ncbi_frame['content_list'].str.lower().str.count("envo")

In [77]:
ncbi_frame['envo_count'].value_counts()

envo_count
0    1930
1    1160
2     207
3       2
Name: count, dtype: int64

doesn't account for multiple label strings delimited with something other than '|'

In [78]:
ncbi_frame[['extracted_label', 'extracted_curie']] = ncbi_frame['content_list'].apply(parse_curie_label)

In [79]:
parse_failures = ncbi_frame[
    (ncbi_frame['envo_count'] > 0) & (ncbi_frame['extracted_curie'].isna() | (ncbi_frame['extracted_curie'] == ''))]


In [80]:
parse_failures

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie


In [81]:
ncbi_frame['real_label'] = ncbi_frame['extracted_curie'].apply(envo_adapter.label)

In [82]:
# Apply the function to each row in the 'label' column
ncbi_frame['longest_annotation_curie'] = ncbi_frame['extracted_label'].apply(
    lambda x: get_longest_annotation_curie(x, envo_adapter))


ERROR:root:Skipping statements(subject=ENVO:00000112,predicate=oio:hasDbXref,object=<http://www.eionet.europa.eu/gemet/concept/8704>,value=None,datatype=None,language=None,); ValueError: <http://www.eionet.europa.eu/gemet/concept/8704> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:00001996,predicate=oio:hasDbXref,object=<https://en.wikipedia.org/wiki/Acid_mine_drainage>,value=None,datatype=None,language=None,); ValueError: <https://en.wikipedia.org/wiki/Acid_mine_drainage> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000225,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests>,value=None,datatype=None,language=None,); ValueError: <https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000227,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical

In [83]:
ncbi_frame['longest_annotation_label'] = ncbi_frame['longest_annotation_curie'].apply(envo_adapter.label)

In [84]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label
0,1,marine,3700,marine,1,0,marine,,,,
1,2,ocean biome,2415,ocean biome,1,0,ocean biome,,,ENVO:01000048,ocean biome
2,3,tundra biome,2089,tundra biome,1,0,tundra biome,,,ENVO:01000180,tundra biome
3,4,missing,2088,missing,1,0,missing,,,,
4,5,freshwater biome,1932,freshwater biome,1,0,freshwater biome,,,ENVO:00000873,freshwater biome
...,...,...,...,...,...,...,...,...,...,...,...
3294,3035,"=IF(F279=""Tissue"",""coral reef [ENVO:00000150]""...",1,"coral reef [ENVO:00000150]"")",2,1,"coral reef []""",ENVO:00000150,coral reef,ENVO:00000150,coral reef
3295,3036,"=IF(F285=""Tissue"",""coral reef [ENVO:00000150]""...",1,"=IF(F285=""Tissue"",""coral reef [ENVO:00000150]""...",2,2,"=IF(F285=""Tissue"",""coral reef []"",""sea water",ENVO:00000150,coral reef,ENVO:00000150,coral reef
3296,3036,"=IF(F285=""Tissue"",""coral reef [ENVO:00000150]""...",1,"coral reef [ENVO:00000150]"")",2,1,"coral reef []""",ENVO:00000150,coral reef,ENVO:00000150,coral reef
3297,3037,"=IF(F290=""Tissue"",""coral reef [ENVO:00000150]""...",1,"=IF(F290=""Tissue"",""coral reef [ENVO:00000150]""...",2,2,"=IF(F290=""Tissue"",""coral reef []"",""sea water",ENVO:00000150,coral reef,ENVO:00000150,coral reef


----

In [85]:
gold_biosamples_frame = pd.read_excel(gold_data_url, sheet_name=BIOSAMPLES_SHEET)
# 2 minutes

  warn("Workbook contains no default style, apply openpyxl's default")


In [86]:
gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].fillna(
    0).astype(int)


In [87]:
gold_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,Environmental,Aquatic,Thermal springs,Hot (42-90C),Unclassified
1,Gb0035601,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
2,Gb0035602,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
3,Gb0035635,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
4,Gb0035638,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210873,Gb0405291,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
210874,Gb0405292,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
210875,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified
210876,Gb0405294,Freshwater microbial communities from Prairie ...,449393.0,freshwater metagenome,lake water,2023-08-08,"USA: Prairie Lake NEON Field Site, Vashti, Nor...",47.159710,-99.118723,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified


In [88]:
# Determine the filenames and target directory
goldterms_compressed_filename = urlparse(goldterms_semsql_url).path.split('/')[-1]
goldterms_filename = os.path.splitext(goldterms_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

# Print to confirm the filenames
print(goldterms_filename)

goldterms.db


In [89]:
# Fetch the contents from the URL and save compressed file in target directory
goldterms_response = requests.get(goldterms_semsql_url)
goldterms_compressed_file_path = os.path.join(target_dir, goldterms_compressed_filename)
with open(goldterms_compressed_file_path, "wb") as f:
    f.write(goldterms_response.content)

In [90]:
# Unzip the compressed file and save the extracted file in target directory
goldterms_uncompressed_file_path = os.path.join(target_dir, goldterms_filename)
with gzip.open(goldterms_compressed_file_path, "rb") as f_in:
    with open(goldterms_uncompressed_file_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

In [91]:
goldterms_conn = sqlite3.connect(goldterms_uncompressed_file_path)

In [92]:
goldterms_subjects = pd.read_sql_query(goldterms_subclass_query, goldterms_conn)

In [93]:
goldterms_subjects['path_id'] = goldterms_subjects['subject'].str.extract(r'GOLDTERMS:(\d+)')

In [94]:
goldterms_subjects

Unnamed: 0,subject,path_id
0,GOLDTERMS:4184,4184
1,GOLDTERMS:5342,5342
2,GOLDTERMS:4019,4019
3,GOLDTERMS:4012,4012
4,GOLDTERMS:5544,5544
...,...,...
205,GOLDTERMS:4653,4653
206,GOLDTERMS:4167,4167
207,GOLDTERMS:5346,5346
208,GOLDTERMS:3965,3965


In [95]:
gold_path_ids = goldterms_subjects['path_id'].dropna().unique().tolist()
gold_path_ids = [int(id) for id in gold_path_ids]


In [96]:
gold_env_filtered_biosamples_frame = gold_biosamples_frame[
    gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].isin(gold_path_ids)]


In [97]:
gold_env_filtered_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,Environmental,Aquatic,Thermal springs,Hot (42-90C),Unclassified
15,Gb0050975,Acid Mine Drainage (ARMAN) microbial communiti...,718308.0,mine drainage metagenome,Acid mine drainage,2005-06-01,"Richmond Mine, Iron Mountain CA",40.677339,-122.522194,4164,Environmental,Aquatic,Freshwater,Groundwater,Acid Mine Drainage
16,Gb0050977,Marine microbial communities from the Indian O...,405178.0,marine metagenome,Indian Ocean,2005-08-01,Indian Ocean,-8.505250,80.375583,4008,Environmental,Aquatic,Marine,Oceanic,Unclassified
17,Gb0050978,Marine ecosystem from Global Ocean Sampling (G...,405178.0,marine metagenome,"Cocos Island, Costa Rica",,"Cocos Island, Costa Rica",5.640000,-86.565280,3973,Environmental,Aquatic,Non-marine Saline and Alkaline,Saline,Unclassified
18,Gb0050979,"Fossil microbial communities from Whale Fall, ...",444079.0,fossil metagenome,"Whale Fall, Santa Cruz Basin, Pacific Ocean",,"Whale Fall, Santa Cruz Basin, Pacific Ocean",33.300000,-119.220000,4000,Environmental,Aquatic,Marine,Fossil,Whale fall
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210828,Gb0404279,Freshwater sediment microbial communities from...,749907.0,sediment metagenome,freshwater epipsammon,2023-10-18,"USA: Gatlinburg, Tennessee",35.690420,-83.503790,5385,Environmental,Aquatic,Freshwater,Creek,Sediment
210829,Gb0404280,Freshwater microbial communities from Caribou ...,449393.0,freshwater metagenome,stream water,2023-10-31,"USA: Chatanika, Alaska",65.153080,-147.501997,4514,Environmental,Aquatic,Freshwater,Creek,Unclassified
210871,Gb0405289,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,river biofilm,2023-07-11,"USA: Arikaree River NEON Field Site, Yuma Coun...",39.758200,-102.447148,5348,Environmental,Aquatic,Freshwater,River,River biofilm
210875,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified


In [98]:
goldterms_mixs_broad_frame = pd.read_sql_query(goldterms_envo_query, goldterms_conn)

In [99]:
goldterms_mixs_broad_frame['mixs_broad_label'] = goldterms_mixs_broad_frame['object'].apply(envo_adapter.label)

In [100]:
goldterms_mixs_broad_frame['path_id'] = goldterms_mixs_broad_frame['subject'].str.extract(r'GOLDTERMS:(\d+)')

In [101]:
goldterms_mixs_broad_frame

Unnamed: 0,stanza,subject,predicate,object,value,datatype,language,graph,mixs_broad_label,path_id
0,GOLDTERMS:Engineered-Artificial-ecosystem,GOLDTERMS:Engineered-Artificial-ecosystem,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
1,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
2,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
3,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
4,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
...,...,...,...,...,...,...,...,...,...,...
643,GOLDTERMS:5841,GOLDTERMS:5841,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5841
644,GOLDTERMS:5843,GOLDTERMS:5843,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5843
645,GOLDTERMS:5846,GOLDTERMS:5846,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5846
646,GOLDTERMS:5849,GOLDTERMS:5849,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5849


In [102]:
# Fill NaN values in 'BIOSAMPLE ECOSYSTEM PATH ID' with 0 and convert to int
gold_env_filtered_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_env_filtered_biosamples_frame[
    'BIOSAMPLE ECOSYSTEM PATH ID'].fillna(0).astype(int)

# Drop rows with NaN in 'path_id' in goldterms_mixs_broad_frame
goldterms_mixs_broad_frame = goldterms_mixs_broad_frame.dropna(subset=['path_id'])

# Convert 'path_id' to int
goldterms_mixs_broad_frame['path_id'] = goldterms_mixs_broad_frame['path_id'].astype(int)

# Perform the left merge
gold_env_filtered_biosamples_inferred_broad = gold_env_filtered_biosamples_frame.merge(
    goldterms_mixs_broad_frame,
    left_on='BIOSAMPLE ECOSYSTEM PATH ID',
    right_on='path_id',
    how='left'
)


In [103]:
gold_env_filtered_biosamples_inferred_broad

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,mixs_broad_label,path_id
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,...,GOLDTERMS:3992,GOLDTERMS:3992,mixs:env_broad,ENVO:01000254,,,,,environmental system,3992
1,Gb0050975,Acid Mine Drainage (ARMAN) microbial communiti...,718308.0,mine drainage metagenome,Acid mine drainage,2005-06-01,"Richmond Mine, Iron Mountain CA",40.677339,-122.522194,4164,...,GOLDTERMS:4164,GOLDTERMS:4164,mixs:env_broad,ENVO:01000254,,,,,environmental system,4164
2,Gb0050977,Marine microbial communities from the Indian O...,405178.0,marine metagenome,Indian Ocean,2005-08-01,Indian Ocean,-8.505250,80.375583,4008,...,GOLDTERMS:4008,GOLDTERMS:4008,mixs:env_broad,ENVO:01000254,,,,,environmental system,4008
3,Gb0050978,Marine ecosystem from Global Ocean Sampling (G...,405178.0,marine metagenome,"Cocos Island, Costa Rica",,"Cocos Island, Costa Rica",5.640000,-86.565280,3973,...,GOLDTERMS:3973,GOLDTERMS:3973,mixs:env_broad,ENVO:01000254,,,,,environmental system,3973
4,Gb0050979,"Fossil microbial communities from Whale Fall, ...",444079.0,fossil metagenome,"Whale Fall, Santa Cruz Basin, Pacific Ocean",,"Whale Fall, Santa Cruz Basin, Pacific Ocean",33.300000,-119.220000,4000,...,GOLDTERMS:4000,GOLDTERMS:4000,mixs:env_broad,ENVO:01000254,,,,,environmental system,4000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51832,Gb0404279,Freshwater sediment microbial communities from...,749907.0,sediment metagenome,freshwater epipsammon,2023-10-18,"USA: Gatlinburg, Tennessee",35.690420,-83.503790,5385,...,GOLDTERMS:5385,GOLDTERMS:5385,mixs:env_broad,ENVO:01000254,,,,,environmental system,5385
51833,Gb0404280,Freshwater microbial communities from Caribou ...,449393.0,freshwater metagenome,stream water,2023-10-31,"USA: Chatanika, Alaska",65.153080,-147.501997,4514,...,GOLDTERMS:4514,GOLDTERMS:4514,mixs:env_broad,ENVO:01000254,,,,,environmental system,4514
51834,Gb0405289,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,river biofilm,2023-07-11,"USA: Arikaree River NEON Field Site, Yuma Coun...",39.758200,-102.447148,5348,...,GOLDTERMS:5348,GOLDTERMS:5348,mixs:env_broad,ENVO:01000254,,,,,environmental system,5348
51835,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,...,GOLDTERMS:4179,GOLDTERMS:4179,mixs:env_broad,ENVO:01000254,,,,,environmental system,4179


----

In [104]:
include_in_rows = set()

In [105]:
include_in_rows.update(biome_descendants_frame['curie'])

In [106]:
include_in_rows.update([i['curie'] for i in pv_validation_results['valids']])

In [107]:
include_in_rows.update(nmdc_biosample_contexts_frame['env_broad_scale_id'])

In [108]:
include_in_rows.update(ncbi_frame['extracted_curie'])

In [109]:
include_in_rows.update(ncbi_frame['longest_annotation_curie'])

In [110]:
include_in_rows.update(gold_env_filtered_biosamples_inferred_broad['object'])

In [111]:
rows_lod = []

In [112]:
# TODO MOVE THESE UP, because the expressions are already being used above

biome_curies = list(biome_descendants_frame['curie'])
legacy_pv_curies = [i['curie'] for i in pv_validation_results['valids']]
terrestrial_biome_curies = list(envo_adapter.descendants('ENVO:00000446', predicates=[IS_A]))
aquatic_biome_curies = list(envo_adapter.descendants('ENVO:00002030', predicates=[IS_A]))
abp_curies = list(envo_adapter.descendants('ENVO:01000813', predicates=[IS_A]))
env_sys_curies = list(envo_adapter.descendants('ENVO:01000254', predicates=[IS_A]))
env_mat_curies = list(envo_adapter.descendants('ENVO:00010483', predicates=[IS_A]))
obsoletes_curies = list(envo_adapter.obsoletes())

for curie in include_in_rows:
    if curie is None:
        continue
    row = {
        'curie': curie,
        'label': envo_adapter.label(curie),
        'envo_native': False,
        'obsolete': False,
        'legacy_pv': False,
        'abp': False,
        'env_sys': False,
        'biome': False,
        'terrestrial_biome': False,
        'aquatic_biome': False,
        'env_mat': False,
    }
    prefix, local_id = curie.split(':')
    if prefix and prefix == 'ENVO' and row['label'] is not None:
        row['envo_native'] = True
    if curie in biome_curies:
        row['biome'] = True
    if curie in terrestrial_biome_curies:
        row['terrestrial_biome'] = True
    if curie in aquatic_biome_curies:
        row['aquatic_biome'] = True
    if curie in abp_curies:
        row['abp'] = True
    if curie in env_sys_curies:
        row['env_sys'] = True
    if curie in env_mat_curies:
        row['env_mat'] = True
    if curie in legacy_pv_curies:
        row['legacy_pv'] = True
    if curie in obsoletes_curies:
        row['obsolete'] = True
    rows_lod.append(row)

# todo terrestrial biome, aquatic biome, ABP, environmental material


In [113]:
rows_frame = pd.DataFrame(rows_lod)

In [114]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,legacy_pv,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat
0,ENVO:01000627,feedlot,True,False,False,False,True,False,False,False,False
1,ENVO:01000027,marine abyssal zone biome,True,False,False,True,True,True,False,True,False
2,ENVO:01000406,snow,True,False,False,False,False,False,False,False,True
3,ENVO:01000470,building envelope,True,False,False,True,False,False,False,False,False
4,ENVO:2018051507,,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
993,ENVO:01000294,crevice,True,False,False,True,False,False,False,False,False
994,ENVO:01000085,plume front,True,False,False,False,False,False,False,False,False
995,ENVO:01000375,tropical marine condition,True,False,False,False,False,False,False,False,False
996,ENVO:01000321,sea water environment,True,False,False,False,True,False,False,False,False


In [115]:
nmdc_biosample_ebs_counts = nmdc_biosample_contexts_frame['env_broad_scale_id'].value_counts().reset_index()
nmdc_biosample_ebs_counts.columns = ['curie', 'nmdc_ebs_count']


In [116]:
nmdc_biosample_ebs_counts

Unnamed: 0,curie,nmdc_ebs_count
0,ENVO:01000253,1040
1,ENVO:01000252,467
2,ENVO:03605008,104
3,ENVO:00000873,38


In [117]:
# Perform the left merge
rows_frame = rows_frame.merge(
    nmdc_biosample_ebs_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [118]:
gold_env_filtered_biosamples_inferred_broad

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,mixs_broad_label,path_id
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,...,GOLDTERMS:3992,GOLDTERMS:3992,mixs:env_broad,ENVO:01000254,,,,,environmental system,3992
1,Gb0050975,Acid Mine Drainage (ARMAN) microbial communiti...,718308.0,mine drainage metagenome,Acid mine drainage,2005-06-01,"Richmond Mine, Iron Mountain CA",40.677339,-122.522194,4164,...,GOLDTERMS:4164,GOLDTERMS:4164,mixs:env_broad,ENVO:01000254,,,,,environmental system,4164
2,Gb0050977,Marine microbial communities from the Indian O...,405178.0,marine metagenome,Indian Ocean,2005-08-01,Indian Ocean,-8.505250,80.375583,4008,...,GOLDTERMS:4008,GOLDTERMS:4008,mixs:env_broad,ENVO:01000254,,,,,environmental system,4008
3,Gb0050978,Marine ecosystem from Global Ocean Sampling (G...,405178.0,marine metagenome,"Cocos Island, Costa Rica",,"Cocos Island, Costa Rica",5.640000,-86.565280,3973,...,GOLDTERMS:3973,GOLDTERMS:3973,mixs:env_broad,ENVO:01000254,,,,,environmental system,3973
4,Gb0050979,"Fossil microbial communities from Whale Fall, ...",444079.0,fossil metagenome,"Whale Fall, Santa Cruz Basin, Pacific Ocean",,"Whale Fall, Santa Cruz Basin, Pacific Ocean",33.300000,-119.220000,4000,...,GOLDTERMS:4000,GOLDTERMS:4000,mixs:env_broad,ENVO:01000254,,,,,environmental system,4000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51832,Gb0404279,Freshwater sediment microbial communities from...,749907.0,sediment metagenome,freshwater epipsammon,2023-10-18,"USA: Gatlinburg, Tennessee",35.690420,-83.503790,5385,...,GOLDTERMS:5385,GOLDTERMS:5385,mixs:env_broad,ENVO:01000254,,,,,environmental system,5385
51833,Gb0404280,Freshwater microbial communities from Caribou ...,449393.0,freshwater metagenome,stream water,2023-10-31,"USA: Chatanika, Alaska",65.153080,-147.501997,4514,...,GOLDTERMS:4514,GOLDTERMS:4514,mixs:env_broad,ENVO:01000254,,,,,environmental system,4514
51834,Gb0405289,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,river biofilm,2023-07-11,"USA: Arikaree River NEON Field Site, Yuma Coun...",39.758200,-102.447148,5348,...,GOLDTERMS:5348,GOLDTERMS:5348,mixs:env_broad,ENVO:01000254,,,,,environmental system,5348
51835,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,...,GOLDTERMS:4179,GOLDTERMS:4179,mixs:env_broad,ENVO:01000254,,,,,environmental system,4179


In [119]:
gold_biosample_ebs_counts = gold_env_filtered_biosamples_inferred_broad['object'].value_counts().reset_index()
gold_biosample_ebs_counts.columns = ['curie', 'gold_ebs_count']

In [120]:
gold_biosample_ebs_counts

Unnamed: 0,curie,gold_ebs_count
0,ENVO:01000254,49517
1,ENVO:01001209,1239
2,ENVO:00000054,535
3,ENVO:03600074,245
4,ENVO:00000057,232
5,ENVO:00000232,65
6,ENVO:00000233,4


In [121]:
# Perform the left merge
rows_frame = rows_frame.merge(
    gold_biosample_ebs_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [122]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,legacy_pv,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,nmdc_ebs_count,gold_ebs_count
0,ENVO:01000627,feedlot,True,False,False,False,True,False,False,False,False,,
1,ENVO:01000027,marine abyssal zone biome,True,False,False,True,True,True,False,True,False,,
2,ENVO:01000406,snow,True,False,False,False,False,False,False,False,True,,
3,ENVO:01000470,building envelope,True,False,False,True,False,False,False,False,False,,
4,ENVO:2018051507,,False,False,False,False,False,False,False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,ENVO:01000294,crevice,True,False,False,True,False,False,False,False,False,,
994,ENVO:01000085,plume front,True,False,False,False,False,False,False,False,False,,
995,ENVO:01000375,tropical marine condition,True,False,False,False,False,False,False,False,False,,
996,ENVO:01000321,sea water environment,True,False,False,False,True,False,False,False,False,,


In [123]:
# 990 rows in https://docs.google.com/spreadsheets/d/12WH3eduBq2qSTy9zVF3n7fyajn6ssLZL/edit?gid=546570706#gid=546570706

In [124]:
# gold and ncbi counts are slightly trickier
# for gold may want to include presence or mapping in goldterms in addition to biosamples counts
# ncbi: we have extracted curies and annotated curies

In [125]:
# todo move this stuff up to immediately after the creation of ncbi_frame ?

# todo don't accept extracted curie if no real label?
# any kind of string similarity checking for label of annotated curie vs extracted label ?
# look for long runs of curies?
# can we measure the beneficial impact of any of this? current crux: how to distribute counts

ncbi_frame['curie_list'] = ncbi_frame.apply(
    lambda row: list({row['extracted_curie'], row['longest_annotation_curie']} - {None}),
    axis=1
)

ncbi_frame['unique_curie_count'] = ncbi_frame['curie_list'].apply(len)

In [126]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count
0,1,marine,3700,marine,1,0,marine,,,,,[],0
1,2,ocean biome,2415,ocean biome,1,0,ocean biome,,,ENVO:01000048,ocean biome,[ENVO:01000048],1
2,3,tundra biome,2089,tundra biome,1,0,tundra biome,,,ENVO:01000180,tundra biome,[ENVO:01000180],1
3,4,missing,2088,missing,1,0,missing,,,,,[],0
4,5,freshwater biome,1932,freshwater biome,1,0,freshwater biome,,,ENVO:00000873,freshwater biome,[ENVO:00000873],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3294,3035,"=IF(F279=""Tissue"",""coral reef [ENVO:00000150]""...",1,"coral reef [ENVO:00000150]"")",2,1,"coral reef []""",ENVO:00000150,coral reef,ENVO:00000150,coral reef,[ENVO:00000150],1
3295,3036,"=IF(F285=""Tissue"",""coral reef [ENVO:00000150]""...",1,"=IF(F285=""Tissue"",""coral reef [ENVO:00000150]""...",2,2,"=IF(F285=""Tissue"",""coral reef []"",""sea water",ENVO:00000150,coral reef,ENVO:00000150,coral reef,[ENVO:00000150],1
3296,3036,"=IF(F285=""Tissue"",""coral reef [ENVO:00000150]""...",1,"coral reef [ENVO:00000150]"")",2,1,"coral reef []""",ENVO:00000150,coral reef,ENVO:00000150,coral reef,[ENVO:00000150],1
3297,3037,"=IF(F290=""Tissue"",""coral reef [ENVO:00000150]""...",1,"=IF(F290=""Tissue"",""coral reef [ENVO:00000150]""...",2,2,"=IF(F290=""Tissue"",""coral reef []"",""sea water",ENVO:00000150,coral reef,ENVO:00000150,coral reef,[ENVO:00000150],1


In [127]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    1750
0     828
2     721
Name: count, dtype: int64

In [128]:
double_curie_frame = ncbi_frame[ncbi_frame['unique_curie_count'] > 1]

In [129]:
double_curie_frame = double_curie_frame[['extracted_curie', 'longest_annotation_curie']]

In [130]:
double_curie_frame = double_curie_frame.drop_duplicates()

In [131]:
double_curie_frame[['extracted_prefix', 'extracted_local_id']] = double_curie_frame['extracted_curie'].str.split(':', expand=True)

In [132]:
double_curie_frame['extracted_local_id_int'] = pd.to_numeric(double_curie_frame['extracted_local_id'], errors='coerce').astype('Int64')

In [133]:
def find_consecutive_stretches_dict(series):
    """
    Detect consecutive stretches of integer values in a pandas Series.
    Returns a dictionary where the keys are serial numbers (starting at 0),
    and the values are lists of consecutive integers.
    """
    # Ensure the series is clean: drop NaN, duplicates, and non-integer values
    series = series.dropna().drop_duplicates()
    series = series[series.apply(lambda x: isinstance(x, (int, float)) and (x == int(x)))].astype(int)
    series = series.sort_values()

    stretches_dict = {}
    current_stretch = []
    stretch_index = 1

    for i in range(len(series)):
        if i == 0 or series.iloc[i] - series.iloc[i - 1] == 1:
            current_stretch.append(series.iloc[i])
        else:
            if len(current_stretch) >= 3:
                stretches_dict[stretch_index] = current_stretch
                stretch_index += 1
            current_stretch = [series.iloc[i]]

    if len(current_stretch) >= 3:
        stretches_dict[stretch_index] = current_stretch

    return stretches_dict


In [134]:
# Function to convert stretches dict to long-format DataFrame
def stretches_dict_to_long_dataframe(stretches_dict):
    """
    Convert a dictionary of consecutive stretches into a long-format DataFrame.
    Each row corresponds to an individual integer value in a stretch, with:
    - 'stretch_id': The key from the dictionary.
    - 'value': The integer value within the stretch.
    """
    rows = []
    for stretch_id, stretch_values in stretches_dict.items():
        for value in stretch_values:
            rows.append({'stretch_id': stretch_id, 'value': int(value)})  # Ensure integers only
    return pd.DataFrame(rows)

In [135]:
# Ensure extracted_local_id_int is unique and sorted
unique_sorted_series = double_curie_frame['extracted_local_id_int'].dropna().drop_duplicates().sort_values()


In [136]:
# Find stretches
stretches_dict = find_consecutive_stretches_dict(unique_sorted_series)

# pprint.pprint(stretches_dict)

In [137]:
# Convert the stretches dictionary into a DataFrame
stretches_df = stretches_dict_to_long_dataframe(stretches_dict)

In [138]:
stretches_df

Unnamed: 0,stretch_id,value
0,1,1000021
1,1,1000022
2,1,1000023
3,1,1000024
4,1,1000025
...,...,...
700,5,2018051508
701,5,2018051509
702,5,2018051510
703,5,2018051511


In [139]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretches_df,
    left_on='extracted_local_id_int',
    right_on='value',
    how='left'
)

In [140]:
# Create a new DataFrame summarizing each stretch_id with the most common longest_annotation_curie
def summarize_stretch_groups(df):
    summary_rows = []
    
    # Iterate through each group of rows by stretch_id
    for stretch_id, group in df.dropna(subset=['stretch_id']).groupby('stretch_id'):
        # Calculate the most common longest_annotation_curie and its fraction
        most_common_curie = group['longest_annotation_curie'].value_counts().idxmax()
        fraction = group['longest_annotation_curie'].value_counts(normalize=True).max()
        
        # Append the summary row
        summary_rows.append({
            'stretch_id': stretch_id,
            'most_common_longest_annotation_curie': most_common_curie,
            'fraction': fraction
        })
    
    # Convert the summary rows into a new DataFrame
    return pd.DataFrame(summary_rows)


In [141]:
stretch_summary_df = summarize_stretch_groups(double_curie_frame)


In [142]:
stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:01000020,1.0
1,2.0,ENVO:00000015,0.995489
2,3.0,ENVO:00000044,1.0
3,4.0,ENVO:00000044,1.0
4,5.0,ENVO:00000044,1.0


In [143]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretch_summary_df,
    left_on='stretch_id',
    right_on='stretch_id',
    how='left'
)

In [144]:
# ncbi_frame.to_csv('ncbi_frame.tsv', sep='\t', index=False)

In [145]:
drag_evidence_frame = double_curie_frame[double_curie_frame['stretch_id'] >= 1]
drag_evidence_frame = drag_evidence_frame[['extracted_curie', 'longest_annotation_curie']]
drag_evidence_frame['drag_evidence'] = True

In [146]:
drag_evidence_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,drag_evidence
1,ENVO:01000252,ENVO:00000021,True
2,ENVO:01000048,ENVO:00000015,True
4,ENVO:01000253,ENVO:01000297,True
8,ENVO:01000251,ENVO:01001834,True
12,ENVO:01000066,ENVO:00000015,True
...,...,...,...
712,ENVO:01000696,ENVO:00000015,True
713,ENVO:01000697,ENVO:00000015,True
714,ENVO:01000699,ENVO:00000015,True
715,ENVO:01000705,ENVO:00000015,True


In [147]:
ncbi_frame = ncbi_frame.merge(
    drag_evidence_frame,
    left_on=['extracted_curie', 'longest_annotation_curie'],
    right_on=['extracted_curie', 'longest_annotation_curie'],
    how='left'
)

In [148]:
# Initialize dragless_curie_list with curie_list values
ncbi_frame["dragless_curie_list"] = ncbi_frame["curie_list"]

# Update dragless_curie_list based on the condition
for index, row in ncbi_frame.iterrows():
    if row["drag_evidence"] is True:
        if row["longest_annotation_curie"] is not None:
            ncbi_frame.at[index, "dragless_curie_list"] = [row["longest_annotation_curie"]]
        else:
            ncbi_frame.at[index, "dragless_curie_list"] = []

ncbi_frame['dragless_curie_count'] = ncbi_frame['dragless_curie_list'].apply(len)

In [149]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    1750
0     828
2     721
Name: count, dtype: int64

In [150]:
ncbi_frame['dragless_curie_count'].value_counts()

dragless_curie_count
1    2460
0     828
2      11
Name: count, dtype: int64

In [151]:
# ncbi_frame.to_csv('ncbi_frame.tsv', sep='\t', index=False)

In [152]:
ncbi_frame.shape

(3299, 16)

In [153]:
ncbi_frame_undisputed = ncbi_frame[ncbi_frame['dragless_curie_count'] <= 1]

In [154]:
ncbi_frame_undisputed.shape

(3288, 16)

In [155]:
ncbi_frame_disputed = ncbi_frame[ncbi_frame['dragless_curie_count'] > 1]

In [156]:
ncbi_frame_disputed.shape

(11, 16)

In [157]:
ncbi_frame_disputed = ncbi_frame_disputed.explode("dragless_curie_list", ignore_index=True)


In [158]:
ncbi_frame_disputed.shape

(22, 16)

In [159]:
ncbi_frame_disputed["dragless_curie_list"] = ncbi_frame_disputed["dragless_curie_list"].apply(lambda x: [x])

In [160]:
# Combine the rows of ncbi_frame_undisputed and ncbi_frame_disputed into a new DataFrame
ncbi_disputes_exploded_frame = pd.concat([ncbi_frame_undisputed, ncbi_frame_disputed], ignore_index=True)


In [161]:
ncbi_disputes_exploded_frame.shape

(3310, 16)

In [162]:
ncbi_disputes_exploded_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count,drag_evidence,dragless_curie_list,dragless_curie_count
0,1,marine,3700,marine,1,0,marine,,,,,[],0,,[],0
1,2,ocean biome,2415,ocean biome,1,0,ocean biome,,,ENVO:01000048,ocean biome,[ENVO:01000048],1,,[ENVO:01000048],1
2,3,tundra biome,2089,tundra biome,1,0,tundra biome,,,ENVO:01000180,tundra biome,[ENVO:01000180],1,,[ENVO:01000180],1
3,4,missing,2088,missing,1,0,missing,,,,,[],0,,[],0
4,5,freshwater biome,1932,freshwater biome,1,0,freshwater biome,,,ENVO:00000873,freshwater biome,[ENVO:00000873],1,,[ENVO:00000873],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3305,1162,caldera [ENVO_00000096] | island arc [ENVO_000...,2,island arc [ENVO_00000353],2,1,island arc,ENVO:00000353,island arc,ENVO:00000220,archipelago,"[ENVO:00000220, ENVO:00000353]",2,,[ENVO:00000353],2
3306,2239,"island arc [ENVO:00000353], humid subtropical ...",1,"island arc [ENVO:00000353], humid subtropical ...",1,3,"island arc [], humid subtropical [], intertida...",ENVO:00000353,island arc,ENVO:01000377,humid subtropical,"[ENVO:00000353, ENVO:01000377]",2,,[ENVO:00000353],2
3307,2239,"island arc [ENVO:00000353], humid subtropical ...",1,"island arc [ENVO:00000353], humid subtropical ...",1,3,"island arc [], humid subtropical [], intertida...",ENVO:00000353,island arc,ENVO:01000377,humid subtropical,"[ENVO:00000353, ENVO:01000377]",2,,[ENVO:01000377],2
3308,2400,Submarine Groundwater Discharge [ENVO:01001630],1,Submarine Groundwater Discharge [ENVO:01001630],1,1,Submarine Groundwater Discharge,ENVO:01001630,submarine groundwater discharge process,ENVO:01001004,groundwater,"[ENVO:01001630, ENVO:01001004]",2,,[ENVO:01001630],2


In [163]:
ncbi_disputes_exploded_frame['post_explode_curie_count'] = ncbi_disputes_exploded_frame['dragless_curie_list'].apply(len)

In [164]:
ncbi_disputes_exploded_frame['post_explode_curie_count'].value_counts()

post_explode_curie_count
1    2482
0     828
Name: count, dtype: int64

In [165]:
# Set 'post_explode_curie' to the 0th item in 'dragless_curie_list'
ncbi_disputes_exploded_frame["post_explode_curie"] = ncbi_disputes_exploded_frame["dragless_curie_list"].apply(
    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
)

In [166]:
# ncbi_biosample_ebs_counts = ncbi_disputes_exploded_frame['post_explode_curie'].value_counts().reset_index()

ncbi_biosample_ebs_counts = (
    ncbi_disputes_exploded_frame.groupby("post_explode_curie")["sample_count"].sum().reset_index()
)

ncbi_biosample_ebs_counts.columns = ['curie', 'ncbi_ebs_count']

In [167]:
ncbi_biosample_ebs_counts

Unnamed: 0,curie,ncbi_ebs_count
0,BFO:0000029,16
1,CHEBI:15377,1121
2,CHEBI:15379,15
3,CHEBI:16183,3
4,CHEBI:24433,3
...,...,...
317,PCO:1000004,73
318,RO:0001019,3
319,RO:0002170,5
320,RO:0002577,110


In [168]:
# Perform the left merge
rows_frame = rows_frame.merge(
    ncbi_biosample_ebs_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [169]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,legacy_pv,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,nmdc_ebs_count,gold_ebs_count,ncbi_ebs_count
0,ENVO:01000627,feedlot,True,False,False,False,True,False,False,False,False,,,
1,ENVO:01000027,marine abyssal zone biome,True,False,False,True,True,True,False,True,False,,,63.0
2,ENVO:01000406,snow,True,False,False,False,False,False,False,False,True,,,31.0
3,ENVO:01000470,building envelope,True,False,False,True,False,False,False,False,False,,,
4,ENVO:2018051507,,False,False,False,False,False,False,False,False,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,ENVO:01000294,crevice,True,False,False,True,False,False,False,False,False,,,
994,ENVO:01000085,plume front,True,False,False,False,False,False,False,False,False,,,
995,ENVO:01000375,tropical marine condition,True,False,False,False,False,False,False,False,False,,,
996,ENVO:01000321,sea water environment,True,False,False,False,True,False,False,False,False,,,


In [171]:
rows_frame.to_csv('water_ebs_rows_frame.tsv', sep='\t', index=False)