In [1]:
from common import *

import gzip
import os
import shutil
import sqlite3
from urllib.parse import urlparse

import duckdb
import pandas as pd
import requests
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import yaml


In [2]:
print("verify output is being rendered")

verify output is being rendered


In [3]:
# Initialize cache dictionaries for predict_from_normalized_env_packages
# todo how to move the definitions for function that use these globals? Or just use caching around the function?
ancestor_cache = {}
descendant_cache = {}

In [4]:
# todo deal with circularity in env package prediction -> env triad reporting

# todo this on-demand NCBI curie extraction and annotation recapitulates work that is being added to
# https://portal.nersc.gov/project/m3408/biosamples_duckdb/
# via 
#   although that doesn't detect auto-incremented curies from  spreadsheet dragging

# todo if more caching is desired, it should probably take the form of saving dataframes for TSV

# eventually, dig up a complete JSON gold biosample dump for non-hybrid biosample counts

# overall run time (if NCBI biosamples and goldData are cached): ~ 10 minutes

# count studies not biosamples ?

# Task Settings
_For making a Soil env_broad_scale voting sheet vs a Sediment env_local_scale sheet, etc._

todo: bundle these into dicts so they don't have to be modified independently and kept in sync with one another.

In [5]:
output_file_name = "voting_sheets_output/water_env_medium_voting_sheet.tsv"

In [6]:
# semantic_anchor = 'ENVO:00000428' # biome for env_broad_scale
# semantic_anchor = 'ENVO:01000813' # astronomical body part "abp" for env_local_scale
semantic_anchor = 'ENVO:00010483' # environmental material for env_medium

In [7]:
plant_first_where = "s1.value like 'host-associated > plants%'"
sediment_first_where = "lower(s1.value) like 'environmental > aquatic%sediment%'"
soil_first_where = "s1.value like 'environmental > terrestrial > soil%'"
water_first_where = "s1.value like 'environmental > aquatic%' and lower(s1.value) not like '%sediment%'"

## context selectors

In [8]:
# gold_context_selector = 'mixs:env_broad'
# gold_context_selector = 'mixs:env_local'
gold_context_selector = 'mixs:env_medium'

In [9]:
# ncbi_context_selector = 'env_broad_scale'
# ncbi_context_selector = 'env_local_scale'
ncbi_context_selector = 'env_medium'

In [10]:
# nmdc_context_selector= 'env_broad_scale_id'
# nmdc_context_selector= 'env_local_scale_id'
nmdc_context_selector= 'env_medium_id'

## package aka environment aka extension selectors

In [11]:
gold_first_where = water_first_where

In [12]:
# todo new since soil: why are we only considering MIMS.me for discovering appropriate env triad values?
#   there's usually a roughly equal number of biosamples from in each extension for MIMS.me and 

# ncbi_package_selector = 'plant-associated.6.0'
# ncbi_package_selector = 'sediment.6.0'
# ncbi_package_selector = 'soil.6.0'
ncbi_package_selector = 'water.6.0'

In [13]:
# nmdc_package_selector = 'plant-associated'
# nmdc_package_selector = 'sediment'
# nmdc_package_selector = 'soil'
nmdc_package_selector = 'water'


In [14]:
GOLDTERMS_NA = '' # ???

GOLDTERMS_PLANT_ASSOCIATED = GOLDTERMS_NA # host associated -> viridiplantae? take a string approach!
GOLDTERMS_SEDIMENT = 'GOLDTERMS:3985' #  doesn't have any subclasses
GOLDTERMS_SOIL = 'GOLDTERMS:4212'
GOLDTERMS_WATER = 'GOLDTERMS:3984'

# GOLDTERMS:4180, 'Environmental > Aquatic > Freshwater > Pond > Sediment' and ~64 more don't share a common root
# poetry run runoak -i sqlite:obo:goldterms info 't~sediment'


In [15]:
goldterms_root = GOLDTERMS_WATER

## selecting name and version of one enum for comparison


In [16]:
# only the Soil enums have legacy definitions (v10.7 and earlier?)

# CONTEXT_ENUM = "EnvBroadScaleSoilEnum"
# CONTEXT_ENUM = "EnvLocalScaleSoilEnum"
# CONTEXT_ENUM = "EnvMediumSoilEnum"

CONTEXT_ENUM = ""

In [17]:
# todo: add columns for membership in multiple enums from multiple version of the schema?
#  like sediment local vs soil local and water local (once that's completed)
#  get them from schema files or something prior to that? sems like the voting sheets are too raw/preliminary for that
#   can use a more recent schema url for more recent enums!

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

# previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

In [18]:
# todo: don't call the column "legacy_pv". use the name of the enum and the version of the schema?

# comparison_enum_column_name = 'EnvMediumSoilEnum_10_7'
comparison_enum_column_name = 'no_comparison_enum'

# Additional Settings

In [19]:
# Approved prefixes (case-insensitive)
approved_prefixes = ['ENVO']

In [20]:
MIN_ANNOTATION_LEN = 3

In [21]:
NMDC_RUNTIME_BASE_URL = 'https://api.microbiomedata.org/nmdcschema/'
STUDY_SET_COLLECTION = 'study_set'
BIOSAMPLE_SET_COLLECTION = 'biosample_set'

In [22]:
envo_adapter_string = "sqlite:obo:envo"

In [23]:
# goldterms_adapter_string = "sqlite:obo:envo"

In [24]:
env_package_override_file = 'mam-env-package-overrides.tsv'
override_column = 'mam_inferred_env_package'

In [25]:
# ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples_2024-09-23.duckdb.gz'
ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples.duckdb.gz'

In [26]:
gold_data_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
gold_data_file_name = "goldData.xlsx" # goldData.xlsx: Microsoft Excel 2007+
gold_csv_file_name = "gold_biosamples.csv"
BIOSAMPLES_SHEET = "Biosample"

In [27]:
goldterms_semsql_url = "https://s3.amazonaws.com/bbop-sqlite/goldterms.db.gz"


# CURIe Constants

In [28]:
BIOME = 'ENVO:00000428'
TERRESTRIAL_BIOME = 'ENVO:00000446'
AQUATIC_BIOME = 'ENVO:00002030'
ABP = 'ENVO:01000813'
ENVIRONMENTAL_SYSTEM = 'ENVO:01000254'
ENVIRONMENTAL_MATERIAL = 'ENVO:00010483'

SOIL = 'ENVO:00001998'
LIQUID_WATER = 'ENVO:00002006'
WATER_ICE = 'ENVO:01000277'

# Settings-based Queries

In [29]:
goldterms_subclass_query = f"""
select
	subject
from
	entailed_edge ee
where
	predicate = 'rdfs:subClassOf'
	and object = '{goldterms_root}'
"""

In [30]:
# todo could this have been done with a OAK query, eliminating the need to explicitly download the file?

goldterms_envo_query = f"""
SELECT
	*
FROM
	statements s
WHERE
	predicate = '{gold_context_selector}'"""

In [31]:
ncbi_query = f"""
SELECT content, COUNT(1) AS sample_count 
FROM attributes 
WHERE harmonized_name = '{ncbi_context_selector}' AND package_content like '%{ncbi_package_selector}'
GROUP BY content
ORDER BY COUNT(1) DESC
"""

In [32]:
# and s1.subject = s1.stanza eliminates matches on blank node anntoation rows (probably woudn't change results but adds a little overhead)

extension_query = f"""
select
		s1.subject ,
		s2.predicate,
		COALESCE (s2."object",
	s2."value") as content
from
	statements s1
join statements s2 on 
	s1.subject = s2.subject
where
	{gold_first_where}
	and s1.predicate = 'rdfs:label'
	and s1.subject = s1.stanza
	and s2.predicate in ('mixs:env_broad', 'mixs:env_local', 'mixs:env_medium', 'mixs:mixs_extension', 'rdfs:label', 'mixs:other', 'mixs:anatomical_site', 'mixs:host_taxon') ;
"""

# todo provide examples of sediment samples that are excluded by the requirement for aquatic

# Locally Defined Functions
_Currently using locally-defined cache dictionaries_

In [33]:
def predict_from_normalized_env_packages(df_raw, adapter):
    # Apply the function to the relevant columns

    df = df_raw.copy()

    print(df.shape)
    for column in ['env_broad_scale_id', 'env_local_scale_id', 'env_medium_id']:
        df[f'{column}_ancestors'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['ancestors'])
        df[f'{column}_descendants'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['descendants'])

    # Vectorize each set of terms separately
    broad_scale_ancestors = vectorize_terms(df, 'env_broad_scale_id_ancestors')
    broad_scale_descendants = vectorize_terms(df, 'env_broad_scale_id_descendants')

    local_scale_ancestors = vectorize_terms(df, 'env_local_scale_id_ancestors')
    local_scale_descendants = vectorize_terms(df, 'env_local_scale_id_descendants')

    medium_ancestors = vectorize_terms(df, 'env_medium_id_ancestors')
    medium_descendants = vectorize_terms(df, 'env_medium_id_descendants')

    # Combine all feature matrices
    X = hstack([
        broad_scale_ancestors,
        broad_scale_descendants,
        local_scale_ancestors,
        local_scale_descendants,
        medium_ancestors,
        medium_descendants
    ])

    # Filter the DataFrame to only include non-null rows for the target column
    df_filtered = df[df['normalized_env_package'].notnull() & (df['normalized_env_package'] != "")]

    # Extract the target variable
    y = df_filtered['normalized_env_package']

    # Ensure X corresponds to the filtered rows
    X_filtered = X[df_filtered.index]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.3, random_state=42)

    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # not determining confidence for each class nor saving any diagnostics any more

    return clf.predict(X)

In [34]:
def get_hierarchy_terms(my_curie: str, adapter) -> dict:
    """
    Extract ancestor and descendant terms from the ontology for a given CURIE,
    using caching to improve performance and filtering by 'is_a' relationships.

    Args:
        my_curie (str): CURIE identifier for the ontology term.
        adapter: Ontology adapter.

    Returns:
        dict: Dictionary containing lists of ancestor and descendant terms.
    """
    if my_curie not in ancestor_cache:
        try:
            ancestors = list(adapter.ancestors(my_curie, predicates=[IS_A]))
            ancestor_cache[my_curie] = [adapter.label(ancestor) for ancestor in ancestors if ancestor]
        except Exception as my_e:
            print(f"Error retrieving ancestors for {my_curie}: {my_e}")
            ancestor_cache[my_curie] = []

    if my_curie not in descendant_cache:
        try:
            descendants = list(adapter.descendants(my_curie, predicates=[IS_A]))
            descendant_cache[my_curie] = [adapter.label(descendant) for descendant in descendants if descendant]
        except Exception as my_e:
            print(f"Error retrieving descendants for {my_curie}: {my_e}")
            descendant_cache[my_curie] = []

    return {
        'ancestors': ancestor_cache[my_curie],
        'descendants': descendant_cache[my_curie],
    }

# Procedural Code Starts Here

In [35]:
# Determine the filenames and target directory for the NCBI DuckDB
ncbi_compressed_filename = urlparse(ncbi_duckdb_url).path.split('/')[-1]
ncbi_filename = os.path.splitext(ncbi_compressed_filename)[0]
ncbi_compressed_file_path = os.path.join(ncbi_compressed_filename)
ncbi_uncompressed_file_path = os.path.join(ncbi_filename)

# target_dir = os.path.join('.') # just assume the files are downloaded into the same directory as the notebook

In [36]:
if os.path.isfile(ncbi_uncompressed_file_path):
    print(f"{ncbi_uncompressed_file_path} is already present in the current working directory.")
else:
    if os.path.isfile(ncbi_compressed_file_path):
        print(f"{ncbi_compressed_file_path} is already present in the current working directory.")
    else:
        print(f"{ncbi_compressed_file_path} needs to be downloaded")
        ncbi_response = requests.get(ncbi_duckdb_url)
        with open(ncbi_compressed_file_path, "wb") as f:
            f.write(ncbi_response.content)
        # ~ 2 minutes @ 250 Mbps
    
    # Unzip the compressed file and save the extracted file in target directory
    print(f"{ncbi_compressed_file_path} needs to be unpacked")
    with gzip.open(ncbi_compressed_file_path, "rb") as f_in:
        with open(ncbi_uncompressed_file_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    # ~ 2 minutes

ncbi_biosamples.duckdb is already present in the current working directory.


In [37]:
ncbi_conn = duckdb.connect(database=ncbi_uncompressed_file_path, read_only=True)

In [38]:
envo_adapter = get_adapter(envo_adapter_string)

In [39]:
anchor_descendants = get_curie_descendants_label_dict(semantic_anchor, [IS_A], envo_adapter)

In [40]:
anchor_descendants_lod = curie_descendants_label_dict_to_lod(anchor_descendants)

In [41]:
anchor_descendants_frame = curie_descendants_label_lod_to_df(anchor_descendants_lod)

In [42]:
anchor_descendants_frame

Unnamed: 0,curie,label
0,FOODON:00001838,fermented cereal beverage
1,ENVO:02000111,copper ore
2,ENVO:01000480,glass
3,FOODON:03400865,soup (eurofir)
4,FOODON:00001279,cake icing food product
...,...,...
1095,ENVO:01001117,poultry manure
1096,ENVO:00002117,creosote contaminated soil
1097,ENVO:01001121,plant matter
1098,ENVO:03605000,periphytic biofilm


----

In [43]:
sv = get_schemaview_from_source(previous_submission_schema_url)

In [44]:
# todo break out slow steps into its own cell

try:
    CONTEXT_ENUM_def = sv.get_enum(CONTEXT_ENUM)
    context_pvs_keys = list(CONTEXT_ENUM_def.permissible_values.keys())
except AttributeError as e:
    # Handle the AttributeError
    print(f"An AttributeError occurred: {e}")
    context_pvs_keys =[]
    

An AttributeError occurred: 'NoneType' object has no attribute 'permissible_values'


In [45]:
print(context_pvs_keys)

[]


In [46]:
initially_parsed_context_pvs = parse_hierarchically_underscored_strings(context_pvs_keys)

In [47]:
deduped_context_pvs = dedupe_underscoreless_pvs(initially_parsed_context_pvs)

In [48]:
pv_validation_results = validate_curie_label_list_dict(deduped_context_pvs, envo_adapter, print_flag=True)

In [49]:
pv_validation_results

{'problems': [], 'valids': []}

----

In [50]:
# todo rename to all_nmdc_samples etc
all_nmdc_biosamples = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL,
                                               BIOSAMPLE_SET_COLLECTION)  # Example with stop_after

# ~ 1 minute
# how long would saving and restoring to a file take?
# YAML is pretty but that would be the slowest
# try JSON
# pre-filter to only include the fields we need?

# todo cache this as a file

Fetched page 1 with 1000 documents. Total fetched: 1000
Fetched page 2 with 1000 documents. Total fetched: 2000
Fetched page 3 with 1000 documents. Total fetched: 3000
Fetched page 4 with 1000 documents. Total fetched: 4000
Fetched page 5 with 1000 documents. Total fetched: 5000
Fetched page 6 with 1000 documents. Total fetched: 6000
Fetched page 7 with 1000 documents. Total fetched: 7000
Fetched page 8 with 1000 documents. Total fetched: 8000
Fetched page 9 with 362 documents. Total fetched: 8362
All documents fetched.


In [51]:
env_pacakge_overrides = tsv_to_dict_of_dicts(env_package_override_file, 'id')

In [52]:
# todo show env_pacakge_overrides as a data frame
#   with some other columns for context?

In [53]:
biosample_contexts_lod = biosamples_lod_context_extractor(all_nmdc_biosamples, envo_adapter,
                                                          my_env_pacakge_overrides=env_pacakge_overrides)

# ~ 10 seconds, lots of logging

Overriding env_package for biosample nmdc:bsm-11-0k8nkx16 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-19v98823 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-1yvac190 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-28kgw077 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-2hswww54 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-34przm31 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-35m0rm03 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-3636w778 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nffqc45 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nhng665 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3r4g4610 from  to hydrocarbon resources-fluids_swabs
Overriding env_package

In [54]:
nmdc_biosample_contexts_frame = pd.DataFrame(biosample_contexts_lod)

In [55]:
# print a value count for the normalized_env_package column
print("Value counts for normalized_env_package column:")
print(nmdc_biosample_contexts_frame['normalized_env_package'].value_counts(dropna=False))

Value counts for normalized_env_package column:
normalized_env_package
                                                   5838
soil                                               1707
plant-associated                                    401
water                                               192
miscellaneous natural or artificial environment     140
host-associated                                      61
hydrocarbon resources-fluids_swabs                   23
Name: count, dtype: int64


In [56]:
package_predictions = predict_from_normalized_env_packages(nmdc_biosample_contexts_frame, envo_adapter)

# these predictions often have a f1 of 1.00
# many people might find that hard to believe

(8362, 14)
                                                 precision    recall  f1-score   support

                                host-associated       1.00      1.00      1.00        16
             hydrocarbon resources-fluids_swabs       1.00      1.00      1.00         7
miscellaneous natural or artificial environment       1.00      1.00      1.00        50
                               plant-associated       1.00      1.00      1.00       122
                                           soil       1.00      1.00      1.00       506
                                          water       1.00      1.00      1.00        57

                                       accuracy                           1.00       758
                                      macro avg       1.00      1.00      1.00       758
                                   weighted avg       1.00      1.00      1.00       758



In [57]:
nmdc_biosample_contexts_frame['predicted_env_package'] = package_predictions

In [58]:
nmdc_biosample_contexts_frame.shape

(8362, 15)

In [59]:
nmdc_biosample_contexts_frame = nmdc_biosample_contexts_frame[
    nmdc_biosample_contexts_frame['predicted_env_package'] == nmdc_package_selector]

In [60]:
nmdc_biosample_contexts_frame.shape

(1388, 15)

----

In [61]:
ncbi_frame = ncbi_conn.execute(ncbi_query).fetchdf()

In [62]:
ncbi_frame.insert(0, 'serial_number', range(1, len(ncbi_frame) + 1))

In [63]:
# includes values with counts of one... useful for discovering drag-down submissions?

In [64]:
ncbi_frame['content_list'] = ncbi_frame['content'].str.split('|')

In [65]:
## diagnostically useful, but why are we saving this and not all of the intermediate dataframes?
# ncbi_frame.to_csv("ncbi_frame.tsv", sep="\t", index=False)

In [66]:
# todo is there any reason to not do this ?
ncbi_frame = ncbi_frame[ncbi_frame['content'].notna() & (ncbi_frame['content'] != '')]

In [67]:
ncbi_frame['content_count'] = ncbi_frame['content_list'].apply(len)

In [68]:
ncbi_frame.shape

(12085, 5)

In [69]:
ncbi_frame = ncbi_frame.explode('content_list').reset_index(drop=True)

In [70]:
ncbi_frame.shape

(12252, 5)

In [71]:
# how many content_list strings contain envo multiple times now?

In [72]:
ncbi_frame['envo_count'] = ncbi_frame['content_list'].str.lower().str.count("envo")

In [73]:
ncbi_frame['envo_count'].value_counts()

envo_count
0    9686
1    2353
2     212
3       1
Name: count, dtype: int64

doesn't split multiple annotation strings delimited with something other than '|'
annotations with no curies but multiple strings will be "annotated" with OAK, but currently only the one best OAK annotation is kept

In [74]:
ncbi_frame[['extracted_label', 'extracted_curie']] = ncbi_frame['content_list'].apply(parse_curie_label)

In [75]:
parse_failures = ncbi_frame[
    (ncbi_frame['envo_count'] > 0) & (ncbi_frame['extracted_curie'].isna() | (ncbi_frame['extracted_curie'] == ''))]


In [76]:
parse_failures

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie
177,168,intertidal sedimentENVO_00002179],213,intertidal sedimentENVO_00002179],1,1,intertidal sedimentENVO_00002179],
411,392,ENVO:activated sludge,71,ENVO:activated sludge,1,1,ENVO activated sludge,
487,464,ENVO00010483,50,ENVO00010483,1,1,ENVO00010483,
611,580,anaerobic sludgeENVO:00002129,36,anaerobic sludgeENVO:00002129,1,1,anaerobic sludgeENVO 00002129,
786,746,ENVO00002123,23,ENVO00002123,1,1,ENVO00002123,
799,758,[ENVO00010483],22,[ENVO00010483],1,1,[ENVO00010483],
853,811,Freshwater Lake [ENVO 00000021],18,Freshwater Lake [ENVO 00000021],1,1,Freshwater Lake [ENVO 00000021],
1060,1013,ENVO: travertine fluid,10,ENVO: travertine fluid,1,1,ENVO travertine fluid,
1338,1283,sewage [ENVO00002018],6,sewage [ENVO00002018],1,1,sewage [ENVO00002018],
1583,1523,ENVO: borehole water,4,ENVO: borehole water,1,1,ENVO borehole water,


Should we try parsing on additional CURIE delimiters? Or no delimiter?


In [77]:
ncbi_frame['real_label'] = ncbi_frame['extracted_curie'].apply(envo_adapter.label)

In [78]:
# Apply the annotation function to each row in the 'label' column
ncbi_frame['longest_annotation_curie'] = ncbi_frame['extracted_label'].apply(
    lambda x: get_longest_annotation_curie(x, envo_adapter, MIN_ANNOTATION_LEN))

# this cell only takes ~ 1 minute, but generates a lot of "ERRORS" and WARNINGS in a red fornt
#   while loading the ontologies that are used for annotating


ERROR:root:Skipping statements(subject=ENVO:00000112,predicate=oio:hasDbXref,object=<http://www.eionet.europa.eu/gemet/concept/8704>,value=None,datatype=None,language=None,); ValueError: <http://www.eionet.europa.eu/gemet/concept/8704> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:00001996,predicate=oio:hasDbXref,object=<https://en.wikipedia.org/wiki/Acid_mine_drainage>,value=None,datatype=None,language=None,); ValueError: <https://en.wikipedia.org/wiki/Acid_mine_drainage> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000225,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests>,value=None,datatype=None,language=None,); ValueError: <https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000227,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical

In [79]:
ncbi_frame['longest_annotation_label'] = ncbi_frame['longest_annotation_curie'].apply(envo_adapter.label)

In [80]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label
0,1,water,50076,water,1,0,water,,,CHEBI:15377,water
1,2,seawater,12181,seawater,1,0,seawater,,,ENVO:00002149,sea water
2,3,not applicable,11993,not applicable,1,0,not applicable,,,,
3,4,sea water,10391,sea water,1,0,sea water,,,ENVO:00002149,sea water
4,5,missing,6481,missing,1,0,missing,,,,
...,...,...,...,...,...,...,...,...,...,...,...
12247,12082,Well1,1,Well1,1,0,Well1,,,ENVO:00000026,well
12248,12083,Dicentrarhus labrax (host),1,Dicentrarhus labrax (host),1,0,Dicentrarhus labrax (host),,,,
12249,12084,blue hole rim A 30' out water,1,blue hole rim A 30' out water,1,0,blue hole rim A 30' out water,,,CHEBI:15377,water
12250,12085,blue hole rim B Rim 60' out water,1,blue hole rim B Rim 60' out water,1,0,blue hole rim B Rim 60' out water,,,CHEBI:15377,water


In [81]:
if os.path.isfile(gold_data_file_name):
    print(f"{gold_data_file_name} is already present in the current working directory.")
else:
    print(f"{gold_data_file_name} needs to be downloaded")
    gold_response = requests.get(gold_data_url)
    with open(gold_data_file_name, "wb") as f:
        f.write(gold_response.content)
        # ~ 10 seconds  @ 250 Mbps

goldData.xlsx is already present in the current working directory.


Expect to see

> /home/mark/.cache/pypoetry/virtualenvs/nmdc-submission-schema-DC6HKp4p-py3.10/lib/python3.10/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

In [82]:
if os.path.isfile(gold_csv_file_name):
    print(f"{gold_csv_file_name} is present in the current working directory and will be parsed into gold_biosamples_frame.")
    gold_biosamples_frame = pd.read_csv(gold_csv_file_name, sep="\t")
else:
    print(f"gold_biosamples_frame and {gold_csv_file_name} need to be created")
    gold_biosamples_frame = pd.read_excel(gold_data_file_name, sheet_name=BIOSAMPLES_SHEET)
    gold_biosamples_frame.to_csv("gold_biosamples.csv", index=False, sep="\t")
    # 2 minutes

gold_biosamples.csv is present in the current working directory and will be parsed into gold_biosamples_frame.


In [83]:
gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].fillna(
    0).astype(int)


In [84]:
gold_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,Environmental,Aquatic,Thermal springs,Hot (42-90C),Unclassified
1,Gb0035601,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
2,Gb0035602,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
3,Gb0035635,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
4,Gb0035638,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211608,Gb0405291,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
211609,Gb0405292,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
211610,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified
211611,Gb0405294,Freshwater microbial communities from Prairie ...,449393.0,freshwater metagenome,lake water,2023-08-08,"USA: Prairie Lake NEON Field Site, Vashti, Nor...",47.159710,-99.118723,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified


In [85]:
# Determine the filenames and target directory
goldterms_compressed_filename = urlparse(goldterms_semsql_url).path.split('/')[-1]
goldterms_filename = os.path.splitext(goldterms_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

# Print to confirm the filenames
print(goldterms_filename)

goldterms.db


In [86]:
# Fetch the contents from the URL and save compressed file in target directory
goldterms_response = requests.get(goldterms_semsql_url)
goldterms_compressed_file_path = os.path.join(target_dir, goldterms_compressed_filename)
with open(goldterms_compressed_file_path, "wb") as f:
    f.write(goldterms_response.content)
    
# ~ 1 second

In [87]:
# Unzip the compressed file and save the extracted file in target directory
goldterms_uncompressed_file_path = os.path.join(target_dir, goldterms_filename)
with gzip.open(goldterms_compressed_file_path, "rb") as f_in:
    with open(goldterms_uncompressed_file_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# ~ 1 second

In [88]:
# that's all fast. don't bother caching

In [89]:
goldterms_conn = sqlite3.connect(goldterms_uncompressed_file_path)

In [90]:
goldterms_subjects = pd.read_sql_query(goldterms_subclass_query, goldterms_conn)

In [91]:
goldterms_subjects['path_id'] = goldterms_subjects['subject'].str.extract(r'GOLDTERMS:(\d+)')

In [92]:
goldterms_subjects

Unnamed: 0,subject,path_id
0,GOLDTERMS:4184,4184
1,GOLDTERMS:5342,5342
2,GOLDTERMS:4019,4019
3,GOLDTERMS:4012,4012
4,GOLDTERMS:5544,5544
...,...,...
205,GOLDTERMS:4653,4653
206,GOLDTERMS:4167,4167
207,GOLDTERMS:5346,5346
208,GOLDTERMS:3965,3965


In [93]:
gold_path_ids = goldterms_subjects['path_id'].dropna().unique().tolist()
gold_path_ids = [int(my_id) for my_id in gold_path_ids]


In [94]:
gold_env_filtered_biosamples_frame = gold_biosamples_frame[
    gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].isin(gold_path_ids)]


In [95]:
gold_env_filtered_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,Environmental,Aquatic,Thermal springs,Hot (42-90C),Unclassified
15,Gb0050975,Acid Mine Drainage (ARMAN) microbial communiti...,718308.0,mine drainage metagenome,Acid mine drainage,2005-06-01,"Richmond Mine, Iron Mountain CA",40.677339,-122.522194,4164,Environmental,Aquatic,Freshwater,Groundwater,Acid Mine Drainage
16,Gb0050977,Marine microbial communities from the Indian O...,405178.0,marine metagenome,Indian Ocean,2005-08-01,Indian Ocean,-8.505250,80.375583,4008,Environmental,Aquatic,Marine,Oceanic,Unclassified
17,Gb0050978,Marine ecosystem from Global Ocean Sampling (G...,405178.0,marine metagenome,"Cocos Island, Costa Rica",,"Cocos Island, Costa Rica",5.640000,-86.565280,3973,Environmental,Aquatic,Non-marine Saline and Alkaline,Saline,Unclassified
18,Gb0050979,"Fossil microbial communities from Whale Fall, ...",444079.0,fossil metagenome,"Whale Fall, Santa Cruz Basin, Pacific Ocean",,"Whale Fall, Santa Cruz Basin, Pacific Ocean",33.300000,-119.220000,4000,Environmental,Aquatic,Marine,Fossil,Whale fall
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211563,Gb0404279,Freshwater sediment microbial communities from...,749907.0,sediment metagenome,freshwater epipsammon,2023-10-18,"USA: Gatlinburg, Tennessee",35.690420,-83.503790,5385,Environmental,Aquatic,Freshwater,Creek,Sediment
211564,Gb0404280,Freshwater microbial communities from Caribou ...,449393.0,freshwater metagenome,stream water,2023-10-31,"USA: Chatanika, Alaska",65.153080,-147.501997,4514,Environmental,Aquatic,Freshwater,Creek,Unclassified
211606,Gb0405289,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,river biofilm,2023-07-11,"USA: Arikaree River NEON Field Site, Yuma Coun...",39.758200,-102.447148,5348,Environmental,Aquatic,Freshwater,River,River biofilm
211610,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified


In [96]:
goldterms_context_frame = pd.read_sql_query(goldterms_envo_query, goldterms_conn)

In [97]:
goldterms_context_frame['object_label'] = goldterms_context_frame['object'].apply(envo_adapter.label)

In [98]:
goldterms_context_frame['path_id'] = goldterms_context_frame['subject'].str.extract(r'GOLDTERMS:(\d+)')

In [99]:
goldterms_context_frame

Unnamed: 0,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,GOLDTERMS:Engineered-Lab-enrichment-Defined-me...,GOLDTERMS:Engineered-Lab-enrichment-Defined-me...,mixs:env_medium,ENVO:00002170,,,,,compost,
1,GOLDTERMS:Engineered-Lab-enrichment-Freshwater,GOLDTERMS:Engineered-Lab-enrichment-Freshwater,mixs:env_medium,ENVO:00002011,,,,,fresh water,
2,GOLDTERMS:Engineered-Lab-enrichment-Rock,GOLDTERMS:Engineered-Lab-enrichment-Rock,mixs:env_medium,ENVO:00001995,,,,,rock,
3,GOLDTERMS:Engineered-Solid-waste-Agricultural-...,GOLDTERMS:Engineered-Solid-waste-Agricultural-...,mixs:env_medium,ENVO:01000371,,,,,agricultural waste material,
4,GOLDTERMS:Engineered-Solid-waste-Cellulose-ass...,GOLDTERMS:Engineered-Solid-waste-Cellulose-ass...,mixs:env_medium,ENVO:02000155,,,,,solid waste material,
...,...,...,...,...,...,...,...,...,...,...
393,GOLDTERMS:5833,GOLDTERMS:5833,mixs:env_medium,ENVO:00002001,,,,,waste water,5833
394,GOLDTERMS:5836,GOLDTERMS:5836,mixs:env_medium,ENVO:00002044,,,,,sludge,5836
395,GOLDTERMS:5838,GOLDTERMS:5838,mixs:env_medium,ENVO:00002001,,,,,waste water,5838
396,GOLDTERMS:5841,GOLDTERMS:5841,mixs:env_medium,ENVO:03600013,,,,,bituminous sand,5841


In [100]:
# Fill NaN values in 'BIOSAMPLE ECOSYSTEM PATH ID' with 0 and convert to int
gold_env_filtered_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_env_filtered_biosamples_frame[
    'BIOSAMPLE ECOSYSTEM PATH ID'].fillna(0).astype(int)

# Drop rows with NaN in 'path_id' in goldterms_context_frame
goldterms_context_frame = goldterms_context_frame.dropna(subset=['path_id'])

# Convert 'path_id' to int
goldterms_context_frame['path_id'] = goldterms_context_frame['path_id'].astype(int)

# Perform the left merge
gold_env_filtered_biosamples_with_inferred = gold_env_filtered_biosamples_frame.merge(
    goldterms_context_frame,
    left_on='BIOSAMPLE ECOSYSTEM PATH ID',
    right_on='path_id',
    how='left'
)


In [101]:
gold_env_filtered_biosamples_with_inferred

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,...,,,,,,,,,,
1,Gb0050975,Acid Mine Drainage (ARMAN) microbial communiti...,718308.0,mine drainage metagenome,Acid mine drainage,2005-06-01,"Richmond Mine, Iron Mountain CA",40.677339,-122.522194,4164,...,GOLDTERMS:4164,GOLDTERMS:4164,mixs:env_medium,ENVO:01001004,,,,,groundwater,4164.0
2,Gb0050977,Marine microbial communities from the Indian O...,405178.0,marine metagenome,Indian Ocean,2005-08-01,Indian Ocean,-8.505250,80.375583,4008,...,,,,,,,,,,
3,Gb0050978,Marine ecosystem from Global Ocean Sampling (G...,405178.0,marine metagenome,"Cocos Island, Costa Rica",,"Cocos Island, Costa Rica",5.640000,-86.565280,3973,...,,,,,,,,,,
4,Gb0050979,"Fossil microbial communities from Whale Fall, ...",444079.0,fossil metagenome,"Whale Fall, Santa Cruz Basin, Pacific Ocean",,"Whale Fall, Santa Cruz Basin, Pacific Ocean",33.300000,-119.220000,4000,...,GOLDTERMS:4000,GOLDTERMS:4000,mixs:env_medium,ENVO:01000140,,,,,whale fall,4000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52099,Gb0404279,Freshwater sediment microbial communities from...,749907.0,sediment metagenome,freshwater epipsammon,2023-10-18,"USA: Gatlinburg, Tennessee",35.690420,-83.503790,5385,...,GOLDTERMS:5385,GOLDTERMS:5385,mixs:env_medium,ENVO:00002007,,,,,sediment,5385.0
52100,Gb0404280,Freshwater microbial communities from Caribou ...,449393.0,freshwater metagenome,stream water,2023-10-31,"USA: Chatanika, Alaska",65.153080,-147.501997,4514,...,GOLDTERMS:4514,GOLDTERMS:4514,mixs:env_medium,ENVO:00002011,,,,,fresh water,4514.0
52101,Gb0405289,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,river biofilm,2023-07-11,"USA: Arikaree River NEON Field Site, Yuma Coun...",39.758200,-102.447148,5348,...,GOLDTERMS:5348,GOLDTERMS:5348,mixs:env_medium,ENVO:00002011,,,,,fresh water,5348.0
52102,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,...,GOLDTERMS:4179,GOLDTERMS:4179,mixs:env_medium,ENVO:00002011,,,,,fresh water,4179.0


# GOLDTERMS only approach

In [102]:
goldterms_result = pd.read_sql_query(extension_query, goldterms_conn)

In [103]:
goldterms_result

Unnamed: 0,subject,predicate,content
0,GOLDTERMS:Environmental-Aquatic-Floodplain,mixs:env_broad,ENVO:00002030
1,GOLDTERMS:Environmental-Aquatic-Floodplain,mixs:env_local,ENVO:00000255
2,GOLDTERMS:Environmental-Aquatic-Floodplain,rdfs:label,Environmental > Aquatic > Floodplain
3,GOLDTERMS:Environmental-Aquatic-Freshwater-Aqu...,mixs:env_broad,ENVO:00002030
4,GOLDTERMS:Environmental-Aquatic-Freshwater-Aqu...,mixs:env_medium,ENVO:00002011
...,...,...,...
569,GOLDTERMS:5807,mixs:env_medium,ENVO:00002010
570,GOLDTERMS:5807,rdfs:label,Environmental > Aquatic > Non-marine Saline an...
571,GOLDTERMS:5825,mixs:env_broad,ENVO:00000447
572,GOLDTERMS:5825,mixs:env_local,ENVO:01000687


In [104]:
# # todo: save this kind of content before subsetting on an environment
# #   the subsetting is currently baked into the query
# 
# # see also goldterms_queries.ipynb in MAM's Collab
# goldterms_result.to_csv("goldterms_single_environment_mappings_long.tsv", sep="\t", index=False)

In [105]:
goldterms_only_curies = goldterms_result.loc[goldterms_result['predicate'] == gold_context_selector, 'content']


In [106]:
goldterms_only_curies = goldterms_only_curies.unique().tolist()

In [107]:
goldterms_only_curies

['ENVO:00002011',
 'ENVO:01000008',
 'ENVO:00002012',
 'ENVO:01000140',
 'ENVO:03600064',
 'ENVO:00003064',
 'ENVO:01000277',
 'ENVO:00000133',
 'ENVO:01001004',
 'ENVO:01001267',
 'ENVO:02000145',
 'ENVO:03600082',
 'ENVO:01001869',
 'ENVO:03605000',
 'ENVO:01000063',
 'ENVO:01001064',
 'ENVO:00003096',
 'ENVO:00001995',
 'ENVO:00003044',
 'ENVO:00001998',
 'ENVO:00002200',
 'ENVO:01000173',
 'ENVO:02000138',
 'ENVO:00002034',
 'ENVO:01000406',
 'ENVO:01000142',
 'ENVO:03600007',
 'ENVO:00002010']

----

In [108]:
anchor_curies = list(anchor_descendants_frame['curie'])
legacy_pv_curies = [i['curie'] for i in pv_validation_results['valids']]

biome_curies = list(envo_adapter.descendants(BIOME, predicates=[IS_A])) # 
terrestrial_biome_curies = list(envo_adapter.descendants(TERRESTRIAL_BIOME, predicates=[IS_A]))
aquatic_biome_curies = list(envo_adapter.descendants(AQUATIC_BIOME, predicates=[IS_A]))
abp_curies = list(envo_adapter.descendants(ABP, predicates=[IS_A]))
env_sys_curies = list(envo_adapter.descendants(ENVIRONMENTAL_SYSTEM, predicates=[IS_A]))
env_mat_curies = list(envo_adapter.descendants(ENVIRONMENTAL_MATERIAL, predicates=[IS_A]))
obsoletes_curies = list(envo_adapter.obsoletes())

soil_curies = list(envo_adapter.descendants(SOIL, predicates=[IS_A])) # 
liquid_water_curies = list(envo_adapter.descendants(LIQUID_WATER, predicates=[IS_A])) # 
water_ice_curies = list(envo_adapter.descendants(WATER_ICE, predicates=[IS_A])) # 

In [109]:
include_in_rows = set()

In [110]:
include_in_rows.update(anchor_curies)

In [111]:
include_in_rows.update(legacy_pv_curies)

In [112]:
include_in_rows.update(nmdc_biosample_contexts_frame[nmdc_context_selector])

In [113]:
include_in_rows.update(ncbi_frame['extracted_curie'])

In [114]:
include_in_rows.update(ncbi_frame['longest_annotation_curie'])

In [115]:
include_in_rows.update(gold_env_filtered_biosamples_with_inferred['object'])

In [116]:
rows_lod = []

In [117]:
for curie in include_in_rows:
    if curie is None:
        continue
        
    # ONCE AGAIN, assuming that EnvO is the only ontology we'll check against
    current_ancestors = list(envo_adapter.ancestors(curie, predicates=[IS_A])) # vs legacy_pv_curies
    ancestors_in_enum_count = len(set(current_ancestors) & set(legacy_pv_curies))
    
    current_descendants  = list(envo_adapter.descendants(curie, predicates=[IS_A])) # vs legacy_pv_curies
    descendants_in_enum_count  = len(set(current_descendants) & set(legacy_pv_curies))
    
    
    row = {
        'curie': curie,
        'label': envo_adapter.label(curie),
        'envo_native': False,
        'obsolete': False,
        comparison_enum_column_name: False,
        'ancestors_in_enum_count': ancestors_in_enum_count,
        'descendants_in_enum_count': descendants_in_enum_count,
        'abp': False,
        'env_sys': False,
        'biome': False,
        'terrestrial_biome': False,
        'aquatic_biome': False,
        'env_mat': False,
        'soil': False,
        'liquid water': False,
        'water ice': False,
        'goldterms_mappings': False,
    }
        
    if curie in biome_curies:
        row['biome'] = True
    if curie in terrestrial_biome_curies:
        row['terrestrial_biome'] = True
    if curie in aquatic_biome_curies:
        row['aquatic_biome'] = True
    if curie in abp_curies:
        row['abp'] = True
    if curie in env_sys_curies:
        row['env_sys'] = True
    if curie in env_mat_curies:
        row['env_mat'] = True
    if curie in soil_curies:
        row['soil'] = True
    if curie in liquid_water_curies:
        row['liquid water'] = True
    if curie in water_ice_curies:
        row['water ice'] = True
    if curie in legacy_pv_curies:
        row[comparison_enum_column_name] = True
    if curie in obsoletes_curies:
        row['obsolete'] = True
    if curie in goldterms_only_curies:
        row['goldterms_mappings'] = True
        
    try:
        prefix, local_id = curie.split(':')
        if prefix and prefix == 'ENVO' and row['label'] is not None:
            row['envo_native'] = True
    except Exception as e:
        # Print the exception message
        print(f"An error occurred: {e} trying to split {curie}")

    rows_lod.append(row)

# 2 minutes


An error occurred: 'float' object has no attribute 'split' trying to split nan


In [118]:
rows_frame = pd.DataFrame(rows_lod)

In [119]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,no_comparison_enum,ancestors_in_enum_count,descendants_in_enum_count,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,soil,liquid water,water ice,goldterms_mappings
0,ENVO:00005753,urea enriched soil,True,False,False,0,0,True,False,False,False,False,True,True,False,False,False
1,ENVO:01001644,material primarily composed of biogenic carbon...,True,False,False,0,0,False,False,False,False,False,True,False,False,False,False
2,,,False,False,False,0,0,False,False,False,False,False,False,False,False,False,False
3,ENVO:00002623,,False,False,False,0,0,False,False,False,False,False,False,False,False,False,False
4,PATO:0001566,distributed,False,False,False,0,0,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2608,ENVO:03501324,latex,True,False,False,0,0,False,False,False,False,False,True,False,False,False,False
2609,ENVO:00000570,obsolete brackish water habitat,True,True,False,0,0,False,False,False,False,False,False,False,False,False,False
2610,ENVO:00000210,marine aphotic zone,True,False,False,0,0,True,False,False,False,False,False,False,False,False,False
2611,ENVO:03605004,epipsammon,True,False,False,0,0,False,False,False,False,False,True,False,False,False,False


In [120]:
nmdc_biosample_scoped_counts = nmdc_biosample_contexts_frame[nmdc_context_selector].value_counts().reset_index()
nmdc_biosample_scoped_counts.columns = ['curie', 'nmdc_scoped_count']


In [121]:
nmdc_biosample_scoped_counts

Unnamed: 0,curie,nmdc_scoped_count
0,ENVO:04000007,467
1,ENVO:00002042,287
2,ENVO:00002007,165
3,ENVO:03605001,150
4,ENVO:03605006,104
5,ENVO:03605004,93
6,ENVO:01000017,35
7,ENVO:03605002,27
8,ENVO:03605003,25
9,ENVO:03605005,22


In [122]:
# Perform the left merge
rows_frame = rows_frame.merge(
    nmdc_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [123]:
gold_env_filtered_biosamples_with_inferred

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,...,,,,,,,,,,
1,Gb0050975,Acid Mine Drainage (ARMAN) microbial communiti...,718308.0,mine drainage metagenome,Acid mine drainage,2005-06-01,"Richmond Mine, Iron Mountain CA",40.677339,-122.522194,4164,...,GOLDTERMS:4164,GOLDTERMS:4164,mixs:env_medium,ENVO:01001004,,,,,groundwater,4164.0
2,Gb0050977,Marine microbial communities from the Indian O...,405178.0,marine metagenome,Indian Ocean,2005-08-01,Indian Ocean,-8.505250,80.375583,4008,...,,,,,,,,,,
3,Gb0050978,Marine ecosystem from Global Ocean Sampling (G...,405178.0,marine metagenome,"Cocos Island, Costa Rica",,"Cocos Island, Costa Rica",5.640000,-86.565280,3973,...,,,,,,,,,,
4,Gb0050979,"Fossil microbial communities from Whale Fall, ...",444079.0,fossil metagenome,"Whale Fall, Santa Cruz Basin, Pacific Ocean",,"Whale Fall, Santa Cruz Basin, Pacific Ocean",33.300000,-119.220000,4000,...,GOLDTERMS:4000,GOLDTERMS:4000,mixs:env_medium,ENVO:01000140,,,,,whale fall,4000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52099,Gb0404279,Freshwater sediment microbial communities from...,749907.0,sediment metagenome,freshwater epipsammon,2023-10-18,"USA: Gatlinburg, Tennessee",35.690420,-83.503790,5385,...,GOLDTERMS:5385,GOLDTERMS:5385,mixs:env_medium,ENVO:00002007,,,,,sediment,5385.0
52100,Gb0404280,Freshwater microbial communities from Caribou ...,449393.0,freshwater metagenome,stream water,2023-10-31,"USA: Chatanika, Alaska",65.153080,-147.501997,4514,...,GOLDTERMS:4514,GOLDTERMS:4514,mixs:env_medium,ENVO:00002011,,,,,fresh water,4514.0
52101,Gb0405289,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,river biofilm,2023-07-11,"USA: Arikaree River NEON Field Site, Yuma Coun...",39.758200,-102.447148,5348,...,GOLDTERMS:5348,GOLDTERMS:5348,mixs:env_medium,ENVO:00002011,,,,,fresh water,5348.0
52102,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,...,GOLDTERMS:4179,GOLDTERMS:4179,mixs:env_medium,ENVO:00002011,,,,,fresh water,4179.0


In [124]:
gold_biosample_scoped_counts = gold_env_filtered_biosamples_with_inferred['object'].value_counts().reset_index()
gold_biosample_scoped_counts.columns = ['curie', 'gold_hybrid_count']

In [125]:
gold_biosample_scoped_counts

Unnamed: 0,curie,gold_hybrid_count
0,ENVO:00002011,14334
1,ENVO:00002007,7322
2,ENVO:01000008,1447
3,ENVO:01001004,613
4,ENVO:01000277,368
5,ENVO:00003064,337
6,ENVO:00000133,212
7,ENVO:00002012,179
8,ENVO:00001998,141
9,ENVO:01000063,138


In [126]:
# Perform the left merge
rows_frame = rows_frame.merge(
    gold_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [127]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,no_comparison_enum,ancestors_in_enum_count,descendants_in_enum_count,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,soil,liquid water,water ice,goldterms_mappings,nmdc_scoped_count,gold_hybrid_count
0,ENVO:00005753,urea enriched soil,True,False,False,0,0,True,False,False,False,False,True,True,False,False,False,,
1,ENVO:01001644,material primarily composed of biogenic carbon...,True,False,False,0,0,False,False,False,False,False,True,False,False,False,False,,
2,,,False,False,False,0,0,False,False,False,False,False,False,False,False,False,False,,
3,ENVO:00002623,,False,False,False,0,0,False,False,False,False,False,False,False,False,False,False,,
4,PATO:0001566,distributed,False,False,False,0,0,False,False,False,False,False,False,False,False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2608,ENVO:03501324,latex,True,False,False,0,0,False,False,False,False,False,True,False,False,False,False,,
2609,ENVO:00000570,obsolete brackish water habitat,True,True,False,0,0,False,False,False,False,False,False,False,False,False,False,,
2610,ENVO:00000210,marine aphotic zone,True,False,False,0,0,True,False,False,False,False,False,False,False,False,False,,
2611,ENVO:03605004,epipsammon,True,False,False,0,0,False,False,False,False,False,True,False,False,False,False,93.0,


In [128]:
# 990 rows in https://docs.google.com/spreadsheets/d/12WH3eduBq2qSTy9zVF3n7fyajn6ssLZL/edit?gid=546570706#gid=546570706

In [129]:
# gold and ncbi counts are slightly trickier
# for gold: including mappings only, mappings in hybrid with biosample counts. 
#    Switch to direct biosample counts of GOLD "envo" annotations?
# ncbi: we have extracted curies and annotated curies

In [130]:
# todo move this stuff up to immediately after the creation of ncbi_frame ?

# todo don't accept extracted curie if no real label?
# any kind of string similarity checking for label of annotated curie vs extracted label ?
# look for long runs of curies?
# can we measure the beneficial impact of any of this? current crux: how to distribute counts

ncbi_frame['curie_list'] = ncbi_frame.apply(
    lambda my_row: list({my_row['extracted_curie'], my_row['longest_annotation_curie']} - {None}),
    axis=1
)

ncbi_frame['unique_curie_count'] = ncbi_frame['curie_list'].apply(len)

In [131]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count
0,1,water,50076,water,1,0,water,,,CHEBI:15377,water,[CHEBI:15377],1
1,2,seawater,12181,seawater,1,0,seawater,,,ENVO:00002149,sea water,[ENVO:00002149],1
2,3,not applicable,11993,not applicable,1,0,not applicable,,,,,[],0
3,4,sea water,10391,sea water,1,0,sea water,,,ENVO:00002149,sea water,[ENVO:00002149],1
4,5,missing,6481,missing,1,0,missing,,,,,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12247,12082,Well1,1,Well1,1,0,Well1,,,ENVO:00000026,well,[ENVO:00000026],1
12248,12083,Dicentrarhus labrax (host),1,Dicentrarhus labrax (host),1,0,Dicentrarhus labrax (host),,,,,[],0
12249,12084,blue hole rim A 30' out water,1,blue hole rim A 30' out water,1,0,blue hole rim A 30' out water,,,CHEBI:15377,water,[CHEBI:15377],1
12250,12085,blue hole rim B Rim 60' out water,1,blue hole rim B Rim 60' out water,1,0,blue hole rim B Rim 60' out water,,,CHEBI:15377,water,[CHEBI:15377],1


In [132]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
0    7195
1    3969
2    1088
Name: count, dtype: int64

In [133]:
double_curie_frame = ncbi_frame[ncbi_frame['unique_curie_count'] > 1]

In [134]:
double_curie_frame = double_curie_frame[['extracted_curie', 'longest_annotation_curie']]

In [135]:
double_curie_frame = double_curie_frame.drop_duplicates()

In [136]:
double_curie_frame[['extracted_prefix', 'extracted_local_id']] = double_curie_frame['extracted_curie'].str.split(':', expand=True)

In [137]:
double_curie_frame['extracted_local_id_int'] = pd.to_numeric(double_curie_frame['extracted_local_id'], errors='coerce').astype('Int64')

In [138]:
# Ensure extracted_local_id_int is unique and sorted
unique_sorted_series = double_curie_frame['extracted_local_id_int'].dropna().drop_duplicates().sort_values()


In [139]:
# Find stretches
stretches_dict = find_consecutive_stretches_dict(unique_sorted_series)

# pprint.pprint(stretches_dict)

In [140]:
# Convert the stretches dictionary into a DataFrame
stretches_df = stretches_dict_to_long_dataframe(stretches_dict)

In [141]:
stretches_df

Unnamed: 0,stretch_id,value
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
893,6,2018051508
894,6,2018051509
895,6,2018051510
896,6,2018051511


In [142]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretches_df,
    left_on='extracted_local_id_int',
    right_on='value',
    how='left'
)

In [143]:
stretch_summary_df = summarize_stretch_groups(double_curie_frame)


In [144]:
stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:00002007,0.925926
1,2.0,ENVO:00002150,0.713362
2,3.0,ENVO:01000301,0.974359
3,4.0,CHEBI:15377,1.0
4,5.0,CHEBI:15377,1.0
5,6.0,CHEBI:15377,1.0


In [145]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretch_summary_df,
    left_on='stretch_id',
    right_on='stretch_id',
    how='left'
)

In [146]:
drag_evidence_frame = double_curie_frame[double_curie_frame['stretch_id'] >= 1]
drag_evidence_frame = drag_evidence_frame[['extracted_curie', 'longest_annotation_curie']]
drag_evidence_frame['drag_evidence'] = True

In [147]:
drag_evidence_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,drag_evidence
1,ENVO:00002150,ENVO:00002149,True
2,ENVO:00002041,ENVO:01001004,True
5,ENVO:000002150,ENVO:00002150,True
10,ENVO:00002066,ENVO:00002043,True
11,ENVO:01000301,CHEBI:15377,True
...,...,...,...
1055,ENVO:00000010,ENVO:00002007,True
1056,ENVO:00000011,ENVO:00002007,True
1057,ENVO:00000015,ENVO:00002007,True
1058,ENVO:00000017,ENVO:00002007,True


In [148]:
ncbi_frame = ncbi_frame.merge(
    drag_evidence_frame,
    left_on=['extracted_curie', 'longest_annotation_curie'],
    right_on=['extracted_curie', 'longest_annotation_curie'],
    how='left'
)

In [149]:
# Initialize dragless_curie_list with curie_list values
ncbi_frame["dragless_curie_list"] = ncbi_frame["curie_list"]

# Update dragless_curie_list based on the condition
for index, row in ncbi_frame.iterrows():
    if row["drag_evidence"] is True:
        if row["longest_annotation_curie"] is not None:
            ncbi_frame.at[index, "dragless_curie_list"] = [row["longest_annotation_curie"]]
        else:
            ncbi_frame.at[index, "dragless_curie_list"] = []

ncbi_frame['dragless_curie_count'] = ncbi_frame['dragless_curie_list'].apply(len)

In [150]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
0    7195
1    3969
2    1088
Name: count, dtype: int64

In [151]:
ncbi_frame['dragless_curie_count'].value_counts()

dragless_curie_count
0    7195
1    5023
2      34
Name: count, dtype: int64

In [152]:
ncbi_frame.shape

(12252, 16)

In [153]:
ncbi_frame_undisputed = ncbi_frame[ncbi_frame['dragless_curie_count'] <= 1]

In [154]:
ncbi_frame_undisputed.shape

(12218, 16)

In [155]:
ncbi_frame_disputed = ncbi_frame[ncbi_frame['dragless_curie_count'] > 1]

In [156]:
ncbi_frame_disputed.shape

(34, 16)

In [157]:
ncbi_frame_disputed = ncbi_frame_disputed.explode("dragless_curie_list", ignore_index=True)


In [158]:
ncbi_frame_disputed.shape

(68, 16)

In [159]:
ncbi_frame_disputed["dragless_curie_list"] = ncbi_frame_disputed["dragless_curie_list"].apply(lambda x: [x])

In [160]:
# Combine the rows of ncbi_frame_undisputed and ncbi_frame_disputed into a new DataFrame
ncbi_disputes_exploded_frame = pd.concat([ncbi_frame_undisputed, ncbi_frame_disputed], ignore_index=True)


In [161]:
ncbi_disputes_exploded_frame.shape

(12286, 16)

In [162]:
ncbi_disputes_exploded_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count,drag_evidence,dragless_curie_list,dragless_curie_count
0,1,water,50076,water,1,0,water,,,CHEBI:15377,water,[CHEBI:15377],1,,[CHEBI:15377],1
1,2,seawater,12181,seawater,1,0,seawater,,,ENVO:00002149,sea water,[ENVO:00002149],1,,[ENVO:00002149],1
2,3,not applicable,11993,not applicable,1,0,not applicable,,,,,[],0,,[],0
3,4,sea water,10391,sea water,1,0,sea water,,,ENVO:00002149,sea water,[ENVO:00002149],1,,[ENVO:00002149],1
4,5,missing,6481,missing,1,0,missing,,,,,[],0,,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12281,8528,"Water, [ENVO:00002007]",1,"Water, [ENVO:00002007]",1,1,"Water,",ENVO:00002007,sediment,CHEBI:15377,water,"[ENVO:00002007, CHEBI:15377]",2,,[CHEBI:15377],2
12282,11695,Surface sea water [ENVO:01001581],1,Surface sea water [ENVO:01001581],1,1,Surface sea water,ENVO:01001581,sea surface layer,ENVO:00002149,sea water,"[ENVO:00002149, ENVO:01001581]",2,,[ENVO:00002149],2
12283,11695,Surface sea water [ENVO:01001581],1,Surface sea water [ENVO:01001581],1,1,Surface sea water,ENVO:01001581,sea surface layer,ENVO:00002149,sea water,"[ENVO:00002149, ENVO:01001581]",2,,[ENVO:01001581],2
12284,11699,Thin stillage (ENVO:01000371),1,Thin stillage (ENVO:01000371),1,1,Thin stillage,ENVO:01000371,agricultural waste material,PATO:0000592,decreased thickness,"[ENVO:01000371, PATO:0000592]",2,,[ENVO:01000371],2


In [163]:
ncbi_disputes_exploded_frame['post_explode_curie_count'] = ncbi_disputes_exploded_frame['dragless_curie_list'].apply(len)

In [164]:
ncbi_disputes_exploded_frame['post_explode_curie_count'].value_counts()

post_explode_curie_count
0    7195
1    5091
Name: count, dtype: int64

In [165]:
# Set 'post_explode_curie' to the 0th item in 'dragless_curie_list'
ncbi_disputes_exploded_frame["post_explode_curie"] = ncbi_disputes_exploded_frame["dragless_curie_list"].apply(
    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
)

In [166]:

ncbi_biosample_scoped_counts = (
    ncbi_disputes_exploded_frame.groupby("post_explode_curie")["sample_count"].sum().reset_index()
)

ncbi_biosample_scoped_counts.columns = ['curie', 'ncbi_scoped_count']

In [167]:
ncbi_biosample_scoped_counts

Unnamed: 0,curie,ncbi_scoped_count
0,BFO:0000029,1
1,BFO:0000030,129
2,CHEBI:15022,6
3,CHEBI:15377,66578
4,CHEBI:16134,2
...,...,...
1076,UBERON:0002416,794
1077,UBERON:0002535,3
1078,UBERON:0004529,1
1079,UBERON:0006314,4


In [168]:
# Perform the left merge
rows_frame = rows_frame.merge(
    ncbi_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [169]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,no_comparison_enum,ancestors_in_enum_count,descendants_in_enum_count,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,soil,liquid water,water ice,goldterms_mappings,nmdc_scoped_count,gold_hybrid_count,ncbi_scoped_count
0,ENVO:00005753,urea enriched soil,True,False,False,0,0,True,False,False,False,False,True,True,False,False,False,,,
1,ENVO:01001644,material primarily composed of biogenic carbon...,True,False,False,0,0,False,False,False,False,False,True,False,False,False,False,,,
2,,,False,False,False,0,0,False,False,False,False,False,False,False,False,False,False,,,
3,ENVO:00002623,,False,False,False,0,0,False,False,False,False,False,False,False,False,False,False,,,
4,PATO:0001566,distributed,False,False,False,0,0,False,False,False,False,False,False,False,False,False,False,,,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2608,ENVO:03501324,latex,True,False,False,0,0,False,False,False,False,False,True,False,False,False,False,,,
2609,ENVO:00000570,obsolete brackish water habitat,True,True,False,0,0,False,False,False,False,False,False,False,False,False,False,,,1.0
2610,ENVO:00000210,marine aphotic zone,True,False,False,0,0,True,False,False,False,False,False,False,False,False,False,,,5.0
2611,ENVO:03605004,epipsammon,True,False,False,0,0,False,False,False,False,False,True,False,False,False,False,93.0,,


In [170]:
rows_frame.to_csv(output_file_name, sep="\t", index=False)