In [1]:
from common import *

import gzip
import os
import shutil
import sqlite3
from urllib.parse import urlparse

import duckdb
import pandas as pd
import requests
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import yaml


In [2]:
print("verify output is being rendered")

verify output is being rendered


In [3]:
# Initialize cache dictionaries for predict_from_normalized_env_packages
# todo how to move the definitions for functions that use these globals? Or just use caching around the function?
ancestor_cache = {}
descendant_cache = {}

In [4]:
# todo deal with circularity in env package prediction -> env triad reporting

# todo this on-demand NCBI curie extraction and annotation recapitulates work that is being added to
# https://portal.nersc.gov/project/m3408/biosamples_duckdb/
# via 
#   although that doesn't detect auto-incremented curies from  spreadsheet dragging

# todo if more caching is desired, it should probably take the form of saving dataframes for TSV

# eventually, dig up a complete JSON gold biosample dump for non-hybrid biosample counts

# overall run time (if NCBI biosamples and goldData are cached): ~ 10 minutes

# Task Settings
_For making a Soil env_broad_scale voting sheet vs a Sediment env_local_scale sheet, etc._

todo: bundle these into dicts so they don't have to be modified independently and kept in sync with one another.

In [5]:
output_file_name = "voting_sheets_output/soil_env_broad_scale_voting_sheet.tsv"

In [6]:
semantic_anchor = 'ENVO:00000428' # biome for env_broad_scale
# semantic_anchor = 'ENVO:01000813' # astronomical body part "abp" for env_local_scale
# semantic_anchor = 'ENVO:00010483' # environmental material for env_medium

In [7]:
plant_first_where = "s1.value like 'host-associated > plants%'"
sediment_first_where = "lower(s1.value) like 'environmental > aquatic%sediment%'"
soil_first_where = "s1.value like 'environmental > terrestrial > soil%'"
water_first_where = "s1.value like 'environmental > aquatic%' and lower(s1.value) not like '%sediment%'"

## context selectors

In [8]:
gold_context_selector = 'mixs:env_broad'
# gold_context_selector = 'mixs:env_local'
# gold_context_selector = 'mixs:env_medium'

In [9]:
ncbi_context_selector = 'env_broad_scale'
# ncbi_context_selector = 'env_local_scale'
# ncbi_context_selector = 'env_medium'

In [10]:
nmdc_context_selector= 'env_broad_scale_id'
# nmdc_context_selector= 'env_local_scale_id'
# nmdc_context_selector= 'env_medium_id'

## package aka environment aka extension selectors

In [11]:
gold_first_where = soil_first_where

In [12]:
# todo new since soil: why are we only considering MIMS.me for discovering appropriate env triad values?
#   there's usually a roughly equal number of biosamples from in each extension for MIMS.me and 

# ncbi_package_selector = 'MIMS.me.plant-associated.6.0'
# ncbi_package_selector = 'MIMS.me.sediment.6.0'
# ncbi_package_selector = 'MIMS.me.soil.6.0'
# ncbi_package_selector = 'MIMS.me.water.6.0'

# ncbi_package_selector = 'MIMS.me.plant-associated.6.0'
# ncbi_package_selector = 'MIMS.me.sediment.6.0'
ncbi_package_selector = 'soil.6.0'
# ncbi_package_selector = 'MIMS.me.water.6.0'

In [13]:
# nmdc_package_selector = 'plant-associated'
# nmdc_package_selector = 'sediment'
nmdc_package_selector = 'soil'
# nmdc_package_selector = 'water'


In [14]:
GOLDTERMS_NA = '' # ???

GOLDTERMS_PLANT_ASSOCIATED = GOLDTERMS_NA # host associated -> viridiplantae? take a string approach!
GOLDTERMS_SEDIMENT = 'GOLDTERMS:3985' #  doesn't have any subclasses
GOLDTERMS_SOIL = 'GOLDTERMS:4212'
GOLDTERMS_WATER = 'GOLDTERMS:3984'

# GOLDTERMS:4180, 'Environmental > Aquatic > Freshwater > Pond > Sediment' and ~64 more don't share a common root
# poetry run runoak -i sqlite:obo:goldterms info 't~sediment'


In [15]:
goldterms_root = GOLDTERMS_SOIL

## selecting name and version of one enum for comparison


In [16]:
# only the Soil enums have legacy definitions (v10.7 and earlier?)

CONTEXT_ENUM = "EnvBroadScaleSoilEnum"
# CONTEXT_ENUM = "EnvLocalScaleSoilEnum"
# CONTEXT_ENUM = "EnvMediumSoilEnum"

# CONTEXT_ENUM = ""

In [17]:
# todo: add columns for membership in multiple enums from multiple version of the schema?
#  like sediment local vs soil local and water local (once that's completed)
#  get them from schema files or something prior to that? seems like the voting sheets are too raw/preliminary for that
#   can use a more recent schema url for more recent enums!

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

# previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

In [18]:
# todo: don't call the column "legacy_pv". use the name of the enum and the version of the schema?

comparison_enum_column_name = 'EnvBroadScaleSoilEnum_10_7'

# Additional Settings

In [19]:
# Approved prefixes (case-insensitive)
approved_prefixes = ['ENVO']

In [20]:
MIN_ANNOTATION_LEN = 3

In [21]:
NMDC_RUNTIME_BASE_URL = 'https://api.microbiomedata.org/nmdcschema/'
STUDY_SET_COLLECTION = 'study_set'
BIOSAMPLE_SET_COLLECTION = 'biosample_set'

In [22]:
envo_adapter_string = "sqlite:obo:envo"

In [23]:
# goldterms_adapter_string = "sqlite:obo:envo"

In [24]:
env_package_override_file = 'mam-env-package-overrides.tsv'
override_column = 'mam_inferred_env_package'

In [25]:
ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples_2024-09-23.duckdb.gz'

In [26]:
gold_data_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
gold_data_file_name = "goldData.xlsx" # goldData.xlsx: Microsoft Excel 2007+
gold_csv_file_name = "gold_biosamples.csv"
BIOSAMPLES_SHEET = "Biosample"

In [27]:
goldterms_semsql_url = "https://s3.amazonaws.com/bbop-sqlite/goldterms.db.gz"


# CURIE Constants

In [28]:
BIOME = 'ENVO:00000428'
TERRESTRIAL_BIOME = 'ENVO:00000446'
AQUATIC_BIOME = 'ENVO:00002030'
ABP = 'ENVO:01000813'
ENVIRONMENTAL_SYSTEM = 'ENVO:01000254'
ENVIRONMENTAL_MATERIAL = 'ENVO:00010483'

# Settings-based Queries

In [29]:
goldterms_subclass_query = f"""
select
	subject
from
	entailed_edge ee
where
	predicate = 'rdfs:subClassOf'
	and object = '{goldterms_root}'
"""

In [30]:
# todo could this have been done with a OAK query, eliminating the need to explicitly download the file?

goldterms_envo_query = f"""
SELECT
	*
FROM
	statements s
WHERE
	predicate = '{gold_context_selector}'"""

In [31]:
ncbi_query = f"""
SELECT content, COUNT(1) AS sample_count 
FROM attributes 
WHERE harmonized_name = '{ncbi_context_selector}' AND package_content like '%{ncbi_package_selector}'
GROUP BY content
ORDER BY COUNT(1) DESC
"""

In [32]:
# and s1.subject = s1.stanza eliminates matches on blank node annotation rows (probably wouldn't change results but adds a little overhead)

extension_query = f"""
select
		s1.subject ,
		s2.predicate,
		COALESCE (s2."object",
	s2."value") as content
from
	statements s1
join statements s2 on 
	s1.subject = s2.subject
where
	{gold_first_where}
	and s1.predicate = 'rdfs:label'
	and s1.subject = s1.stanza
	and s2.predicate in ('mixs:env_broad', 'mixs:env_local', 'mixs:env_medium', 'mixs:mixs_extension', 'rdfs:label', 'mixs:other', 'mixs:anatomical_site', 'mixs:host_taxon') ;
"""

# todo provide examples of sediment samples that are excluded by the requirement for aquatic

# Locally Defined Functions
_Currently using locally-defined cache dictionaries_

In [33]:
def predict_from_normalized_env_packages(df_raw, adapter):
    """
    Predict an environmental package for every row from its env triad terms.

    For each of the three env triad CURIE columns, the labels of the term's
    is_a ancestors and descendants are looked up with get_hierarchy_terms,
    vectorized with vectorize_terms, and stacked side by side into one sparse
    feature matrix. A random forest is trained and evaluated on the rows that
    already carry a non-empty 'normalized_env_package', and is then applied to
    all rows.

    Side effects: prints the shape of the input frame and a classification
    report for the held-out 30% of the labelled rows.

    Args:
        df_raw: DataFrame with 'env_broad_scale_id', 'env_local_scale_id',
            'env_medium_id' and 'normalized_env_package' columns. It is copied,
            not modified.
        adapter: Ontology adapter passed through to get_hierarchy_terms.

    Returns:
        The classifier's predicted package for every row of df_raw, in row order.
    """
    df = df_raw.copy()

    print(df.shape)
    for column in ['env_broad_scale_id', 'env_local_scale_id', 'env_medium_id']:
        # get_hierarchy_terms caches per CURIE, so the second lookup is cheap
        df[f'{column}_ancestors'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['ancestors'])
        df[f'{column}_descendants'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['descendants'])

    # Vectorize each set of terms separately
    broad_scale_ancestors = vectorize_terms(df, 'env_broad_scale_id_ancestors')
    broad_scale_descendants = vectorize_terms(df, 'env_broad_scale_id_descendants')

    local_scale_ancestors = vectorize_terms(df, 'env_local_scale_id_ancestors')
    local_scale_descendants = vectorize_terms(df, 'env_local_scale_id_descendants')

    medium_ancestors = vectorize_terms(df, 'env_medium_id_ancestors')
    medium_descendants = vectorize_terms(df, 'env_medium_id_descendants')

    # Combine all feature matrices. CSR is requested explicitly because rows are
    # selected from X below, and not every sparse format supports row indexing.
    X = hstack([
        broad_scale_ancestors,
        broad_scale_descendants,
        local_scale_ancestors,
        local_scale_descendants,
        medium_ancestors,
        medium_descendants
    ], format='csr')

    # Rows that have a usable (non-null, non-empty) target value
    labelled_mask = df['normalized_env_package'].notnull() & (df['normalized_env_package'] != "")

    # Extract the target variable
    y = df.loc[labelled_mask, 'normalized_env_package']

    # Select the matching rows of X by POSITION. X knows nothing about the
    # DataFrame's index labels, so using df.index here would pick the wrong rows
    # (or fail) whenever the caller's frame does not have a default RangeIndex.
    labelled_positions = labelled_mask.to_numpy().nonzero()[0]
    X_filtered = X[labelled_positions]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.3, random_state=42)

    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # not determining confidence for each class nor saving any diagnostics any more

    # Predict for every row, labelled or not
    return clf.predict(X)

In [34]:
def get_hierarchy_terms(my_curie: str, adapter) -> dict:
    """
    Look up the labels of a term's is_a ancestors and descendants.

    Results are memoized in the module-level ancestor_cache and
    descendant_cache dictionaries, keyed by CURIE, so the ontology is only
    queried the first time a CURIE is seen. If a lookup raises, the error is
    printed and an empty list is cached for that CURIE.

    Args:
        my_curie (str): CURIE identifier for the ontology term.
        adapter: Ontology adapter providing ancestors(), descendants() and label().

    Returns:
        dict: {'ancestors': [...], 'descendants': [...]} with the cached label lists.
    """
    # Ancestors: only query the ontology on a cache miss
    if my_curie not in ancestor_cache:
        try:
            ancestor_curies = list(adapter.ancestors(my_curie, predicates=[IS_A]))
            ancestor_labels = [adapter.label(term) for term in ancestor_curies if term]
        except Exception as my_e:
            print(f"Error retrieving ancestors for {my_curie}: {my_e}")
            ancestor_labels = []
        ancestor_cache[my_curie] = ancestor_labels

    # Descendants: same pattern, separate cache
    if my_curie not in descendant_cache:
        try:
            descendant_curies = list(adapter.descendants(my_curie, predicates=[IS_A]))
            descendant_labels = [adapter.label(term) for term in descendant_curies if term]
        except Exception as my_e:
            print(f"Error retrieving descendants for {my_curie}: {my_e}")
            descendant_labels = []
        descendant_cache[my_curie] = descendant_labels

    return dict(
        ancestors=ancestor_cache[my_curie],
        descendants=descendant_cache[my_curie],
    )

# Procedural Code Starts Here

In [35]:
# Derive the local file names for the NCBI DuckDB from the last segment of its URL path.
# Both files live in the current working directory, so each path is just the file name.
ncbi_compressed_filename = urlparse(ncbi_duckdb_url).path.rsplit('/', 1)[-1]
ncbi_filename = os.path.splitext(ncbi_compressed_filename)[0]  # drop the trailing .gz
ncbi_compressed_file_path = ncbi_compressed_filename
ncbi_uncompressed_file_path = ncbi_filename

# target_dir = os.path.join('.') # just assume the files are downloaded into the same directory as the notebook

In [36]:
# Make sure the uncompressed NCBI DuckDB file exists locally, downloading and/or
# unpacking it only when needed. Both steps write to a temporary ".part" file and
# rename it on success, so an interrupted run never leaves a truncated file that a
# later run would mistake for a complete one.
if os.path.isfile(ncbi_uncompressed_file_path):
    print(f"{ncbi_uncompressed_file_path} is already present in the current working directory.")
else:
    if os.path.isfile(ncbi_compressed_file_path):
        print(f"{ncbi_compressed_file_path} is already present in the current working directory.")
    else:
        print(f"{ncbi_compressed_file_path} needs to be downloaded")
        ncbi_download_part_path = ncbi_compressed_file_path + ".part"
        # Stream to disk instead of holding the whole archive in memory
        with requests.get(ncbi_duckdb_url, stream=True) as ncbi_response:
            # Fail here on an HTTP error rather than saving an error page as the archive
            ncbi_response.raise_for_status()
            with open(ncbi_download_part_path, "wb") as f:
                for ncbi_chunk in ncbi_response.iter_content(chunk_size=1024 * 1024):
                    f.write(ncbi_chunk)
        os.replace(ncbi_download_part_path, ncbi_compressed_file_path)
        # ~ 2 minutes @ 250 Mbps

    # Unzip the compressed file and save the extracted file in target directory
    print(f"{ncbi_compressed_file_path} needs to be unpacked")
    ncbi_unpack_part_path = ncbi_uncompressed_file_path + ".part"
    with gzip.open(ncbi_compressed_file_path, "rb") as f_in:
        with open(ncbi_unpack_part_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.replace(ncbi_unpack_part_path, ncbi_uncompressed_file_path)

    # ~ 2 minutes

ncbi_biosamples_2024-09-23.duckdb is already present in the current working directory.


In [37]:
ncbi_conn = duckdb.connect(database=ncbi_uncompressed_file_path, read_only=True)

In [38]:
envo_adapter = get_adapter(envo_adapter_string)

In [39]:
anchor_descendants = get_curie_descendants_label_dict(semantic_anchor, [IS_A], envo_adapter)

In [40]:
anchor_descendants_lod = curie_descendants_label_dict_to_lod(anchor_descendants)

In [41]:
anchor_descendants_frame = curie_descendants_label_lod_to_df(anchor_descendants_lod)

In [42]:
anchor_descendants_frame

Unnamed: 0,curie,label
0,ENVO:01001505,alpine tundra biome
1,ENVO:01000024,marine benthic biome
2,ENVO:01000252,freshwater lake biome
3,ENVO:01000180,tundra biome
4,ENVO:01000123,marine sponge reef biome
...,...,...
123,ENVO:01000858,marine upwelling biome
124,ENVO:01000188,tropical savanna biome
125,ENVO:01000042,neritic epipelagic zone biome
126,ENVO:01000045,epeiric sea biome


----

In [43]:
sv = get_schemaview_from_source(previous_submission_schema_url)

In [44]:
# todo break out slow steps into its own cell

# Collect the permissible-value keys of the enum named by CONTEXT_ENUM from the
# schema view; fall back to an empty list when the lookup raises AttributeError.
try:
    CONTEXT_ENUM_def = sv.get_enum(CONTEXT_ENUM)
    context_pvs_keys = list(CONTEXT_ENUM_def.permissible_values.keys())
except AttributeError as e:
    # Handle the AttributeError
    # NOTE(review): presumably this happens when get_enum finds no enum with that
    # name (e.g. CONTEXT_ENUM = "") and returns None — confirm against the schema view API
    print(f"An AttributeError occurred: {e}")
    context_pvs_keys =[]
    

In [45]:
print(context_pvs_keys)

['arid biome [ENVO:01001838]', 'subalpine biome [ENVO:01001837]', 'montane biome [ENVO:01001836]', '__montane savanna biome [ENVO:01000223]', '__montane shrubland biome [ENVO:01000216]', 'alpine biome [ENVO:01001835]', '__alpine tundra biome [ENVO:01001505]', 'subpolar biome [ENVO:01001834]', 'subtropical biome [ENVO:01001832]', '__mediterranean biome [ENVO:01001833]', '____mediterranean savanna biome [ENVO:01000229]', '____mediterranean shrubland biome [ENVO:01000217]', '____mediterranean woodland biome [ENVO:01000208]', '__subtropical woodland biome [ENVO:01000222]', '__subtropical shrubland biome [ENVO:01000213]', '__subtropical savanna biome [ENVO:01000187]', 'temperate biome [ENVO:01001831]', '__temperate woodland biome [ENVO:01000221]', '__temperate shrubland biome [ENVO:01000215]', '__temperate savanna biome [ENVO:01000189]', 'tropical biome [ENVO:01001830]', '__tropical woodland biome [ENVO:01000220]', '__tropical shrubland biome [ENVO:01000214]', '__tropical savanna biome [ENV

In [46]:
initially_parsed_context_pvs = parse_hierarchically_underscored_strings(context_pvs_keys)

In [47]:
deduped_context_pvs = dedupe_underscoreless_pvs(initially_parsed_context_pvs)

In [48]:
pv_validation_results = validate_curie_label_list_dict(deduped_context_pvs, envo_adapter, print_flag=True)

In [49]:
pv_validation_results

{'problems': [],
 'valids': [{'curie': 'ENVO:01001838', 'label': 'arid biome'},
  {'curie': 'ENVO:01001837', 'label': 'subalpine biome'},
  {'curie': 'ENVO:01001836', 'label': 'montane biome'},
  {'curie': 'ENVO:01000223', 'label': 'montane savanna biome'},
  {'curie': 'ENVO:01000216', 'label': 'montane shrubland biome'},
  {'curie': 'ENVO:01001835', 'label': 'alpine biome'},
  {'curie': 'ENVO:01001505', 'label': 'alpine tundra biome'},
  {'curie': 'ENVO:01001834', 'label': 'subpolar biome'},
  {'curie': 'ENVO:01001832', 'label': 'subtropical biome'},
  {'curie': 'ENVO:01001833', 'label': 'mediterranean biome'},
  {'curie': 'ENVO:01000229', 'label': 'mediterranean savanna biome'},
  {'curie': 'ENVO:01000217', 'label': 'mediterranean shrubland biome'},
  {'curie': 'ENVO:01000208', 'label': 'mediterranean woodland biome'},
  {'curie': 'ENVO:01000222', 'label': 'subtropical woodland biome'},
  {'curie': 'ENVO:01000213', 'label': 'subtropical shrubland biome'},
  {'curie': 'ENVO:01000187',

----

In [50]:
# todo rename to all_nmdc_samples etc
all_nmdc_biosamples = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL,
                                               BIOSAMPLE_SET_COLLECTION)  # Example with stop_after

# ~ 1 minute
# how long would saving and restoring to a file take?
# YAML is pretty but that would be the slowest
# try JSON
# pre-filter to only include the fields we need?

# todo cache this as a file

Fetched page 1 with 1000 documents. Total fetched: 1000
Fetched page 2 with 1000 documents. Total fetched: 2000
Fetched page 3 with 1000 documents. Total fetched: 3000
Fetched page 4 with 1000 documents. Total fetched: 4000
Fetched page 5 with 1000 documents. Total fetched: 5000
Fetched page 6 with 1000 documents. Total fetched: 6000
Fetched page 7 with 1000 documents. Total fetched: 7000
Fetched page 8 with 1000 documents. Total fetched: 8000
Fetched page 9 with 362 documents. Total fetched: 8362
All documents fetched.


In [51]:
# # todo I don't think we're actually using this
# all_studies = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL, STUDY_SET_COLLECTION)  # Example with stop_after

In [52]:
env_pacakge_overrides = tsv_to_dict_of_dicts(env_package_override_file, 'id')

In [53]:
# todo show env_pacakge_overrides as a data frame
#   with some other columns for context?

In [54]:
biosample_contexts_lod = biosamples_lod_context_extractor(all_nmdc_biosamples, envo_adapter,
                                                          my_env_pacakge_overrides=env_pacakge_overrides)

# ~ 10 seconds, lots of logging

Overriding env_package for biosample nmdc:bsm-11-0k8nkx16 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-19v98823 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-1yvac190 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-28kgw077 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-2hswww54 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-34przm31 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-35m0rm03 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-3636w778 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nffqc45 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nhng665 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3r4g4610 from  to hydrocarbon resources-fluids_swabs
Overriding env_package

In [55]:
nmdc_biosample_contexts_frame = pd.DataFrame(biosample_contexts_lod)

In [56]:
# print a value count for the normalized_env_package column
print("Value counts for normalized_env_package column:")
print(nmdc_biosample_contexts_frame['normalized_env_package'].value_counts(dropna=False))

Value counts for normalized_env_package column:
normalized_env_package
                                                   5838
soil                                               1707
plant-associated                                    401
water                                               192
miscellaneous natural or artificial environment     140
host-associated                                      61
hydrocarbon resources-fluids_swabs                   23
Name: count, dtype: int64


In [57]:
package_predictions = predict_from_normalized_env_packages(nmdc_biosample_contexts_frame, envo_adapter)

# these predictions often have a f1 of 1.00
# many people might find that hard to believe

(8362, 14)
                                                 precision    recall  f1-score   support

                                host-associated       1.00      1.00      1.00        16
             hydrocarbon resources-fluids_swabs       1.00      1.00      1.00         7
miscellaneous natural or artificial environment       1.00      1.00      1.00        50
                               plant-associated       1.00      1.00      1.00       122
                                           soil       1.00      1.00      1.00       506
                                          water       1.00      1.00      1.00        57

                                       accuracy                           1.00       758
                                      macro avg       1.00      1.00      1.00       758
                                   weighted avg       1.00      1.00      1.00       758



In [58]:
nmdc_biosample_contexts_frame['predicted_env_package'] = package_predictions

In [59]:
nmdc_biosample_contexts_frame.shape

(8362, 15)

In [60]:
nmdc_biosample_contexts_frame = nmdc_biosample_contexts_frame[
    nmdc_biosample_contexts_frame['predicted_env_package'] == nmdc_package_selector]

In [61]:
nmdc_biosample_contexts_frame.shape

(6162, 15)

----

In [62]:
ncbi_frame = ncbi_conn.execute(ncbi_query).fetchdf()

In [63]:
ncbi_frame.insert(0, 'serial_number', range(1, len(ncbi_frame) + 1))

In [64]:
# includes values with counts of one... useful for discovering drag-down submissions?

In [65]:
ncbi_frame['content_list'] = ncbi_frame['content'].str.split('|')

In [66]:
## diagnostically useful, but why are we saving this and not all of the intermediate dataframes?
# ncbi_frame.to_csv("ncbi_frame.tsv", sep="\t", index=False)

In [67]:
# todo is there any reason to not do this ?
# Drop rows whose content is missing or empty. The explicit .copy() makes the result an
# independent frame, so the column assignments in the following cells write to a real
# DataFrame rather than to a filtered slice (which pandas flags with SettingWithCopyWarning).
ncbi_frame = ncbi_frame[ncbi_frame['content'].notna() & (ncbi_frame['content'] != '')].copy()

In [68]:
ncbi_frame['content_count'] = ncbi_frame['content_list'].apply(len)

In [69]:
ncbi_frame.shape

(7191, 5)

In [70]:
ncbi_frame = ncbi_frame.explode('content_list').reset_index(drop=True)

In [71]:
ncbi_frame.shape

(7330, 5)

In [72]:
# how many content_list strings contain envo multiple times now?

In [73]:
ncbi_frame['envo_count'] = ncbi_frame['content_list'].str.lower().str.count("envo")

In [74]:
ncbi_frame['envo_count'].value_counts()

envo_count
0    5110
1    2213
2       6
3       1
Name: count, dtype: int64

doesn't split multiple annotation strings delimited with something other than '|'
annotations with no curies but multiple strings will be "annotated" with OAK, but currently only the one best OAK annotation is kept

In [75]:
ncbi_frame[['extracted_label', 'extracted_curie']] = ncbi_frame['content_list'].apply(parse_curie_label)

In [76]:
parse_failures = ncbi_frame[
    (ncbi_frame['envo_count'] > 0) & (ncbi_frame['extracted_curie'].isna() | (ncbi_frame['extracted_curie'] == ''))]


In [77]:
parse_failures

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie
316,310,ENVO:Tropical and subtropical moist broadleaf ...,172,ENVO:Tropical and subtropical moist broadleaf ...,1,1,ENVO Tropical and subtropical moist broadleaf ...,
318,312,ENVO:subtropical coniferous forest biome,171,ENVO:subtropical coniferous forest biome,1,1,ENVO subtropical coniferous forest biome,
348,340,ENVO:Tropical and subtropical coniferous fores...,150,ENVO:Tropical and subtropical coniferous fores...,1,1,ENVO Tropical and subtropical coniferous fores...,
372,364,"ENVO:Temperate grasslands, savannas, and shrub...",137,"ENVO:Temperate grasslands, savannas, and shrub...",1,1,"ENVO Temperate grasslands, savannas, and shrub...",
373,365,ENVO:Tropical moist broadleaf forest biome,137,ENVO:Tropical moist broadleaf forest biome,1,1,ENVO Tropical moist broadleaf forest biome,
...,...,...,...,...,...,...,...,...
6557,6433,ENVO0000317,1,ENVO0000317,1,1,ENVO0000317,
6558,6434,ENVO0000320,1,ENVO0000320,1,1,ENVO0000320,
6559,6435,ENVO0000325,1,ENVO0000325,1,1,ENVO0000325,
6725,6601,ENVO0000019,1,ENVO0000019,1,1,ENVO0000019,


Should we try parsing on additional CURIE delimiters? Or no delimiter?


In [78]:
ncbi_frame['real_label'] = ncbi_frame['extracted_curie'].apply(envo_adapter.label)

In [79]:
# Apply the annotation function to each row in the 'label' column
ncbi_frame['longest_annotation_curie'] = ncbi_frame['extracted_label'].apply(
    lambda x: get_longest_annotation_curie(x, envo_adapter, MIN_ANNOTATION_LEN))

# this cell only takes ~ 1 minute, but generates a lot of "ERRORS" and WARNINGS in a red font
#   while loading the ontologies that are used for annotating


ERROR:root:Skipping statements(subject=ENVO:00000112,predicate=oio:hasDbXref,object=<http://www.eionet.europa.eu/gemet/concept/8704>,value=None,datatype=None,language=None,); ValueError: <http://www.eionet.europa.eu/gemet/concept/8704> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:00001996,predicate=oio:hasDbXref,object=<https://en.wikipedia.org/wiki/Acid_mine_drainage>,value=None,datatype=None,language=None,); ValueError: <https://en.wikipedia.org/wiki/Acid_mine_drainage> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000225,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests>,value=None,datatype=None,language=None,); ValueError: <https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000227,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical

In [80]:
ncbi_frame['longest_annotation_label'] = ncbi_frame['longest_annotation_curie'].apply(envo_adapter.label)

In [81]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label
0,1,soil biome [ENVO:01001044],16533,soil biome [ENVO:01001044],1,1,soil biome,ENVO:01001044,soil environment,ENVO:00000428,biome
1,2,missing,12170,missing,1,0,missing,,,,
2,3,soil,10370,soil,1,0,soil,,,ENVO:00001998,soil
3,4,not applicable,9683,not applicable,1,0,not applicable,,,,
4,5,not collected,6404,not collected,1,0,not collected,,,,
...,...,...,...,...,...,...,...,...,...,...,...
7325,7187,peat biome after methane incubation,1,peat biome after methane incubation,1,0,peat biome after methane incubation,,,CHEBI:16183,methane
7326,7188,Chernozemic soil,1,Chernozemic soil,1,0,Chernozemic soil,,,ENVO:00002237,chernozem
7327,7189,marine supra-littoral zone,1,marine supra-littoral zone,1,0,marine supra-littoral zone,,,ENVO:01000124,marine supra-littoral zone
7328,7190,soil microcosm,1,soil microcosm,1,0,soil microcosm,,,ENVO:01000621,microcosm


In [82]:
# Download the GOLD Excel export once; later runs reuse the local copy.
if os.path.isfile(gold_data_file_name):
    print(f"{gold_data_file_name} is already present in the current working directory.")
else:
    print(f"{gold_data_file_name} needs to be downloaded")
    gold_response = requests.get(gold_data_url)
    # Fail here on an HTTP error. Otherwise an error page would be saved under the
    # .xlsx name and every later run would treat it as a valid cached download.
    gold_response.raise_for_status()
    with open(gold_data_file_name, "wb") as f:
        f.write(gold_response.content)
        # ~ 10 seconds  @ 250 Mbps
        # ~ 10 seconds  @ 250 Mbps

goldData.xlsx is already present in the current working directory.


Expect to see

> /home/mark/.cache/pypoetry/virtualenvs/nmdc-submission-schema-DC6HKp4p-py3.10/lib/python3.10/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

In [83]:
# Load the GOLD Biosample sheet, using a delimited-text cache because parsing the
# Excel workbook is slow. Note that the cache is tab-separated despite its .csv name.
if os.path.isfile(gold_csv_file_name):
    print(f"{gold_csv_file_name} is present in the current working directory and will be parsed into gold_biosamples_frame.")
    gold_biosamples_frame = pd.read_csv(gold_csv_file_name, sep="\t")
else:
    print(f"gold_biosamples_frame and {gold_csv_file_name} need to be created")
    gold_biosamples_frame = pd.read_excel(gold_data_file_name, sheet_name=BIOSAMPLES_SHEET)
    # Write to the same configured path that the existence check and read_csv use,
    # so changing gold_csv_file_name cannot leave the cache permanently missed.
    gold_biosamples_frame.to_csv(gold_csv_file_name, index=False, sep="\t")
    # 2 minutes

gold_biosamples.csv is present in the current working directory and will be parsed into gold_biosamples_frame.


In [84]:
gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].fillna(
    0).astype(int)


In [85]:
gold_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,Environmental,Aquatic,Thermal springs,Hot (42-90C),Unclassified
1,Gb0035601,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
2,Gb0035602,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
3,Gb0035635,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
4,Gb0035638,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210932,Gb0405291,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
210933,Gb0405292,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
210934,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified
210935,Gb0405294,Freshwater microbial communities from Prairie ...,449393.0,freshwater metagenome,lake water,2023-08-08,"USA: Prairie Lake NEON Field Site, Vashti, Nor...",47.159710,-99.118723,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified


In [86]:
# Determine the filenames and target directory
goldterms_compressed_filename = urlparse(goldterms_semsql_url).path.split('/')[-1]
goldterms_filename = os.path.splitext(goldterms_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

# Print to confirm the filenames
print(goldterms_filename)

goldterms.db


In [87]:
# Fetch the contents from the URL and save compressed file in target directory
goldterms_response = requests.get(goldterms_semsql_url)
# Stop here on an HTTP error; otherwise the error body would be written to disk
# and the gzip step in the next cell would fail with a less helpful message.
goldterms_response.raise_for_status()
goldterms_compressed_file_path = os.path.join(target_dir, goldterms_compressed_filename)
with open(goldterms_compressed_file_path, "wb") as f:
    f.write(goldterms_response.content)
    
# ~ 1 second

In [88]:
# Decompress the downloaded gzip archive into the target directory
goldterms_uncompressed_file_path = os.path.join(target_dir, goldterms_filename)
with gzip.open(goldterms_compressed_file_path, "rb") as f_in, \
        open(goldterms_uncompressed_file_path, "wb") as f_out:
    # copy the decompressed stream straight to the output file
    shutil.copyfileobj(f_in, f_out)

# ~ 1 second

In [89]:
# The download and decompression above are fast (~1 second each), so caching them is not worthwhile.

In [90]:
# Open the decompressed goldterms SQLite file; this connection is reused by the
# read_sql_query calls in later cells and is not closed in this section.
goldterms_conn = sqlite3.connect(goldterms_uncompressed_file_path)

In [91]:
# Run goldterms_subclass_query (defined elsewhere in the notebook) against the
# goldterms database and load the result as a DataFrame.
goldterms_subjects = pd.read_sql_query(sql=goldterms_subclass_query, con=goldterms_conn)

In [92]:
# Pull the numeric id out of subjects of the form GOLDTERMS:<digits>.
# Subjects that do not match (e.g. the path-style GOLDTERMS:Environmental-... ids)
# get a missing value.
goldterms_subjects['path_id'] = goldterms_subjects['subject'].str.extract(
    r'GOLDTERMS:(\d+)', expand=False
)

In [93]:
goldterms_subjects

Unnamed: 0,subject,path_id
0,GOLDTERMS:5820,5820
1,GOLDTERMS:5421,5421
2,GOLDTERMS:5617,5617
3,GOLDTERMS:Environmental-Terrestrial-Soil-Natur...,
4,GOLDTERMS:Environmental-Terrestrial-Soil-Pasture,
...,...,...
81,GOLDTERMS:4203,4203
82,GOLDTERMS:Environmental-Terrestrial-Soil-Uncla...,
83,GOLDTERMS:5804,5804
84,GOLDTERMS:4241,4241


In [94]:
# Distinct numeric path ids as Python ints (subjects without a numeric id are dropped).
gold_path_ids = [
    int(my_id)
    for my_id in goldterms_subjects['path_id'].dropna().unique()
]


In [95]:
# Keep only the GOLD biosamples whose ecosystem path id is one of the ids
# collected from the goldterms subjects.
gold_env_filtered_biosamples_frame = gold_biosamples_frame.loc[
    gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].isin(gold_path_ids)
]


In [96]:
gold_env_filtered_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
11,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
12,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
13,Gb0050973,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classificaiton Distrophic oxisol,,Soils collected from different sites within th...,-29.539671,-55.107556,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
14,Gb0050974,Soil ecosystem from different sites within th...,410658.0,soil metagenome,"soil classification euic, hyperthermic lithic ...",,Soils collected from different sites within th...,26.663199,-80.628500,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
56,Gb0051017,Mammuthus primigenius fossil ecosystem from Bo...,444079.0,fossil metagenome,"Bolshaya Kolopatkaya river, Russia",,Russia: Sakha Republic,70.000000,151.000000,4418,Environmental,Terrestrial,Soil,Fossil,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210902,Gb0405261,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
210903,Gb0405262,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
210904,Gb0405263,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
210905,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified


In [97]:
# Run goldterms_envo_query (defined elsewhere in the notebook) against the
# goldterms database and load the result as a DataFrame.
goldterms_context_frame = pd.read_sql_query(sql=goldterms_envo_query, con=goldterms_conn)

In [98]:
# Look up a label for every mapped object CURIE through the ontology adapter.
goldterms_context_frame['object_label'] = goldterms_context_frame['object'].map(envo_adapter.label)

In [99]:
# Pull the numeric id out of subjects of the form GOLDTERMS:<digits>;
# subjects without a numeric id get a missing value.
goldterms_context_frame['path_id'] = goldterms_context_frame['subject'].str.extract(
    r'GOLDTERMS:(\d+)', expand=False
)

In [100]:
goldterms_context_frame

Unnamed: 0,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,GOLDTERMS:Engineered-Artificial-ecosystem,GOLDTERMS:Engineered-Artificial-ecosystem,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
1,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
2,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
3,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
4,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
...,...,...,...,...,...,...,...,...,...,...
643,GOLDTERMS:5841,GOLDTERMS:5841,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5841
644,GOLDTERMS:5843,GOLDTERMS:5843,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5843
645,GOLDTERMS:5846,GOLDTERMS:5846,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5846
646,GOLDTERMS:5849,GOLDTERMS:5849,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,5849


In [101]:
# Normalize the join keys to plain ints on both sides, then left-merge the
# goldterms context rows onto the filtered GOLD biosamples.
#
# Both frames reaching this cell are derived from other frames (a boolean
# filter and a dropna). Writing a column into them with `frame[col] = ...`
# triggers pandas' SettingWithCopyWarning, so each key column is rebuilt with
# `.assign`, which returns a new frame and keeps the column in place.

# Fill NaN values in 'BIOSAMPLE ECOSYSTEM PATH ID' with 0 and convert to int
gold_env_filtered_biosamples_frame = gold_env_filtered_biosamples_frame.assign(
    **{
        'BIOSAMPLE ECOSYSTEM PATH ID': gold_env_filtered_biosamples_frame[
            'BIOSAMPLE ECOSYSTEM PATH ID'].fillna(0).astype(int)
    }
)

# Drop rows with NaN in 'path_id' in goldterms_context_frame, then convert 'path_id' to int
goldterms_context_frame = goldterms_context_frame.dropna(subset=['path_id'])
goldterms_context_frame = goldterms_context_frame.assign(
    path_id=goldterms_context_frame['path_id'].astype(int)
)

# Perform the left merge: every biosample row is kept, and a biosample gets one
# output row per matching context row.
gold_env_filtered_biosamples_with_inferred = gold_env_filtered_biosamples_frame.merge(
    goldterms_context_frame,
    left_on='BIOSAMPLE ECOSYSTEM PATH ID',
    right_on='path_id',
    how='left'
)


In [102]:
gold_env_filtered_biosamples_with_inferred

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
1,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
2,Gb0050973,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classificaiton Distrophic oxisol,,Soils collected from different sites within th...,-29.539671,-55.107556,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
3,Gb0050974,Soil ecosystem from different sites within th...,410658.0,soil metagenome,"soil classification euic, hyperthermic lithic ...",,Soils collected from different sites within th...,26.663199,-80.628500,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
4,Gb0051017,Mammuthus primigenius fossil ecosystem from Bo...,444079.0,fossil metagenome,"Bolshaya Kolopatkaya river, Russia",,Russia: Sakha Republic,70.000000,151.000000,4418,...,GOLDTERMS:4418,GOLDTERMS:4418,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15362,Gb0405261,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
15363,Gb0405262,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
15364,Gb0405263,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
15365,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212


# GOLDTERMS only approach

In [103]:
# Run extension_query (defined elsewhere in the notebook) against the goldterms
# database; the result is a long subject / predicate / content table.
goldterms_result = pd.read_sql_query(sql=extension_query, con=goldterms_conn)

In [104]:
goldterms_result

Unnamed: 0,subject,predicate,content
0,GOLDTERMS:Environmental-Terrestrial-Soil-Arable,mixs:env_broad,ENVO:00000446
1,GOLDTERMS:Environmental-Terrestrial-Soil-Arable,mixs:env_local,ENVO:01001177
2,GOLDTERMS:Environmental-Terrestrial-Soil-Arable,mixs:env_medium,ENVO:00005742
3,GOLDTERMS:Environmental-Terrestrial-Soil-Arable,mixs:mixs_extension,mixs:Soil
4,GOLDTERMS:Environmental-Terrestrial-Soil-Arable,rdfs:label,Environmental > Terrestrial > Soil > Arable
...,...,...,...
403,GOLDTERMS:5823,mixs:env_broad,ENVO:00000446
404,GOLDTERMS:5823,mixs:env_medium,ENVO:00005802
405,GOLDTERMS:5823,mixs:mixs_extension,mixs:Soil
406,GOLDTERMS:5823,mixs:other,ENVO:03600045


In [105]:
# # todo: save this kind of content before subsetting on an environment
# #   the subsetting is currently baked into the query
# 
# # see also goldterms_queries.ipynb in MAM's Collab
# goldterms_result.to_csv("goldterms_single_environment_mappings_long.tsv", sep="\t", index=False)

In [106]:
# Keep the 'content' values of the rows whose predicate is the selected context slot.
goldterms_only_curies = goldterms_result['content'][
    goldterms_result['predicate'].eq(gold_context_selector)
]


In [107]:
# Reduce to a plain list of distinct values, in order of first appearance.
goldterms_only_curies = list(goldterms_only_curies.drop_duplicates())

In [108]:
goldterms_only_curies

['ENVO:00000446',
 'ENVO:01000177',
 'ENVO:00000077',
 'ENVO:01000250',
 'ENVO:01001209',
 'ENVO:00000078',
 'ENVO:01001835',
 'ENVO:00000232']

----

In [109]:
# CURIE lists that determine which rows go into the sheet and how each row is flagged.
anchor_curies = anchor_descendants_frame['curie'].tolist()
legacy_pv_curies = [valid_pv['curie'] for valid_pv in pv_validation_results['valids']]


def _is_a_descendants(root_curie):
    """Return the adapter's IS_A descendants of `root_curie` as a list."""
    return list(envo_adapter.descendants(root_curie, predicates=[IS_A]))


biome_curies = _is_a_descendants(BIOME)
terrestrial_biome_curies = _is_a_descendants(TERRESTRIAL_BIOME)
aquatic_biome_curies = _is_a_descendants(AQUATIC_BIOME)
abp_curies = _is_a_descendants(ABP)
env_sys_curies = _is_a_descendants(ENVIRONMENTAL_SYSTEM)
env_mat_curies = _is_a_descendants(ENVIRONMENTAL_MATERIAL)

# Every term the adapter reports as obsolete
obsoletes_curies = list(envo_adapter.obsoletes())

In [110]:
# Accumulates every CURIE that should get a row in the voting sheet; the
# following cells add candidates from each source.
include_in_rows = set()

In [111]:
# Source 1: the CURIEs listed in anchor_descendants_frame
include_in_rows |= set(anchor_curies)

In [112]:
# Source 2: CURIEs of the validated legacy permissible values
include_in_rows |= set(legacy_pv_curies)

In [113]:
# Source 3: values of the selected context column in the NMDC biosample frame
include_in_rows |= set(nmdc_biosample_contexts_frame[nmdc_context_selector])

In [114]:
# Source 4: CURIEs extracted from the NCBI values
include_in_rows |= set(ncbi_frame['extracted_curie'])

In [115]:
# Source 5: the longest-annotation CURIEs for the NCBI values
include_in_rows |= set(ncbi_frame['longest_annotation_curie'])

In [116]:
# Source 6: the mapped object CURIEs from the GOLD biosample/goldterms merge
include_in_rows |= set(gold_env_filtered_biosamples_with_inferred['object'])

In [117]:
# List of per-CURIE row dicts, filled by the loop in the next cell.
# NOTE: re-running only the loop cell without re-running this one appends duplicate rows.
rows_lod = []

In [118]:
# Build one voting-sheet row (a dict) per candidate CURIE and append it to rows_lod.
#
# The reference collections never change inside the loop, so they are turned
# into sets once up front: membership tests become hash lookups instead of list
# scans, and set(legacy_pv_curies) is no longer rebuilt twice per CURIE.
legacy_pv_curie_set = set(legacy_pv_curies)
obsolete_curie_set = set(obsoletes_curies)
goldterms_only_curie_set = set(goldterms_only_curies)

# flag column name -> CURIEs for which that flag is True (in output column order)
class_membership_sets = {
    'abp': set(abp_curies),
    'env_sys': set(env_sys_curies),
    'biome': set(biome_curies),
    'terrestrial_biome': set(terrestrial_biome_curies),
    'aquatic_biome': set(aquatic_biome_curies),
    'env_mat': set(env_mat_curies),
}

for curie in include_in_rows:
    if curie is None:
        continue

    # ONCE AGAIN, assuming that EnvO is the only ontology we'll check against
    current_ancestors = list(envo_adapter.ancestors(curie, predicates=[IS_A]))  # vs legacy_pv_curies
    ancestors_in_enum_count = len(legacy_pv_curie_set.intersection(current_ancestors))

    current_descendants = list(envo_adapter.descendants(curie, predicates=[IS_A]))  # vs legacy_pv_curies
    descendants_in_enum_count = len(legacy_pv_curie_set.intersection(current_descendants))

    label = envo_adapter.label(curie)

    # A CURIE counts as EnvO-native only if it splits into exactly prefix:local_id,
    # the prefix is ENVO, and the adapter knows a label for it.
    envo_native = False
    try:
        prefix, local_id = curie.split(':')
        envo_native = prefix == 'ENVO' and label is not None
    except (AttributeError, TypeError, ValueError) as e:
        # Non-string values, or strings without exactly one colon, end up here;
        # the row is still emitted with envo_native left False.
        print(f"An error occurred: {e} trying to split {curie}")

    # Key insertion order determines the column order of the resulting DataFrame.
    row = {
        'curie': curie,
        'label': label,
        'envo_native': envo_native,
        'obsolete': curie in obsolete_curie_set,
        comparison_enum_column_name: curie in legacy_pv_curie_set,
        'ancestors_in_enum_count': ancestors_in_enum_count,
        'descendants_in_enum_count': descendants_in_enum_count,
    }
    for flag_column, member_curies in class_membership_sets.items():
        row[flag_column] = curie in member_curies
    row['goldterms_mappings'] = curie in goldterms_only_curie_set

    rows_lod.append(row)

# 2 minutes


In [119]:
# One DataFrame row per dict in rows_lod; columns follow the dict key order.
rows_frame = pd.DataFrame(rows_lod)

In [120]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,EnvBroadScaleSoilEnum_10_7,ancestors_in_enum_count,descendants_in_enum_count,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,goldterms_mappings
0,ENVO:00002297,obsolete environmental feature,True,True,False,0,0,False,False,False,False,False,False,False
1,ENVO:0000472,,False,False,False,0,0,False,False,False,False,False,False,False
2,ENVO:01000210,tropical coniferous forest biome,True,False,False,2,0,True,True,True,True,False,False,False
3,ENVO:01000250,subpolar coniferous forest biome,True,False,False,2,0,True,True,True,True,False,False,True
4,ENVO:00002078,,False,False,False,0,0,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1575,ENVO:0000436,,False,False,False,0,0,False,False,False,False,False,False,False
1576,ENVO:01001790,terrestrial ecosystem,True,False,False,0,26,True,True,False,False,False,False,False
1577,ENVO:00005950,,False,False,False,0,0,False,False,False,False,False,False,False
1578,ENVO:01000245,cropland biome,True,False,False,2,0,True,True,True,True,False,False,False


In [121]:
# Count how many NMDC biosample rows carry each value of the selected context
# column, as a two-column frame (curie, nmdc_scoped_count).
nmdc_biosample_scoped_counts = (
    nmdc_biosample_contexts_frame[nmdc_context_selector]
    .value_counts()
    .rename_axis('curie')
    .reset_index(name='nmdc_scoped_count')
)


In [122]:
nmdc_biosample_scoped_counts

Unnamed: 0,curie,nmdc_scoped_count
0,ENVO:00000446,5400
1,ENVO:01000253,260
2,ENVO:01000174,234
3,ENVO:01000177,113
4,ENVO:01000250,31
5,ENVO:01000221,27
6,ENVO:01000219,22
7,ENVO:01001837,18
8,ENVO:01000249,15
9,ENVO:01000245,9


In [123]:
# Attach the NMDC counts to the sheet rows. A left join keeps every sheet row;
# CURIEs with no NMDC count get a missing value.
rows_frame = pd.merge(
    rows_frame,
    nmdc_biosample_scoped_counts,
    on='curie',
    how='left',
)

In [124]:
gold_env_filtered_biosamples_with_inferred

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
1,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
2,Gb0050973,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classificaiton Distrophic oxisol,,Soils collected from different sites within th...,-29.539671,-55.107556,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
3,Gb0050974,Soil ecosystem from different sites within th...,410658.0,soil metagenome,"soil classification euic, hyperthermic lithic ...",,Soils collected from different sites within th...,26.663199,-80.628500,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
4,Gb0051017,Mammuthus primigenius fossil ecosystem from Bo...,444079.0,fossil metagenome,"Bolshaya Kolopatkaya river, Russia",,Russia: Sakha Republic,70.000000,151.000000,4418,...,GOLDTERMS:4418,GOLDTERMS:4418,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15362,Gb0405261,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
15363,Gb0405262,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
15364,Gb0405263,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
15365,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212


In [125]:
# Count how many merged GOLD biosample rows map to each object CURIE, as a
# two-column frame (curie, gold_hybrid_count).
gold_biosample_scoped_counts = (
    gold_env_filtered_biosamples_with_inferred['object']
    .value_counts()
    .rename_axis('curie')
    .reset_index(name='gold_hybrid_count')
)

In [126]:
gold_biosample_scoped_counts

Unnamed: 0,curie,gold_hybrid_count
0,ENVO:00000446,10862
1,ENVO:00000077,1609
2,ENVO:01001209,1527
3,ENVO:01000177,947
4,ENVO:00000078,205
5,ENVO:00000232,159
6,ENVO:01001835,46
7,ENVO:01000250,12


In [127]:
# Attach the GOLD hybrid counts to the sheet rows. A left join keeps every sheet
# row; CURIEs with no GOLD count get a missing value.
rows_frame = pd.merge(
    rows_frame,
    gold_biosample_scoped_counts,
    on='curie',
    how='left',
)

In [128]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,EnvBroadScaleSoilEnum_10_7,ancestors_in_enum_count,descendants_in_enum_count,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,goldterms_mappings,nmdc_scoped_count,gold_hybrid_count
0,ENVO:00002297,obsolete environmental feature,True,True,False,0,0,False,False,False,False,False,False,False,,
1,ENVO:0000472,,False,False,False,0,0,False,False,False,False,False,False,False,,
2,ENVO:01000210,tropical coniferous forest biome,True,False,False,2,0,True,True,True,True,False,False,False,,
3,ENVO:01000250,subpolar coniferous forest biome,True,False,False,2,0,True,True,True,True,False,False,True,31.0,12.0
4,ENVO:00002078,,False,False,False,0,0,False,False,False,False,False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1575,ENVO:0000436,,False,False,False,0,0,False,False,False,False,False,False,False,,
1576,ENVO:01001790,terrestrial ecosystem,True,False,False,0,26,True,True,False,False,False,False,False,,
1577,ENVO:00005950,,False,False,False,0,0,False,False,False,False,False,False,False,,
1578,ENVO:01000245,cropland biome,True,False,False,2,0,True,True,True,True,False,False,False,9.0,


In [129]:
# 990 rows in https://docs.google.com/spreadsheets/d/12WH3eduBq2qSTy9zVF3n7fyajn6ssLZL/edit?gid=546570706#gid=546570706

In [130]:
# GOLD and NCBI counts are slightly trickier.
# GOLD: the sheet currently has a mappings-only flag (goldterms_mappings) plus a "hybrid" count
#    (goldterms mappings joined to GOLD biosamples).
#    Switch to direct biosample counts of GOLD "envo" annotations?
# NCBI: there are two CURIE sources per value: extracted CURIEs and annotated CURIEs.

In [131]:
# todo move this stuff up to immediately after the creation of ncbi_frame ?

# todo don't accept extracted curie if no real label?
# any kind of string similarity checking for label of annotated curie vs extracted label ?
# look for long runs of curies?
# can we measure the beneficial impact of any of this? current crux: how to distribute counts

# For each NCBI value, collect its distinct CURIEs from the two sources.
# dict.fromkeys de-duplicates while keeping a fixed order (extracted CURIE first,
# then the annotation CURIE), so the lists are reproducible from run to run;
# building them from a set would leave the order to string hashing.
# pd.notna drops both None and NaN placeholders for "no CURIE".
ncbi_frame['curie_list'] = pd.Series(
    [
        [curie for curie in dict.fromkeys((extracted_curie, annotated_curie)) if pd.notna(curie)]
        for extracted_curie, annotated_curie in zip(
            ncbi_frame['extracted_curie'],
            ncbi_frame['longest_annotation_curie'],
        )
    ],
    index=ncbi_frame.index,
    dtype=object,
)

# 0 = no CURIE at all, 1 = one CURIE (or both sources agree), 2 = the sources disagree
ncbi_frame['unique_curie_count'] = ncbi_frame['curie_list'].apply(len)

In [132]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count
0,1,soil biome [ENVO:01001044],16533,soil biome [ENVO:01001044],1,1,soil biome,ENVO:01001044,soil environment,ENVO:00000428,biome,"[ENVO:00000428, ENVO:01001044]",2
1,2,missing,12170,missing,1,0,missing,,,,,[],0
2,3,soil,10370,soil,1,0,soil,,,ENVO:00001998,soil,[ENVO:00001998],1
3,4,not applicable,9683,not applicable,1,0,not applicable,,,,,[],0
4,5,not collected,6404,not collected,1,0,not collected,,,,,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7325,7187,peat biome after methane incubation,1,peat biome after methane incubation,1,0,peat biome after methane incubation,,,CHEBI:16183,methane,[CHEBI:16183],1
7326,7188,Chernozemic soil,1,Chernozemic soil,1,0,Chernozemic soil,,,ENVO:00002237,chernozem,[ENVO:00002237],1
7327,7189,marine supra-littoral zone,1,marine supra-littoral zone,1,0,marine supra-littoral zone,,,ENVO:01000124,marine supra-littoral zone,[ENVO:01000124],1
7328,7190,soil microcosm,1,soil microcosm,1,0,soil microcosm,,,ENVO:01000621,microcosm,[ENVO:01000621],1


In [133]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    4215
0    2465
2     650
Name: count, dtype: int64

In [134]:
# Rows where the extracted CURIE and the annotation CURIE disagree
double_curie_frame = ncbi_frame.loc[ncbi_frame['unique_curie_count'].gt(1)]

In [135]:
# Only the two disagreeing CURIE columns are needed from here on
double_curie_frame = double_curie_frame.loc[:, ['extracted_curie', 'longest_annotation_curie']]

In [136]:
# Keep one row per distinct (extracted_curie, longest_annotation_curie) pair.
double_curie_frame = double_curie_frame.drop_duplicates()

In [137]:
# Split each extracted CURIE into prefix and local id at the first colon only.
# n=1 caps the result at two columns; without it, a single value containing a
# second colon would produce a third column and this two-column assignment
# would raise a ValueError.
double_curie_frame[['extracted_prefix', 'extracted_local_id']] = (
    double_curie_frame['extracted_curie'].str.split(':', n=1, expand=True)
)

In [138]:
# Numeric form of the local id. Non-numeric ids are coerced to missing, and the
# nullable Int64 dtype keeps the column integer-typed despite those gaps.
double_curie_frame['extracted_local_id_int'] = (
    pd.to_numeric(double_curie_frame['extracted_local_id'], errors='coerce')
    .astype('Int64')
)

In [139]:
# Distinct, non-missing numeric local ids in ascending order
unique_sorted_series = (
    double_curie_frame['extracted_local_id_int']
    .drop_duplicates()
    .dropna()
    .sort_values()
)


In [140]:
# Find stretches of consecutive local ids in the sorted, de-duplicated series.
# find_consecutive_stretches_dict is not defined in this section; presumably it
# comes from `from common import *` (TODO confirm).
stretches_dict = find_consecutive_stretches_dict(unique_sorted_series)

# pprint.pprint(stretches_dict)

In [141]:
# Convert the stretches dictionary into a long DataFrame; per the output shown
# below it has one row per (stretch_id, value) pair.
# stretches_dict_to_long_dataframe is not defined in this section (presumably from `common`).
stretches_df = stretches_dict_to_long_dataframe(stretches_dict)

In [142]:
stretches_df

Unnamed: 0,stretch_id,value
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
454,10,1000271
455,10,1000272
456,10,1000273
457,10,1000274


In [143]:
# Tag each disagreeing pair with the stretch its extracted local id belongs to.
# A left join keeps every pair; ids outside any stretch get a missing stretch_id.
double_curie_frame = pd.merge(
    double_curie_frame,
    stretches_df,
    how='left',
    left_on='extracted_local_id_int',
    right_on='value',
)

In [144]:
# Summarize each stretch; per the output shown below, the result has one row per
# stretch_id with most_common_longest_annotation_curie and fraction columns.
# summarize_stretch_groups is not defined in this section (presumably from `common`).
stretch_summary_df = summarize_stretch_groups(double_curie_frame)


In [145]:
stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:01001811,0.931034
1,2.0,ENVO:00000428,1.0
2,3.0,ENVO:00000428,1.0
3,4.0,ENVO:00000446,0.941176
4,5.0,ENVO:00000428,1.0
5,6.0,ENVO:00002044,1.0
6,7.0,ENVO:00005801,0.6
7,8.0,PCO:1000004,0.923077
8,9.0,ENVO:01000020,1.0
9,10.0,ENVO:00000428,0.916667


In [146]:
# Attach the per-stretch summary columns to every pair that has a stretch_id.
double_curie_frame = pd.merge(
    double_curie_frame,
    stretch_summary_df,
    on='stretch_id',
    how='left',
)

In [147]:
# Pairs whose extracted local id falls inside a consecutive stretch are treated
# as evidence of auto-incremented ("dragged") CURIEs.
#
# Selecting rows and columns in one .loc call and adding the flag with .assign
# builds a fresh frame, which avoids the SettingWithCopyWarning that assigning a
# column to a chained slice triggers. drop_duplicates guarantees one row per
# key pair, so the later left merge into ncbi_frame cannot multiply its rows.
drag_evidence_frame = (
    double_curie_frame
    .loc[
        double_curie_frame['stretch_id'] >= 1,
        ['extracted_curie', 'longest_annotation_curie'],
    ]
    .drop_duplicates()
    .assign(drag_evidence=True)
)

In [148]:
drag_evidence_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,drag_evidence
2,ENVO:01000250,ENVO:00000109,True
8,ENVO:01000245,ENVO:00000011,True
17,ENVO:00000053,ENVO:00001998,True
18,ENVO:01000002,ENVO:00000428,True
21,ENVO:01000238,ENVO:01000339,True
...,...,...,...
592,ENVO:01000258,ENVO:00000428,True
593,ENVO:01000262,ENVO:00000428,True
594,ENVO:00002058,ENVO:00002044,True
595,ENVO:00002060,ENVO:00002044,True


In [149]:
# Flag the NCBI rows whose (extracted, annotation) CURIE pair showed drag
# evidence; all other rows get a missing drag_evidence value.
# NOTE: re-running this cell on its own merges again and adds suffixed duplicate columns.
ncbi_frame = pd.merge(
    ncbi_frame,
    drag_evidence_frame,
    on=['extracted_curie', 'longest_annotation_curie'],
    how='left',
)

In [150]:
# Build dragless_curie_list: the same as curie_list, except that rows flagged
# with drag evidence keep only their annotation CURIE (or nothing when that
# CURIE is None).
ncbi_frame["dragless_curie_list"] = pd.Series(
    [
        ([annotated_curie] if annotated_curie is not None else [])
        if drag_flag is True
        else all_curies
        for all_curies, drag_flag, annotated_curie in zip(
            ncbi_frame["curie_list"],
            ncbi_frame["drag_evidence"],
            ncbi_frame["longest_annotation_curie"],
        )
    ],
    index=ncbi_frame.index,
    dtype=object,
)

# Number of CURIEs left per row once dragged CURIEs are discarded
ncbi_frame['dragless_curie_count'] = ncbi_frame['dragless_curie_list'].map(len)

In [151]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    4215
0    2465
2     650
Name: count, dtype: int64

In [152]:
ncbi_frame['dragless_curie_count'].value_counts()

dragless_curie_count
1    4758
0    2465
2     107
Name: count, dtype: int64

In [153]:
ncbi_frame.shape

(7330, 16)

In [154]:
# Rows left with at most one CURIE need no further disambiguation
ncbi_frame_undisputed = ncbi_frame.loc[ncbi_frame['dragless_curie_count'].le(1)]

In [155]:
ncbi_frame_undisputed.shape

(7223, 16)

In [156]:
# Rows that still carry more than one CURIE after the drag filter
ncbi_frame_disputed = ncbi_frame.loc[ncbi_frame['dragless_curie_count'].gt(1)]

In [157]:
ncbi_frame_disputed.shape

(107, 16)

In [158]:
# Turn each disputed row into one row per CURIE in its dragless_curie_list
# (renumbering the index). Every other column, including sample_count, is
# repeated on each exploded row, so the later per-CURIE sum credits a disputed
# row's full sample_count to each of its CURIEs.
ncbi_frame_disputed = ncbi_frame_disputed.explode("dragless_curie_list", ignore_index=True)


In [159]:
ncbi_frame_disputed.shape

(214, 16)

In [160]:
# explode left bare CURIE values in the column; wrap each back into a one-item
# list so the column has the same shape as in ncbi_frame_undisputed.
ncbi_frame_disputed["dragless_curie_list"] = ncbi_frame_disputed["dragless_curie_list"].map(
    lambda single_curie: [single_curie]
)

In [161]:
# Stack the undisputed rows and the exploded disputed rows into one frame with a fresh index
ncbi_disputes_exploded_frame = pd.concat(
    objs=(ncbi_frame_undisputed, ncbi_frame_disputed),
    ignore_index=True,
)


In [162]:
ncbi_disputes_exploded_frame.shape

(7437, 16)

In [163]:
ncbi_disputes_exploded_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count,drag_evidence,dragless_curie_list,dragless_curie_count
0,2,missing,12170,missing,1,0,missing,,,,,[],0,,[],0
1,3,soil,10370,soil,1,0,soil,,,ENVO:00001998,soil,[ENVO:00001998],1,,[ENVO:00001998],1
2,4,not applicable,9683,not applicable,1,0,not applicable,,,,,[],0,,[],0
3,5,not collected,6404,not collected,1,0,not collected,,,,,[],0,,[],0
4,6,forest biome,6373,forest biome,1,0,forest biome,,,ENVO:01000174,forest biome,[ENVO:01000174],1,,[ENVO:01000174],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7432,7156,plain biome [ENVO:00000254]|agricultural biome...,1,plain biome [ENVO:00000254],2,1,plain biome,ENVO:00000254,till plain,ENVO:00000086,plain,"[ENVO:00000254, ENVO:00000086]",2,,[ENVO:00000086],2
7433,7157,plain biome [ENVO:00000254]|agricultural biome...,1,plain biome [ENVO:00000254],2,1,plain biome,ENVO:00000254,till plain,ENVO:00000086,plain,"[ENVO:00000254, ENVO:00000086]",2,,[ENVO:00000254],2
7434,7157,plain biome [ENVO:00000254]|agricultural biome...,1,plain biome [ENVO:00000254],2,1,plain biome,ENVO:00000254,till plain,ENVO:00000086,plain,"[ENVO:00000254, ENVO:00000086]",2,,[ENVO:00000086],2
7435,7158,plain biome [ENVO:00000254]|agricultural biome...,1,plain biome [ENVO:00000254],2,1,plain biome,ENVO:00000254,till plain,ENVO:00000086,plain,"[ENVO:00000254, ENVO:00000086]",2,,[ENVO:00000254],2


In [164]:
# List length after exploding, recorded so the next cell can check that no row
# holds more than one CURIE any more.
ncbi_disputes_exploded_frame['post_explode_curie_count'] = (
    ncbi_disputes_exploded_frame['dragless_curie_list'].map(len)
)

In [165]:
ncbi_disputes_exploded_frame['post_explode_curie_count'].value_counts()

post_explode_curie_count
1    4972
0    2465
Name: count, dtype: int64

In [166]:
# Unwrap each list to its first CURIE; empty lists and non-list values become None
ncbi_disputes_exploded_frame["post_explode_curie"] = ncbi_disputes_exploded_frame["dragless_curie_list"].map(
    lambda curies: None if not isinstance(curies, list) or len(curies) == 0 else curies[0]
)

In [167]:

# Total sample_count per CURIE, as a two-column frame (curie, ncbi_scoped_count).
# Rows whose post_explode_curie is missing are left out by groupby.
ncbi_biosample_scoped_counts = (
    ncbi_disputes_exploded_frame
    .groupby("post_explode_curie")["sample_count"]
    .sum()
    .rename_axis('curie')
    .reset_index(name='ncbi_scoped_count')
)

In [168]:
ncbi_biosample_scoped_counts

Unnamed: 0,curie,ncbi_scoped_count
0,BFO:0000029,109
1,CHEBI:140503,1
2,CHEBI:15377,44
3,CHEBI:16183,1
4,CHEBI:22695,4
...,...,...
1168,RO:0001000,30
1169,RO:0002577,119
1170,UBERON:0001988,30
1171,UBERON:0002416,7


In [169]:
# Attach the NCBI counts to the sheet rows. A left join keeps every sheet row;
# CURIEs with no NCBI count get a missing value.
rows_frame = pd.merge(
    rows_frame,
    ncbi_biosample_scoped_counts,
    on='curie',
    how='left',
)

In [170]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,EnvBroadScaleSoilEnum_10_7,ancestors_in_enum_count,descendants_in_enum_count,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,goldterms_mappings,nmdc_scoped_count,gold_hybrid_count,ncbi_scoped_count
0,ENVO:00002297,obsolete environmental feature,True,True,False,0,0,False,False,False,False,False,False,False,,,1.0
1,ENVO:0000472,,False,False,False,0,0,False,False,False,False,False,False,False,,,3.0
2,ENVO:01000210,tropical coniferous forest biome,True,False,False,2,0,True,True,True,True,False,False,False,,,
3,ENVO:01000250,subpolar coniferous forest biome,True,False,False,2,0,True,True,True,True,False,False,True,31.0,12.0,2953.0
4,ENVO:00002078,,False,False,False,0,0,False,False,False,False,False,False,False,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1575,ENVO:0000436,,False,False,False,0,0,False,False,False,False,False,False,False,,,3.0
1576,ENVO:01001790,terrestrial ecosystem,True,False,False,0,26,True,True,False,False,False,False,False,,,562.0
1577,ENVO:00005950,,False,False,False,0,0,False,False,False,False,False,False,False,,,
1578,ENVO:01000245,cropland biome,True,False,False,2,0,True,True,True,True,False,False,False,9.0,,3120.0


In [171]:
# Write the finished voting sheet as a TSV without the index column.
# output_file_name includes a directory component; create that directory first
# so a fresh checkout does not fail here after the long run above.
output_dir = os.path.dirname(output_file_name)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

rows_frame.to_csv(output_file_name, sep="\t", index=False)