In [1]:
from common import *

import gzip
import os
import shutil
import sqlite3
from urllib.parse import urlparse

import duckdb
import pandas as pd
import requests

from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from oaklib.utilities.lexical.lexical_indexer import create_lexical_index, save_lexical_index, load_lexical_index
from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface
from oaklib.implementations import AggregatorImplementation

from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import yaml
import json


## todos
- [ ] cache goldterms.db
- [ ] cache goldterms.db **in notebooks/environmental_context_value_sets** with other cached files

In [2]:

selected_configuration = "plant_associated_env_local_scale"

In [3]:
configurations_file = "voting_sheets_configurations.yaml"

In [4]:
# these are the variables that will be set by the configuration

gold_context_selectors = None
ncbi_context_selector = None
ncbi_package_selector = None
first_where = None
gold_first_where = None
semantic_anchors = None
previous_submission_schema_url = None
CONTEXT_ENUM = None
nmdc_package_selector = None
nmdc_context_selector = None
comparison_enum_column_name = None
output_file_name = None


In [5]:
# Load configurations from YAML file
with open(configurations_file, "r") as file:
    configurations = yaml.safe_load(file)["configurations"]


# Function to apply a configuration
def apply_configuration(config_name):
    config = configurations.get(config_name)
    if not config:
        raise ValueError(f"Configuration '{config_name}' not found.")

    # Update global variables
    globals().update(config)
    print(f"Configuration '{config_name}' applied.")


In [6]:
apply_configuration(selected_configuration)

# Now the global variables like `output_file_name`, `semantic_anchor`, etc., are set.

Configuration 'plant_associated_env_local_scale' applied.


In [7]:
gold_first_where = first_where

In [8]:
# Initialize cache dictionaries for predict_from_normalized_env_packages
# todo how to move the definitions for function that use these globals? Or just use caching around the function?
ancestor_cache = {}
descendant_cache = {}

# Preliminary label, ancestor and descendant caching

In [9]:
# todo this on-demand NCBI curie extraction and annotation recapitulates work that is being added to
# https://portal.nersc.gov/project/m3408/biosamples_duckdb/
# via ???
#   although that doesn't detect auto-incremented curies from  spreadsheet dragging

# todo eventually, dig up a complete JSON gold biosample dump for non-hybrid biosample counts

# todo make it clearer whether biosamples or studies are being counted
#   count nmdc or gold STUDIES too?

# Additional Settings

In [10]:
# Approved prefixes (case-insensitive)
approved_prefixes = ['ENVO']

In [11]:
MIN_ANNOTATION_LEN = 3

In [12]:
NMDC_RUNTIME_BASE_URL = 'https://api.microbiomedata.org/nmdcschema/'
STUDY_SET_COLLECTION = 'study_set'
BIOSAMPLE_SET_COLLECTION = 'biosample_set'

In [13]:
envo_adapter_string = "sqlite:obo:envo"

In [14]:
po_adapter_string = "sqlite:obo:po"

In [15]:
env_package_override_file = 'mam-env-package-overrides.tsv'
override_column = 'mam_inferred_env_package'

In [16]:
ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/old/ncbi_biosamples_2024-09-23.duckdb.gz'
# ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples.duckdb.gz'

In [17]:
gold_data_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
gold_data_file_name = "goldData.xlsx"  # goldData.xlsx: Microsoft Excel 2007+
gold_csv_file_name = "gold_biosamples.csv"
BIOSAMPLES_SHEET = "Biosample"

In [18]:
goldterms_semsql_url = "https://s3.amazonaws.com/bbop-sqlite/goldterms.db.gz"

In [19]:
all_nmdc_biosamples_file = 'all_nmdc_biosamples.json'

# CURIe Constants

In [20]:
BIOME = 'ENVO:00000428'
TERRESTRIAL_BIOME = 'ENVO:00000446'
AQUATIC_BIOME = 'ENVO:00002030'
ABP = 'ENVO:01000813'
ENVIRONMENTAL_SYSTEM = 'ENVO:01000254'
ENVIRONMENTAL_MATERIAL = 'ENVO:00010483'

SOIL = 'ENVO:00001998'
LIQUID_WATER = 'ENVO:00002006'
WATER_ICE = 'ENVO:01000277'

HUMAN_CONSTRUCTION = 'ENVO:00000070'
BUILDING = 'ENVO:00000073'
BUILDING_PART = 'ENVO:01000420'

SNOW = 'ENVO:01000406'
MASS_OF_ENVIRONMENTAL_MATERIAL = 'ENVO:01001686'

PLANT_STRUCTURE = "PO:0009011" # does not include plant anatomical space or portion of plant substance (from plant anatomical entity)

MARINE_BIOME = 'ENVO:00000447'

VEGETATION_LAYER = "ENVO:01000355"


# Settings-based Queries

In [21]:
# todo could this have been done with a OAK query, eliminating the need to explicitly download the file?

goldterms_envo_query = f"""
SELECT
	*
FROM
	statements s
WHERE
	predicate in ('{"', '".join(gold_context_selectors)}')"""

In [22]:
ncbi_biosamples_per_annotation_query = f"""
SELECT content, COUNT(1) AS count 
FROM attributes 
WHERE harmonized_name = '{ncbi_context_selector}' AND package_content like '%{ncbi_package_selector}'
GROUP BY content
ORDER BY COUNT(1) DESC
"""

In [23]:
ncbi_bioprojects_per_annotation_query = f"""
SELECT
	a.content,
	count(DISTINCT l.content) AS count
FROM
	main.ATTRIBUTES a
JOIN main.links l 
	ON
	a.id = l.id
WHERE
	l.target = 'bioproject'
	AND harmonized_name = 'env_local_scale'
	AND package_content like '%{ncbi_package_selector}'
GROUP BY
	a.content
ORDER BY
	count(DISTINCT l.content) DESC ;
"""

In [24]:
ncbi_X_per_annotation_query = ncbi_bioprojects_per_annotation_query

In [25]:
# and s1.subject = s1.stanza eliminates matches on blank node annotation rows (probably wouldn't change results but adds a little overhead)

extension_query = f"""
select
		s1.subject ,
		s2.predicate,
		COALESCE (s2."object",
	s2."value") as content
from
	statements s1
join statements s2 on 
	s1.subject = s2.subject
where
	{gold_first_where}
	and s1.predicate = 'rdfs:label'
	and s1.subject = s1.stanza
	and s2.predicate in ('mixs:env_broad', 'mixs:env_local', 'mixs:env_medium', 'mixs:mixs_extension', 'rdfs:label', 'mixs:other', 'mixs:anatomical_site', 'mixs:host_taxon') ;
"""


# Locally Defined Functions
_Currently using locally-defined cache dictionaries_

In [26]:
def predict_from_normalized_env_packages(df_raw, adapter):
    # Apply the function to the relevant columns

    df = df_raw.copy()

    print(df.shape)
    for column in ['env_broad_scale_id', 'env_local_scale_id', 'env_medium_id']:
        df[f'{column}_ancestors'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['ancestors'])
        df[f'{column}_descendants'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['descendants'])

    # Vectorize each set of terms separately
    broad_scale_ancestors = vectorize_terms(df, 'env_broad_scale_id_ancestors')
    broad_scale_descendants = vectorize_terms(df, 'env_broad_scale_id_descendants')

    local_scale_ancestors = vectorize_terms(df, 'env_local_scale_id_ancestors')
    local_scale_descendants = vectorize_terms(df, 'env_local_scale_id_descendants')

    medium_ancestors = vectorize_terms(df, 'env_medium_id_ancestors')
    medium_descendants = vectorize_terms(df, 'env_medium_id_descendants')

    # Combine all feature matrices
    X = hstack([
        broad_scale_ancestors,
        broad_scale_descendants,
        local_scale_ancestors,
        local_scale_descendants,
        medium_ancestors,
        medium_descendants
    ])

    # Filter the DataFrame to only include non-null rows for the target column
    df_filtered = df[df['normalized_env_package'].notnull() & (df['normalized_env_package'] != "")]

    # Extract the target variable
    y = df_filtered['normalized_env_package']

    # Ensure X corresponds to the filtered rows
    X_filtered = X[df_filtered.index]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.3, random_state=42)

    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # not determining confidence for each class nor saving any diagnostics any more

    return clf.predict(X)

In [27]:
def get_hierarchy_terms(my_curie: str, adapter) -> dict:
    """
    Extract ancestor and descendant terms from the ontology for a given CURIE,
    using caching to improve performance and filtering by 'is_a' relationships.

    Args:
        my_curie (str): CURIE identifier for the ontology term.
        adapter: Ontology adapter.

    Returns:
        dict: Dictionary containing lists of ancestor and descendant terms.
    """
    if my_curie not in ancestor_cache:
        try:
            ancestors = list(adapter.ancestors(my_curie, predicates=[IS_A]))
            ancestor_cache[my_curie] = [adapter.label(ancestor) for ancestor in ancestors if ancestor]
        except Exception as my_e:
            print(f"Error retrieving ancestors for {my_curie}: {my_e}")
            ancestor_cache[my_curie] = []

    if my_curie not in descendant_cache:
        try:
            descendants = list(adapter.descendants(my_curie, predicates=[IS_A]))
            descendant_cache[my_curie] = [adapter.label(descendant) for descendant in descendants if descendant]
        except Exception as my_e:
            print(f"Error retrieving descendants for {my_curie}: {my_e}")
            descendant_cache[my_curie] = []

    return {
        'ancestors': ancestor_cache[my_curie],
        'descendants': descendant_cache[my_curie],
    }

# Procedural Code Starts Here

In [28]:
# Determine the filenames and target directory for the NCBI DuckDB
ncbi_compressed_filename = urlparse(ncbi_duckdb_url).path.split('/')[-1]
ncbi_filename = os.path.splitext(ncbi_compressed_filename)[0]
ncbi_compressed_file_path = os.path.join(ncbi_compressed_filename)
ncbi_uncompressed_file_path = os.path.join(ncbi_filename)


In [29]:
if os.path.isfile(ncbi_uncompressed_file_path):
    print(f"{ncbi_uncompressed_file_path} is already present in the current working directory.")
else:
    if os.path.isfile(ncbi_compressed_file_path):
        print(f"{ncbi_compressed_file_path} is already present in the current working directory.")
    else:
        print(f"{ncbi_compressed_file_path} needs to be downloaded")
        ncbi_response = requests.get(ncbi_duckdb_url)
        with open(ncbi_compressed_file_path, "wb") as f:
            f.write(ncbi_response.content)
        # ~ 2 minutes @ 250 Mbps

    # Unzip the compressed file and save the extracted file in target directory
    print(f"{ncbi_compressed_file_path} needs to be unpacked")
    with gzip.open(ncbi_compressed_file_path, "rb") as f_in:
        with open(ncbi_uncompressed_file_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    # ~ 2 minutes

ncbi_biosamples_2024-09-23.duckdb is already present in the current working directory.


In [30]:
ncbi_conn = duckdb.connect(database=ncbi_uncompressed_file_path, read_only=True)

add po_adapter

should we be creating a list of adapters?

In [31]:
envo_adapter = get_adapter(envo_adapter_string)

In [32]:
po_adapter = get_adapter(po_adapter_string)

**Preliminary replacement for `get_hierarchy_terms` and ancestor/descendant lookup in main row construction loop**

In [33]:
reflexivity = False

# File paths
label_cache_file = "envo_label_cache.json"
ancestors_file = "envo_ancestors.json"
descendants_file = "envo_descendants.json"

# Initialize caches
envo_label_cache = {}
envo_ancs_cache = {}
envo_descs_cache = {}

envo_entities_generator = envo_adapter.entities()
envo_entities = list(envo_entities_generator)
envo_entities.sort()


# Function to generate and save the label cache
def generate_label_cache():
    print("Generating label cache...")

    label_cache = {}
    for ee in envo_entities:
        label_cache[ee] = envo_adapter.label(ee)
    return label_cache


# Check for label cache
if os.path.exists(label_cache_file):
    print(f"Loading label cache from {label_cache_file}...")
    with open(label_cache_file, "r") as f:
        envo_label_cache = json.load(f)
else:
    envo_label_cache = generate_label_cache()
    with open(label_cache_file, "w") as f:
        json.dump(envo_label_cache, f)
    print(f"Label cache saved to {label_cache_file}.")

# Check for ancestor cache
if os.path.exists(ancestors_file):
    print(f"Loading ancestors from {ancestors_file}...")
    with open(ancestors_file, "r") as f:
        envo_ancs_cache = json.load(f)
else:
    print("Ancestors file not found. Generating ancestor cache...")

    for ee in envo_entities:
        ancs_gen = envo_adapter.ancestors(ee, reflexive=reflexivity)
        envo_ancs_cache[ee] = [{a: envo_label_cache.get(a, None)} for a in ancs_gen]

    with open(ancestors_file, "w") as f:
        json.dump(envo_ancs_cache, f)
    print(f"Ancestors saved to {ancestors_file}.")

# Check for descendant cache
if os.path.exists(descendants_file):
    print(f"Loading descendants from {descendants_file}...")
    with open(descendants_file, "r") as f:
        envo_descs_cache = json.load(f)
else:
    # 5 minutes
    print("Descendants file not found. Generating descendant cache...")

    for ee in envo_entities:
        descs_gen = envo_adapter.descendants(ee, reflexive=reflexivity)
        envo_descs_cache[ee] = [{d: envo_label_cache.get(d, None)} for d in descs_gen]

    with open(descendants_file, "w") as f:
        json.dump(envo_descs_cache, f)
    print(f"Descendants saved to {descendants_file}.")


Loading label cache from envo_label_cache.json...
Loading ancestors from envo_ancestors.json...
Loading descendants from envo_descendants.json...


# Anchor aka bootstrapping classes

In [34]:
# anchor_descendants = get_curie_descendants_label_dict(semantic_anchor, [IS_A], envo_adapter)

anchor_descendants = {}
for semantic_anchor in semantic_anchors:  # Assuming semantic_anchors is a list
    anchor_descendants.update(get_curie_descendants_label_dict(semantic_anchor, [IS_A], envo_adapter))


In [35]:
anchor_descendants_lod = curie_descendants_label_dict_to_lod(anchor_descendants)

In [36]:
anchor_descendants_frame = curie_descendants_label_lod_to_df(anchor_descendants_lod)

In [37]:
anchor_descendants_frame

Unnamed: 0,curie,label
0,ENVO:00001999,marine water body
1,ENVO:01000188,tropical savanna biome
2,ENVO:00000487,paternoster lake
3,ENVO:01000860,temperate marine upwelling biome
4,ENVO:01000199,mediterranean forest biome
...,...,...
1737,ENVO:01000335,understory
1738,ENVO:01001242,canopy
1739,ENVO:01000338,litter layer
1740,ENVO:01000355,vegetation layer


# Classes from the reference enumeration

In [38]:
sv = get_schemaview_from_source(previous_submission_schema_url)

In [39]:
# todo break out slow steps into its own cell

try:
    CONTEXT_ENUM_def = sv.get_enum(CONTEXT_ENUM)
    context_pvs_keys = list(CONTEXT_ENUM_def.permissible_values.keys())
except AttributeError as e:
    # Handle the AttributeError
    print(f"An AttributeError occurred: {e}")
    context_pvs_keys = []


In [40]:
print(context_pvs_keys)

['active permafrost layer [ENVO:04000009]', 'alas [ENVO:00000438]', 'badland [ENVO:00000127]', 'beach [ENVO:00000091]', 'butte [ENVO:00000287]', 'caldera [ENVO:00000096]', 'campground [ENVO:01000935]', 'canyon [ENVO:00000169]', 'cave [ENVO:00000067]', 'channel [ENVO:03000117]', 'cirque [ENVO:00000155]', 'cliff [ENVO:00000087]', 'crater [ENVO:00000514]', 'crevasse [ENVO:00000320]', 'cryosphere [ENVO:03000143]', 'dam [ENVO:00000074]', 'desert [ENVO:01001357]', 'drainage basin [ENVO:00000291]', 'drumlin [ENVO:00000276]', 'dry lake [ENVO:00000277]', 'dune [ENVO:00000170]', 'endorheic basin [ENVO:00000551]', 'escarpment [ENVO:00000280]', 'esker [ENVO:00000282]', 'farm [ENVO:00000078]', 'fen [ENVO:00000232]', 'fjord [ENVO:00000039]', 'flood plain [ENVO:00000255]', 'frost heave [ENVO:01001568]', 'fumarole [ENVO:00000216]', 'garden [ENVO:00000011]', 'glacier [ENVO:00000133]', 'greenhouse [ENVO:03600087]', 'harbour [ENVO:00000463]', 'hill [ENVO:00000083]', 'hummock [ENVO:00000516]', 'isthmus [E

In [41]:
initially_parsed_context_pvs = parse_hierarchically_underscored_strings(context_pvs_keys)

In [42]:
deduped_context_pvs = dedupe_underscoreless_pvs(initially_parsed_context_pvs)

In [43]:
pv_validation_results = validate_curie_label_list_dict(deduped_context_pvs, envo_adapter, print_flag=True)

In [44]:
pv_validation_results

{'problems': [],
 'valids': [{'curie': 'ENVO:04000009', 'label': 'active permafrost layer'},
  {'curie': 'ENVO:00000438', 'label': 'alas'},
  {'curie': 'ENVO:00000127', 'label': 'badland'},
  {'curie': 'ENVO:00000091', 'label': 'beach'},
  {'curie': 'ENVO:00000287', 'label': 'butte'},
  {'curie': 'ENVO:00000096', 'label': 'caldera'},
  {'curie': 'ENVO:01000935', 'label': 'campground'},
  {'curie': 'ENVO:00000169', 'label': 'canyon'},
  {'curie': 'ENVO:00000067', 'label': 'cave'},
  {'curie': 'ENVO:03000117', 'label': 'channel'},
  {'curie': 'ENVO:00000155', 'label': 'cirque'},
  {'curie': 'ENVO:00000087', 'label': 'cliff'},
  {'curie': 'ENVO:00000514', 'label': 'crater'},
  {'curie': 'ENVO:00000320', 'label': 'crevasse'},
  {'curie': 'ENVO:03000143', 'label': 'cryosphere'},
  {'curie': 'ENVO:00000074', 'label': 'dam'},
  {'curie': 'ENVO:01001357', 'label': 'desert'},
  {'curie': 'ENVO:00000291', 'label': 'drainage basin'},
  {'curie': 'ENVO:00000276', 'label': 'drumlin'},
  {'curie': '

# Get the CURIEs used in NMDC Biosample annotations

In [45]:
if os.path.isfile(all_nmdc_biosamples_file):
    print(
        f"{all_nmdc_biosamples_file} is present in the current working directory and will be read into all_nmdc_biosamples.")
    # with open(all_nmdc_biosamples_file, 'r') as file:
    #     all_nmdc_biosamples = yaml.full_load(file)
    # read as json
    with open(all_nmdc_biosamples_file, 'r') as f:
        all_nmdc_biosamples = json.load(f)

else:
    print(f"All NMDC Biosamples need to be fetched and saved to {all_nmdc_biosamples_file}")
    all_nmdc_biosamples = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL,
                                                        BIOSAMPLE_SET_COLLECTION)
    # with open(all_nmdc_biosamples_file, 'w') as file:
    #     documents = yaml.dump(all_nmdc_biosamples, file)
    # save as json
    with open(all_nmdc_biosamples_file, 'w') as f:
        json.dump(all_nmdc_biosamples, f)

# this saves network traffic. could use JSON for faster performance. 
# 1 minute for network fetch and JSON write?!
# 1 minute for yaml read
# instantaneous for JSON read?

all_nmdc_biosamples.json is present in the current working directory and will be read into all_nmdc_biosamples.


## Prediction of env_package annotations 

In [46]:
# Specify the output file name
env_packages_file = "nmdc_biosample_asserted_normalized_and_inferred_env_package.tsv"

if os.path.exists(env_packages_file):
    # Load the DataFrame from the file if it exists
    print(f"Loading {env_packages_file} into nmdc_biosample_contexts_frame...")
    nmdc_biosample_contexts_frame = pd.read_csv(env_packages_file, sep='\t')
else:
    # File doesn't exist; generate the DataFrame
    print(f"{env_packages_file} not found. Predicting from asserted records and {env_package_override_file}...")

    # Load environment package overrides
    env_pacakge_overrides = tsv_to_dict_of_dicts(env_package_override_file, 'id')

    # Extract biosample contexts
    biosample_contexts_lod = biosamples_lod_context_extractor(
        all_nmdc_biosamples, envo_adapter,
        my_env_pacakge_overrides=env_pacakge_overrides
    )

    # Create the DataFrame
    nmdc_biosample_contexts_frame = pd.DataFrame(biosample_contexts_lod)

    # Print value counts for the 'normalized_env_package' column
    print("\n")
    print("Value counts for normalized_env_package column:")
    print(nmdc_biosample_contexts_frame['normalized_env_package'].value_counts(dropna=False))
    print("\n")

    # Generate package predictions
    package_predictions = predict_from_normalized_env_packages(nmdc_biosample_contexts_frame, envo_adapter)

    # Add predictions to the DataFrame
    nmdc_biosample_contexts_frame['predicted_env_package'] = package_predictions

    # Save the DataFrame to the file
    nmdc_biosample_contexts_frame.to_csv(env_packages_file, sep='\t', index=False)
    print(f"env_package predictions saved to {env_packages_file}")



nmdc_biosample_asserted_normalized_and_inferred_env_package.tsv not found. Predicting from asserted records and mam-env-package-overrides.tsv...


Value counts for normalized_env_package column:
normalized_env_package
None                                               5673
soil                                               1707
                                                    666
plant-associated                                    401
water                                               192
sediment                                            165
miscellaneous natural or artificial environment     140
host-associated                                      61
hydrocarbon resources-fluids_swabs                   23
Name: count, dtype: int64


(9028, 14)
                                                 precision    recall  f1-score   support

                                host-associated       1.00      1.00      1.00        28
             hydrocarbon resources-fluids_swabs       1.00  

In [47]:
nmdc_biosample_contexts_frame['predicted_env_package'].value_counts()

predicted_env_package
soil                                               6304
water                                              1637
plant-associated                                    512
sediment                                            165
host-associated                                     156
miscellaneous natural or artificial environment     140
hydrocarbon resources-fluids_swabs                  114
Name: count, dtype: int64

## env-package prediction complete

To-do: save this and don't recreate it if it's available

Then get it reviewed by other NMDC stakeholders and inject it into MongoDB if approved

## Destructively filter `nmdc_biosample_contexts_frame` by `env_package` 

In [48]:
nmdc_biosample_contexts_frame.shape

(9028, 15)

In [49]:
nmdc_biosample_contexts_frame = nmdc_biosample_contexts_frame[
    nmdc_biosample_contexts_frame['predicted_env_package'] == nmdc_package_selector]

In [50]:
nmdc_biosample_contexts_frame.shape

(512, 15)

# Long process of predicting OBO foundry CURIes from NCBI Biosamples

## Start by getting unique annotations? Pre-counted by Biosamples

Current task is to provide counts by "study" aka Bioproject in addition to Biosample counts or instead of Biosamples counts if necessary

In [51]:
ncbi_frame = ncbi_conn.execute(ncbi_X_per_annotation_query).fetchdf()


In [52]:
ncbi_frame.insert(0, 'serial_number', range(1, len(ncbi_frame) + 1))

In [53]:
# includes values with counts of one... useful for discovering drag-down submissions?

## MIxS and NCBI guidelines imply environmental context slots are multivalued
and that the pipe `|` should be used as a delimiter

there's an envo_count value below that indicates how ofter other delimiters might be used

In [54]:
ncbi_frame['content_list'] = ncbi_frame['content'].str.split('|')

In [55]:
# todo is there any reason to not do this ?
ncbi_frame = ncbi_frame[ncbi_frame['content'].notna() & (ncbi_frame['content'] != '')]

In [56]:
ncbi_frame['content_count'] = ncbi_frame['content_list'].apply(len)

In [57]:
ncbi_frame.shape

(5189, 5)

In [58]:
ncbi_frame = ncbi_frame.explode('content_list').reset_index(drop=True)

In [59]:
ncbi_frame.shape

(5307, 5)

## splitting adds ~ 5% more rows
which might be important since were currently using a longest annotation strategy here

In [60]:
# how many content_list strings contain envo multiple times now?

In [61]:
ncbi_frame['envo_count'] = ncbi_frame['content_list'].str.lower().str.count("envo")

In [62]:
ncbi_frame['envo_count'].value_counts()

envo_count
0    4759
1     531
3      15
2       1
7       1
Name: count, dtype: int64

## If my math is correct, about 0.1% of the annotations still contain multiple CURIes 
after splitting on pipes

There will also be annotations with multiple label-like strings that weren't split because they were not delimited on pipes
That might be a source of lost information since we are using a longest-match annotator here
I.e. there could be annotations with multiple hits worth keeping

## Parsing out CURIEs

this has a few limitations. The function only tries pre-specified prefixes (['ENVO'] by default) and only considers colons and underscores valid delimiters.

In [63]:
ncbi_frame[['extracted_label', 'extracted_curie']] = ncbi_frame['content_list'].apply(parse_curie_label)

In [64]:
parse_failures = ncbi_frame[
    (ncbi_frame['envo_count'] > 0) & (ncbi_frame['extracted_curie'].isna() | (ncbi_frame['extracted_curie'] == ''))]


## In what kinds of cases could no CURIe be parsed
despite the presence of "ENVO" in the content string?

In [65]:
parse_failures

Unnamed: 0,serial_number,content,count,content_list,content_count,envo_count,extracted_label,extracted_curie
425,417,ENVO：00000316,2,ENVO：00000316,1,1,ENVO：00000316,
1370,1332,ENVO,1,ENVO,1,1,ENVO,
2428,2374,Tropical woodland biome [ENVO01000220],1,Tropical woodland biome [ENVO01000220],1,1,Tropical woodland biome [ENVO01000220],
2844,2781,Tropical shrubland biome [ENVO01000214],1,Tropical shrubland biome [ENVO01000214],1,1,Tropical shrubland biome [ENVO01000214],


## Retrieve the labels for the parsed CURIes

In [66]:
ncbi_frame['real_label'] = ncbi_frame['extracted_curie'].apply(po_adapter.label)

## Apply oaklib annotation to the strings after CURIe removal
Actually the annotator can (sometimes?) detect colon-delimited CURIEs with lower case prefixes

This returns CURIes with evidence but not necessarily the label corresponding to the CURIe

In [67]:
# # Specify the lexical index file name
# aggregated_lexical_index_file = "aggregated_lexical_index.yaml"
# 
# # Check if the lexical index file exists
# if os.path.exists(aggregated_lexical_index_file):
#     print(f"Loading lexical index from {aggregated_lexical_index_file}...")
#     ix = load_lexical_index(aggregated_lexical_index_file)
# else:
#     print(f"{aggregated_lexical_index_file} not found. Creating lexical index from agg_adapter...")
#     ix = create_lexical_index(agg_adapter)
#     # Save the lexical index to a file
#     save_lexical_index(ix, aggregated_lexical_index_file)
#     print(f"Lexical index saved to {aggregated_lexical_index_file}")
# 
# # Initialize the TextAnnotatorInterface
# aggregated_text_annotator_interface = TextAnnotatorInterface()
# aggregated_text_annotator_interface.lexical_index = ix
# 
# # this cell only takes ~ 1 minute, but generates a lot of "ERRORS" and WARNINGS in a red font
# #   while lexically indexing the ontology

In [68]:
aggregated_text_annotator_interface = AggregatorImplementation(implementations=[envo_adapter, po_adapter])

In [69]:
# Apply the annotation function to each row in the 'label' column
ncbi_frame['longest_annotation_curie'] = ncbi_frame['extracted_label'].apply(
    lambda x: get_longest_annotation_curie(x, aggregated_text_annotator_interface, MIN_ANNOTATION_LEN))

# ~ 1 minute


ERROR:root:Skipping statements(subject=ENVO:01001644,predicate=oio:hasDbXref,object=None,value=Carbonate which is formed as the result of some biological process.,datatype=None,language=None,); ValueError: Carbonate which is formed as the result of some biological process. is not a valid URI or CURIE


## Add the labels for the CURIes identified though oaklib annotation of strings

In [70]:
ncbi_frame['longest_annotation_label'] = ncbi_frame['longest_annotation_curie'].apply(aggregated_text_annotator_interface.label)

In [71]:
ncbi_frame

Unnamed: 0,serial_number,content,count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label
0,1,not applicable,262,not applicable,1,0,not applicable,,,,
1,2,missing,199,missing,1,0,missing,,,,
2,3,not collected,127,not collected,1,0,not collected,,,,
3,4,Orchard,87,Orchard,1,0,Orchard,,,ENVO:00000115,orchard
4,5,,79,,1,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
5302,5186,Woodland,1,Woodland,1,0,Woodland,,,ENVO:00000057,mangrove swamp
5303,5187,sweet potato,1,sweet potato,1,0,sweet potato,,,,
5304,5188,Individual 8,1,Individual 8,1,0,Individual 8,,,,
5305,5189,mixed hardwood forest,1,mixed hardwood forest,1,0,mixed hardwood forest,,,ENVO:00000111,forested area


## we now have a list of CURIes for each normalized annotation

This could be because the submitter provided a CURIe and a label that don't match
*One* case of this is dragging a CURIe down a column in a spreadsheet, expecting it to be copied,
but actually auto-incrementing it

Now attempt to find one best CURIe for each annotation... by now we have lost the ability to retain multiple legitimate
but improperly separated CURIes

In [72]:
# todo don't accept extracted curie if no real label?
# any kind of string similarity checking for label of annotated curie vs extracted label ?
# look for long stretches of curies?
# can we measure the beneficial impact of any of this? current crux: how to distribute counts

ncbi_frame['curie_list'] = ncbi_frame.apply(
    lambda my_row: list({my_row['extracted_curie'], my_row['longest_annotation_curie']} - {None}),
    axis=1
)

ncbi_frame['unique_curie_count'] = ncbi_frame['curie_list'].apply(len)

In [73]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    2694
0    2519
2      94
Name: count, dtype: int64

In [74]:
double_curie_frame = ncbi_frame[ncbi_frame['unique_curie_count'] > 1]

In [75]:
double_curie_frame = double_curie_frame[['extracted_curie', 'longest_annotation_curie']]

In [76]:
double_curie_frame = double_curie_frame.drop_duplicates()

In [77]:
double_curie_frame[['extracted_prefix', 'extracted_local_id']] = double_curie_frame['extracted_curie'].str.split(':',
                                                                                                                 expand=True)

In [78]:
double_curie_frame['extracted_local_id_int'] = pd.to_numeric(double_curie_frame['extracted_local_id'],
                                                             errors='coerce').astype('Int64')

In [79]:
double_curie_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,extracted_prefix,extracted_local_id,extracted_local_id_int
145,ENVO:0025034,PO:0025034,ENVO,0025034,25034
148,ENVO:01000245,ENVO:01000635,ENVO,01000245,1000245
226,ENVO:0003093,ENVO:00003081,ENVO,0003093,3093
227,ENVO:0003099,ENVO:00003081,ENVO,0003099,3099
231,ENVO:0003101,ENVO:00003081,ENVO,0003101,3101
...,...,...,...,...,...
4743,ENVO:01001443,ENVO:01001430,ENVO,01001443,1001443
4782,ENVO:01001460,ENVO:01001430,ENVO,01001460,1001460
4832,ENVO:01001439,ENVO:01001430,ENVO,01001439,1001439
4853,ENVO:01000892,ENVO:01000635,ENVO,01000892,1000892


In [80]:
# Ensure extracted_local_id_int is unique and sorted
unique_sorted_series = double_curie_frame['extracted_local_id_int'].dropna().drop_duplicates().sort_values()


In [81]:
# Find stretches
stretches_dict = find_consecutive_stretches_dict(unique_sorted_series)


In [82]:
# Convert the stretches dictionary into a DataFrame
stretches_df = stretches_dict_to_long_dataframe(stretches_dict)

`stretches_df` shows groups of extracted EnvoO ids (CURIes without prefix or padding zeros) that share a common CURIe by oaklib annotation of the textual part. This may not be the best or only way to address these spurious drag-stretch, auto-incremented CURIes

Ie 1001458 corresponds to ENVO:01001458, 'mist'

_although it theoretically could have been ENVO:1001458 since EnvO CURIes can have 7 or 8 digits_

In group 9, there are another ~ 50 sequential id values, all corresponding to environmental context annotations whose best oak-annotated class is ENVO:01001803, 'tropical forest'!

How much of an impact does this have? 

In [83]:
stretches_df

Unnamed: 0,stretch_id,value
0,1,3081
1,1,3082
2,1,3083
3,1,3084
4,1,3085
5,1,3086
6,1,3087
7,1,3088
8,1,3089
9,1,3090


In [84]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretches_df,
    left_on='extracted_local_id_int',
    right_on='value',
    how='left'
)

In [85]:
stretch_summary_df = summarize_stretch_groups(double_curie_frame)


For stretch 9, which included extracted CURIes from ENVO:01001458 to ENVO:01001511, the oaklib test annotation of 100% of the submitted environmental context annotations was ENVO:01001803, so we will keep that and disregard all of the CURIes from the stretch


In [86]:
stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:00003081,1.0
1,2.0,ENVO:01001430,0.942857


In [87]:
decisive_fraction_threshold = 0.9

In [88]:
decisive_stretch_summary_df = stretch_summary_df[stretch_summary_df['fraction'] >= decisive_fraction_threshold]

In [89]:
decisive_stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:00003081,1.0
1,2.0,ENVO:01001430,0.942857


In [90]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    decisive_stretch_summary_df,
    left_on='stretch_id',
    right_on='stretch_id',
    how='left'
)

In [91]:
double_curie_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,extracted_prefix,extracted_local_id,extracted_local_id_int,stretch_id,value,most_common_longest_annotation_curie,fraction
0,ENVO:0025034,PO:0025034,ENVO,0025034,25034,,,,
1,ENVO:01000245,ENVO:01000635,ENVO,01000245,1000245,,,,
2,ENVO:0003093,ENVO:00003081,ENVO,0003093,3093,1.0,3093.0,ENVO:00003081,1.000000
3,ENVO:0003099,ENVO:00003081,ENVO,0003099,3099,1.0,3099.0,ENVO:00003081,1.000000
4,ENVO:0003101,ENVO:00003081,ENVO,0003101,3101,1.0,3101.0,ENVO:00003081,1.000000
...,...,...,...,...,...,...,...,...,...
83,ENVO:01001443,ENVO:01001430,ENVO,01001443,1001443,2.0,1001443.0,ENVO:01001430,0.942857
84,ENVO:01001460,ENVO:01001430,ENVO,01001460,1001460,2.0,1001460.0,ENVO:01001430,0.942857
85,ENVO:01001439,ENVO:01001430,ENVO,01001439,1001439,2.0,1001439.0,ENVO:01001430,0.942857
86,ENVO:01000892,ENVO:01000635,ENVO,01000892,1000892,,,,


In [92]:
drag_evidence_frame = double_curie_frame[double_curie_frame['stretch_id'] >= 1]
drag_evidence_frame = drag_evidence_frame[['extracted_curie', 'longest_annotation_curie']]
drag_evidence_frame['drag_evidence'] = True

In [93]:
drag_evidence_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,drag_evidence
2,ENVO:0003093,ENVO:00003081,True
3,ENVO:0003099,ENVO:00003081,True
4,ENVO:0003101,ENVO:00003081,True
5,ENVO:0003081,ENVO:00003081,True
6,ENVO:01001442,ENVO:01001246,True
7,ENVO:0003086,ENVO:00003081,True
8,ENVO:0003085,ENVO:00003081,True
9,ENVO:0003094,ENVO:00003081,True
10,ENVO:0003100,ENVO:00003081,True
11,ENVO:0003084,ENVO:00003081,True


In [94]:
ncbi_frame = ncbi_frame.merge(
    drag_evidence_frame,
    left_on=['extracted_curie', 'longest_annotation_curie'],
    right_on=['extracted_curie', 'longest_annotation_curie'],
    how='left'
)

In [95]:
# Initialize dragless_curie_list with curie_list values
ncbi_frame["dragless_curie_list"] = ncbi_frame["curie_list"]

# Update dragless_curie_list based on the condition
for index, row in ncbi_frame.iterrows():
    if row["drag_evidence"] is True:
        if row["longest_annotation_curie"] is not None:
            ncbi_frame.at[index, "dragless_curie_list"] = [row["longest_annotation_curie"]]
        else:
            ncbi_frame.at[index, "dragless_curie_list"] = []

ncbi_frame['dragless_curie_count'] = ncbi_frame['dragless_curie_list'].apply(len)

In [96]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    2694
0    2519
2      94
Name: count, dtype: int64

In [97]:
ncbi_frame['dragless_curie_count'].value_counts()

dragless_curie_count
1    2750
0    2519
2      38
Name: count, dtype: int64

## The extent of multiple detected CURIes has been reduced ~ 4.5 fold 
(for soil env_local_scale)

Isolate the submitter annotations for which there's clearly one best CURIe after removing the drag-stretches

In [98]:
ncbi_frame.shape

(5307, 16)

In [99]:
ncbi_frame_undisputed = ncbi_frame[ncbi_frame['dragless_curie_count'] <= 1]

In [100]:
ncbi_frame_undisputed.shape

(5269, 16)

In [101]:
ncbi_frame_disputed = ncbi_frame[ncbi_frame['dragless_curie_count'] > 1]

In [102]:
ncbi_frame_disputed.shape

(38, 16)

In [103]:
ncbi_frame_disputed = ncbi_frame_disputed.explode("dragless_curie_list", ignore_index=True)


In [104]:
ncbi_frame_disputed.shape

(76, 16)

In [105]:
ncbi_frame_disputed["dragless_curie_list"] = ncbi_frame_disputed["dragless_curie_list"].apply(lambda x: [x])

## Just include all of the remaining disputed CURIe assignments

In [106]:
# Combine the rows of ncbi_frame_undisputed and ncbi_frame_disputed into a new DataFrame
ncbi_disputes_exploded_frame = pd.concat([ncbi_frame_undisputed, ncbi_frame_disputed], ignore_index=True)


In [107]:
ncbi_disputes_exploded_frame.shape

(5345, 16)

In [108]:
ncbi_disputes_exploded_frame

Unnamed: 0,serial_number,content,count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count,drag_evidence,dragless_curie_list,dragless_curie_count
0,1,not applicable,262,not applicable,1,0,not applicable,,,,,[],0,,[],0
1,2,missing,199,missing,1,0,missing,,,,,[],0,,[],0
2,3,not collected,127,not collected,1,0,not collected,,,,,[],0,,[],0
3,4,Orchard,87,Orchard,1,0,Orchard,,,ENVO:00000115,orchard,[ENVO:00000115],1,,[ENVO:00000115],1
4,5,,79,,1,0,,,,,,[],0,,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5340,4745,Area of crop land ENVO_01000892,1,Area of crop land ENVO_01000892,1,1,Area of crop land,ENVO:01000892,,ENVO:01000635,planetary landmass,"[ENVO:01000635, ENVO:01000892]",2,,[ENVO:01000892],2
5341,5004,wood [ENVO_00002040],1,wood [ENVO_00002040],1,1,wood,ENVO:00002040,,ENVO:00000109,woodland area,"[ENVO:00002040, ENVO:00000109]",2,,[ENVO:00002040],2
5342,5004,wood [ENVO_00002040],1,wood [ENVO_00002040],1,1,wood,ENVO:00002040,,ENVO:00000109,woodland area,"[ENVO:00002040, ENVO:00000109]",2,,[ENVO:00000109],2
5343,5006,field(ENVO:00000114),1,field(ENVO:00000114),1,1,field,ENVO:00000114,,ENVO:01000352,field,"[ENVO:01000352, ENVO:00000114]",2,,[ENVO:01000352],2


In [109]:
ncbi_disputes_exploded_frame['post_explode_curie_count'] = ncbi_disputes_exploded_frame['dragless_curie_list'].apply(
    len)

In [110]:
ncbi_disputes_exploded_frame['post_explode_curie_count'].value_counts()

post_explode_curie_count
1    2826
0    2519
Name: count, dtype: int64

In [111]:
# Set 'post_explode_curie' to the 0th item in 'dragless_curie_list'
ncbi_disputes_exploded_frame["post_explode_curie"] = ncbi_disputes_exploded_frame["dragless_curie_list"].apply(
    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
)

In [112]:

ncbi_biosample_scoped_counts = (
    ncbi_disputes_exploded_frame.groupby("post_explode_curie")["count"].sum().reset_index()
)

# todo parameterize the column name based on what was really counted, generally biosamples or bioprojects
ncbi_biosample_scoped_counts.columns = ['curie', 'ncbi_bioprojects_count']

This is currently a count of Biosamples for which the indicated CURIes can be extracted or inferred by oaklib annotation, after removal of drag-stretch, auto-incremented CURIes

In [113]:
ncbi_biosample_scoped_counts

Unnamed: 0,curie,ncbi_bioprojects_count
0,BFO:0000015,4
1,BFO:0000020,1
2,BFO:0000029,6
3,CHEBI:15377,32
4,CHEBI:22695,15
...,...,...
560,UBERON:0000965,1
561,UBERON:0001913,1
562,UBERON:0001988,1
563,UBERON:0002100,5


## GOLDTERMS mappings only approach

we're currently including
- mappings only

and have retired the previous use of
- mappings in hybrid with biosample counts

And we're casting a wide net, especially for the hybrid approach
- searching for 'soil', 'sediment' etc. in GOLDTERMS labels without anchoring them like 'Environmental > Aquatic > Sediment'
- retrieving the CURIes for env_broad_scale, env_local_scale and env_medium for all voting sheets, and trusting orthogonal filtering to remove the inappropriate CURIes

Should we now add (or switch to) direct biosample counts of GOLD "envo" annotations?

Efficient retrieval of  all GOLD data in a given scope isn't easy

In [114]:
# Determine the filenames and target directory
goldterms_compressed_filename = urlparse(goldterms_semsql_url).path.split('/')[-1]
goldterms_filename = os.path.splitext(goldterms_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

# Print to confirm the filenames
print(goldterms_filename)

goldterms.db


In [115]:
# Fetch the contents from the URL and save compressed file in target directory
goldterms_response = requests.get(goldterms_semsql_url)
goldterms_compressed_file_path = os.path.join(target_dir, goldterms_compressed_filename)
with open(goldterms_compressed_file_path, "wb") as f:
    f.write(goldterms_response.content)

# ~ 1 second

In [116]:
# Unzip the compressed file and save the extracted file in target directory
goldterms_uncompressed_file_path = os.path.join(target_dir, goldterms_filename)
with gzip.open(goldterms_compressed_file_path, "rb") as f_in:
    with open(goldterms_uncompressed_file_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# ~ 1 second

In [117]:
## that's all fast. don't bother caching

In [118]:
goldterms_conn = sqlite3.connect(goldterms_uncompressed_file_path)

In [119]:
goldterms_result = pd.read_sql_query(extension_query, goldterms_conn)

In [120]:
goldterms_result

Unnamed: 0,subject,predicate,content
0,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313
1,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_local,ENVO:00010622
2,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:mixs_extension,mixs:MiscellaneousNaturalOrArtificialEnvironment
3,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,rdfs:label,Engineered > Artificial ecosystem > Plant grow...
4,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313
...,...,...,...
286,GOLDVOCAB:Plant-pot,rdfs:label,Plant pot
287,GOLDVOCAB:Plants,rdfs:label,Plants
288,GOLDVOCAB:Sewage-treatment-plant,rdfs:label,Sewage treatment plant
289,GOLDVOCAB:Soil-_28non-planted_29,rdfs:label,Soil (non-planted)


In [121]:
# # todo: save this kind of content before subsetting on an environment
# #   the subsetting is currently baked into the query
# 
# # see also goldterms_queries.ipynb in MAM's Collab
# goldterms_result.to_csv("goldterms_single_environment_mappings_long.tsv", sep="\t", index=False)

In [122]:
goldterms_only_curies = goldterms_result.loc[goldterms_result['predicate'].isin(gold_context_selectors), 'content']


In [123]:
goldterms_only_curies = goldterms_only_curies.unique().tolist()

In [124]:
# goldterms_only_curies

# Make lists of CURIEs
which will determine
- the rows in the table
- the boolean filter columns

this next block got a lot slower when I switched from the envo adapter to the aggregated adapter

In [125]:
anchor_curies = list(anchor_descendants_frame['curie'])


In [126]:
legacy_pv_curies = [i['curie'] for i in pv_validation_results['valids']]

In [127]:
obsoletes_curies_envo = set(envo_adapter.obsoletes())
obsoletes_curies_po = set(po_adapter.obsoletes())
obsoletes_curies = list(obsoletes_curies_envo.union(obsoletes_curies_po))

In [128]:
abp_curies = list(envo_adapter.descendants(ABP, predicates=[IS_A]))
aquatic_biome_curies = list(envo_adapter.descendants(AQUATIC_BIOME, predicates=[IS_A]))
biome_curies = list(envo_adapter.descendants(BIOME, predicates=[IS_A]))  # 
building_curies = list(envo_adapter.descendants(BUILDING, predicates=[IS_A]))  #
building_part_curies = list(envo_adapter.descendants(BUILDING_PART, predicates=[IS_A]))  #
env_mat_curies = list(envo_adapter.descendants(ENVIRONMENTAL_MATERIAL, predicates=[IS_A]))
env_sys_curies = list(envo_adapter.descendants(ENVIRONMENTAL_SYSTEM, predicates=[IS_A]))
env_sys_curies = list(envo_adapter.descendants(ENVIRONMENTAL_SYSTEM, predicates=[IS_A]))
human_construction_curies = list(envo_adapter.descendants(HUMAN_CONSTRUCTION, predicates=[IS_A]))  #
liquid_water_curies = list(envo_adapter.descendants(LIQUID_WATER, predicates=[IS_A]))  # 
marine_biome_curies = list(envo_adapter.descendants(MARINE_BIOME, predicates=[IS_A]))
vegetation_layer_curies = list(envo_adapter.descendants(VEGETATION_LAYER, predicates=[IS_A]))  #
mass_of_environmental_material_curies = list(envo_adapter.descendants(MASS_OF_ENVIRONMENTAL_MATERIAL, predicates=[IS_A]))  #
plant_structure_curies = list(po_adapter.descendants(PLANT_STRUCTURE, predicates=[IS_A]))
snow_curies = list(envo_adapter.descendants(SNOW, predicates=[IS_A]))  #
soil_curies = list(envo_adapter.descendants(SOIL, predicates=[IS_A]))  #
terrestrial_biome_curies = list(envo_adapter.descendants(TERRESTRIAL_BIOME, predicates=[IS_A]))
water_ice_curies = list(envo_adapter.descendants(WATER_ICE, predicates=[IS_A]))  #


In [129]:
# Specify an output file name
nlcd2011_class_iris_file = "nlcd2011_class_iris.txt"

nlcd_subset_textual_representation = "nlcd2011"

in_subset_curie = "oio:inSubset"

# Initialize nlcd_classes
nlcd_classes = []

if os.path.exists(nlcd2011_class_iris_file):
    # Load the list from the file if it exists
    print(f"Loading {nlcd_subset_textual_representation} classes from {nlcd2011_class_iris_file}...")
    with open(nlcd2011_class_iris_file, "r") as file:
        nlcd_classes = [line.strip() for line in file.readlines()]
else:
    # File doesn't exist; generate the list using the loop
    print(
        f"{nlcd2011_class_iris_file} not found. Identifying classes in {nlcd_subset_textual_representation} subset (~2 minutes)...")

    # Retrieve all classes
    entities = envo_adapter.entities()

    # super slow 2 minutes
    # but retrieving classes by named subset seems to crash on EnvO with its textual subsets?
    for entity in entities:
        term_metadata = envo_adapter.entity_metadata_map(entity)
        if in_subset_curie in term_metadata:
            subsets = term_metadata[in_subset_curie]
            if nlcd_subset_textual_representation in subsets:
                nlcd_classes.append(entity)

    # Save the generated list to the file
    with open(nlcd2011_class_iris_file, "w") as file:
        for string in nlcd_classes:
            file.write(string + "\n")

    print(f"List saved to {nlcd2011_class_iris_file}")

# At this point, nlcd_classes contains the desired list
print(f"Total {nlcd_subset_textual_representation} classes loaded: {len(nlcd_classes)}")


Loading nlcd2011 classes from nlcd2011_class_iris.txt...
Total nlcd2011 classes loaded: 22


## Bootstrap the rows

In [130]:
include_in_rows = set()

In [131]:
include_in_rows.update(anchor_curies)
include_in_rows.update(goldterms_only_curies)
include_in_rows.update(legacy_pv_curies)
include_in_rows.update(ncbi_frame['extracted_curie'])
include_in_rows.update(ncbi_frame['longest_annotation_curie'])
include_in_rows.update(nlcd_classes)
include_in_rows.update(nmdc_biosample_contexts_frame[nmdc_context_selector])

In [132]:
rows_lod = []

# Voting sheet rows and boolean columns

In [133]:
for curie in include_in_rows:
    if curie is None:
        continue

    # ONCE AGAIN, assuming that EnvO is the only ontology we'll check against
    current_ancestors = [
        list(ancestor.keys())[0] for ancestor in envo_ancs_cache.get(curie, [])
    ]
    ancestors_in_enum_count = len(set(current_ancestors) & set(legacy_pv_curies))

    current_descendants = [
        list(descendant.keys())[0] for descendant in envo_descs_cache.get(curie, [])
    ]
    descendants_in_enum_count = len(set(current_descendants) & set(legacy_pv_curies))

    row = {
        'curie': curie,
        'claimed_prefix': curie.split(':')[0],
        'label': aggregated_text_annotator_interface.label(curie),
        'envo_native': False,
        'obsolete': False,
        comparison_enum_column_name: False,
        'ancestors_in_enum_count': ancestors_in_enum_count,
        'descendants_in_enum_count': descendants_in_enum_count,
        'nlcd_class': False,
        'abp': False,
        'env_sys': False,
        'biome': False,
        'terrestrial_biome': False,
        'aquatic_biome': False,
        'marine_biome': False,
        'env_mat': False,
        'soil': False,
        'liquid water': False,
        'water ice': False,
        'snow': False,
        'mass_of_environmental_material': False,
        'human_construction': False,
        'building': False,
        'building_part': False,
        'plant_structure': False,
        'vegetation_layer': False,
        'goldterms_mappings': False,
    }

    if curie in abp_curies: row['abp'] = True
    if curie in aquatic_biome_curies: row['aquatic_biome'] = True
    if curie in biome_curies: row['biome'] = True
    if curie in building_curies: row['building'] = True
    if curie in building_part_curies: row['building_part'] = True
    if curie in env_mat_curies: row['env_mat'] = True
    if curie in env_sys_curies: row['env_sys'] = True
    if curie in goldterms_only_curies: row['goldterms_mappings'] = True
    if curie in human_construction_curies: row['human_construction'] = True
    if curie in legacy_pv_curies: row[comparison_enum_column_name] = True
    if curie in liquid_water_curies: row['liquid water'] = True
    if curie in marine_biome_curies: row['marine_biome'] = True
    if curie in mass_of_environmental_material_curies: row['mass_of_environmental_material'] = True
    if curie in nlcd_classes: row['nlcd_class'] = True
    if curie in obsoletes_curies: row['obsolete'] = True
    if curie in plant_structure_curies: row['plant_structure'] = True
    if curie in snow_curies: row['snow'] = True
    if curie in soil_curies: row['soil'] = True
    if curie in terrestrial_biome_curies: row['terrestrial_biome'] = True
    if curie in vegetation_layer_curies: row['vegetation_layer'] = True
    if curie in water_ice_curies: row['water ice'] = True

    try:
        prefix, local_id = curie.split(':')
        if prefix and prefix == 'ENVO' and row['label'] is not None:
            row['envo_native'] = True
    except Exception as e:
        # Print the exception message
        print(f"An error occurred: {e} trying to split {curie}")

    rows_lod.append(row)

# 2 minutes


# ^ Voting sheet rows and boolean columns

In [134]:
rows_frame = pd.DataFrame(rows_lod)

In [135]:
rows_frame

Unnamed: 0,curie,claimed_prefix,label,envo_native,obsolete,EnvLocalScaleSoilEnum_11_1,ancestors_in_enum_count,descendants_in_enum_count,nlcd_class,abp,...,liquid water,water ice,snow,mass_of_environmental_material,human_construction,building,building_part,plant_structure,vegetation_layer,goldterms_mappings
0,ENVO:01000023,ENVO,marine pelagic biome,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,False
1,NCBITaxon:3193,NCBITaxon,Embryophyta,False,False,False,0,0,False,False,...,False,False,False,False,False,False,False,False,False,False
2,ENVO:06105274,ENVO,sandy loam,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,False
3,ENVO:01001895,ENVO,soda spring,True,False,False,1,0,False,True,...,False,False,False,False,False,False,False,False,False,False
4,FOODON:00001199,FOODON,chickpea food product,False,False,False,0,0,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2084,ENVO:00000420,ENVO,tombolo,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,False
2085,ENVO:01001104,ENVO,rift zone,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,False
2086,ENVO:01000213,ENVO,subtropical shrubland biome,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,False
2087,ENVO:00000119,ENVO,planted forest,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,False


# Merge in NMDC counts

In [136]:
nmdc_biosample_scoped_counts = nmdc_biosample_contexts_frame[nmdc_context_selector].value_counts().reset_index()
# todo parameterize the column name based on what what really counter (generally biosamples or studies)
nmdc_biosample_scoped_counts.columns = ['curie', 'nmdc_biosamples_count']


In [137]:
nmdc_biosample_scoped_counts

Unnamed: 0,curie,nmdc_biosamples_count
0,ENVO:01001057,199
1,ENVO:01001442,192
2,PO:0025025,85
3,ENVO:00002003,18
4,ENVO:2100002,10
5,ENVO:01000155,7
6,ENVO:00002040,1


In [138]:
# Perform the left merge
rows_frame = rows_frame.merge(
    nmdc_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [139]:
rows_frame

Unnamed: 0,curie,claimed_prefix,label,envo_native,obsolete,EnvLocalScaleSoilEnum_11_1,ancestors_in_enum_count,descendants_in_enum_count,nlcd_class,abp,...,water ice,snow,mass_of_environmental_material,human_construction,building,building_part,plant_structure,vegetation_layer,goldterms_mappings,nmdc_biosamples_count
0,ENVO:01000023,ENVO,marine pelagic biome,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,
1,NCBITaxon:3193,NCBITaxon,Embryophyta,False,False,False,0,0,False,False,...,False,False,False,False,False,False,False,False,False,
2,ENVO:06105274,ENVO,sandy loam,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,
3,ENVO:01001895,ENVO,soda spring,True,False,False,1,0,False,True,...,False,False,False,False,False,False,False,False,False,
4,FOODON:00001199,FOODON,chickpea food product,False,False,False,0,0,False,False,...,False,False,False,False,False,False,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2084,ENVO:00000420,ENVO,tombolo,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,
2085,ENVO:01001104,ENVO,rift zone,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,
2086,ENVO:01000213,ENVO,subtropical shrubland biome,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,
2087,ENVO:00000119,ENVO,planted forest,True,False,False,0,0,False,True,...,False,False,False,False,False,False,False,False,False,


In [140]:
# gold and ncbi counts are slightly trickier
# for gold: including mappings only, mappings in hybrid with biosample counts. 
#    Switch to direct biosample counts of GOLD "envo" annotations?
# ncbi: we have extracted curies and annotated curies

## Merge in NCBI counts

In [141]:
# Perform the left merge
rows_frame = rows_frame.merge(
    ncbi_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [142]:
# 990 rows in https://docs.google.com/spreadsheets/d/12WH3eduBq2qSTy9zVF3n7fyajn6ssLZL/edit?gid=546570706#gid=546570706

In [143]:
rows_frame.to_csv(output_file_name, sep="\t", index=False)

In [144]:
ncbi_conn.close()