In [1]:
from common import *

import gzip
import os
import shutil
import sqlite3
from urllib.parse import urlparse

import duckdb
import pandas as pd
import requests

from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from oaklib.utilities.lexical.lexical_indexer import create_lexical_index, save_lexical_index, load_lexical_index
from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface

from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import yaml
import json


In [2]:

selected_configuration = "water_env_medium"

In [3]:
configurations_file = "voting_sheets_configurations.yaml"

In [4]:
# these are the variables that will be set by the configuration

gold_context_selectors = None
ncbi_context_selector = None
ncbi_package_selector = None
first_where = None
gold_first_where = None
semantic_anchor = None
previous_submission_schema_url = None
CONTEXT_ENUM = None
nmdc_package_selector = None
nmdc_context_selector = None
comparison_enum_column_name = None
output_file_name = None


In [5]:
# Load configurations from YAML file
with open(configurations_file, "r") as file:
    configurations = yaml.safe_load(file)["configurations"]

# Function to apply a configuration
def apply_configuration(config_name):
    config = configurations.get(config_name)
    if not config:
        raise ValueError(f"Configuration '{config_name}' not found.")
    
    # Update global variables
    globals().update(config)
    print(f"Configuration '{config_name}' applied.")


In [6]:
apply_configuration(selected_configuration)

# Now the global variables like `output_file_name`, `semantic_anchor`, etc., are set.

Configuration 'water_env_medium' applied.


In [7]:
gold_first_where = first_where

In [8]:
# Initialize cache dictionaries for predict_from_normalized_env_packages
# todo how to move the definitions for function that use these globals? Or just use caching around the function?
ancestor_cache = {}
descendant_cache = {}

# Preliminary label, ancestor and descendant caching

In [9]:
# todo this on-demand NCBI curie extraction and annotation recapitulates work that is being added to
# https://portal.nersc.gov/project/m3408/biosamples_duckdb/
# via ???
#   although that doesn't detect auto-incremented curies from  spreadsheet dragging

# todo eventually, dig up a complete JSON gold biosample dump for non-hybrid biosample counts

# todo make it clearer whether biosamples or studies are being counted
#   count nmdc or gold STUDIES too?

# Additional Settings

In [10]:
# Approved prefixes (case-insensitive)
approved_prefixes = ['ENVO']

In [11]:
MIN_ANNOTATION_LEN = 3

In [12]:
NMDC_RUNTIME_BASE_URL = 'https://api.microbiomedata.org/nmdcschema/'
STUDY_SET_COLLECTION = 'study_set'
BIOSAMPLE_SET_COLLECTION = 'biosample_set'

In [13]:
envo_adapter_string = "sqlite:obo:envo"

In [14]:
env_package_override_file = 'mam-env-package-overrides.tsv'
override_column = 'mam_inferred_env_package'

In [15]:
# ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples_2024-09-23.duckdb.gz'
ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples.duckdb.gz'

In [16]:
gold_data_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
gold_data_file_name = "goldData.xlsx" # goldData.xlsx: Microsoft Excel 2007+
gold_csv_file_name = "gold_biosamples.csv"
BIOSAMPLES_SHEET = "Biosample"

In [17]:
goldterms_semsql_url = "https://s3.amazonaws.com/bbop-sqlite/goldterms.db.gz"

In [18]:
all_nmdc_biosamples_file = 'all_nmdc_biosamples.json'

# CURIe Constants

In [19]:
BIOME = 'ENVO:00000428'
TERRESTRIAL_BIOME = 'ENVO:00000446'
AQUATIC_BIOME = 'ENVO:00002030'
ABP = 'ENVO:01000813'
ENVIRONMENTAL_SYSTEM = 'ENVO:01000254'
ENVIRONMENTAL_MATERIAL = 'ENVO:00010483'

SOIL = 'ENVO:00001998'
LIQUID_WATER = 'ENVO:00002006'
WATER_ICE = 'ENVO:01000277'

HUMAN_CONSTRUCTION = 'ENVO:00000070'
BUILDING = 'ENVO:00000073'
BUILDING_PART = 'ENVO:01000420'

# Settings-based Queries

In [20]:
# todo could this have been done with a OAK query, eliminating the need to explicitly download the file?

goldterms_envo_query = f"""
SELECT
	*
FROM
	statements s
WHERE
	predicate in ('{"', '".join(gold_context_selectors)}')"""

In [21]:
ncbi_biosamples_per_annotation_query = f"""
SELECT content, COUNT(1) AS count 
FROM attributes 
WHERE harmonized_name = '{ncbi_context_selector}' AND package_content like '%{ncbi_package_selector}'
GROUP BY content
ORDER BY COUNT(1) DESC
"""

In [22]:
ncbi_bioprojects_per_annotation_query = f"""
SELECT
	a.content,
	count(DISTINCT l.content) AS count
FROM
	main.ATTRIBUTES a
JOIN main.links l 
	ON
	a.id = l.id
WHERE
	l.target = 'bioproject'
	AND harmonized_name = 'env_local_scale'
	AND package_content like '%{ncbi_package_selector}'
GROUP BY
	a.content
ORDER BY
	count(DISTINCT l.content) DESC ;
"""

In [23]:
ncbi_X_per_annotation_query = ncbi_bioprojects_per_annotation_query

In [24]:
# and s1.subject = s1.stanza eliminates matches on blank node annotation rows (probably wouldn't change results but adds a little overhead)

extension_query = f"""
select
		s1.subject ,
		s2.predicate,
		COALESCE (s2."object",
	s2."value") as content
from
	statements s1
join statements s2 on 
	s1.subject = s2.subject
where
	{gold_first_where}
	and s1.predicate = 'rdfs:label'
	and s1.subject = s1.stanza
	and s2.predicate in ('mixs:env_broad', 'mixs:env_local', 'mixs:env_medium', 'mixs:mixs_extension', 'rdfs:label', 'mixs:other', 'mixs:anatomical_site', 'mixs:host_taxon') ;
"""


# Locally Defined Functions
_Currently using locally-defined cache dictionaries_

In [25]:
def predict_from_normalized_env_packages(df_raw, adapter):
    # Apply the function to the relevant columns

    df = df_raw.copy()

    print(df.shape)
    for column in ['env_broad_scale_id', 'env_local_scale_id', 'env_medium_id']:
        df[f'{column}_ancestors'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['ancestors'])
        df[f'{column}_descendants'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['descendants'])

    # Vectorize each set of terms separately
    broad_scale_ancestors = vectorize_terms(df, 'env_broad_scale_id_ancestors')
    broad_scale_descendants = vectorize_terms(df, 'env_broad_scale_id_descendants')

    local_scale_ancestors = vectorize_terms(df, 'env_local_scale_id_ancestors')
    local_scale_descendants = vectorize_terms(df, 'env_local_scale_id_descendants')

    medium_ancestors = vectorize_terms(df, 'env_medium_id_ancestors')
    medium_descendants = vectorize_terms(df, 'env_medium_id_descendants')

    # Combine all feature matrices
    X = hstack([
        broad_scale_ancestors,
        broad_scale_descendants,
        local_scale_ancestors,
        local_scale_descendants,
        medium_ancestors,
        medium_descendants
    ])

    # Filter the DataFrame to only include non-null rows for the target column
    df_filtered = df[df['normalized_env_package'].notnull() & (df['normalized_env_package'] != "")]

    # Extract the target variable
    y = df_filtered['normalized_env_package']

    # Ensure X corresponds to the filtered rows
    X_filtered = X[df_filtered.index]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.3, random_state=42)

    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # not determining confidence for each class nor saving any diagnostics any more

    return clf.predict(X)

In [26]:
def get_hierarchy_terms(my_curie: str, adapter) -> dict:
    """
    Extract ancestor and descendant terms from the ontology for a given CURIE,
    using caching to improve performance and filtering by 'is_a' relationships.

    Args:
        my_curie (str): CURIE identifier for the ontology term.
        adapter: Ontology adapter.

    Returns:
        dict: Dictionary containing lists of ancestor and descendant terms.
    """
    if my_curie not in ancestor_cache:
        try:
            ancestors = list(adapter.ancestors(my_curie, predicates=[IS_A]))
            ancestor_cache[my_curie] = [adapter.label(ancestor) for ancestor in ancestors if ancestor]
        except Exception as my_e:
            print(f"Error retrieving ancestors for {my_curie}: {my_e}")
            ancestor_cache[my_curie] = []

    if my_curie not in descendant_cache:
        try:
            descendants = list(adapter.descendants(my_curie, predicates=[IS_A]))
            descendant_cache[my_curie] = [adapter.label(descendant) for descendant in descendants if descendant]
        except Exception as my_e:
            print(f"Error retrieving descendants for {my_curie}: {my_e}")
            descendant_cache[my_curie] = []

    return {
        'ancestors': ancestor_cache[my_curie],
        'descendants': descendant_cache[my_curie],
    }

# Procedural Code Starts Here

In [27]:
# Determine the filenames and target directory for the NCBI DuckDB
ncbi_compressed_filename = urlparse(ncbi_duckdb_url).path.split('/')[-1]
ncbi_filename = os.path.splitext(ncbi_compressed_filename)[0]
ncbi_compressed_file_path = os.path.join(ncbi_compressed_filename)
ncbi_uncompressed_file_path = os.path.join(ncbi_filename)


In [28]:
if os.path.isfile(ncbi_uncompressed_file_path):
    print(f"{ncbi_uncompressed_file_path} is already present in the current working directory.")
else:
    if os.path.isfile(ncbi_compressed_file_path):
        print(f"{ncbi_compressed_file_path} is already present in the current working directory.")
    else:
        print(f"{ncbi_compressed_file_path} needs to be downloaded")
        ncbi_response = requests.get(ncbi_duckdb_url)
        with open(ncbi_compressed_file_path, "wb") as f:
            f.write(ncbi_response.content)
        # ~ 2 minutes @ 250 Mbps
    
    # Unzip the compressed file and save the extracted file in target directory
    print(f"{ncbi_compressed_file_path} needs to be unpacked")
    with gzip.open(ncbi_compressed_file_path, "rb") as f_in:
        with open(ncbi_uncompressed_file_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    # ~ 2 minutes

ncbi_biosamples.duckdb is already present in the current working directory.


In [29]:
ncbi_conn = duckdb.connect(database=ncbi_uncompressed_file_path, read_only=True)

In [30]:
envo_adapter = get_adapter(envo_adapter_string)

**Preliminary replacement for `get_hierarchy_terms` and ancestor/descendant lookup in main row construction loop**

In [31]:
reflexivity = False

# File paths
label_cache_file = "envo_label_cache.json"
ancestors_file = "envo_ancestors.json"
descendants_file = "envo_descendants.json"

# Initialize caches
envo_label_cache = {}
envo_ancs_cache = {}
envo_descs_cache = {}

envo_entities_generator = envo_adapter.entities()
envo_entities = list(envo_entities_generator)
envo_entities.sort()

# Function to generate and save the label cache
def generate_label_cache():
    print("Generating label cache...")

    label_cache = {}
    for ee in envo_entities:
        label_cache[ee] = envo_adapter.label(ee)
    return label_cache

# Check for label cache
if os.path.exists(label_cache_file):
    print(f"Loading label cache from {label_cache_file}...")
    with open(label_cache_file, "r") as f:
        envo_label_cache = json.load(f)
else:
    envo_label_cache = generate_label_cache()
    with open(label_cache_file, "w") as f:
        json.dump(envo_label_cache, f)
    print(f"Label cache saved to {label_cache_file}.")

# Check for ancestor cache
if os.path.exists(ancestors_file):
    print(f"Loading ancestors from {ancestors_file}...")
    with open(ancestors_file, "r") as f:
        envo_ancs_cache = json.load(f)
else:
    print("Ancestors file not found. Generating ancestor cache...")

    for ee in envo_entities:
        ancs_gen = envo_adapter.ancestors(ee, reflexive=reflexivity)
        envo_ancs_cache[ee] = [{a: envo_label_cache.get(a, None)} for a in ancs_gen]

    with open(ancestors_file, "w") as f:
        json.dump(envo_ancs_cache, f)
    print(f"Ancestors saved to {ancestors_file}.")

# Check for descendant cache
if os.path.exists(descendants_file):
    print(f"Loading descendants from {descendants_file}...")
    with open(descendants_file, "r") as f:
        envo_descs_cache = json.load(f)
else:
    # 5 minutes
    print("Descendants file not found. Generating descendant cache...")

    for ee in envo_entities:
        descs_gen = envo_adapter.descendants(ee, reflexive=reflexivity)
        envo_descs_cache[ee] = [{d: envo_label_cache.get(d, None)} for d in descs_gen]

    with open(descendants_file, "w") as f:
        json.dump(envo_descs_cache, f)
    print(f"Descendants saved to {descendants_file}.")


Loading label cache from envo_label_cache.json...
Loading ancestors from envo_ancestors.json...
Loading descendants from envo_descendants.json...


# Anchor aka bootstrapping classes

In [32]:
anchor_descendants = get_curie_descendants_label_dict(semantic_anchor, [IS_A], envo_adapter)

In [33]:
anchor_descendants_lod = curie_descendants_label_dict_to_lod(anchor_descendants)

In [34]:
anchor_descendants_frame = curie_descendants_label_lod_to_df(anchor_descendants_lod)

In [35]:
anchor_descendants_frame

Unnamed: 0,curie,label
0,FOODON:00001838,fermented cereal beverage
1,ENVO:02000111,copper ore
2,ENVO:01000480,glass
3,FOODON:03400865,soup (eurofir)
4,FOODON:00001279,cake icing food product
...,...,...
1095,ENVO:01001117,poultry manure
1096,ENVO:00002117,creosote contaminated soil
1097,ENVO:01001121,plant matter
1098,ENVO:03605000,periphytic biofilm


# Classes from the reference enumeration

In [36]:
sv = get_schemaview_from_source(previous_submission_schema_url)

In [37]:
# todo break out slow steps into its own cell

try:
    CONTEXT_ENUM_def = sv.get_enum(CONTEXT_ENUM)
    context_pvs_keys = list(CONTEXT_ENUM_def.permissible_values.keys())
except AttributeError as e:
    # Handle the AttributeError
    print(f"An AttributeError occurred: {e}")
    context_pvs_keys =[]
    

An AttributeError occurred: 'NoneType' object has no attribute 'permissible_values'


In [38]:
print(context_pvs_keys)

[]


In [39]:
initially_parsed_context_pvs = parse_hierarchically_underscored_strings(context_pvs_keys)

In [40]:
deduped_context_pvs = dedupe_underscoreless_pvs(initially_parsed_context_pvs)

In [41]:
pv_validation_results = validate_curie_label_list_dict(deduped_context_pvs, envo_adapter, print_flag=True)

In [42]:
pv_validation_results

{'problems': [], 'valids': []}

# Get the CURIEs used in NMDC Biosample annotations

In [43]:
if os.path.isfile(all_nmdc_biosamples_file):
    print(f"{all_nmdc_biosamples_file} is present in the current working directory and will be read into all_nmdc_biosamples.")
    # with open(all_nmdc_biosamples_file, 'r') as file:
    #     all_nmdc_biosamples = yaml.full_load(file)
    # read as json
    with open(all_nmdc_biosamples_file, 'r') as f:
        all_nmdc_biosamples = json.load(f)

else:
    print(f"All NMDC Biosamples need to be fetched and saved to {all_nmdc_biosamples_file}")
    all_nmdc_biosamples = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL,
                                               BIOSAMPLE_SET_COLLECTION)
    # with open(all_nmdc_biosamples_file, 'w') as file:
    #     documents = yaml.dump(all_nmdc_biosamples, file)
    # save as json
    with open(all_nmdc_biosamples_file, 'w') as f:
        json.dump(all_nmdc_biosamples, f)

# this saves network traffic. could use JSON for faster performance. 
# 1 minute for network fetch and JSON write?!
# 1 minute for yaml read
# instantaneous for JSON read?

all_nmdc_biosamples.json is present in the current working directory and will be read into all_nmdc_biosamples.


## Prediction of env_package annotations 

In [44]:
# Specify the output file name
env_packages_file = "nmdc_biosample_asserted_normalized_and_inferred_env_package.tsv"

if os.path.exists(env_packages_file):
    # Load the DataFrame from the file if it exists
    print(f"Loading {env_packages_file} into nmdc_biosample_contexts_frame...")
    nmdc_biosample_contexts_frame = pd.read_csv(env_packages_file, sep='\t')
else:
    # File doesn't exist; generate the DataFrame
    print(f"{env_packages_file} not found. Predicting from asserted records and {env_package_override_file}...")

    # Load environment package overrides
    env_pacakge_overrides = tsv_to_dict_of_dicts(env_package_override_file, 'id')

    # Extract biosample contexts
    biosample_contexts_lod = biosamples_lod_context_extractor(
        all_nmdc_biosamples, envo_adapter,
        my_env_pacakge_overrides=env_pacakge_overrides
    )

    # Create the DataFrame
    nmdc_biosample_contexts_frame = pd.DataFrame(biosample_contexts_lod)

    # Print value counts for the 'normalized_env_package' column
    print("\n")
    print("Value counts for normalized_env_package column:")
    print(nmdc_biosample_contexts_frame['normalized_env_package'].value_counts(dropna=False))
    print("\n")

    # Generate package predictions
    package_predictions = predict_from_normalized_env_packages(nmdc_biosample_contexts_frame, envo_adapter)

    # Add predictions to the DataFrame
    nmdc_biosample_contexts_frame['predicted_env_package'] = package_predictions

    # Save the DataFrame to the file
    nmdc_biosample_contexts_frame.to_csv(env_packages_file, sep='\t', index=False)
    print(f"env_package predictions saved to {env_packages_file}")



Loading nmdc_biosample_asserted_normalized_and_inferred_env_package.tsv into nmdc_biosample_contexts_frame...


## env-package prediction complete

To-do: save this and don't recreate it if it's available

Then get it reviewed by other NMDC stakeholders and inject it into MongoDB if approved

## Destructively filter `nmdc_biosample_contexts_frame` by `env_package` 

In [45]:
nmdc_biosample_contexts_frame.shape

(8362, 15)

In [46]:
nmdc_biosample_contexts_frame = nmdc_biosample_contexts_frame[
    nmdc_biosample_contexts_frame['predicted_env_package'] == nmdc_package_selector]

In [47]:
nmdc_biosample_contexts_frame.shape

(1742, 15)

# Long process of predicting OBO foundry CURIes from NCBI Biosamples

## Start by getting unique annotations? Pre-counted by Biosamples

Current task is to provide counts by "study" aka Bioproject in addition to Biosample counts or instead of Biosamples counts if necessary

In [48]:
ncbi_frame = ncbi_conn.execute(ncbi_X_per_annotation_query).fetchdf()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [49]:
ncbi_frame.insert(0, 'serial_number', range(1, len(ncbi_frame) + 1))

In [50]:
# includes values with counts of one... useful for discovering drag-down submissions?

## MIxS and NCBI guidelines imply environmental context slots are multivalued
and that the pipe `|` should be used as a delimiter

there's an envo_count value below that indicates how ofter other delimiters might be used

In [51]:
ncbi_frame['content_list'] = ncbi_frame['content'].str.split('|')

In [52]:
# todo is there any reason to not do this ?
ncbi_frame = ncbi_frame[ncbi_frame['content'].notna() & (ncbi_frame['content'] != '')]

In [53]:
ncbi_frame['content_count'] = ncbi_frame['content_list'].apply(len)

In [54]:
ncbi_frame.shape

(9007, 5)

In [55]:
ncbi_frame = ncbi_frame.explode('content_list').reset_index(drop=True)

In [56]:
ncbi_frame.shape

(9692, 5)

## splitting adds ~ 5% more rows
which might be important since were currently using a longest annotation strategy here

In [57]:
# how many content_list strings contain envo multiple times now?

In [58]:
ncbi_frame['envo_count'] = ncbi_frame['content_list'].str.lower().str.count("envo")

In [59]:
ncbi_frame['envo_count'].value_counts()

envo_count
0    7171
1    2511
2       8
3       2
Name: count, dtype: int64

## If my math is correct, about 0.1% of the annotations still contain multiple CURIes 
after splitting on pipes

There will also be annotations with multiple label-like strings that weren't split because they were not delimited on pipes
That might be a source of lost information since we are using a longest-match annotator here
I.e. there could be annotations with multiple hits worth keeping

## Parsing out CURIEs

this has a few limitations. The function only tries pre-specified prefixes (['ENVO'] by default) and only considers colons and underscores valid delimiters.

In [60]:
ncbi_frame[['extracted_label', 'extracted_curie']] = ncbi_frame['content_list'].apply(parse_curie_label)

In [61]:
parse_failures = ncbi_frame[
    (ncbi_frame['envo_count'] > 0) & (ncbi_frame['extracted_curie'].isna() | (ncbi_frame['extracted_curie'] == ''))]


## In what kinds of cases could no CURIe be parsed
despite the presence of "ENVO" in the content string?

In [62]:
parse_failures

Unnamed: 0,serial_number,content,count,content_list,content_count,envo_count,extracted_label,extracted_curie
970,949,marine photic zoneENVO:00000209,2,marine photic zoneENVO:00000209,1,1,marine photic zoneENVO 00000209,
1904,1810,ENVO:bioreactor,1,ENVO:bioreactor,1,1,ENVO bioreactor,
2422,2281,coastal sea waterENVO:00002150 | marine water ...,1,coastal sea waterENVO:00002150,3,1,coastal sea waterENVO 00002150,
2423,2281,coastal sea waterENVO:00002150 | marine water ...,1,marine water bodyENVO:00001999,3,1,marine water bodyENVO 00001999,
2424,2281,coastal sea waterENVO:00002150 | marine water ...,1,coastal water bodyENVO:02000051,3,1,coastal water bodyENVO 02000051,
3232,3029,well [ENVO00000026]|depth [PATO:0001595],1,well [ENVO00000026],2,1,well [ENVO00000026],
3250,3045,ENVO:aerobic bioreactor,1,ENVO:aerobic bioreactor,1,1,ENVO aerobic bioreactor,
3401,3186,ENVO:water,1,ENVO:water,1,1,ENVO water,
3996,3729,ENVO00012408 Aquifer,1,ENVO00012408 Aquifer,1,1,ENVO00012408 Aquifer,
4306,4010,ENVO: alkaline habitat,1,ENVO: alkaline habitat,1,1,ENVO alkaline habitat,


## Retrieve the labels for the parsed CURIes

In [63]:
ncbi_frame['real_label'] = ncbi_frame['extracted_curie'].apply(envo_adapter.label)

## Apply oaklib annotation to the strings after CURIe removal
Actually the annotator can (sometimes?) detect colon-delimited CURIEs with lower case prefixes

This returns CURIes with evidence but not necessarily the label corresponding to the CURIe

In [64]:
# Specify the lexical index file name
envo_lexical_index_file = "envo_lexical_index.yaml"

# Check if the lexical index file exists
if os.path.exists(envo_lexical_index_file):
    print(f"Loading lexical index from {envo_lexical_index_file}...")
    ix = load_lexical_index(envo_lexical_index_file)
else:
    print(f"{envo_lexical_index_file} not found. Creating lexical index from envo_adapter...")
    # Create the lexical index from envo_adapter
    ix = create_lexical_index(envo_adapter)
    # Save the lexical index to a file
    save_lexical_index(ix, envo_lexical_index_file)
    print(f"Lexical index saved to {envo_lexical_index_file}")

# Initialize the TextAnnotatorInterface
envo_text_annotator_interface = TextAnnotatorInterface()
envo_text_annotator_interface.lexical_index = ix

# this cell only takes ~ 1 minute, but generates a lot of "ERRORS" and WARNINGS in a red font
#   while lexically indexing the ontology

Loading lexical index from envo_lexical_index.yaml...


In [65]:
# Apply the annotation function to each row in the 'label' column
ncbi_frame['longest_annotation_curie'] = ncbi_frame['extracted_label'].apply(
    lambda x: get_longest_annotation_curie(x, envo_text_annotator_interface, MIN_ANNOTATION_LEN))

# ~ 1 minute


## Add the labels for the CURIes identified though oaklib annotation of strings

In [66]:
ncbi_frame['longest_annotation_label'] = ncbi_frame['longest_annotation_curie'].apply(envo_adapter.label)

In [67]:
ncbi_frame

Unnamed: 0,serial_number,content,count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label
0,1,not applicable,271,not applicable,1,0,not applicable,,,,
1,2,missing,165,missing,1,0,missing,,,,
2,3,lake,136,lake,1,0,lake,,,ENVO:00000020,lake
3,4,middle and lower reaches,100,middle and lower reaches,1,0,middle and lower reaches,,,ENVO:00000029,watercourse
4,5,shoreline,82,shoreline,1,0,shoreline,,,ENVO:00000486,shoreline
...,...,...,...,...,...,...,...,...,...,...,...
9687,9003,Lake beach sand_S2.63.R2,1,Lake beach sand_S2.63.R2,1,0,Lake beach sand_S2.63.R2,,,ENVO:00002138,beach sand
9688,9004,Lake beach sand_S3.63.R3.2,1,Lake beach sand_S3.63.R3.2,1,0,Lake beach sand_S3.63.R3.2,,,ENVO:00002138,beach sand
9689,9005,Lake beach sand_S3.N3.R1,1,Lake beach sand_S3.N3.R1,1,0,Lake beach sand_S3.N3.R1,,,ENVO:00002138,beach sand
9690,9006,LS-DP-27JUN19,1,LS-DP-27JUN19,1,0,LS-DP-27JUN19,,,,


## we now have a list of CURIes for each normalized annotation

This could be because the submitter provided a CURIe and a label that don't match
*One* case of this is dragging a CURIe down a column in a spreadsheet, expecting it to be copied,
but actually auto-incrementing it

Now attempt to find one best CURIe for each annotation... by now we have lost the ability to retain multiple legitimate
but improperly separated CURIes

In [68]:
# todo don't accept extracted curie if no real label?
# any kind of string similarity checking for label of annotated curie vs extracted label ?
# look for long stretches of curies?
# can we measure the beneficial impact of any of this? current crux: how to distribute counts

ncbi_frame['curie_list'] = ncbi_frame.apply(
    lambda my_row: list({my_row['extracted_curie'], my_row['longest_annotation_curie']} - {None}),
    axis=1
)

ncbi_frame['unique_curie_count'] = ncbi_frame['curie_list'].apply(len)

In [69]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    5955
0    2679
2    1058
Name: count, dtype: int64

In [70]:
double_curie_frame = ncbi_frame[ncbi_frame['unique_curie_count'] > 1]

In [71]:
double_curie_frame = double_curie_frame[['extracted_curie', 'longest_annotation_curie']]

In [72]:
double_curie_frame = double_curie_frame.drop_duplicates()

In [73]:
double_curie_frame[['extracted_prefix', 'extracted_local_id']] = double_curie_frame['extracted_curie'].str.split(':', expand=True)

In [74]:
double_curie_frame['extracted_local_id_int'] = pd.to_numeric(double_curie_frame['extracted_local_id'], errors='coerce').astype('Int64')

In [75]:
double_curie_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,extracted_prefix,extracted_local_id,extracted_local_id_int
236,ENVO:00002006,CHEBI:15377,ENVO,00002006,2006
457,ENVO:00002150,ENVO:01000007,ENVO,00002150,2150
482,ENVO:0001,ENVO:01001110,ENVO,0001,1
579,ENVO:00000213,ENVO:01000065,ENVO,00000213,213
635,ENVO:00000489,ENVO:00000486,ENVO,00000489,489
...,...,...,...,...,...
9513,ENVO:00001070,ENVO:00000486,ENVO,00001070,1070
9514,ENVO:00001139,ENVO:00000486,ENVO,00001139,1139
9573,ENVO:00002201,ENVO:00002131,ENVO,00002201,2201
9667,ENVO:00002181,ENVO:00002131,ENVO,00002181,2181


In [76]:
# Ensure extracted_local_id_int is unique and sorted
unique_sorted_series = double_curie_frame['extracted_local_id_int'].dropna().drop_duplicates().sort_values()


In [77]:
# Find stretches
stretches_dict = find_consecutive_stretches_dict(unique_sorted_series)


In [78]:
# Convert the stretches dictionary into a DataFrame
stretches_df = stretches_dict_to_long_dataframe(stretches_dict)

`stretches_df` shows groups of extracted EnvoO ids (CURIes without prefix or padding zeros) that share a common CURIe by oaklib annotation of the textual part. This may not be the best or only way to address these spurious drag-stretch, auto-incremented CURIes

Ie 1001458 corresponds to ENVO:01001458, 'mist'

_although it theoretically could have been ENVO:1001458 since EnvO CURIes can have 7 or 8 digits_

In group 9, there are another ~ 50 sequential id values, all corresponding to environmental context annotations whose best oak-annotated class is ENVO:01001803, 'tropical forest'!

How much of an impact does this have? 

In [79]:
stretches_df

Unnamed: 0,stretch_id,value
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
926,7,2018051508
927,7,2018051509
928,7,2018051510
929,7,2018051511


In [80]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretches_df,
    left_on='extracted_local_id_int',
    right_on='value',
    how='left'
)

In [81]:
stretch_summary_df = summarize_stretch_groups(double_curie_frame)


For stretch 9, which included extracted CURIes from ENVO:01001458 to ENVO:01001511, the oaklib test annotation of 100% of the submitted environmental context annotations was ENVO:01001803, so we will keep that and disregard all of the CURIes from the stretch


In [82]:
stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:00000486,0.821429
1,2.0,ENVO:00000486,0.988042
2,3.0,ENVO:00002001,0.685393
3,4.0,ENVO:02000049,1.0
4,5.0,ENVO:01000341,1.0
5,6.0,ENVO:01000341,1.0
6,7.0,ENVO:01000341,1.0


In [83]:
decisive_fraction_threshold = 0.9

In [84]:
decisive_stretch_summary_df = stretch_summary_df[stretch_summary_df['fraction'] >= decisive_fraction_threshold]

In [85]:
decisive_stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
1,2.0,ENVO:00000486,0.988042
3,4.0,ENVO:02000049,1.0
4,5.0,ENVO:01000341,1.0
5,6.0,ENVO:01000341,1.0
6,7.0,ENVO:01000341,1.0


In [86]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    decisive_stretch_summary_df,
    left_on='stretch_id',
    right_on='stretch_id',
    how='left'
)

In [87]:
double_curie_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,extracted_prefix,extracted_local_id,extracted_local_id_int,stretch_id,value,most_common_longest_annotation_curie,fraction
0,ENVO:00002006,CHEBI:15377,ENVO,00002006,2006,3.0,2006.0,,
1,ENVO:00002150,ENVO:01000007,ENVO,00002150,2150,3.0,2150.0,,
2,ENVO:0001,ENVO:01001110,ENVO,0001,1,1.0,1.0,,
3,ENVO:00000213,ENVO:01000065,ENVO,00000213,213,,,,
4,ENVO:00000489,ENVO:00000486,ENVO,00000489,489,2.0,489.0,ENVO:00000486,0.988042
...,...,...,...,...,...,...,...,...,...
1040,ENVO:00001070,ENVO:00000486,ENVO,00001070,1070,2.0,1070.0,ENVO:00000486,0.988042
1041,ENVO:00001139,ENVO:00000486,ENVO,00001139,1139,2.0,1139.0,ENVO:00000486,0.988042
1042,ENVO:00002201,ENVO:00002131,ENVO,00002201,2201,3.0,2201.0,,
1043,ENVO:00002181,ENVO:00002131,ENVO,00002181,2181,3.0,2181.0,,


In [88]:
drag_evidence_frame = double_curie_frame[double_curie_frame['stretch_id'] >= 1]
drag_evidence_frame = drag_evidence_frame[['extracted_curie', 'longest_annotation_curie']]
drag_evidence_frame['drag_evidence'] = True

In [89]:
drag_evidence_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,drag_evidence
0,ENVO:00002006,CHEBI:15377,True
1,ENVO:00002150,ENVO:01000007,True
2,ENVO:0001,ENVO:01001110,True
4,ENVO:00000489,ENVO:00000486,True
5,ENVO:00000488,ENVO:00000486,True
...,...,...,...
1040,ENVO:00001070,ENVO:00000486,True
1041,ENVO:00001139,ENVO:00000486,True
1042,ENVO:00002201,ENVO:00002131,True
1043,ENVO:00002181,ENVO:00002131,True


In [90]:
ncbi_frame = ncbi_frame.merge(
    drag_evidence_frame,
    left_on=['extracted_curie', 'longest_annotation_curie'],
    right_on=['extracted_curie', 'longest_annotation_curie'],
    how='left'
)

In [91]:
# Initialize dragless_curie_list with curie_list values
ncbi_frame["dragless_curie_list"] = ncbi_frame["curie_list"]

# Update dragless_curie_list based on the condition
for index, row in ncbi_frame.iterrows():
    if row["drag_evidence"] is True:
        if row["longest_annotation_curie"] is not None:
            ncbi_frame.at[index, "dragless_curie_list"] = [row["longest_annotation_curie"]]
        else:
            ncbi_frame.at[index, "dragless_curie_list"] = []

ncbi_frame['dragless_curie_count'] = ncbi_frame['dragless_curie_list'].apply(len)

In [92]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    5955
0    2679
2    1058
Name: count, dtype: int64

In [93]:
ncbi_frame['dragless_curie_count'].value_counts()

dragless_curie_count
1    6965
0    2679
2      48
Name: count, dtype: int64

## The extent of multiple detected CURIes has been reduced ~ 4.5 fold 
(for soil env_local_scale)

Isolate the submitter annotations for which there's clearly one best CURIe after removing the drag-stretches

In [94]:
ncbi_frame.shape

(9692, 16)

In [95]:
ncbi_frame_undisputed = ncbi_frame[ncbi_frame['dragless_curie_count'] <= 1]

In [96]:
ncbi_frame_undisputed.shape

(9644, 16)

In [97]:
ncbi_frame_disputed = ncbi_frame[ncbi_frame['dragless_curie_count'] > 1]

In [98]:
ncbi_frame_disputed.shape

(48, 16)

In [99]:
ncbi_frame_disputed = ncbi_frame_disputed.explode("dragless_curie_list", ignore_index=True)


In [100]:
ncbi_frame_disputed.shape

(96, 16)

In [101]:
ncbi_frame_disputed["dragless_curie_list"] = ncbi_frame_disputed["dragless_curie_list"].apply(lambda x: [x])

## Just include all of the remaining disputed CURIe assignments

In [102]:
# Combine the rows of ncbi_frame_undisputed and ncbi_frame_disputed into a new DataFrame
ncbi_disputes_exploded_frame = pd.concat([ncbi_frame_undisputed, ncbi_frame_disputed], ignore_index=True)


In [103]:
ncbi_disputes_exploded_frame.shape

(9740, 16)

In [104]:
ncbi_disputes_exploded_frame

Unnamed: 0,serial_number,content,count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count,drag_evidence,dragless_curie_list,dragless_curie_count
0,1,not applicable,271,not applicable,1,0,not applicable,,,,,[],0,,[],0
1,2,missing,165,missing,1,0,missing,,,,,[],0,,[],0
2,3,lake,136,lake,1,0,lake,,,ENVO:00000020,lake,[ENVO:00000020],1,,[ENVO:00000020],1
3,4,middle and lower reaches,100,middle and lower reaches,1,0,middle and lower reaches,,,ENVO:00000029,watercourse,[ENVO:00000029],1,,[ENVO:00000029],1
4,5,shoreline,82,shoreline,1,0,shoreline,,,ENVO:00000486,shoreline,[ENVO:00000486],1,,[ENVO:00000486],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9735,8284,sludge [ENVO:00003043],1,sludge [ENVO:00003043],1,1,sludge,ENVO:00003043,sewage plant,ENVO:00002044,sludge,"[ENVO:00002044, ENVO:00003043]",2,,[ENVO:00003043],2
9736,8590,reservoir [ENVO:00000377],1,reservoir [ENVO:00000377],1,1,reservoir,ENVO:00000377,artificial lake,ENVO:00000025,reservoir,"[ENVO:00000377, ENVO:00000025]",2,,[ENVO:00000377],2
9737,8590,reservoir [ENVO:00000377],1,reservoir [ENVO:00000377],1,1,reservoir,ENVO:00000377,artificial lake,ENVO:00000025,reservoir,"[ENVO:00000377, ENVO:00000025]",2,,[ENVO:00000025],2
9738,8697,River Freshwater [ENVO:01000297],1,River Freshwater [ENVO:01000297],1,1,River Freshwater,ENVO:01000297,freshwater river,ENVO:00002011,fresh water,"[ENVO:01000297, ENVO:00002011]",2,,[ENVO:01000297],2


In [105]:
ncbi_disputes_exploded_frame['post_explode_curie_count'] = ncbi_disputes_exploded_frame['dragless_curie_list'].apply(len)

In [106]:
ncbi_disputes_exploded_frame['post_explode_curie_count'].value_counts()

post_explode_curie_count
1    7061
0    2679
Name: count, dtype: int64

In [107]:
# Set 'post_explode_curie' to the 0th item in 'dragless_curie_list'
ncbi_disputes_exploded_frame["post_explode_curie"] = ncbi_disputes_exploded_frame["dragless_curie_list"].apply(
    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
)

In [108]:

ncbi_biosample_scoped_counts = (
    ncbi_disputes_exploded_frame.groupby("post_explode_curie")["count"].sum().reset_index()
)

# todo parameterize the column name based on what was really counted, generally biosamples or bioprojects
ncbi_biosample_scoped_counts.columns = ['curie', 'ncbi_bioporjects_count']

This is currently a count of Biosamples for which the indicated CURIes can be extracted or inferred by oaklib annotation, after removal of drag-stretch, auto-incremented CURIes

In [109]:
ncbi_biosample_scoped_counts

Unnamed: 0,curie,ncbi_bioporjects_count
0,BFO:0000015,4
1,BFO:0000029,108
2,BFO:0000030,1
3,CHEBI:15377,391
4,CHEBI:15379,18
...,...,...
1135,UBERON:0000912,1
1136,UBERON:0000955,1
1137,UBERON:0001088,1
1138,UBERON:0001769,1


## GOLDTERMS mappings only approach

we're currently including
- mappings only

and have retired the previous use of
- mappings in hybrid with biosample counts

And we're casting a wide net, especially for the hybrid approach
- searching for 'soil', 'sediment' etc. in GOLDTERMS labels without anchoring them like 'Environmental > Aquatic > Sediment'
- retrieving the CURIes for env_broad_scale, env_local_scale and env_medium for all voting sheets, and trusting orthogonal filtering to remove the inappropriate CURIes

Should we now add (or switch to) direct biosample counts of GOLD "envo" annotations?

Efficient retrieval of  all GOLD data in a given scope isn't easy

In [110]:
# Determine the filenames and target directory
goldterms_compressed_filename = urlparse(goldterms_semsql_url).path.split('/')[-1]
goldterms_filename = os.path.splitext(goldterms_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

# Print to confirm the filenames
print(goldterms_filename)

goldterms.db


In [111]:
# Fetch the contents from the URL and save compressed file in target directory
goldterms_response = requests.get(goldterms_semsql_url)
goldterms_compressed_file_path = os.path.join(target_dir, goldterms_compressed_filename)
with open(goldterms_compressed_file_path, "wb") as f:
    f.write(goldterms_response.content)

# ~ 1 second

In [112]:
# Unzip the compressed file and save the extracted file in target directory
goldterms_uncompressed_file_path = os.path.join(target_dir, goldterms_filename)
with gzip.open(goldterms_compressed_file_path, "rb") as f_in:
    with open(goldterms_uncompressed_file_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# ~ 1 second

In [113]:
## that's all fast. don't bother caching

In [114]:
goldterms_conn = sqlite3.connect(goldterms_uncompressed_file_path)

In [115]:
goldterms_result = pd.read_sql_query(extension_query, goldterms_conn)

In [116]:
goldterms_result

Unnamed: 0,subject,predicate,content
0,GOLDTERMS:Environmental-Aquatic-Floodplain,mixs:env_broad,ENVO:00002030
1,GOLDTERMS:Environmental-Aquatic-Floodplain,mixs:env_local,ENVO:00000255
2,GOLDTERMS:Environmental-Aquatic-Floodplain,rdfs:label,Environmental > Aquatic > Floodplain
3,GOLDTERMS:Environmental-Aquatic-Freshwater-Aqu...,mixs:env_broad,ENVO:00002030
4,GOLDTERMS:Environmental-Aquatic-Freshwater-Aqu...,mixs:env_medium,ENVO:00002011
...,...,...,...
570,GOLDTERMS:5807,rdfs:label,Environmental > Aquatic > Non-marine Saline an...
571,GOLDTERMS:5825,mixs:env_broad,ENVO:00000447
572,GOLDTERMS:5825,mixs:env_local,ENVO:01000687
573,GOLDTERMS:5825,rdfs:label,Environmental > Aquatic > Marine > Coastal > P...


In [117]:
# # todo: save this kind of content before subsetting on an environment
# #   the subsetting is currently baked into the query
# 
# # see also goldterms_queries.ipynb in MAM's Collab
# goldterms_result.to_csv("goldterms_single_environment_mappings_long.tsv", sep="\t", index=False)

In [118]:
goldterms_only_curies = goldterms_result.loc[goldterms_result['predicate'].isin(gold_context_selectors), 'content']


In [119]:
goldterms_only_curies = goldterms_only_curies.unique().tolist()

In [120]:
# goldterms_only_curies

# Make lists of CURIEs
which will determine
- the rows in the table
- the boolean filter columns

In [121]:
anchor_curies = list(anchor_descendants_frame['curie'])
legacy_pv_curies = [i['curie'] for i in pv_validation_results['valids']]

biome_curies = list(envo_adapter.descendants(BIOME, predicates=[IS_A])) # 
terrestrial_biome_curies = list(envo_adapter.descendants(TERRESTRIAL_BIOME, predicates=[IS_A]))
aquatic_biome_curies = list(envo_adapter.descendants(AQUATIC_BIOME, predicates=[IS_A]))
abp_curies = list(envo_adapter.descendants(ABP, predicates=[IS_A]))
env_sys_curies = list(envo_adapter.descendants(ENVIRONMENTAL_SYSTEM, predicates=[IS_A]))
env_mat_curies = list(envo_adapter.descendants(ENVIRONMENTAL_MATERIAL, predicates=[IS_A]))
obsoletes_curies = list(envo_adapter.obsoletes())

soil_curies = list(envo_adapter.descendants(SOIL, predicates=[IS_A])) # 
liquid_water_curies = list(envo_adapter.descendants(LIQUID_WATER, predicates=[IS_A])) # 
water_ice_curies = list(envo_adapter.descendants(WATER_ICE, predicates=[IS_A])) # 

human_construction_curies = list(envo_adapter.descendants(HUMAN_CONSTRUCTION, predicates=[IS_A])) #
building_curies = list(envo_adapter.descendants(BUILDING, predicates=[IS_A])) #
building_part_curies = list(envo_adapter.descendants(BUILDING_PART, predicates=[IS_A])) #


In [122]:
# Specify an output file name
nlcd2011_class_iris_file = "nlcd2011_class_iris.txt"

nlcd_subset_textual_representation = "nlcd2011"

in_subset_curie = "oio:inSubset"


# Initialize nlcd_classes
nlcd_classes = []

if os.path.exists(nlcd2011_class_iris_file):
    # Load the list from the file if it exists
    print(f"Loading {nlcd_subset_textual_representation} classes from {nlcd2011_class_iris_file}...")
    with open(nlcd2011_class_iris_file, "r") as file:
        nlcd_classes = [line.strip() for line in file.readlines()]
else:
    # File doesn't exist; generate the list using the loop
    print(f"{nlcd2011_class_iris_file} not found. Identifying classes in {nlcd_subset_textual_representation} subset (~2 minutes)...")

    # Retrieve all classes
    entities = envo_adapter.entities()

    # super slow 2 minutes
    # but retrieving classes by named subset seems to crash on EnvO with its textual subsets?
    for entity in entities:
        term_metadata = envo_adapter.entity_metadata_map(entity)
        if in_subset_curie in term_metadata:
            subsets = term_metadata[in_subset_curie]
            if nlcd_subset_textual_representation in subsets:
                nlcd_classes.append(entity)

    # Save the generated list to the file
    with open(nlcd2011_class_iris_file, "w") as file:
        for string in nlcd_classes:
            file.write(string + "\n")

    print(f"List saved to {nlcd2011_class_iris_file}")

# At this point, nlcd_classes contains the desired list
print(f"Total {nlcd_subset_textual_representation} classes loaded: {len(nlcd_classes)}")


Loading nlcd2011 classes from nlcd2011_class_iris.txt...
Total nlcd2011 classes loaded: 22


## Bootstrap the rows

In [123]:
include_in_rows = set()

In [124]:
include_in_rows.update(anchor_curies)

In [125]:
include_in_rows.update(legacy_pv_curies)

In [126]:
include_in_rows.update(nmdc_biosample_contexts_frame[nmdc_context_selector])

In [127]:
include_in_rows.update(ncbi_frame['extracted_curie'])

In [128]:
include_in_rows.update(ncbi_frame['longest_annotation_curie'])

In [129]:
include_in_rows.update(goldterms_only_curies)

In [130]:
include_in_rows.update(nlcd_classes)

In [131]:
rows_lod = []

# Voting sheet rows and boolean columns

In [132]:
for curie in include_in_rows:
    if curie is None:
        continue

    # ONCE AGAIN, assuming that EnvO is the only ontology we'll check against
    current_ancestors = [
        list(ancestor.keys())[0] for ancestor in envo_ancs_cache.get(curie, [])
    ]
    ancestors_in_enum_count = len(set(current_ancestors) & set(legacy_pv_curies))

    current_descendants = [
        list(descendant.keys())[0] for descendant in envo_descs_cache.get(curie, [])
    ]
    descendants_in_enum_count = len(set(current_descendants) & set(legacy_pv_curies))

    row = {
        'curie': curie,
        'label': envo_adapter.label(curie),
        'envo_native': False,
        'obsolete': False,
        comparison_enum_column_name: False,
        'ancestors_in_enum_count': ancestors_in_enum_count,
        'descendants_in_enum_count': descendants_in_enum_count,
        'nlcd_class': False,
        'abp': False,
        'env_sys': False,
        'biome': False,
        'terrestrial_biome': False,
        'aquatic_biome': False,
        'env_mat': False,
        'soil': False,
        'liquid water': False,
        'water ice': False,
        'human_construction': False,
        'building': False,
        'building_part': False,
        'goldterms_mappings': False,
    }

    if curie in biome_curies:
        row['biome'] = True
    if curie in terrestrial_biome_curies:
        row['terrestrial_biome'] = True
    if curie in aquatic_biome_curies:
        row['aquatic_biome'] = True
    if curie in abp_curies:
        row['abp'] = True
    if curie in env_sys_curies:
        row['env_sys'] = True
    if curie in env_mat_curies:
        row['env_mat'] = True
    if curie in soil_curies:
        row['soil'] = True
    if curie in liquid_water_curies:
        row['liquid water'] = True
    if curie in water_ice_curies:
        row['water ice'] = True
    if curie in human_construction_curies:
        row['human_construction'] = True
    if curie in building_curies:
        row['building'] = True
    if curie in building_part_curies:
        row['building_part'] = True
    if curie in legacy_pv_curies:
        row[comparison_enum_column_name] = True
    if curie in obsoletes_curies:
        row['obsolete'] = True
    if curie in goldterms_only_curies:
        row['goldterms_mappings'] = True
    if curie in nlcd_classes:
        row['nlcd_class'] = True

    try:
        prefix, local_id = curie.split(':')
        if prefix and prefix == 'ENVO' and row['label'] is not None:
            row['envo_native'] = True
    except Exception as e:
        # Print the exception message
        print(f"An error occurred: {e} trying to split {curie}")

    rows_lod.append(row)

# 2 minutes


# ^ Voting sheet rows and boolean columns

In [133]:
rows_frame = pd.DataFrame(rows_lod)

In [134]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,no_comparison_enum,ancestors_in_enum_count,descendants_in_enum_count,nlcd_class,abp,env_sys,...,terrestrial_biome,aquatic_biome,env_mat,soil,liquid water,water ice,human_construction,building,building_part,goldterms_mappings
0,ENVO:00000770,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,ENVO:00001027,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,ENVO:03510029,primer (paint),True,False,False,0,0,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,ENVO:03600087,greenhouse,True,False,False,0,0,False,True,False,...,False,False,False,False,False,False,True,False,False,False
4,ENVO:02000036,saliva material,True,False,False,0,0,False,False,False,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2873,ENVO:00003323,tannery,True,False,False,0,0,False,True,False,...,False,False,False,False,False,False,True,True,False,False
2874,ENVO:00000745,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2875,ENVO:00000026,well,True,False,False,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2876,ENVO:00000872,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Merge in NMDC counts

In [135]:
nmdc_biosample_scoped_counts = nmdc_biosample_contexts_frame[nmdc_context_selector].value_counts().reset_index()
# todo parameterize the column name based on what what really counter (generally biosamples or studies)
nmdc_biosample_scoped_counts.columns = ['curie', 'nmdc_biosamples_count']


In [136]:
nmdc_biosample_scoped_counts

Unnamed: 0,curie,nmdc_biosamples_count
0,ENVO:04000007,560
1,ENVO:03605001,371
2,ENVO:00002042,287
3,ENVO:00002007,165
4,ENVO:03605006,104
5,ENVO:03605004,100
6,ENVO:01000017,52
7,ENVO:03605005,37
8,ENVO:03605002,27
9,ENVO:03605003,25


In [137]:
# Perform the left merge
rows_frame = rows_frame.merge(
    nmdc_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [138]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,no_comparison_enum,ancestors_in_enum_count,descendants_in_enum_count,nlcd_class,abp,env_sys,...,aquatic_biome,env_mat,soil,liquid water,water ice,human_construction,building,building_part,goldterms_mappings,nmdc_biosamples_count
0,ENVO:00000770,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,
1,ENVO:00001027,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,
2,ENVO:03510029,primer (paint),True,False,False,0,0,False,False,False,...,False,True,False,False,False,False,False,False,False,
3,ENVO:03600087,greenhouse,True,False,False,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,
4,ENVO:02000036,saliva material,True,False,False,0,0,False,False,False,...,False,True,False,False,False,False,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2873,ENVO:00003323,tannery,True,False,False,0,0,False,True,False,...,False,False,False,False,False,True,True,False,False,
2874,ENVO:00000745,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,
2875,ENVO:00000026,well,True,False,False,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,
2876,ENVO:00000872,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,


In [139]:
# gold and ncbi counts are slightly trickier
# for gold: including mappings only, mappings in hybrid with biosample counts. 
#    Switch to direct biosample counts of GOLD "envo" annotations?
# ncbi: we have extracted curies and annotated curies

## Merge in NCBI counts

In [140]:
# Perform the left merge
rows_frame = rows_frame.merge(
    ncbi_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [141]:
# 990 rows in https://docs.google.com/spreadsheets/d/12WH3eduBq2qSTy9zVF3n7fyajn6ssLZL/edit?gid=546570706#gid=546570706

In [142]:
rows_frame.to_csv(output_file_name, sep="\t", index=False)

In [143]:
ncbi_conn.close()