In [139]:
from common import *

import gzip
import os
import shutil
import sqlite3
from urllib.parse import urlparse

import duckdb
import pandas as pd
import requests

from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from oaklib.utilities.lexical.lexical_indexer import create_lexical_index, save_lexical_index, load_lexical_index
from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface

from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import yaml
import json


In [2]:
print("verify output is being rendered")

verify output is being rendered


In [3]:
# Initialize cache dictionaries for predict_from_normalized_env_packages
# todo how to move the definitions for function that use these globals? Or just use caching around the function?
ancestor_cache = {}
descendant_cache = {}

In [4]:
# todo this on-demand NCBI curie extraction and annotation recapitulates work that is being added to
# https://portal.nersc.gov/project/m3408/biosamples_duckdb/
# via ???
#   although that doesn't detect auto-incremented curies from  spreadsheet dragging

# todo eventually, dig up a complete JSON gold biosample dump for non-hybrid biosample counts

# todo make it clearer whether biosamples or studies are being counted
#   count nmdc or gold STUDIES too?

# Isolated Task Settings
**Pre-assembled settings block below are preferred**

_For making a soil env_broad_scale voting sheet vs a sediment env_local_scale sheet, etc._

todo: bundle these into dicts so they don't have to be modified independently and kept in sync with one another.

In [None]:
output_file_name = "voting_sheets_output/sediment_env_local_scale_voting_sheet.tsv"

In [None]:
# semantic_anchor = 'ENVO:00000428' # biome for env_broad_scale
semantic_anchor = 'ENVO:01000813' # astronomical body part "abp" for env_local_scale
# semantic_anchor = 'ENVO:00010483' # environmental material for env_medium

## context selectors

In [None]:
gold_context_selectors =  [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]


In [None]:
# ncbi_context_selector = 'env_broad_scale'
ncbi_context_selector = 'env_local_scale'
# ncbi_context_selector = 'env_medium'

In [None]:
# nmdc_context_selector= 'env_broad_scale_id'
nmdc_context_selector= 'env_local_scale_id'
# nmdc_context_selector= 'env_medium_id'

## package aka environment aka extension selectors

In [None]:
# plant_first_where = "s1.value like 'host-associated > plants%'"
# sediment_first_where = "lower(s1.value) like 'environmental > aquatic%sediment%'"
# soil_first_where = "s1.value like 'environmental > terrestrial > soil%'"
# water_first_where = "s1.value like 'environmental > aquatic%' and lower(s1.value) not like '%sediment%'"

plant_first_where = "lower(s1.value) like '%plant%'" # picks up waste water treatment plant
sediment_first_where = "lower(s1.value) like '%sediment%'"
soil_first_where = "lower(s1.value) like '%soil%'"
water_first_where = "lower(s1.value) like '%aquatic%' and lower(s1.value) not like '%sediment%'"

In [None]:
gold_first_where = sediment_first_where

In [None]:
# todo new since soil: why are we only considering MIMS.me for discovering appropriate env triad values?
#   there's usually a roughly equal number of biosamples from in each extension for MIMS.me and 

# ncbi_package_selector = 'plant-associated.6.0'
ncbi_package_selector = 'sediment.6.0'
# ncbi_package_selector = 'soil.6.0'
# ncbi_package_selector = 'water.6.0'

In [None]:
# nmdc_package_selector = 'plant-associated'
nmdc_package_selector = 'sediment'
# nmdc_package_selector = 'soil'
# nmdc_package_selector = 'water'


In [None]:
GOLDTERMS_NA = '' # ???

GOLDTERMS_PLANT_ASSOCIATED = GOLDTERMS_NA # host associated -> viridiplantae? take a string approach!
GOLDTERMS_SEDIMENT = 'GOLDTERMS:3985' #  doesn't have any subclasses
GOLDTERMS_SOIL = 'GOLDTERMS:4212'
GOLDTERMS_WATER = 'GOLDTERMS:3984'

# GOLDTERMS:4180, 'Environmental > Aquatic > Freshwater > Pond > Sediment' and ~64 more don't share a common root
# poetry run runoak -i sqlite:obo:goldterms info 't~sediment'


In [None]:
goldterms_root = GOLDTERMS_NA

## selecting name and version of one enum for comparison

In [None]:
# only the Soil enums have legacy definitions (v10.7 and earlier?)

# CONTEXT_ENUM = "EnvBroadScaleSoilEnum"
CONTEXT_ENUM = "EnvLocalScaleSoilEnum"
# CONTEXT_ENUM = "EnvMediumSoilEnum"

# CONTEXT_ENUM = ""

In [None]:
# todo: add columns for membership in multiple enums from multiple version of the schema?
#  like sediment local vs soil local and water local (once that's completed)
#  get them from schema files or something prior to that? sems like the voting sheets are too raw/preliminary for that
#   can use a more recent schema url for more recent enums!

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

# previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

In [None]:
# todo: don't call the column "legacy_pv". use the name of the enum and the version of the schema?

comparison_enum_column_name = 'EnvLocalSoilEnum_10_7'
# comparison_enum_column_name = 'EnvLocalScaleSoilEnum_11_1'
# comparison_enum_column_name = 'no_comparison_enum'

# Pre-compiled settings
Do not "run all cells below" from any one of these pre-compiled blocks. 
Run one configuration block, and then proceed to "Additional Settings"

## plant-associated env_broad_scale

In [5]:
output_file_name = "voting_sheets_output/plant_associated_env_broad_scale_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:00000428' # biome for env_broad_scale

gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_broad_scale'

nmdc_context_selector= 'env_broad_scale_id'

## package aka environment aka extension selectors
plant_first_where = "lower(s1.value) like '%plant%'"  # picks up waste water treatment plant
gold_first_where = plant_first_where

ncbi_package_selector = 'plant-associated.6.0'

nmdc_package_selector = 'plant-associated'

CONTEXT_ENUM = "EnvBroadScaleSoilEnum"

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'EnvBroadScaleSoilEnum_11_1'


## sediment env_broad_scale

In [None]:
output_file_name = "voting_sheets_output/sediment_env_broad_scale_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:00000428' # biome for env_broad_scale

gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_broad_scale'

nmdc_context_selector= 'env_broad_scale_id'

## package aka environment aka extension selectors
sediment_first_where = "lower(s1.value) like '%sediment%'"
gold_first_where = sediment_first_where

ncbi_package_selector = 'sediment.6.0'

nmdc_package_selector = 'sediment'

CONTEXT_ENUM = "EnvBroadScaleSoilEnum"

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'EnvBroadScaleSoilEnum_11_1'


## soil env_broad_scale

In [None]:
output_file_name = "voting_sheets_output/soil_env_broad_scale_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:00000428' # biome for env_broad_scale

gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_broad_scale'

nmdc_context_selector= 'env_broad_scale_id'

## package aka environment aka extension selectors
soil_first_where = "lower(s1.value) like '%soil%'"
gold_first_where = soil_first_where

ncbi_package_selector = 'soil.6.0'

nmdc_package_selector = 'soil'

CONTEXT_ENUM = "EnvBroadScaleSoilEnum"

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'EnvBroadScaleSoilEnum_10_7'


## plant-associated env_local_scale

In [None]:
output_file_name = "voting_sheets_output/plant_associated_env_local_scale_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:01000813' # astronomical body part "abp" for env_local_scale

gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_local_scale'

nmdc_context_selector= 'env_local_scale_id'

## package aka environment aka extension selectors
plant_first_where = "lower(s1.value) like '%plant%'"  # picks up waste water treatment plant
gold_first_where = plant_first_where

ncbi_package_selector = 'plant-associated.6.0'

nmdc_package_selector = 'plant-associated'

CONTEXT_ENUM = "EnvLocalScaleSoilEnum"

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'EnvLocalScaleSoilEnum_11_1'


## sediment env_local_scale

In [None]:
output_file_name = "voting_sheets_output/sediment_env_local_scale_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:01000813' # astronomical body part "abp" for env_local_scale

gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_local_scale'

nmdc_context_selector= 'env_local_scale_id'

## package aka environment aka extension selectors
sediment_first_where = "lower(s1.value) like '%sediment%'"
gold_first_where = sediment_first_where

ncbi_package_selector = 'sediment.6.0'

nmdc_package_selector = 'sediment'

CONTEXT_ENUM = "EnvLocalScaleSoilEnum"

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'EnvLocalScaleSoilEnum_11_1'


## soil env_local_scale

In [None]:
output_file_name = "voting_sheets_output/soil_env_local_scale_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:01000813' # astronomical body part "abp" for env_local_scale

gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_local_scale'

nmdc_context_selector= 'env_local_scale_id'

## package aka environment aka extension selectors
soil_first_where = "lower(s1.value) like '%soil%'"
gold_first_where = soil_first_where

ncbi_package_selector = 'soil.6.0'

nmdc_package_selector = 'soil'

CONTEXT_ENUM = "EnvLocalScaleSoilEnum"

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'EnvLocalScaleSoilEnum_10_7'


## plant-associated env_medium

In [None]:
output_file_name = "voting_sheets_output/plant_associated_env_medium_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:00010483' # environmental material for env_medium

## context selectors
gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_medium'

nmdc_context_selector= 'env_medium_id'

## package aka environment aka extension selectors
plant_first_where = "lower(s1.value) like '%plant%'"  # picks up waste water treatment plant
gold_first_where = plant_first_where

ncbi_package_selector = 'plant-associated.6.0'

nmdc_package_selector = 'plant-associated'

CONTEXT_ENUM = "EnvMediumSoilEnum"

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'EnvMediumSoilEnum_11_1'


## sediment env_medium

In [None]:
output_file_name = "voting_sheets_output/sediment_env_medium_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:00010483' # environmental material for env_medium

## context selectors
gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_medium'

nmdc_context_selector= 'env_medium_id'

## package aka environment aka extension selectors
sediment_first_where = "lower(s1.value) like '%sediment%'"
gold_first_where = sediment_first_where

ncbi_package_selector = 'sediment.6.0'

nmdc_package_selector = 'sediment'

CONTEXT_ENUM = "EnvMediumSoilEnum"

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'EnvMediumSoilEnum_11_1'


## soil env_medium

In [None]:
output_file_name = "voting_sheets_output/soil_env_medium_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:00010483' # environmental material for env_medium

## context selectors
gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_medium'

nmdc_context_selector= 'env_medium_id'

## package aka environment aka extension selectors
soil_first_where = "lower(s1.value) like '%soil%'"
gold_first_where = soil_first_where

ncbi_package_selector = 'soil.6.0'

nmdc_package_selector = 'soil'

CONTEXT_ENUM = "EnvMediumSoilEnum"

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'EnvMediumSoilEnum_10_7'


# water env_broad_scale

In [None]:
output_file_name = "voting_sheets_output/water_env_broad_scale_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:00000428' # biome for env_broad_scale

gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_broad_scale'

nmdc_context_selector= 'env_broad_scale_id'

## package aka environment aka extension selectors
water_first_where = "lower(s1.value) like '%aquatic%' and lower(s1.value) not like '%sediment%'"
gold_first_where = water_first_where

ncbi_package_selector = 'water.6.0'

nmdc_package_selector = 'water'

CONTEXT_ENUM = ""

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'no_comparison_enum'



## water env_local_scale

In [None]:
output_file_name = "voting_sheets_output/water_env_local_scale_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:01000813' # astronomical body part "abp" for env_local_scale

gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_local_scale'

nmdc_context_selector= 'env_local_scale_id'

## package aka environment aka extension selectors
water_first_where = "lower(s1.value) like '%aquatic%' and lower(s1.value) not like '%sediment%'"
gold_first_where = water_first_where

ncbi_package_selector = 'water.6.0'

nmdc_package_selector = 'water'

CONTEXT_ENUM = ""

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'no_comparison_enum'

## water env_medium

In [None]:
output_file_name = "voting_sheets_output/water_env_medium_voting_sheet.tsv"

# bootstrapping
semantic_anchor = 'ENVO:00010483' # environmental material for env_medium

## context selectors
gold_context_selectors = [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]

ncbi_context_selector = 'env_medium'

nmdc_context_selector= 'env_medium_id'

## package aka environment aka extension selectors
water_first_where = "lower(s1.value) like '%aquatic%' and lower(s1.value) not like '%sediment%'"
gold_first_where = water_first_where

ncbi_package_selector = 'water.6.0'

nmdc_package_selector = 'water'

CONTEXT_ENUM = ""

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

comparison_enum_column_name = 'no_comparison_enum'

# Additional Settings
safe to run all of these cells sequentially

In [6]:
# Approved prefixes (case-insensitive)
approved_prefixes = ['ENVO']

In [7]:
MIN_ANNOTATION_LEN = 3

In [8]:
NMDC_RUNTIME_BASE_URL = 'https://api.microbiomedata.org/nmdcschema/'
STUDY_SET_COLLECTION = 'study_set'
BIOSAMPLE_SET_COLLECTION = 'biosample_set'

In [9]:
envo_adapter_string = "sqlite:obo:envo"

In [10]:
env_package_override_file = 'mam-env-package-overrides.tsv'
override_column = 'mam_inferred_env_package'

In [11]:
# ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples_2024-09-23.duckdb.gz'
ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples.duckdb.gz'

In [12]:
gold_data_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
gold_data_file_name = "goldData.xlsx" # goldData.xlsx: Microsoft Excel 2007+
gold_csv_file_name = "gold_biosamples.csv"
BIOSAMPLES_SHEET = "Biosample"

In [13]:
goldterms_semsql_url = "https://s3.amazonaws.com/bbop-sqlite/goldterms.db.gz"

In [14]:
all_nmdc_biosamples_file = 'all_nmdc_biosamples.json'

# CURIe Constants

In [15]:
BIOME = 'ENVO:00000428'
TERRESTRIAL_BIOME = 'ENVO:00000446'
AQUATIC_BIOME = 'ENVO:00002030'
ABP = 'ENVO:01000813'
ENVIRONMENTAL_SYSTEM = 'ENVO:01000254'
ENVIRONMENTAL_MATERIAL = 'ENVO:00010483'

SOIL = 'ENVO:00001998'
LIQUID_WATER = 'ENVO:00002006'
WATER_ICE = 'ENVO:01000277'

HUMAN_CONSTRUCTION = 'ENVO:00000070'
BUILDING = 'ENVO:00000073'
BUILDING_PART = 'ENVO:01000420'

# Settings-based Queries

In [16]:
# todo could this have been done with a OAK query, eliminating the need to explicitly download the file?

goldterms_envo_query = f"""
SELECT
	*
FROM
	statements s
WHERE
	predicate in ('{"', '".join(gold_context_selectors)}')"""

In [17]:
ncbi_biosamples_per_annotation_query = f"""
SELECT content, COUNT(1) AS count 
FROM attributes 
WHERE harmonized_name = '{ncbi_context_selector}' AND package_content like '%{ncbi_package_selector}'
GROUP BY content
ORDER BY COUNT(1) DESC
"""

In [18]:
ncbi_bioprojects_per_annotation_query = f"""
SELECT
	a.content,
	count(DISTINCT l.content) AS count
FROM
	main.ATTRIBUTES a
JOIN main.links l 
	ON
	a.id = l.id
WHERE
	l.target = 'bioproject'
	AND harmonized_name = 'env_local_scale'
	AND package_content like '%{ncbi_package_selector}'
GROUP BY
	a.content
ORDER BY
	count(DISTINCT l.content) DESC ;
"""

In [19]:
ncbi_X_per_annotation_query = ncbi_bioprojects_per_annotation_query

In [20]:
# and s1.subject = s1.stanza eliminates matches on blank node annotation rows (probably wouldn't change results but adds a little overhead)

extension_query = f"""
select
		s1.subject ,
		s2.predicate,
		COALESCE (s2."object",
	s2."value") as content
from
	statements s1
join statements s2 on 
	s1.subject = s2.subject
where
	{gold_first_where}
	and s1.predicate = 'rdfs:label'
	and s1.subject = s1.stanza
	and s2.predicate in ('mixs:env_broad', 'mixs:env_local', 'mixs:env_medium', 'mixs:mixs_extension', 'rdfs:label', 'mixs:other', 'mixs:anatomical_site', 'mixs:host_taxon') ;
"""


# Locally Defined Functions
_Currently using locally-defined cache dictionaries_

In [21]:
def predict_from_normalized_env_packages(df_raw, adapter):
    # Apply the function to the relevant columns

    df = df_raw.copy()

    print(df.shape)
    for column in ['env_broad_scale_id', 'env_local_scale_id', 'env_medium_id']:
        df[f'{column}_ancestors'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['ancestors'])
        df[f'{column}_descendants'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['descendants'])

    # Vectorize each set of terms separately
    broad_scale_ancestors = vectorize_terms(df, 'env_broad_scale_id_ancestors')
    broad_scale_descendants = vectorize_terms(df, 'env_broad_scale_id_descendants')

    local_scale_ancestors = vectorize_terms(df, 'env_local_scale_id_ancestors')
    local_scale_descendants = vectorize_terms(df, 'env_local_scale_id_descendants')

    medium_ancestors = vectorize_terms(df, 'env_medium_id_ancestors')
    medium_descendants = vectorize_terms(df, 'env_medium_id_descendants')

    # Combine all feature matrices
    X = hstack([
        broad_scale_ancestors,
        broad_scale_descendants,
        local_scale_ancestors,
        local_scale_descendants,
        medium_ancestors,
        medium_descendants
    ])

    # Filter the DataFrame to only include non-null rows for the target column
    df_filtered = df[df['normalized_env_package'].notnull() & (df['normalized_env_package'] != "")]

    # Extract the target variable
    y = df_filtered['normalized_env_package']

    # Ensure X corresponds to the filtered rows
    X_filtered = X[df_filtered.index]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.3, random_state=42)

    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # not determining confidence for each class nor saving any diagnostics any more

    return clf.predict(X)

In [22]:
def get_hierarchy_terms(my_curie: str, adapter) -> dict:
    """
    Extract ancestor and descendant terms from the ontology for a given CURIE,
    using caching to improve performance and filtering by 'is_a' relationships.

    Args:
        my_curie (str): CURIE identifier for the ontology term.
        adapter: Ontology adapter.

    Returns:
        dict: Dictionary containing lists of ancestor and descendant terms.
    """
    if my_curie not in ancestor_cache:
        try:
            ancestors = list(adapter.ancestors(my_curie, predicates=[IS_A]))
            ancestor_cache[my_curie] = [adapter.label(ancestor) for ancestor in ancestors if ancestor]
        except Exception as my_e:
            print(f"Error retrieving ancestors for {my_curie}: {my_e}")
            ancestor_cache[my_curie] = []

    if my_curie not in descendant_cache:
        try:
            descendants = list(adapter.descendants(my_curie, predicates=[IS_A]))
            descendant_cache[my_curie] = [adapter.label(descendant) for descendant in descendants if descendant]
        except Exception as my_e:
            print(f"Error retrieving descendants for {my_curie}: {my_e}")
            descendant_cache[my_curie] = []

    return {
        'ancestors': ancestor_cache[my_curie],
        'descendants': descendant_cache[my_curie],
    }

# Procedural Code Starts Here

In [23]:
# Determine the filenames and target directory for the NCBI DuckDB
ncbi_compressed_filename = urlparse(ncbi_duckdb_url).path.split('/')[-1]
ncbi_filename = os.path.splitext(ncbi_compressed_filename)[0]
ncbi_compressed_file_path = os.path.join(ncbi_compressed_filename)
ncbi_uncompressed_file_path = os.path.join(ncbi_filename)


In [24]:
if os.path.isfile(ncbi_uncompressed_file_path):
    print(f"{ncbi_uncompressed_file_path} is already present in the current working directory.")
else:
    if os.path.isfile(ncbi_compressed_file_path):
        print(f"{ncbi_compressed_file_path} is already present in the current working directory.")
    else:
        print(f"{ncbi_compressed_file_path} needs to be downloaded")
        ncbi_response = requests.get(ncbi_duckdb_url)
        with open(ncbi_compressed_file_path, "wb") as f:
            f.write(ncbi_response.content)
        # ~ 2 minutes @ 250 Mbps
    
    # Unzip the compressed file and save the extracted file in target directory
    print(f"{ncbi_compressed_file_path} needs to be unpacked")
    with gzip.open(ncbi_compressed_file_path, "rb") as f_in:
        with open(ncbi_uncompressed_file_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    # ~ 2 minutes

ncbi_biosamples.duckdb is already present in the current working directory.


In [25]:
ncbi_conn = duckdb.connect(database=ncbi_uncompressed_file_path, read_only=True)

In [26]:
envo_adapter = get_adapter(envo_adapter_string)

# Anchor aka bootstrapping classes

In [27]:
anchor_descendants = get_curie_descendants_label_dict(semantic_anchor, [IS_A], envo_adapter)

In [28]:
anchor_descendants_lod = curie_descendants_label_dict_to_lod(anchor_descendants)

In [29]:
anchor_descendants_frame = curie_descendants_label_lod_to_df(anchor_descendants_lod)

In [30]:
anchor_descendants_frame

Unnamed: 0,curie,label
0,ENVO:01001505,alpine tundra biome
1,ENVO:01000024,marine benthic biome
2,ENVO:01000252,freshwater lake biome
3,ENVO:01000180,tundra biome
4,ENVO:01000123,marine sponge reef biome
...,...,...
123,ENVO:01000858,marine upwelling biome
124,ENVO:01000188,tropical savanna biome
125,ENVO:01000042,neritic epipelagic zone biome
126,ENVO:01000045,epeiric sea biome


# Classes from the reference enumeration

In [31]:
sv = get_schemaview_from_source(previous_submission_schema_url)

In [32]:
# todo break out slow steps into its own cell

try:
    CONTEXT_ENUM_def = sv.get_enum(CONTEXT_ENUM)
    context_pvs_keys = list(CONTEXT_ENUM_def.permissible_values.keys())
except AttributeError as e:
    # Handle the AttributeError
    print(f"An AttributeError occurred: {e}")
    context_pvs_keys =[]
    

In [33]:
print(context_pvs_keys)

['alpine tundra biome [ENVO:01001505]', 'anthropogenic terrestrial biome [ENVO:01000219]', 'broadleaf forest biome [ENVO:01000197]', 'coniferous forest biome [ENVO:01000196]', 'cropland biome [ENVO:01000245]', 'flooded grassland biome [ENVO:01000195]', 'flooded savanna biome [ENVO:01000190]', 'forest biome [ENVO:01000174]', 'grassland biome [ENVO:01000177]', 'mangrove biome [ENVO:01000181]', 'mediterranean forest biome [ENVO:01000199]', 'mediterranean grassland biome [ENVO:01000224]', 'mediterranean savanna biome [ENVO:01000229]', 'mediterranean shrubland biome [ENVO:01000217]', 'mediterranean woodland biome [ENVO:01000208]', 'mixed forest biome [ENVO:01000198]', 'montane grassland biome [ENVO:01000194]', 'montane savanna biome [ENVO:01000223]', 'montane shrubland biome [ENVO:01000216]', 'rangeland biome [ENVO:01000247]', 'savanna biome [ENVO:01000178]', 'shrubland biome [ENVO:01000176]', 'subpolar coniferous forest biome [ENVO:01000250]', 'subtropical broadleaf forest biome [ENVO:0100

In [34]:
initially_parsed_context_pvs = parse_hierarchically_underscored_strings(context_pvs_keys)

In [35]:
deduped_context_pvs = dedupe_underscoreless_pvs(initially_parsed_context_pvs)

In [36]:
pv_validation_results = validate_curie_label_list_dict(deduped_context_pvs, envo_adapter, print_flag=True)

In [37]:
pv_validation_results

{'problems': [],
 'valids': [{'curie': 'ENVO:01001505', 'label': 'alpine tundra biome'},
  {'curie': 'ENVO:01000219', 'label': 'anthropogenic terrestrial biome'},
  {'curie': 'ENVO:01000197', 'label': 'broadleaf forest biome'},
  {'curie': 'ENVO:01000196', 'label': 'coniferous forest biome'},
  {'curie': 'ENVO:01000245', 'label': 'cropland biome'},
  {'curie': 'ENVO:01000195', 'label': 'flooded grassland biome'},
  {'curie': 'ENVO:01000190', 'label': 'flooded savanna biome'},
  {'curie': 'ENVO:01000174', 'label': 'forest biome'},
  {'curie': 'ENVO:01000177', 'label': 'grassland biome'},
  {'curie': 'ENVO:01000181', 'label': 'mangrove biome'},
  {'curie': 'ENVO:01000199', 'label': 'mediterranean forest biome'},
  {'curie': 'ENVO:01000224', 'label': 'mediterranean grassland biome'},
  {'curie': 'ENVO:01000229', 'label': 'mediterranean savanna biome'},
  {'curie': 'ENVO:01000217', 'label': 'mediterranean shrubland biome'},
  {'curie': 'ENVO:01000208', 'label': 'mediterranean woodland biom

# Get the CURIEs used in NMDC Biosample annotations

In [38]:
if os.path.isfile(all_nmdc_biosamples_file):
    print(f"{all_nmdc_biosamples_file} is present in the current working directory and will be read into all_nmdc_biosamples.")
    # with open(all_nmdc_biosamples_file, 'r') as file:
    #     all_nmdc_biosamples = yaml.full_load(file)
    # read as json
    with open(all_nmdc_biosamples_file, 'r') as f:
        all_nmdc_biosamples = json.load(f)

else:
    print(f"All NMDC Biosamples need to be fetched and saved to {all_nmdc_biosamples_file}")
    all_nmdc_biosamples = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL,
                                               BIOSAMPLE_SET_COLLECTION)
    # with open(all_nmdc_biosamples_file, 'w') as file:
    #     documents = yaml.dump(all_nmdc_biosamples, file)
    # save as json
    with open(all_nmdc_biosamples_file, 'w') as f:
        json.dump(all_nmdc_biosamples, f)

# this saves network traffic. could use JSON for faster performance. 
# 1 minute for network fetch and JSON write?!
# 1 minute for yaml read
# instantaneous for JSON read?

all_nmdc_biosamples.json is present in the current working directory and will be read into all_nmdc_biosamples.


## Prediction of env_package annotations 

In [39]:
# Specify the output file name
env_packages_file = "nmdc_biosample_asserted_normalized_and_inferred_env_package.tsv"

if os.path.exists(env_packages_file):
    # Load the DataFrame from the file if it exists
    print(f"Loading {env_packages_file} into nmdc_biosample_contexts_frame...")
    nmdc_biosample_contexts_frame = pd.read_csv(env_packages_file, sep='\t')
else:
    # File doesn't exist; generate the DataFrame
    print(f"{env_packages_file} not found. Predicting from asserted records and {env_package_override_file}...")

    # Load environment package overrides
    env_pacakge_overrides = tsv_to_dict_of_dicts(env_package_override_file, 'id')

    # Extract biosample contexts
    biosample_contexts_lod = biosamples_lod_context_extractor(
        all_nmdc_biosamples, envo_adapter,
        my_env_pacakge_overrides=env_pacakge_overrides
    )

    # Create the DataFrame
    nmdc_biosample_contexts_frame = pd.DataFrame(biosample_contexts_lod)

    # Print value counts for the 'normalized_env_package' column
    print("\n")
    print("Value counts for normalized_env_package column:")
    print(nmdc_biosample_contexts_frame['normalized_env_package'].value_counts(dropna=False))
    print("\n")

    # Generate package predictions
    package_predictions = predict_from_normalized_env_packages(nmdc_biosample_contexts_frame, envo_adapter)

    # Add predictions to the DataFrame
    nmdc_biosample_contexts_frame['predicted_env_package'] = package_predictions

    # Save the DataFrame to the file
    nmdc_biosample_contexts_frame.to_csv(env_packages_file, sep='\t', index=False)
    print(f"env_package predictions saved to {env_packages_file}")



Loading nmdc_biosample_asserted_normalized_and_inferred_env_package.tsv into nmdc_biosample_contexts_frame...


## env-package prediction complete

To-do: save this and don't recreate it if it's available

Then get it reviewed by other NMDC stakeholders and inject it into MongoDB if approved

## Destructively filter `nmdc_biosample_contexts_frame` by `env_package` 

In [40]:
nmdc_biosample_contexts_frame.shape

(8362, 15)

In [41]:
nmdc_biosample_contexts_frame = nmdc_biosample_contexts_frame[
    nmdc_biosample_contexts_frame['predicted_env_package'] == nmdc_package_selector]

In [42]:
nmdc_biosample_contexts_frame.shape

(192, 15)

# Long process of predicting OBO foundry CURIes from NCBI Biosamples

## Start by getting unique annotations? Pre-counted by Biosamples

Current task is to provide counts by "study" aka Bioproject in addition to Biosample counts or instead of Biosamples counts if necessary

In [43]:
ncbi_frame = ncbi_conn.execute(ncbi_X_per_annotation_query).fetchdf()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [44]:
ncbi_frame.insert(0, 'serial_number', range(1, len(ncbi_frame) + 1))

In [45]:
# includes values with counts of one... useful for discovering drag-down submissions?

## MIxS and NCBI guidelines imply environmental context slots are multivalued
and that the pipe `|` should be used as a delimiter

there's an envo_count value below that indicates how ofter other delimiteres might be used

In [46]:
ncbi_frame['content_list'] = ncbi_frame['content'].str.split('|')

In [47]:
# todo is there any reason to not do this ?
ncbi_frame = ncbi_frame[ncbi_frame['content'].notna() & (ncbi_frame['content'] != '')]

In [48]:
ncbi_frame['content_count'] = ncbi_frame['content_list'].apply(len)

In [49]:
ncbi_frame.shape

(5189, 5)

In [50]:
ncbi_frame = ncbi_frame.explode('content_list').reset_index(drop=True)

In [51]:
ncbi_frame.shape

(5307, 5)

## splitting adds ~ 5% more rows
which might be important since were currently using a longest annotation strategy here

In [52]:
# how many content_list strings contain envo multiple times now?

In [53]:
ncbi_frame['envo_count'] = ncbi_frame['content_list'].str.lower().str.count("envo")

In [54]:
ncbi_frame['envo_count'].value_counts()

envo_count
0    4759
1     531
3      15
2       1
7       1
Name: count, dtype: int64

## If my math is correct, about 0.1% of the annotations still contain multiple CURIes 
after splitting on pipes

There will also be annotations with multiple label-like strings that weren't split because they weren't delimited on pipes
That might be a source of lost information since we are using a longest-match annotator here
I.e. there could be annotations with multiple hits worth keeping

## Parsing out CURIEs

this has a few limitations. The function only tries pre-specified prefixes (['ENOV'] by default) and only considers colons and underscores valid delimiters.

In [55]:
ncbi_frame[['extracted_label', 'extracted_curie']] = ncbi_frame['content_list'].apply(parse_curie_label)

In [56]:
parse_failures = ncbi_frame[
    (ncbi_frame['envo_count'] > 0) & (ncbi_frame['extracted_curie'].isna() | (ncbi_frame['extracted_curie'] == ''))]


## In what kinds of cases could no CURIe be parsed
despite the presence of "ENVO" in the content string?

In [57]:
parse_failures

Unnamed: 0,serial_number,content,count,content_list,content_count,envo_count,extracted_label,extracted_curie
455,449,ENVO：00000316,2,ENVO：00000316,1,1,ENVO：00000316,
2397,2339,ENVO,1,ENVO,1,1,ENVO,
4499,4397,Tropical shrubland biome [ENVO01000214],1,Tropical shrubland biome [ENVO01000214],1,1,Tropical shrubland biome [ENVO01000214],
5126,5012,Tropical woodland biome [ENVO01000220],1,Tropical woodland biome [ENVO01000220],1,1,Tropical woodland biome [ENVO01000220],


## Retrieve the labels for the parsed CURIes

In [58]:
ncbi_frame['real_label'] = ncbi_frame['extracted_curie'].apply(envo_adapter.label)

## Apply oaklib annotation to the strings after CURIe removal
Actually the annotator can (sometimes?) detect colon-delimited CURIEs with lower case prefixes

This returns CURIes with evidence but not necessarily the label corresponding to the CURIe

In [142]:
# Specify the lexical index file name
envo_lexical_index_file = "envo_lexical_index.yaml"

# Check if the lexical index file exists
if os.path.exists(envo_lexical_index_file):
    print(f"Loading lexical index from {envo_lexical_index_file}...")
    ix = load_lexical_index(envo_lexical_index_file)
else:
    print(f"{envo_lexical_index_file} not found. Creating lexical index from envo_adapter...")
    # Create the lexical index from envo_adapter
    ix = create_lexical_index(envo_adapter)
    # Save the lexical index to a file
    save_lexical_index(ix, envo_lexical_index_file)
    print(f"Lexical index saved to {envo_lexical_index_file}")

# Initialize the TextAnnotatorInterface
envo_text_annotator_interface = TextAnnotatorInterface()
envo_text_annotator_interface.lexical_index = ix

# this cell only takes ~ 1 minute, but generates a lot of "ERRORS" and WARNINGS in a red font
#   while lexically indexing the ontology

Loading lexical index from envo_lexical_index.yaml...


In [143]:
# Apply the annotation function to each row in the 'label' column
ncbi_frame['longest_annotation_curie'] = ncbi_frame['extracted_label'].apply(
    lambda x: get_longest_annotation_curie(x, envo_text_annotator_interface, MIN_ANNOTATION_LEN))

# ~ 1 minute


## Add the labels for the CURIes identified though oaklib annotation of strings

In [60]:
ncbi_frame['longest_annotation_label'] = ncbi_frame['longest_annotation_curie'].apply(envo_adapter.label)

In [61]:
ncbi_frame

Unnamed: 0,serial_number,content,count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label
0,1,not applicable,262,not applicable,1,0,not applicable,,,,
1,2,missing,199,missing,1,0,missing,,,,
2,3,not collected,127,not collected,1,0,not collected,,,,
3,4,Orchard,87,Orchard,1,0,Orchard,,,ENVO:00000115,orchard
4,5,,79,,1,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
5302,5186,29,1,29,1,0,29,,,,
5303,5187,101,1,101,1,0,101,,,,
5304,5188,bulk field - melilot predomination,1,bulk field - melilot predomination,1,0,bulk field - melilot predomination,,,ENVO:01000352,field
5305,5189,plant endosphere,1,plant endosphere,1,0,plant endosphere,,,,


## we now have a list of CURIes for each normalized annotation

This could be because the submitter provided a CURIe and a label that don't match
*One* case of this is dragging a CURIe down a column in a spreadsheet, expecting it to be copied,
but actually auto-incrementing it

Now attempt to find one best CURIe for each annotation... by now we have lost the ability to retain multiple legitimate
but improperly separated CURIes

In [62]:
# todo don't accept extracted curie if no real label?
# any kind of string similarity checking for label of annotated curie vs extracted label ?
# look for long stretches of curies?
# can we measure the beneficial impact of any of this? current crux: how to distribute counts

ncbi_frame['curie_list'] = ncbi_frame.apply(
    lambda my_row: list({my_row['extracted_curie'], my_row['longest_annotation_curie']} - {None}),
    axis=1
)

ncbi_frame['unique_curie_count'] = ncbi_frame['curie_list'].apply(len)

In [63]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
0    2629
1    2593
2      85
Name: count, dtype: int64

In [64]:
double_curie_frame = ncbi_frame[ncbi_frame['unique_curie_count'] > 1]

In [65]:
double_curie_frame = double_curie_frame[['extracted_curie', 'longest_annotation_curie']]

In [66]:
double_curie_frame = double_curie_frame.drop_duplicates()

In [67]:
double_curie_frame[['extracted_prefix', 'extracted_local_id']] = double_curie_frame['extracted_curie'].str.split(':', expand=True)

In [68]:
double_curie_frame['extracted_local_id_int'] = pd.to_numeric(double_curie_frame['extracted_local_id'], errors='coerce').astype('Int64')

In [69]:
double_curie_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,extracted_prefix,extracted_local_id,extracted_local_id_int
149,ENVO:01000245,ENVO:01000635,ENVO,01000245,1000245
216,ENVO:01001430,ENVO:00000077,ENVO,01001430,1001430
229,ENVO:00000136,ENVO:00000316,ENVO,00000136,136
240,ENVO:0003101,ENVO:00003081,ENVO,0003101,3101
248,ENVO:01001431,ENVO:01001430,ENVO,01001431,1001431
...,...,...,...,...,...
4411,ENVO:00000113,ENVO:01000739,ENVO,00000113,113
4560,ENVO:01001437,ENVO:01001430,ENVO,01001437,1001437
4976,ENVO:0000316,ENVO:00000316,ENVO,0000316,316
5106,ENVO:00002040,ENVO:00000109,ENVO,00002040,2040


In [70]:
# Ensure extracted_local_id_int is unique and sorted
unique_sorted_series = double_curie_frame['extracted_local_id_int'].dropna().drop_duplicates().sort_values()


In [71]:
# Find stretches
stretches_dict = find_consecutive_stretches_dict(unique_sorted_series)


In [72]:
# Convert the stretches dictionary into a DataFrame
stretches_df = stretches_dict_to_long_dataframe(stretches_dict)

`stretches_df` shows groups of extracted EnvoO ids (CURIes without prefix or padding zeros) that share a common CURIe by oaklib annotation of the textual part. This may not be the best or only way to address these spurious drag-stretch, auto-incremented CURIes

Ie 1001458 corresponds to ENVO:01001458, 'mist'

_although it theoretically could have been ENVO:1001458 since EnvO CURIes can have 7 or 8 digits_

In group 9, there are another ~ 50 sequential id values, all corresponding to environmental context annotations whose best oak-annotated class is ENVO:01001803, 'tropical forest'!

How much of an impact does this have? 

In [73]:
stretches_df

Unnamed: 0,stretch_id,value
0,1,3081
1,1,3082
2,1,3083
3,1,3084
4,1,3085
5,1,3086
6,1,3087
7,1,3088
8,1,3089
9,1,3090


In [74]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretches_df,
    left_on='extracted_local_id_int',
    right_on='value',
    how='left'
)

In [75]:
stretch_summary_df = summarize_stretch_groups(double_curie_frame)


For stretch 9, which included extracted CURIes from ENVO:01001458 to ENVO:01001511, the oaklib test annotation of 100% of the submitted environmental context annotations was ENVO:01001803, so we will keep that and disregard all of the CURIes from the stretch


In [76]:
stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:00003081,1.0
1,2.0,ENVO:01001430,0.942857


In [77]:
decisive_fraction_threshold = 0.9

In [78]:
decisive_stretch_summary_df = stretch_summary_df[stretch_summary_df['fraction'] >= decisive_fraction_threshold]

In [79]:
decisive_stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:00003081,1.0
1,2.0,ENVO:01001430,0.942857


In [80]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    decisive_stretch_summary_df,
    left_on='stretch_id',
    right_on='stretch_id',
    how='left'
)

In [81]:
double_curie_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,extracted_prefix,extracted_local_id,extracted_local_id_int,stretch_id,value,most_common_longest_annotation_curie,fraction
0,ENVO:01000245,ENVO:01000635,ENVO,01000245,1000245,,,,
1,ENVO:01001430,ENVO:00000077,ENVO,01001430,1001430,2.0,1001430.0,ENVO:01001430,0.942857
2,ENVO:00000136,ENVO:00000316,ENVO,00000136,136,,,,
3,ENVO:0003101,ENVO:00003081,ENVO,0003101,3101,1.0,3101.0,ENVO:00003081,1.000000
4,ENVO:01001431,ENVO:01001430,ENVO,01001431,1001431,2.0,1001431.0,ENVO:01001430,0.942857
...,...,...,...,...,...,...,...,...,...
77,ENVO:00000113,ENVO:01000739,ENVO,00000113,113,,,,
78,ENVO:01001437,ENVO:01001430,ENVO,01001437,1001437,2.0,1001437.0,ENVO:01001430,0.942857
79,ENVO:0000316,ENVO:00000316,ENVO,0000316,316,,,,
80,ENVO:00002040,ENVO:00000109,ENVO,00002040,2040,,,,


In [82]:
drag_evidence_frame = double_curie_frame[double_curie_frame['stretch_id'] >= 1]
drag_evidence_frame = drag_evidence_frame[['extracted_curie', 'longest_annotation_curie']]
drag_evidence_frame['drag_evidence'] = True

In [83]:
drag_evidence_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,drag_evidence
1,ENVO:01001430,ENVO:00000077,True
3,ENVO:0003101,ENVO:00003081,True
4,ENVO:01001431,ENVO:01001430,True
5,ENVO:0003081,ENVO:00003081,True
6,ENVO:0003082,ENVO:00003081,True
7,ENVO:0003087,ENVO:00003081,True
8,ENVO:01001433,ENVO:01001430,True
9,ENVO:0003095,ENVO:00003081,True
10,ENVO:0003100,ENVO:00003081,True
11,ENVO:0003084,ENVO:00003081,True


In [84]:
ncbi_frame = ncbi_frame.merge(
    drag_evidence_frame,
    left_on=['extracted_curie', 'longest_annotation_curie'],
    right_on=['extracted_curie', 'longest_annotation_curie'],
    how='left'
)

In [85]:
# Initialize dragless_curie_list with curie_list values
ncbi_frame["dragless_curie_list"] = ncbi_frame["curie_list"]

# Update dragless_curie_list based on the condition
for index, row in ncbi_frame.iterrows():
    if row["drag_evidence"] is True:
        if row["longest_annotation_curie"] is not None:
            ncbi_frame.at[index, "dragless_curie_list"] = [row["longest_annotation_curie"]]
        else:
            ncbi_frame.at[index, "dragless_curie_list"] = []

ncbi_frame['dragless_curie_count'] = ncbi_frame['dragless_curie_list'].apply(len)

In [86]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
0    2629
1    2593
2      85
Name: count, dtype: int64

In [87]:
ncbi_frame['dragless_curie_count'].value_counts()

dragless_curie_count
1    2649
0    2629
2      29
Name: count, dtype: int64

## The extent of multiple detected CURIes has been reduced ~ 4.5 fold 
(for soil env_local_scale)

Isolate the submitter annotations for which there's clearly one best CURIe after removing the drag-stretches

In [88]:
ncbi_frame.shape

(5307, 16)

In [89]:
ncbi_frame_undisputed = ncbi_frame[ncbi_frame['dragless_curie_count'] <= 1]

In [90]:
ncbi_frame_undisputed.shape

(5278, 16)

In [91]:
ncbi_frame_disputed = ncbi_frame[ncbi_frame['dragless_curie_count'] > 1]

In [92]:
ncbi_frame_disputed.shape

(29, 16)

In [93]:
ncbi_frame_disputed = ncbi_frame_disputed.explode("dragless_curie_list", ignore_index=True)


In [94]:
ncbi_frame_disputed.shape

(58, 16)

In [95]:
ncbi_frame_disputed["dragless_curie_list"] = ncbi_frame_disputed["dragless_curie_list"].apply(lambda x: [x])

## Just include all of the remaining disputed CURIe assingments

In [96]:
# Combine the rows of ncbi_frame_undisputed and ncbi_frame_disputed into a new DataFrame
ncbi_disputes_exploded_frame = pd.concat([ncbi_frame_undisputed, ncbi_frame_disputed], ignore_index=True)


In [97]:
ncbi_disputes_exploded_frame.shape

(5336, 16)

In [98]:
ncbi_disputes_exploded_frame

Unnamed: 0,serial_number,content,count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count,drag_evidence,dragless_curie_list,dragless_curie_count
0,1,not applicable,262,not applicable,1,0,not applicable,,,,,[],0,,[],0
1,2,missing,199,missing,1,0,missing,,,,,[],0,,[],0
2,3,not collected,127,not collected,1,0,not collected,,,,,[],0,,[],0
3,4,Orchard,87,Orchard,1,0,Orchard,,,ENVO:00000115,orchard,[ENVO:00000115],1,,[ENVO:00000115],1
4,5,,79,,1,0,,,,,,[],0,,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5331,4863,intertidal zone [ENVO:0000316],1,intertidal zone [ENVO:0000316],1,1,intertidal zone,ENVO:0000316,,ENVO:00000316,intertidal zone,"[ENVO:0000316, ENVO:00000316]",2,,[ENVO:00000316],2
5332,4932,field(ENVO:00000114),1,field(ENVO:00000114),1,1,field,ENVO:00000114,agricultural field,ENVO:01000352,field,"[ENVO:00000114, ENVO:01000352]",2,,[ENVO:00000114],2
5333,4932,field(ENVO:00000114),1,field(ENVO:00000114),1,1,field,ENVO:00000114,agricultural field,ENVO:01000352,field,"[ENVO:00000114, ENVO:01000352]",2,,[ENVO:01000352],2
5334,4992,wood [ENVO_00002040],1,wood [ENVO_00002040],1,1,wood,ENVO:00002040,wood,ENVO:00000109,woodland area,"[ENVO:00002040, ENVO:00000109]",2,,[ENVO:00002040],2


In [99]:
ncbi_disputes_exploded_frame['post_explode_curie_count'] = ncbi_disputes_exploded_frame['dragless_curie_list'].apply(len)

In [100]:
ncbi_disputes_exploded_frame['post_explode_curie_count'].value_counts()

post_explode_curie_count
1    2707
0    2629
Name: count, dtype: int64

In [101]:
# Set 'post_explode_curie' to the 0th item in 'dragless_curie_list'
ncbi_disputes_exploded_frame["post_explode_curie"] = ncbi_disputes_exploded_frame["dragless_curie_list"].apply(
    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
)

In [102]:

ncbi_biosample_scoped_counts = (
    ncbi_disputes_exploded_frame.groupby("post_explode_curie")["count"].sum().reset_index()
)

ncbi_biosample_scoped_counts.columns = ['curie', 'ncbi_scoped_count']

This is currently a count of Biosamples for which the indicated CURIes can be extracted or inferred by oaklib annotation, after removal of drag-stretch, auto-incremented CURIes

In [103]:
ncbi_biosample_scoped_counts

Unnamed: 0,curie,ncbi_scoped_count
0,BFO:0000015,4
1,BFO:0000029,6
2,CHEBI:15377,32
3,CHEBI:22695,15
4,CHEBI:24632,1
...,...,...
537,UBERON:0001913,1
538,UBERON:0001988,1
539,UBERON:0002100,5
540,UBERON:0002416,12


## GOLD mappings/Biosample counts hybrid

we're currently including
- mappings only

and have retired the previous use of
- mappings in hybrid with biosample counts

And we're casting a wide net, especially for the hybrid approach
- searching for 'soil', 'sediment' etc. in GOLDTERMS labels without anchoring them like 'Environmental > Aquatic > Sediment'
- retrieving the CURIes for env_broad_scale, env_local_scale and env_medium for all voting sheets, and trusting orthogonal filtering to remove the inappropriate CURIes

Should we now add (or switch to) direct biosample counts of GOLD "envo" annotations?

Efficient retrieval of  all GOLD data in a given scope isn't easy

In [104]:
# Determine the filenames and target directory
goldterms_compressed_filename = urlparse(goldterms_semsql_url).path.split('/')[-1]
goldterms_filename = os.path.splitext(goldterms_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

# Print to confirm the filenames
print(goldterms_filename)

goldterms.db


In [105]:
# Fetch the contents from the URL and save compressed file in target directory
goldterms_response = requests.get(goldterms_semsql_url)
goldterms_compressed_file_path = os.path.join(target_dir, goldterms_compressed_filename)
with open(goldterms_compressed_file_path, "wb") as f:
    f.write(goldterms_response.content)

# ~ 1 second

In [106]:
# Unzip the compressed file and save the extracted file in target directory
goldterms_uncompressed_file_path = os.path.join(target_dir, goldterms_filename)
with gzip.open(goldterms_compressed_file_path, "rb") as f_in:
    with open(goldterms_uncompressed_file_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# ~ 1 second

In [107]:
## that's all fast. don't bother caching

In [108]:
goldterms_conn = sqlite3.connect(goldterms_uncompressed_file_path)

# GOLDTERMS only approach

In [109]:
goldterms_result = pd.read_sql_query(extension_query, goldterms_conn)

In [110]:
goldterms_result

Unnamed: 0,subject,predicate,content
0,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313
1,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:mixs_extension,mixs:MiscellaneousNaturalOrArtificialEnvironment
2,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:other,ENVO:00010622
3,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,rdfs:label,Engineered > Artificial ecosystem > Plant grow...
4,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313
...,...,...,...
286,GOLDVOCAB:Plant-pot,rdfs:label,Plant pot
287,GOLDVOCAB:Plants,rdfs:label,Plants
288,GOLDVOCAB:Sewage-treatment-plant,rdfs:label,Sewage treatment plant
289,GOLDVOCAB:Soil-_28non-planted_29,rdfs:label,Soil (non-planted)


In [111]:
# # todo: save this kind of content before subsetting on an environment
# #   the subsetting is currently baked into the query
# 
# # see also goldterms_queries.ipynb in MAM's Collab
# goldterms_result.to_csv("goldterms_single_environment_mappings_long.tsv", sep="\t", index=False)

In [112]:
goldterms_only_curies = goldterms_result.loc[goldterms_result['predicate'].isin(gold_context_selectors), 'content']


In [113]:
goldterms_only_curies = goldterms_only_curies.unique().tolist()

In [114]:
# goldterms_only_curies

# Make lists of CURIEs
which will determine
- the rows in the table
- the boolean filter columns

In [115]:
anchor_curies = list(anchor_descendants_frame['curie'])
legacy_pv_curies = [i['curie'] for i in pv_validation_results['valids']]

biome_curies = list(envo_adapter.descendants(BIOME, predicates=[IS_A])) # 
terrestrial_biome_curies = list(envo_adapter.descendants(TERRESTRIAL_BIOME, predicates=[IS_A]))
aquatic_biome_curies = list(envo_adapter.descendants(AQUATIC_BIOME, predicates=[IS_A]))
abp_curies = list(envo_adapter.descendants(ABP, predicates=[IS_A]))
env_sys_curies = list(envo_adapter.descendants(ENVIRONMENTAL_SYSTEM, predicates=[IS_A]))
env_mat_curies = list(envo_adapter.descendants(ENVIRONMENTAL_MATERIAL, predicates=[IS_A]))
obsoletes_curies = list(envo_adapter.obsoletes())

soil_curies = list(envo_adapter.descendants(SOIL, predicates=[IS_A])) # 
liquid_water_curies = list(envo_adapter.descendants(LIQUID_WATER, predicates=[IS_A])) # 
water_ice_curies = list(envo_adapter.descendants(WATER_ICE, predicates=[IS_A])) # 

human_construction_curies = list(envo_adapter.descendants(HUMAN_CONSTRUCTION, predicates=[IS_A])) #
building_curies = list(envo_adapter.descendants(BUILDING, predicates=[IS_A])) #
building_part_curies = list(envo_adapter.descendants(BUILDING_PART, predicates=[IS_A])) #


In [116]:
# Specify an output file name
nlcd2011_class_iris_file = "nlcd2011_class_iris.txt"

nlcd_subset_textual_representation = "nlcd2011"

in_subset_curie = "oio:inSubset"


# Initialize nlcd_classes
nlcd_classes = []

if os.path.exists(nlcd2011_class_iris_file):
    # Load the list from the file if it exists
    print(f"Loading {nlcd_subset_textual_representation} classes from {nlcd2011_class_iris_file}...")
    with open(nlcd2011_class_iris_file, "r") as file:
        nlcd_classes = [line.strip() for line in file.readlines()]
else:
    # File doesn't exist; generate the list using the loop
    print(f"{nlcd2011_class_iris_file} not found. Identifying classes in {nlcd_subset_textual_representation} subset (~2 minutes)...")

    # Retrieve all classes
    entities = envo_adapter.entities()

    # super slow 2 minutes
    # but retrieving classes by named subset seems to crash on EnvO with its textual subsets?
    for entity in entities:
        term_metadata = envo_adapter.entity_metadata_map(entity)
        if in_subset_curie in term_metadata:
            subsets = term_metadata[in_subset_curie]
            if nlcd_subset_textual_representation in subsets:
                nlcd_classes.append(entity)

    # Save the generated list to the file
    with open(nlcd2011_class_iris_file, "w") as file:
        for string in nlcd_classes:
            file.write(string + "\n")

    print(f"List saved to {nlcd2011_class_iris_file}")

# At this point, nlcd_classes contains the desired list
print(f"Total {nlcd_subset_textual_representation} classes loaded: {len(nlcd_classes)}")


Loading nlcd2011 classes from nlcd2011_class_iris.txt...
Total nlcd2011 classes loaded: 22


## Bootstrap the rows

In [117]:
include_in_rows = set()

In [118]:
include_in_rows.update(anchor_curies)

In [119]:
include_in_rows.update(legacy_pv_curies)

In [120]:
include_in_rows.update(nmdc_biosample_contexts_frame[nmdc_context_selector])

In [121]:
include_in_rows.update(ncbi_frame['extracted_curie'])

In [122]:
include_in_rows.update(ncbi_frame['longest_annotation_curie'])

In [123]:
include_in_rows.update(goldterms_only_curies)

In [124]:
include_in_rows.update(nlcd_classes)

In [125]:
rows_lod = []

# Voting sheet rows and boolean columns

In [126]:
for curie in include_in_rows:
    if curie is None:
        continue
        
    # ONCE AGAIN, assuming that EnvO is the only ontology we'll check against
    current_ancestors = list(envo_adapter.ancestors(curie, predicates=[IS_A])) # vs legacy_pv_curies
    ancestors_in_enum_count = len(set(current_ancestors) & set(legacy_pv_curies))
    
    current_descendants  = list(envo_adapter.descendants(curie, predicates=[IS_A])) # vs legacy_pv_curies
    descendants_in_enum_count  = len(set(current_descendants) & set(legacy_pv_curies))
    
    
    row = {
        'curie': curie,
        'label': envo_adapter.label(curie),
        'envo_native': False,
        'obsolete': False,
        comparison_enum_column_name: False,
        'ancestors_in_enum_count': ancestors_in_enum_count,
        'descendants_in_enum_count': descendants_in_enum_count,
        'nlcd_class': False,
        'abp': False,
        'env_sys': False,
        'biome': False,
        'terrestrial_biome': False,
        'aquatic_biome': False,
        'env_mat': False,
        'soil': False,
        'liquid water': False,
        'water ice': False,
        'human_construction': False,
        'building': False,
        'building_part': False,
        'goldterms_mappings': False,
    }
        
    if curie in biome_curies:
        row['biome'] = True
    if curie in terrestrial_biome_curies:
        row['terrestrial_biome'] = True
    if curie in aquatic_biome_curies:
        row['aquatic_biome'] = True
    if curie in abp_curies:
        row['abp'] = True
    if curie in env_sys_curies:
        row['env_sys'] = True
    if curie in env_mat_curies:
        row['env_mat'] = True
    if curie in soil_curies:
        row['soil'] = True
    if curie in liquid_water_curies:
        row['liquid water'] = True
    if curie in water_ice_curies:
        row['water ice'] = True
    if curie in human_construction_curies:
        row['human_construction'] = True
    if curie in building_curies:
        row['building'] = True
    if curie in building_part_curies:
        row['building_part'] = True
    if curie in legacy_pv_curies:
        row[comparison_enum_column_name] = True
    if curie in obsoletes_curies:
        row['obsolete'] = True
    if curie in goldterms_only_curies:
        row['goldterms_mappings'] = True
    if curie in nlcd_classes:
        row['nlcd_class'] = True
        
    try:
        prefix, local_id = curie.split(':')
        if prefix and prefix == 'ENVO' and row['label'] is not None:
            row['envo_native'] = True
    except Exception as e:
        # Print the exception message
        print(f"An error occurred: {e} trying to split {curie}")

    rows_lod.append(row)

# 2 minutes


# ^ Voting sheet rows and boolean columns

In [127]:
rows_frame = pd.DataFrame(rows_lod)

In [128]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,EnvBroadScaleSoilEnum_11_1,ancestors_in_enum_count,descendants_in_enum_count,nlcd_class,abp,env_sys,...,terrestrial_biome,aquatic_biome,env_mat,soil,liquid water,water ice,human_construction,building,building_part,goldterms_mappings
0,ENVO:00005769,mountain forest soil,True,False,False,0,0,False,True,False,...,False,False,True,True,False,False,False,False,False,False
1,ENVO:00005772,orchard soil,True,False,False,0,0,False,True,False,...,False,False,True,True,False,False,False,False,False,False
2,ENVO:01000212,temperate mixed forest biome,True,False,True,4,1,False,True,True,...,True,False,False,False,False,False,False,False,False,False
3,FOODON:03315552,juice beverage,False,False,False,0,0,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,ENVO:01000426,room,True,False,False,0,0,False,True,False,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,ENVO:01000252,freshwater lake biome,True,False,False,0,0,False,True,True,...,False,True,False,False,False,False,False,False,False,False
703,ENVO:12,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
704,ENVO:01000219,anthropogenic terrestrial biome,True,False,True,2,3,False,True,True,...,True,False,False,False,False,False,False,False,False,False
705,RO:0002577,system,False,False,False,0,52,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Merge in NMDC counts

In [129]:
nmdc_biosample_scoped_counts = nmdc_biosample_contexts_frame[nmdc_context_selector].value_counts().reset_index()
nmdc_biosample_scoped_counts.columns = ['curie', 'nmdc_scoped_count']


In [130]:
nmdc_biosample_scoped_counts

Unnamed: 0,curie,nmdc_scoped_count
0,ENVO:01001442,192


In [131]:
# Perform the left merge
rows_frame = rows_frame.merge(
    nmdc_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [132]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,EnvBroadScaleSoilEnum_11_1,ancestors_in_enum_count,descendants_in_enum_count,nlcd_class,abp,env_sys,...,aquatic_biome,env_mat,soil,liquid water,water ice,human_construction,building,building_part,goldterms_mappings,nmdc_scoped_count
0,ENVO:00005769,mountain forest soil,True,False,False,0,0,False,True,False,...,False,True,True,False,False,False,False,False,False,
1,ENVO:00005772,orchard soil,True,False,False,0,0,False,True,False,...,False,True,True,False,False,False,False,False,False,
2,ENVO:01000212,temperate mixed forest biome,True,False,True,4,1,False,True,True,...,False,False,False,False,False,False,False,False,False,
3,FOODON:03315552,juice beverage,False,False,False,0,0,False,False,False,...,False,True,False,False,False,False,False,False,False,
4,ENVO:01000426,room,True,False,False,0,0,False,True,False,...,False,False,False,False,False,True,False,True,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,ENVO:01000252,freshwater lake biome,True,False,False,0,0,False,True,True,...,True,False,False,False,False,False,False,False,False,
703,ENVO:12,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,False,
704,ENVO:01000219,anthropogenic terrestrial biome,True,False,True,2,3,False,True,True,...,False,False,False,False,False,False,False,False,False,
705,RO:0002577,system,False,False,False,0,52,False,False,False,...,False,False,False,False,False,False,False,False,False,


In [133]:
# gold and ncbi counts are slightly trickier
# for gold: including mappings only, mappings in hybrid with biosample counts. 
#    Switch to direct biosample counts of GOLD "envo" annotations?
# ncbi: we have extracted curies and annotated curies

## Merge in NCBI counts

In [134]:
# Perform the left merge
rows_frame = rows_frame.merge(
    ncbi_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [135]:
# 990 rows in https://docs.google.com/spreadsheets/d/12WH3eduBq2qSTy9zVF3n7fyajn6ssLZL/edit?gid=546570706#gid=546570706

In [136]:
rows_frame.to_csv(output_file_name, sep="\t", index=False)

In [137]:
ncbi_conn.close()