In [1]:
from common import *

import gzip
import os
import shutil
import sqlite3
from urllib.parse import urlparse

import duckdb
import pandas as pd
import requests
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import yaml
import json


In [2]:
print("verify output is being rendered")

verify output is being rendered


In [3]:
# Initialize cache dictionaries for predict_from_normalized_env_packages
# todo how to move the definitions for function that use these globals? Or just use caching around the function?
ancestor_cache = {}
descendant_cache = {}

In [4]:
# todo deal with circularity in env package prediction -> env triad reporting

# todo this on-demand NCBI curie extraction and annotation recapitulates work that is being added to
# https://portal.nersc.gov/project/m3408/biosamples_duckdb/
# via 
#   although that doesn't detect auto-incremented curies from  spreadsheet dragging

# todo if more caching is desired, it should probably take the form of saving dataframes for TSV

# eventually, dig up a complete JSON gold biosample dump for non-hybrid biosample counts

# overall run time (if NCBI biosamples and goldData are cached): ~ 10 minutes

# count studies not biosamples ?
# how? for gold, ncbi or nmdc?

# Task Settings
_For making a Soil env_broad_scale voting sheet vs a Sediment env_local_scale sheet, etc._

todo: bundle these into dicts so they don't have to be modified independently and kept in sync with one another.

In [5]:
output_file_name = "voting_sheets_output/soil_env_local_scale_voting_sheet.tsv"

In [6]:
# semantic_anchor = 'ENVO:00000428' # biome for env_broad_scale
semantic_anchor = 'ENVO:01000813' # astronomical body part "abp" for env_local_scale
# semantic_anchor = 'ENVO:00010483' # environmental material for env_medium

## context selectors

In [7]:
# gold_context_selector = 'mixs:env_broad'
# gold_context_selector = 'mixs:env_local'
# gold_context_selector = 'mixs:env_medium'

gold_context_selectors =  [
    'mixs:env_broad',
    'mixs:env_local',
    'mixs:env_medium'
]


In [8]:
# ncbi_context_selector = 'env_broad_scale'
ncbi_context_selector = 'env_local_scale'
# ncbi_context_selector = 'env_medium'

In [9]:
# nmdc_context_selector= 'env_broad_scale_id'
nmdc_context_selector= 'env_local_scale_id'
# nmdc_context_selector= 'env_medium_id'

## package aka environment aka extension selectors

In [10]:
# plant_first_where = "s1.value like 'host-associated > plants%'"
# sediment_first_where = "lower(s1.value) like 'environmental > aquatic%sediment%'"
# soil_first_where = "s1.value like 'environmental > terrestrial > soil%'"
# water_first_where = "s1.value like 'environmental > aquatic%' and lower(s1.value) not like '%sediment%'"

plant_first_where = "lower(s1.value) like '%plant%'" # picks up waste water treatment plant
sediment_first_where = "lower(s1.value) like '%sediment%'"
soil_first_where = "lower(s1.value) like '%soil%'"
water_first_where = "lower(s1.value) like '%aquatic%' and lower(s1.value) not like '%sediment%'"

In [11]:
gold_first_where = soil_first_where

In [12]:
# todo new since soil: why are we only considering MIMS.me for discovering appropriate env triad values?
#   there's usually a roughly equal number of biosamples from in each extension for MIMS.me and 

# ncbi_package_selector = 'plant-associated.6.0'
# ncbi_package_selector = 'sediment.6.0'
ncbi_package_selector = 'soil.6.0'
# ncbi_package_selector = 'water.6.0'

In [13]:
# nmdc_package_selector = 'plant-associated'
# nmdc_package_selector = 'sediment'
nmdc_package_selector = 'soil'
# nmdc_package_selector = 'water'


In [14]:
GOLDTERMS_NA = '' # ???

GOLDTERMS_PLANT_ASSOCIATED = GOLDTERMS_NA # host associated -> viridiplantae? take a string approach!
GOLDTERMS_SEDIMENT = 'GOLDTERMS:3985' #  doesn't have any subclasses
GOLDTERMS_SOIL = 'GOLDTERMS:4212'
GOLDTERMS_WATER = 'GOLDTERMS:3984'

# GOLDTERMS:4180, 'Environmental > Aquatic > Freshwater > Pond > Sediment' and ~64 more don't share a common root
# poetry run runoak -i sqlite:obo:goldterms info 't~sediment'


In [15]:
goldterms_root = GOLDTERMS_SOIL

## selecting name and version of one enum for comparison


In [16]:
# only the Soil enums have legacy definitions (v10.7 and earlier?)

# CONTEXT_ENUM = "EnvBroadScaleSoilEnum"
CONTEXT_ENUM = "EnvLocalScaleSoilEnum"
# CONTEXT_ENUM = "EnvMediumSoilEnum"

# CONTEXT_ENUM = ""

In [17]:
# todo: add columns for membership in multiple enums from multiple version of the schema?
#  like sediment local vs soil local and water local (once that's completed)
#  get them from schema files or something prior to that? sems like the voting sheets are too raw/preliminary for that
#   can use a more recent schema url for more recent enums!

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

# previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

In [18]:
# todo: don't call the column "legacy_pv". use the name of the enum and the version of the schema?

comparison_enum_column_name = 'EnvLocalSoilEnum_10_7'
# comparison_enum_column_name = 'EnvLocalScaleSoilEnum_11_1'
# comparison_enum_column_name = 'no_comparison_enum'

# Additional Settings

In [19]:
# Approved prefixes (case-insensitive)
approved_prefixes = ['ENVO']

In [20]:
MIN_ANNOTATION_LEN = 3

In [21]:
NMDC_RUNTIME_BASE_URL = 'https://api.microbiomedata.org/nmdcschema/'
STUDY_SET_COLLECTION = 'study_set'
BIOSAMPLE_SET_COLLECTION = 'biosample_set'

In [22]:
envo_adapter_string = "sqlite:obo:envo"

In [23]:
# goldterms_adapter_string = "sqlite:obo:envo"

In [24]:
env_package_override_file = 'mam-env-package-overrides.tsv'
override_column = 'mam_inferred_env_package'

In [25]:
# ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples_2024-09-23.duckdb.gz'
ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples.duckdb.gz'

In [26]:
gold_data_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
gold_data_file_name = "goldData.xlsx" # goldData.xlsx: Microsoft Excel 2007+
gold_csv_file_name = "gold_biosamples.csv"
BIOSAMPLES_SHEET = "Biosample"

In [27]:
goldterms_semsql_url = "https://s3.amazonaws.com/bbop-sqlite/goldterms.db.gz"


In [28]:
all_nmdc_biosamples_file = 'all_nmdc_biosamples.json'

# CURIe Constants

In [29]:
BIOME = 'ENVO:00000428'
TERRESTRIAL_BIOME = 'ENVO:00000446'
AQUATIC_BIOME = 'ENVO:00002030'
ABP = 'ENVO:01000813'
ENVIRONMENTAL_SYSTEM = 'ENVO:01000254'
ENVIRONMENTAL_MATERIAL = 'ENVO:00010483'

SOIL = 'ENVO:00001998'
LIQUID_WATER = 'ENVO:00002006'
WATER_ICE = 'ENVO:01000277'

HUMAN_CONSTRUCTION = 'ENVO:00000070'
BUILDING = 'ENVO:00000073'
BUILDING_PART = 'ENVO:01000420'

# Settings-based Queries

In [46]:
goldterms_subclass_query = f"""
select
	subject
from
	entailed_edge ee
where
	predicate = 'rdfs:subClassOf'
	and object = '{goldterms_root}'
"""

In [47]:
# todo could this have been done with a OAK query, eliminating the need to explicitly download the file?

goldterms_envo_query = f"""
SELECT
	*
FROM
	statements s
WHERE
	predicate in ('{"', '".join(gold_context_selectors)}')"""

In [48]:
ncbi_query = f"""
SELECT content, COUNT(1) AS sample_count 
FROM attributes 
WHERE harmonized_name = '{ncbi_context_selector}' AND package_content like '%{ncbi_package_selector}'
GROUP BY content
ORDER BY COUNT(1) DESC
"""

In [49]:
# and s1.subject = s1.stanza eliminates matches on blank node anntoation rows (probably woudn't change results but adds a little overhead)

extension_query = f"""
select
		s1.subject ,
		s2.predicate,
		COALESCE (s2."object",
	s2."value") as content
from
	statements s1
join statements s2 on 
	s1.subject = s2.subject
where
	{gold_first_where}
	and s1.predicate = 'rdfs:label'
	and s1.subject = s1.stanza
	and s2.predicate in ('mixs:env_broad', 'mixs:env_local', 'mixs:env_medium', 'mixs:mixs_extension', 'rdfs:label', 'mixs:other', 'mixs:anatomical_site', 'mixs:host_taxon') ;
"""

# todo provide examples of sediment samples that are excluded by the requirement for aquatic

# Locally Defined Functions
_Currently using locally-defined cache dictionaries_

In [50]:
def predict_from_normalized_env_packages(df_raw, adapter):
    # Apply the function to the relevant columns

    df = df_raw.copy()

    print(df.shape)
    for column in ['env_broad_scale_id', 'env_local_scale_id', 'env_medium_id']:
        df[f'{column}_ancestors'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['ancestors'])
        df[f'{column}_descendants'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['descendants'])

    # Vectorize each set of terms separately
    broad_scale_ancestors = vectorize_terms(df, 'env_broad_scale_id_ancestors')
    broad_scale_descendants = vectorize_terms(df, 'env_broad_scale_id_descendants')

    local_scale_ancestors = vectorize_terms(df, 'env_local_scale_id_ancestors')
    local_scale_descendants = vectorize_terms(df, 'env_local_scale_id_descendants')

    medium_ancestors = vectorize_terms(df, 'env_medium_id_ancestors')
    medium_descendants = vectorize_terms(df, 'env_medium_id_descendants')

    # Combine all feature matrices
    X = hstack([
        broad_scale_ancestors,
        broad_scale_descendants,
        local_scale_ancestors,
        local_scale_descendants,
        medium_ancestors,
        medium_descendants
    ])

    # Filter the DataFrame to only include non-null rows for the target column
    df_filtered = df[df['normalized_env_package'].notnull() & (df['normalized_env_package'] != "")]

    # Extract the target variable
    y = df_filtered['normalized_env_package']

    # Ensure X corresponds to the filtered rows
    X_filtered = X[df_filtered.index]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.3, random_state=42)

    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # not determining confidence for each class nor saving any diagnostics any more

    return clf.predict(X)

In [51]:
def get_hierarchy_terms(my_curie: str, adapter) -> dict:
    """
    Extract ancestor and descendant terms from the ontology for a given CURIE,
    using caching to improve performance and filtering by 'is_a' relationships.

    Args:
        my_curie (str): CURIE identifier for the ontology term.
        adapter: Ontology adapter.

    Returns:
        dict: Dictionary containing lists of ancestor and descendant terms.
    """
    if my_curie not in ancestor_cache:
        try:
            ancestors = list(adapter.ancestors(my_curie, predicates=[IS_A]))
            ancestor_cache[my_curie] = [adapter.label(ancestor) for ancestor in ancestors if ancestor]
        except Exception as my_e:
            print(f"Error retrieving ancestors for {my_curie}: {my_e}")
            ancestor_cache[my_curie] = []

    if my_curie not in descendant_cache:
        try:
            descendants = list(adapter.descendants(my_curie, predicates=[IS_A]))
            descendant_cache[my_curie] = [adapter.label(descendant) for descendant in descendants if descendant]
        except Exception as my_e:
            print(f"Error retrieving descendants for {my_curie}: {my_e}")
            descendant_cache[my_curie] = []

    return {
        'ancestors': ancestor_cache[my_curie],
        'descendants': descendant_cache[my_curie],
    }

# Procedural Code Starts Here

In [52]:
# Determine the filenames and target directory for the NCBI DuckDB
ncbi_compressed_filename = urlparse(ncbi_duckdb_url).path.split('/')[-1]
ncbi_filename = os.path.splitext(ncbi_compressed_filename)[0]
ncbi_compressed_file_path = os.path.join(ncbi_compressed_filename)
ncbi_uncompressed_file_path = os.path.join(ncbi_filename)

# target_dir = os.path.join('.') # just assume the files are downloaded into the same directory as the notebook

In [53]:
if os.path.isfile(ncbi_uncompressed_file_path):
    print(f"{ncbi_uncompressed_file_path} is already present in the current working directory.")
else:
    if os.path.isfile(ncbi_compressed_file_path):
        print(f"{ncbi_compressed_file_path} is already present in the current working directory.")
    else:
        print(f"{ncbi_compressed_file_path} needs to be downloaded")
        ncbi_response = requests.get(ncbi_duckdb_url)
        with open(ncbi_compressed_file_path, "wb") as f:
            f.write(ncbi_response.content)
        # ~ 2 minutes @ 250 Mbps
    
    # Unzip the compressed file and save the extracted file in target directory
    print(f"{ncbi_compressed_file_path} needs to be unpacked")
    with gzip.open(ncbi_compressed_file_path, "rb") as f_in:
        with open(ncbi_uncompressed_file_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    # ~ 2 minutes

ncbi_biosamples.duckdb is already present in the current working directory.


In [54]:
ncbi_conn = duckdb.connect(database=ncbi_uncompressed_file_path, read_only=True)

In [55]:
envo_adapter = get_adapter(envo_adapter_string)

In [56]:
anchor_descendants = get_curie_descendants_label_dict(semantic_anchor, [IS_A], envo_adapter)

In [57]:
anchor_descendants_lod = curie_descendants_label_dict_to_lod(anchor_descendants)

In [58]:
anchor_descendants_frame = curie_descendants_label_lod_to_df(anchor_descendants_lod)

In [59]:
anchor_descendants_frame

Unnamed: 0,curie,label
0,ENVO:00001999,marine water body
1,ENVO:01000188,tropical savanna biome
2,ENVO:00000487,paternoster lake
3,ENVO:01000860,temperate marine upwelling biome
4,ENVO:01000199,mediterranean forest biome
...,...,...
1731,ENVO:01000429,burrow
1732,ENVO:01000431,mixed forest
1733,ENVO:01000536,factory
1734,ENVO:00000873,freshwater biome


----

In [60]:
sv = get_schemaview_from_source(previous_submission_schema_url)

In [61]:
# todo break out slow steps into its own cell

try:
    CONTEXT_ENUM_def = sv.get_enum(CONTEXT_ENUM)
    context_pvs_keys = list(CONTEXT_ENUM_def.permissible_values.keys())
except AttributeError as e:
    # Handle the AttributeError
    print(f"An AttributeError occurred: {e}")
    context_pvs_keys =[]
    

In [62]:
print(context_pvs_keys)

['astronomical body part [ENVO:01000813]', '__coast [ENVO:01000687]', '__solid astronomical body part [ENVO:00000191]', '______landform [ENVO:01001886]', '______channel [ENVO:03000117]', '________tunnel [ENVO:00000068]', '______surface landform [ENVO:01001884]', '________desert [ENVO:01001357]', '________outcrop [ENVO:01000302]', '________boulder field [ENVO:00000537]', '________landfill [ENVO:00000533]', '________hummock [ENVO:00000516]', '________terrace [ENVO:00000508]', '________peninsula [ENVO:00000305]', '________shore [ENVO:00000304]', '__________lake shore [ENVO:00000382]', '________dry lake [ENVO:00000277]', '________karst [ENVO:00000175]', '________isthmus [ENVO:00000174]', '________badland [ENVO:00000127]', '________volcanic feature [ENVO:00000094]', '__________volcanic cone [ENVO:00000398]', '____________tuff cone [ENVO:01000664]', '________beach [ENVO:00000091]', '________plain [ENVO:00000086]', '________cave [ENVO:00000067]', '________spring [ENVO:00000027]', '______slope

In [63]:
initially_parsed_context_pvs = parse_hierarchically_underscored_strings(context_pvs_keys)

In [64]:
deduped_context_pvs = dedupe_underscoreless_pvs(initially_parsed_context_pvs)

In [65]:
pv_validation_results = validate_curie_label_list_dict(deduped_context_pvs, envo_adapter, print_flag=True)

In [66]:
pv_validation_results

{'problems': [],
 'valids': [{'curie': 'ENVO:01000813', 'label': 'astronomical body part'},
  {'curie': 'ENVO:01000687', 'label': 'coast'},
  {'curie': 'ENVO:00000191', 'label': 'solid astronomical body part'},
  {'curie': 'ENVO:01001886', 'label': 'landform'},
  {'curie': 'ENVO:03000117', 'label': 'channel'},
  {'curie': 'ENVO:00000068', 'label': 'tunnel'},
  {'curie': 'ENVO:01001884', 'label': 'surface landform'},
  {'curie': 'ENVO:01001357', 'label': 'desert'},
  {'curie': 'ENVO:01000302', 'label': 'outcrop'},
  {'curie': 'ENVO:00000537', 'label': 'boulder field'},
  {'curie': 'ENVO:00000533', 'label': 'landfill'},
  {'curie': 'ENVO:00000516', 'label': 'hummock'},
  {'curie': 'ENVO:00000508', 'label': 'terrace'},
  {'curie': 'ENVO:00000305', 'label': 'peninsula'},
  {'curie': 'ENVO:00000304', 'label': 'shore'},
  {'curie': 'ENVO:00000382', 'label': 'lake shore'},
  {'curie': 'ENVO:00000277', 'label': 'dry lake'},
  {'curie': 'ENVO:00000175', 'label': 'karst'},
  {'curie': 'ENVO:0000

----

In [67]:
if os.path.isfile(all_nmdc_biosamples_file):
    print(f"{all_nmdc_biosamples_file} is present in the current working directory and will be read into all_nmdc_biosamples.")
    # with open(all_nmdc_biosamples_file, 'r') as file:
    #     all_nmdc_biosamples = yaml.full_load(file)
    # read as json
    with open(all_nmdc_biosamples_file, 'r') as f:
        all_nmdc_biosamples = json.load(f)

else:
    print(f"All NMDC Biosamples need to be fetched and saved to {all_nmdc_biosamples_file}")
    all_nmdc_biosamples = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL,
                                               BIOSAMPLE_SET_COLLECTION)
    # with open(all_nmdc_biosamples_file, 'w') as file:
    #     documents = yaml.dump(all_nmdc_biosamples, file)
    # save as json
    with open(all_nmdc_biosamples_file, 'w') as f:
        json.dump(all_nmdc_biosamples, f)

# this saves network traffic. could use JSON for faster performance. 
# 1 minute for network fetch and JSON write?!
# 1 minute for yaml read
# instantaneous for JSON read?

all_nmdc_biosamples.json is present in the current working directory and will be read into all_nmdc_biosamples.


In [68]:
env_pacakge_overrides = tsv_to_dict_of_dicts(env_package_override_file, 'id')

In [69]:
# todo show env_pacakge_overrides as a data frame
#   with some other columns for context?

In [70]:
biosample_contexts_lod = biosamples_lod_context_extractor(all_nmdc_biosamples, envo_adapter,
                                                          my_env_pacakge_overrides=env_pacakge_overrides)

# ~ 10 seconds, lots of logging

Overriding env_package for biosample nmdc:bsm-11-0k8nkx16 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-19v98823 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-1yvac190 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-28kgw077 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-2hswww54 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-34przm31 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-35m0rm03 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-3636w778 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nffqc45 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nhng665 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3r4g4610 from  to hydrocarbon resources-fluids_swabs
Overriding env_package

In [71]:
nmdc_biosample_contexts_frame = pd.DataFrame(biosample_contexts_lod)

In [72]:
# print a value count for the normalized_env_package column
print("Value counts for normalized_env_package column:")
print(nmdc_biosample_contexts_frame['normalized_env_package'].value_counts(dropna=False))

Value counts for normalized_env_package column:
normalized_env_package
                                                   5838
soil                                               1707
plant-associated                                    401
water                                               192
miscellaneous natural or artificial environment     140
host-associated                                      61
hydrocarbon resources-fluids_swabs                   23
Name: count, dtype: int64


In [73]:
package_predictions = predict_from_normalized_env_packages(nmdc_biosample_contexts_frame, envo_adapter)

# these predictions often have a f1 of 1.00
# many people might find that hard to believe

(8362, 14)
                                                 precision    recall  f1-score   support

                                host-associated       1.00      1.00      1.00        16
             hydrocarbon resources-fluids_swabs       1.00      1.00      1.00         7
miscellaneous natural or artificial environment       1.00      1.00      1.00        50
                               plant-associated       1.00      1.00      1.00       122
                                           soil       1.00      1.00      1.00       506
                                          water       1.00      1.00      1.00        57

                                       accuracy                           1.00       758
                                      macro avg       1.00      1.00      1.00       758
                                   weighted avg       1.00      1.00      1.00       758



In [74]:
nmdc_biosample_contexts_frame['predicted_env_package'] = package_predictions

In [75]:
nmdc_biosample_contexts_frame.shape

(8362, 15)

In [76]:
nmdc_biosample_contexts_frame = nmdc_biosample_contexts_frame[
    nmdc_biosample_contexts_frame['predicted_env_package'] == nmdc_package_selector]

In [77]:
nmdc_biosample_contexts_frame.shape

(6162, 15)

----

In [78]:
ncbi_frame = ncbi_conn.execute(ncbi_query).fetchdf()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [79]:
ncbi_frame.insert(0, 'serial_number', range(1, len(ncbi_frame) + 1))

In [80]:
# includes values with counts of one... useful for discovering drag-down submissions?

In [81]:
ncbi_frame['content_list'] = ncbi_frame['content'].str.split('|')

In [82]:
## diagnostically useful, but why are we saving this and not all of the intermediate dataframes?
# ncbi_frame.to_csv("ncbi_frame.tsv", sep="\t", index=False)

In [83]:
# todo is there any reason to not do this ?
ncbi_frame = ncbi_frame[ncbi_frame['content'].notna() & (ncbi_frame['content'] != '')]

In [84]:
ncbi_frame['content_count'] = ncbi_frame['content_list'].apply(len)

In [85]:
ncbi_frame.shape

(11507, 5)

In [86]:
ncbi_frame = ncbi_frame.explode('content_list').reset_index(drop=True)

In [87]:
ncbi_frame.shape

(11989, 5)

In [88]:
# how many content_list strings contain envo multiple times now?

In [89]:
ncbi_frame['envo_count'] = ncbi_frame['content_list'].str.lower().str.count("envo")

In [90]:
ncbi_frame['envo_count'].value_counts()

envo_count
0    9450
1    2528
2       9
4       1
3       1
Name: count, dtype: int64

doesn't split multiple annotation strings delimited with something other than '|'
annotations with no curies but multiple strings will be "annotated" with OAK, but currently only the one best OAK annotation is kept

In [91]:
ncbi_frame[['extracted_label', 'extracted_curie']] = ncbi_frame['content_list'].apply(parse_curie_label)

In [92]:
parse_failures = ncbi_frame[
    (ncbi_frame['envo_count'] > 0) & (ncbi_frame['extracted_curie'].isna() | (ncbi_frame['extracted_curie'] == ''))]


In [93]:
parse_failures

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie
138,138,ENVO:soil,356,ENVO:soil,1,1,ENVO soil,
151,148,ENVO:microbial feature,343,ENVO:microbial feature,1,1,ENVO microbial feature,
306,300,ENVO:biotic mesoscopic physical object,181,ENVO:biotic mesoscopic physical object,1,1,ENVO biotic mesoscopic physical object,
1201,1135,[ENVO00000486],30,[ENVO00000486],1,1,[ENVO00000486],
1394,1311,ENVO:mixed forest,20,ENVO:mixed forest,1,1,ENVO mixed forest,
...,...,...,...,...,...,...,...,...
11983,11502,ENVO0000328,1,ENVO0000328,1,1,ENVO0000328,
11984,11503,ENVO0000334,1,ENVO0000334,1,1,ENVO0000334,
11985,11504,ENVO0000352,1,ENVO0000352,1,1,ENVO0000352,
11987,11506,ENVO0000027,1,ENVO0000027,1,1,ENVO0000027,


Should we try parsing on additional CURIE delimiters? Or no delimiter?


In [94]:
ncbi_frame['real_label'] = ncbi_frame['extracted_curie'].apply(envo_adapter.label)

In [95]:
# Apply the annotation function to each row in the 'label' column
ncbi_frame['longest_annotation_curie'] = ncbi_frame['extracted_label'].apply(
    lambda x: get_longest_annotation_curie(x, envo_adapter, MIN_ANNOTATION_LEN))

# this cell only takes ~ 1 minute, but generates a lot of "ERRORS" and WARNINGS in a red fornt
#   while loading the ontologies that are used for annotating


ERROR:root:Skipping statements(subject=ENVO:00000112,predicate=oio:hasDbXref,object=<http://www.eionet.europa.eu/gemet/concept/8704>,value=None,datatype=None,language=None,); ValueError: <http://www.eionet.europa.eu/gemet/concept/8704> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:00001996,predicate=oio:hasDbXref,object=<https://en.wikipedia.org/wiki/Acid_mine_drainage>,value=None,datatype=None,language=None,); ValueError: <https://en.wikipedia.org/wiki/Acid_mine_drainage> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000225,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests>,value=None,datatype=None,language=None,); ValueError: <https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000227,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical

In [96]:
ncbi_frame['longest_annotation_label'] = ncbi_frame['longest_annotation_curie'].apply(envo_adapter.label)

In [97]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label
0,1,not applicable,24038,not applicable,1,0,not applicable,,,,
1,2,missing,14739,missing,1,0,missing,,,,
2,3,soil,8593,soil,1,0,soil,,,ENVO:00001998,soil
3,4,not collected,7666,not collected,1,0,not collected,,,,
4,5,Agricultural Land,5994,Agricultural Land,1,0,Agricultural Land,,,ENVO:00000077,agricultural ecosystem
...,...,...,...,...,...,...,...,...,...,...,...
11984,11503,ENVO0000334,1,ENVO0000334,1,1,ENVO0000334,,,,
11985,11504,ENVO0000352,1,ENVO0000352,1,1,ENVO0000352,,,,
11986,11505,soil biomass magnetite + surfactant,1,soil biomass magnetite + surfactant,1,0,soil biomass magnetite + surfactant,,,CHEBI:35195,surfactant
11987,11506,ENVO0000027,1,ENVO0000027,1,1,ENVO0000027,,,,


## GOLD mappings/Biosample counts hybrid

we're currently including
- mappings in hybrid with biosample counts
- mappings only

And we're casting a wide net, especially for the hybrid approach
- searching for 'soil', 'sediment' etc. in GOLDTERMS labels without anchoring them like 'Environmental > Aquatic > Sediment'
- retrieving the CURIes for env_broad_scale, env_local_scale and env_medium for all voting sheets, and trusting orthogonal filtering to remove the inappropriate CURIes

Should we now add (or switch to) direct biosample counts of GOLD "envo" annotations?

Efficient retrieval of  all GOLD data in a given scope isn't easy

In [98]:
if os.path.isfile(gold_data_file_name):
    print(f"{gold_data_file_name} is already present in the current working directory.")
else:
    print(f"{gold_data_file_name} needs to be downloaded")
    gold_response = requests.get(gold_data_url)
    with open(gold_data_file_name, "wb") as f:
        f.write(gold_response.content)
        # ~ 10 seconds  @ 250 Mbps

goldData.xlsx is already present in the current working directory.


Expect to see

> /home/mark/.cache/pypoetry/virtualenvs/nmdc-submission-schema-DC6HKp4p-py3.10/lib/python3.10/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

In [99]:
if os.path.isfile(gold_csv_file_name):
    print(f"{gold_csv_file_name} is present in the current working directory and will be parsed into gold_biosamples_frame.")
    gold_biosamples_frame = pd.read_csv(gold_csv_file_name, sep="\t")
else:
    print(f"gold_biosamples_frame and {gold_csv_file_name} need to be created")
    gold_biosamples_frame = pd.read_excel(gold_data_file_name, sheet_name=BIOSAMPLES_SHEET)
    gold_biosamples_frame.to_csv("gold_biosamples.csv", index=False, sep="\t")
    # 2 minutes

gold_biosamples.csv is present in the current working directory and will be parsed into gold_biosamples_frame.


In [100]:
gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].fillna(
    0).astype(int)


In [101]:
gold_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,Environmental,Aquatic,Thermal springs,Hot (42-90C),Unclassified
1,Gb0035601,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
2,Gb0035602,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
3,Gb0035635,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
4,Gb0035638,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211608,Gb0405291,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
211609,Gb0405292,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
211610,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified
211611,Gb0405294,Freshwater microbial communities from Prairie ...,449393.0,freshwater metagenome,lake water,2023-08-08,"USA: Prairie Lake NEON Field Site, Vashti, Nor...",47.159710,-99.118723,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified


In [102]:
# Determine the filenames and target directory
goldterms_compressed_filename = urlparse(goldterms_semsql_url).path.split('/')[-1]
goldterms_filename = os.path.splitext(goldterms_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

# Print to confirm the filenames
print(goldterms_filename)

goldterms.db


In [103]:
# Fetch the contents from the URL and save compressed file in target directory
goldterms_response = requests.get(goldterms_semsql_url)
goldterms_compressed_file_path = os.path.join(target_dir, goldterms_compressed_filename)
with open(goldterms_compressed_file_path, "wb") as f:
    f.write(goldterms_response.content)
    
# ~ 1 second

In [104]:
# Unzip the compressed file and save the extracted file in target directory
goldterms_uncompressed_file_path = os.path.join(target_dir, goldterms_filename)
with gzip.open(goldterms_compressed_file_path, "rb") as f_in:
    with open(goldterms_uncompressed_file_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# ~ 1 second

In [105]:
# that's all fast. don't bother caching

In [106]:
goldterms_conn = sqlite3.connect(goldterms_uncompressed_file_path)

In [107]:
goldterms_subjects = pd.read_sql_query(goldterms_subclass_query, goldterms_conn)

In [108]:
goldterms_subjects['path_id'] = goldterms_subjects['subject'].str.extract(r'GOLDTERMS:(\d+)')

In [109]:
goldterms_subjects

Unnamed: 0,subject,path_id
0,GOLDTERMS:5820,5820
1,GOLDTERMS:5421,5421
2,GOLDTERMS:5617,5617
3,GOLDTERMS:Environmental-Terrestrial-Soil-Natur...,
4,GOLDTERMS:Environmental-Terrestrial-Soil-Pasture,
...,...,...
81,GOLDTERMS:4203,4203
82,GOLDTERMS:Environmental-Terrestrial-Soil-Uncla...,
83,GOLDTERMS:5804,5804
84,GOLDTERMS:4241,4241


In [110]:
gold_path_ids = goldterms_subjects['path_id'].dropna().unique().tolist()
gold_path_ids = [int(my_id) for my_id in gold_path_ids]


In [111]:
gold_env_filtered_biosamples_frame = gold_biosamples_frame[
    gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].isin(gold_path_ids)]


In [112]:
gold_env_filtered_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
11,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
12,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
13,Gb0050973,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classificaiton Distrophic oxisol,,Soils collected from different sites within th...,-29.539671,-55.107556,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
14,Gb0050974,Soil ecosystem from different sites within th...,410658.0,soil metagenome,"soil classification euic, hyperthermic lithic ...",,Soils collected from different sites within th...,26.663199,-80.628500,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
56,Gb0051017,Mammuthus primigenius fossil ecosystem from Bo...,444079.0,fossil metagenome,"Bolshaya Kolopatkaya river, Russia",,Russia: Sakha Republic,70.000000,151.000000,4418,Environmental,Terrestrial,Soil,Fossil,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211578,Gb0405261,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
211579,Gb0405262,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.112820,-119.731615,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
211580,Gb0405263,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified
211581,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,Environmental,Terrestrial,Soil,Unclassified,Unclassified


In [113]:
goldterms_context_frame = pd.read_sql_query(goldterms_envo_query, goldterms_conn)

In [114]:
goldterms_context_frame['object_label'] = goldterms_context_frame['object'].apply(envo_adapter.label)

In [115]:
goldterms_context_frame['path_id'] = goldterms_context_frame['subject'].str.extract(r'GOLDTERMS:(\d+)')

In [116]:
goldterms_context_frame

Unnamed: 0,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,GOLDTERMS:Engineered-Artificial-ecosystem,GOLDTERMS:Engineered-Artificial-ecosystem,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
1,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
2,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,GOLDTERMS:Engineered-Artificial-ecosystem-Plan...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
3,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
4,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,mixs:env_broad,ENVO:01000313,,,,,anthropogenic environment,
...,...,...,...,...,...,...,...,...,...,...
1342,GOLDTERMS:5833,GOLDTERMS:5833,mixs:env_medium,ENVO:00002001,,,,,waste water,5833
1343,GOLDTERMS:5836,GOLDTERMS:5836,mixs:env_medium,ENVO:00002044,,,,,sludge,5836
1344,GOLDTERMS:5838,GOLDTERMS:5838,mixs:env_medium,ENVO:00002001,,,,,waste water,5838
1345,GOLDTERMS:5841,GOLDTERMS:5841,mixs:env_medium,ENVO:03600013,,,,,bituminous sand,5841


In [117]:
# Fill NaN values in 'BIOSAMPLE ECOSYSTEM PATH ID' with 0 and convert to int
gold_env_filtered_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_env_filtered_biosamples_frame[
    'BIOSAMPLE ECOSYSTEM PATH ID'].fillna(0).astype(int)

# Drop rows with NaN in 'path_id' in goldterms_context_frame
goldterms_context_frame = goldterms_context_frame.dropna(subset=['path_id'])

# Convert 'path_id' to int
goldterms_context_frame['path_id'] = goldterms_context_frame['path_id'].astype(int)

# Perform the left merge
gold_env_filtered_biosamples_with_inferred = gold_env_filtered_biosamples_frame.merge(
    goldterms_context_frame,
    left_on='BIOSAMPLE ECOSYSTEM PATH ID',
    right_on='path_id',
    how='left'
)


In [118]:
gold_env_filtered_biosamples_with_inferred

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
1,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_medium,ENVO:00001998,,,,,soil,4212
2,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
3,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_medium,ENVO:00001998,,,,,soil,4212
4,Gb0050973,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classificaiton Distrophic oxisol,,Soils collected from different sites within th...,-29.539671,-55.107556,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32604,Gb0405263,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_medium,ENVO:00001998,,,,,soil,4212
32605,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
32606,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_medium,ENVO:00001998,,,,,soil,4212
32607,Gb0405265,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212


# GOLDTERMS only approach

In [119]:
goldterms_result = pd.read_sql_query(extension_query, goldterms_conn)

In [120]:
goldterms_result

Unnamed: 0,subject,predicate,content
0,<GOLDVOCAB:Paddy-field/soil>,rdfs:label,Paddy field/soil
1,GOLDTERMS:Environmental-Terrestrial-Soil-Arable,mixs:env_broad,ENVO:00000446
2,GOLDTERMS:Environmental-Terrestrial-Soil-Arable,mixs:env_local,ENVO:01001177
3,GOLDTERMS:Environmental-Terrestrial-Soil-Arable,mixs:env_medium,ENVO:00005742
4,GOLDTERMS:Environmental-Terrestrial-Soil-Arable,mixs:mixs_extension,mixs:Soil
...,...,...,...
458,GOLDVOCAB:Riparian-soil,rdfs:label,Riparian soil
459,GOLDVOCAB:Soil,rdfs:label,Soil
460,GOLDVOCAB:Soil-_28non-planted_29,rdfs:label,Soil (non-planted)
461,GOLDVOCAB:Soil-crust,rdfs:label,Soil crust


In [121]:
# # todo: save this kind of content before subsetting on an environment
# #   the subsetting is currently baked into the query
# 
# # see also goldterms_queries.ipynb in MAM's Collab
# goldterms_result.to_csv("goldterms_single_environment_mappings_long.tsv", sep="\t", index=False)

In [123]:
goldterms_only_curies = goldterms_result.loc[goldterms_result['predicate'].isin(gold_context_selectors), 'content']


In [124]:
goldterms_only_curies = goldterms_only_curies.unique().tolist()

In [126]:
# goldterms_only_curies

----

In [127]:
anchor_curies = list(anchor_descendants_frame['curie'])
legacy_pv_curies = [i['curie'] for i in pv_validation_results['valids']]

biome_curies = list(envo_adapter.descendants(BIOME, predicates=[IS_A])) # 
terrestrial_biome_curies = list(envo_adapter.descendants(TERRESTRIAL_BIOME, predicates=[IS_A]))
aquatic_biome_curies = list(envo_adapter.descendants(AQUATIC_BIOME, predicates=[IS_A]))
abp_curies = list(envo_adapter.descendants(ABP, predicates=[IS_A]))
env_sys_curies = list(envo_adapter.descendants(ENVIRONMENTAL_SYSTEM, predicates=[IS_A]))
env_mat_curies = list(envo_adapter.descendants(ENVIRONMENTAL_MATERIAL, predicates=[IS_A]))
obsoletes_curies = list(envo_adapter.obsoletes())

soil_curies = list(envo_adapter.descendants(SOIL, predicates=[IS_A])) # 
liquid_water_curies = list(envo_adapter.descendants(LIQUID_WATER, predicates=[IS_A])) # 
water_ice_curies = list(envo_adapter.descendants(WATER_ICE, predicates=[IS_A])) # 

human_construction_curies = list(envo_adapter.descendants(HUMAN_CONSTRUCTION, predicates=[IS_A])) #
building_curies = list(envo_adapter.descendants(BUILDING, predicates=[IS_A])) #
building_part_curies = list(envo_adapter.descendants(BUILDING_PART, predicates=[IS_A])) #


In [128]:
include_in_rows = set()

In [129]:
include_in_rows.update(anchor_curies)

In [130]:
include_in_rows.update(legacy_pv_curies)

In [131]:
include_in_rows.update(nmdc_biosample_contexts_frame[nmdc_context_selector])

In [132]:
include_in_rows.update(ncbi_frame['extracted_curie'])

In [133]:
include_in_rows.update(ncbi_frame['longest_annotation_curie'])

In [134]:
include_in_rows.update(gold_env_filtered_biosamples_with_inferred['object'])

In [135]:
include_in_rows.update(goldterms_only_curies)

In [136]:
rows_lod = []

# Begin constructing the voting sheet

In [137]:
for curie in include_in_rows:
    if curie is None:
        continue
        
    # ONCE AGAIN, assuming that EnvO is the only ontology we'll check against
    current_ancestors = list(envo_adapter.ancestors(curie, predicates=[IS_A])) # vs legacy_pv_curies
    ancestors_in_enum_count = len(set(current_ancestors) & set(legacy_pv_curies))
    
    current_descendants  = list(envo_adapter.descendants(curie, predicates=[IS_A])) # vs legacy_pv_curies
    descendants_in_enum_count  = len(set(current_descendants) & set(legacy_pv_curies))
    
    
    row = {
        'curie': curie,
        'label': envo_adapter.label(curie),
        'envo_native': False,
        'obsolete': False,
        comparison_enum_column_name: False,
        'ancestors_in_enum_count': ancestors_in_enum_count,
        'descendants_in_enum_count': descendants_in_enum_count,
        'abp': False,
        'env_sys': False,
        'biome': False,
        'terrestrial_biome': False,
        'aquatic_biome': False,
        'env_mat': False,
        'soil': False,
        'liquid water': False,
        'water ice': False,
        'human_construction': False,
        'building': False,
        'building_part': False,
        'goldterms_mappings': False,
    }
        
    if curie in biome_curies:
        row['biome'] = True
    if curie in terrestrial_biome_curies:
        row['terrestrial_biome'] = True
    if curie in aquatic_biome_curies:
        row['aquatic_biome'] = True
    if curie in abp_curies:
        row['abp'] = True
    if curie in env_sys_curies:
        row['env_sys'] = True
    if curie in env_mat_curies:
        row['env_mat'] = True
    if curie in soil_curies:
        row['soil'] = True
    if curie in liquid_water_curies:
        row['liquid water'] = True
    if curie in water_ice_curies:
        row['water ice'] = True
    if curie in human_construction_curies:
        row['human_construction'] = True
    if curie in building_curies:
        row['building'] = True
    if curie in building_part_curies:
        row['building_part'] = True
    if curie in legacy_pv_curies:
        row[comparison_enum_column_name] = True
    if curie in obsoletes_curies:
        row['obsolete'] = True
    if curie in goldterms_only_curies:
        row['goldterms_mappings'] = True
        
    try:
        prefix, local_id = curie.split(':')
        if prefix and prefix == 'ENVO' and row['label'] is not None:
            row['envo_native'] = True
    except Exception as e:
        # Print the exception message
        print(f"An error occurred: {e} trying to split {curie}")

    rows_lod.append(row)

# 2 minutes


# ^ Construction of the voting sheet

In [138]:
rows_frame = pd.DataFrame(rows_lod)

In [139]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,EnvLocalSoilEnum_10_7,ancestors_in_enum_count,descendants_in_enum_count,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,soil,liquid water,water ice,human_construction,building,building_part,goldterms_mappings
0,ENVO:01000869,area of scrub,True,False,False,1,0,True,False,False,False,False,False,False,False,False,False,False,False,False
1,ENVO:00000502,container of an intermittent saline lake,True,False,False,5,0,True,False,False,False,False,False,False,False,False,False,False,False,False
2,ENVO:00003088,pantothenate enriched soil,True,False,False,1,0,True,False,False,False,False,True,True,False,False,False,False,False,False
3,ENVO:00006073,,False,False,False,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False
4,ENVO:01001459,high-elevation mountain,True,False,False,6,0,True,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2872,ENVO:01000500,brick exterior wall,True,False,False,2,0,True,False,False,False,False,False,False,False,False,True,False,True,False
2873,ENVO:03000051,ice-encased melt pond,True,False,False,1,0,True,False,False,False,False,False,False,False,False,False,False,False,False
2874,ENVO:00005781,heat stressed soil,True,False,False,1,0,True,False,False,False,False,True,True,False,False,False,False,False,False
2875,ENVO:01001505,alpine tundra biome,True,False,False,1,0,True,True,True,True,False,False,False,False,False,False,False,False,False


# Merge in NMDC counts

In [140]:
nmdc_biosample_scoped_counts = nmdc_biosample_contexts_frame[nmdc_context_selector].value_counts().reset_index()
nmdc_biosample_scoped_counts.columns = ['curie', 'nmdc_scoped_count']


In [141]:
nmdc_biosample_scoped_counts

Unnamed: 0,curie,nmdc_scoped_count
0,ENVO:01000843,1268
1,ENVO:01000816,1067
2,ENVO:01000888,777
3,ENVO:01000869,614
4,ENVO:01000861,295
5,ENVO:01000893,245
6,ENVO:00000148,243
7,ENVO:01000892,207
8,ENVO:01000855,199
9,ENVO:01000891,169


In [142]:
# Perform the left merge
rows_frame = rows_frame.merge(
    nmdc_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

# Merge in GOLD hybrid counts

In [143]:
gold_env_filtered_biosamples_with_inferred

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
1,Gb0050971,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Mesic aquic argiudoll,,Soils collected from different sites within th...,40.104616,-88.226517,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_medium,ENVO:00001998,,,,,soil,4212
2,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
3,Gb0050972,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classification Dystric brunisol,,Soils collected from different sites within th...,52.743203,-91.718433,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_medium,ENVO:00001998,,,,,soil,4212
4,Gb0050973,Soil ecosystem from different sites within th...,410658.0,soil metagenome,soil classificaiton Distrophic oxisol,,Soils collected from different sites within th...,-29.539671,-55.107556,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32604,Gb0405263,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_medium,ENVO:00001998,,,,,soil,4212
32605,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212
32606,Gb0405264,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_medium,ENVO:00001998,,,,,soil,4212
32607,Gb0405265,Soil microbial communities from San Joaquin Ex...,410658.0,soil metagenome,soil,2023-02-27,"USA: Yosemite Lakes, California",37.111270,-119.728168,4212,...,GOLDTERMS:4212,GOLDTERMS:4212,mixs:env_broad,ENVO:00000446,,,,,terrestrial biome,4212


In [144]:
gold_biosample_scoped_counts = gold_env_filtered_biosamples_with_inferred['object'].value_counts().reset_index()
gold_biosample_scoped_counts.columns = ['curie', 'gold_hybrid_count']

In [145]:
gold_biosample_scoped_counts

Unnamed: 0,curie,gold_hybrid_count
0,ENVO:00001998,12218
1,ENVO:00000446,11079
2,ENVO:00000077,1609
3,ENVO:01001209,1527
4,ENVO:01000177,947
5,ENVO:00005774,874
6,ENVO:00005740,787
7,ENVO:00000134,629
8,ENVO:01001357,388
9,ENVO:00000292,250


In [146]:
# Perform the left merge
rows_frame = rows_frame.merge(
    gold_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [147]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,EnvLocalSoilEnum_10_7,ancestors_in_enum_count,descendants_in_enum_count,abp,env_sys,biome,...,env_mat,soil,liquid water,water ice,human_construction,building,building_part,goldterms_mappings,nmdc_scoped_count,gold_hybrid_count
0,ENVO:01000869,area of scrub,True,False,False,1,0,True,False,False,...,False,False,False,False,False,False,False,False,614.0,
1,ENVO:00000502,container of an intermittent saline lake,True,False,False,5,0,True,False,False,...,False,False,False,False,False,False,False,False,,
2,ENVO:00003088,pantothenate enriched soil,True,False,False,1,0,True,False,False,...,True,True,False,False,False,False,False,False,,
3,ENVO:00006073,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,False,,
4,ENVO:01001459,high-elevation mountain,True,False,False,6,0,True,False,False,...,False,False,False,False,False,False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2872,ENVO:01000500,brick exterior wall,True,False,False,2,0,True,False,False,...,False,False,False,False,True,False,True,False,,
2873,ENVO:03000051,ice-encased melt pond,True,False,False,1,0,True,False,False,...,False,False,False,False,False,False,False,False,,
2874,ENVO:00005781,heat stressed soil,True,False,False,1,0,True,False,False,...,True,True,False,False,False,False,False,False,,
2875,ENVO:01001505,alpine tundra biome,True,False,False,1,0,True,True,True,...,False,False,False,False,False,False,False,False,,


In [148]:
# 990 rows in https://docs.google.com/spreadsheets/d/12WH3eduBq2qSTy9zVF3n7fyajn6ssLZL/edit?gid=546570706#gid=546570706

In [149]:
# gold and ncbi counts are slightly trickier
# for gold: including mappings only, mappings in hybrid with biosample counts. 
#    Switch to direct biosample counts of GOLD "envo" annotations?
# ncbi: we have extracted curies and annotated curies

In [150]:
# todo move this stuff up to immediately after the creation of ncbi_frame ?

# todo don't accept extracted curie if no real label?
# any kind of string similarity checking for label of annotated curie vs extracted label ?
# look for long runs of curies?
# can we measure the beneficial impact of any of this? current crux: how to distribute counts

ncbi_frame['curie_list'] = ncbi_frame.apply(
    lambda my_row: list({my_row['extracted_curie'], my_row['longest_annotation_curie']} - {None}),
    axis=1
)

ncbi_frame['unique_curie_count'] = ncbi_frame['curie_list'].apply(len)

In [151]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count
0,1,not applicable,24038,not applicable,1,0,not applicable,,,,,[],0
1,2,missing,14739,missing,1,0,missing,,,,,[],0
2,3,soil,8593,soil,1,0,soil,,,ENVO:00001998,soil,[ENVO:00001998],1
3,4,not collected,7666,not collected,1,0,not collected,,,,,[],0
4,5,Agricultural Land,5994,Agricultural Land,1,0,Agricultural Land,,,ENVO:00000077,agricultural ecosystem,[ENVO:00000077],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11984,11503,ENVO0000334,1,ENVO0000334,1,1,ENVO0000334,,,,,[],0
11985,11504,ENVO0000352,1,ENVO0000352,1,1,ENVO0000352,,,,,[],0
11986,11505,soil biomass magnetite + surfactant,1,soil biomass magnetite + surfactant,1,0,soil biomass magnetite + surfactant,,,CHEBI:35195,surfactant,[CHEBI:35195],1
11987,11506,ENVO0000027,1,ENVO0000027,1,1,ENVO0000027,,,,,[],0


In [152]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    6978
0    4442
2     569
Name: count, dtype: int64

In [153]:
double_curie_frame = ncbi_frame[ncbi_frame['unique_curie_count'] > 1]

In [154]:
double_curie_frame = double_curie_frame[['extracted_curie', 'longest_annotation_curie']]

In [155]:
double_curie_frame = double_curie_frame.drop_duplicates()

In [156]:
double_curie_frame[['extracted_prefix', 'extracted_local_id']] = double_curie_frame['extracted_curie'].str.split(':', expand=True)

In [157]:
double_curie_frame['extracted_local_id_int'] = pd.to_numeric(double_curie_frame['extracted_local_id'], errors='coerce').astype('Int64')

In [158]:
# Ensure extracted_local_id_int is unique and sorted
unique_sorted_series = double_curie_frame['extracted_local_id_int'].dropna().drop_duplicates().sort_values()


In [159]:
# Find stretches
stretches_dict = find_consecutive_stretches_dict(unique_sorted_series)

# pprint.pprint(stretches_dict)

In [160]:
# Convert the stretches dictionary into a DataFrame
stretches_df = stretches_dict_to_long_dataframe(stretches_dict)

In [161]:
stretches_df

Unnamed: 0,stretch_id,value
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
354,9,1001507
355,9,1001508
356,9,1001509
357,9,1001510


In [162]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretches_df,
    left_on='extracted_local_id_int',
    right_on='value',
    how='left'
)

In [163]:
stretch_summary_df = summarize_stretch_groups(double_curie_frame)


In [164]:
stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:01000703,0.683544
1,2.0,ENVO:00000230,0.952381
2,3.0,ENVO:00000428,0.993056
3,4.0,ENVO:00000873,1.0
4,5.0,ENVO:00000078,1.0
5,6.0,ENVO:00002001,0.95
6,7.0,ENVO:01000431,0.333333
7,8.0,ENVO:01000628,0.483871
8,9.0,ENVO:01001803,1.0


In [165]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretch_summary_df,
    left_on='stretch_id',
    right_on='stretch_id',
    how='left'
)

In [166]:
drag_evidence_frame = double_curie_frame[double_curie_frame['stretch_id'] >= 1]
drag_evidence_frame = drag_evidence_frame[['extracted_curie', 'longest_annotation_curie']]
drag_evidence_frame['drag_evidence'] = True

In [167]:
drag_evidence_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,drag_evidence
4,ENVO:01000198,ENVO:01000431,True
21,ENVO:00000230,ENVO:00000035,True
23,ENVO:01000199,ENVO:01001791,True
24,ENVO:01000197,ENVO:01000430,True
37,ENVO:00000231,ENVO:00000230,True
...,...,...,...
437,ENVO:01028,ENVO:00000078,True
438,ENVO:01040,ENVO:00000078,True
439,ENVO:01044,ENVO:00000078,True
440,ENVO:00002012,ENVO:00002001,True


In [168]:
ncbi_frame = ncbi_frame.merge(
    drag_evidence_frame,
    left_on=['extracted_curie', 'longest_annotation_curie'],
    right_on=['extracted_curie', 'longest_annotation_curie'],
    how='left'
)

In [169]:
# Initialize dragless_curie_list with curie_list values
ncbi_frame["dragless_curie_list"] = ncbi_frame["curie_list"]

# Update dragless_curie_list based on the condition
for index, row in ncbi_frame.iterrows():
    if row["drag_evidence"] is True:
        if row["longest_annotation_curie"] is not None:
            ncbi_frame.at[index, "dragless_curie_list"] = [row["longest_annotation_curie"]]
        else:
            ncbi_frame.at[index, "dragless_curie_list"] = []

ncbi_frame['dragless_curie_count'] = ncbi_frame['dragless_curie_list'].apply(len)

In [170]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    6978
0    4442
2     569
Name: count, dtype: int64

In [171]:
ncbi_frame['dragless_curie_count'].value_counts()

dragless_curie_count
1    7429
0    4442
2     118
Name: count, dtype: int64

In [172]:
ncbi_frame.shape

(11989, 16)

In [173]:
ncbi_frame_undisputed = ncbi_frame[ncbi_frame['dragless_curie_count'] <= 1]

In [174]:
ncbi_frame_undisputed.shape

(11871, 16)

In [175]:
ncbi_frame_disputed = ncbi_frame[ncbi_frame['dragless_curie_count'] > 1]

In [176]:
ncbi_frame_disputed.shape

(118, 16)

In [177]:
ncbi_frame_disputed = ncbi_frame_disputed.explode("dragless_curie_list", ignore_index=True)


In [178]:
ncbi_frame_disputed.shape

(236, 16)

In [179]:
ncbi_frame_disputed["dragless_curie_list"] = ncbi_frame_disputed["dragless_curie_list"].apply(lambda x: [x])

In [180]:
# Combine the rows of ncbi_frame_undisputed and ncbi_frame_disputed into a new DataFrame
ncbi_disputes_exploded_frame = pd.concat([ncbi_frame_undisputed, ncbi_frame_disputed], ignore_index=True)


In [181]:
ncbi_disputes_exploded_frame.shape

(12107, 16)

In [182]:
ncbi_disputes_exploded_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count,drag_evidence,dragless_curie_list,dragless_curie_count
0,1,not applicable,24038,not applicable,1,0,not applicable,,,,,[],0,,[],0
1,2,missing,14739,missing,1,0,missing,,,,,[],0,,[],0
2,3,soil,8593,soil,1,0,soil,,,ENVO:00001998,soil,[ENVO:00001998],1,,[ENVO:00001998],1
3,4,not collected,7666,not collected,1,0,not collected,,,,,[],0,,[],0
4,5,Agricultural Land,5994,Agricultural Land,1,0,Agricultural Land,,,ENVO:00000077,agricultural ecosystem,[ENVO:00000077],1,,[ENVO:00000077],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12102,11439,irrigation system [ENVO:00000029]|fertilizatio...,1,fertilization process [ENVO:02500001],3,1,fertilization process,ENVO:02500001,desertification,BFO:0000015,process,"[BFO:0000015, ENVO:02500001]",2,,[ENVO:02500001],2
12103,11440,irrigation system [ENVO:00000029]|fertilizatio...,1,fertilization process [ENVO:02500001],3,1,fertilization process,ENVO:02500001,desertification,BFO:0000015,process,"[BFO:0000015, ENVO:02500001]",2,,[BFO:0000015],2
12104,11440,irrigation system [ENVO:00000029]|fertilizatio...,1,fertilization process [ENVO:02500001],3,1,fertilization process,ENVO:02500001,desertification,BFO:0000015,process,"[BFO:0000015, ENVO:02500001]",2,,[ENVO:02500001],2
12105,11441,irrigation system [ENVO:00000029]|fertilizatio...,1,fertilization process [ENVO:02500001],3,1,fertilization process,ENVO:02500001,desertification,BFO:0000015,process,"[BFO:0000015, ENVO:02500001]",2,,[BFO:0000015],2


In [183]:
ncbi_disputes_exploded_frame['post_explode_curie_count'] = ncbi_disputes_exploded_frame['dragless_curie_list'].apply(len)

In [184]:
ncbi_disputes_exploded_frame['post_explode_curie_count'].value_counts()

post_explode_curie_count
1    7665
0    4442
Name: count, dtype: int64

In [185]:
# Set 'post_explode_curie' to the 0th item in 'dragless_curie_list'
ncbi_disputes_exploded_frame["post_explode_curie"] = ncbi_disputes_exploded_frame["dragless_curie_list"].apply(
    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
)

In [186]:

ncbi_biosample_scoped_counts = (
    ncbi_disputes_exploded_frame.groupby("post_explode_curie")["sample_count"].sum().reset_index()
)

ncbi_biosample_scoped_counts.columns = ['curie', 'ncbi_scoped_count']

In [187]:
ncbi_biosample_scoped_counts

Unnamed: 0,curie,ncbi_scoped_count
0,BFO:0000015,73
1,BFO:0000029,1883
2,BFO:0000030,181
3,CHEBI:15377,377
4,CHEBI:15379,1
...,...,...
1494,UBERON:0001456,3
1495,UBERON:0001474,2
1496,UBERON:0001913,3
1497,UBERON:0001988,134


In [188]:
# Perform the left merge
rows_frame = rows_frame.merge(
    ncbi_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [189]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,EnvLocalSoilEnum_10_7,ancestors_in_enum_count,descendants_in_enum_count,abp,env_sys,biome,...,soil,liquid water,water ice,human_construction,building,building_part,goldterms_mappings,nmdc_scoped_count,gold_hybrid_count,ncbi_scoped_count
0,ENVO:01000869,area of scrub,True,False,False,1,0,True,False,False,...,False,False,False,False,False,False,False,614.0,,
1,ENVO:00000502,container of an intermittent saline lake,True,False,False,5,0,True,False,False,...,False,False,False,False,False,False,False,,,1.0
2,ENVO:00003088,pantothenate enriched soil,True,False,False,1,0,True,False,False,...,True,False,False,False,False,False,False,,,
3,ENVO:00006073,,False,False,False,0,0,False,False,False,...,False,False,False,False,False,False,False,,,1.0
4,ENVO:01001459,high-elevation mountain,True,False,False,6,0,True,False,False,...,False,False,False,False,False,False,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2872,ENVO:01000500,brick exterior wall,True,False,False,2,0,True,False,False,...,False,False,False,True,False,True,False,,,
2873,ENVO:03000051,ice-encased melt pond,True,False,False,1,0,True,False,False,...,False,False,False,False,False,False,False,,,
2874,ENVO:00005781,heat stressed soil,True,False,False,1,0,True,False,False,...,True,False,False,False,False,False,False,,,1.0
2875,ENVO:01001505,alpine tundra biome,True,False,False,1,0,True,True,True,...,False,False,False,False,False,False,False,,,


In [190]:
rows_frame.to_csv(output_file_name, sep="\t", index=False)