In [1]:
from common import *

import gzip
import os
import shutil
import sqlite3
from urllib.parse import urlparse

import duckdb
import pandas as pd
import requests
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import yaml


In [2]:
print("verify output is being rendered")

verify output is being rendered


In [3]:
# Initialize cache dictionaries for predict_from_normalized_env_packages
# todo: how can the definitions of functions that use these globals be moved? Or just use caching around the function?
ancestor_cache = {}
descendant_cache = {}

In [4]:
# todo deal with circularity in env package prediction -> env triad reporting

# todo the on-demand NCBI curie extraction and annotation recapitulates this shared work
#   although that doesn't detect auto-incremented curies from  spreadsheet dragging

# todo if more caching is desired, it should probably take the form of saving dataframes for TSV

# todo include gold-envo mappings whether they are used by any gold samples or not

# eventually, dig up a complete JSON gold biosample dump

# overall run time (if NCBI biosamples and goldData are cached)

# Task Settings
_For making a Soil env_broad_scale voting sheet vs a Sediment env_local_scale sheet, etc._

todo: bundle these into dicts so they don't have to be modified independently and kept in sync with one another.

In [5]:
output_file_name = "sediment_env_local_scale_voting_sheet_2.tsv"

In [6]:
# semantic_anchor = 'ENVO:00000428' # biome for env_broad_scale
semantic_anchor = 'ENVO:01000813' # astronomical body part "abp" for env_local_scale
# semantic_anchor = 'ENVO:00010483' # environmental material for env_medium

## context selectors

In [7]:
# gold_context_selector = 'mixs:env_broad'
gold_context_selector = 'mixs:env_local'
# gold_context_selector = 'mixs:env_medium'

In [8]:
# ncbi_context_selector = 'env_broad_scale'
ncbi_context_selector = 'env_local_scale'
# ncbi_context_selector = 'env_medium'

In [9]:
# nmdc_context_selector= 'env_broad_scale_id'
nmdc_context_selector= 'env_local_scale_id'
# nmdc_context_selector= 'env_medium_id'

## package aka environment aka extension selectors

In [10]:
# todo new since soil: why are we only considering MIMS.me for discovering appropriate env triad values?
#   there's usually a roughly equal number of biosamples from in each extension for MIMS.me and 

# ncbi_package_selector = 'MIMS.me.plant-associated.6.0'
ncbi_package_selector = 'MIMS.me.sediment.6.0'
# ncbi_package_selector = 'MIMS.me.soil.6.0'
# ncbi_package_selector = 'MIMS.me.water.6.0'

In [11]:
# nmdc_package_selector = 'plant-associated'
nmdc_package_selector = 'sediment'
# nmdc_package_selector = 'soil'
# nmdc_package_selector = 'water'


In [12]:
GOLDTERMS_NA = '' # ???

GOLDTERMS_PLANT_ASSOCIATED = GOLDTERMS_NA
GOLDTERMS_SEDIMENT = 'GOLDTERMS:3985' #  doesn't have any subclasses
GOLDTERMS_SOIL = 'GOLDTERMS:4212'
GOLDTERMS_WATER = 'GOLDTERMS:3984'

# GOLDTERMS:4180, 'Environmental > Aquatic > Freshwater > Pond > Sediment' and ~64 more don't share a common root
# poetry run runoak -i sqlite:obo:goldterms info 't~sediment'


In [13]:
goldterms_root = GOLDTERMS_NA

## selecting name and version of one enum for comparison


In [14]:
# only the Soil enums have legacy definitions (v10.7 and earlier?)

# CONTEXT_ENUM = "EnvBroadScaleSoilEnum"
CONTEXT_ENUM = "EnvLocalScaleSoilEnum"
# CONTEXT_ENUM = "EnvMediumSoilEnum"

# CONTEXT_ENUM = ""

In [15]:
# todo: add columns for membership in multiple enums from multiple version of the schema?
#  like sediment local vs soil local and water local (once that's completed)
#  get them from schema files or something prior to that? seems like the voting sheets are too raw/preliminary for that
#   can use a more recent schema url for more recent enums!

# previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/refs/tags/v11.1.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

In [None]:
# todo: don't call the column "legacy_pv". use the name of the enum and the version of the schema?

comparison_enum_column_name = 'EnvLocalScaleSoilEnum_11_1'

# Additional Settings

In [16]:
# Approved prefixes (case-insensitive)
approved_prefixes = ['ENVO']

In [17]:
MIN_ANNOTATION_LEN = 3

In [18]:
NMDC_RUNTIME_BASE_URL = 'https://api.microbiomedata.org/nmdcschema/'
STUDY_SET_COLLECTION = 'study_set'
BIOSAMPLE_SET_COLLECTION = 'biosample_set'

In [19]:
envo_adapter_string = "sqlite:obo:envo"

In [20]:
# goldterms_adapter_string = "sqlite:obo:envo"

In [21]:
env_package_override_file = 'mam-env-package-overrides.tsv'
override_column = 'mam_inferred_env_package'

In [22]:
ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples_2024-09-23.duckdb.gz'

In [23]:
gold_data_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
gold_data_file_name = "goldData.xlsx" # goldData.xlsx: Microsoft Excel 2007+
BIOSAMPLES_SHEET = "Biosample"

In [24]:
goldterms_semsql_url = "https://s3.amazonaws.com/bbop-sqlite/goldterms.db.gz"


# Settings-based Queries

In [25]:
goldterms_subclass_query = f"""
select
	subject
from
	entailed_edge ee
where
	predicate = 'rdfs:subClassOf'
	and object = '{goldterms_root}'
"""

In [26]:
# todo could this have been done with a OAK query, eliminating the need to explicitly download the file?

goldterms_envo_query = f"""
SELECT
	*
FROM
	statements s
WHERE
	predicate = '{gold_context_selector}'"""

In [27]:
ncbi_query = f"""
SELECT content, COUNT(1) AS sample_count 
FROM attributes 
WHERE harmonized_name = '{ncbi_context_selector}' AND package_content = '{ncbi_package_selector}'
GROUP BY content
ORDER BY COUNT(1) DESC
"""

# Locally Defined Functions
_Currently using locally-defined cache dictionaries_

In [28]:
def predict_from_normalized_env_packages(df_raw, adapter):
    """
    Predict an env package for every row of a biosample-context frame.

    For each of the three env triad columns, the is_a ancestor and descendant
    labels of the value are looked up (via get_hierarchy_terms) and vectorized
    (via vectorize_terms). A random forest is trained on the rows that already
    have a non-blank 'normalized_env_package', a classification report for a
    30% hold-out split is printed, and predictions for *all* rows are returned.

    Args:
        df_raw: DataFrame with columns 'env_broad_scale_id',
            'env_local_scale_id', 'env_medium_id' and 'normalized_env_package'.
            It is copied, not modified.
        adapter: Ontology adapter passed through to get_hierarchy_terms.

    Returns:
        Array of predicted package labels, one per row of df_raw, in row order.

    Raises:
        ValueError: if no row has a non-blank 'normalized_env_package'.
    """
    triad_columns = ['env_broad_scale_id', 'env_local_scale_id', 'env_medium_id']

    df = df_raw.copy()

    print(df.shape)
    for column in triad_columns:
        df[f'{column}_ancestors'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['ancestors'])
        df[f'{column}_descendants'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['descendants'])

    # Vectorize each set of terms separately, in a fixed order:
    # broad ancestors, broad descendants, local ancestors, local descendants, medium ancestors, medium descendants
    # NOTE(review): assumes vectorize_terms returns one feature matrix with a row per df row -- confirm in common
    feature_blocks = [
        vectorize_terms(df, f'{column}_{direction}')
        for column in triad_columns
        for direction in ('ancestors', 'descendants')
    ]

    # Combine all feature matrices. CSR is requested explicitly because the rows are sliced below
    # and not every sparse format supports row indexing.
    X = hstack(feature_blocks, format='csr')

    # Rows with a usable (non-null, non-blank) target value
    has_package = (df['normalized_env_package'].notnull() & (df['normalized_env_package'] != "")).to_numpy(dtype=bool)

    # Select matrix rows by *position*. Using df.index here would silently pick the wrong rows
    # (or fail) whenever the frame does not carry a default 0..n-1 index.
    labelled_positions = has_package.nonzero()[0]
    if labelled_positions.size == 0:
        raise ValueError("No rows with a non-blank normalized_env_package are available for training")

    # Extract the target variable and the matching feature rows
    y = df.loc[has_package, 'normalized_env_package']
    X_filtered = X[labelled_positions]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.3, random_state=42)

    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # not determining confidence for each class nor saving any diagnostics any more

    # Predict for every row, labelled or not
    return clf.predict(X)

In [29]:
def get_hierarchy_terms(my_curie: str, adapter) -> dict:
    """
    Look up the labels of the is_a ancestors and descendants of a CURIE.

    Results are memoized per CURIE in the notebook-level ancestor_cache and
    descendant_cache dictionaries. If a lookup raises, the error is printed and
    an empty list is cached for that CURIE, so it is not retried.

    Args:
        my_curie (str): CURIE identifier for the ontology term.
        adapter: Ontology adapter providing ancestors(), descendants() and label().

    Returns:
        dict: {'ancestors': [...], 'descendants': [...]} holding the labels
        returned by adapter.label for each related term.
    """
    if my_curie not in ancestor_cache:
        ancestor_labels = []
        try:
            # Materialize the traversal first, then label every non-empty CURIE
            for ancestor in list(adapter.ancestors(my_curie, predicates=[IS_A])):
                if ancestor:
                    ancestor_labels.append(adapter.label(ancestor))
        except Exception as my_e:
            print(f"Error retrieving ancestors for {my_curie}: {my_e}")
            ancestor_labels = []
        ancestor_cache[my_curie] = ancestor_labels

    if my_curie not in descendant_cache:
        descendant_labels = []
        try:
            for descendant in list(adapter.descendants(my_curie, predicates=[IS_A])):
                if descendant:
                    descendant_labels.append(adapter.label(descendant))
        except Exception as my_e:
            print(f"Error retrieving descendants for {my_curie}: {my_e}")
            descendant_labels = []
        descendant_cache[my_curie] = descendant_labels

    return dict(
        ancestors=ancestor_cache[my_curie],
        descendants=descendant_cache[my_curie],
    )

# Procedural Code Starts Here

In [30]:
# Derive the local file names for the NCBI DuckDB from the last segment of its download URL
ncbi_compressed_filename = urlparse(ncbi_duckdb_url).path.rsplit('/', 1)[-1]
# Strip the final extension (the compression suffix) to get the unpacked file name
ncbi_filename = os.path.splitext(ncbi_compressed_filename)[0]
# Both files live in the current working directory, so the paths are just the bare file names
ncbi_compressed_file_path = ncbi_compressed_filename
ncbi_uncompressed_file_path = ncbi_filename

# target_dir = os.path.join('.') # just assume the files are downloaded into the same directory as the notebook

In [31]:
# Fetch and unpack the NCBI biosamples DuckDB unless an unpacked copy is already on disk.
# Both steps write to a temporary ".part" name and rename on success, so an interrupted run
# cannot leave a truncated file that a later run would mistake for a complete one.
if os.path.isfile(ncbi_uncompressed_file_path):
    print(f"{ncbi_uncompressed_file_path} is already present in the current working directory.")
else:
    if os.path.isfile(ncbi_compressed_file_path):
        print(f"{ncbi_compressed_file_path} is already present in the current working directory.")
    else:
        print(f"{ncbi_compressed_file_path} needs to be downloaded")
        ncbi_download_tmp_path = ncbi_compressed_file_path + ".part"
        # Stream the body in chunks instead of holding the whole archive in memory
        with requests.get(ncbi_duckdb_url, stream=True) as ncbi_response:
            # Fail loudly rather than saving an HTTP error page as the archive
            ncbi_response.raise_for_status()
            with open(ncbi_download_tmp_path, "wb") as f:
                for chunk in ncbi_response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(ncbi_download_tmp_path, ncbi_compressed_file_path)
        # ~ 2 minutes @ 250 Mbps

    # Unzip the compressed file and save the extracted file in target directory
    print(f"{ncbi_compressed_file_path} needs to be unpacked")
    ncbi_unpack_tmp_path = ncbi_uncompressed_file_path + ".part"
    with gzip.open(ncbi_compressed_file_path, "rb") as f_in:
        with open(ncbi_unpack_tmp_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.replace(ncbi_unpack_tmp_path, ncbi_uncompressed_file_path)

    # ~ 2 minutes

ncbi_biosamples_2024-09-23.duckdb is already present in the current working directory.


In [32]:
ncbi_conn = duckdb.connect(database=ncbi_uncompressed_file_path, read_only=True)

In [33]:
envo_adapter = get_adapter(envo_adapter_string)

In [34]:
anchor_descendants = get_curie_descendants_label_dict(semantic_anchor, [IS_A], envo_adapter)

In [35]:
anchor_descendants_lod = curie_descendants_label_dict_to_lod(anchor_descendants)

In [36]:
anchor_descendants_frame = curie_descendants_label_lod_to_df(anchor_descendants_lod)

In [37]:
anchor_descendants_frame

Unnamed: 0,curie,label
0,ENVO:00001999,marine water body
1,ENVO:01000188,tropical savanna biome
2,ENVO:00000487,paternoster lake
3,ENVO:01000860,temperate marine upwelling biome
4,ENVO:01000199,mediterranean forest biome
...,...,...
1731,ENVO:01000429,burrow
1732,ENVO:01000431,mixed forest
1733,ENVO:01000536,factory
1734,ENVO:00000873,freshwater biome


----

In [38]:
sv = get_schemaview_from_source(previous_submission_schema_url)

In [39]:
# todo break out slow steps into its own cell

# Collect the permissible-value keys of the enum named by CONTEXT_ENUM from the schema view `sv`.
# NOTE(review): presumably sv.get_enum returns None for an unknown or blank enum name, so the
# attribute access below raises AttributeError -- confirm. In that case the comparison list is
# left empty instead of stopping the notebook.
try:
    CONTEXT_ENUM_def = sv.get_enum(CONTEXT_ENUM)
    context_pvs_keys = list(CONTEXT_ENUM_def.permissible_values.keys())
except AttributeError as e:
    # Handle the AttributeError: report it and continue with no permissible values
    print(f"An AttributeError occurred: {e}")
    context_pvs_keys =[]
    

In [40]:
print(context_pvs_keys)

['active permafrost layer [ENVO:04000009]', 'alas [ENVO:00000438]', 'badland [ENVO:00000127]', 'beach [ENVO:00000091]', 'butte [ENVO:00000287]', 'caldera [ENVO:00000096]', 'campground [ENVO:01000935]', 'canyon [ENVO:00000169]', 'cave [ENVO:00000067]', 'channel [ENVO:03000117]', 'cirque [ENVO:00000155]', 'cliff [ENVO:00000087]', 'crater [ENVO:00000514]', 'crevasse [ENVO:00000320]', 'cryosphere [ENVO:03000143]', 'dam [ENVO:00000074]', 'desert [ENVO:01001357]', 'drainage basin [ENVO:00000291]', 'drumlin [ENVO:00000276]', 'dry lake [ENVO:00000277]', 'dune [ENVO:00000170]', 'endorheic basin [ENVO:00000551]', 'escarpment [ENVO:00000280]', 'esker [ENVO:00000282]', 'farm [ENVO:00000078]', 'fen [ENVO:00000232]', 'fjord [ENVO:00000039]', 'flood plain [ENVO:00000255]', 'frost heave [ENVO:01001568]', 'fumarole [ENVO:00000216]', 'garden [ENVO:00000011]', 'glacier [ENVO:00000133]', 'greenhouse [ENVO:03600087]', 'harbour [ENVO:00000463]', 'hill [ENVO:00000083]', 'hummock [ENVO:00000516]', 'isthmus [E

In [41]:
initially_parsed_context_pvs = parse_hierarchically_underscored_strings(context_pvs_keys)

In [42]:
deduped_context_pvs = dedupe_underscoreless_pvs(initially_parsed_context_pvs)

In [43]:
pv_validation_results = validate_curie_label_list_dict(deduped_context_pvs, envo_adapter, print_flag=True)

In [44]:
pv_validation_results

{'problems': [],
 'valids': [{'curie': 'ENVO:04000009', 'label': 'active permafrost layer'},
  {'curie': 'ENVO:00000438', 'label': 'alas'},
  {'curie': 'ENVO:00000127', 'label': 'badland'},
  {'curie': 'ENVO:00000091', 'label': 'beach'},
  {'curie': 'ENVO:00000287', 'label': 'butte'},
  {'curie': 'ENVO:00000096', 'label': 'caldera'},
  {'curie': 'ENVO:01000935', 'label': 'campground'},
  {'curie': 'ENVO:00000169', 'label': 'canyon'},
  {'curie': 'ENVO:00000067', 'label': 'cave'},
  {'curie': 'ENVO:03000117', 'label': 'channel'},
  {'curie': 'ENVO:00000155', 'label': 'cirque'},
  {'curie': 'ENVO:00000087', 'label': 'cliff'},
  {'curie': 'ENVO:00000514', 'label': 'crater'},
  {'curie': 'ENVO:00000320', 'label': 'crevasse'},
  {'curie': 'ENVO:03000143', 'label': 'cryosphere'},
  {'curie': 'ENVO:00000074', 'label': 'dam'},
  {'curie': 'ENVO:01001357', 'label': 'desert'},
  {'curie': 'ENVO:00000291', 'label': 'drainage basin'},
  {'curie': 'ENVO:00000276', 'label': 'drumlin'},
  {'curie': '

----

In [45]:
# todo rename to all_nmdc_samples etc
# Fetch every document in the NMDC biosample_set collection, page by page.
all_nmdc_biosamples = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL,
                                               BIOSAMPLE_SET_COLLECTION)  # no stop_after limit is passed here, so all pages are fetched

# ~ 1 minute

# todo cache this as a file

Fetched page 1 with 1000 documents. Total fetched: 1000
Fetched page 2 with 1000 documents. Total fetched: 2000
Fetched page 3 with 1000 documents. Total fetched: 3000
Fetched page 4 with 1000 documents. Total fetched: 4000
Fetched page 5 with 1000 documents. Total fetched: 5000
Fetched page 6 with 1000 documents. Total fetched: 6000
Fetched page 7 with 1000 documents. Total fetched: 7000
Fetched page 8 with 1000 documents. Total fetched: 8000
Fetched page 9 with 362 documents. Total fetched: 8362
All documents fetched.


In [46]:
# # this is slow because it saves slots that we don't need
# 
# with open("nmdc_biosamples_from_api.yaml", "w") as file:
#     yaml.dump(all_nmdc_biosamples, file, default_flow_style=False)

In [47]:
# # todo I don't think we're actually using this
# all_studies = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL, STUDY_SET_COLLECTION)  # Example with stop_after

In [48]:
env_pacakge_overrides = tsv_to_dict_of_dicts(env_package_override_file, 'id')

In [49]:
# env_pacakge_overrides
# todo or show as frame
# todo include some other columns for context?

In [50]:
biosample_contexts_lod = biosamples_lod_context_extractor(all_nmdc_biosamples, envo_adapter,
                                                          my_env_pacakge_overrides=env_pacakge_overrides)

# ~ 10 seconds, lots of logging

Overriding env_package for biosample nmdc:bsm-11-0k8nkx16 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-19v98823 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-1yvac190 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-28kgw077 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-2hswww54 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-34przm31 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-35m0rm03 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-3636w778 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nffqc45 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nhng665 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3r4g4610 from  to hydrocarbon resources-fluids_swabs
Overriding env_package

In [51]:
nmdc_biosample_contexts_frame = pd.DataFrame(biosample_contexts_lod)

In [52]:
# print a value count for the normalized_env_package column
print("Value counts for normalized_env_package column:")
print(nmdc_biosample_contexts_frame['normalized_env_package'].value_counts(dropna=False))

Value counts for normalized_env_package column:
normalized_env_package
                                                   5838
soil                                               1707
plant-associated                                    401
water                                               192
miscellaneous natural or artificial environment     140
host-associated                                      61
hydrocarbon resources-fluids_swabs                   23
Name: count, dtype: int64


In [53]:
package_predictions = predict_from_normalized_env_packages(nmdc_biosample_contexts_frame, envo_adapter)

(8362, 14)
                                                 precision    recall  f1-score   support

                                host-associated       1.00      1.00      1.00        16
             hydrocarbon resources-fluids_swabs       1.00      1.00      1.00         7
miscellaneous natural or artificial environment       1.00      1.00      1.00        50
                               plant-associated       1.00      1.00      1.00       122
                                           soil       1.00      1.00      1.00       506
                                          water       1.00      1.00      1.00        57

                                       accuracy                           1.00       758
                                      macro avg       1.00      1.00      1.00       758
                                   weighted avg       1.00      1.00      1.00       758



In [54]:
nmdc_biosample_contexts_frame['predicted_env_package'] = package_predictions

In [55]:
nmdc_biosample_contexts_frame.shape

(8362, 15)

In [56]:
# Keep only the biosamples whose *predicted* env package equals nmdc_package_selector.
# NOTE(review): in the recorded run the normalized_env_package value counts contain no 'sediment'
# rows, and the shape reported right after this filter is (0, 15). Presumably the classifier cannot
# predict a package it never saw as a training label, so NMDC contributes nothing for this
# selector -- confirm that this is intended.
nmdc_biosample_contexts_frame = nmdc_biosample_contexts_frame[
    nmdc_biosample_contexts_frame['predicted_env_package'] == nmdc_package_selector]

In [57]:
nmdc_biosample_contexts_frame.shape

(0, 15)

----

In [58]:
ncbi_frame = ncbi_conn.execute(ncbi_query).fetchdf()

In [59]:
ncbi_frame.insert(0, 'serial_number', range(1, len(ncbi_frame) + 1))

In [60]:
# includes values with counts of one... useful for discovering drag-down submissions?

In [61]:
ncbi_frame['content_list'] = ncbi_frame['content'].str.split('|')

In [62]:
ncbi_frame.to_csv("ncbi_frame.tsv", sep="\t", index=False)

In [63]:
# todo is there any reason to not do this ?
# Drop rows with missing or blank content. .copy() makes the result an independent frame, so the
# column assignments in later cells do not raise pandas' SettingWithCopyWarning.
ncbi_frame = ncbi_frame[ncbi_frame['content'].notna() & (ncbi_frame['content'] != '')].copy()

In [64]:
ncbi_frame['content_count'] = ncbi_frame['content_list'].apply(len)

In [65]:
ncbi_frame.shape

(1128, 5)

In [66]:
ncbi_frame = ncbi_frame.explode('content_list').reset_index(drop=True)

In [67]:
ncbi_frame.shape

(1170, 5)

In [68]:
# how many content_list strings contain envo multiple times now?

In [69]:
ncbi_frame['envo_count'] = ncbi_frame['content_list'].str.lower().str.count("envo")

In [70]:
ncbi_frame['envo_count'].value_counts()

envo_count
0    960
1    208
2      1
3      1
Name: count, dtype: int64

doesn't account for multiple label strings delimited with something other than '|'

In [71]:
ncbi_frame[['extracted_label', 'extracted_curie']] = ncbi_frame['content_list'].apply(parse_curie_label)

In [72]:
parse_failures = ncbi_frame[
    (ncbi_frame['envo_count'] > 0) & (ncbi_frame['extracted_curie'].isna() | (ncbi_frame['extracted_curie'] == ''))]


In [73]:
parse_failures

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie


In [74]:
ncbi_frame['real_label'] = ncbi_frame['extracted_curie'].apply(envo_adapter.label)

In [75]:
# Apply the annotation function to each row in the 'label' column
ncbi_frame['longest_annotation_curie'] = ncbi_frame['extracted_label'].apply(
    lambda x: get_longest_annotation_curie(x, envo_adapter, MIN_ANNOTATION_LEN))

# only runs for ~ 1 minute, but generates a lot of "ERRORS" while loading the ontologies that are used for annotating


ERROR:root:Skipping statements(subject=ENVO:00000112,predicate=oio:hasDbXref,object=<http://www.eionet.europa.eu/gemet/concept/8704>,value=None,datatype=None,language=None,); ValueError: <http://www.eionet.europa.eu/gemet/concept/8704> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:00001996,predicate=oio:hasDbXref,object=<https://en.wikipedia.org/wiki/Acid_mine_drainage>,value=None,datatype=None,language=None,); ValueError: <https://en.wikipedia.org/wiki/Acid_mine_drainage> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000225,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests>,value=None,datatype=None,language=None,); ValueError: <https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000227,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical

In [76]:
ncbi_frame['longest_annotation_label'] = ncbi_frame['longest_annotation_curie'].apply(envo_adapter.label)

In [77]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label
0,1,stream,2140,stream,1,0,stream,,,ENVO:00000023,stream
1,2,intertidal zone,1054,intertidal zone,1,0,intertidal zone,,,ENVO:00000316,intertidal zone
2,3,coast,891,coast,1,0,coast,,,ENVO:01000687,coast
3,4,not applicable,875,not applicable,1,0,not applicable,,,,
4,5,not collected,830,not collected,1,0,not collected,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1165,1127,"Sublittoral_mangrove_tidal_zone, replicate = 2",1,"Sublittoral_mangrove_tidal_zone, replicate = 2",1,0,"Sublittoral_mangrove_tidal_zone, replicate = 2",,,,
1166,1128,acidic hot spring [ENVO:00002120]| spring wate...,1,acidic hot spring [ENVO:00002120],3,1,acidic hot spring,ENVO:00002120,acidic hot spring,ENVO:00002120,acidic hot spring
1167,1128,acidic hot spring [ENVO:00002120]| spring wate...,1,spring water [ENVO:03600065],3,1,spring water,ENVO:03600065,spring water,ENVO:03600065,spring water
1168,1128,acidic hot spring [ENVO:00002120]| spring wate...,1,sediment [ENVO:00002007],3,1,sediment,ENVO:00002007,sediment,ENVO:00002007,sediment


----

In [78]:
# Download the GOLD spreadsheet unless a copy is already in the working directory
if os.path.isfile(gold_data_file_name):
    print(f"{gold_data_file_name} is already present in the current working directory.")
else:
    print(f"{gold_data_file_name} needs to be downloaded")
    gold_response = requests.get(gold_data_url)
    # Fail loudly on an HTTP error instead of saving the error page as the .xlsx,
    # which would then be treated as "already present" on every later run
    gold_response.raise_for_status()
    with open(gold_data_file_name, "wb") as f:
        f.write(gold_response.content)
        # ~ 10 seconds  @ 250 Mbps

goldData.xlsx is already present in the current working directory.


Expect to see

> /home/mark/.cache/pypoetry/virtualenvs/nmdc-submission-schema-DC6HKp4p-py3.10/lib/python3.10/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

In [79]:
gold_biosamples_frame = pd.read_excel(gold_data_file_name, sheet_name=BIOSAMPLES_SHEET)
# 1.5 minutes
# todo cache the output of this, not the input!


  warn("Workbook contains no default style, apply openpyxl's default")


In [80]:
gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].fillna(
    0).astype(int)


In [81]:
gold_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,Environmental,Aquatic,Thermal springs,Hot (42-90C),Unclassified
1,Gb0035601,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
2,Gb0035602,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
3,Gb0035635,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
4,Gb0035638,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210932,Gb0405291,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
210933,Gb0405292,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
210934,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified
210935,Gb0405294,Freshwater microbial communities from Prairie ...,449393.0,freshwater metagenome,lake water,2023-08-08,"USA: Prairie Lake NEON Field Site, Vashti, Nor...",47.159710,-99.118723,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified


In [82]:
# Determine the filenames and target directory
goldterms_compressed_filename = urlparse(goldterms_semsql_url).path.split('/')[-1]
goldterms_filename = os.path.splitext(goldterms_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

# Print to confirm the filenames
print(goldterms_filename)

goldterms.db


In [83]:
# Fetch the contents from the URL and save compressed file in target directory
goldterms_response = requests.get(goldterms_semsql_url)
# Stop here on an HTTP error; otherwise the error body would be written out as the .gz archive
goldterms_response.raise_for_status()
goldterms_compressed_file_path = os.path.join(target_dir, goldterms_compressed_filename)
with open(goldterms_compressed_file_path, "wb") as f:
    f.write(goldterms_response.content)
    
# ~ 1 second

In [84]:
# Decompress the downloaded gzip archive into the target directory
goldterms_uncompressed_file_path = os.path.join(target_dir, goldterms_filename)
with gzip.open(goldterms_compressed_file_path, "rb") as f_in, \
        open(goldterms_uncompressed_file_path, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

# ~ 1 second

In [85]:
# that's all fast. don't bother caching

In [86]:
goldterms_conn = sqlite3.connect(goldterms_uncompressed_file_path)

In [87]:
goldterms_subjects = pd.read_sql_query(goldterms_subclass_query, goldterms_conn)

In [88]:
goldterms_subjects['path_id'] = goldterms_subjects['subject'].str.extract(r'GOLDTERMS:(\d+)')

In [89]:
goldterms_subjects

Unnamed: 0,subject,path_id


In [90]:
gold_path_ids = goldterms_subjects['path_id'].dropna().unique().tolist()
gold_path_ids = [int(my_id) for my_id in gold_path_ids]


In [91]:
gold_env_filtered_biosamples_frame = gold_biosamples_frame[
    gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].isin(gold_path_ids)]


In [92]:
gold_env_filtered_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM


In [93]:
goldterms_context_frame = pd.read_sql_query(goldterms_envo_query, goldterms_conn)

In [94]:
goldterms_context_frame['object_label'] = goldterms_context_frame['object'].apply(envo_adapter.label)

In [95]:
goldterms_context_frame['path_id'] = goldterms_context_frame['subject'].str.extract(r'GOLDTERMS:(\d+)')

In [96]:
goldterms_context_frame

Unnamed: 0,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,mixs:env_local,OBI:0001046,,,,,,
1,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,mixs:env_local,OBI:0001046,,,,,,
2,GOLDTERMS:Engineered-Bioreactor-Membrane-biore...,GOLDTERMS:Engineered-Bioreactor-Membrane-biore...,mixs:env_local,OBI:0001046,,,,,,
3,GOLDTERMS:Engineered-Bioreactor-Membrane-biore...,GOLDTERMS:Engineered-Bioreactor-Membrane-biore...,mixs:env_local,OBI:0001046,,,,,,
4,GOLDTERMS:Engineered-Bioreactor-Passive-biorea...,GOLDTERMS:Engineered-Bioreactor-Passive-biorea...,mixs:env_local,OBI:0001046,,,,,,
...,...,...,...,...,...,...,...,...,...,...
296,GOLDTERMS:5838,GOLDTERMS:5838,mixs:env_local,OBI:0001046,,,,,,5838
297,GOLDTERMS:5841,GOLDTERMS:5841,mixs:env_local,ENVO:00000076,,,,,mine,5841
298,GOLDTERMS:5843,GOLDTERMS:5843,mixs:env_local,OBI:0001046,,,,,,5843
299,GOLDTERMS:5846,GOLDTERMS:5846,mixs:env_local,OBI:0001046,,,,,,5846


In [97]:
# Normalize the join key on both sides to int before merging.
# New frames are built with .assign() rather than assigning columns in place:
# gold_env_filtered_biosamples_frame is a boolean-mask selection of gold_biosamples_frame, and
# in-place column assignment on such a selection raises pandas' SettingWithCopyWarning.

# Fill NaN values in 'BIOSAMPLE ECOSYSTEM PATH ID' with 0 and convert to int
gold_env_filtered_biosamples_frame = gold_env_filtered_biosamples_frame.assign(**{
    'BIOSAMPLE ECOSYSTEM PATH ID':
        gold_env_filtered_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].fillna(0).astype(int)
})

# Drop rows with NaN in 'path_id' in goldterms_context_frame (their subject has no numeric GOLDTERMS id)
goldterms_context_frame = goldterms_context_frame.dropna(subset=['path_id'])

# Convert 'path_id' to int
goldterms_context_frame = goldterms_context_frame.assign(
    path_id=goldterms_context_frame['path_id'].astype(int)
)

# Perform the left merge: every filtered biosample is kept, with goldterms columns where the path id matches
gold_env_filtered_biosamples_with_inferred = gold_env_filtered_biosamples_frame.merge(
    goldterms_context_frame,
    left_on='BIOSAMPLE ECOSYSTEM PATH ID',
    right_on='path_id',
    how='left'
)


In [98]:
gold_env_filtered_biosamples_with_inferred

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id


----

In [99]:
include_in_rows = set()

In [100]:
include_in_rows.update(anchor_descendants_frame['curie'])

In [101]:
include_in_rows.update([i['curie'] for i in pv_validation_results['valids']])

In [102]:
include_in_rows.update(nmdc_biosample_contexts_frame[nmdc_context_selector])

In [103]:
include_in_rows.update(ncbi_frame['extracted_curie'])

In [104]:
include_in_rows.update(ncbi_frame['longest_annotation_curie'])

In [105]:
include_in_rows.update(gold_env_filtered_biosamples_with_inferred['object'])

In [106]:
rows_lod = []

In [107]:
# TODO: move these definitions higher in the notebook, because the same
#   expressions (e.g. anchor_descendants_frame['curie'] and the
#   pv_validation_results['valids'] comprehension) are already evaluated in
#   earlier cells.

# Reference CURIE collections. Most of them are consulted by the row-building
# loop in the next cell to set the boolean flag columns of the voting sheet.
anchor_curies = list(anchor_descendants_frame['curie'])
legacy_pv_curies = [i['curie'] for i in pv_validation_results['valids']]
# is_a descendants of ENVO:00000428 (biome)
biome_curies = list(envo_adapter.descendants('ENVO:00000428', predicates=[IS_A])) # biome
# NOTE(review): presumably "terrestrial biome", going by the variable name -- confirm the label
terrestrial_biome_curies = list(envo_adapter.descendants('ENVO:00000446', predicates=[IS_A]))
# NOTE(review): presumably "aquatic biome", going by the variable name -- confirm the label
aquatic_biome_curies = list(envo_adapter.descendants('ENVO:00002030', predicates=[IS_A]))
# is_a descendants of ENVO:01000813 (astronomical body part, "abp")
abp_curies = list(envo_adapter.descendants('ENVO:01000813', predicates=[IS_A]))
# NOTE(review): presumably "environmental system", going by the variable name -- confirm the label
env_sys_curies = list(envo_adapter.descendants('ENVO:01000254', predicates=[IS_A]))
# is_a descendants of ENVO:00010483 (environmental material)
env_mat_curies = list(envo_adapter.descendants('ENVO:00010483', predicates=[IS_A]))
# Everything the adapter reports via obsoletes()
obsoletes_curies = list(envo_adapter.obsoletes())

In [108]:
# Build one voting-sheet row (a dict) per candidate CURIE and append it to
# rows_lod. Every row starts with all flag columns False; a flag is switched on
# when the CURIE belongs to the matching reference collection.

# Pair each flag column with its reference collection. The collections are
# converted to sets once, up front, so each membership test inside the loop is
# a hash lookup instead of a scan over a list. The order matches the order in
# which the flags have always been applied.
flag_memberships = [
    ('biome', set(biome_curies)),
    ('terrestrial_biome', set(terrestrial_biome_curies)),
    ('aquatic_biome', set(aquatic_biome_curies)),
    ('abp', set(abp_curies)),
    ('env_sys', set(env_sys_curies)),
    ('env_mat', set(env_mat_curies)),
    (comparison_enum_column_name, set(legacy_pv_curies)),
    ('obsolete', set(obsoletes_curies)),
]

for curie in include_in_rows:
    # Skip missing values. The pandas columns that feed include_in_rows can
    # contribute None, pd.NA or NaN; `curie != curie` is true only for NaN.
    # None of these is a CURIE, so none of them should become a row.
    if curie is None or curie is pd.NA or curie != curie:
        continue

    row = {
        'curie': curie,
        'label': envo_adapter.label(curie),
        'envo_native': False,
        'obsolete': False,
        comparison_enum_column_name: False,
        'abp': False,
        'env_sys': False,
        'biome': False,
        'terrestrial_biome': False,
        'aquatic_biome': False,
        'env_mat': False,
    }

    for flag_column, member_curies in flag_memberships:
        if curie in member_curies:
            row[flag_column] = True

    # A CURIE counts as ENVO-native when it has the ENVO prefix and the adapter
    # returned a label for it. Values that do not split into exactly
    # prefix:local_id are reported and keep envo_native False.
    try:
        prefix, _ = curie.split(':')
    except (ValueError, AttributeError) as e:
        # Print the exception message
        print(f"An error occurred: {e} trying to split {curie}")
    else:
        if prefix == 'ENVO' and row['label'] is not None:
            row['envo_native'] = True

    rows_lod.append(row)


In [109]:
rows_frame = pd.DataFrame(rows_lod)

In [110]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,legacy_pv,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat
0,ENVO:01000492,dung building floor,True,False,False,True,False,False,False,False,False
1,ENVO:00001997,acid mine drainage,True,False,False,True,False,False,False,False,False
2,ENVO:01000348,maize field,True,False,False,True,False,False,False,False,False
3,ENVO:01000201,subtropical broadleaf forest biome,True,False,False,True,True,True,True,False,False
4,ENVO:01001021,humic lake,True,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
1894,ENVO:00002043,wastewater treatment plant,True,False,False,True,False,False,False,False,False
1895,ENVO:00003964,tobacco warehouse,True,False,False,True,False,False,False,False,False
1896,ENVO:01001518,drifting sea ice mass,True,False,False,True,False,False,False,False,False
1897,ENVO:03501188,ophthalmology clinic,True,False,False,True,False,False,False,False,False


In [111]:
# Tally how often each value occurs in the selected NMDC context column and
# present the tally as a two-column frame: 'curie', 'nmdc_scoped_count'.
nmdc_biosample_scoped_counts = (
    nmdc_biosample_contexts_frame[nmdc_context_selector]
    .value_counts()
    .rename_axis('curie')
    .reset_index(name='nmdc_scoped_count')
)


In [112]:
nmdc_biosample_scoped_counts

Unnamed: 0,curie,nmdc_scoped_count


In [113]:
# Attach the NMDC counts to the voting-sheet rows; the left join keeps every
# row and leaves the count missing where a CURIE has no NMDC usage.
#
# rows_frame is reassigned from itself, so a 'nmdc_scoped_count' column left by
# an earlier run of this cell is dropped first. Without that, re-running the
# cell would add suffixed duplicates ('nmdc_scoped_count_x' / '_y') instead of
# refreshing the column. On a first run the drop is a no-op.
rows_frame = rows_frame.drop(columns=['nmdc_scoped_count'], errors='ignore').merge(
    nmdc_biosample_scoped_counts,
    on='curie',
    how='left'
)

In [114]:
gold_env_filtered_biosamples_with_inferred

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id


In [115]:
# Tally how often each value occurs in the 'object' column of the merged GOLD
# frame and present the tally as a two-column frame: 'curie', 'gold_scoped_count'.
gold_biosample_scoped_counts = (
    gold_env_filtered_biosamples_with_inferred['object']
    .value_counts()
    .rename_axis('curie')
    .reset_index(name='gold_scoped_count')
)

In [116]:
gold_biosample_scoped_counts

Unnamed: 0,curie,gold_scoped_count


In [117]:
# Attach the GOLD counts to the voting-sheet rows; the left join keeps every
# row and leaves the count missing where a CURIE has no GOLD usage.
#
# rows_frame is reassigned from itself, so a 'gold_scoped_count' column left by
# an earlier run of this cell is dropped first. Without that, re-running the
# cell would add suffixed duplicates ('gold_scoped_count_x' / '_y') instead of
# refreshing the column. On a first run the drop is a no-op.
rows_frame = rows_frame.drop(columns=['gold_scoped_count'], errors='ignore').merge(
    gold_biosample_scoped_counts,
    on='curie',
    how='left'
)

In [118]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,legacy_pv,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,nmdc_scoped_count,gold_scoped_count
0,ENVO:01000492,dung building floor,True,False,False,True,False,False,False,False,False,,
1,ENVO:00001997,acid mine drainage,True,False,False,True,False,False,False,False,False,,
2,ENVO:01000348,maize field,True,False,False,True,False,False,False,False,False,,
3,ENVO:01000201,subtropical broadleaf forest biome,True,False,False,True,True,True,True,False,False,,
4,ENVO:01001021,humic lake,True,False,False,True,False,False,False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1894,ENVO:00002043,wastewater treatment plant,True,False,False,True,False,False,False,False,False,,
1895,ENVO:00003964,tobacco warehouse,True,False,False,True,False,False,False,False,False,,
1896,ENVO:01001518,drifting sea ice mass,True,False,False,True,False,False,False,False,False,,
1897,ENVO:03501188,ophthalmology clinic,True,False,False,True,False,False,False,False,False,,


In [119]:
# 990 rows in https://docs.google.com/spreadsheets/d/12WH3eduBq2qSTy9zVF3n7fyajn6ssLZL/edit?gid=546570706#gid=546570706

In [120]:
# gold and ncbi counts are slightly trickier
# for gold may want to include presence or mapping in goldterms in addition to biosamples counts
# ncbi: we have extracted curies and annotated curies

In [121]:
# todo: move this up to immediately after the creation of ncbi_frame?

# todo: don't accept an extracted curie if it has no real label?
# any kind of string similarity checking for the label of the annotated curie vs. the extracted label?
# look for long runs of curies?
# can we measure the beneficial impact of any of this? current crux: how to distribute counts

# For every row, collect the distinct non-None CURIEs found by the two methods
# (extraction from the text and annotation), then record how many there are.
ncbi_frame['curie_list'] = pd.Series(
    [
        list({extracted, annotated} - {None})
        for extracted, annotated in zip(
            ncbi_frame['extracted_curie'],
            ncbi_frame['longest_annotation_curie'],
        )
    ],
    index=ncbi_frame.index,
    dtype=object,
)

ncbi_frame['unique_curie_count'] = ncbi_frame['curie_list'].map(len)

In [122]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count
0,1,stream,2140,stream,1,0,stream,,,ENVO:00000023,stream,[ENVO:00000023],1
1,2,intertidal zone,1054,intertidal zone,1,0,intertidal zone,,,ENVO:00000316,intertidal zone,[ENVO:00000316],1
2,3,coast,891,coast,1,0,coast,,,ENVO:01000687,coast,[ENVO:01000687],1
3,4,not applicable,875,not applicable,1,0,not applicable,,,,,[],0
4,5,not collected,830,not collected,1,0,not collected,,,,,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1165,1127,"Sublittoral_mangrove_tidal_zone, replicate = 2",1,"Sublittoral_mangrove_tidal_zone, replicate = 2",1,0,"Sublittoral_mangrove_tidal_zone, replicate = 2",,,,,[],0
1166,1128,acidic hot spring [ENVO:00002120]| spring wate...,1,acidic hot spring [ENVO:00002120],3,1,acidic hot spring,ENVO:00002120,acidic hot spring,ENVO:00002120,acidic hot spring,[ENVO:00002120],1
1167,1128,acidic hot spring [ENVO:00002120]| spring wate...,1,spring water [ENVO:03600065],3,1,spring water,ENVO:03600065,spring water,ENVO:03600065,spring water,[ENVO:03600065],1
1168,1128,acidic hot spring [ENVO:00002120]| spring wate...,1,sediment [ENVO:00002007],3,1,sediment,ENVO:00002007,sediment,ENVO:00002007,sediment,[ENVO:00002007],1


In [123]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    937
0    212
2     21
Name: count, dtype: int64

In [124]:
double_curie_frame = ncbi_frame[ncbi_frame['unique_curie_count'] > 1]

In [125]:
double_curie_frame = double_curie_frame[['extracted_curie', 'longest_annotation_curie']]

In [126]:
double_curie_frame = double_curie_frame.drop_duplicates()

In [127]:
# Split each extracted CURIE into prefix and local ID. n=1 splits at the first
# colon only, so the expanded result never has more than the two columns
# assigned here; an unbounded split would yield a third column (and make this
# assignment fail) as soon as any value contained a second colon.
double_curie_frame[['extracted_prefix', 'extracted_local_id']] = double_curie_frame['extracted_curie'].str.split(
    ':', n=1, expand=True)

In [128]:
double_curie_frame['extracted_local_id_int'] = pd.to_numeric(double_curie_frame['extracted_local_id'], errors='coerce').astype('Int64')

In [129]:
# Ensure extracted_local_id_int is unique and sorted
unique_sorted_series = double_curie_frame['extracted_local_id_int'].dropna().drop_duplicates().sort_values()


In [130]:
# Find stretches
stretches_dict = find_consecutive_stretches_dict(unique_sorted_series)

# pprint.pprint(stretches_dict)

In [131]:
# Convert the stretches dictionary into a DataFrame
stretches_df = stretches_dict_to_long_dataframe(stretches_dict)

In [132]:
stretches_df

Unnamed: 0,stretch_id,value
0,1,1001208
1,1,1001209
2,1,1001210
3,1,1001211
4,2,1001225
5,2,1001226
6,2,1001227
7,3,1001229
8,3,1001230
9,3,1001231


In [133]:
# Tag each extracted local ID with the consecutive stretch it belongs to, if
# any; the left join keeps rows that fall outside every stretch.
#
# double_curie_frame is reassigned from itself, so the 'stretch_id' and 'value'
# columns left by an earlier run of this cell are dropped first. Without that,
# re-running the cell would produce suffixed duplicates instead of refreshing
# them. On a first run the drop is a no-op.
double_curie_frame = double_curie_frame.drop(columns=['stretch_id', 'value'], errors='ignore').merge(
    stretches_df,
    left_on='extracted_local_id_int',
    right_on='value',
    how='left'
)

In [134]:
stretch_summary_df = summarize_stretch_groups(double_curie_frame)


In [135]:
stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:01001201,1.0
1,2.0,ENVO:01001201,1.0
2,3.0,ENVO:01001201,1.0


In [136]:
# Attach the per-stretch summary to every row that belongs to a stretch; the
# left join keeps rows without one.
#
# double_curie_frame is reassigned from itself, so summary columns left by an
# earlier run of this cell are dropped first. Without that, re-running the cell
# would produce suffixed duplicates instead of refreshing them. On a first run
# the drop is a no-op.
double_curie_frame = double_curie_frame.drop(
    columns=['most_common_longest_annotation_curie', 'fraction'],
    errors='ignore',
).merge(
    stretch_summary_df,
    on='stretch_id',
    how='left'
)

In [137]:
# Keep the (extracted, annotated) CURIE pairs whose extracted local ID lies in
# a consecutive stretch, and mark each of them as drag evidence.
drag_evidence_frame = (
    double_curie_frame
    .loc[
        double_curie_frame['stretch_id'].ge(1),
        ['extracted_curie', 'longest_annotation_curie'],
    ]
    .assign(drag_evidence=True)
)

In [138]:
drag_evidence_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,drag_evidence
3,ENVO:01001227,ENVO:01001201,True
4,ENVO:01001230,ENVO:01001201,True
6,ENVO:01001225,ENVO:01001201,True
7,ENVO:01001208,ENVO:01001201,True
9,ENVO:01001209,ENVO:01001201,True
10,ENVO:01001210,ENVO:01001201,True
12,ENVO:01001211,ENVO:01001201,True
14,ENVO:01001231,ENVO:01001201,True
16,ENVO:01001226,ENVO:01001201,True
19,ENVO:01001229,ENVO:01001201,True


In [139]:
# Flag the NCBI rows whose (extracted, annotated) CURIE pair shows drag
# evidence; rows without a match are kept and get a missing drag_evidence.
#
# ncbi_frame is reassigned from itself, so a 'drag_evidence' column left by an
# earlier run of this cell is dropped first. Without that, re-running the cell
# would create 'drag_evidence_x' / '_y', and the later cell that reads
# 'drag_evidence' would fail. On a first run the drop is a no-op.
ncbi_frame = ncbi_frame.drop(columns=['drag_evidence'], errors='ignore').merge(
    drag_evidence_frame,
    on=['extracted_curie', 'longest_annotation_curie'],
    how='left'
)

In [140]:
# dragless_curie_list defaults to the row's curie_list. Where drag_evidence is
# True, the list is replaced with just the annotated CURIE, or with an empty
# list when there is no annotated CURIE.
ncbi_frame["dragless_curie_list"] = pd.Series(
    [
        curies if drag is not True else ([annotated] if annotated is not None else [])
        for curies, drag, annotated in zip(
            ncbi_frame["curie_list"],
            ncbi_frame["drag_evidence"],
            ncbi_frame["longest_annotation_curie"],
        )
    ],
    index=ncbi_frame.index,
    dtype=object,
)

# Number of CURIEs that remain per row after the replacement above.
ncbi_frame['dragless_curie_count'] = ncbi_frame['dragless_curie_list'].map(len)

In [141]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    937
0    212
2     21
Name: count, dtype: int64

In [142]:
ncbi_frame['dragless_curie_count'].value_counts()

dragless_curie_count
1    947
0    212
2     11
Name: count, dtype: int64

In [143]:
ncbi_frame.shape

(1170, 16)

In [144]:
ncbi_frame_undisputed = ncbi_frame[ncbi_frame['dragless_curie_count'] <= 1]

In [145]:
ncbi_frame_undisputed.shape

(1159, 16)

In [146]:
ncbi_frame_disputed = ncbi_frame[ncbi_frame['dragless_curie_count'] > 1]

In [147]:
ncbi_frame_disputed.shape

(11, 16)

In [148]:
ncbi_frame_disputed = ncbi_frame_disputed.explode("dragless_curie_list", ignore_index=True)


In [149]:
ncbi_frame_disputed.shape

(22, 16)

In [150]:
ncbi_frame_disputed["dragless_curie_list"] = ncbi_frame_disputed["dragless_curie_list"].apply(lambda x: [x])

In [151]:
# Combine the rows of ncbi_frame_undisputed and ncbi_frame_disputed into a new DataFrame
ncbi_disputes_exploded_frame = pd.concat([ncbi_frame_undisputed, ncbi_frame_disputed], ignore_index=True)


In [152]:
ncbi_disputes_exploded_frame.shape

(1181, 16)

In [153]:
ncbi_disputes_exploded_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count,drag_evidence,dragless_curie_list,dragless_curie_count
0,1,stream,2140,stream,1,0,stream,,,ENVO:00000023,stream,[ENVO:00000023],1,,[ENVO:00000023],1
1,2,intertidal zone,1054,intertidal zone,1,0,intertidal zone,,,ENVO:00000316,intertidal zone,[ENVO:00000316],1,,[ENVO:00000316],1
2,3,coast,891,coast,1,0,coast,,,ENVO:01000687,coast,[ENVO:01000687],1,,[ENVO:01000687],1
3,4,not applicable,875,not applicable,1,0,not applicable,,,,,[],0,,[],0
4,5,not collected,830,not collected,1,0,not collected,,,,,[],0,,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1176,1091,marine environmental zone [ENVO:01001213],1,marine environmental zone [ENVO:01001213],1,1,marine environmental zone,ENVO:01001213,radio wave stellar radiation,ENVO:01001201,marine environmental zone,"[ENVO:01001213, ENVO:01001201]",2,,[ENVO:01001201],2
1177,1123,marine environmental zone [ENVO:01001217],1,marine environmental zone [ENVO:01001217],1,1,marine environmental zone,ENVO:01001217,X-ray stellar radiation,ENVO:01001201,marine environmental zone,"[ENVO:01001217, ENVO:01001201]",2,,[ENVO:01001217],2
1178,1123,marine environmental zone [ENVO:01001217],1,marine environmental zone [ENVO:01001217],1,1,marine environmental zone,ENVO:01001217,X-ray stellar radiation,ENVO:01001201,marine environmental zone,"[ENVO:01001217, ENVO:01001201]",2,,[ENVO:01001201],2
1179,1124,marine environmental zone [ENVO:01001223],1,marine environmental zone [ENVO:01001223],1,1,marine environmental zone,ENVO:01001223,lumber production process,ENVO:01001201,marine environmental zone,"[ENVO:01001223, ENVO:01001201]",2,,[ENVO:01001223],2


In [154]:
ncbi_disputes_exploded_frame['post_explode_curie_count'] = ncbi_disputes_exploded_frame['dragless_curie_list'].apply(len)

In [155]:
ncbi_disputes_exploded_frame['post_explode_curie_count'].value_counts()

post_explode_curie_count
1    969
0    212
Name: count, dtype: int64

In [156]:
# 'post_explode_curie' is the first element of 'dragless_curie_list', or None
# when that value is not a list or is an empty list.
ncbi_disputes_exploded_frame["post_explode_curie"] = ncbi_disputes_exploded_frame["dragless_curie_list"].map(
    lambda curies: None if not isinstance(curies, list) or len(curies) == 0 else curies[0]
)

In [157]:

# Sum sample_count per post-explode CURIE and present the totals as a
# two-column frame: 'curie', 'ncbi_scoped_count'.
ncbi_biosample_scoped_counts = (
    ncbi_disputes_exploded_frame
    .groupby("post_explode_curie")["sample_count"]
    .sum()
    .rename_axis('curie')
    .reset_index(name='ncbi_scoped_count')
)

In [158]:
ncbi_biosample_scoped_counts

Unnamed: 0,curie,ncbi_scoped_count
0,BFO:0000029,40
1,CHEBI:15377,29
2,CHEBI:15379,1
3,CHEBI:16183,1
4,CHEBI:24866,30
...,...,...
315,RO:0001025,2
316,RO:0002577,6
317,UBERON:0000060,1
318,UBERON:0001988,3


In [159]:
# Attach the NCBI counts to the voting-sheet rows; the left join keeps every
# row and leaves the count missing where a CURIE has no NCBI usage.
#
# rows_frame is reassigned from itself, so a 'ncbi_scoped_count' column left by
# an earlier run of this cell is dropped first. Without that, re-running the
# cell would add suffixed duplicates ('ncbi_scoped_count_x' / '_y') instead of
# refreshing the column. On a first run the drop is a no-op.
rows_frame = rows_frame.drop(columns=['ncbi_scoped_count'], errors='ignore').merge(
    ncbi_biosample_scoped_counts,
    on='curie',
    how='left'
)

In [160]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,legacy_pv,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,nmdc_scoped_count,gold_scoped_count,ncbi_scoped_count
0,ENVO:01000492,dung building floor,True,False,False,True,False,False,False,False,False,,,
1,ENVO:00001997,acid mine drainage,True,False,False,True,False,False,False,False,False,,,
2,ENVO:01000348,maize field,True,False,False,True,False,False,False,False,False,,,
3,ENVO:01000201,subtropical broadleaf forest biome,True,False,False,True,True,True,True,False,False,,,
4,ENVO:01001021,humic lake,True,False,False,True,False,False,False,False,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1894,ENVO:00002043,wastewater treatment plant,True,False,False,True,False,False,False,False,False,,,
1895,ENVO:00003964,tobacco warehouse,True,False,False,True,False,False,False,False,False,,,
1896,ENVO:01001518,drifting sea ice mass,True,False,False,True,False,False,False,False,False,,,
1897,ENVO:03501188,ophthalmology clinic,True,False,False,True,False,False,False,False,False,,,


In [161]:
# Write the assembled rows to output_file_name as tab-separated text, without
# the DataFrame index column.
rows_frame.to_csv(output_file_name, sep="\t", index=False)