In [1]:
from common import *

import gzip
import os
import shutil
import sqlite3
from urllib.parse import urlparse

import duckdb
import pandas as pd
import requests
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
print("hello")

hello


In [3]:
# Initialize cache dictionaries for predict_from_normalized_env_packages
# todo how to move the definitions for function that use these globals?
ancestor_cache = {}
descendant_cache = {}

In [4]:
# todo deal with circularity in env package prediction -> env triad reporting

----

In [5]:
# ANCHOR_CURIE = 'ENVO:00000428' # biome
ANCHOR_CURIE = 'ENVO:01000813' # astronomical body part "abp"

In [6]:
gold_scope = 'mixs:env_local'

In [7]:
GOLDTERMS_SOIL = 'GOLDTERMS:4212'
GOLDTERMS_WATER = 'GOLDTERMS:3984'
GOLDTERMS_SEDIMENT = 'GOLDTERMS:3985' #  doesn't have any subclasses
GOLDTERMS_PLANT_ASSOCIATED = '' #  doesn't have any subclasses
# 
# GOLDTERMS:4180, 'Environmental > Aquatic > Freshwater > Pond > Sediment' and ~64 more don't share a common root
# poetry run runoak -i sqlite:obo:goldterms info 't~sediment'

# GOLDTERMS plant associated?


In [8]:
# GOLDTERMS_ROOT = GOLDTERMS_SOIL
GOLDTERMS_ROOT = GOLDTERMS_PLANT_ASSOCIATED

In [9]:
ncbi_scope = 'env_local_scale'

In [10]:
# todo new since soil: why are we only considering MIMS.me for discovering appropriate env triad values?

# NCBI_PACKAGE = 'MIMS.me.soil.6.0'
# NCBI_PACKAGE = 'MIMS.me.water.6.0'
# NCBI_PACKAGE = 'MIMS.me.sediment.6.0'
NCBI_PACKAGE = 'MIMS.me.plant-associated.6.0'

In [11]:
nmdc_scope= 'env_local_scale_id'

In [12]:
# TARGET_ENV_PACKAGE = 'soil'
# TARGET_ENV_PACKAGE = 'water'
# TARGET_ENV_PACKAGE = 'sediment'
TARGET_ENV_PACKAGE = 'plant-associated'


In [13]:
# CONTEXT_ENUM = "EnvBroadScaleSoilEnum"
# CONTEXT_ENUM = "EnvBroadScaleWaterEnum"
# CONTEXT_ENUM = "EnvBroadScaleSedimentEnum"
# CONTEXT_ENUM = "EnvBroadScalePlantAssociatedEnum"
CONTEXT_ENUM = "EnvLocalScalePlantAssociatedEnum"

In [14]:
output_file_name = "plant_associated_env_local_scale_rows_frame.tsv"

----

In [15]:
# Approved prefixes (case-insensitive)
approved_prefixes = ['ENVO']

In [16]:
MIN_ANNOTATION_LEN = 3

In [17]:
previous_submission_schema_url = "https://raw.githubusercontent.com/microbiomedata/submission-schema/v10.7.0/src/nmdc_submission_schema/schema/nmdc_submission_schema.yaml"

In [18]:
NMDC_RUNTIME_BASE_URL = 'https://api.microbiomedata.org/nmdcschema/'
STUDY_SET_COLLECTION = 'study_set'
BIOSAMPLE_SET_COLLECTION = 'biosample_set'

In [19]:
envo_adapter_string = "sqlite:obo:envo"

In [20]:
# goldterms_adapter_string = "sqlite:obo:envo"

In [21]:
env_package_override_file = '../../mam-env-package-overrides.tsv'
override_column = 'mam_inferred_env_package'

In [22]:
ncbi_duckdb_url = 'https://portal.nersc.gov/project/m3408/biosamples_duckdb/ncbi_biosamples_2024-09-23.duckdb.gz'

In [23]:
gold_data_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
BIOSAMPLES_SHEET = "Biosample"

In [24]:
goldterms_semsql_url = "https://s3.amazonaws.com/bbop-sqlite/goldterms.db.gz"


In [25]:
goldterms_subclass_query = f"""
select
	subject
from
	entailed_edge ee
where
	predicate = 'rdfs:subClassOf'
	and object = '{GOLDTERMS_ROOT}'
"""

In [26]:
# todo could this have been done with a OAK query, eliminating the need to explicitly download the file?

goldterms_envo_query = f"""
SELECT
	*
FROM
	statements s
WHERE
	predicate = '{gold_scope}'"""

In [27]:
ncbi_query = f"""
SELECT content, COUNT(1) AS sample_count 
FROM attributes 
WHERE harmonized_name = '{ncbi_scope}' AND package_content = '{NCBI_PACKAGE}'
GROUP BY content
ORDER BY COUNT(1) DESC
"""

In [28]:
def predict_from_normalized_env_packages(df_raw, adapter):
    # Apply the function to the relevant columns

    df = df_raw.copy()

    print(df.shape)
    for column in ['env_broad_scale_id', 'env_local_scale_id', 'env_medium_id']:
        df[f'{column}_ancestors'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['ancestors'])
        df[f'{column}_descendants'] = df[column].apply(lambda x: get_hierarchy_terms(x, adapter)['descendants'])

    # Vectorize each set of terms separately
    broad_scale_ancestors = vectorize_terms(df, 'env_broad_scale_id_ancestors')
    broad_scale_descendants = vectorize_terms(df, 'env_broad_scale_id_descendants')

    local_scale_ancestors = vectorize_terms(df, 'env_local_scale_id_ancestors')
    local_scale_descendants = vectorize_terms(df, 'env_local_scale_id_descendants')

    medium_ancestors = vectorize_terms(df, 'env_medium_id_ancestors')
    medium_descendants = vectorize_terms(df, 'env_medium_id_descendants')

    # Combine all feature matrices
    X = hstack([
        broad_scale_ancestors,
        broad_scale_descendants,
        local_scale_ancestors,
        local_scale_descendants,
        medium_ancestors,
        medium_descendants
    ])

    # Filter the DataFrame to only include non-null rows for the target column
    df_filtered = df[df['normalized_env_package'].notnull() & (df['normalized_env_package'] != "")]

    # Extract the target variable
    y = df_filtered['normalized_env_package']

    # Ensure X corresponds to the filtered rows
    X_filtered = X[df_filtered.index]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.3, random_state=42)

    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # not determining confidence for each class nor saving any diagnostics any more

    return clf.predict(X)

In [29]:
def get_hierarchy_terms(my_curie: str, adapter) -> dict:
    """
    Extract ancestor and descendant terms from the ontology for a given CURIE,
    using caching to improve performance and filtering by 'is_a' relationships.

    Args:
        my_curie (str): CURIE identifier for the ontology term.
        adapter: Ontology adapter.

    Returns:
        dict: Dictionary containing lists of ancestor and descendant terms.
    """
    if my_curie not in ancestor_cache:
        try:
            ancestors = list(adapter.ancestors(my_curie, predicates=[IS_A]))
            ancestor_cache[my_curie] = [adapter.label(ancestor) for ancestor in ancestors if ancestor]
        except Exception as my_e:
            print(f"Error retrieving ancestors for {my_curie}: {my_e}")
            ancestor_cache[my_curie] = []

    if my_curie not in descendant_cache:
        try:
            descendants = list(adapter.descendants(my_curie, predicates=[IS_A]))
            descendant_cache[my_curie] = [adapter.label(descendant) for descendant in descendants if descendant]
        except Exception as my_e:
            print(f"Error retrieving descendants for {my_curie}: {my_e}")
            descendant_cache[my_curie] = []

    return {
        'ancestors': ancestor_cache[my_curie],
        'descendants': descendant_cache[my_curie],
    }

In [30]:
# Determine the filenames and target directory
ncbi_compressed_filename = urlparse(ncbi_duckdb_url).path.split('/')[-1]
ncbi_filename = os.path.splitext(ncbi_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

In [31]:
# Fetch the contents from the URL and save compressed file in target directory
ncbi_compressed_file_path = os.path.join(target_dir, ncbi_compressed_filename)

# ncbi_response = requests.get(ncbi_duckdb_url)
# with open(ncbi_compressed_file_path, "wb") as f:
#     f.write(ncbi_response.content)

# ~ 2 minutes @ 250 Mbps

In [32]:
# Unzip the compressed file and save the extracted file in target directory
ncbi_uncompressed_file_path = os.path.join(target_dir, ncbi_filename)
# with gzip.open(ncbi_compressed_file_path, "rb") as f_in:
#     with open(ncbi_uncompressed_file_path, "wb") as f_out:
#         shutil.copyfileobj(f_in, f_out)
#
# # ~ 2 minutes

In [33]:
ncbi_conn = duckdb.connect(database=ncbi_uncompressed_file_path, read_only=True)

In [34]:
envo_adapter = get_adapter(envo_adapter_string)

In [35]:
anchor_descendants = get_curie_descendants_label_dict(ANCHOR_CURIE, [IS_A], envo_adapter)

In [36]:
anchor_descendants_lod = curie_descendants_label_dict_to_lod(anchor_descendants)

In [37]:
anchor_descendants_frame = curie_descendants_label_lod_to_df(anchor_descendants_lod)

In [38]:
anchor_descendants_frame

Unnamed: 0,curie,label
0,ENVO:00001999,marine water body
1,ENVO:01000188,tropical savanna biome
2,ENVO:00000487,paternoster lake
3,ENVO:01000860,temperate marine upwelling biome
4,ENVO:01000199,mediterranean forest biome
...,...,...
1731,ENVO:01000429,burrow
1732,ENVO:01000431,mixed forest
1733,ENVO:01000536,factory
1734,ENVO:00000873,freshwater biome


----

In [39]:
sv = get_schemaview_from_source(previous_submission_schema_url)

In [40]:
# todo break out slow steps into its own cell

try:
    CONTEXT_ENUM_def = sv.get_enum(CONTEXT_ENUM)
    context_pvs_keys = list(CONTEXT_ENUM_def.permissible_values.keys())
except AttributeError as e:
    # Handle the AttributeError
    print(f"An AttributeError occurred: {e}")
    context_pvs_keys =[]
    

An AttributeError occurred: 'NoneType' object has no attribute 'permissible_values'


In [41]:
initially_parsed_context_pvs = parse_hierarchically_underscored_strings(context_pvs_keys)

In [42]:
deduped_context_pvs = dedupe_underscoreless_pvs(initially_parsed_context_pvs)

In [43]:
pv_validation_results = validate_curie_label_list_dict(deduped_context_pvs, envo_adapter, print_flag=True)

In [44]:
pv_validation_results

{'problems': [], 'valids': []}

----

In [45]:
# todo rename to all_nmdc_samples etc
all_nmdc_biosamples = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL,
                                               BIOSAMPLE_SET_COLLECTION)  # Example with stop_after

Fetched page 1 with 1000 documents. Total fetched: 1000
Fetched page 2 with 1000 documents. Total fetched: 2000
Fetched page 3 with 1000 documents. Total fetched: 3000
Fetched page 4 with 1000 documents. Total fetched: 4000
Fetched page 5 with 1000 documents. Total fetched: 5000
Fetched page 6 with 1000 documents. Total fetched: 6000
Fetched page 7 with 1000 documents. Total fetched: 7000
Fetched page 8 with 1000 documents. Total fetched: 8000
Fetched page 9 with 320 documents. Total fetched: 8320
All documents fetched.


In [46]:
# # todo I don't think we're actually using this
# all_studies = get_docs_from_nmdc_collection(NMDC_RUNTIME_BASE_URL, STUDY_SET_COLLECTION)  # Example with stop_after

In [47]:
env_pacakge_overrides = tsv_to_dict_of_dicts(env_package_override_file, 'id')

In [48]:
# env_pacakge_overrides
# todo or show as frame
# todo include some other columns for context?

In [49]:
biosample_contexts_lod = biosamples_lod_context_extractor(all_nmdc_biosamples, envo_adapter,
                                                          my_env_pacakge_overrides=env_pacakge_overrides)

Overriding env_package for biosample nmdc:bsm-11-0k8nkx16 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-19v98823 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-1yvac190 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-28kgw077 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-2hswww54 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-34przm31 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-35m0rm03 from  to hydrocarbon resources-fluids_swabs
Overriding env_package for biosample nmdc:bsm-11-3636w778 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nffqc45 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3nhng665 from  to plant-associated
Overriding env_package for biosample nmdc:bsm-11-3r4g4610 from  to hydrocarbon resources-fluids_swabs
Overriding env_package

In [50]:
nmdc_biosample_contexts_frame = pd.DataFrame(biosample_contexts_lod)

In [51]:
# print a value count for the normalized_env_package column
print("Value counts for normalized_env_package column:")
print(nmdc_biosample_contexts_frame['normalized_env_package'].value_counts(dropna=False))

Value counts for normalized_env_package column:
normalized_env_package
                                                   5838
soil                                               1665
plant-associated                                    401
water                                               192
miscellaneous natural or artificial environment     140
host-associated                                      61
hydrocarbon resources-fluids_swabs                   23
Name: count, dtype: int64


In [52]:
package_predictions = predict_from_normalized_env_packages(nmdc_biosample_contexts_frame, envo_adapter)

(8320, 14)
                                                 precision    recall  f1-score   support

                                host-associated       1.00      1.00      1.00        21
             hydrocarbon resources-fluids_swabs       1.00      0.83      0.91         6
miscellaneous natural or artificial environment       1.00      1.00      1.00        44
                               plant-associated       1.00      1.00      1.00       132
                                           soil       1.00      1.00      1.00       489
                                          water       0.98      1.00      0.99        53

                                       accuracy                           1.00       745
                                      macro avg       1.00      0.97      0.98       745
                                   weighted avg       1.00      1.00      1.00       745



In [53]:
nmdc_biosample_contexts_frame['predicted_env_package'] = package_predictions

In [54]:
nmdc_biosample_contexts_frame.shape

(8320, 15)

In [55]:
nmdc_biosample_contexts_frame = nmdc_biosample_contexts_frame[
    nmdc_biosample_contexts_frame['predicted_env_package'] == TARGET_ENV_PACKAGE]

In [56]:
nmdc_biosample_contexts_frame.shape

(401, 15)

----

In [57]:
ncbi_frame = ncbi_conn.execute(ncbi_query).fetchdf()

In [58]:
ncbi_frame.insert(0, 'serial_number', range(1, len(ncbi_frame) + 1))

In [59]:
# includes values with counts of one... useful for discovering drag-down submissions?

In [60]:
ncbi_frame['content_list'] = ncbi_frame['content'].str.split('|')

In [61]:
ncbi_frame['content_count'] = ncbi_frame['content_list'].apply(len)

In [62]:
ncbi_frame.shape

(1955, 5)

In [63]:
ncbi_frame = ncbi_frame.explode('content_list').reset_index(drop=True)

In [64]:
ncbi_frame.shape

(1984, 5)

In [65]:
# how many content_list strings contain envo multiple times now?

In [66]:
ncbi_frame['envo_count'] = ncbi_frame['content_list'].str.lower().str.count("envo")

In [67]:
ncbi_frame['envo_count'].value_counts()

envo_count
0    1742
1     227
3      14
2       1
Name: count, dtype: int64

doesn't account for multiple label strings delimited with something other than '|'

In [68]:
ncbi_frame[['extracted_label', 'extracted_curie']] = ncbi_frame['content_list'].apply(parse_curie_label)

In [69]:
parse_failures = ncbi_frame[
    (ncbi_frame['envo_count'] > 0) & (ncbi_frame['extracted_curie'].isna() | (ncbi_frame['extracted_curie'] == ''))]


In [70]:
parse_failures

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie
63,63,Tropical woodland biome [ENVO01000220],223,Tropical woodland biome [ENVO01000220],1,1,Tropical woodland biome [ENVO01000220],
157,153,Tropical shrubland biome [ENVO01000214],77,Tropical shrubland biome [ENVO01000214],1,1,Tropical shrubland biome [ENVO01000214],


In [71]:
ncbi_frame['real_label'] = ncbi_frame['extracted_curie'].apply(envo_adapter.label)

In [72]:
# Apply the function to each row in the 'label' column
ncbi_frame['longest_annotation_curie'] = ncbi_frame['extracted_label'].apply(
    lambda x: get_longest_annotation_curie(x, envo_adapter, MIN_ANNOTATION_LEN))


ERROR:root:Skipping statements(subject=ENVO:00000112,predicate=oio:hasDbXref,object=<http://www.eionet.europa.eu/gemet/concept/8704>,value=None,datatype=None,language=None,); ValueError: <http://www.eionet.europa.eu/gemet/concept/8704> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:00001996,predicate=oio:hasDbXref,object=<https://en.wikipedia.org/wiki/Acid_mine_drainage>,value=None,datatype=None,language=None,); ValueError: <https://en.wikipedia.org/wiki/Acid_mine_drainage> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000225,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests>,value=None,datatype=None,language=None,); ValueError: <https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000227,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical

In [73]:
ncbi_frame['longest_annotation_label'] = ncbi_frame['longest_annotation_curie'].apply(envo_adapter.label)

In [74]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label
0,1,woodland,6385,woodland,1,0,woodland,,,ENVO:00000057,mangrove swamp
1,2,rhizosphere environment [ENVO_01000999],5325,rhizosphere environment [ENVO_01000999],1,1,rhizosphere environment,ENVO:01000999,rhizosphere environment,ENVO:01000999,rhizosphere environment
2,3,not applicable,2924,not applicable,1,0,not applicable,,,,
3,4,laboratory environment [ENVO:01001405],2683,laboratory environment [ENVO:01001405],1,1,laboratory environment,ENVO:01001405,laboratory environment,ENVO:01001405,laboratory environment
4,5,forest,1768,forest,1,0,forest,,,ENVO:00000111,forested area
...,...,...,...,...,...,...,...,...,...,...,...
1979,1951,Vineyard-SMDRTd1,1,Vineyard-SMDRTd1,1,0,Vineyard-SMDRTd1,,,ENVO:00000116,vineyard
1980,1952,Vineyard-VARTc2,1,Vineyard-VARTc2,1,0,Vineyard-VARTc2,,,ENVO:00000116,vineyard
1981,1953,Vineyard-VVRTc3,1,Vineyard-VVRTc3,1,0,Vineyard-VVRTc3,,,ENVO:00000116,vineyard
1982,1954,Vineyard-VVRTe1,1,Vineyard-VVRTe1,1,0,Vineyard-VVRTe1,,,ENVO:00000116,vineyard


----

In [75]:
gold_biosamples_frame = pd.read_excel(gold_data_url, sheet_name=BIOSAMPLES_SHEET)
# 2 minutes

  warn("Workbook contains no default style, apply openpyxl's default")


In [76]:
gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].fillna(
    0).astype(int)


In [77]:
gold_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM
0,Gb0011929,"GEBA_MDM Biosample from Great Boiling Spring, ...",749907.0,sediment metagenome,Sediment,,"Great Boiling Spring (GBS), Nevada",40.661433,-119.366250,3992,Environmental,Aquatic,Thermal springs,Hot (42-90C),Unclassified
1,Gb0035601,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
2,Gb0035602,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
3,Gb0035635,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
4,Gb0035638,Compost enrichment cellulose adapted microbial...,702656.0,compost metagenome,single cell,2012-10-31,USA: California: Emeryville: Joint Bio-Energy ...,37.840800,-122.289600,6039,Engineered,Solid waste,Green waste,Composting,Unclassified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210932,Gb0405291,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
210933,Gb0405292,Freshwater biofilm microbial communities from ...,718308.0,biofilm metagenome,creek biofilm,2023-07-26,"USA: Lewis Run NEON Field Site, Briggs, Virginia",39.095630,-77.983216,8389,Environmental,Aquatic,Freshwater,Creek,Biofilm
210934,Gb0405293,Freshwater microbial communities from Lake Sug...,449393.0,freshwater metagenome,lake water,2023-08-01,"USA: Lake Suggs NEON Field Site, Melrose, Florida",29.688140,-82.017318,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified
210935,Gb0405294,Freshwater microbial communities from Prairie ...,449393.0,freshwater metagenome,lake water,2023-08-08,"USA: Prairie Lake NEON Field Site, Vashti, Nor...",47.159710,-99.118723,4179,Environmental,Aquatic,Freshwater,Lake,Unclassified


In [78]:
# Determine the filenames and target directory
goldterms_compressed_filename = urlparse(goldterms_semsql_url).path.split('/')[-1]
goldterms_filename = os.path.splitext(goldterms_compressed_filename)[0]
target_dir = os.path.join("..", "..")  # Two levels up

# Print to confirm the filenames
print(goldterms_filename)

goldterms.db


In [79]:
# Fetch the contents from the URL and save compressed file in target directory
goldterms_response = requests.get(goldterms_semsql_url)
goldterms_compressed_file_path = os.path.join(target_dir, goldterms_compressed_filename)
with open(goldterms_compressed_file_path, "wb") as f:
    f.write(goldterms_response.content)

In [80]:
# Unzip the compressed file and save the extracted file in target directory
goldterms_uncompressed_file_path = os.path.join(target_dir, goldterms_filename)
with gzip.open(goldterms_compressed_file_path, "rb") as f_in:
    with open(goldterms_uncompressed_file_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

In [81]:
goldterms_conn = sqlite3.connect(goldterms_uncompressed_file_path)

In [82]:
goldterms_subjects = pd.read_sql_query(goldterms_subclass_query, goldterms_conn)

In [83]:
goldterms_subjects['path_id'] = goldterms_subjects['subject'].str.extract(r'GOLDTERMS:(\d+)')

In [84]:
goldterms_subjects

Unnamed: 0,subject,path_id


In [85]:
gold_path_ids = goldterms_subjects['path_id'].dropna().unique().tolist()
gold_path_ids = [int(my_id) for my_id in gold_path_ids]


In [86]:
gold_env_filtered_biosamples_frame = gold_biosamples_frame[
    gold_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'].isin(gold_path_ids)]


In [87]:
gold_env_filtered_biosamples_frame

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,BIOSAMPLE ECOSYSTEM,BIOSAMPLE ECOSYSTEM CATEGORY,BIOSAMPLE ECOSYSTEM TYPE,BIOSAMPLE ECOSYSTEM SUBTYPE,BIOSAMPLE SPECIFIC ECOSYSTEM


In [88]:
goldterms_context_frame = pd.read_sql_query(goldterms_envo_query, goldterms_conn)

In [89]:
goldterms_context_frame['object_label'] = goldterms_context_frame['object'].apply(envo_adapter.label)

In [90]:
goldterms_context_frame['path_id'] = goldterms_context_frame['subject'].str.extract(r'GOLDTERMS:(\d+)')

In [91]:
goldterms_context_frame

Unnamed: 0,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id
0,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,GOLDTERMS:Engineered-Bioreactor-Anaerobic-Soft...,mixs:env_local,OBI:0001046,,,,,,
1,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,GOLDTERMS:Engineered-Bioreactor-DHS-reactor,mixs:env_local,OBI:0001046,,,,,,
2,GOLDTERMS:Engineered-Bioreactor-Membrane-biore...,GOLDTERMS:Engineered-Bioreactor-Membrane-biore...,mixs:env_local,OBI:0001046,,,,,,
3,GOLDTERMS:Engineered-Bioreactor-Membrane-biore...,GOLDTERMS:Engineered-Bioreactor-Membrane-biore...,mixs:env_local,OBI:0001046,,,,,,
4,GOLDTERMS:Engineered-Bioreactor-Passive-biorea...,GOLDTERMS:Engineered-Bioreactor-Passive-biorea...,mixs:env_local,OBI:0001046,,,,,,
...,...,...,...,...,...,...,...,...,...,...
453,GOLDTERMS:5838,GOLDTERMS:5838,mixs:env_local,OBI:0001046,,,,,,5838
454,GOLDTERMS:5841,GOLDTERMS:5841,mixs:env_local,ENVO:00000076,,,,,mine,5841
455,GOLDTERMS:5843,GOLDTERMS:5843,mixs:env_local,OBI:0001046,,,,,,5843
456,GOLDTERMS:5846,GOLDTERMS:5846,mixs:env_local,OBI:0001046,,,,,,5846


In [92]:
# Fill NaN values in 'BIOSAMPLE ECOSYSTEM PATH ID' with 0 and convert to int
gold_env_filtered_biosamples_frame['BIOSAMPLE ECOSYSTEM PATH ID'] = gold_env_filtered_biosamples_frame[
    'BIOSAMPLE ECOSYSTEM PATH ID'].fillna(0).astype(int)

# Drop rows with NaN in 'path_id' in goldterms_context_frame
goldterms_context_frame = goldterms_context_frame.dropna(subset=['path_id'])

# Convert 'path_id' to int
goldterms_context_frame['path_id'] = goldterms_context_frame['path_id'].astype(int)

# Perform the left merge
gold_env_filtered_biosamples_with_inferred = gold_env_filtered_biosamples_frame.merge(
    goldterms_context_frame,
    left_on='BIOSAMPLE ECOSYSTEM PATH ID',
    right_on='path_id',
    how='left'
)


In [93]:
gold_env_filtered_biosamples_with_inferred

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id


----

In [94]:
include_in_rows = set()

In [95]:
include_in_rows.update(anchor_descendants_frame['curie'])

In [96]:
include_in_rows.update([i['curie'] for i in pv_validation_results['valids']])

In [97]:
include_in_rows.update(nmdc_biosample_contexts_frame[nmdc_scope])

In [98]:
include_in_rows.update(ncbi_frame['extracted_curie'])

In [99]:
include_in_rows.update(ncbi_frame['longest_annotation_curie'])

In [100]:
include_in_rows.update(gold_env_filtered_biosamples_with_inferred['object'])

In [101]:
rows_lod = []

In [102]:
# TODO MOVE THESE UP, because the expressions are already being used above

anchor_curies = list(anchor_descendants_frame['curie'])
legacy_pv_curies = [i['curie'] for i in pv_validation_results['valids']]
biome_curies = list(envo_adapter.descendants('ENVO:00000428', predicates=[IS_A])) # 
terrestrial_biome_curies = list(envo_adapter.descendants('ENVO:00000446', predicates=[IS_A]))
aquatic_biome_curies = list(envo_adapter.descendants('ENVO:00002030', predicates=[IS_A]))
abp_curies = list(envo_adapter.descendants('ENVO:01000813', predicates=[IS_A]))
env_sys_curies = list(envo_adapter.descendants('ENVO:01000254', predicates=[IS_A]))
env_mat_curies = list(envo_adapter.descendants('ENVO:00010483', predicates=[IS_A]))
obsoletes_curies = list(envo_adapter.obsoletes())

for curie in include_in_rows:
    if curie is None:
        continue
    row = {
        'curie': curie,
        'label': envo_adapter.label(curie),
        'envo_native': False,
        'obsolete': False,
        'legacy_pv': False,
        'abp': False,
        'env_sys': False,
        'biome': False,
        'terrestrial_biome': False,
        'aquatic_biome': False,
        'env_mat': False,
    }
    prefix, local_id = curie.split(':')
    if prefix and prefix == 'ENVO' and row['label'] is not None:
        row['envo_native'] = True
    if curie in biome_curies:
        row['biome'] = True
    if curie in terrestrial_biome_curies:
        row['terrestrial_biome'] = True
    if curie in aquatic_biome_curies:
        row['aquatic_biome'] = True
    if curie in abp_curies:
        row['abp'] = True
    if curie in env_sys_curies:
        row['env_sys'] = True
    if curie in env_mat_curies:
        row['env_mat'] = True
    if curie in legacy_pv_curies:
        row['legacy_pv'] = True
    if curie in obsoletes_curies:
        row['obsolete'] = True
    rows_lod.append(row)


In [103]:
rows_frame = pd.DataFrame(rows_lod)

In [104]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,legacy_pv,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat
0,ENVO:00000111,forested area,True,False,False,True,False,False,False,False,False
1,ENVO:03501282,hangar,True,False,False,True,False,False,False,False,False
2,ENVO:00000146,snow field,True,False,False,True,False,False,False,False,False
3,ENVO:01000672,geological joint,True,False,False,True,False,False,False,False,False
4,ENVO:01000710,flood,True,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
1910,ENVO:00000165,asphalt lake,True,False,False,True,False,False,False,False,False
1911,ENVO:03501273,listed building,True,False,False,True,False,False,False,False,False
1912,ENVO:01000362,inflationary cave,True,False,False,True,False,False,False,False,False
1913,ENVO:01000943,vegetated hill,True,False,False,True,False,False,False,False,False


In [105]:
nmdc_biosample_scoped_counts = nmdc_biosample_contexts_frame[nmdc_scope].value_counts().reset_index()
nmdc_biosample_scoped_counts.columns = ['curie', 'nmdc_scoped_count']


In [106]:
nmdc_biosample_scoped_counts

Unnamed: 0,curie,nmdc_scoped_count
0,ENVO:01001057,199
1,ENVO:01001442,192
2,PO:0025025,10


In [107]:
# Perform the left merge
rows_frame = rows_frame.merge(
    nmdc_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [108]:
gold_env_filtered_biosamples_with_inferred

Unnamed: 0,BIOSAMPLE GOLD ID,BIOSAMPLE NAME,BIOSAMPLE NCBI TAX ID,BIOSAMPLE NCBI TAX NAME,BIOSAMPLE SAMPLE COLLECTION SITE,BIOSAMPLE SAMPLE COLLECTION DATE,BIOSAMPLE GEOGRAPHIC LOCATION,BIOSAMPLE LATITUDE,BIOSAMPLE LONGITUDE,BIOSAMPLE ECOSYSTEM PATH ID,...,stanza,subject,predicate,object,value,datatype,language,graph,object_label,path_id


In [109]:
gold_biosample_scoped_counts = gold_env_filtered_biosamples_with_inferred['object'].value_counts().reset_index()
gold_biosample_scoped_counts.columns = ['curie', 'gold_scoped_count']

In [110]:
gold_biosample_scoped_counts

Unnamed: 0,curie,gold_scoped_count


In [111]:
# Perform the left merge
rows_frame = rows_frame.merge(
    gold_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [112]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,legacy_pv,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,nmdc_scoped_count,gold_scoped_count
0,ENVO:00000111,forested area,True,False,False,True,False,False,False,False,False,,
1,ENVO:03501282,hangar,True,False,False,True,False,False,False,False,False,,
2,ENVO:00000146,snow field,True,False,False,True,False,False,False,False,False,,
3,ENVO:01000672,geological joint,True,False,False,True,False,False,False,False,False,,
4,ENVO:01000710,flood,True,False,False,True,False,False,False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1910,ENVO:00000165,asphalt lake,True,False,False,True,False,False,False,False,False,,
1911,ENVO:03501273,listed building,True,False,False,True,False,False,False,False,False,,
1912,ENVO:01000362,inflationary cave,True,False,False,True,False,False,False,False,False,,
1913,ENVO:01000943,vegetated hill,True,False,False,True,False,False,False,False,False,,


In [113]:
# 990 rows in https://docs.google.com/spreadsheets/d/12WH3eduBq2qSTy9zVF3n7fyajn6ssLZL/edit?gid=546570706#gid=546570706

In [114]:
# gold and ncbi counts are slightly trickier
# for gold may want to include presence or mapping in goldterms in addition to biosamples counts
# ncbi: we have extracted curies and annotated curies

In [115]:
# todo move this stuff up to immediately after the creation of ncbi_frame ?

# todo don't accept extracted curie if no real label?
# any kind of string similarity checking for label of annotated curie vs extracted label ?
# look for long runs of curies?
# can we measure the beneficial impact of any of this? current crux: how to distribute counts

ncbi_frame['curie_list'] = ncbi_frame.apply(
    lambda my_row: list({my_row['extracted_curie'], my_row['longest_annotation_curie']} - {None}),
    axis=1
)

ncbi_frame['unique_curie_count'] = ncbi_frame['curie_list'].apply(len)

In [116]:
ncbi_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count
0,1,woodland,6385,woodland,1,0,woodland,,,ENVO:00000057,mangrove swamp,[ENVO:00000057],1
1,2,rhizosphere environment [ENVO_01000999],5325,rhizosphere environment [ENVO_01000999],1,1,rhizosphere environment,ENVO:01000999,rhizosphere environment,ENVO:01000999,rhizosphere environment,[ENVO:01000999],1
2,3,not applicable,2924,not applicable,1,0,not applicable,,,,,[],0
3,4,laboratory environment [ENVO:01001405],2683,laboratory environment [ENVO:01001405],1,1,laboratory environment,ENVO:01001405,laboratory environment,ENVO:01001405,laboratory environment,[ENVO:01001405],1
4,5,forest,1768,forest,1,0,forest,,,ENVO:00000111,forested area,[ENVO:00000111],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1979,1951,Vineyard-SMDRTd1,1,Vineyard-SMDRTd1,1,0,Vineyard-SMDRTd1,,,ENVO:00000116,vineyard,[ENVO:00000116],1
1980,1952,Vineyard-VARTc2,1,Vineyard-VARTc2,1,0,Vineyard-VARTc2,,,ENVO:00000116,vineyard,[ENVO:00000116],1
1981,1953,Vineyard-VVRTc3,1,Vineyard-VVRTc3,1,0,Vineyard-VVRTc3,,,ENVO:00000116,vineyard,[ENVO:00000116],1
1982,1954,Vineyard-VVRTe1,1,Vineyard-VVRTe1,1,0,Vineyard-VVRTe1,,,ENVO:00000116,vineyard,[ENVO:00000116],1


In [117]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    1299
0     645
2      40
Name: count, dtype: int64

In [132]:
double_curie_frame = ncbi_frame[ncbi_frame['unique_curie_count'] > 1]

In [133]:
double_curie_frame = double_curie_frame[['extracted_curie', 'longest_annotation_curie']]

In [134]:
double_curie_frame = double_curie_frame.drop_duplicates()

In [135]:
double_curie_frame[['extracted_prefix', 'extracted_local_id']] = double_curie_frame['extracted_curie'].str.split(':', expand=True)

In [136]:
double_curie_frame['extracted_local_id_int'] = pd.to_numeric(double_curie_frame['extracted_local_id'], errors='coerce').astype('Int64')

In [137]:
# Ensure extracted_local_id_int is unique and sorted
unique_sorted_series = double_curie_frame['extracted_local_id_int'].dropna().drop_duplicates().sort_values()


In [138]:
# Find stretches
stretches_dict = find_consecutive_stretches_dict(unique_sorted_series)

# pprint.pprint(stretches_dict)

In [139]:
# Convert the stretches dictionary into a DataFrame
stretches_df = stretches_dict_to_long_dataframe(stretches_dict)

In [140]:
stretches_df

Unnamed: 0,stretch_id,value
0,1,1001431
1,1,1001432
2,1,1001433
3,1,1001434
4,1,1001435
5,1,1001436
6,1,1001437
7,1,1001438
8,1,1001439
9,1,1001440


In [141]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretches_df,
    left_on='extracted_local_id_int',
    right_on='value',
    how='left'
)

In [142]:
stretch_summary_df = summarize_stretch_groups(double_curie_frame)


In [143]:
stretch_summary_df

Unnamed: 0,stretch_id,most_common_longest_annotation_curie,fraction
0,1.0,ENVO:01001430,1.0


In [144]:
# Perform the left merge
double_curie_frame = double_curie_frame.merge(
    stretch_summary_df,
    left_on='stretch_id',
    right_on='stretch_id',
    how='left'
)

In [145]:
drag_evidence_frame = double_curie_frame[double_curie_frame['stretch_id'] >= 1]
drag_evidence_frame = drag_evidence_frame[['extracted_curie', 'longest_annotation_curie']]
drag_evidence_frame['drag_evidence'] = True

In [146]:
drag_evidence_frame

Unnamed: 0,extracted_curie,longest_annotation_curie,drag_evidence
6,ENVO:01001432,ENVO:01001430,True
7,ENVO:01001435,ENVO:01001430,True
8,ENVO:01001436,ENVO:01001430,True
9,ENVO:01001433,ENVO:01001430,True
10,ENVO:01001434,ENVO:01001430,True
11,ENVO:01001431,ENVO:01001430,True
12,ENVO:01001442,ENVO:01001430,True
13,ENVO:01001462,ENVO:01001430,True
14,ENVO:01001463,ENVO:01001430,True
15,ENVO:01001443,ENVO:01001430,True


In [147]:
ncbi_frame = ncbi_frame.merge(
    drag_evidence_frame,
    left_on=['extracted_curie', 'longest_annotation_curie'],
    right_on=['extracted_curie', 'longest_annotation_curie'],
    how='left'
)

In [148]:
# Initialize dragless_curie_list with curie_list values
ncbi_frame["dragless_curie_list"] = ncbi_frame["curie_list"]

# Update dragless_curie_list based on the condition
for index, row in ncbi_frame.iterrows():
    if row["drag_evidence"] is True:
        if row["longest_annotation_curie"] is not None:
            ncbi_frame.at[index, "dragless_curie_list"] = [row["longest_annotation_curie"]]
        else:
            ncbi_frame.at[index, "dragless_curie_list"] = []

ncbi_frame['dragless_curie_count'] = ncbi_frame['dragless_curie_list'].apply(len)

In [149]:
ncbi_frame['unique_curie_count'].value_counts()

unique_curie_count
1    1299
0     645
2      40
Name: count, dtype: int64

In [150]:
ncbi_frame['dragless_curie_count'].value_counts()

dragless_curie_count
1    1332
0     645
2       7
Name: count, dtype: int64

In [151]:
ncbi_frame.shape

(1984, 16)

In [152]:
ncbi_frame_undisputed = ncbi_frame[ncbi_frame['dragless_curie_count'] <= 1]

In [153]:
ncbi_frame_undisputed.shape

(1977, 16)

In [154]:
ncbi_frame_disputed = ncbi_frame[ncbi_frame['dragless_curie_count'] > 1]

In [155]:
ncbi_frame_disputed.shape

(7, 16)

In [156]:
ncbi_frame_disputed = ncbi_frame_disputed.explode("dragless_curie_list", ignore_index=True)


In [157]:
ncbi_frame_disputed.shape

(14, 16)

In [158]:
ncbi_frame_disputed["dragless_curie_list"] = ncbi_frame_disputed["dragless_curie_list"].apply(lambda x: [x])

In [159]:
# Combine the rows of ncbi_frame_undisputed and ncbi_frame_disputed into a new DataFrame
ncbi_disputes_exploded_frame = pd.concat([ncbi_frame_undisputed, ncbi_frame_disputed], ignore_index=True)


In [160]:
ncbi_disputes_exploded_frame.shape

(1991, 16)

In [161]:
ncbi_disputes_exploded_frame

Unnamed: 0,serial_number,content,sample_count,content_list,content_count,envo_count,extracted_label,extracted_curie,real_label,longest_annotation_curie,longest_annotation_label,curie_list,unique_curie_count,drag_evidence,dragless_curie_list,dragless_curie_count
0,1,woodland,6385,woodland,1,0,woodland,,,ENVO:00000057,mangrove swamp,[ENVO:00000057],1,,[ENVO:00000057],1
1,2,rhizosphere environment [ENVO_01000999],5325,rhizosphere environment [ENVO_01000999],1,1,rhizosphere environment,ENVO:01000999,rhizosphere environment,ENVO:01000999,rhizosphere environment,[ENVO:01000999],1,,[ENVO:01000999],1
2,3,not applicable,2924,not applicable,1,0,not applicable,,,,,[],0,,[],0
3,4,laboratory environment [ENVO:01001405],2683,laboratory environment [ENVO:01001405],1,1,laboratory environment,ENVO:01001405,laboratory environment,ENVO:01001405,laboratory environment,[ENVO:01001405],1,,[ENVO:01001405],1
4,5,forest,1768,forest,1,0,forest,,,ENVO:00000111,forested area,[ENVO:00000111],1,,[ENVO:00000111],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1986,231,NY_fields[ENVO:00000040],36,NY_fields[ENVO:00000040],1,1,NY_fields,ENVO:00000040,waterfall,ENVO:01000352,field,"[ENVO:00000040, ENVO:01000352]",2,,[ENVO:01000352],2
1987,425,intertidal zone [ENVO:0000316],8,intertidal zone [ENVO:0000316],1,1,intertidal zone,ENVO:0000316,,ENVO:00000316,intertidal zone,"[ENVO:00000316, ENVO:0000316]",2,,[ENVO:00000316],2
1988,425,intertidal zone [ENVO:0000316],8,intertidal zone [ENVO:0000316],1,1,intertidal zone,ENVO:0000316,,ENVO:00000316,intertidal zone,"[ENVO:00000316, ENVO:0000316]",2,,[ENVO:0000316],2
1989,1018,green_house[ENVO:00000040],1,green_house[ENVO:00000040],1,1,green_house,ENVO:00000040,waterfall,ENVO:01000417,house,"[ENVO:00000040, ENVO:01000417]",2,,[ENVO:00000040],2


In [162]:
ncbi_disputes_exploded_frame['post_explode_curie_count'] = ncbi_disputes_exploded_frame['dragless_curie_list'].apply(len)

In [163]:
ncbi_disputes_exploded_frame['post_explode_curie_count'].value_counts()

post_explode_curie_count
1    1346
0     645
Name: count, dtype: int64

In [164]:
# Set 'post_explode_curie' to the 0th item in 'dragless_curie_list'
ncbi_disputes_exploded_frame["post_explode_curie"] = ncbi_disputes_exploded_frame["dragless_curie_list"].apply(
    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
)

In [165]:

ncbi_biosample_scoped_counts = (
    ncbi_disputes_exploded_frame.groupby("post_explode_curie")["sample_count"].sum().reset_index()
)

ncbi_biosample_scoped_counts.columns = ['curie', 'ncbi_scoped_count']

In [166]:
ncbi_biosample_scoped_counts

Unnamed: 0,curie,ncbi_scoped_count
0,BFO:0000015,18
1,BFO:0000029,5
2,CHEBI:22695,11
3,CHEBI:24632,1
4,CHEBI:33290,209
...,...,...
303,RO:0002577,148
304,UBERON:0000178,48
305,UBERON:0001988,3
306,UBERON:0002100,6


In [167]:
# Perform the left merge
rows_frame = rows_frame.merge(
    ncbi_biosample_scoped_counts,
    left_on='curie',
    right_on='curie',
    how='left'
)

In [168]:
rows_frame

Unnamed: 0,curie,label,envo_native,obsolete,legacy_pv,abp,env_sys,biome,terrestrial_biome,aquatic_biome,env_mat,nmdc_scoped_count,gold_scoped_count,ncbi_scoped_count
0,ENVO:00000111,forested area,True,False,False,True,False,False,False,False,False,,,4927.0
1,ENVO:03501282,hangar,True,False,False,True,False,False,False,False,False,,,
2,ENVO:00000146,snow field,True,False,False,True,False,False,False,False,False,,,
3,ENVO:01000672,geological joint,True,False,False,True,False,False,False,False,False,,,
4,ENVO:01000710,flood,True,False,False,True,False,False,False,False,False,,,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1910,ENVO:00000165,asphalt lake,True,False,False,True,False,False,False,False,False,,,
1911,ENVO:03501273,listed building,True,False,False,True,False,False,False,False,False,,,
1912,ENVO:01000362,inflationary cave,True,False,False,True,False,False,False,False,False,,,
1913,ENVO:01000943,vegetated hill,True,False,False,True,False,False,False,False,False,,,21.0


In [169]:
rows_frame.to_csv(output_file_name, sep="\t", index=false)