In [1]:
from linkml_runtime.utils.schemaview import SchemaView
import pandas as pd
import sqlite3
import os
import sys
import shutil
import re

import pickle
import os.path
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build

import numpy

# what to import for prsing Montana's google sheet?

## What slots are *required* for biosamples?! How would I know?

https://microbiomedata.github.io/nmdc-schema/Biosample/

These are marked required

- biosample➞env_broad_scale 1..1
- biosample➞env_local_scale 1..1
- biosample➞env_medium 1..1

`id` has `identifier=True`

This script already for required and recommended MIxS like `elev` and `depth` in the soil package


In [2]:
# globals
headers_list = []
expectations_list = []

In [3]:
# template columns processed by DataHarmonizzer
blank_row = {
    "Ontology ID": "",
    "parent class": "",
    "label": "",
    "datatype": "",
    "source": "",
    "data status": "",
    "requirement": "",
    "min value": "",
    "max value": "",
    "capitalize": "",
    "description": "",
    "guidance": "",
    "examples": "",
}

### DataHarmonizer data types

| datatype              | description                                                                                                                                        |
|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|
| xs:token              | An XML string                                                                                                                                      |
| xs:unique             | A xs:token which should be unique in a   dataset. Good for validating sample identifiers for example                                               |
| xs:date               | An XML date                                                                                                                                        |
| select                | A field with a pulldown menu of selection   options. Can be indented as a hierarchy.                                                               |
| multiple              | A field with a popup menu allowing   multiple selection/deselection of items in a given hierarchy of terms.                                        |
| xs:nonNegativeInteger | An integer >= 0                                                                                                                                    |
| xs:decimal            | A decimal number                                                                                                                                   |
| provenance            | Marks a field that when Validated,   automatically receives a prefix of "DataHarmonizer provenance:   vX.Y.Z" in addition to its existing content. |

In [4]:
# required terms, according to Example-Soil_NMDC_SampleMetadata
#   https://docs.google.com/spreadsheets/d/1GZayIFIrY2jdoxRIpk9KDTBLiE71VVtb7YAd5ZSYGR0/edit#gid=860381937

# these are generally not specified as required in nmdc-schema or mixs-source

# see also montana_nmdc_mixs.tsv below

# some of these don't appear in mongodb yet

# group the GOLD *ecosystem* slots seperately?

# many of those are underannotated in the MIxS or NMDC schemas

still_expected = [
    "id",
    "identifier",
    "name",
    "ncbi_taxonomy_name",
    "env_package",
    "samp_mat_process",
    "ecosystem",
    "ecosystem_category",
    "ecosystem_type",
    "ecosystem_subtype",
    "specific_ecosystem",
]

required_if_included = [
    "samp_store_temp",
    "store_cond",
    "collection_date",
    "env_broad_scale",
    "env_local_scale",
    "env_medium",
    "geo_loc_name",
    "lat_lon",
]


# ---

# for making a template based on MIxS slots
#   from https://github.com/cmungall/mixs-source
#   (GenomicsStandardsConsortium/mixs-source might be slightly out of sync)
mixs_yaml_file = "../../mixs-source/model/schema/mixs.yaml"

# ---

# also contains some MIxS knowledge, possibly with NMDC elaboration on terms
#   also required for NMDC-native terms
#   from https://github.com/microbiomedata/nmdc-schema

nmdc_yaml_file = "../../nmdc-schema/src/schema/nmdc.yaml"

# ---

# observed, normalized use of ontology terms (classes), per env package,
#   in the INSDC Biosample database
#   from https://github.com/turbomam/scoped-mapping/blob/main/notebooks/onto_slots_by_env_pack.ipynb
envpack_slot_fractions_file = (
    "../../scoped-mapping/notebooks/envpack_slot_fractions.tsv"
)

# ---

# this is a convenient, high-performance way to access an ontology (like EnvO)
# requires a little semantic sql setup
# there are probably additional ontologies that need to be consulted
#   for picking terms to go in MIxS triad slots in non-soil templates
#   from https://github.com/cmungall/semantic-sql
envo_rdftab_file = "../../semantic-sql/db/envo.db"


# ---

# can also be helpful to look at Example-Soil_NMDC_SampleMetadata
#   https://docs.google.com/spreadsheets/d/1GZayIFIrY2jdoxRIpk9KDTBLiE71VVtb7YAd5ZSYGR0/edit#gid=0

montana_nmdc_mixs_file = "montana_nmdc_mixs.tsv" # was derrived from
Example_Soil_NMDC_SampleMetadata = "1GZayIFIrY2jdoxRIpk9KDTBLiE71VVtb7YAd5ZSYGR0"


# ---

mongodb_biosample_file = "biosample_set.csv"

# ---

# output path
#   this should be paramterized for looping over the packages
data_tsv_prefix = "../../DataHarmonizer/template/"
data_tsv_suffix = "/data.tsv"

# this script creates template files, but they need to go in template directories
# if the per-package template directories need to be created, tehy will copied from data_tsv_prototype
data_tsv_prototype = "../../DataHarmonizer/template/soil/"

In [5]:
GOOGLE_API_SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
SPREADSHEET_ID = Example_Soil_NMDC_SampleMetadata
SHEET_TAB_ID = "EcosystemTerms"

In [6]:
# https://towardsdatascience.com/how-to-import-google-sheets-data-into-a-pandas-dataframe-using-googles-api-v4-2020-f50e84ea4530


def gsheet_api_check(GOOGLE_API_SCOPES):
    creds = None
    if os.path.exists("token.pickle"):
        with open("token.pickle", "rb") as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                "credentials.json", GOOGLE_API_SCOPES
            )
            creds = flow.run_local_server(port=0)
        with open("token.pickle", "wb") as token:
            pickle.dump(creds, token)
    return creds


def pull_sheet_data(GOOGLE_API_SCOPES, SPREADSHEET_ID, SHEET_TAB_ID):
    creds = gsheet_api_check(GOOGLE_API_SCOPES)
    service = build("sheets", "v4", credentials=creds)
    sheet = service.spreadsheets()
    result = (
        sheet.values().get(spreadsheetId=SPREADSHEET_ID, range=SHEET_TAB_ID).execute()
    )
    values = result.get("values", [])

    if not values:
        print("No data found.")
    else:
        rows = (
            sheet.values()
            .get(spreadsheetId=SPREADSHEET_ID, range=SHEET_TAB_ID)
            .execute()
        )
        data = rows.get("values")
        print("COMPLETE: Data copied")
        return data

In [7]:
pulled_data = pull_sheet_data(GOOGLE_API_SCOPES, SPREADSHEET_ID, SHEET_TAB_ID)

COMPLETE: Data copied


In [8]:
triad_slots = ["env_broad_scale", "env_local_scale", "env_medium"]

et_triad_frame = pd.DataFrame(pulled_data[2:], columns=pulled_data[1])
# there are two sets of package	env_broad_scale	env_local_scale	env_medium columns
# proablay shouldn't assume these locations
et_triad_frame = et_triad_frame.iloc[:, 8:12]
et_triad_frame = et_triad_frame.loc[~et_triad_frame["package"].isnull()]
et_triad_frame = et_triad_frame.loc[
    ~et_triad_frame["env_broad_scale"].eq("select broad scale")
]
et_triad_frame = et_triad_frame.fillna("")
et_triad_frame

Unnamed: 0,package,env_broad_scale,env_local_scale,env_medium
9,air,neritic sea surface microlayer biome,,
10,built-environment,,plumbing drain,
11,HCR-cores,,oil reservoir,
12,HCR-cores,,oil seep,
13,HCR-cores,,oil well,
...,...,...,...,...
205,water,tropical marine upwelling biome,lake shore,
206,water,tropical marine upwelling biome,,
207,water,,underground water body,
208,water,,waterfall,


In [9]:
et_tf_package_counts = (
    et_triad_frame["package"]
    .value_counts()
    .rename_axis("package")
    .reset_index(name="enum_rows")
)

et_tf_package_counts

Unnamed: 0,package,enum_rows
0,soil,85
1,water,81
2,sediment,20
3,plant_associated,4
4,HCR-cores,3
5,HCR-fluids-swabs,3
6,host-associated,3
7,air,1
8,built-environment,1


In [10]:
# # melt, then
# et_triad_frame.groupby(["package", "slot"]).size().reset_index(name="XXX")

In [11]:
mixs_yaml_view = SchemaView(mixs_yaml_file)
nmdc_yaml_view = SchemaView(nmdc_yaml_file)

## `live_frame` is informative, but not used anywhere else right now

In [12]:
mongodb_biosample_frame = pd.read_csv(mongodb_biosample_file)
mongodb_biosample_cols_raw = mongodb_biosample_frame.columns

mongodb_biosample_cols = [
    re.sub(r"\..*$", "", raw) for raw in mongodb_biosample_cols_raw
]

mongodb_biosample_cols = list(set(mongodb_biosample_cols))

In [13]:
biosample_slots = nmdc_yaml_view.class_induced_slots("biosample")

Loading schema external_identifiers from ../../nmdc-schema/src/schema/nmdc.yaml
Loading schema core from ../../nmdc-schema/src/schema/nmdc.yaml
Loading schema prov from ../../nmdc-schema/src/schema/nmdc.yaml
Loading schema basic_slots from ../../nmdc-schema/src/schema/nmdc.yaml
Loading schema https://w3id.org/linkml/types from ../../nmdc-schema/src/schema/nmdc.yaml
Loading schema annotation from ../../nmdc-schema/src/schema/nmdc.yaml
Loading schema workflow_execution_activity from ../../nmdc-schema/src/schema/nmdc.yaml
Loading schema mixs from ../../nmdc-schema/src/schema/nmdc.yaml


In [14]:
montana_nmdc_mixs = pd.read_csv(montana_nmdc_mixs_file, sep="\t")
# the sample size column is hidden in the template

In [15]:
live_list = []
for i in montana_nmdc_mixs["slot"].values:
    i_mixs = mixs_yaml_view.get_slot(i)
    i_mixs = not i_mixs is None
    i_nmdc = nmdc_yaml_view.get_slot(i)
    i_nmdc = not i_nmdc is None
    i_biosample = i in biosample_slots
    i_mongodb = i in mongodb_biosample_cols
    live_list.append(
        {"mixs": i_mixs, "nmdc": i_nmdc, "biosample": i_biosample, "mongodb": i_mongodb}
    )

live_frame = pd.DataFrame(live_list)

live_frame = pd.concat([montana_nmdc_mixs, live_frame], axis=1)
live_frame

Loading schema water from ../../mixs-source/model/schema/mixs.yaml
Loading schema terms from ../../mixs-source/model/schema/mixs.yaml
Loading schema ranges from ../../mixs-source/model/schema/mixs.yaml
Loading schema https://w3id.org/linkml/types from ../../mixs-source/model/schema/mixs.yaml
Loading schema wastewater_sludge from ../../mixs-source/model/schema/mixs.yaml
Loading schema soil from ../../mixs-source/model/schema/mixs.yaml
Loading schema sediment from ../../mixs-source/model/schema/mixs.yaml
Loading schema plant_associated from ../../mixs-source/model/schema/mixs.yaml
Loading schema miscellaneous_natural_or_artificial_environment from ../../mixs-source/model/schema/mixs.yaml
Loading schema microbial_mat_biofilm from ../../mixs-source/model/schema/mixs.yaml
Loading schema hydrocarbon_resources_fluids_swabs from ../../mixs-source/model/schema/mixs.yaml
Loading schema hydrocarbon_resources_cores from ../../mixs-source/model/schema/mixs.yaml
Loading schema human_vaginal from ../

Unnamed: 0,raw,template_block,slot,notes,mixs,nmdc,biosample,mongodb
0,ID,green,id,curie,False,True,False,True
1,investigation_type,green,investigation_type,,True,True,False,False
2,package,green,env_package,,True,True,False,False
3,sample_name,green,name,long textual,False,True,False,True
4,source_mat_ID,green,source_mat_id,,True,True,False,False
5,collection_date,red,collection_date,has raw value,True,True,False,True
6,env_broad_scale,red,env_broad_scale,has raw value,True,True,False,True
7,env_local_scale,red,env_local_scale,has raw value,True,True,False,True
8,env_medium,red,env_medium,has raw value,True,True,False,True
9,geo_loc_name,red,geo_loc_name,has raw value,True,True,False,True


## Back to core functionality

In [16]:
for i in biosample_slots:
    if i.required or i.recommended or i.identifier:
        print(i.name)

env_broad_scale
env_local_scale
env_medium
id


In [17]:
enums_obj = mixs_yaml_view.all_enum()
enum_list = list(enums_obj.keys())

In [18]:
# how to get a list of all environmental packages according to  MIxS ?
epes = mixs_yaml_view.get_enum("env_package_enum")
epe_pvs = list(epes.permissible_values.keys())
unique_observed_packages = epe_pvs
unique_observed_packages.sort()

# maybe some of these aren't classes?

In [19]:
mixs_classes = list(mixs_yaml_view.all_class().keys())
# enum_not_class = set(unique_observed_packages) - set(mixs_classes)
# print(enum_not_class)

# # {'wastewater/sludge', 'hydrocarbon resources-fluids/swabs', 'microbial mat/biofilm', 'misc environment'}
# # 'wastewater_sludge'
# # 'hydrocarbon resources-fluids_swabs'
# # 'microbial mat_biofilm'
# # 'miscellaneous natural or artificial environment'

In [20]:
n2d = []
for i in mixs_classes:
    j = mixs_yaml_view.get_class(i).description
    n2d.append({"name": i, "desc": j})

n2d = pd.DataFrame(n2d)
n2d = n2d.loc[n2d["desc"].isin(unique_observed_packages)]
n2d["for_path"] = n2d["name"].str.replace("[^a-zA-Z]+", "_", regex=True)

n2d

Unnamed: 0,name,desc,for_path
0,water,water,water
2,wastewater_sludge,wastewater/sludge,wastewater_sludge
3,soil,soil,soil
4,sediment,sediment,sediment
5,plant-associated,plant-associated,plant_associated
7,microbial mat_biofilm,microbial mat/biofilm,microbial_mat_biofilm
8,hydrocarbon resources-fluids_swabs,hydrocarbon resources-fluids/swabs,hydrocarbon_resources_fluids_swabs
9,hydrocarbon resources-cores,hydrocarbon resources-cores,hydrocarbon_resources_cores
10,human-vaginal,human-vaginal,human_vaginal
11,human-skin,human-skin,human_skin


In [21]:
def append_headers_list(headers_dict):
    headers_list.append(headers_dict)

In [22]:
def make_ascii(raw):
    if raw is None:
        return " "
    else:
        encoded_string = raw.encode("ascii", "ignore")
        decoded_string = encoded_string.decode()
        return decoded_string

In [23]:
min_enum_count = 5

In [24]:
def template_package(package):
    package_slots = list(mixs_yaml_view.class_slots(package))
    package_slots.sort()

    rows_list = []
    global headers_list
    headers_list = []

    section_values_set = set()

    ultilized_enums = {}

    append_headers_list(
        {
            "label": "NMDC slots",
        }
    )

    # refactor?
    for current_term in still_expected:
        if current_term in mixs_yaml_view.all_slot():
            current_term_prioritized = mixs_yaml_view.get_slot(current_term)
        else:
            current_term_prioritized = nmdc_yaml_view.get_slot(current_term)

        # special handling for examples
        current_example = ""
        ce_len = len(current_term_prioritized.examples)
        if ce_len > 0:
            current_example = make_ascii(current_term_prioritized.examples[0].value)

        append_headers_list(
            {
                "parent class": "NMDC slots",
                "label": current_term,
                "datatype": "xs:token",
                "requirement": "required",
                "description": make_ascii(current_term_prioritized.description),
                "examples": current_example,
                "guidance": make_ascii(". ".join(current_term_prioritized.comments)),
            }
        )

    # add any of these?
    #     blank_row = {
    #     "Ontology ID": "",
    #     "source": "",
    #     "data status": "",
    #     "min value": "",
    #     "max value": "",
    #     "capitalize": "",
    #     "guidance": "",
    # }

    append_headers_list(
        {
            "parent class": "NMDC slots",
            "label": "Study IDs",
            "requirement": "recommended",
            "datatype": "xs:token",
            "description": "|-seperated list of NMDC study identifiers",
        }
    )

    for ps in package_slots:
        row_dict = blank_row.copy()

        current_term = mixs_yaml_view.induced_slot(ps, "soil")

        row_dict["Ontology ID"] = current_term["slot_uri"]

        # TODO would the is_a parent ever be empty?
        section_values_set.add(current_term["is_a"])
        row_dict["parent class"] = current_term["is_a"]
        row_dict["label"] = current_term["name"]

        # see datatypes above
        ps_range = current_term["range"]
        row_dict["datatype"] = "xs:token"
        # TODO quantity value

        # TODO sort the enums alphabetically
        if ps_range in enum_list:
            # TODO could use meaning slots from enum permissible values as ontology terms
            #   if they were populated
            #   hey, who was supposed to do that?
            enum_permissibles = list(enums_obj[ps_range]["permissible_values"].keys())
            ultilized_enums[ps] = enum_permissibles
            if "multivalued" in row_dict and row_dict["multivalued"]:
                row_dict["datatype"] = "multiple"
            else:
                row_dict["datatype"] = "select"

        # would be better to count number of enums per package/slot
        if ps in triad_slots:
            current_count = (
                et_tf_package_counts["enum_rows"]
                .loc[et_tf_package_counts["package"].eq(package)]
                .squeeze()
            )
            if (
                isinstance(current_count, numpy.int64)
                and current_count >= min_enum_count
            ):
                current_enums = list(
                    et_triad_frame[ps].loc[et_triad_frame["package"].eq(package)]
                )
                current_enums = [i for i in current_enums if i]
                ultilized_enums[ps] = current_enums
                row_dict["datatype"] = "select"

        if ps_range == "date":
            row_dict["datatype"] = "xs:date"
        if ps_range == "double":
            # see also xs:nonNegativeInteger
            row_dict["datatype"] = "xs:decimal"

        # source	Used to name a select list field that row's select list should be replicated from.
        #   For example, a "citizenship" select field sourced from a "country of birth" select list of countries.
        row_dict["source"] = ""

        # data status	A customizable list of additional metadata select options to include with given
        #   (select or numeric or text) input field to indicate if a value was missing, not collected, etc.
        #   Format: semicolon separated list of options. Options are also displayed in column help info.
        row_dict["data status"] = ""

        row_dict["requirement"] = ""
        if current_term["recommended"]:
            row_dict["requirement"] = "recommended"
        if current_term["required"]:
            row_dict["requirement"] = "required"
        if ps in required_if_included:
            row_dict["requirement"] = "required"

        # TODO
        row_dict["min value"] = ""
        row_dict["max value"] = ""

        # On data entry or validation, capitalize field content according to setting.
        #   Leaves text unchanged when no value is provided. Options: lower / UPPER / Title
        row_dict["capitalize"] = ""

        row_dict["description"] = make_ascii(current_term["description"])

        row_dict["guidance"] = make_ascii(". ".join(current_term["comments"]))

        # am I parsing examples correctly?
        row_dict["examples"] = make_ascii(current_term.examples[0].value)

        dictionary_copy = row_dict.copy()
        rows_list.append(dictionary_copy)

    reshape_list = []
    for enum_name, permissibles in ultilized_enums.items():
        for permissible in permissibles:
            reshape_list.append({"parent class": enum_name, "label": permissible})

    reshape_frame = pd.DataFrame(reshape_list)

    for current_sv in section_values_set:
        section_value_dict = blank_row.copy()
        section_value_dict["label"] = current_sv
        section_value_dict = section_value_dict.copy()
        headers_list.append(section_value_dict)

    rows_list = headers_list + rows_list

    package_template = pd.DataFrame(rows_list)

    package_template = pd.concat([package_template, reshape_frame])
    package_template = package_template.fillna("")

    return package_template

In [25]:
if not os.path.exists(data_tsv_prototype):
    print("data_tsv_prototype: " + data_tsv_prototype + " does not exist")
    sys.exit()

if not os.path.exists(data_tsv_prefix):
    print("parent template directory: " + data_tsv_prefix + " does not exist")
    print("creating")
    os.makedirs(data_tsv_prefix)

In [26]:
# oops directory creation messages and TEMPLATES values get interspersed

n2d_names = list(n2d["name"])
n2d_names.sort()
for package in n2d_names:
    #     print(package)
    for_path = n2d["for_path"].loc[n2d["name"] == package].squeeze()
    as_description = n2d["desc"].loc[n2d["name"] == package].squeeze()
    data_tsv_dir = os.path.join(data_tsv_prefix, for_path)
    if not os.path.exists(data_tsv_dir):
        #         print("package template directory: " + data_tsv_dir + " does not exist")
        #         print("copying from " + data_tsv_prototype)
        cpres = shutil.copytree(data_tsv_prototype, data_tsv_dir)
    #         print(cpres + " created")
    package_template = template_package(package)
    final_dest = os.path.join(data_tsv_dir, "data.tsv")
    package_template.to_csv(final_dest, sep="\t", index=False)
    print(
        "'MIxS "
        + as_description
        + "':    {'folder': '"
        + for_path
        + "', 'status': 'published'},"
    )
# the output below can go into TEMPLATES in script/main.js
# don't forget to also edit the default template template_label
# it should be one of the package names, not the fodler name

'MIxS air':    {'folder': 'air', 'status': 'published'},
'MIxS built environment':    {'folder': 'built_environment', 'status': 'published'},
'MIxS host-associated':    {'folder': 'host_associated', 'status': 'published'},
'MIxS human-associated':    {'folder': 'human_associated', 'status': 'published'},
'MIxS human-gut':    {'folder': 'human_gut', 'status': 'published'},
'MIxS human-oral':    {'folder': 'human_oral', 'status': 'published'},
'MIxS human-skin':    {'folder': 'human_skin', 'status': 'published'},
'MIxS human-vaginal':    {'folder': 'human_vaginal', 'status': 'published'},
'MIxS hydrocarbon resources-cores':    {'folder': 'hydrocarbon_resources_cores', 'status': 'published'},
'MIxS hydrocarbon resources-fluids/swabs':    {'folder': 'hydrocarbon_resources_fluids_swabs', 'status': 'published'},
'MIxS microbial mat/biofilm':    {'folder': 'microbial_mat_biofilm', 'status': 'published'},
'MIxS plant-associated':    {'folder': 'plant_associated', 'status': 'published'},
'MIxS 

### Don't forget to run `templates/make_all.sh`

## INSDC observed MIxS triad values
After some normalization


In [27]:
# observed term usage per package after normalizing
envpack_slot_fractions = pd.read_csv(envpack_slot_fractions_file, sep="\t")

soil_fractions = envpack_slot_fractions.loc[
    envpack_slot_fractions["env_package"].eq("soil")
].copy()
soil_ebs_fractions = soil_fractions.loc[
    envpack_slot_fractions["slot"].eq("env_broad_scale")
].copy()

# soil_ebs_fractions
# assume that the normalizastion ahs split multi annotations on |
soil_ebs_fractions[["label", "id"]] = soil_ebs_fractions["value"].str.split(
    " \[", expand=True
)
soil_ebs_fractions["id"] = soil_ebs_fractions["id"].str.replace("\]$", "", regex=True)
soil_ebs_fractions[["prefix", "local"]] = soil_ebs_fractions["id"].str.split(
    ":", expand=True
)

In [28]:
# envo terms are predominant for env broad scale in soil
# but that won't always be the case
# where to draw the line between REST API calls which don't require any setup
# and rdftab databases etc which arent' dependent on remote services

connection = sqlite3.connect(envo_rdftab_file)

biomes_q = """
select s2.subject, s2.value
from entailed_edge s1
join statements s2
on s1.subject = s2.subject
where s1.predicate = 'rdfs:subClassOf' 
and s1.object = 'ENVO:00000428'
and s2.predicate = 'rdfs:label'
---limit 9
"""

biomes_res = pd.read_sql_query(biomes_q, connection)

biomes_res

Unnamed: 0,subject,value
0,ENVO:01000190,flooded savanna biome
1,ENVO:01000189,temperate savanna biome
2,ENVO:01000188,tropical savanna biome
3,ENVO:01000187,subtropical savanna biome
4,ENVO:01000054,marine basaltic hydrothermal vent biome
...,...,...
90,ENVO:01000026,marine bathyal zone biome
91,ENVO:01000025,marine neritic benthic zone biome
92,ENVO:01000024,marine benthic biome
93,ENVO:01000023,marine pelagic biome


In [29]:
soil_ebs_fractions["biome"] = soil_ebs_fractions["id"].isin(list(biomes_res["subject"]))
soil_ebs_fractions

Unnamed: 0,env_package,slot,value,val_count,fraction,label,id,prefix,local,biome
1071,soil,env_broad_scale,Bacteria [NCBITaxon:2],58,0.003593,Bacteria,NCBITaxon:2,NCBITaxon,2,False
1072,soil,env_broad_scale,Fusarium [NCBITaxon:5506],21,0.001301,Fusarium,NCBITaxon:5506,NCBITaxon,5506,False
1073,soil,env_broad_scale,Gossypium hirsutum [NCBITaxon:3635],72,0.004461,Gossypium hirsutum,NCBITaxon:3635,NCBITaxon,3635,False
1074,soil,env_broad_scale,Healthy [HP:0032322],2,0.000124,Healthy,HP:0032322,HP,0032322,False
1075,soil,env_broad_scale,agricultural soil [ENVO:00002259],20,0.001239,agricultural soil,ENVO:00002259,ENVO,00002259,False
...,...,...,...,...,...,...,...,...,...,...
1140,soil,env_broad_scale,tropical shrubland biome [ENVO:01000214],127,0.007868,tropical shrubland biome,ENVO:01000214,ENVO,01000214,True
1141,soil,env_broad_scale,tundra [ENVO:00000112],347,0.021498,tundra,ENVO:00000112,ENVO,00000112,False
1142,soil,env_broad_scale,tundra biome [ENVO:01000180],531,0.032898,tundra biome,ENVO:01000180,ENVO,01000180,True
1143,soil,env_broad_scale,unmapped [unmapped],2136,0.132334,unmapped,unmapped,unmapped,,False
