In [1]:
from linkml_runtime.utils.schemaview import SchemaView
import pandas as pd
import sqlite3
import os
import sys
import shutil
import re

import pickle
import os.path


# what/which to import for prsing Montana's google sheet?
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build

import numpy

# pip install google-api-python-client google-auth-oauthlib python-igraph

In [2]:
# import pycairo
# import igraph as ig
# import matplotlib.pyplot as plt

## What slots are *required* for biosamples?! How would I know?

https://microbiomedata.github.io/nmdc-schema/Biosample/

These are marked required

- biosample➞env_broad_scale 1..1
- biosample➞env_local_scale 1..1
- biosample➞env_medium 1..1

`id` has `identifier=True`

This script already for required and recommended MIxS like `elev` and `depth` in the soil package


In [3]:
# globals
headers_list = []
expectations_list = []
vals = []

In [4]:
# template columns processed by DataHarmonizzer
blank_row = {
    "Ontology ID": "",
    "parent class": "",
    "label": "",
    "datatype": "",
    "source": "",
    "data status": "",
    "requirement": "",
    "min value": "",
    "max value": "",
    "capitalize": "",
    "description": "",
    "guidance": "",
    "examples": "",
}

### DataHarmonizer data types

| datatype              | description                                                                                                                                        |
|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|
| xs:token              | An XML string                                                                                                                                      |
| xs:unique             | A xs:token which should be unique in a   dataset. Good for validating sample identifiers for example                                               |
| xs:date               | An XML date                                                                                                                                        |
| select                | A field with a pulldown menu of selection   options. Can be indented as a hierarchy.                                                               |
| multiple              | A field with a popup menu allowing   multiple selection/deselection of items in a given hierarchy of terms.                                        |
| xs:nonNegativeInteger | An integer >= 0                                                                                                                                    |
| xs:decimal            | A decimal number                                                                                                                                   |
| provenance            | Marks a field that when Validated,   automatically receives a prefix of "DataHarmonizer provenance:   vX.Y.Z" in addition to its existing content. |

In [5]:
# required terms, according to Example-Soil_NMDC_SampleMetadata
#   https://docs.google.com/spreadsheets/d/1GZayIFIrY2jdoxRIpk9KDTBLiE71VVtb7YAd5ZSYGR0/edit#gid=860381937

# these are generally not specified as required in nmdc-schema or mixs-source

# see also montana_nmdc_mixs.tsv below

# some of these don't appear in mongodb yet

# group the GOLD *ecosystem* slots seperately?

# many of those are underannotated in the MIxS or NMDC schemas

still_expected = [
    "id",
    "identifier",
    "name",
    "ncbi_taxonomy_name",
    "env_package",
    "samp_mat_process",
    "ecosystem",
    "ecosystem_category",
    "ecosystem_type",
    "ecosystem_subtype",
    "specific_ecosystem",
]

required_if_included = [
    "samp_store_temp",
    "store_cond",
    "collection_date",
    "env_broad_scale",
    "env_local_scale",
    "env_medium",
    "geo_loc_name",
    "lat_lon",
]


# ---

# for making a template based on MIxS slots
#   from https://github.com/cmungall/mixs-source
#   (GenomicsStandardsConsortium/mixs-source might be slightly out of sync)
mixs_yaml_file = "../../mixs-source/model/schema/mixs.yaml"

# ---

# also contains some MIxS knowledge, possibly with NMDC elaboration on terms
#   also required for NMDC-native terms
#   from https://github.com/microbiomedata/nmdc-schema

nmdc_yaml_file = "../../nmdc-schema/src/schema/nmdc.yaml"

# ---

# observed, normalized use of ontology terms (classes), per env package,
#   in the INSDC Biosample database
#   from https://github.com/turbomam/scoped-mapping/blob/main/notebooks/onto_slots_by_env_pack.ipynb
envpack_slot_fractions_file = (
    "../../scoped-mapping/notebooks/envpack_slot_fractions.tsv"
)

# ---

# this is a convenient, high-performance way to access an ontology (like EnvO)
# requires a little semantic sql setup
# there are probably additional ontologies that need to be consulted
#   for picking terms to go in MIxS triad slots in non-soil templates
#   from https://github.com/cmungall/semantic-sql
envo_rdftab_file = "../../semantic-sql/db/envo.db"


# ---

# can also be helpful to look at Example-Soil_NMDC_SampleMetadata
#   https://docs.google.com/spreadsheets/d/1GZayIFIrY2jdoxRIpk9KDTBLiE71VVtb7YAd5ZSYGR0/edit#gid=0

montana_nmdc_mixs_file = "montana_nmdc_mixs.tsv"  # was derrived from
Example_Soil_NMDC_SampleMetadata = "1GZayIFIrY2jdoxRIpk9KDTBLiE71VVtb7YAd5ZSYGR0"


# ---

mongodb_biosample_file = "biosample_set.csv"

# ---

# output path
#   this should be paramterized for looping over the packages
data_tsv_prefix = "../../DataHarmonizer/template/"
data_tsv_suffix = "/data.tsv"

# this script creates template files, but they need to go in template directories
# if the per-package template directories need to be created, tehy will copied from data_tsv_prototype
data_tsv_prototype = "../../DataHarmonizer/template/soil/"

In [6]:
GOOGLE_API_SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
SPREADSHEET_ID = Example_Soil_NMDC_SampleMetadata
SHEET_TAB_ID = "EcosystemTerms"

In [7]:
# envo terms are predominant for env broad scale in soil
# but that won't always be the case
# where to draw the line between REST API calls which don't require any setup
# and rdftab databases etc which arent' dependent on remote services

connection = sqlite3.connect(envo_rdftab_file)

biomes_q = """
select s2.subject, s2.value
from entailed_edge s1
join statements s2
on s1.subject = s2.subject
where s1.predicate = 'rdfs:subClassOf' 
and s1.object = 'ENVO:00000428'
and s2.predicate = 'rdfs:label'
---limit 9
"""

biomes_res = pd.read_sql_query(biomes_q, connection)

# biomes_res

---

In [8]:
envo_sco_q = """
--- really only want named envo subjects and objects
--- do that outside of sql?
--- trying lexical check on beginning of CURIEs for now

select
	"subject",
	"object"
from
	statements
where
	predicate = "rdfs:subClassOf"
	and SUBSTRING("subject", 1, 5) = 'ENVO:'
	and SUBSTRING("object", 1, 5) = 'ENVO:'
"""

envo_sco_res = pd.read_sql_query(envo_sco_q, connection)

In [9]:
envo_sco_t_q = """
--- see sco notes above
select
	"subject" as child,
    predicate,
	"object" as parent
from
	entailed_edge
where
	predicate = "rdfs:subClassOf"
	and SUBSTRING("subject", 1, 5) = 'ENVO:'
	and SUBSTRING("object", 1, 5) = 'ENVO:'
"""

envo_sco_t_res = pd.read_sql_query(envo_sco_t_q, connection)
# envo_sco_t_res

In [10]:
envo_lab_q = """
--- really only want named envo subjects and objects
--- do that outside of sql?
--- trying lexical check on beginning of CURIEs for now

select
	"subject",
	"value"
from
	statements
where
	predicate = "rdfs:label"
	and SUBSTRING("subject", 1, 5) = 'ENVO:'
"""

envo_lab_res = pd.read_sql_query(envo_lab_q, connection)

envo_lab_res["lclab"] = envo_lab_res["value"].str.lower()

# envo_lab_res

----

In [11]:
# https://towardsdatascience.com/how-to-import-google-sheets-data-into-a-pandas-dataframe-using-googles-api-v4-2020-f50e84ea4530


def gsheet_api_check(GOOGLE_API_SCOPES):
    creds = None
    if os.path.exists("token.pickle"):
        with open("token.pickle", "rb") as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                "credentials.json", GOOGLE_API_SCOPES
            )
            creds = flow.run_local_server(port=0)
        with open("token.pickle", "wb") as token:
            pickle.dump(creds, token)
    return creds


def pull_sheet_data(GOOGLE_API_SCOPES, SPREADSHEET_ID, SHEET_TAB_ID):
    creds = gsheet_api_check(GOOGLE_API_SCOPES)
    service = build("sheets", "v4", credentials=creds)
    sheet = service.spreadsheets()
    result = (
        sheet.values().get(spreadsheetId=SPREADSHEET_ID, range=SHEET_TAB_ID).execute()
    )
    values = result.get("values", [])

    if not values:
        print("No data found.")
    else:
        rows = (
            sheet.values()
            .get(spreadsheetId=SPREADSHEET_ID, range=SHEET_TAB_ID)
            .execute()
        )
        data = rows.get("values")
        print("COMPLETE: Data copied")
        return data

In [12]:
# https://stackoverflow.com/a/69262613/3860847
def nest(n, c=0):
    global vals
    return ((c * "   ") + n) + (
        ""
        if not (k := [nest(a, c + 1) for a, b in vals if b == n])
        else "\n" + ("\n".join(k))
    )

In [13]:
# assuming a lot of globals
def list_to_text_tree(current_label_list):
    envo_lab_current_curated = envo_lab_res.loc[envo_lab_res["lclab"].isin(current_label_list)]

    children_to_display = envo_sco_t_res["child"].isin(
        list(envo_lab_current_curated["subject"])
    )
    children_to_display = envo_sco_t_res.loc[children_to_display]

    child_count = children_to_display["parent"].value_counts()
    child_count = child_count.rename_axis("parent").reset_index(name="child_count")

    envo_lab_current_curated = envo_lab_current_curated.merge(
        child_count,
        how="left",
        left_on="subject",
        right_on="parent",
        sort=False,
        suffixes=("_x", "_y"),
        copy=True,
        indicator=False,
        validate=None,
    )

    envo_lab_current_curated.sort_values(
        by=["child_count", "lclab"], inplace=True, ascending=[False, True]
    )

    envo_lab_current_curated = envo_lab_current_curated[["subject", "lclab", "child_count"]]
    envo_lab_current_curated.reset_index(inplace=True, drop=True)

    current_direct_sco = envo_sco_res.loc[
        envo_sco_res["subject"].isin(envo_lab_current_curated["subject"])
        & envo_sco_res["object"].isin(envo_lab_current_curated["subject"])
    ]

    workarround = current_direct_sco.copy()
    workarround = workarround.merge(
        envo_lab_res,
        how="left",
        left_on="subject",
        right_on="subject",
        sort=False,
        suffixes=("_x", "_y"),
        copy=True,
        indicator=False,
        validate=None,
    )
    workarround = workarround.merge(
        envo_lab_res,
        how="left",
        left_on="object",
        right_on="subject",
        sort=False,
        suffixes=("_x", "_y"),
        copy=True,
        indicator=False,
        validate=None,
    )
    workarround = workarround[["lclab_x", "lclab_y"]]
    workarround.columns = ["subject", "object"]

    data = workarround.to_dict()
    
    global vals
    
    vals = [[data["subject"][i], data["object"][i]] for i in data["subject"]]
    
#     print(vals)
    
    roots = {b for _, b in vals if all(j != b for j, _ in vals)}
    
    # generate to an array instead
    lab_blob = "\n".join(nest(b) for b in roots)
    lab_list = lab_blob.split("\n")
    return lab_list


In [14]:
# def envoid_to_lab(envo_id):
#     envo_lab = envo_lab_current_curated["lclab"].loc[
#         envo_lab_current_curated["subject"] == envo_id
#     ]
#     envo_lab = envo_lab.squeeze()
#     return envo_lab

In [15]:
pulled_data = pull_sheet_data(GOOGLE_API_SCOPES, SPREADSHEET_ID, SHEET_TAB_ID)

COMPLETE: Data copied


In [16]:
triad_slots = ["env_broad_scale", "env_local_scale", "env_medium"]

et_triad_frame = pd.DataFrame(pulled_data[2:], columns=pulled_data[1])
# there are two sets of package	env_broad_scale	env_local_scale	env_medium columns
# proablay shouldn't assume these locations
et_triad_frame = et_triad_frame.iloc[:, 8:12]
et_triad_frame = et_triad_frame.loc[~et_triad_frame["package"].isnull()]
et_triad_frame = et_triad_frame.loc[
    ~et_triad_frame["env_broad_scale"].eq("select broad scale")
]
et_triad_frame = et_triad_frame.fillna("")
et_triad_frame

Unnamed: 0,package,env_broad_scale,env_local_scale,env_medium
9,air,neritic sea surface microlayer biome,,
10,built-environment,,plumbing drain,
11,HCR-cores,,oil reservoir,
12,HCR-cores,,oil seep,
13,HCR-cores,,oil well,
...,...,...,...,...
205,water,tropical marine upwelling biome,lake shore,
206,water,tropical marine upwelling biome,,
207,water,,underground water body,
208,water,,waterfall,


In [17]:
et_tf_package_counts = (
    et_triad_frame["package"]
    .value_counts()
    .rename_axis("package")
    .reset_index(name="enum_rows")
)

et_tf_package_counts

Unnamed: 0,package,enum_rows
0,soil,85
1,water,81
2,sediment,20
3,plant_associated,4
4,HCR-cores,3
5,HCR-fluids-swabs,3
6,host-associated,3
7,air,1
8,built-environment,1


In [18]:
# # melt, then
# et_triad_frame.groupby(["package", "slot"]).size().reset_index(name="XXX")

In [19]:
mixs_yaml_view = SchemaView(mixs_yaml_file)
nmdc_yaml_view = SchemaView(nmdc_yaml_file)

## `live_frame` is informative, but not used anywhere else right now

In [20]:
mongodb_biosample_frame = pd.read_csv(mongodb_biosample_file)
mongodb_biosample_cols_raw = mongodb_biosample_frame.columns

mongodb_biosample_cols = [
    re.sub(r"\..*$", "", raw) for raw in mongodb_biosample_cols_raw
]

mongodb_biosample_cols = list(set(mongodb_biosample_cols))

In [21]:
biosample_slots = nmdc_yaml_view.class_induced_slots("biosample")

In [22]:
montana_nmdc_mixs = pd.read_csv(montana_nmdc_mixs_file, sep="\t")
# the sample size column is hidden in the template

In [23]:
live_list = []
for i in montana_nmdc_mixs["slot"].values:
    i_mixs = mixs_yaml_view.get_slot(i)
    i_mixs = not i_mixs is None
    i_nmdc = nmdc_yaml_view.get_slot(i)
    i_nmdc = not i_nmdc is None
    i_biosample = i in biosample_slots
    i_mongodb = i in mongodb_biosample_cols
    live_list.append(
        {"mixs": i_mixs, "nmdc": i_nmdc, "biosample": i_biosample, "mongodb": i_mongodb}
    )

live_frame = pd.DataFrame(live_list)

live_frame = pd.concat([montana_nmdc_mixs, live_frame], axis=1)
live_frame

Unnamed: 0,raw,template_block,slot,notes,mixs,nmdc,biosample,mongodb
0,ID,green,id,curie,False,True,False,True
1,investigation_type,green,investigation_type,,True,True,False,False
2,package,green,env_package,,True,True,False,False
3,sample_name,green,name,long textual,False,True,False,True
4,source_mat_ID,green,source_mat_id,,True,True,False,False
5,collection_date,red,collection_date,has raw value,True,True,False,True
6,env_broad_scale,red,env_broad_scale,has raw value,True,True,False,True
7,env_local_scale,red,env_local_scale,has raw value,True,True,False,True
8,env_medium,red,env_medium,has raw value,True,True,False,True
9,geo_loc_name,red,geo_loc_name,has raw value,True,True,False,True


## Back to core functionality

In [24]:
for i in biosample_slots:
    if i.required or i.recommended or i.identifier:
        print(i.name)

env_broad_scale
env_local_scale
env_medium
id


In [25]:
enums_obj = mixs_yaml_view.all_enum()
enum_list = list(enums_obj.keys())

In [26]:
# how to get a list of all environmental packages according to  MIxS ?
epes = mixs_yaml_view.get_enum("env_package_enum")
epe_pvs = list(epes.permissible_values.keys())
unique_observed_packages = epe_pvs
unique_observed_packages.sort()

# maybe some of these aren't classes?

In [27]:
mixs_classes = list(mixs_yaml_view.all_class().keys())
# enum_not_class = set(unique_observed_packages) - set(mixs_classes)
# print(enum_not_class)

# # {'wastewater/sludge', 'hydrocarbon resources-fluids/swabs', 'microbial mat/biofilm', 'misc environment'}
# # 'wastewater_sludge'
# # 'hydrocarbon resources-fluids_swabs'
# # 'microbial mat_biofilm'
# # 'miscellaneous natural or artificial environment'

In [28]:
n2d = []
for i in mixs_classes:
    j = mixs_yaml_view.get_class(i).description
    n2d.append({"name": i, "desc": j})

n2d = pd.DataFrame(n2d)
n2d = n2d.loc[n2d["desc"].isin(unique_observed_packages)]
n2d["for_path"] = n2d["name"].str.replace("[^a-zA-Z]+", "_", regex=True)

n2d

Unnamed: 0,name,desc,for_path
0,water,water,water
2,wastewater_sludge,wastewater/sludge,wastewater_sludge
3,soil,soil,soil
4,sediment,sediment,sediment
5,plant-associated,plant-associated,plant_associated
7,microbial mat_biofilm,microbial mat/biofilm,microbial_mat_biofilm
8,hydrocarbon resources-fluids_swabs,hydrocarbon resources-fluids/swabs,hydrocarbon_resources_fluids_swabs
9,hydrocarbon resources-cores,hydrocarbon resources-cores,hydrocarbon_resources_cores
10,human-vaginal,human-vaginal,human_vaginal
11,human-skin,human-skin,human_skin


In [29]:
def append_headers_list(headers_dict):
    headers_list.append(headers_dict)

In [30]:
def make_ascii(raw):
    if raw is None:
        return " "
    else:
        encoded_string = raw.encode("ascii", "ignore")
        decoded_string = encoded_string.decode()
        return decoded_string

In [31]:
min_enum_count = 5

----

In [32]:
# # curated_water_ebs
# current_curated = [
#     "epeiric sea biome",
#     "estuarine biome",
#     "freshwater biome",
#     "freshwater lake biome",
#     "freshwater river biome",
#     "mangrove biome",
#     "marine basaltic hydrothermal vent biome",
#     "marine benthic biome",
#     "marine cold seep biome",
#     "marine hadal zone biome",
#     "marine hydrothermal vent biome",
#     "marine neritic benthic zone biome",
#     "marine reef biome",
#     "marine salt marsh biome",
#     "marine subtidal rocky reef biome",
#     "marine ultramafic hydrothermal vent biome",
#     "tropical marginal sea biome",
# ]


# list_to_text_tree(current_curated)

In [33]:
def template_package(package):
    package_slots = list(mixs_yaml_view.class_slots(package))
    package_slots.sort()

    rows_list = []
    global headers_list
    headers_list = []

    section_values_set = set()

    ultilized_enums = {}

    append_headers_list(
        {
            "label": "NMDC slots",
        }
    )

    # refactor?
    for current_term in still_expected:
        if current_term in mixs_yaml_view.all_slot():
            current_term_prioritized = mixs_yaml_view.get_slot(current_term)
        else:
            current_term_prioritized = nmdc_yaml_view.get_slot(current_term)

        # special handling for examples
        current_example = ""
        ce_len = len(current_term_prioritized.examples)
        if ce_len > 0:
            current_example = make_ascii(current_term_prioritized.examples[0].value)

        append_headers_list(
            {
                "parent class": "NMDC slots",
                "label": current_term,
                "datatype": "xs:token",
                "requirement": "required",
                "description": make_ascii(current_term_prioritized.description),
                "examples": current_example,
                "guidance": make_ascii(". ".join(current_term_prioritized.comments)),
            }
        )

    # add any of these?
    #     blank_row = {
    #     "Ontology ID": "",
    #     "source": "",
    #     "data status": "",
    #     "min value": "",
    #     "max value": "",
    #     "capitalize": "",
    #     "guidance": "",
    # }

    append_headers_list(
        {
            "parent class": "NMDC slots",
            "label": "Study IDs",
            "requirement": "recommended",
            "datatype": "xs:token",
            "description": "|-seperated list of NMDC study identifiers",
        }
    )

    for ps in package_slots:
        row_dict = blank_row.copy()

        current_term = mixs_yaml_view.induced_slot(ps, "soil")

        row_dict["Ontology ID"] = current_term["slot_uri"]

        # TODO would the is_a parent ever be empty?
        section_values_set.add(current_term["is_a"])
        row_dict["parent class"] = current_term["is_a"]
        row_dict["label"] = current_term["name"]

        # see datatypes above
        ps_range = current_term["range"]
        row_dict["datatype"] = "xs:token"
        # TODO quantity value

        # TODO sort the enums alphabetically
        if ps_range in enum_list:
            # TODO could use meaning slots from enum permissible values as ontology terms
            #   if they were populated
            #   hey, who was supposed to do that?
            enum_permissibles = list(enums_obj[ps_range]["permissible_values"].keys())
            ultilized_enums[ps] = enum_permissibles
            if "multivalued" in row_dict and row_dict["multivalued"]:
                row_dict["datatype"] = "multiple"
            else:
                row_dict["datatype"] = "select"

        # would be better to count number of enums per package/slot
        if ps in triad_slots:
            current_count = (
                et_tf_package_counts["enum_rows"]
                .loc[et_tf_package_counts["package"].eq(package)]
                .squeeze()
            )
            if (
                isinstance(current_count, numpy.int64)
                and current_count >= min_enum_count
            ):
                current_enums = list(
                    et_triad_frame[ps].loc[et_triad_frame["package"].eq(package)]
                )
                current_enums = [i for i in current_enums if i]
                #                 print(ps)
                
                current_enums = list_to_text_tree(current_enums)
#                 print(current_enums)
                
                
                ultilized_enums[ps] = current_enums
                row_dict["datatype"] = "select"

        if ps_range == "date":
            row_dict["datatype"] = "xs:date"
        if ps_range == "double":
            # see also xs:nonNegativeInteger
            row_dict["datatype"] = "xs:decimal"

        # source	Used to name a select list field that row's select list should be replicated from.
        #   For example, a "citizenship" select field sourced from a "country of birth" select list of countries.
        row_dict["source"] = ""

        # data status	A customizable list of additional metadata select options to include with given
        #   (select or numeric or text) input field to indicate if a value was missing, not collected, etc.
        #   Format: semicolon separated list of options. Options are also displayed in column help info.
        row_dict["data status"] = ""

        row_dict["requirement"] = ""
        if current_term["recommended"]:
            row_dict["requirement"] = "recommended"
        if current_term["required"]:
            row_dict["requirement"] = "required"
        if ps in required_if_included:
            row_dict["requirement"] = "required"

        # TODO
        row_dict["min value"] = ""
        row_dict["max value"] = ""

        # On data entry or validation, capitalize field content according to setting.
        #   Leaves text unchanged when no value is provided. Options: lower / UPPER / Title
        row_dict["capitalize"] = ""

        row_dict["description"] = make_ascii(current_term["description"])

        row_dict["guidance"] = make_ascii(". ".join(current_term["comments"]))

        # am I parsing examples correctly?
        row_dict["examples"] = make_ascii(current_term.examples[0].value)

        dictionary_copy = row_dict.copy()
        rows_list.append(dictionary_copy)

    reshape_list = []
    for enum_name, permissibles in ultilized_enums.items():
        for permissible in permissibles:
            reshape_list.append({"parent class": enum_name, "label": permissible})

    reshape_frame = pd.DataFrame(reshape_list)

    for current_sv in section_values_set:
        section_value_dict = blank_row.copy()
        section_value_dict["label"] = current_sv
        section_value_dict = section_value_dict.copy()
        headers_list.append(section_value_dict)

    rows_list = headers_list + rows_list

    package_template = pd.DataFrame(rows_list)

    package_template = pd.concat([package_template, reshape_frame])
    package_template = package_template.fillna("")

    return package_template

In [34]:
if not os.path.exists(data_tsv_prototype):
    print("data_tsv_prototype: " + data_tsv_prototype + " does not exist")
    sys.exit()

if not os.path.exists(data_tsv_prefix):
    print("parent template directory: " + data_tsv_prefix + " does not exist")
    print("creating")
    os.makedirs(data_tsv_prefix)

In [35]:
# oops directory creation messages and TEMPLATES values get interspersed

n2d_names = list(n2d["name"])
n2d_names.sort()
for package in n2d_names:
    #     print(package)
    for_path = n2d["for_path"].loc[n2d["name"] == package].squeeze()
    as_description = n2d["desc"].loc[n2d["name"] == package].squeeze()
    data_tsv_dir = os.path.join(data_tsv_prefix, for_path)
    if not os.path.exists(data_tsv_dir):
        #         print("package template directory: " + data_tsv_dir + " does not exist")
        #         print("copying from " + data_tsv_prototype)
        cpres = shutil.copytree(data_tsv_prototype, data_tsv_dir)
    #         print(cpres + " created")
    package_template = template_package(package)
    final_dest = os.path.join(data_tsv_dir, "data.tsv")
    package_template.to_csv(final_dest, sep="\t", index=False)
    print(
        "'MIxS "
        + as_description
        + "':    {'folder': '"
        + for_path
        + "', 'status': 'published'},"
    )
# the output below can go into TEMPLATES in script/main.js
# don't forget to also edit the default template template_label
# it should be one of the package names, not the fodler name

'MIxS air':    {'folder': 'air', 'status': 'published'},
'MIxS built environment':    {'folder': 'built_environment', 'status': 'published'},
'MIxS host-associated':    {'folder': 'host_associated', 'status': 'published'},
'MIxS human-associated':    {'folder': 'human_associated', 'status': 'published'},
'MIxS human-gut':    {'folder': 'human_gut', 'status': 'published'},
'MIxS human-oral':    {'folder': 'human_oral', 'status': 'published'},
'MIxS human-skin':    {'folder': 'human_skin', 'status': 'published'},
'MIxS human-vaginal':    {'folder': 'human_vaginal', 'status': 'published'},
'MIxS hydrocarbon resources-cores':    {'folder': 'hydrocarbon_resources_cores', 'status': 'published'},
'MIxS hydrocarbon resources-fluids/swabs':    {'folder': 'hydrocarbon_resources_fluids_swabs', 'status': 'published'},
'MIxS microbial mat/biofilm':    {'folder': 'microbial_mat_biofilm', 'status': 'published'},
'MIxS plant-associated':    {'folder': 'plant_associated', 'status': 'published'},
'MIxS 

Where did I put the INSDC bottom up term usage? **Would need to filter, probably by count and against ontology rultes below.** Try any repair of illegal terms?

---

https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS

| Field   name    | Comments                                                                                                                    |
|-----------------|-----------------------------------------------------------------------------------------------------------------------------|
| env_broad_scale | We recommend   using subclasses of ENVO’s biome   class: http://purl.obolibrary.org/obo/ENVO_00000428                       |
| env_local_scale | Please use terms that are present in ENVO   and which are of smaller spatial grain than your entry for env_broad_scale      |
| env_medium      | Please use one or   more subclasses of ENVO’s environmental material   class: http://purl.obolibrary.org/obo/ENVO_00010483. |


https://github.com/cidgoh/DataHarmonizer/wiki/DataHarmonizer-Templates#datatype
- Datatype
    - The datatype column mentioned above accepts the following values:
    
| datatype | description                                                                          |
|----------|--------------------------------------------------------------------------------------|
| select   | A field with a pulldown menu of selection options. Can be **indented** as a hierarchy. |

---

https://docs.google.com/spreadsheets/d/1GZayIFIrY2jdoxRIpk9KDTBLiE71VVtb7YAd5ZSYGR0/edit#gid=1466894602

Need to convert from label to term ID.
- rdftab
    - /Users/MAM/Documents/gitrepos/semantic-sql/db/envo.db
        - reconstruct with Makefile?
- download ontology (as JSON?)
- web service?

Then construct induced subgraph and use that to guide indentation


'MIxS sediment':    {'folder': 'sediment', 'status': 'published'},
- env_broad_scale
    - ['epeiric sea biome', 'estuarine biome', 'freshwater biome', 'freshwater lake biome', 'freshwater river biome', 'mangrove biome', 'marine basaltic hydrothermal vent biome', 'marine benthic biome', 'marine cold seep biome', 'marine hadal zone biome', 'marine hydrothermal vent biome', 'marine neritic benthic zone biome', 'marine reef biome', 'marine salt marsh biome', 'marine subtidal rocky reef biome', 'marine ultramafic hydrothermal vent biome', 'tropical marginal sea biome']
- env_local_scale
    - ['channel', 'coast', 'delta', 'dry lake', 'dry river', 'dry valley', 'alluvial fan', 'bank', 'estuary', 'lake', 'lake shore', 'marine benthic feature', 'mid-ocean ridge', 'phreatic zone', 'pond', 'bayou', 'river', 'seamount', 'shore', 'tunnel']
- env_medium
    - []


'MIxS soil':    {'folder': 'soil', 'status': 'published'},
- env_broad_scale
    - ['alpine biome', 'alpine tundra biome', 'anthropogenic terrestrial biome', 'arid biome', 'flooded savanna biome', 'mangrove biome', 'mediterranean biome', 'mediterranean savanna biome', 'mediterranean shrubland biome', 'mediterranean woodland biome', 'montane biome', 'montane savanna biome', 'montane shrubland biome', 'polar biome', 'rangeland biome', 'savanna biome', 'shrubland biome', 'subalpine biome', 'subpolar biome', 'subtropical biome', 'subtropical savanna biome', 'subtropical shrubland biome', 'subtropical woodland biome', 'temperate biome', 'temperate savanna biome', 'temperate shrubland biome', 'temperate woodland biome', 'tropical biome', 'tropical savanna biome', 'tropical shrubland biome', 'tropical woodland biome', 'tundra biome', 'urban biome', 'village biome', 'woodland biome']
- env_local_scale
    - ['active geological fault', 'agricultural field', 'beach', 'canyon', 'cave', 'vein', 'channel', 'cliff', 'coast', 'dry lake', 'dry river', 'dry valley', 'dune', 'garden', 'glacial valley', 'hill', 'hillside', 'hummock', 'impact crater', 'isthmus', 'karst', 'lake shore', 'lava field', 'mesa', 'mountain', 'peninsula', 'plain', 'plateau', 'ridge', 'slope', 'snow field', 'tombolo', 'tuff cone', 'tunnel', 'valley', 'volcano', 'woodland clearing']
- env_medium
    - ['acrisol', 'agricultural soil', 'albeluvisol', 'alisol', 'alluvial soil', 'alluvial swamp soil', 'alpine soil', 'andosol', 'anthrosol', 'arenosol', 'bare soil', 'beech forest soil', 'bluegrass field soil', 'burned soil', 'calcisol', 'cambisol', 'chernozem', 'compost soil', 'contaminated soil', 'cryosol', 'dune soil', 'durisol', 'eucalyptus forest soil', 'ferralsol', 'fluvisol', 'forest soil', 'friable-frozen soil', 'frost-susceptible soil', 'frozen compost soil', 'frozen soil', 'gleysol', 'grassland soil', 'greenhouse soil', 'gypsisol', 'histosol', 'humus-rich acidic ash soil', 'jungle soil', 'carbon nanotube enriched soil', 'kastanozem', 'leptosol', 'limed soil', 'lixisol', 'loam', 'luvisol', 'manured soil', 'meadow soil', 'mountain forest soil', 'muddy soil', 'nitisol', 'orchard soil', 'ornithogenic soil', 'paddy field soil', 'pantothenate enriched soil', 'pasture soil', 'peat soil', 'peaty paddy field soil', 'phaeozem', 'planosol', 'plastic-frozen soil', 'plinthosol', 'podzol', 'poly-beta-hydroxybutyrate enriched soil', 'pond soil', 'quinate enriched soil', 'regosol', 'rubber plantation soil', 'sarcosine enriched soil', 'savanna soil', 'skatole enriched soil', 'solonchak', 'solonetz', 'stagnosol', 'steppe soil', 'surface soil', 'technosol', 'threonine enriched soil', 'trimethylamine enriched soil', 'tropical soil', 'ultisol', 'umbrisol', 'upland soil', 'urea enriched soil', 'vertisol', 'volcanic soil', 'xylene contaminated soil']


'MIxS water':    {'folder': 'water', 'status': 'published'},
- env_broad_scale
    - ['aquatic biome', 'aquatic biome', 'epeiric sea biome', 'epeiric sea biome', 'estuarine biome', 'estuarine biome', 'freshwater biome', 'freshwater biome', 'freshwater lake biome', 'freshwater lake biome', 'freshwater river biome', 'freshwater river biome', 'marine abyssal zone biome', 'marine abyssal zone biome', 'marine bathyal zone biome', 'marine bathyal zone biome', 'marine biome', 'marine biome', 'marine black smoker biome', 'marine black smoker biome', 'marine cold seep biome', 'marine cold seep biome', 'marine hadal zone biome', 'marine hadal zone biome', 'marine hydrothermal vent biome', 'marine hydrothermal vent biome', 'marine neritic benthic zone biome', 'marine neritic benthic zone biome', 'marine pelagic biome', 'marine pelagic biome', 'marine reef biome', 'marine reef biome', 'marine salt marsh biome', 'marine salt marsh biome', 'marine sponge reef biome', 'marine sponge reef biome', 'marine subtidal rocky reef biome', 'marine subtidal rocky reef biome', 'marine upwelling biome', 'marine upwelling biome', 'marine white smoker biome', 'marine white smoker biome', 'mediterranean sea biome', 'mediterranean sea biome', 'neritic epipelagic zone biome', 'neritic epipelagic zone biome', 'neritic mesopelagic zone biome', 'neritic mesopelagic zone biome', 'neritic pelagic zone biome', 'neritic pelagic zone biome', 'neritic sea surface microlayer biome', 'neritic sea surface microlayer biome', 'ocean biome', 'ocean biome', 'oceanic abyssopelagic zone biome', 'oceanic abyssopelagic zone biome', 'oceanic bathypelagic zone biome', 'oceanic bathypelagic zone biome', 'oceanic benthopelagic zone biome', 'oceanic benthopelagic zone biome', 'oceanic epipelagic zone biome', 'oceanic epipelagic zone biome', 'oceanic hadal pelagic zone biome', 'oceanic hadal pelagic zone biome', 'oceanic mesopelagic zone biome', 'oceanic mesopelagic zone biome', 'oceanic pelagic zone biome', 'oceanic pelagic zone biome', 'oceanic sea surface microlayer biome', 'oceanic sea surface microlayer biome', 'temperate mediterranean sea biome', 'temperate mediterranean sea biome', 'tropical marginal sea biome', 'tropical marginal sea biome', 'tropical marine coral reef biome', 'tropical marine coral reef biome', 'tropical marine upwelling biome', 'tropical marine upwelling biome']
- env_local_scale
    - ['algal bloom', 'spring', 'strait', 'stream', 'tidal pool', 'tunnel', 'bank', 'bay', 'bayou', 'beach', 'mine drainage', 'phreatic zone', 'pond', 'reservoir', 'canal', 'river', 'rocky reef', 'marine mesoscale eddy field', 'shore', 'canyon', 'cave', 'channel', 'coast', 'coastal water body', 'coral reef', 'marine benthic feature', 'cove', 'delta', 'estuary', 'fjord', 'flash flood', 'geyser', 'glacial valley', 'hot spring', 'kelp forest', 'lake', 'snow field', 'marine tidal flow', 'lake shore', 'underground water body', 'waterfall', 'well']
- env_medium
    - []

### Don't forget to run `templates/make_all.sh`

## INSDC observed MIxS triad values
After some normalization


In [None]:
# observed term usage per package after normalizing
envpack_slot_fractions = pd.read_csv(envpack_slot_fractions_file, sep="\t")

soil_fractions = envpack_slot_fractions.loc[
    envpack_slot_fractions["env_package"].eq("soil")
].copy()
soil_ebs_fractions = soil_fractions.loc[
    envpack_slot_fractions["slot"].eq("env_broad_scale")
].copy()

# soil_ebs_fractions
# assume that the normalizastion ahs split multi annotations on |
soil_ebs_fractions[["label", "id"]] = soil_ebs_fractions["value"].str.split(
    " \[", expand=True
)
soil_ebs_fractions["id"] = soil_ebs_fractions["id"].str.replace("\]$", "", regex=True)
soil_ebs_fractions[["prefix", "local"]] = soil_ebs_fractions["id"].str.split(
    ":", expand=True
)

In [None]:
soil_ebs_fractions["biome"] = soil_ebs_fractions["id"].isin(list(biomes_res["subject"]))
soil_ebs_fractions


## bacteria archea traits below 

In [None]:
from linkml_runtime.utils.schemaview import SchemaView
import pandas as pd

In [None]:
bat_yaml_file = (
    "../../icbo2021/linkml-model-enrichment/condensed_traits_NCBI_tidied_curated.yaml"
)

rows_list = []
headers_list = []

ultilized_enums = {}

blank_row = {}

default_section = "Traits"

headers_list.append({"label": default_section})

In [None]:
bat_yaml_view = SchemaView(bat_yaml_file)

bat_classes = bat_yaml_view.all_class()

bat_classes_names = list(bat_classes.keys())

bat_enums = bat_yaml_view.all_enum()
bat_enum_names = list(bat_enums.keys())

# assuming single class from linkml model enrichment
# DH only supports one class anyway
class_slots = bat_yaml_view.class_slots(bat_classes_names[0])

bat_slots = bat_yaml_view.all_slot()

In [None]:
rows_list = []
utilized_enums = {}

for i in class_slots:
    row_dict = blank_row.copy()
    row_dict["Ontology ID"] = ""
    row_dict["parent class"] = default_section
    row_dict["label"] = i
    row_dict["datatype"] = "xs:token"
    row_dict["source"] = ""
    row_dict["data status"] = ""
    row_dict["requirement"] = ""
    row_dict["min value"] = ""
    row_dict["max value"] = ""
    row_dict["capitalize"] = ""
    row_dict["description"] = ""
    row_dict["guidance"] = ""

    slot_obj = bat_slots[i]
    # assuming single example from linkml model enrichment
    current_example = slot_obj.examples[0].value
    row_dict["examples"] = current_example
    current_range = slot_obj.range
    if current_range == "integer":
        # constraint becomes more restrictive
        row_dict["datatype"] = "xs:nonNegativeInteger"
    if current_range == "float":
        row_dict["datatype"] = "xs:decimal"
    if current_range == "datetime":
        row_dict["datatype"] = "xs:date"
    takes_enum = slot_obj.range in bat_enum_names
    if takes_enum:
        permissibles_count = len(bat_enums["cell_shape_enum"].permissible_values)
        # don't bother making select for a single value
        if permissibles_count >= 2:
            # assuming no multi valueds
            row_dict["datatype"] = "select"
            current_pvs = bat_enums[slot_obj.range].permissible_values
            enum_permissibles = []
            for k, v in current_pvs.items():
                current_meaning = v.meaning
                current_description = v.description
                assembled_meaning = current_description
                if current_meaning is not None:
                    assembled_meaning = (
                        current_description + " (" + current_meaning + ")"
                    )
                enum_permissibles.append(assembled_meaning)
                enum_permissibles.sort()
        utilized_enums[i] = enum_permissibles
    rows_list.append(row_dict)

rows_list = headers_list + rows_list

rows_frame = pd.DataFrame(rows_list)

# utilized_enums

In [None]:
reshape_list = []
for enum_name, permissibles in utilized_enums.items():
    #     print(enum_name)
    for permissible in permissibles:
        reshape_list.append({"parent class": enum_name, "label": permissible})

reshape_frame = pd.DataFrame(reshape_list)
# reshape_frame

In [None]:
package_template = pd.concat([rows_frame, reshape_frame])
package_template = package_template.fillna("")

package_template

In [None]:
package_template.to_csv("data.tsv", sep="\t", index=False)