In [1]:
from linkml_runtime.utils.schemaview import SchemaView
import pandas as pd
import sqlite3
import os
import sys
import shutil

In [2]:
# ---- inputs ----------------------------------------------------------------

# MIxS LinkML schema: the template is built from the slots it defines
#   source: https://github.com/cmungall/mixs-source
#   (GenomicsStandardsConsortium/mixs-source might be slightly out of sync)
mixs_yaml_file = "../../mixs-source/model/schema/mixs.yaml"

# Observed, normalized use of ontology terms (classes), per env package,
#   in the INSDC Biosample database
#   source: https://github.com/turbomam/scoped-mapping/blob/main/notebooks/onto_slots_by_env_pack.ipynb
envpack_slot_fractions_file = "../../scoped-mapping/notebooks/envpack_slot_fractions.tsv"

# ENVO as an rdftab/SQLite database: a great source of knowledge about ENVO
#   requires a little semantic-sql setup
#   additional ontologies probably need to be consulted when picking terms
#   for the MIxS triad slots in non-soil templates
#   source: https://github.com/cmungall/semantic-sql
envo_rdftab_file = "../../semantic-sql/db/envo.db"

# NMDC LinkML schema: also contains some MIxS knowledge, possibly with NMDC
#   elaboration on terms; also required for NMDC-native terms
#   source: https://github.com/microbiomedata/nmdc-schema
# Example-Soil_NMDC_SampleMetadata can also be helpful to look at:
#   https://docs.google.com/spreadsheets/d/1GZayIFIrY2jdoxRIpk9KDTBLiE71VVtb7YAd5ZSYGR0/edit#gid=0
nmdc_yaml_file = "../../nmdc-schema/src/schema/nmdc.yaml"

# ---- outputs ---------------------------------------------------------------

# Output path pieces (prefix + package + suffix)
#   these should be parameterized for looping over the packages
data_tsv_prefix = "../../DataHarmonizer/template/"
data_tsv_suffix = "/data.tsv"

# This notebook creates template files, but they need to live in per-package
# template directories. When such a directory is missing, it is copied from
# this prototype directory.
data_tsv_prototype = "../../DataHarmonizer/template/soil/"

In [3]:
# Build one SchemaView per LinkML YAML file; later cells query these views
# for classes, slots and enums instead of parsing the YAML directly.
mixs_yaml_view = SchemaView(mixs_yaml_file)
nmdc_yaml_view = SchemaView(nmdc_yaml_file)

In [18]:
# Enums declared in the MIxS schema, keyed by enum name, plus the bare list of
# those names for membership checks against slot ranges.
enums_obj = mixs_yaml_view.all_enum()
enum_list = list(enums_obj)

### DataHarmonizer data types

| datatype              | description                                                                                                                                        |
|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|
| xs:token              | An XML string                                                                                                                                      |
| xs:unique             | An xs:token which should be unique in a dataset. Good for validating sample identifiers, for example.                                              |
| xs:date               | An XML date                                                                                                                                        |
| select                | A field with a pulldown menu of selection   options. Can be indented as a hierarchy.                                                               |
| multiple              | A field with a popup menu allowing   multiple selection/deselection of items in a given hierarchy of terms.                                        |
| xs:nonNegativeInteger | An integer >= 0                                                                                                                                    |
| xs:decimal            | A decimal number                                                                                                                                   |
| provenance            | Marks a field that when Validated,   automatically receives a prefix of "DataHarmonizer provenance:   vX.Y.Z" in addition to its existing content. |

In [19]:
# Blank template row: one empty-string entry per DataHarmonizer column,
# in the column order used for the generated table.
empty_dict = dict.fromkeys(
    [
        "Ontology ID",
        "parent class",
        "label",
        "datatype",
        "source",
        "data status",
        "requirement",
        "min value",
        "max value",
        "capitalize",
        "description",
        "guidance",
        "examples",
    ],
    "",
)

In [4]:
# how to get a list of all environmental packages according to  MIxS
# mixs_yaml_view.get_class("soil").__dict__

In [5]:
# Load the tab-separated table of observed term usage; later cells read its
# "env_package", "slot" and "value" columns.
envpack_slot_fractions = pd.read_csv(envpack_slot_fractions_file, sep="\t")

In [6]:
# Workaround while it is unclear how to get the list of all environmental
#   packages from the MIxS model itself: use the package names that were
#   actually observed, in alphabetical order.
unique_observed_packages = sorted(envpack_slot_fractions["env_package"].unique())

In [7]:
# The prototype directory is what gets copied to create any missing
# per-package template directory, so nothing downstream can work without it.
# Raise instead of calling sys.exit(): a bare sys.exit() reports success and
# gives no traceback pointing at the misconfigured path.
if not os.path.isdir(data_tsv_prototype):
    raise FileNotFoundError(
        "data_tsv_prototype: " + data_tsv_prototype + " does not exist"
    )

# The parent template directory can simply be created on demand.
if not os.path.exists(data_tsv_prefix):
    print("parent template directory: " + data_tsv_prefix + " does not exist")
    print("creating")
    os.makedirs(data_tsv_prefix, exist_ok=True)

In [12]:
# Packages can go by several slightly different names, so this lookup table
# pairs the name seen in the tabulations with the name used by mixs-source.
# It was created manually. It might be better
#   - for https://github.com/turbomam/scoped-mapping/blob/main/notebooks/onto_slots_by_env_pack.ipynb
#     to instantiate the MIxS package names from the beginning, or
#   - to build this lookup table from some authoritative source like
#     https://www.ncbi.nlm.nih.gov/biosample/docs/packages/?format=xml

# (name in tabulations, name in mixs-source); "" means no tabulated name
_package_name_pairs = [
    ("air", "air"),
    ("built", "built environment"),
    ("host-associated", "host-associated"),
    ("human-associated", "human-associated"),
    ("human-gut", "human-gut"),
    ("human-oral", "human-oral"),
    ("human-skin", "human-skin"),
    ("human-vaginal", "human-vaginal"),
    ("", "hydrocarbon resources-cores"),
    ("", "hydrocarbon resources-fluids_swabs"),
    ("microbial", "microbial mat_biofilm"),
    ("miscellaneous", "miscellaneous natural or artificial environment"),
    ("plant-associated", "plant-associated"),
    ("sediment", "sediment"),
    ("soil", "soil"),
    ("wastewater", "wastewater_sludge"),
    ("water", "water"),
]

# Column-oriented form, each column keyed by row number.
tabulations_to_mixs_source_dict = {
    "tabulations": {
        row: tabulated for row, (tabulated, _) in enumerate(_package_name_pairs)
    },
    "mixs_source": {
        row: mixs_name for row, (_, mixs_name) in enumerate(_package_name_pairs)
    },
}

tabulations_to_mixs_source = pd.DataFrame(tabulations_to_mixs_source_dict)

# tabulations_to_mixs_source

In [13]:
# mixs_yaml_view.all_class().keys()

In [87]:
def _ascii_only(text):
    """Return ``text`` with non-ASCII characters dropped; ``None`` becomes ""."""
    return (text or "").encode("ascii", "ignore").decode()


def template_package(package, mixs_package):
    """Build the DataHarmonizer template table for one MIxS package.

    Relies on the notebook-level ``mixs_yaml_view``, ``empty_dict``,
    ``enum_list`` and ``enums_obj``.

    Parameters
    ----------
    package : str
        Package name as used for the template folder; only echoed to stdout.
    mixs_package : str
        Name of the class in the MIxS schema whose slots are templated.

    Returns
    -------
    pandas.DataFrame
        Rows in this order: the fixed "Identifiers" section and its two
        fields, one section row per distinct ``is_a`` parent (sorted), one
        row per slot of ``mixs_package`` (sorted by slot name), and finally
        one row per permissible value of every enum-ranged slot, with the
        slot name as "parent class". Missing cells are empty strings.
    """
    print(package)
    print(mixs_package)

    package_slots = sorted(mixs_yaml_view.class_slots(mixs_package))

    rows_list = []
    headers_list = []
    section_values_set = set()
    utilized_enums = {}

    # Fixed leading section: sample/study identifiers.
    identifiers_section = empty_dict.copy()
    identifiers_section["label"] = "Identifiers"
    headers_list.append(identifiers_section)

    biosample_id = empty_dict.copy()
    biosample_id["parent class"] = "Identifiers"
    biosample_id["label"] = "Biosample ID"
    biosample_id["requirement"] = "required"
    biosample_id["datatype"] = "xs:unique"
    headers_list.append(biosample_id)

    study_ids = empty_dict.copy()
    study_ids["parent class"] = "Identifiers"
    study_ids["label"] = "Study IDs"
    study_ids["datatype"] = "xs:token"
    study_ids["requirement"] = "recommended"
    study_ids["description"] = "|-seperated list of NMDC study identifiers"
    headers_list.append(study_ids)

    # Slot ranges that map directly onto a DataHarmonizer datatype
    # (see the datatype table above; xs:nonNegativeInteger is not used yet).
    range_to_datatype = {"date": "xs:date", "double": "xs:decimal"}

    for ps in package_slots:
        row_dict = empty_dict.copy()
        # Induce the slot in the context of the requested package so that
        # the package's own slot_usage is applied.
        current_term = mixs_yaml_view.induced_slot(ps, mixs_package)

        row_dict["Ontology ID"] = current_term["slot_uri"]

        # The is_a parent doubles as the DataHarmonizer section. A slot
        # without a parent gets an empty "parent class" and adds no section.
        section = current_term["is_a"]
        if section:
            section_values_set.add(section)
        row_dict["parent class"] = section or ""
        row_dict["label"] = current_term["name"]

        # TODO quantity value
        slot_range = current_term["range"]
        datatype = "xs:token"
        if slot_range in enum_list:
            # TODO could use meaning slots from enum permissible values as
            #   ontology terms if they were populated
            utilized_enums[ps] = list(
                enums_obj[slot_range]["permissible_values"].keys()
            )
            # multivalued is a property of the slot, not of the output row
            datatype = "multiple" if current_term["multivalued"] else "select"
        row_dict["datatype"] = range_to_datatype.get(slot_range, datatype)

        # Columns intentionally left at their empty default for now:
        #   source       names a select list field that this row's select
        #                list should be replicated from, e.g. a "citizenship"
        #                field sourced from a "country of birth" list.
        #   data status  semicolon separated list of extra select options
        #                (missing, not collected, ...) for the field.
        #   min value / max value   TODO
        #   capitalize   lower / UPPER / Title, applied on entry/validation.

        # "required" wins over "recommended" when both are set.
        if current_term["recommended"]:
            row_dict["requirement"] = "recommended"
        if current_term["required"]:
            row_dict["requirement"] = "required"

        row_dict["description"] = _ascii_only(current_term["description"])
        row_dict["guidance"] = _ascii_only(". ".join(current_term["comments"]))

        # am I parsing examples correctly?
        examples = current_term.examples
        row_dict["examples"] = examples[0].value if examples else ""

        rows_list.append(row_dict)

    # One row per permissible value, attached to the slot that uses the enum.
    reshape_list = [
        {"parent class": enum_name, "label": permissible}
        for enum_name, permissibles in utilized_enums.items()
        for permissible in permissibles
    ]
    reshape_frame = pd.DataFrame(reshape_list)

    # Sorted so the generated file is identical from run to run.
    for section_label in sorted(section_values_set):
        section_row = empty_dict.copy()
        section_row["label"] = section_label
        headers_list.append(section_row)

    package_template = pd.DataFrame(headers_list + rows_list)
    package_template = pd.concat([package_template, reshape_frame])
    package_template = package_template.fillna("")

    return package_template

In [89]:
# Single-package trial run: build the template for the "soil" package only.
package_template = template_package("soil", "soil")
# Optional inspection / export, left disabled.
# NOTE(review): data_tsv_path is not assigned in any cell visible here.
# package_template
# package_template.to_csv(data_tsv_path, sep="\t", index=False)

soil
soil


In [90]:
# For every observed package: resolve its MIxS class name, make sure its
# template directory exists (copying the prototype if not), then write data.tsv.
for package in unique_observed_packages:
    print(package)
    mixs_matches = tabulations_to_mixs_source.loc[
        tabulations_to_mixs_source["tabulations"] == package, "mixs_source"
    ]
    if mixs_matches.empty:
        # The lookup table is hand-maintained, so an observed package may be
        # missing from it. Skip it rather than fail with an IndexError.
        print("no mixs_source name known for package: " + package + "; skipping")
        continue
    mixs_package = mixs_matches.values[0]

    data_tsv_dir = os.path.join(data_tsv_prefix, package)
    if not os.path.exists(data_tsv_dir):
        print("package template directory: " + data_tsv_dir + " does not exist")
        print("copying from " + data_tsv_prototype)
        cpres = shutil.copytree(data_tsv_prototype, data_tsv_dir)
        print(cpres + " created")

    package_template = template_package(package, mixs_package)
    final_dest = os.path.join(data_tsv_dir, "data.tsv")
    package_template.to_csv(final_dest, sep="\t", index=False)

air
air
air
built
built
built environment
host-associated
host-associated
host-associated
human-associated
human-associated
human-associated
human-gut
human-gut
human-gut
human-oral
human-oral
human-oral
human-skin
human-skin
human-skin
human-vaginal
human-vaginal
human-vaginal
microbial
microbial
microbial mat_biofilm
miscellaneous
miscellaneous
miscellaneous natural or artificial environment
plant-associated
plant-associated
plant-associated
sediment
sediment
sediment
soil
soil
soil
wastewater
wastewater
wastewater_sludge
water
water
water


```
const TEMPLATES = {
  'CanCOGeN Covid-19': {'folder': 'canada_covid19', 'status': 'published'},
  'PHAC Dexa (ALPHA)': {'folder': 'phac_dexa', 'status': 'draft'},
  'GRDI (ALPHA)':      {'folder': 'grdi', 'status': 'draft'},
  'GISAID (ALPHA)':    {'folder': 'gisaid', 'status': 'draft'},
  'PHA4GE':    {'folder': 'pha4ge', 'status': 'published'},
  'MIxS Soil':    {'folder': 'soil', 'status': 'published'}
};
```

In [93]:
# Emit one TEMPLATES entry per observed package, ready to paste into the
# JavaScript object shown above.
for package in unique_observed_packages:
    print(f"'MIxS {package}':    {{'folder': '{package}', 'status': 'published'}},")

'MIxS air':    {'folder': 'air', 'status': 'published'},
'MIxS built':    {'folder': 'built', 'status': 'published'},
'MIxS host-associated':    {'folder': 'host-associated', 'status': 'published'},
'MIxS human-associated':    {'folder': 'human-associated', 'status': 'published'},
'MIxS human-gut':    {'folder': 'human-gut', 'status': 'published'},
'MIxS human-oral':    {'folder': 'human-oral', 'status': 'published'},
'MIxS human-skin':    {'folder': 'human-skin', 'status': 'published'},
'MIxS human-vaginal':    {'folder': 'human-vaginal', 'status': 'published'},
'MIxS microbial':    {'folder': 'microbial', 'status': 'published'},
'MIxS miscellaneous':    {'folder': 'miscellaneous', 'status': 'published'},
'MIxS plant-associated':    {'folder': 'plant-associated', 'status': 'published'},
'MIxS sediment':    {'folder': 'sediment', 'status': 'published'},
'MIxS soil':    {'folder': 'soil', 'status': 'published'},
'MIxS wastewater':    {'folder': 'wastewater', 'status': 'published'},
'MIx

---

In [None]:
# loads on first mention
# manually edited nmdc's mixs.yaml (from Chris' old Perl)
#   to add a name slot
#   otherwise SchemaView complains
biosample = nmdc_yaml_view.get_class("biosample")
# Open question: how to get all required and recommended slots?
#   from SchemaView class slots?
#   induced slots?
#   slot usage?
# biosample.slots

In [None]:
# Build the soil template and its destination explicitly, so this cell does
# not depend on names left over from an earlier kernel session.
soil_template = template_package("soil", "soil")
data_tsv_path = data_tsv_prefix + "soil" + data_tsv_suffix

soil_template.to_clipboard(index=False)
soil_template.to_csv(data_tsv_path, sep="\t", index=False)

In [None]:
# templist = []
# for ss in soil_slots:
#     tempdict = {}
#     current_term = mixs_yaml_view.induced_slot(ss, "soil")
#     for i in current_term:
#         #         print(i)
#         #         print(current_term[i])
#         tempdict[i] = current_term[i]
#     #     print(tempdict)
#     tdc = tempdict.copy()
#     templist.append(tdc)
# tempframe = pd.DataFrame(templist)
# tempframe.to_clipboard(index=False)

In [None]:
# ENVO terms are predominant for env_broad_scale in soil,
# but that won't always be the case.
# Open question: where to draw the line between REST API calls, which don't
# require any setup, and rdftab databases etc., which aren't dependent on
# remote services?

# Every class entailed to be a subclass of ENVO:00000428, with its label.
biomes_q = """
select s2.subject, s2.value
from entailed_edge s1
join statements s2
on s1.subject = s2.subject
where s1.predicate = 'rdfs:subClassOf' 
and s1.object = 'ENVO:00000428'
and s2.predicate = 'rdfs:label'
---limit 9
"""

# sqlite3.connect() would silently create an empty database for a wrong path
# and the query would then fail with a confusing "no such table" error.
if not os.path.exists(envo_rdftab_file):
    raise FileNotFoundError("envo_rdftab_file: " + envo_rdftab_file + " does not exist")

connection = sqlite3.connect(envo_rdftab_file)
try:
    biomes_res = pd.read_sql_query(biomes_q, connection)
finally:
    # Release the database handle even if the query fails.
    connection.close()

biomes_res

In [None]:
# Observed term usage per package after normalizing
envpack_slot_fractions = pd.read_csv(envpack_slot_fractions_file, sep="\t")

# Narrow to the soil package, then to its env_broad_scale slot. Each mask is
# built from the frame it filters, so no index alignment is involved.
soil_fractions = envpack_slot_fractions.loc[
    envpack_slot_fractions["env_package"].eq("soil")
].copy()
soil_ebs_fractions = soil_fractions.loc[
    soil_fractions["slot"].eq("env_broad_scale")
].copy()

# soil_ebs_fractions
# Assume that the normalization has split multi-annotations on |
# Values are expected to look like "label [PREFIX:local]" (TODO confirm).
# n=1 keeps the result at two columns even if a label itself contains " [".
soil_ebs_fractions[["label", "id"]] = soil_ebs_fractions["value"].str.split(
    r" \[", n=1, expand=True
)
soil_ebs_fractions["id"] = soil_ebs_fractions["id"].str.replace(
    r"\]$", "", regex=True
)
# n=1 for the same reason: only the first ":" separates prefix from local id.
soil_ebs_fractions[["prefix", "local"]] = soil_ebs_fractions["id"].str.split(
    ":", n=1, expand=True
)

In [None]:
# Flag each observed env_broad_scale id that is among the subjects returned
# by the biome query.
biome_ids = biomes_res["subject"].tolist()
soil_ebs_fractions["biome"] = soil_ebs_fractions["id"].isin(biome_ids)
soil_ebs_fractions