In [18]:
from linkml_runtime.utils.schemaview import SchemaView
import pandas as pd

In [19]:
mixs_yaml_file = "../../mixs-source/model/schema/mixs.yaml"

nmdc_yaml_file = "../../nmdc-schema/src/schema/nmdc.yaml"

data_tsv_path = "../../DataHarmonizer/template/soil/data.tsv"

In [20]:
mixs_yaml_view = SchemaView(mixs_yaml_file)
nmdc_yaml_view = SchemaView(nmdc_yaml_file)

In [24]:
# loads on first mention
x = nmdc_yaml_view.schema
x

SchemaDefinition(name='NMDC', id_prefixes=[], definition_uri=None, aliases=[], local_names={}, mappings=[], exact_mappings=[], close_mappings=[], related_mappings=[], narrow_mappings=[], broad_mappings=[], extensions={}, annotations={}, description='Schema for National Microbiome Data Collaborative (NMDC). This schem is organized into 3 separate modules:\n  \nThis schema is organized into distinct modules:\n  \n * a set of core types for representing data values\n * the mixs schema (auto-translated from mixs excel)\n * annotation schema\n * the NMDC schema itself', alt_descriptions={}, deprecated=None, todos=[], notes=[], comments=[], examples=[], in_subset=[], from_schema=None, imported_from=None, see_also=[], deprecated_element_has_exact_replacement=None, deprecated_element_has_possible_replacement=None, id='https://microbiomedata/schema', title='NMDC Schema', version='2021.06.30', imports=['linkml:types', 'mixs', 'core', 'prov', 'workflow_execution_activity', 'annotation', 'external

In [4]:
len(mixs_yaml_view.all_slot())

Loading schema water from ../../mixs-source/model/schema/mixs.yaml
Loading schema terms from ../../mixs-source/model/schema/mixs.yaml
Loading schema ranges from ../../mixs-source/model/schema/mixs.yaml
Loading schema https://w3id.org/linkml/types from ../../mixs-source/model/schema/mixs.yaml
Loading schema wastewater_sludge from ../../mixs-source/model/schema/mixs.yaml
Loading schema soil from ../../mixs-source/model/schema/mixs.yaml
Loading schema sediment from ../../mixs-source/model/schema/mixs.yaml
Loading schema plant_associated from ../../mixs-source/model/schema/mixs.yaml
Loading schema miscellaneous_natural_or_artificial_environment from ../../mixs-source/model/schema/mixs.yaml
Loading schema microbial_mat_biofilm from ../../mixs-source/model/schema/mixs.yaml
Loading schema hydrocarbon_resources_fluids_swabs from ../../mixs-source/model/schema/mixs.yaml
Loading schema hydrocarbon_resources_cores from ../../mixs-source/model/schema/mixs.yaml
Loading schema human_vaginal from ../

609

In [5]:
soil_slots = list(mixs_yaml_view.class_slots("soil"))
soil_slots.sort()

In [6]:
agrochem_addition_model = mixs_yaml_view.induced_slot("agrochem_addition", "soil")

In [7]:
# agrochem_addition_model.__dict__

In [8]:
enums_obj = mixs_yaml_view.all_enum()
enum_list = list(enums_obj.keys())

| datatype              | description                                                                                                                                        |
|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|
| xs:token              | An XML string                                                                                                                                      |
| xs:unique             | A xs:token which should be unique in a   dataset. Good for validating sample identifiers for example                                               |
| xs:date               | An XML date                                                                                                                                        |
| select                | A field with a pulldown menu of selection   options. Can be indented as a hierarchy.                                                               |
| multiple              | A field with a popup menu allowing   multiple selection/deselection of items in a given hierarchy of terms.                                        |
| xs:nonNegativeInteger | An integer >= 0                                                                                                                                    |
| xs:decimal            | A decimal number                                                                                                                                   |
| provenance            | Marks a field that when Validated,   automatically receives a prefix of "DataHarmonizer provenance:   vX.Y.Z" in addition to its existing content. |

In [9]:
rows_list = []
headers_list = []

section_values_set = set()

ultilized_enums = {}

empty_dict = {
    "Ontology ID": "",
    "parent class": "",
    "label": "",
    "datatype": "",
    "source": "",
    "data status": "",
    "requirement": "",
    "min value": "",
    "max value": "",
    "capitalize": "",
    "description": "",
    "guidance": "",
    "examples": "",
}

parent_class = "MIxS soil attribute"
# see is_a notes below

temp = empty_dict.copy()
temp["label"] = "Identifiers"
temp = temp.copy()
headers_list.append(temp)

temp = empty_dict.copy()
temp["parent class"] = "Identifiers"
temp["label"] = "Biosample ID"
temp["requirement"] = "required"
temp["datatype"] = "xs:unique"
temp = temp.copy()
headers_list.append(temp)

temp = empty_dict.copy()
temp["parent class"] = "Identifiers"
temp["label"] = "Study IDs"
temp["datatype"] = "xs:token"
temp["requirement"] = "recommended"
temp["description"] = "|-seperated list of NMDC study identifiers"

temp = temp.copy()
headers_list.append(temp)

In [10]:
for ss in soil_slots:
    row_dict = empty_dict.copy()
    current_term = mixs_yaml_view.induced_slot(ss, "soil")

    row_dict["Ontology ID"] = current_term["slot_uri"]

    # TODO would the is_a parent ever be empty?
    section_values_set.add(current_term["is_a"])
    row_dict["parent class"] = current_term["is_a"]
    row_dict["label"] = current_term["name"]

    # see datatypes above
    temp = current_term["range"]
    row_dict["datatype"] = "xs:token"
    # TODO quantity value
    if temp in enum_list:
        # TODO could use meaning slots from enum permissible values as ontology terms
        #   is they were populated
        #   hey, who was supposed to do that?
        enum_permissibles = list(enums_obj[temp]["permissible_values"].keys())
        ultilized_enums[ss] = enum_permissibles
        if "multivalued" in row_dict and row_dict["multivalued"]:
            row_dict["datatype"] = "multiple"
        else:
            row_dict["datatype"] = "select"
    if temp == "date":
        row_dict["datatype"] = "xs:date"
    if temp == "double":
        # see also xs:nonNegativeInteger
        row_dict["datatype"] = "xs:decimal"

    # source	Used to name a select list field that row's select list should be replicated from.
    #   For example, a "citizenship" select field sourced from a "country of birth" select list of countries.
    row_dict["source"] = ""

    # data status	A customizable list of additional metadata select options to include with given
    #   (select or numeric or text) input field to indicate if a value was missing, not collected, etc.
    #   Format: semicolon separated list of options. Options are also displayed in column help info.
    row_dict["data status"] = ""

    row_dict["requirement"] = ""
    if current_term["recommended"]:
        row_dict["requirement"] = "recommended"
    if current_term["required"]:
        row_dict["requirement"] = "required"

    # TODO
    row_dict["min value"] = ""
    row_dict["max value"] = ""

    # On data entry or validation, capitalize field content according to setting.
    #   Leaves text unchanged when no value is provided. Options: lower / UPPER / Title
    row_dict["capitalize"] = ""

    temp = current_term["description"]
    encoded_string = temp.encode("ascii", "ignore")
    decoded_string = encoded_string.decode()
    row_dict["description"] = decoded_string

    comments = ". ".join(current_term["comments"])
    encoded_string = comments.encode("ascii", "ignore")
    decoded_string = encoded_string.decode()
    row_dict["guidance"] = comments

    temp = current_term.examples[0].value
    # am I parsing examples correctly?
    row_dict["examples"] = current_term.examples[0].value

    dictionary_copy = row_dict.copy()
    rows_list.append(dictionary_copy)

In [17]:
# enum_list

# enums_obj["fao_class_enum"].permissible_values

{'Acrisols': PermissibleValue(text='Acrisols', description=None, meaning=None, is_a=None, mixins=[], extensions={}, annotations={}, alt_descriptions={}, deprecated=None, todos=[], notes=[], comments=[], examples=[], in_subset=[], from_schema=None, imported_from=None, see_also=[], deprecated_element_has_exact_replacement=None, deprecated_element_has_possible_replacement=None),
 'Andosols': PermissibleValue(text='Andosols', description=None, meaning=None, is_a=None, mixins=[], extensions={}, annotations={}, alt_descriptions={}, deprecated=None, todos=[], notes=[], comments=[], examples=[], in_subset=[], from_schema=None, imported_from=None, see_also=[], deprecated_element_has_exact_replacement=None, deprecated_element_has_possible_replacement=None),
 'Arenosols': PermissibleValue(text='Arenosols', description=None, meaning=None, is_a=None, mixins=[], extensions={}, annotations={}, alt_descriptions={}, deprecated=None, todos=[], notes=[], comments=[], examples=[], in_subset=[], from_schem

In [11]:
reshape_list = []
for enum_name, permissibles in ultilized_enums.items():
    for permissible in permissibles:
        reshape_list.append({"parent class": enum_name, "label": permissible})

reshape_frame = pd.DataFrame(reshape_list)
# reshape_frame

Unnamed: 0,parent class,label
0,cur_land_use,cities
1,cur_land_use,farmstead
2,cur_land_use,industrial areas
3,cur_land_use,roads/railroads
4,cur_land_use,rock
...,...,...
83,tillage,zonal tillage
84,tillage,chisel
85,tillage,tined
86,tillage,mouldboard


In [12]:
for i in section_values_set:
    temp = empty_dict.copy()
    temp["label"] = i
    temp = temp.copy()
    headers_list.append(temp)

In [13]:
rows_list = headers_list + rows_list
# rows_list = rows_list + reshape_frame

soil_template = pd.DataFrame(rows_list)

soil_template = pd.concat([soil_template, reshape_frame])
soil_template
soil_template.fillna("")

Unnamed: 0,Ontology ID,parent class,label,datatype,source,data status,requirement,min value,max value,capitalize,description,guidance,examples
0,,,Identifiers,,,,,,,,,,
1,,Identifiers,ID,xs:unique,,,required,,,,,,
2,,,environment field,,,,,,,,,,
3,,,core field,,,,,,,,,,
4,MIXS:0000639,core field,agrochem_addition,xs:token,,,,,,,"Addition of fertilizers, pesticides, etc. - am...",Expected value: agrochemical name;agrochemical...,roundup;5 milligram per liter;2018-06-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,,tillage,zonal tillage,,,,,,,,,,
84,,tillage,chisel,,,,,,,,,,
85,,tillage,tined,,,,,,,,,,
86,,tillage,mouldboard,,,,,,,,,,


In [14]:
soil_template.to_clipboard(index=False)
soil_template.to_csv(data_tsv_path, sep="\t", index=False)

In [None]:
# templist = []
# for ss in soil_slots:
#     tempdict = {}
#     current_term = mixs_yaml_view.induced_slot(ss, "soil")
#     for i in current_term:
#         #         print(i)
#         #         print(current_term[i])
#         tempdict[i] = current_term[i]
#     #     print(tempdict)
#     tdc = tempdict.copy()
#     templist.append(tdc)
# tempframe = pd.DataFrame(templist)
# tempframe.to_clipboard(index=False)