In [1]:
from __future__ import print_function
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials

import pandas as pd
import re
import os
from shutil import copyfile

from linkml_runtime.utils.schemaview import SchemaView

In [2]:
blank_row = {
    "Ontology ID": "",
    "parent class": "",
    "label": "",
    "datatype": "",
    "source": "",
    "data status": "",
    "requirement": "",
    "min value": "",
    "max value": "",
    "capitalize": "",
    "pattern": "",
    "description": "",
    "guidance": "",
    "examples": "",
}

In [3]:
# required columns can be asserted without putting them in a section entitled "required" etc.
#   ie it might be possible to use the section for something orthogonal
required_sections = ["sample identification", "required", "required where applicable"]

In [4]:
dh_template_root   = "../../DataHarmonizer/template/"
dh_template_prefix = dh_template_root + "IoT_"
dh_template_suffix = "/data.tsv"
ref_temp_filename = "reference_template.html"

In [5]:
# for making a template based on MIxS slots
#   from https://github.com/cmungall/mixs-source
#   (GenomicsStandardsConsortium/mixs-source might be slightly out of sync)

# make sure to regenrate this form tiemto time
mixs_yaml_file = "../../mixs-source/model/schema/mixs.yaml"

Both use same sheet: 1QDeeUcDqXes69Y2RjU2aWgOpCVWo5OVsBX9MKmMqi_o

```
                Mungall      GSC
mixs6.tsv       345753674    750683809
mixs6_core.tsv  567040283    178015749
```

In [6]:
# Montana's "Index-of-Terms" sheet is in XLSX format:
#   https://docs.google.com/spreadsheets/d/1R5w5tcIGm9dDRhC4H6NmhcHV7J5zx7FQ/edit#gid=1133203354
# I copied and converted to Google Sheet format in MAM@lbl.gov google drive
#   https://docs.google.com/spreadsheets/d/1lj4OuEE4IYwy2v7RzcG79lHjNdFwmDETMDTDaRAWojY/edit#gid=1133203354

# added mixs_6_slot_name column
#   blank if no corresponding mixs_6_slot
#   different from "name" column if "name" column can be repaired

# there's also a "Controlled Terms" tab

In [7]:
SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly"]

# The ID and range of a sample spreadsheet.
IOT_SPREADSHEET_ID = "1lj4OuEE4IYwy2v7RzcG79lHjNdFwmDETMDTDaRAWojY"
IOT_RANGE_NAME = "Glossary of terms!A1:Z"

CV_RANGE_NAME = "Controlled Terms!A1:Z"

# resuing google_api_credentials.json from https://github.com/cancerDHC/sheet2linkml
client_secret_flie = "../../sheet2linkml/google_api_credentials.json"

In [8]:
creds = None
# The file token.json stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists("token.json"):
    creds = Credentials.from_authorized_user_file("token.json", SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(client_secret_flie, SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open("token.json", "w") as token:
        token.write(creds.to_json())

In [9]:
service = build("sheets", "v4", credentials=creds)

# Call the Sheets API
sheet = service.spreadsheets()

In [10]:
result = (
    sheet.values().get(spreadsheetId=IOT_SPREADSHEET_ID, range=IOT_RANGE_NAME).execute()
)

In [11]:
iot_glossary = pd.DataFrame(result["values"], columns=result["values"][0]).drop(0)

In [12]:
# iot_glossary

In [13]:
controlled_terms = (
    sheet.values().get(spreadsheetId=IOT_SPREADSHEET_ID, range=CV_RANGE_NAME).execute()
)

In [14]:
ctdf = pd.DataFrame(
    controlled_terms["values"], columns=controlled_terms["values"][0]
).drop(0)

In [15]:
ct_dol = {k: [i for i in v if i] for (k, v) in ctdf.items()}

In [16]:
# ct_dol

In [17]:
ct_keys = list(ct_dol.keys())
ct_keys.sort()
# ct_keys

## Compare to MIxS

In [18]:
# for which is the "column header" different from the display name?
# for which is the "definition" different from ???
# guidance

In [19]:
mixs_yaml_view = SchemaView(mixs_yaml_file)

In [20]:
# which "name" values aren't MiXS slots at all?
# could be minor spelling differences etc.

all_mixs_slots_dict = mixs_yaml_view.all_slots()
ams_keys = list(all_mixs_slots_dict.keys())
ams_keys.sort()

In [21]:
all_iot_slots = iot_glossary["name"]

iot_slots_only = list(set(all_iot_slots) - set(ams_keys))

iot_slots_only.sort()

iot_slots_only

['?? Incubation collection date',
 '?? Incubation start date',
 'extreme_salinity',
 'host_infra_spec_name',
 'host_infra_spec_rank',
 'investigation_type',
 'isotope_exposure',
 'other',
 'package',
 'sample_size',
 'source_mat_ID',
 'tot_nitro_content_meth',
 'unique_ID',
 'water_content_meth']

```
 '?? Incubation collection date',
 '?? Incubation start date',
 'extreme_salinity',
 'host_infra_spec_name', host_infra_specific_name
 'host_infra_spec_rank', host_infra_specific_rank
 'investigation_type',
 'isotope_exposure',
 'other',
 'package',
 'sample_size', samp_size
 'source_mat_ID', source_mat_id
 'tot_nitro_content_meth', tot_n_meth
 'unique_ID',
 'water_content_meth' water_cont_soil_meth
```

In [22]:
iot_undefined_slots_followup = iot_glossary.loc[
    iot_glossary["name"].isin(iot_slots_only)
]

In [23]:
iot_undefined_slots_followup

Unnamed: 0,Column Header,name,mixs_6_slot_name,Definition,Guidance,Expected Value,syntax,Category,Associated Packages,Origin,Notes,GitHub Ticket
1,Unique ID,unique_ID,,A globally unique identifier assigned to the b...,Field REQUIRED for ALL sample submission. Opti...,,{text},sample identification,all,,v6 has unique ID and sample name synonymous. B...,
3,Analysis/Data Type,investigation_type,,,This field is constrained to contain only a se...,,drop down selection list,sample identification,all,MIxS,"enumeration values are EMSL origin, MIxS has d...",
4,Sample Type,package,,,MIxS Package,,{text},sample identification,all,MIxS,"This is assumed by the package, but I think sh...",
5,Source Material ID,source_mat_ID,source_mat_id,A unique identifier assigned to a material sam...,A unique identifier assigned to an original ma...,,{text},sample identification,all,MIxS,New v6 definition is better to indicate bio sa...,
8,,?? Incubation collection date,,,,,,,,,Need to come up with a term for this or how we...,
9,,?? Incubation start date,,,,,,,,,,
13,Sample size,sample_size,samp_size,"The total amount or size (volume (ml), mass (g...",,,{value}{text},required,all,MIxS,I need how much sample is sent to EMSL. This i...,
22,isotope exposure/addition treatment,isotope_exposure,,,,,{text},required where applicable,soil; sediment; pore_water; plant_associated; ...,,,https://github.com/GenomicsStandardsConsortium...
27,treatment-other details,other,,,,,{text},required where applicable,soil; sediment; pore_water; plant_associated; ...,,link github ticket,
39,Other,other,other,,Other details about your sample that you feel ...,,{text},required where applicable,all,,I would like somewhere to caputre any informa...,


In [24]:
slot_check = iot_glossary[["name", "mixs_6_slot_name"]]
slot_check = slot_check.loc[slot_check["name"].ne(slot_check["mixs_6_slot_name"])]

In [25]:
slot_check.to_clipboard(index=False)

In [26]:
iot_package_slots = iot_glossary[["mixs_6_slot_name", "Associated Packages"]]

In [27]:
iot_package_slots = iot_package_slots.loc[iot_package_slots["mixs_6_slot_name"].ne("")]
iot_package_slots = iot_package_slots.loc[
    ~iot_package_slots["mixs_6_slot_name"].isnull()
]

In [28]:
iot_package_slots.to_clipboard()

In [29]:
iot_package_slots = iot_package_slots.explode("Associated Packages")

In [30]:
all_package_slots = list(
    iot_package_slots["mixs_6_slot_name"].loc[
        iot_package_slots["Associated Packages"].eq("all")
    ]
)
all_package_slots.sort()

In [31]:
all_package_slots

['collection_date',
 'growth_facil',
 'other',
 'samp_collect_device',
 'samp_mat_process',
 'samp_name',
 'samp_size',
 'samp_store_temp',
 'source_mat_id',
 'store_cond']

In [32]:
all_package_slots_string = ";".join(all_package_slots)
all_package_slots_string

'collection_date;growth_facil;other;samp_collect_device;samp_mat_process;samp_name;samp_size;samp_store_temp;source_mat_id;store_cond'

In [33]:
selective_package_slots = iot_package_slots.loc[
    iot_package_slots["Associated Packages"].ne("all")
]

In [34]:
selective_package_slots["splitted"] = selective_package_slots[
    "Associated Packages"
].str.split(" *; *", expand=False)
selective_package_slots = selective_package_slots[["mixs_6_slot_name", "splitted"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selective_package_slots["splitted"] = selective_package_slots[


In [35]:
selective_package_slots = selective_package_slots.explode("splitted")
selective_package_slots

Unnamed: 0,mixs_6_slot_name,splitted
14,geo_loc_name,soil
14,geo_loc_name,sediment
14,geo_loc_name,pore_water
14,geo_loc_name,plant_associated
14,geo_loc_name,water
...,...,...
246,water_cont_soil_meth,soil
246,water_cont_soil_meth,sediment
247,water_current,pore_water
247,water_current,water


In [36]:
packagewise = (
    selective_package_slots.astype(str)
    .groupby("splitted", as_index=False)
    .agg(";".join)[selective_package_slots.columns]
)
packagewise = packagewise[["splitted", "mixs_6_slot_name"]]
packagewise.columns = ["package", "mixs_6_slot_name"]

In [37]:
packagewise["mixs_6_slot_name"] = (
    packagewise["mixs_6_slot_name"].astype(str) + ";" + all_package_slots_string
)

In [38]:
packagewise

Unnamed: 0,package,mixs_6_slot_name
0,culture_environmental,geo_loc_name;lat_lon;elev;chem_administration;...
1,mixed_culture,chem_administration;watering_regm;air_temp_reg...
2,plant_associated,geo_loc_name;lat_lon;elev;chem_administration;...
3,pore_water,geo_loc_name;lat_lon;elev;depth;chem_administr...
4,pure_culture,chem_administration;watering_regm;air_temp_reg...
5,sediment,geo_loc_name;lat_lon;elev;depth;chem_administr...
6,soil,geo_loc_name;lat_lon;elev;depth;chem_administr...
7,water,geo_loc_name;lat_lon;elev;depth;chem_administr...
8,water_extract_biosolid,geo_loc_name;lat_lon;elev;chem_administration;...
9,water_extract_soil,geo_loc_name;lat_lon;elev;depth;chem_administr...


In [39]:
iot_packages = list(packagewise["package"])
iot_packages.sort()

In [40]:
iot_packages

['culture_environmental',
 'mixed_culture',
 'plant_associated',
 'pore_water',
 'pure_culture',
 'sediment',
 'soil',
 'water',
 'water_extract_biosolid',
 'water_extract_soil']

In [41]:
mixs_all_classes = mixs_yaml_view.all_classes()
mixs_all_classes = list(mixs_all_classes.keys())
mixs_all_classes.sort()

In [42]:
mixs_root_classes = mixs_yaml_view.class_roots()
mixs_root_classes.sort()

In [43]:
mixers = []
for i in mixs_all_classes:
    #     print(i)
    checkee = mixs_yaml_view.get_class(i)
    mixers = mixers + checkee.mixins

mixers = list(set(mixers))
mixers.sort()
mixers

['MIGS bacteria',
 'MIGS eukaryote',
 'MIGS org',
 'MIGS plant',
 'MIGS virus',
 'MIMAG',
 'MIMARKS specimen',
 'MIMARKS survey',
 'MIMS',
 'MISAG',
 'MIUVIG']

In [44]:
mixs_unmixed_roots = list(set(mixs_root_classes) - set(mixers))
mixs_unmixed_roots.sort()
mixs_unmixed_roots

['agriculture',
 'air',
 'built environment',
 'core',
 'food-animal and animal feed',
 'food-farm environment',
 'food-food production facility',
 'food-human foods',
 'host-associated',
 'human-associated',
 'human-gut',
 'human-oral',
 'human-skin',
 'human-vaginal',
 'hydrocarbon resources-cores',
 'hydrocarbon resources-fluids_swabs',
 'microbial mat_biofilm',
 'miscellaneous natural or artificial environment',
 'plant-associated',
 'quantity value',
 'sediment',
 'soil',
 'symbiont-associated',
 'wastewater_sludge',
 'water']

could exclude core package

In [45]:
iot_only_packages = list(set(iot_packages) - set(mixs_unmixed_roots))
iot_only_packages.sort()
iot_only_packages

['culture_environmental',
 'mixed_culture',
 'plant_associated',
 'pore_water',
 'pure_culture',
 'water_extract_biosolid',
 'water_extract_soil']

In [46]:
shared_packages = list(set(iot_packages).intersection(set(mixs_unmixed_roots)))
shared_packages.sort()

In [47]:
shared_packages

['sediment', 'soil', 'water']

In [48]:
slot_usage_differences = {}
for i in shared_packages:
    print(i)
    # assume one matching row
    ii = packagewise["mixs_6_slot_name"].loc[packagewise["package"].eq(i)]
    ii = ii.iloc[0]
    ii = re.split(" *; *", ii)
    ii.sort()
    im = mixs_yaml_view.class_induced_slots(i)
    im = [x.name for x in im]
    im.sort()
    i_only = list(set(ii) - set(im))
    i_only.sort()
    m_only = list(set(im) - set(ii))
    m_only.sort()
    slot_usage_differences[i] = {"IoT only": i_only, "MIxS 6 only": m_only}

slot_usage_differences

sediment
soil
water


{'sediment': {'IoT only': ['air_temp_regm',
   'alkalinity_method',
   'biotic_regm',
   'biotic_relationship',
   'climate_environment',
   'extreme_event',
   'fire',
   'flooding',
   'gaseous_environment',
   'growth_facil',
   'humidity_regm',
   'light_regm',
   'microbial_biomass',
   'microbial_biomass_meth',
   'other',
   'ph_meth',
   'samp_collect_device',
   'samp_mat_process',
   'samp_name',
   'samp_size',
   'sieving',
   'source_mat_id',
   'store_cond',
   'tot_n_meth',
   'tot_org_c_meth',
   'water_cont_soil_meth',
   'watering_regm'],
  'MIxS 6 only': ['env_broad_scale',
   'env_local_scale',
   'env_medium',
   'samp_store_dur',
   'samp_store_loc']},
 'soil': {'IoT only': ['air_temp_regm',
   'biotic_regm',
   'biotic_relationship',
   'chem_administration',
   'climate_environment',
   'gaseous_environment',
   'growth_facil',
   'horizon',
   'humidity_regm',
   'light_regm',
   'microbial_biomass_meth',
   'other',
   'oxy_stat_samp',
   'previous_land_use_me

In [49]:
iot_glossary["Expected Value"].value_counts()

    247
Name: Expected Value, dtype: int64

In [50]:
syntax_frame = (
    iot_glossary["syntax"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)

In [51]:
syntax_frame.to_clipboard(index=False)

In [52]:
category_frame = (
    iot_glossary["Category"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)

In [53]:
category_frame

Unnamed: 0,unique_values,counts
0,,210
1,required where applicable,26
2,required,6
3,sample identification,5


In [54]:
origin_frame = (
    iot_glossary["Origin"]
    .value_counts()
    .rename_axis("unique_values")
    .reset_index(name="counts")
)

In [55]:
origin_frame

Unnamed: 0,unique_values,counts
0,MIxS,242
1,,5


## Start building DataHarmonizer templates

### take all packages and slots, but repair typos in slot names


In [56]:
slot_to_pack_4_dh = iot_glossary[["name", "mixs_6_slot_name", "Associated Packages"]]

In [57]:
slot_to_pack_4_dh["ap_list"] = slot_to_pack_4_dh["Associated Packages"].str.split(
    " *; *", expand=False
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slot_to_pack_4_dh["ap_list"] = slot_to_pack_4_dh["Associated Packages"].str.split(


In [58]:
iot_packages = list(slot_to_pack_4_dh["ap_list"])
iot_packages = [i for i in iot_packages if i]
iot_packages = sum(iot_packages, [])
iot_packages = list(set(iot_packages))
iot_packages.sort()
iot_packages.remove("")
iot_packages.remove("all")

In [59]:
slot_to_pack_4_dh["repaired_name"] = slot_to_pack_4_dh["mixs_6_slot_name"]

slot_to_pack_4_dh["repaired_name"].loc[
    slot_to_pack_4_dh["repaired_name"] == ""
] = slot_to_pack_4_dh["name"].loc[slot_to_pack_4_dh["repaired_name"] == ""]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slot_to_pack_4_dh["repaired_name"] = slot_to_pack_4_dh["mixs_6_slot_name"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [60]:
# slot_to_pack_4_dh["ap_list"].loc[
#     slot_to_pack_4_dh["Associated Packages"] == "all"
# ] = iot_packages

slot_to_pack_4_dh["ap_list"].loc[
    slot_to_pack_4_dh["Associated Packages"] == "all"
] = slot_to_pack_4_dh["Associated Packages"].apply(lambda _: iot_packages)

# slot_to_pack_4_dh["Associated Packages"].apply(lambda _: iot_packages)

In [61]:
slot_to_pack_4_dh = slot_to_pack_4_dh[["repaired_name", "ap_list"]]

In [62]:
slot_to_pack_4_dh = slot_to_pack_4_dh.explode("ap_list")

In [63]:
slot_to_pack_4_dh = (
    slot_to_pack_4_dh.astype(str)
    .groupby("ap_list", as_index=False)
    .agg(";".join)[slot_to_pack_4_dh.columns]
)

In [64]:
slot_to_pack_4_dh = slot_to_pack_4_dh.loc[slot_to_pack_4_dh["ap_list"].ne("")]

In [65]:
slot_to_pack_4_dh = slot_to_pack_4_dh.loc[slot_to_pack_4_dh["ap_list"].ne("None")]

In [66]:
slot_to_pack_4_dh = slot_to_pack_4_dh.loc[~slot_to_pack_4_dh["ap_list"].isnull()]

In [67]:
slot_to_pack_4_dh = slot_to_pack_4_dh[["ap_list", "repaired_name"]]
slot_to_pack_4_dh.columns = ["package", "slots"]

In [68]:
# slot_to_pack_4_dh

In [69]:
slot_details_4_dh = iot_glossary.copy()
slot_details_4_dh.drop(
    labels=["Associated Packages", "Expected Value"], axis=1, inplace=True
)

slot_details_4_dh["repaired_name"] = slot_details_4_dh["mixs_6_slot_name"]

slot_details_4_dh["repaired_name"].loc[
    slot_details_4_dh["repaired_name"] == ""
] = slot_details_4_dh["name"].loc[slot_details_4_dh["repaired_name"] == ""]

In [70]:
# slot_details_4_dh

In [71]:
slot_details_4_dh["Category"].loc[slot_details_4_dh["Category"] == ""] = "optional"
slot_details_4_dh["Category"].loc[slot_details_4_dh["Category"].isnull()] = "optional"

In [72]:
# slot_details_4_dh

In [73]:
slot_categories = list(set(list(slot_details_4_dh["Category"])))

In [74]:
slot_categories

['sample identification', 'required where applicable', 'optional', 'required']

# Reconcile these

## Index of Terms

- Column Header
- name -> repaired_name
- Definition
- Guidance
- Expected Value
- syntax
- Category
- Associated Packages
- Origin
- Notes
- GitHub Ticket


## DataHarmonizer
- Ontology ID
- parent class (section)
- label
- datatype
- ~source~
- data status (semicolon separated list of options like "not collected", "missing", not applicable")
- requirement ("", "required" or "recommended")
- min value
- max value
- capitalize
- pattern
- description
- guidance
- examples


In [75]:
def template_package(
    current_package,
    slot_to_package_df,
    slot_details_df,
    enums_dict,
    template_prefix,
    template_suffix,
):
    print(current_package)

    main_row_list = []
    enum_row_list = []

    for i in slot_categories:
        print(i)
        current_row = blank_row.copy()
        current_row["label"] = i
        main_row_list.append(current_row)

    package_slots = slot_to_pack_4_dh["slots"].loc[
        slot_to_pack_4_dh["package"] == current_package
    ]
    package_slots = package_slots.iloc[0]
    package_slots = package_slots.split(";")

    for i in package_slots:
        print(i)
        current_details = slot_details_4_dh.loc[slot_details_4_dh["repaired_name"] == i]
        current_row = blank_row.copy()
        #     "Ontology ID"
        current_row["parent class"] = current_details["Category"].iloc[0]
        current_row["label"] = i
        current_row["datatype"] = "xs:token"
        # day resolution may not be specific enough
        if current_details["syntax"].iloc[0] == "{timestamp}":
            current_row["datatype"] = "xs:date"
        if current_details["syntax"].iloc[0] == "{float}":
            current_row["datatype"] = "xs:decimal"
        if current_details["syntax"].iloc[0] == "{value}":
            current_row["datatype"] = "xs:decimal"
        # {integer} doesn't actually = xs:nonNegativeInteger
        if current_details["syntax"].iloc[0] == "{integer}":
            current_row["datatype"] = "xs:nonNegativeInteger"
        if i == "unique_ID":
            current_row["datatype"] = "xs:unique"
        if current_details["syntax"].iloc[0] == "{float} {unit}":
            current_row["pattern"] = "^[+-]?([0-9]*[.])?[0-9]+ \S+$"
        #     "source": "",
        #     "data status": "",
        if current_details["Category"].iloc[0] in required_sections:
            current_row["requirement"] = "required"
        #     "min value": "",
        #     "max value": "",
        #     "capitalize": "",
        current_row["description"] = current_details["Column Header"].iloc[0]
        current_row["guidance"] = current_details["Guidance"].iloc[0]
        current_row["examples"] = current_details["syntax"].iloc[0]
        if i in ct_keys:
            current_row["datatype"] = "select"
            # map?
            # indent?
            current_enums = ct_dol[i]
            current_enums.sort()
            for j in current_enums:
                print("    " + j)
                current_enum_row = blank_row.copy()
                current_enum_row["label"] = j
                current_enum_row["parent class"] = i
                enum_row_list.append(current_enum_row)
        main_row_list.append(current_row)
    print("\n")

    current_frame = pd.DataFrame(main_row_list)
    enum_frame = pd.DataFrame(enum_row_list)
    assembled_frame = pd.concat([current_frame, enum_frame])

    # create directory if necessary
    required_directory = dh_template_prefix + current_package
    # print(required_directory)
    os.makedirs(required_directory, exist_ok=True)
    # copy from soil directory to new required_directory?
    ref_temp_src = dh_template_root + ref_temp_filename
    ref_temp_dest = required_directory + "/" + ref_temp_filename
    copyfile(ref_temp_src, ref_temp_dest)

    ref_temp_src = dh_template_root + "export.js"
    ref_temp_dest = required_directory + "/" + "export.js"
    copyfile(ref_temp_src, ref_temp_dest)

    # should escape characters that break filenames like whitespaces
    current_template_file = required_directory + dh_template_suffix
    assembled_frame.to_csv(current_template_file, index=False, sep="\t")

In [76]:
# template_package(
#     "soil",
#     slot_to_package_df=slot_to_pack_4_dh,
#     slot_details_df=slot_details_4_dh,
#     enums_dict=ct_dol,
#     template_prefix=dh_template_prefix,
#     template_suffix=dh_template_suffix,
# )

In [77]:
for current_package in iot_packages:
    #     print(current_package)
    template_package(
        current_package,
        slot_to_package_df=slot_to_pack_4_dh,
        slot_details_df=slot_details_4_dh,
        enums_dict=ct_dol,
        template_prefix=dh_template_prefix,
        template_suffix=dh_template_suffix,
    )

culture_environmental
sample identification
required where applicable
optional
required
unique_ID


samp_name


investigation_type
    chemical speciation/mapping
    genome
    imaging- electron
    imaging- ion
    imaging- light
    lipidome
    metabolome
    molecular structure
    organic matter
    proteome
    transcriptome


package


source_mat_id


growth_facil
    experimental_garden
    field
    field_incubation
    greenhouse
    growth_chamber
    lab_incubation
    open_top_chamber
    other


collection_date


samp_mat_process


store_cond
    fresh
    frozen
    lyophilized
    other


samp_store_temp
    -20 degree Celsius
    -80 degree Celsius
    4 degree Celsius
    other
    room temperature


samp_size


geo_loc_name


lat_lon


elev


chem_administration


watering_regm


air_temp_regm


gaseous_environment


isotope_exposure


climate_environment


humidity_regm


light_regm


biotic_regm


other


size_frac_low


size_frac_up


size_frac


sieving


samp_c



temp


tidal_stage
    ebb tide
    flood tide
    high tide
    low tide


tot_depth_water_col


tot_diss_nitro


tot_inorg_nitro


tot_nitro


tot_part_carb


tot_phosp


turbidity


water_current


pure_culture
sample identification
required where applicable
optional
required
unique_ID


samp_name


investigation_type
    chemical speciation/mapping
    genome
    imaging- electron
    imaging- ion
    imaging- light
    lipidome
    metabolome
    molecular structure
    organic matter
    proteome
    transcriptome


package


source_mat_id


growth_facil
    experimental_garden
    field
    field_incubation
    greenhouse
    growth_chamber
    lab_incubation
    open_top_chamber
    other


collection_date


samp_mat_process


store_cond
    fresh
    frozen
    lyophilized
    other


samp_store_temp
    -20 degree Celsius
    -80 degree Celsius
    4 degree Celsius
    other
    room temperature


samp_size


chem_administration


watering_regm


air_temp_regm


gaseous_env


diss_inorg_phosp


diss_org_carb


diss_org_nitro


diss_oxygen


down_par


fluor


glucosidase_act


light_intensity


magnesium


mean_frict_vel


mean_peak_frict_vel


misc_param


n_alkanes


nitrate


nitrite


nitro


org_carb


org_matter


org_nitro


organism_count


oxy_stat_samp
    aerobic
    anaerobic
    anoxic
    facultative
    microaerophilic
    microanaerobe
    obligate aerobe
    obligate anaerobe


part_org_carb


part_org_nitro


perturbation


petroleum_hydrocarb


ph


ph_meth


phaeopigments


phosphate


phosplipid_fatt_acid


photon_flux


potassium


pressure


primary_prod


redox_potential


salinity


silicate


size_frac_low


size_frac_up


sodium


soluble_react_phosp


sulfate


sulfide


suspend_part_matter


temp


tidal_stage
    ebb tide
    flood tide
    high tide
    low tide


tot_depth_water_col


tot_diss_nitro


tot_inorg_nitro


tot_nitro


tot_part_carb


tot_phosp


turbidity


water_current


water_extract_biosolid
sample identific

Add to `TEMPLATES` constant at top of `DataHarmonizer/main/script.js`

In [78]:
for current_package in iot_packages:
    print(
        "'Index of Terms "
        + current_package
        + "':    {'folder': 'IoT_"
        + current_package
        + "', 'status': 'published'},"
    )

'Index of Terms culture_environmental':    {'folder': 'IoT_culture_environmental', 'status': 'published'},
'Index of Terms mixed_culture':    {'folder': 'IoT_mixed_culture', 'status': 'published'},
'Index of Terms plant_associated':    {'folder': 'IoT_plant_associated', 'status': 'published'},
'Index of Terms pore_water':    {'folder': 'IoT_pore_water', 'status': 'published'},
'Index of Terms pure_culture':    {'folder': 'IoT_pure_culture', 'status': 'published'},
'Index of Terms sediment':    {'folder': 'IoT_sediment', 'status': 'published'},
'Index of Terms soil':    {'folder': 'IoT_soil', 'status': 'published'},
'Index of Terms water':    {'folder': 'IoT_water', 'status': 'published'},
'Index of Terms water_extract_biosolid':    {'folder': 'IoT_water_extract_biosolid', 'status': 'published'},
'Index of Terms water_extract_soil':    {'folder': 'IoT_water_extract_soil', 'status': 'published'},


- Don't forget to run `DataHarmonizer/template/make_all.sh`
- note that the refernce templates (`reference_template.html`) are very generic
