In [None]:
import pandas as pd
import os

from linkml_runtime.utils.schemaview import SchemaView
import json

In [None]:
# --- inputs ---
# record the queries that generated these TSVs
# (rows with env package values only)
# local checkout: /Users/MAM/Documents/gitrepos/biosample-analysis/...
biosample_repo_root = "../../gitrepos/biosample-analysis"

# Both TSVs live under the repo's target/ directory.
hn_long_file = os.path.join(
    biosample_repo_root, "target/get_harmonized-values_all_has_env_pack.tsv"
)
bs_non_attribute_file = os.path.join(
    biosample_repo_root, "target/non-bsattribute-columns_has_env_pack.tsv"
)

# manual mappings, starting with XXX
env_package_mapping_file = "../../biosample-basex/notebooks/raw_env_package_mappings_only_20210924.tsv"

# MIxS schema definition that the schema view is built from
mixs_model_file = "../../mixs-source/model/schema/mixs.yaml"

# --- outputs ---
soil_slot_column_analysis_file = "soil_slot_column_analysis.json"
mixs_package_slots_file = "mixs_package_slots.tsv"

`montana_soil_columns` comes from:
- All green and red sections from the `Metadata` tab of Example-Soil_NMDC_SampleMetadata_soil_slots.xlsx
- A search of the `MenuTerms` tab for appearances of the word 'soil', for example next to:
    - elev
    - depth
    - treatment (also look for subsumed slots, like chem_administration, watering_regm, air_temp_regm...)
    - samp_collect_device
    - size_fract (also look for subsumed slots)

In [None]:
# Column names gathered as described in the markdown note above this cell.
# Further down they are compared with the MIxS "soil" class slots
# (set differences and intersection).
montana_soil_columns = [
    "agrochem_addition",
    "al_sat",
    "al_sat_meth",
    "annual_precpt",
    "annual_temp",
    "biotic_relationship",
    "crop_rotation",
    "cur_land_use",
    "cur_vegetation",
    "cur_vegetation_meth",
    "drainage_class",
    "extreme_event",
    "extreme_salinity",
    "fao_class",
    "fire",
    "flooding",
    "heavy_metals",
    "heavy_metals_meth",
    "horizon",
    "horizon_meth",
    "lab_ID",
    "link_addit_analys",
    "link_class_info",
    "link_climate_info",
    "local_class",
    "local_class_meth",
    "microbial_biomass",
    "microbial_biomass_meth",
    "misc_param",
    "oxy_stat_samp",
    "ph",
    "ph_meth",
    "previous_land_use",
    "previous_land_use_meth",
    "profile_position",
    "salinity_meth",
    "season_precpt",
    "season_temp",
    "slope_aspect",
    "slope_gradient",
    "soil_type",
    "soil_type_meth",
    "texture",
    "texture_meth",
    "tillage",
    "tot_nitro_content",
    "tot_nitro_content_meth",
    "tot_org_c_meth",
    "tot_org_carb",
    "unique_ID",
    "water_content",
    "water_content_meth",
    # The next entries match the MenuTerms search terms named in the note above
    # (depth, elev, samp_collect_device, size_fract, treatment).
    "depth",
    "elev",
    "samp_collect_device",
    "size_fract",
    # presumably the slots subsumed by size_fract — TODO confirm
    "filter_size",
    "filter_type",
    "sieving",
    "treatment",
    # presumably the slots subsumed by treatment (the note names
    # chem_administration, watering_regm and air_temp_regm) — TODO confirm the rest
    "air_temp_regm",
    "biotic_regm",
    "chem_administration",
    "climate_environment",
    "gaseous_environment",
    "humidity_regm",
    "isotope_exposure",
    "light_regm",
    "watering_regm",
    # NOTE(review): the remaining names are not alphabetized like the first run;
    # they may come from a different spreadsheet section — confirm.
    "ID",
    "investigation_type",
    "package",
    "sample_name",
    "source_mat_ID",
    "collection_date",
    "env_broad_scale",
    "env_local_scale",
    "env_medium",
    "geo_loc_name",
    "gold_ecosystem",
    "growth_facil",
    "lat_lon",
    "microbiome_taxonomy",
    "samp_mat_process",
    "samp_size",
    "samp_store_temp",
    "store_cond",
]

In [None]:
def get_mixs_slots_by_class(mixs_class, package_slots=None):
    """Return the sorted slot names recorded for one MIxS class.

    Parameters
    ----------
    mixs_class : str
        Value matched exactly against the ``class`` column.
    package_slots : pandas.DataFrame, optional
        Table with at least ``class`` and ``slot`` columns. When omitted, the
        notebook-level ``mixs_package_slots`` frame is used, so that frame must
        already exist when the function is called.

    Returns
    -------
    list
        The ``slot`` values of the matching rows in ascending order; an empty
        list when no row matches.
    """
    if package_slots is None:
        # Looked up at call time: the global is created in a later cell than
        # this definition, so it cannot be bound as a default argument.
        package_slots = mixs_package_slots
    is_requested_class = package_slots["class"].eq(mixs_class)
    return sorted(package_slots.loc[is_requested_class, "slot"])


def mixs_pack_slot_diff(a, b):
    """Return, sorted, the slots recorded for class ``a`` but not for class ``b``."""
    slots_of_a = get_mixs_slots_by_class(a)
    slots_of_b = set(get_mixs_slots_by_class(b))
    # The set comprehension also collapses any repeated slot names in ``a``.
    return sorted({slot for slot in slots_of_a if slot not in slots_of_b})


def mixs_pack_slot_intersection(a, b):
    """Return, sorted, the slots that class ``a`` and class ``b`` have in common."""
    slots_of_a = set(get_mixs_slots_by_class(a))
    slots_of_b = set(get_mixs_slots_by_class(b))
    return sorted(slots_of_a & slots_of_b)

In [None]:
# Build a SchemaView over the MIxS schema file; later cells query it for
# classes, induced slots and the full slot list.
mixs_view = SchemaView(mixs_model_file)

In [None]:
# soil_subclasses = mixs_view.class_children('soil')

In [None]:
# mixs_view.get_class('soil ME').is_a

In [None]:
# mixs_view.get_class('soil ME').mixins

In [None]:
# x = mixs_view.class_induced_slots('soil ME')
# # with open("soil_ME_induced_slots.json", "w") as fp:
# #     json.dump(x, fp)
# for y in x:
#     print(y.name)

In [None]:
# all_classes() is treated as a mapping here: its keys are the class names that
# the next cell passes to class_induced_slots().
mixs_classes = mixs_view.all_classes()
mixs_classes_keys = mixs_classes.keys()

In [None]:
# One row per (class, induced slot) pair for every class name in mixs_classes_keys.
outer_list = [
    [class_name, induced_slot.name, induced_slot.required, induced_slot.recommended]
    for class_name in mixs_classes_keys
    for induced_slot in mixs_view.class_induced_slots(class_name)
]

# Naming the columns in the constructor keeps the four headers even when
# outer_list is empty; assigning .columns afterwards raises a length-mismatch
# ValueError on the resulting zero-column frame.
mixs_package_slots = pd.DataFrame(
    outer_list, columns=["class", "slot", "required", "recommended"]
)
# Persist the table as a tab-separated file without the row index.
mixs_package_slots.to_csv(mixs_package_slots_file, sep="\t", index=False)

In [None]:
# soil_not_core = mixs_pack_slot_diff("soil", "core")
# # soil_not_core

In [None]:
# core_not_soil = mixs_pack_slot_diff("core", "soil")
# # core_not_soil

---

In [None]:
# Sorted slot names recorded for the "soil" class in mixs_package_slots.
mixs_soil_slots = get_mixs_slots_by_class("soil")

In [None]:
# Spreadsheet columns with no same-named slot on the MIxS soil class, sorted.
montana_only = sorted(set(montana_soil_columns).difference(mixs_soil_slots))

In [None]:
# MIxS soil slots with no same-named spreadsheet column, sorted.
mixs_slot_only = sorted(set(mixs_soil_slots).difference(montana_soil_columns))

In [None]:
# Names present both as a spreadsheet column and as a MIxS soil slot (a set).
soil_slot_intersection = set(montana_soil_columns) & set(mixs_soil_slots)

In [None]:
# Every slot name the schema view reports, regardless of class, in sorted order.
all_model_slots = sorted(mixs_view.all_slots().keys())

In [None]:
# Columns missing from the soil class that the model nevertheless defines as slots.
montana_only_defined = set(montana_only) & set(all_model_slots)

In [None]:
# Columns missing from the soil class that match no slot name in the model at all.
montana_only_novel = set(montana_only).difference(all_model_slots)

In [None]:
# Summary of how the spreadsheet columns line up with the MIxS soil slots.
# Sets iterate in no guaranteed order, so every set-derived member list is
# sorted to keep the serialized output identical from run to run.
slot_column_analysis = {
    "montana_only": {
        "mixs_defined": sorted(montana_only_defined),
        "montana_novel": sorted(montana_only_novel),
    },
    # mixs_slot_only is already a sorted list; list() just takes a copy.
    "mixs_slot_only": list(mixs_slot_only),
    "montana_mixs_soil_intersection": sorted(soil_slot_intersection),
}

In [None]:
# Write the analysis as indented JSON so the file can be read and curated by
# hand; the parsed content is the same as the compact form.
with open(soil_slot_column_analysis_file, "w") as fp:
    json.dump(slot_column_analysis, fp, indent=2)

In [None]:
# sort and beautify/wrap
# curate reasons for mismatches