In [1]:
import re
from datetime import datetime

from linkml.generators.linkmlgen import LinkmlGenerator
from linkml_runtime import SchemaView

import yaml

This approach only works because there are no slots with object (class) ranges in the MIxS Extensions, Checklists, and their combinations.

The same holds true for the NMDC submission schema, but the examples checker written for that schema injects each example value into an otherwise known-good instance and runs the LinkML validator on it.

**This does not check `any_of` ranges, etc**

In [2]:
# Raw GitHub URL of the MIxS schema (on a feature branch) whose examples are checked.
tested_schema_url = (
    "https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/"
    "refs/heads/388-we-are-misusing-the-string-serialization-slot/"
    "src/mixs/schema/mixs.yaml"
)

In [3]:
# Lower-case spellings accepted as stand-ins for Boolean values.
# Exercise these from YAML or JSON text: a YAML deserializer may infer/repair
# some of them into true Booleans before they are ever seen as strings.
boolean_proxies = {
    "true", "false",
    "yes", "no",
    "1", "0",
}

We will check against the following types, strings with patterns, and enums

In [4]:
# Range types the example checker expects to meet in the schema.
# Entries tagged "done" were marked that way by the notebook author.
expected_types = [
    "boolean",   # done
    "datetime",  # done
    "float",     # done
    "integer",   # done
    "string",    # not tagged "done" by the author
]

What boolean proxies do we want to allow in MIxS compliant data?

In [5]:
# File name of the YAML report written by the last cell of this notebook
# (relative, so it is created in the current working directory).
mixs_examples_report_yaml = "mixs_examples_report.yaml"

Should I really have written my own type validators?

In [6]:
def is_boolean_extended(s):
    """Tell whether ``s`` is one of the accepted Boolean spellings.

    The comparison is case-insensitive: ``s`` is lower-cased and looked up in
    the module-level ``boolean_proxies`` set.

    Args:
        s: Text to test; must provide ``.lower()`` (i.e. be a ``str``).

    Returns:
        bool: True when the lower-cased text is a member of ``boolean_proxies``.
    """
    lowered = s.lower()
    return lowered in boolean_proxies

In [7]:
def is_float(s):
    """Report whether ``s`` can be converted by the built-in ``float``.

    Anything ``float()`` accepts counts as valid, which includes exponent
    notation and the special spellings ``"nan"`` and ``"inf"``.

    Args:
        s: Candidate value, normally a string.

    Returns:
        bool: True if ``float(s)`` succeeds, False otherwise.  Values of an
        unsupported type (for example ``None``) yield False rather than
        raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: unparsable text; TypeError: not a string or number at all.
        return False
    return True


In [8]:
def is_iso8601(s):
    """Report whether ``s`` parses as an ISO 8601 date/datetime.

    Parsing is delegated to ``datetime.fromisoformat``.  Because Python
    versions before 3.11 reject the ``Z`` UTC designator, a timestamp ending
    in ``Z`` that fails to parse is retried with ``Z`` rewritten as the
    equivalent ``+00:00`` offset, so the verdict does not depend on the
    interpreter version.

    Args:
        s: Candidate value, normally a string.

    Returns:
        bool: True if the text parses, False otherwise.  Non-string input
        yields False rather than raising.
    """
    if not isinstance(s, str):
        return False

    candidates = [s]
    # Only retry when there is room for a time part after "YYYY-MM-DD" plus a
    # separator, so a bare date with a stray "Z" is not accidentally accepted.
    if len(s) > 11 and s.endswith("Z"):
        candidates.append(s[:-1] + "+00:00")

    for candidate in candidates:
        try:
            datetime.fromisoformat(candidate)
        except ValueError:
            continue
        return True
    return False


In [9]:
def matches_regex(s, pattern):
    """Tell whether the whole of ``s`` matches the regular expression ``pattern``.

    Args:
        s: Text to test.
        pattern: Regular expression, in Python ``re`` syntax.

    Returns:
        bool: True only if the pattern matches the entire string, not merely
        a prefix or substring.
    """
    whole_match = re.compile(pattern).fullmatch(s)
    return whole_match is not None

In [10]:
# Build a LinkML-to-LinkML generator over the remote schema, emitting YAML.
# NOTE(review): materialize_patterns=True presumably expands structured
# patterns into plain `pattern` values so they can be regex-matched later;
# confirm against the LinkmlGenerator documentation.
generator = LinkmlGenerator(
    tested_schema_url,
    format="yaml",
    materialize_patterns=True,
    materialize_attributes=False,
)

In [11]:
# Run the generator and keep its serialized output (presumably YAML text, given
# the generator's `format` setting) for loading into a SchemaView.
generated_yaml_string = generator.serialize()

In [12]:
# Wrap the generated schema text in a SchemaView, which is queried below for
# class descendants, induced classes and range elements.
generated_schema = SchemaView(generated_yaml_string)
# Author's timing note: 2 seconds (presumably the run time of this cell).

In [13]:
# Names of the classes descending from "Checklist" through is_a links only
# (mixins are not followed; reflexive=False presumably leaves out "Checklist"
# itself -- confirm against the SchemaView documentation).
checklist_names = {
    *generated_schema.class_descendants(
        class_name="Checklist",
        is_a=True,
        mixins=False,
        reflexive=False,
    )
}

In [14]:
# Names of the classes descending from "Extension" through is_a links only
# (mixins are not followed; reflexive=False presumably leaves out "Extension"
# itself -- confirm against the SchemaView documentation).
extension_names = {
    *generated_schema.class_descendants(
        class_name="Extension",
        is_a=True,
        mixins=False,
        reflexive=False,
    )
}

In [15]:
# Every class whose slot examples will be checked: checklists plus extensions.
relevant_class_names = checklist_names.union(extension_names)

In [16]:
# Materialize the names as a list so they can be ordered.
relevant_class_names = [*relevant_class_names]

In [17]:
# Sort in place so classes are processed, and findings reported, in a stable
# lexicographic order.
relevant_class_names.sort()

In [18]:
# Module-level result accumulators that validate_examples() appends to.
# Every entry is a dict identifying at least the "class" and "slot" concerned.
# Re-run this cell before re-running validate_examples(); otherwise findings
# from the previous run stay in the lists and are reported twice.
doesnt_match_enum_examples = []
doesnt_match_pattern_examples = []
unhandled_range_slot = []
unhandled_type_slot = []
doesnt_satisfy_type_examples = []
missing_example_slots = []
unconstrained_slots = []

Getting the type's class name and checking that in if statements seems pretty indirect

In [19]:
def _is_integer(s):
    """Return True if ``s`` is an optionally signed run of ASCII decimal digits."""
    return re.fullmatch(r"[+-]?[0-9]+", s) is not None


def validate_examples(relevant_class_names, generated_schema):
    """Check the examples of every slot of the given classes against its range.

    For each class name the induced class is fetched from ``generated_schema``
    and each of its attributes is inspected.  Findings are appended, as dicts
    carrying at least ``"class"`` and ``"slot"``, to these module-level lists:

    * ``missing_example_slots`` -- the slot has no examples.
    * ``doesnt_match_enum_examples`` -- enum range; the example is not one of
      the enum's permissible values.
    * ``doesnt_satisfy_type_examples`` -- boolean / datetime / float / integer
      range; the example fails the corresponding check.
    * ``doesnt_match_pattern_examples`` -- string range with a pattern; the
      example does not fully match the pattern.
    * ``unconstrained_slots`` -- string range with no pattern, so there is
      nothing to check the examples against.
    * ``unhandled_type_slot`` -- the range is a type that is not listed in
      ``expected_types`` (or has no checker here).
    * ``unhandled_range_slot`` -- the range is neither an enum nor a type
      (including a range that cannot be resolved in the schema).

    Example-level findings are recorded once per offending example;
    slot-level findings (the last three lists) once per class/slot pair.

    Args:
        relevant_class_names: Iterable of class names to inspect.
        generated_schema: Schema view providing ``induced_class`` and
            ``get_element``.

    Returns:
        None.  Results are delivered through the module-level lists.
    """
    # 'string' is deliberately absent: strings are checked against the slot's
    # pattern instead of a type validator.
    type_checkers = {
        'boolean': is_boolean_extended,
        'datetime': is_iso8601,
        'float': is_float,
        'integer': _is_integer,  # accepts signed values, unlike str.isdigit()
    }

    for c_name in relevant_class_names:
        ic = generated_schema.induced_class(c_name)
        class_name = str(ic.name)

        for icav in ic.attributes.values():
            slot_name = str(icav.name)
            examples = icav.examples

            if not examples:
                missing_example_slots.append({"class": class_name, "slot": slot_name})
                continue

            range_type = str(icav.range)
            range_element = generated_schema.get_element(range_type)
            # getattr keeps an unresolvable range (no ``class_name`` attribute)
            # from raising; it then falls through to ``unhandled_range_slot``.
            range_element_name = str(getattr(type(range_element), "class_name", None))

            # NOTE(review): an example that has no value is checked as the
            # text "None" -- confirm whether such examples should be skipped.
            example_values = [str(example.value) for example in examples]

            if range_element_name == 'enum_definition':
                for example_value in example_values:
                    if example_value not in range_element.permissible_values:
                        doesnt_match_enum_examples.append({
                            "class": class_name, "slot": slot_name, "enum": range_type,
                            "example": example_value
                        })

            elif range_element_name == 'type_definition':
                checker = type_checkers.get(range_type)

                if range_type == 'string':
                    if not icav.pattern:
                        unconstrained_slots.append({"class": class_name, "slot": slot_name})
                    else:
                        for example_value in example_values:
                            if not matches_regex(s=example_value, pattern=icav.pattern):
                                doesnt_match_pattern_examples.append({
                                    "class": class_name, "slot": slot_name,
                                    "pattern": str(icav.pattern), "example": example_value
                                })

                elif range_type not in expected_types or checker is None:
                    unhandled_type_slot.append(
                        {"class": class_name, "slot": slot_name, "type": range_type})

                else:
                    for example_value in example_values:
                        if not checker(example_value):
                            doesnt_satisfy_type_examples.append({
                                "class": class_name, "slot": slot_name, "type": range_type,
                                "example": example_value
                            })

            else:
                unhandled_range_slot.append(
                    {"class": class_name, "slot": slot_name, "type": range_type})


In [20]:
# Run the checks over every relevant class; findings are appended to the
# module-level result lists rather than returned.
validate_examples(relevant_class_names, generated_schema)

In [21]:
# Gather every result list under its own name for serialization.
report = dict(
    doesnt_match_enum_examples=doesnt_match_enum_examples,
    doesnt_match_pattern_examples=doesnt_match_pattern_examples,
    unhandled_range_slot=unhandled_range_slot,
    unhandled_type_slot=unhandled_type_slot,
    doesnt_satisfy_type_examples=doesnt_satisfy_type_examples,
    missing_example_slots=missing_example_slots,
    unconstrained_slots=unconstrained_slots,
)

In [22]:
# Persist the report as block-style YAML with sorted keys, overwriting any
# earlier report of the same name.
with open(mixs_examples_report_yaml, "w") as report_file:
    yaml.dump(report, stream=report_file, sort_keys=True, default_flow_style=False)