In [1]:
from linkml_runtime import SchemaView
import re
import csv

from caseconverter import snakecase, kebabcase, camelcase, pascalcase, cobolcase, flatcase, macrocase, titlecase



In [2]:
nmdc_schema_url = "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/src/schema/nmdc.yaml"

In [3]:
tsv_out = "nmdc_pvs.tsv"

In [4]:
include_mixs = False

In [5]:
def deduplicate_lod(lod):
    return [dict(t) for t in {tuple(d.items()) for d in lod}]



In [6]:
def get_string_case(s):
    if not s:  # Handle empty strings safely
        return "empty"

    # **Independent Condition Checks**
    has_lower = any(c.islower() for c in s)
    has_upper = any(c.isupper() for c in s)
    has_hyphen = "-" in s
    has_underscore = "_" in s
    has_whitespace = " " in s

    # starts_with_digit = s[0].isdigit() if s else False  # Ensure we don't index into empty string

    if has_upper and s[0].islower() and not has_hyphen and not has_underscore and not has_whitespace and s == camelcase(
            s):
        return ("camelCase")

    if has_upper and has_lower and s[
        0].isupper() and not has_underscore and not has_whitespace and not has_hyphen and s == pascalcase(s):
        return ("PascalCase")

    if has_upper and has_hyphen and not has_lower and not has_underscore and not has_whitespace and s == cobolcase(s):
        return ("UPPER-KEBAB-CASE")

    if has_lower and has_hyphen and not has_upper and not has_underscore and not has_whitespace and s == kebabcase(s):
        return ("lower-kebab-case")

    if has_lower and not has_upper and not has_hyphen and not has_underscore and not has_whitespace and s == flatcase(
            s):
        return ("lowerall")

    if has_upper and not has_lower and not has_hyphen and not has_underscore and not has_whitespace:
        return ("UPPERALL")

    if has_upper and has_underscore and not has_lower and not has_hyphen and not has_whitespace and s == macrocase(s):
        return ("UPPER_SNAKE_CASE")

    if has_lower and has_underscore and not has_upper and not has_hyphen and not has_whitespace and s == snakecase(s):
        return ("lower_snake_case")

    if has_whitespace and s == titlecase(s):
        return ("Title Case")

    if any(c.isspace() for c in s):
        return "other, including whitespace"

    # **If none of the above match, return "other"**
    return "other"


In [7]:
nmdc_schema_view = SchemaView(nmdc_schema_url)

In [8]:
nmdc_schema_enums = nmdc_schema_view.all_enums()

In [9]:
rows = []

In [10]:
for ek, ev in nmdc_schema_enums.items():
    if ev.permissible_values:
        for pvk, pvv in ev.permissible_values.items():
            pvk_case = get_string_case(pvk)
            schema_string = ev.from_schema.split("/")[-1]
            if schema_string != "mixs.yaml" or include_mixs:
                row = {
                    "from_schema": ev.from_schema.split("/")[-1],
                    "enum": ek,
                    "pv": pvk,
                    "pv_case": pvk_case,
                }
            rows.append(row)


In [11]:
len(rows)

1165

In [12]:
rows = deduplicate_lod(rows)

In [13]:
len(rows)

394

In [14]:
# Get the field names from the first dictionary
fieldnames = rows[0].keys()

In [15]:

with open(tsv_out, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter="\t")

    # Write header
    writer.writeheader()

    # Write data rows
    writer.writerows(rows)
