In [1]:
from linkml_runtime import SchemaView
import pprint
import pandas as pd
from jsonasobj2 import as_dict

In [2]:
# Raw GitHub location of nmdc_materialized_patterns.yaml on the nmdc-schema main branch.
schema_url = (
    "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/"
    "refs/heads/main/nmdc_schema/nmdc_materialized_patterns.yaml"
)

In [3]:
# Build the SchemaView that every later cell queries, from the schema at schema_url.
schema_view = SchemaView(schema=schema_url)

In [4]:
# Names of all classes known to the schema view (the keys of all_classes()).
class_names = [class_name for class_name in schema_view.all_classes().keys()]

In [5]:
# Order the class names so the report rows come out in a stable, alphabetical order.
class_names = sorted(class_names)

In [6]:
# Accumulator for the flattened rows: one dict per (class, slot) pair.
report = list()

In [7]:
# Attribute keys that either get dedicated handling in the loop below or are
# deliberately kept out of the report. Defined once instead of per attribute.
SPECIAL_CASED_KEYS = frozenset({
    'alias',  # only reported when it differs from the slot name
    'annotations',  # expanded into annotation_* columns after the key loop
    'any_of',  # reported as its repr
    'domain_of',  # left out
    'examples',  # flattened into example_values
    'from_schema',  # left out
    'name',  # left out
    'owner',  # left out
    'structured_pattern',  # only .syntax is reported
    'structured_aliases',  # reported as its repr; maybe there's a better way
})

# The only structured_pattern fields this report knows how to interpret.
EXPECTED_PATTERN_FIELDS = frozenset({'syntax', 'interpolated', 'partial_match'})


def _has_value(value):
    """Return True when an attribute value carries content worth reporting.

    None, empty strings, empty containers and False are skipped, exactly as a
    plain truthiness test would do. A numeric zero is kept, because it is real
    content (for example ``minimum_value: 0``) that truthiness would discard.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return True
    return bool(value)


# Empty the accumulator first so that re-running this cell rebuilds the report
# instead of appending a second copy of every row.
report.clear()

for c_name in class_names:
    ic = schema_view.induced_class(c_name)
    slot_names = sorted(ic.attributes.keys())
    for slot_name in slot_names:
        ica = ic.attributes[slot_name]
        temp_dict = {
            "class": c_name,
            "slot": slot_name,
        }
        for k, v in vars(ica).items():
            if not _has_value(v):
                continue

            if k not in SPECIAL_CASED_KEYS:
                # Generic case: lists become pipe-delimited text, anything else is stored as-is.
                if isinstance(v, list):
                    if all(isinstance(item, str) for item in v):
                        stringified = '|'.join(v)
                    else:
                        stringified = '|'.join(repr(item) for item in v)
                else:
                    stringified = v
                temp_dict[k] = stringified

            elif k == 'alias':
                # An alias equal to the slot name adds no information.
                if v != slot_name:
                    temp_dict['alias'] = v

            elif k == 'structured_pattern':
                temp_dict['structured_pattern.syntax'] = v.syntax
                if not v.interpolated:
                    print(f"{c_name}.{slot_name}.structured_pattern is not interpolated")
                if v.partial_match:
                    print(f"{c_name}.{slot_name}.structured_pattern is set as partial match TRUE")
                # Sorted so the diagnostics are printed in a reproducible order.
                for i in sorted(v.__dict__.keys()):
                    if v[i] and i not in EXPECTED_PATTERN_FIELDS:
                        print(f"{c_name}.{slot_name}.structured_pattern has unexpected field {i}")

            elif k in ('any_of', 'structured_aliases'):
                temp_dict[k] = repr(v)

            elif k == 'examples':
                example_values = []
                for example in v:
                    if example.value:
                        # str() so a non-string example value cannot make the join below raise.
                        example_values.append(str(example.value))
                        if example.description:
                            print(
                                f"{c_name}.{slot_name}.examples with value {example.value} has description: {example.description}")
                temp_dict['example_values'] = "|".join(example_values)

        # Each annotation becomes its own annotation_<tag> column holding the annotation's value.
        anndict = as_dict(ica.annotations)
        for anndictk, anndictv in anndict.items():
            temp_dict[f"annotation_{anndictk}"] = anndictv['value']

        report.append(temp_dict)

Biosample.bulk_elect_conductivity.examples with value JsonObj(has_raw_value='0.017 mS/cm', has_numeric_value=0.017, has_unit='mS/cm') has description: The conductivity measurement was 0.017 millisiemens per centimeter.
Biosample.id.examples with value nmdc:mgmag-00-x012.1_7_c1 has description: https://github.com/microbiomedata/nmdc-schema/pull/499#discussion_r1018499248
Biosample.insdc_biosample_identifiers.examples with value https://bioregistry.io/biosample:SAMD00212331 has description: I13_N_5-10 sample from Soil fungal diversity along elevational gradients
CalibrationInformation.id.examples with value nmdc:mgmag-00-x012.1_7_c1 has description: https://github.com/microbiomedata/nmdc-schema/pull/499#discussion_r1018499248
ChemicalConversionProcess.id.examples with value nmdc:mgmag-00-x012.1_7_c1 has description: https://github.com/microbiomedata/nmdc-schema/pull/499#discussion_r1018499248
ChemicalEntity.id.examples with value nmdc:mgmag-00-x012.1_7_c1 has description: https://github.

In [8]:
# One DataFrame row per collected dict; keys missing from a row become NaN in that column.
report_frame = pd.DataFrame(data=report)

In [9]:
# Identifying columns that are pinned to the front of the report, in this order.
initial_columns = ['class', 'slot']

In [10]:
# Every column label present in the report, as a set.
all_column_names = {column_name for column_name in report_frame.columns}

In [11]:
# The columns other than the pinned initial ones (not yet ordered at this point).
sorted_column_names = list(all_column_names.difference(initial_columns))

In [12]:
# Put the remaining columns in alphabetical order.
sorted_column_names = sorted(sorted_column_names)

In [13]:
# Final column order: the pinned columns first, then the rest alphabetically.
reunited_column_names = [*initial_columns, *sorted_column_names]

In [14]:
# Rearrange the report's columns into the order computed above.
report_frame = report_frame.loc[:, reunited_column_names]

In [15]:
# Remove rows that are exact duplicates of an earlier row (the first occurrence is kept).
report_frame = report_frame.drop_duplicates()

In [16]:
# Display the column-ordered, de-duplicated report as this cell's output.
report_frame

Unnamed: 0,class,slot,abstract,alias,aliases,annotation_expected_value,annotation_occurrence,annotation_preferred_unit,annotation_tooltip,any_of,...,recommended,required,see_also,slot_group,slot_uri,string_serialization,structured_aliases,structured_pattern.syntax,title,todos
0,AttributeValue,has_raw_value,,,,,,,,,...,,,,,,,,,,
1,AttributeValue,type,,,,,,,,,...,,True,https://github.com/microbiomedata/nmdc-schema/...,,rdf:type,,JsonObj(workflow_execution_class=StructuredAli...,,,
2,Biosample,abs_air_humidity,,,absolute air humidity,measurement value,1,"gram per gram, kilogram per kilogram, kilogram...",,,...,,,,,MIXS:0000122,,,,absolute air humidity,
3,Biosample,add_date,,,,,,,,,...,,,,,,,,,,
4,Biosample,add_recov_method,,,secondary and tertiary recovery methods and st...,enumeration;timestamp,1,,,,...,,,,,MIXS:0001009,,,,secondary and tertiary recovery methods and st...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1561,WorkflowExecution,start_date,,,,,,,,,...,,,,,,,,,,add date string validation pattern
1562,WorkflowExecution,started_at_time,,,,,,,,,...,,True,,,,,,,,
1563,WorkflowExecution,type,,,,,,,,,...,,True,https://github.com/microbiomedata/nmdc-schema/...,,rdf:type,,JsonObj(workflow_execution_class=StructuredAli...,,,
1564,WorkflowExecution,version,,,,,,,,,...,,,,,,,,,,


In [17]:
# Write the report as genuinely tab-separated text to match the .tsv extension;
# to_csv defaults to commas. The row index is omitted because it is only the
# leftover positional counter (with gaps after de-duplication) and would
# otherwise appear as an unnamed first column.
report_frame.to_csv("nmdc_schema_flattening.tsv", sep="\t", index=False)