* annotation_dict
* get_classes_by_slot
* get_classes_modifying_slot
* get_elements_applicable_by_identifier
* get_elements_applicable_by_prefix
* get_slots_by_enum
* get_uri
* importmap
* in_schema
* slot_applicable_range_elements
* slot_is_true_for_metadata_property
* slot_name_mappings
* slot_range_as_union

## don't understand

* class_name_mappings
* get_mapping_index
* imports_closure
* namespaces

# haven't tried yet

* get_elements_applicable_by_identifier
* get_elements_applicable_by_prefix

In [1]:
from linkml_runtime.utils.schemaview import SchemaView
import yaml
import pandas as pd
from linkml_runtime.linkml_model import SlotDefinition
import inspect


In [2]:
# schema_file = "../nmdc_schema/nmdc_materialized_patterns.yaml"
schema_file = "../src/schema/nmdc.yaml"

In [3]:
meta_file = "../metamodel/no_orphans.yaml"

In [4]:
def get_element_names_and_origin(schema_view: SchemaView) -> dict[str, str]:
    """Get a mapping of element names to their origin schema.
    
    Args:
        schema_view: A LinkML SchemaView instance
        
    Returns:
        Dictionary mapping element names to their from_schema values
    """
    return {name: element.from_schema 
            for name, element in schema_view.all_elements().items()}

In [5]:
def analyze_slot_usage(slot_name, schema_view, all_slots_dict):
    """Analyze all the ways a slot is used in the schema."""
    
    # Get the slot object
    slot = schema_view.get_slot(slot_name)
    
    # Check for is_a children
    children = schema_view.slot_children(slot_name, mixins=False, is_a=True)
    
    # Check which classes use this slot
    classes_using = schema_view.get_classes_by_slot(slot)
    
    # Check which classes modify this slot
    classes_modifying = schema_view.get_classes_modifying_slot(slot)
    
    # Check if used as slot_group
    slots_grouped = [k for k, v in all_slots_dict.items() if v.slot_group == slot_name]
    
    # Check if used as a mixin by other slots
    slots_using_as_mixin = [
        k for k, v in all_slots_dict.items() 
        if v.mixins and slot_name in v.mixins
    ]
    
    # Check deprecation status
    is_deprecated = slot.deprecated is not None and slot.deprecated != ""
    has_exact_replacement = slot.deprecated_element_has_exact_replacement is not None
    has_possible_replacement = slot.deprecated_element_has_possible_replacement is not None
    deprecation_count = sum([is_deprecated, has_exact_replacement, has_possible_replacement])
    
    # Check if it's a grouping slot itself
    is_grouping = getattr(slot, 'grouping', None)
    
    return {
        'slot_name': slot_name,
        'has_children': len(children) > 0,
        'num_children': len(children),
        'children': children if children else None,
        'used_in_classes': len(classes_using) > 0,
        'num_classes': len(classes_using),
        'classes': classes_using if classes_using else None,
        'modified_by_classes': len(classes_modifying) > 0,
        'num_modifying': len(classes_modifying),
        'groups_other_slots': len(slots_grouped) > 0,
        'num_grouped': len(slots_grouped),
        'grouped_slots': slots_grouped if slots_grouped else None,
        'used_as_mixin': len(slots_using_as_mixin) > 0,
        'num_mixin_usage': len(slots_using_as_mixin),
        'mixin_users': slots_using_as_mixin if slots_using_as_mixin else None,
        'is_deprecated': is_deprecated,
        'has_exact_replacement': has_exact_replacement,
        'has_possible_replacement': has_possible_replacement,
        'deprecation_count': deprecation_count,
        'deprecated_message': slot.deprecated if is_deprecated else None,
        'is_grouping_slot': is_grouping,
        'from_schema': slot.from_schema
    }

In [6]:
def create_slot_usage_summary(slot_names, schema_view, all_slots_dict):
    """Create a usage summary dataframe for a list of slot names."""
    usage_results = [analyze_slot_usage(name, schema_view, all_slots_dict) for name in slot_names]
    
    usage_summary_df = pd.DataFrame([
        {
            'slot_name': result['slot_name'],
            'num_children': result['num_children'],
            'num_classes': result['num_classes'],
            'num_modifying': result['num_modifying'],
            'num_grouped': result['num_grouped'],
            'num_mixin_usage': result['num_mixin_usage'],
            'deprecation_count': result['deprecation_count'],
            'from_schema': result['from_schema']
        }
        for result in usage_results
    ])
    
    return usage_summary_df, usage_results

In [7]:
def filter_to_unused_slots(usage_summary_df):
    """Filter a usage summary dataframe to show only unused slots."""
    return usage_summary_df[
        (usage_summary_df['num_children'] == 0) & 
        (usage_summary_df['num_classes'] == 0) & 
        (usage_summary_df['num_grouped'] == 0) &
        (usage_summary_df['num_mixin_usage'] == 0)
    ]

In [8]:
schema_view = SchemaView(schema_file)

In [9]:
meta_view = SchemaView(meta_file)

In [10]:
current_slot_name = "depth"
current_slot = schema_view.get_slot(current_slot_name)

In [11]:
current_classes = schema_view.get_classes_by_slot(current_slot)
current_classes

['Biosample']

In [12]:
current_ancestors = schema_view.slot_ancestors(current_slot_name, reflexive=False)
current_ancestors
# ['environment field'] is a parent of depth

['environment field']

In [13]:
cousins = schema_view.slot_descendants(current_ancestors[0], reflexive=False)
cousins

['alt', 'collection_date', 'depth', 'elev', 'geo_loc_name', 'lat_lon', 'temp']

In [14]:
current_range_as_union = schema_view.slot_range_as_union(current_slot)
current_range_as_union

['QuantityValue']

In [15]:
current_applicable_range_elements = schema_view.slot_applicable_range_elements(current_slot)
current_applicable_range_elements

['class_definition']

In [16]:
# list of slot metaslots
# [attr for attr in dir(SlotDefinition) if not attr.startswith('_')]
# might be of interest for slot_is_true_for_metadata_property(slot_name, metadata_property)



In [17]:
schema_slots = schema_view.all_slots()

In [18]:
slots_with_groups_df = pd.DataFrame([
    {'slot': k, 'from_schema': v.from_schema, 'slot_group': v.slot_group}
    for k, v in schema_slots.items()
    if v.slot_group
])


In [19]:
slots_with_groups_df

Unnamed: 0,slot,from_schema,slot_group
0,dna_absorb1,https://w3id.org/nmdc/core,JGI-Metagenomics
1,dna_absorb2,https://w3id.org/nmdc/core,JGI-Metagenomics
2,dna_concentration,https://w3id.org/nmdc/basic_slots,JGI-Metagenomics
3,emsl_store_temp,https://w3id.org/nmdc/emsl,EMSL
4,project_id,https://w3id.org/nmdc/emsl,EMSL
5,replicate_number,https://w3id.org/nmdc/emsl,EMSL
6,sample_shipped,https://w3id.org/nmdc/emsl,EMSL
7,sample_type,https://w3id.org/nmdc/emsl,EMSL
8,technical_reps,https://w3id.org/nmdc/emsl,EMSL
9,dna_collect_site,https://w3id.org/nmdc/portal/jgi_metagenomics,JGI-Metagenomics


In [20]:
# Get unique slot_group values
unique_slot_groups = slots_with_groups_df['slot_group'].unique()

In [21]:
# Analyze each unique slot_group
slot_group_analysis = []
for group_name in sorted(unique_slot_groups):
  is_slot = group_name in schema_view.all_slots()

  analysis = {
      'slot_group_name': group_name,
      'is_actual_slot': is_slot,
      'grouping': None,
      'abstract': None,
      'mixin': None
  }

  if is_slot:
      slot_def = schema_view.get_slot(group_name)
      analysis['grouping'] = getattr(slot_def, 'grouping', None)
      analysis['abstract'] = slot_def.abstract
      analysis['mixin'] = slot_def.mixin

  slot_group_analysis.append(analysis)


In [22]:
slot_group_types_df = pd.DataFrame(slot_group_analysis)

In [23]:
slot_group_types_df

Unnamed: 0,slot_group_name,is_actual_slot,grouping,abstract,mixin
0,EMSL,False,,,
1,JGI-Metagenomics,False,,,
2,JGI-Metatranscriptomics,False,,,
3,MIxS Inspired,False,,,
4,Sample ID,False,,,


----

In [24]:
usage_index = schema_view.usage_index()

In [25]:
meta_eno = get_element_names_and_origin(meta_view)

In [26]:
nmdc_eno = get_element_names_and_origin(schema_view)

## Does nmdc-schema have any elements that use the name of a metamodel element?

In [27]:
# Find keys that exist in both dictionaries but have different values
shared_keys_different_values = {
    key: {'nmdc': nmdc_eno[key], 'meta': meta_eno[key]}
    for key in set(nmdc_eno.keys()) & set(meta_eno.keys())
    if nmdc_eno[key] != meta_eno[key]
}


In [28]:
# Convert to DataFrame for prettier display
skdv_frame = pd.DataFrame([
    {'element': key, 'nmdc_origin': value['nmdc'], 'meta_origin': value['meta']}
    for key, value in shared_keys_different_values.items()
]).sort_values('element')


In [29]:
skdv_frame

Unnamed: 0,element,nmdc_origin,meta_origin
3,bytes,https://w3id.org/nmdc/nmdc_types,https://w3id.org/linkml/datasets
13,definition,https://w3id.org/nmdc/basic_classes,https://w3id.org/linkml/meta
8,description,https://w3id.org/nmdc/attribute_values,https://w3id.org/linkml/datasets
11,id,https://w3id.org/nmdc/basic_slots,https://w3id.org/linkml/datasets
5,language,https://w3id.org/nmdc/attribute_values,https://w3id.org/linkml/datasets
0,name,https://w3id.org/nmdc/attribute_values,https://w3id.org/linkml/meta
1,notes,https://w3id.org/nmdc/basic_slots,https://w3id.org/linkml/meta
10,object,https://w3id.org/nmdc/basic_classes,https://w3id.org/linkml/reporting
4,predicate,https://w3id.org/nmdc/basic_classes,https://w3id.org/linkml/reporting
12,subject,https://w3id.org/nmdc/annotation,https://w3id.org/linkml/reporting


## Does nmdc-schema have any elements with whitespace in their names?

In [30]:
# Find elements with whitespace in their names
elements_with_whitespace = [
    name for name in nmdc_eno.keys()
    if any(char.isspace() for char in name)
]


In [31]:
# Display as DataFrame with element type
all_elements = schema_view.all_elements()
whitespace_df = pd.DataFrame([
    {
        'element': name,
        'type': type(all_elements[name]).__name__,
        'from_schema': nmdc_eno[name]
    }
    for name in sorted(elements_with_whitespace)
])


In [32]:
whitespace_df

Unnamed: 0,element,type,from_schema
0,core field,SlotDefinition,https://raw.githubusercontent.com/microbiomeda...
1,environment field,SlotDefinition,https://raw.githubusercontent.com/microbiomeda...
2,investigation field,SlotDefinition,https://raw.githubusercontent.com/microbiomeda...
3,nucleic acid sequence source field,SlotDefinition,https://raw.githubusercontent.com/microbiomeda...
4,sequencing field,SlotDefinition,https://raw.githubusercontent.com/microbiomeda...


## If there are slots with whitespace in their names
Are they used in any way? Would it be OK to delete them?

In [33]:
# Analyze usage for whitespace slots
whitespace_usage_summary_df, whitespace_usage_results = create_slot_usage_summary(
    elements_with_whitespace, schema_view, schema_slots
)


In [34]:
whitespace_usage_summary_df

Unnamed: 0,slot_name,num_children,num_classes,num_modifying,num_grouped,num_mixin_usage,deprecation_count,from_schema
0,core field,458,0,0,0,0,0,https://raw.githubusercontent.com/microbiomeda...
1,environment field,7,0,0,0,0,0,https://raw.githubusercontent.com/microbiomeda...
2,investigation field,3,0,0,0,0,0,https://raw.githubusercontent.com/microbiomeda...
3,nucleic acid sequence source field,9,0,0,0,0,0,https://raw.githubusercontent.com/microbiomeda...
4,sequencing field,9,0,0,0,0,0,https://raw.githubusercontent.com/microbiomeda...


In [35]:
# Filter whitespace slots to unused ones
whitespace_unused_slots = filter_to_unused_slots(whitespace_usage_summary_df)


In [36]:
whitespace_unused_slots

Unnamed: 0,slot_name,num_children,num_classes,num_modifying,num_grouped,num_mixin_usage,deprecation_count,from_schema


## Analyze usage for _all_ slots

In [37]:
# Analyze usage for ALL slots in the schema
all_slot_names = list(schema_slots.keys())
all_slots_usage_summary_df, all_slots_usage_results = create_slot_usage_summary(
    all_slot_names, schema_view, schema_slots
)


In [38]:
all_slots_usage_summary_df

Unnamed: 0,slot_name,num_children,num_classes,num_modifying,num_grouped,num_mixin_usage,deprecation_count,from_schema
0,chemical_conversion_category,0,1,0,0,0,0,https://w3id.org/nmdc/core
1,substances_volume,0,1,0,0,0,0,https://w3id.org/nmdc/core
2,biosample_categories,0,1,0,0,0,0,https://w3id.org/nmdc/core
3,collected_from,0,1,1,0,0,0,https://w3id.org/nmdc/core
4,bulk_elect_conductivity,0,1,0,0,0,0,https://w3id.org/nmdc/core
...,...,...,...,...,...,...,...,...
844,stationary_phase,0,2,1,0,0,0,https://w3id.org/nmdc/nmdc
845,chromatographic_category,0,2,1,0,0,0,https://w3id.org/nmdc/nmdc
846,sampled_portion,0,2,0,0,0,0,https://w3id.org/nmdc/nmdc
847,predicate,0,0,0,0,0,0,https://w3id.org/nmdc/basic_classes


In [39]:
# Filter all slots to unused ones
all_unused_slots = filter_to_unused_slots(all_slots_usage_summary_df)


## Analyze usage for non-slot elements (classes, enums, types, subsets)

In [40]:
def get_element_type(element_name, schema_view):
    """Determine the type of a schema element."""
    if element_name in schema_view.all_classes():
        return 'class'
    elif element_name in schema_view.all_slots():
        return 'slot'
    elif element_name in schema_view.all_enums():
        return 'enum'
    elif element_name in schema_view.all_types():
        return 'type'
    elif element_name in schema_view.all_subsets():
        return 'subset'
    else:
        return 'unknown'

In [41]:
def get_element_type(element_name, schema_view):
    """Determine the type of a schema element."""
    if element_name in schema_view.all_classes():
        return 'class'
    elif element_name in schema_view.all_slots():
        return 'slot'
    elif element_name in schema_view.all_enums():
        return 'enum'
    elif element_name in schema_view.all_types():
        return 'type'
    elif element_name in schema_view.all_subsets():
        return 'subset'
    else:
        return 'unknown'

def analyze_non_slot_elements(schema_view):
    """Analyze usage of all non-slot elements using usage_index."""
    usage_index = schema_view.usage_index()
    all_elements = schema_view.all_elements()
    
    results = []
    for element_name, element in all_elements.items():
        element_type = get_element_type(element_name, schema_view)
        
        # Skip slots - we already analyzed those
        if element_type == 'slot':
            continue
        
        # Check if element is referenced in usage_index
        is_used = element_name in usage_index and len(usage_index[element_name]) > 0
        num_usages = len(usage_index.get(element_name, []))
        
        # Check deprecation status
        is_deprecated = element.deprecated is not None and element.deprecated != ""
        has_exact_replacement = element.deprecated_element_has_exact_replacement is not None
        has_possible_replacement = element.deprecated_element_has_possible_replacement is not None
        deprecation_count = sum([is_deprecated, has_exact_replacement, has_possible_replacement])
        
        results.append({
            'element_name': element_name,
            'element_type': element_type,
            'is_used': is_used,
            'num_usages': num_usages,
            'deprecation_count': deprecation_count,
            'from_schema': element.from_schema
        })
    
    return pd.DataFrame(results)

# Analyze all non-slot elements
non_slot_elements_df = analyze_non_slot_elements(schema_view)
non_slot_elements_df

Unnamed: 0,element_name,element_type,is_used,num_usages,deprecation_count,from_schema
0,ChemicalConversionProcess,class,False,0,0,https://w3id.org/nmdc/core
1,AnnotatingWorkflow,class,True,1,0,https://w3id.org/nmdc/core
2,MetagenomeAnnotation,class,True,1,0,https://w3id.org/nmdc/core
3,FieldResearchSite,class,True,3,0,https://w3id.org/nmdc/core
4,Sample,class,True,14,0,https://w3id.org/nmdc/core
...,...,...,...,...,...,...
244,sparqlpath,type,False,0,0,https://w3id.org/linkml/types
245,external_identifier,type,True,23,0,https://w3id.org/nmdc/nmdc_types
246,bytes,type,True,1,0,https://w3id.org/nmdc/nmdc_types
247,decimal_degree,type,True,2,0,https://w3id.org/nmdc/nmdc_types


In [42]:
# Analyze all non-slot elements
non_slot_elements_df = analyze_non_slot_elements(schema_view)


In [43]:
non_slot_elements_df

Unnamed: 0,element_name,element_type,is_used,num_usages,deprecation_count,from_schema
0,ChemicalConversionProcess,class,False,0,0,https://w3id.org/nmdc/core
1,AnnotatingWorkflow,class,True,1,0,https://w3id.org/nmdc/core
2,MetagenomeAnnotation,class,True,1,0,https://w3id.org/nmdc/core
3,FieldResearchSite,class,True,3,0,https://w3id.org/nmdc/core
4,Sample,class,True,14,0,https://w3id.org/nmdc/core
...,...,...,...,...,...,...
244,sparqlpath,type,False,0,0,https://w3id.org/linkml/types
245,external_identifier,type,True,23,0,https://w3id.org/nmdc/nmdc_types
246,bytes,type,True,1,0,https://w3id.org/nmdc/nmdc_types
247,decimal_degree,type,True,2,0,https://w3id.org/nmdc/nmdc_types


In [44]:
# Filter to unused/orphan non-slot elements (excluding classes)
# Classes can be instantiated in data even if not referenced in schema
unused_non_slot_elements = non_slot_elements_df[
    (~non_slot_elements_df['is_used']) & 
    (non_slot_elements_df['element_type'] != 'class')
]
unused_non_slot_elements

Unnamed: 0,element_name,element_type,is_used,num_usages,deprecation_count,from_schema
80,SampleStateEnum,enum,False,0,0,https://w3id.org/nmdc/core
124,freq_clean_enum,enum,False,0,0,https://raw.githubusercontent.com/microbiomeda...
142,organism_count_enum,enum,False,0,0,https://raw.githubusercontent.com/microbiomeda...
144,plant_growth_med_enum,enum,False,0,0,https://raw.githubusercontent.com/microbiomeda...
158,samp_md_enum,enum,False,0,0,https://raw.githubusercontent.com/microbiomeda...
204,YesNoEnum,enum,False,0,0,https://w3id.org/nmdc/portal_enums
206,SubmissionStatusEnum,enum,False,0,0,https://w3id.org/nmdc/portal_enums
223,ProtocolCategoryEnum,enum,False,0,0,https://w3id.org/nmdc/nmdc
232,time,type,False,0,0,https://w3id.org/linkml/types
233,date,type,False,0,0,https://w3id.org/linkml/types


In [45]:
# Classes not referenced in schema (but may still be used in data!)
unreferenced_classes = non_slot_elements_df[
    (~non_slot_elements_df['is_used']) & 
    (non_slot_elements_df['element_type'] == 'class')
]
print(f"Classes with no schema references: {len(unreferenced_classes)}")
print("Note: These may be tree roots, top-level classes, or classes instantiated directly in data.")
unreferenced_classes

Classes with no schema references: 28
Note: These may be tree roots, top-level classes, or classes instantiated directly in data.


Unnamed: 0,element_name,element_type,is_used,num_usages,deprecation_count,from_schema
0,ChemicalConversionProcess,class,False,0,0,https://w3id.org/nmdc/core
11,EnvironmentalMaterialTerm,class,False,0,1,https://w3id.org/nmdc/core
16,FunctionalAnnotationTerm,class,False,0,0,https://w3id.org/nmdc/annotation
17,Pathway,class,False,0,1,https://w3id.org/nmdc/annotation
18,OrthologyGroup,class,False,0,0,https://w3id.org/nmdc/annotation
20,AttributeValue,class,False,0,0,https://w3id.org/nmdc/attribute_values
34,MaterialEntity,class,False,0,0,https://w3id.org/nmdc/basic_classes
36,PlannedProcess,class,False,0,0,https://w3id.org/nmdc/basic_classes
41,InformationObject,class,False,0,0,https://w3id.org/nmdc/basic_classes
46,MetagenomeAssembly,class,False,0,0,https://w3id.org/nmdc/workflow_execution_activity


In [46]:
unused_non_slot_elements

Unnamed: 0,element_name,element_type,is_used,num_usages,deprecation_count,from_schema
80,SampleStateEnum,enum,False,0,0,https://w3id.org/nmdc/core
124,freq_clean_enum,enum,False,0,0,https://raw.githubusercontent.com/microbiomeda...
142,organism_count_enum,enum,False,0,0,https://raw.githubusercontent.com/microbiomeda...
144,plant_growth_med_enum,enum,False,0,0,https://raw.githubusercontent.com/microbiomeda...
158,samp_md_enum,enum,False,0,0,https://raw.githubusercontent.com/microbiomeda...
204,YesNoEnum,enum,False,0,0,https://w3id.org/nmdc/portal_enums
206,SubmissionStatusEnum,enum,False,0,0,https://w3id.org/nmdc/portal_enums
223,ProtocolCategoryEnum,enum,False,0,0,https://w3id.org/nmdc/nmdc
232,time,type,False,0,0,https://w3id.org/linkml/types
233,date,type,False,0,0,https://w3id.org/linkml/types


In [47]:
# Summary of unused elements by type
unused_summary = unused_non_slot_elements.groupby('element_type').agg({
    'element_name': 'count',
    'from_schema': lambda x: x.nunique()
}).rename(columns={'element_name': 'count', 'from_schema': 'num_schemas'})


In [48]:
print(f"Total unused non-slot elements: {len(unused_non_slot_elements)}")
print(f"\nBreakdown by type:")


Total unused non-slot elements: 20

Breakdown by type:


In [49]:
unused_summary

Unnamed: 0_level_0,count,num_schemas
element_type,Unnamed: 1_level_1,Unnamed: 2_level_1
enum,8,4
type,12,1


In [50]:
# Save unused non-slot elements to TSV
unused_non_slot_elements.to_csv('unused_non_slot_elements.tsv', sep='\t', index=False)
print(f"Saved {len(unused_non_slot_elements)} unused non-slot elements to unused_non_slot_elements.tsv")

Saved 20 unused non-slot elements to unused_non_slot_elements.tsv


## Combined report: All unused elements (slots, enums, types, subsets)
Note: Classes are excluded because usage_index only tracks schema-level references, not data instantiation.

In [51]:
# Prepare unused slots dataframe for merging
unused_slots_for_merge = all_unused_slots[['slot_name', 'deprecation_count', 'from_schema']].copy()
unused_slots_for_merge['element_type'] = 'slot'
unused_slots_for_merge = unused_slots_for_merge.rename(columns={'slot_name': 'element_name'})

# Prepare unused non-slots dataframe for merging
unused_non_slots_for_merge = unused_non_slot_elements[['element_name', 'element_type', 'deprecation_count', 'from_schema']].copy()

# Combine both dataframes
all_unused_elements = pd.concat([unused_slots_for_merge, unused_non_slots_for_merge], ignore_index=True)

# Sort by element type and name
all_unused_elements = all_unused_elements.sort_values(['element_type', 'element_name']).reset_index(drop=True)

print(f"Total unused elements: {len(all_unused_elements)}")
print(f"  - Unused slots: {len(unused_slots_for_merge)}")
print(f"  - Unused non-slots: {len(unused_non_slots_for_merge)}")
print(f"\nWith deprecation info: {len(all_unused_elements[all_unused_elements['deprecation_count'] > 0])}")
print(f"Without deprecation info: {len(all_unused_elements[all_unused_elements['deprecation_count'] == 0])}")

all_unused_elements

Total unused elements: 57
  - Unused slots: 37
  - Unused non-slots: 20

With deprecation info: 4
Without deprecation info: 53


Unnamed: 0,element_name,deprecation_count,from_schema,element_type
0,ProtocolCategoryEnum,0,https://w3id.org/nmdc/nmdc,enum
1,SampleStateEnum,0,https://w3id.org/nmdc/core,enum
2,SubmissionStatusEnum,0,https://w3id.org/nmdc/portal_enums,enum
3,YesNoEnum,0,https://w3id.org/nmdc/portal_enums,enum
4,freq_clean_enum,0,https://raw.githubusercontent.com/microbiomeda...,enum
5,organism_count_enum,0,https://raw.githubusercontent.com/microbiomeda...,enum
6,plant_growth_med_enum,0,https://raw.githubusercontent.com/microbiomeda...,enum
7,samp_md_enum,0,https://raw.githubusercontent.com/microbiomeda...,enum
8,biogas_retention_time,0,https://w3id.org/nmdc/nmdc,slot
9,biogas_temperature,0,https://w3id.org/nmdc/nmdc,slot


In [52]:
# Breakdown by type and deprecation status
summary = all_unused_elements.groupby(['element_type', 'deprecation_count']).size().reset_index(name='count')
print("Breakdown by type and deprecation status:")
summary

Breakdown by type and deprecation status:


Unnamed: 0,element_type,deprecation_count,count
0,enum,0,8
1,slot,0,33
2,slot,1,4
3,type,0,12


In [53]:
# Save combined unused elements report to TSV
all_unused_elements.to_csv('all_unused_elements.tsv', sep='\t', index=False)
print(f"Saved {len(all_unused_elements)} unused elements to all_unused_elements.tsv")

Saved 57 unused elements to all_unused_elements.tsv


In [54]:
all_unused_slots

Unnamed: 0,slot_name,num_children,num_classes,num_modifying,num_grouped,num_mixin_usage,deprecation_count,from_schema
15,dna_absorb2,0,0,0,0,0,0,https://w3id.org/nmdc/core
45,chemical_formula,0,0,0,0,0,0,https://w3id.org/nmdc/core
46,inchi_key,0,0,0,0,0,1,https://w3id.org/nmdc/core
47,inchi,0,0,0,0,0,1,https://w3id.org/nmdc/core
48,smiles,0,0,0,0,0,1,https://w3id.org/nmdc/core
152,insdc_sra_ena_study_identifiers,0,0,0,0,0,0,https://w3id.org/nmdc/external_identifiers
162,insdc_secondary_sample_identifiers,0,0,0,0,0,0,https://w3id.org/nmdc/external_identifiers
170,jgi_portal_analysis_project_identifiers,0,0,0,0,0,0,https://w3id.org/nmdc/external_identifiers
171,insdc_analysis_identifiers,0,0,0,0,0,0,https://w3id.org/nmdc/external_identifiers
172,mgnify_analysis_identifiers,0,0,0,0,0,0,https://w3id.org/nmdc/external_identifiers


In [55]:
all_unused_slots.to_csv('unused_slots.tsv', sep='\t', index=False)

Here are additional ways a slot could be used that we're NOT currently checking:

Slot hierarchy & relationships:

1. mixins - Other slots might use this slot as a mixin (not just is_a)
2. apply_to - This slot could be applied to other definitions
3. subproperty_of - Similar to is_a but more RDF-specific
4. inverse - Another slot might declare this as its inverse
5. disjoint_with - Other slots might reference this in disjoint relationships

Referenced in class definitions:

6. domain_of - Which classes explicitly declare this slot in their definition (might differ from get_classes_by_slot which includes inherited usage)
7. attributes - Could be used as a class attribute
8. defining_slots - Could be used to define a class

Referenced in expressions:

9. slot_conditions - Used in conditional logic
10. any_of, all_of, exactly_one_of, none_of - Logical slot expressions
11. equals_expression - Value constraint expressions
12. Path expressions - traverse, followed_by for reachability

Other references:

13. alias - Other slots using this as their alias target
14. owner - Ownership relationships15. unique_key_slots - Used in unique key definitions
16. rules / classification_rules - Used in class-level rules

The most practically important ones to add would probably be:

- Checking if used as a mixin by other slots
- domain_of to see explicit class declarations
- inverse relationships
- unique_key_slots usage


In [72]:
subsets_file = "../src/schema/nmdc_subsets.yaml"
# not imported any more
subsets_view = SchemaView(subsets_file)
all_subsets = subsets_view.all_subsets()

----

In [56]:
meta_elements = meta_view.all_elements()

In [57]:
meta_element_names = list(meta_elements.keys())
meta_element_names.sort()

In [58]:
annotation_dict = schema_view.annotation_dict('depth')
annotation_dict

{'expected_value': 'measurement value', 'storage_units': 'm'}

In [59]:
class_name_mappings = schema_view.class_name_mappings()
class_name_mappings

{'ChemicalConversionProcess': ClassDefinition({
   'name': 'ChemicalConversionProcess',
   'description': ('A process that results in the interconversion of chemical species by a '
      'reaction to transform the reagents into products.\n'),
   'comments': ['The values of both has_reagents slot and has_input slot are considered the '
     'reagents of a chemical process.'],
   'from_schema': 'https://w3id.org/nmdc/core',
   'aliases': ['chemical reaction'],
   'exact_mappings': ['MISO:0000001'],
   'contributors': ['ORCID:0009-0001-1555-1601', 'ORCID:0000-0002-1368-8217'],
   'is_a': 'MaterialProcessing',
   'slots': ['chemical_conversion_category', 'duration', 'temperature', 'substances_used',
     'substances_volume'],
   'slot_usage': {'id': SlotDefinition({
       'name': 'id',
       'required': True,
       'structured_pattern': PatternExpression({'syntax': '{id_nmdc_prefix}:chcpr-{id_shoulder}-{id_blade}$', 'interpolated': True})
     })},
   'class_uri': 'nmdc:ChemicalConversi

In [60]:
depth = schema_view.get_slot('depth')
classes_by_slot = schema_view.get_classes_by_slot(depth)
classes_by_slot

['Biosample']

In [61]:
has_unit = schema_view.get_slot('has_unit')
schema_view.get_classes_modifying_slot(has_unit)

['QuantityValue', 'PropertyAssertion']

In [62]:
# get_elements_applicable_by_identifier
# get_elements_applicable_by_prefix

In [63]:
mapping_index = schema_view.get_mapping_index()
mapping_index

defaultdict(list,
            {'nmdc:ChemicalConversionProcess': [('self',
               ClassDefinition({
                 'name': 'ChemicalConversionProcess',
                 'description': ('A process that results in the interconversion of chemical species by a '
                    'reaction to transform the reagents into products.\n'),
                 'comments': ['The values of both has_reagents slot and has_input slot are considered the '
                   'reagents of a chemical process.'],
                 'from_schema': 'https://w3id.org/nmdc/core',
                 'aliases': ['chemical reaction'],
                 'exact_mappings': ['MISO:0000001'],
                 'contributors': ['ORCID:0009-0001-1555-1601', 'ORCID:0000-0002-1368-8217'],
                 'is_a': 'MaterialProcessing',
                 'slots': ['chemical_conversion_category', 'duration', 'temperature', 'substances_used',
                   'substances_volume'],
                 'slot_usage': {'id': Sl

In [64]:
slots_by_enum = schema_view.get_slots_by_enum('UnitEnum')
slots_by_enum

[SlotDefinition({
   'name': 'has_unit',
   'description': 'The unit of the quantity',
   'range': 'UnitEnum',
   'required': True
 })]

In [65]:
uri_gotten = schema_view.get_uri(depth)
uri_gotten

'MIXS:0000018'

In [66]:
importmap = schema_view.importmap
importmap

{}

In [67]:
imports_closure = schema_view.imports_closure
imports_closure

<bound method SchemaView.imports_closure of SchemaView(schema=SchemaDefinition({
  'name': 'NMDC',
  'description': ('The NMDC Schema is a foundational framework designed to standardize metadata '
     'for the National Microbiome Data  Collaborative (NMDC) and collaborating '
     'data providors. By establishing a structured approach to metadata, the NMDC '
     'Schema enables researchers to organize,  share, and interpret complex '
     'datasets with consistency and clarity. The NMDC Schema is critical substrate '
     'used to facilitate  interoperability and collaboration, as it provide a '
     'common language for data exchange across systems and disciplines.  In the '
     'context of the NMDC, this schema supports the integration of microbiome data '
     'from medicine, agriculture,  bioenergy, and environmental science into a '
     'cohesive platform.'),
  'title': 'NMDC Schema',
  'notes': ['not importing any MIxS terms where the relationship between the name (SCN) '
   

In [68]:
in_schema = schema_view.in_schema('QuantityValue')
in_schema

'NMDC-Attribute-values'

In [69]:
namespaces = schema_view.namespaces
namespaces

<bound method SchemaView.namespaces of SchemaView(schema=SchemaDefinition({
  'name': 'NMDC',
  'description': ('The NMDC Schema is a foundational framework designed to standardize metadata '
     'for the National Microbiome Data  Collaborative (NMDC) and collaborating '
     'data providors. By establishing a structured approach to metadata, the NMDC '
     'Schema enables researchers to organize,  share, and interpret complex '
     'datasets with consistency and clarity. The NMDC Schema is critical substrate '
     'used to facilitate  interoperability and collaboration, as it provide a '
     'common language for data exchange across systems and disciplines.  In the '
     'context of the NMDC, this schema supports the integration of microbiome data '
     'from medicine, agriculture,  bioenergy, and environmental science into a '
     'cohesive platform.'),
  'title': 'NMDC Schema',
  'notes': ['not importing any MIxS terms where the relationship between the name (SCN) '
    "and