I have a PyPI package for this too, but let's try it from scratch with an LLM.

In [1]:
import glob
import os

import yaml

from nmdc_schema.get_nmdc_view import ViewGetter

In [2]:
# Build the schema view object that later cells query
# (induced_class, all_slots, all_elements, slot_descendants).
# NOTE(review): presumably a LinkML SchemaView over the NMDC schema — confirm.
vg = ViewGetter()
schema_view = vg.get_view()

In [3]:
# Ask the schema view for the induced definition of the Biosample class;
# its .attributes mapping is read in the next cell.
induced_biosample = schema_view.induced_class("Biosample")

In [4]:
# Slot name -> slot definition for Biosample; later cells use the keys as
# slot names and read each value's .range.
ib_attributes = induced_biosample.attributes

In [5]:
# Names of the Biosample slots, taken as the keys of the attributes mapping.
biosample_slots = ib_attributes.keys()

In [6]:
# How many slots the induced Biosample class has (586 in the recorded output).
len(biosample_slots)

586

In [7]:
# Names of the Biosample slots whose declared range is QuantityValue.
biosample_qv_slots = [
    slot_name
    for slot_name, slot_def in ib_attributes.items()
    if slot_def.range == "QuantityValue"
]

In [8]:
# Sort in place so the listing below is alphabetical.
biosample_qv_slots.sort()

In [9]:
# biosample_qv_slots

['abs_air_humidity',
 'air_temp',
 'al_sat',
 'alkalinity',
 'alkyl_diethers',
 'alt',
 'aminopept_act',
 'ammonium',
 'ammonium_nitrogen',
 'amount_light',
 'annual_precpt',
 'annual_temp',
 'api',
 'avg_dew_point',
 'avg_temp',
 'bac_prod',
 'bac_resp',
 'bacteria_carb_prod',
 'barometric_press',
 'benzene',
 'biochem_oxygen_dem',
 'bishomohopanol',
 'blood_press_diast',
 'blood_press_syst',
 'bromide',
 'built_struc_age',
 'bulk_elect_conductivity',
 'calcium',
 'carb_dioxide',
 'carb_monoxide',
 'carb_nitro_ratio',
 'ceil_area',
 'ceil_thermal_mass',
 'chem_oxygen_dem',
 'chloride',
 'chlorophyll',
 'conduc',
 'density',
 'depth',
 'dew_point',
 'diss_carb_dioxide',
 'diss_hydrogen',
 'diss_inorg_carb',
 'diss_inorg_nitro',
 'diss_inorg_phosp',
 'diss_iron',
 'diss_org_carb',
 'diss_org_nitro',
 'diss_oxygen',
 'diss_oxygen_fluid',
 'door_size',
 'down_par',
 'efficiency_percent',
 'ethylbenzene',
 'exp_duct',
 'exp_pipe',
 'floor_age',
 'floor_area',
 'floor_thermal_mass',
 'fluor

In [10]:
# Accumulators for the file scan below: standalone Biosample example files,
# and Database example files whose text mentions biosample_set.
biosample_files, database_biosample_files = [], []

In [11]:
# Classify the YAML files in src/data/valid by filename prefix.
# The two lists are rebuilt from scratch here so that re-running this cell
# cannot append the same paths a second time, and the glob result is sorted
# because glob.glob() returns paths in arbitrary, filesystem-dependent order.
biosample_files = []
database_biosample_files = []

for yaml_file in sorted(glob.glob("../src/data/valid/*.yaml")):
    filename = os.path.basename(yaml_file)

    if filename.startswith("Biosample"):
        biosample_files.append(yaml_file)

    elif filename.startswith("Database"):
        # Cheap text pre-filter only: the cell that later parses these files
        # re-checks that biosample_set is a real top-level key.
        with open(yaml_file) as f:
            content = f.read()
            if "biosample_set" in content:
                database_biosample_files.append(yaml_file)


In [12]:
# print("Biosample files:", biosample_files)
# print("Database files with biosamples:", database_biosample_files)

In [13]:
# filename -> {'id': ..., 'field_count': ...}; filled in by the next cell.
biosample_file_field_counts = {}

In [14]:
# Summarise each standalone Biosample file: its id and its number of top-level keys.
for yaml_file in biosample_files:
    with open(yaml_file) as handle:
        record = yaml.safe_load(handle)

    # Skip files that load to a falsy value (e.g. an empty document).
    if not record:
        continue

    biosample_file_field_counts[os.path.basename(yaml_file)] = {
        'id': record.get('id', 'unknown'),  # 'unknown' when the file has no id key
        'field_count': len(record),
    }


In [15]:
# Display the per-file summary. In the recorded output, five of the six
# example files share the id nmdc:bsm-99-dtTMNb.
biosample_file_field_counts

{'Biosample-possibly-exhaustive.yaml': {'id': 'nmdc:bsm-99-4444444',
  'field_count': 576},
 'Biosample-embargoed.yaml': {'id': 'nmdc:bsm-99-dtTMNb', 'field_count': 7},
 'Biosample-soil_horizon.yaml': {'id': 'nmdc:bsm-99-dtTMNb', 'field_count': 7},
 'Biosample-minimal.yaml': {'id': 'nmdc:bsm-99-dtTMNb', 'field_count': 6},
 'Biosample-amplicon.yaml': {'id': 'nmdc:bsm-99-dtTMNb', 'field_count': 7},
 'Biosample-with-fire.yaml': {'id': 'nmdc:bsm-99-dtTMNb', 'field_count': 7}}

In [16]:
# filename -> {biosample id -> field count}; filled in by the next cell.
database_biosample_data = {}

In [17]:
# For every Database file that really has a biosample_set key, record the
# number of fields of each biosample that carries an id.
for yaml_file in database_biosample_files:
    with open(yaml_file) as f:
        yaml_dict = yaml.safe_load(f)
    filename = os.path.basename(yaml_file)

    # The earlier scan only matched text; here the parsed key is required.
    if not yaml_dict or 'biosample_set' not in yaml_dict:
        continue

    database_biosample_data[filename] = {
        entry['id']: len(entry)
        for entry in yaml_dict['biosample_set']
        if 'id' in entry  # entries without an id are left out
    }


In [18]:
# Display the per-database-file mapping of biosample id -> field count.
database_biosample_data

{'Database-biosamples-dna-in-tube.yaml': {'nmdc:bsm-99-dtTMNb': 7},
 'Database-nmdc-example.yaml': {'nmdc:bsm-99-isqhuW': 24,
  'nmdc:bsm-99-dge3H9': 24,
  'nmdc:bsm-99-dc6tg6': 25},
 'Database-biosamples-infiltrations.yaml': {'nmdc:bsm-99-dtTMNb': 7,
  'nmdc:bsm-99-abcdef': 7,
  'nmdc:bsm-99-qwerty': 7},
 'Database-neon-story.yaml': {'nmdc:bsm-99-abcdef1': 6,
  'nmdc:bsm-99-abcdef2': 6,
  'nmdc:bsm-99-abcdef3': 6},
 'Database-biosamples-dna-in-plate-valid-well-val.yaml': {'nmdc:bsm-99-dtTMNb': 8,
  'nmdc:bsm-99-000001': 8,
  'nmdc:bsm-99-000002': 8,
  'nmdc:bsm-99-000003': 8,
  'nmdc:bsm-99-000004': 8,
  'nmdc:bsm-99-000005': 8,
  'nmdc:bsm-99-000006': 8},
 'Database-biosamples-rna-in-plate-valid-well-val.yaml': {'nmdc:bsm-99-dtTMNb': 8,
  'nmdc:bsm-99-000001': 8,
  'nmdc:bsm-99-000002': 8,
  'nmdc:bsm-99-000003': 8,
  'nmdc:bsm-99-000004': 8,
  'nmdc:bsm-99-000005': 8,
  'nmdc:bsm-99-000006': 8},
 'Database-biosamples-sites.yaml': {'nmdc:bsm-99-J9FcnC': 10,
  'nmdc:bsm-99-BdlWdQ': 10

In [19]:
# # Integrate the data from both dictionaries
# integrated_biosample_data = {}

In [20]:
# # Process biosample_file_field_counts
# for filename, data in biosample_file_field_counts.items():
#     biosample_id = data['id']
#     field_count = data['field_count']
#     if biosample_id not in integrated_biosample_data:
#         integrated_biosample_data[biosample_id] = {
#             'individual_files': {},
#             'database_files': {}
#         }
#     integrated_biosample_data[biosample_id]['individual_files'][filename] = field_count

In [21]:
# # Process database_biosample_data  
# for db_file, biosamples in database_biosample_data.items():
#     for biosample_id, field_count in biosamples.items():
#         if biosample_id not in integrated_biosample_data:
#             integrated_biosample_data[biosample_id] = {
#                 'individual_files': {},
#                 'database_files': {}
#             }
#         integrated_biosample_data[biosample_id]['database_files'][db_file] = field_count


In [22]:
# integrated_biosample_data

In [23]:
# Top-level keys present in the "possibly exhaustive" Biosample example.
with open("../src/data/valid/Biosample-possibly-exhaustive.yaml") as f:
    biosample = yaml.safe_load(f)
current_keys = list(biosample)


In [26]:
# Biosample slots that the "possibly exhaustive" example file does not use.
exhaustive_failures = list(set(biosample_slots) - set(current_keys))


In [27]:
# Sort in place so the missing slots print alphabetically.
exhaustive_failures.sort()

In [28]:
# One missing slot name per line.
for missing_slot in exhaustive_failures:
    print(missing_slot)

bulk_elect_conductivity
core field
environment field
host_disease_stat
infiltrations
investigation field
neon_biosample_identifiers
nitrate_nitrogen
nucleic acid sequence source field
sequencing field


In [31]:
# Slot name -> slot definition as returned by all_slots(); later cells read
# each definition's .abstract and .is_a.
slots = schema_view.all_slots()

In [32]:
# Names of all slots flagged as abstract in the schema.
abstract_slots = [name for name, definition in slots.items() if definition.abstract]

In [36]:
# Sort in place so the abstract slots print alphabetically.
abstract_slots.sort()

In [37]:
# One abstract slot name per line.
for abstract_slot in abstract_slots:
    print(abstract_slot)

analysis_identifiers
assembly_identifiers
biosample_identifiers
core field
environment field
external_database_identifiers
gold_path_field
investigation field
metagenome_assembly_parameter
mixs_env_triad_field
nucleic acid sequence source field
omics_processing_identifiers
read_qc_analysis_statistic
sequencing field
study_identifiers


In [34]:
# Abstract slots that nevertheless appear among the Biosample slots, sorted.
intersection = sorted(set(biosample_slots) & set(abstract_slots))
print("Slots that are both in biosample_slots and abstract_slots:")
for slot in intersection:
    print(slot)


Slots that are both in biosample_slots and abstract_slots:
core field
environment field
investigation field
nucleic acid sequence source field
sequencing field


In [38]:
# Map each parent slot name to the list of slots that name it via is_a.
is_a_hierarchy = {}

for slot_name, slot_info in slots.items():
    parent_name = getattr(slot_info, 'is_a', None)
    if parent_name:
        # Create the child list on first sight of a parent, then append.
        is_a_hierarchy.setdefault(parent_name, []).append(slot_name)


# Print the hierarchy recursively
def print_hierarchy(parent, level=0, hierarchy=None):
    """Print ``parent`` and all of its descendants as an indented tree.

    Args:
        parent: Name of the slot to print at this level.
        level: Current depth; each level adds two spaces of indentation.
        hierarchy: Optional mapping of parent name -> list of child names.
            When omitted, the notebook-level ``is_a_hierarchy`` is used, so
            existing ``print_hierarchy(root)`` calls behave as before.
    """
    if hierarchy is None:
        hierarchy = is_a_hierarchy
    print(f"{'  ' * level}{parent}")
    # Names without an entry are leaves and simply have no children to print.
    for child in hierarchy.get(parent, ()):
        print_hierarchy(child, level + 1, hierarchy)


# Print starting from root slots: parents that are never listed as a child.
all_children = [child for children in is_a_hierarchy.values() for child in children]
# Membership is tested against a set; scanning the list once per slot made
# this step quadratic in the number of slots.
child_names = set(all_children)
root_slots = [slot for slot in slots.keys() if slot in is_a_hierarchy and slot not in child_names]

print("Slot Hierarchy:")
for root in root_slots:
    print_hierarchy(root)


Slot Hierarchy:
concentration
  source_concentration
  final_concentration
gff_coordinate
  end
  start
biomaterial_purity
  dna_absorb1
  dna_absorb2
assembly_identifiers
  insdc_assembly_identifiers
core field
  abs_air_humidity
  add_recov_method
  additional_info
  address
  adj_room
  aero_struc
  agrochem_addition
  air_PM_concen
  air_temp
  air_temp_regm
  al_sat
  al_sat_meth
  alkalinity
  alkalinity_method
  alkyl_diethers
  aminopept_act
  ammonium
  amount_light
  ances_data
  annual_precpt
  annual_temp
  antibiotic_regm
  api
  arch_struc
  aromatics_pc
  asphaltenes_pc
  atmospheric_data
  avg_dew_point
  avg_occup
  avg_temp
  bac_prod
  bac_resp
  bacteria_carb_prod
  barometric_press
  basin
  bathroom_count
  bedroom_count
  benzene
  biochem_oxygen_dem
  biocide
  biocide_admin_method
  biol_stat
  biomass
  biotic_regm
  bishomohopanol
  blood_press_diast
  blood_press_syst
  bromide
  build_docs
  build_occup_type
  building_setting
  built_struc_age
  built_stru

In [40]:
# Name -> element mapping returned by all_elements(); only the names are
# inspected in the next cell.
elements = schema_view.all_elements()

In [41]:
# Schema element names that contain a space, in alphabetical order.
whitespace_elements = sorted(name for name in elements.keys() if ' ' in name)
for element in whitespace_elements:
    print(element)


core field
environment field
investigation field
nucleic acid sequence source field
sequencing field


In [42]:
# Both lists were sorted when they were built, so list equality here
# compares their contents rather than incidental ordering.
print("Are intersection and whitespace_elements identical?")
print(intersection == whitespace_elements)
print("\nIntersection:", intersection)
print("Whitespace elements:", whitespace_elements)


Are intersection and whitespace_elements identical?
True

Intersection: ['core field', 'environment field', 'investigation field', 'nucleic acid sequence source field', 'sequencing field']
Whitespace elements: ['core field', 'environment field', 'investigation field', 'nucleic acid sequence source field', 'sequencing field']


In [47]:
# Descendants of alternative_identifiers; per the recorded output the
# returned list also contains 'alternative_identifiers' itself.
identifiers = schema_view.slot_descendants("alternative_identifiers")

In [48]:
# Debugging aid: confirms slot_descendants returned a plain list.
print(type(identifiers))

<class 'list'>


In [49]:
# Display the descendant slot names.
identifiers

['alternative_identifiers',
 'external_database_identifiers',
 'img_identifiers',
 'study_identifiers',
 'biosample_identifiers',
 'omics_processing_identifiers',
 'insdc_experiment_identifiers',
 'analysis_identifiers',
 'gold_analysis_project_identifiers',
 'jgi_portal_analysis_project_identifiers',
 'insdc_analysis_identifiers',
 'mgnify_analysis_identifiers',
 'gold_sequencing_project_identifiers',
 'neon_biosample_identifiers',
 'gold_biosample_identifiers',
 'insdc_biosample_identifiers',
 'insdc_secondary_sample_identifiers',
 'emsl_biosample_identifiers',
 'igsn_biosample_identifiers',
 'jgi_portal_study_identifiers',
 'neon_study_identifiers',
 'insdc_sra_ena_study_identifiers',
 'insdc_bioproject_identifiers',
 'gold_study_identifiers',
 'mgnify_project_identifiers',
 'gnps_task_identifiers',
 'emsl_project_identifiers']

In [50]:
# Abstract slots that are not among the alternative_identifiers descendants, sorted.
difference = sorted(set(abstract_slots) - set(identifiers))

print("Slots that are in abstract_slots but not in identifiers:")
for slot in difference:
    print(slot)


Slots that are in abstract_slots but not in identifiers:
assembly_identifiers
core field
environment field
gold_path_field
investigation field
metagenome_assembly_parameter
mixs_env_triad_field
nucleic acid sequence source field
read_qc_analysis_statistic
sequencing field
