## Import libraries

In [1]:
from jsonschema import validate, RefResolver, Draft7Validator, ValidationError
import json
from pyld import jsonld

## Functions

In [2]:
def load_json(filepath: str) -> dict:
    """Read a JSON document from disk and return its parsed content.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (an object/dict for the files used in this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even when parsing fails; the
    # encoding is pinned to UTF-8 (the JSON interchange encoding) so the result
    # does not depend on the platform's locale.
    with open(filepath, "r", encoding="utf-8") as handle:
        return json.load(handle)

def find_refs(obj, refs=None) -> list[str]:
    """Collect the target of every ``$ref`` found anywhere inside a parsed schema.

    The structure is walked recursively through both JSON objects (dicts) and
    JSON arrays (lists), so references nested under array-valued keywords such
    as ``allOf``/``anyOf``/``oneOf`` are found as well.

    Args:
        obj: A parsed JSON value (dict, list or scalar).
        refs: Accumulator used by the recursion; leave as ``None`` to start a
            fresh list.

    Returns:
        The list of reference targets in traversal order, each with its first
        two characters removed.
    """
    if refs is None:
        refs = []

    if isinstance(obj, dict):
        for key, value in obj.items():
            if key == '$ref':
                # Drops the first two characters of the reference — presumably
                # a leading "./" in front of the schema filename; TODO confirm.
                refs.append(value[2:])
            else:
                find_refs(value, refs)
    elif isinstance(obj, list):
        # Subschemas can live inside arrays; descend into every element.
        for item in obj:
            find_refs(item, refs)

    return refs

def find_ref_schemas(schemas_filenames: list[str], refs = None):
    """Resolve the transitive closure of schema files reachable through ``$ref``.

    Starting from ``schemas_filenames``, every schema is loaded from the
    ``New_Metadata_Schemas`` directory, its references are extracted with
    ``find_refs`` and followed depth-first. A filename that has already been
    recorded is skipped, which also terminates cyclic references.

    Args:
        schemas_filenames: Filenames (relative to ``New_Metadata_Schemas``) to
            start from.
        refs: Accumulator of filenames already visited; leave as ``None`` to
            start a fresh list.

    Returns:
        The list of visited schema filenames in first-visit order.
    """
    if refs is None:
        refs = []

    for schema_filename in schemas_filenames:
        # Check before touching the disk: an already visited schema does not
        # need to be read and parsed again.
        if schema_filename in refs:
            continue

        schema = load_json(f"New_Metadata_Schemas/{schema_filename}")
        refs.append(schema_filename)
        find_ref_schemas(find_refs(schema), refs)

    return refs

def set_up_validator(json_schema_filename):
    """Build a Draft 7 validator for a schema and everything it references.

    The schema named by ``json_schema_filename`` and all schemas reachable from
    it (as reported by ``find_ref_schemas``) are loaded from the
    ``New_Metadata_Schemas`` directory. Each one is registered in the
    resolver's store under its ``$id`` and its ``@context`` is merged into a
    single dictionary (later schemas overwrite earlier entries with the same
    key).

    Args:
        json_schema_filename: Filename of the entry-point schema, relative to
            ``New_Metadata_Schemas``.

    Returns:
        A ``(validator, context)`` tuple: the ``Draft7Validator`` built on the
        entry-point schema and the merged ``@context`` dictionary.

    Raises:
        KeyError: If a loaded schema has no ``@context`` or no ``$id`` key.
    """
    filenames_in_visit_order = find_ref_schemas([json_schema_filename])

    root_schema = load_json(f"New_Metadata_Schemas/{filenames_in_visit_order[0]}")
    # NOTE(review): RefResolver is deprecated in recent jsonschema releases in
    # favour of the `referencing` library — consider migrating.
    ref_resolver = RefResolver(base_uri="http://example.com/", referrer = root_schema)

    merged_context = {}
    for filename in filenames_in_visit_order:
        loaded_schema = load_json(f"New_Metadata_Schemas/{filename}")
        merged_context.update(loaded_schema["@context"])
        # Register under "$id" so the resolver finds the schema in its store.
        ref_resolver.store[loaded_schema["$id"]] = loaded_schema

    return Draft7Validator(root_schema, resolver=ref_resolver), merged_context

# Function to expand context terms
def expand_context(base_context):
    """Return a copy of a context with prefixed values expanded.

    A string value of the form ``prefix:suffix`` whose ``prefix`` is itself a
    key of ``base_context`` is replaced by ``base_context[prefix] + suffix``.
    Every other value (non-strings, strings without a colon, strings with an
    unknown prefix) is copied unchanged. Only one level of expansion is done.

    Args:
        base_context: Mapping of terms to their definitions.

    Returns:
        A new dict with the same keys, in the same order, and expanded values.
    """
    def _expand(term_value):
        # Only "prefix:suffix" strings are candidates for expansion.
        if not isinstance(term_value, str) or ':' not in term_value:
            return term_value
        prefix, suffix = term_value.split(':', 1)
        if prefix not in base_context:
            return term_value
        return base_context[prefix] + suffix

    return {term: _expand(term_value) for term, term_value in base_context.items()}

def replace_keys_recursive(data, old_key, new_key):
    """Rename ``old_key`` to ``new_key`` in place throughout a nested structure.

    Dicts and lists are traversed recursively; any other value is left alone.
    In a dict that contains ``old_key``, the value is moved under ``new_key``
    (replacing an existing ``new_key`` entry, if any) and then traversed too.

    Args:
        data: The nested dict/list structure to modify in place.
        old_key: Key to look for.
        new_key: Key to use instead.

    Returns:
        None. ``data`` is mutated.
    """
    if isinstance(data, list):
        for element in data:
            replace_keys_recursive(element, old_key, new_key)
        return

    if not isinstance(data, dict):
        return

    # Iterate over a snapshot because the dict is modified during the walk.
    for current_key in tuple(data):
        child_key = current_key
        if current_key == old_key:
            child_key = new_key
            data[new_key] = data.pop(old_key)
        replace_keys_recursive(data[child_key], old_key, new_key)

## Prepare the JSON Schema validator

In [3]:
# Build the Draft 7 validator for the molecule schema, together with the merged
# "@context" of that schema and every schema it references.
validator, complete_schema_context = set_up_validator("molecule.schema.json")

## Validate the JSON file

In [4]:
json_file = load_json("example.json")
schema_context = expand_context(complete_schema_context)

# Detach the document's "@context" so it is not validated as instance data.
# Popping with a default means a document without "@context" is validated
# as-is instead of raising KeyError before the validation even starts.
json_context = expand_context(json_file.pop("@context", {}))

# Whenever a term of the document and a term of the schema expand to the same
# value, rename the document's key to the name the schema uses.
for key1, value1 in schema_context.items():
    for key2, value2 in json_context.items():
        if value1 == value2:
            replace_keys_recursive(json_file, key2, key1)

try:
    validator.validate(instance = json_file)
    print('The JSON file is valid according to the schema.')
except ValidationError as e:
    # Prefer the custom "error_msg" of the failing (sub)schema when it has one.
    message = e.schema["error_msg"] if "error_msg" in e.schema else e.message
    print(f'Invalid data: {message}')

The JSON file is valid according to the schema.


In [5]:
# Show the instance as it was validated: "@context" removed and keys renamed
# to the schema's terms.
json_file

{'type': 'schema:MolecularEntity',
 'name': 'Methane',
 'sum_formula': 'CH4',
 'smiles': 'C',
 'empa_number': 500,
 'batch': 'a',
 'amount': {'value': 90, 'unit': 'mg'},
 'comments': '1',
 'storage': {'name': 'lab',
  'room': {'name': 'lab', 'institution': {'name': 'empa', 'address': 'empa'}}},
 'molecules': [{'type': 'schema:MolecularEntity',
   'name': 'Methane',
   'sum_formula': 'CH4',
   'smiles': 'C',
   'empa_number': 500,
   'batch': 'a',
   'storage': {'name': 'lab',
    'room': {'name': 'lab',
     'institution': {'name': 'empa', 'address': 'empa'}}}},
  {'type': 'schema:MolecularEntity',
   'name': 'Ethanol',
   'sum_formula': 'C2H6O',
   'smiles': 'CCO',
   'empa_number': 400,
   'batch': 'a',
   'storage': {'name': 'lab',
    'room': {'name': 'lab',
     'institution': {'name': 'empa', 'address': 'empa'}}}}]}