In [1]:
import json
from jsonschema import validate

# Schema version string; it is embedded in the names of the schema files
# written later in this notebook ("dataset_<VERSION>.schema" and
# "record_<VERSION>.schema").
VERSION = "0.2.0"

# Dataset schema

In [2]:
# JSON Schema (draft-06) describing a *dataset* entry in the Materials Data
# Facility.
#
# Layout:
#   * "definitions" holds the reusable sub-schemas "data_link" and "person",
#     referenced below through "$ref".
#   * "properties" declares every known "mdf-*" field of a dataset entry.
#   * "required" lists the fields that every dataset entry must provide.
#
# Every name listed in "required" must be spelled exactly like its key in
# "properties"; otherwise the field is required but its type/format
# constraints are silently never applied.
dataset_schema = {
    "$schema": "http://json-schema.org/draft-06/schema#",
    "title": "Dataset",
    "description": "A dataset in the Materials Data Facility",
    "type": "object",
    "definitions": {
        "data_link": {
            "description": "A link to a raw data file from the dataset",
            "type": "object",
            "properties": {

                "globus_endpoint": {
                    "description": "The ID of the Globus Endpoint hosting the file",
                    "type": "string"
                },

                "http_host": {
                    "description": "The fully-qualified HTTP hostname, including protocol, but without the path (for example, 'https://data.materialsdatafacility.org')",
                    "type": "string",
                    "format": "uri"
                },

                "path": {
                    "description": "The full path to the data file on the host",
                    "type": "string"
                }
            },
            # A link needs a path plus at least one way to reach the host.
            "anyOf": [
                {"required": ["path", "globus_endpoint"]},
                {"required": ["path", "http_host"]}
            ]
        },

        "person": {
            "description": "Identifying information for a person",
            "type": "object",
            "properties": {

                "given_name": {
                    "description": "The person's given (or first) name",
                    "type": "string"
                },

                "family_name": {
                    "description": "The person's family (or last) name",
                    "type": "string"
                },

                "email": {
                    "description": "The person's email address",
                    "type": "string"
                },

                "institution": {
                    "description": "The primary affiliation for the person",
                    "type": "string"
                }
            },
            "required": [
                "given_name",
                "family_name"
            ],
            # NOTE(review): as written, any extra field on a person must be an
            # object, and only a key literally named "additionalProperties"
            # inside that object is constrained to be a string. This looks
            # like a nesting mistake (the inner "additionalProperties" was
            # probably meant to sit next to "type", not inside "properties").
            # Behavior is left unchanged until the intent is confirmed.
            "additionalProperties": {
                "type": "object",
                "properties": {
                    "additionalProperties": {
                        "type": "string"
                    }
                }
            }
        }
    },
    "properties": {

        "mdf-title": {
            "description": "The title of the dataset",
            "type": "string"
        },

        "mdf-acl": {
            "description": "The UUIDs allowed to view this metadata, or 'public'",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        "mdf-source_name": {
            "description": "A short version of the dataset name, for quick reference, with underscores instead of spaces",
            "type": "string"
        },

        "mdf-collection": {
            "description": "The collection for the dataset, commonly a portion of the title",
            "type": "string"
        },

        "mdf-data_format": {
            "description": "The file format(s) of the data (for example, 'vasp')",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        "mdf-data_type": {
            "description": "The broad categorization(s) of the data (for example, DFT)",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        "mdf-citation": {
            "description": "The full bibliographic citation(s) for the dataset",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        "mdf-license": {
            "description": "A link to the license for distribution of the dataset",
            "type": "string"
        },

        "mdf-author": {
            "description": "A list of the authors of this dataset",
            "type": "array",
            "items": {
                "$ref": "#/definitions/person"
            }
        },

        "mdf-data_contact": {
            "description": "The contact person/steward/custodian for the dataset",
            "$ref": "#/definitions/person"
        },

        "mdf-description": {
            "description": "A description of the dataset",
            "type": "string"
        },

        "mdf-tags": {
            "description": "Tags, keywords, or other general descriptors for the dataset",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        "mdf-year": {
            "description": "The year of dataset creation",
            "type": "integer"
        },

        # Key spelled with an underscore so that it matches the
        # "mdf-ingest_date" entry in "required" below. It was previously
        # declared as "mdf-ingest-date", which left the required field
        # without any type/format constraint.
        "mdf-ingest_date": {
            "description": "The date this entry was created",
            "type": "string",
            "format": "date-time"
        },

        "mdf-metadata_version": {
            "description": "The metadata schema version this entry adheres to",
            "type": "string"
        },

        "mdf-links": {
            "description": "Links relating to the dataset",
            "type": "object",
            "properties": {

                "mdf-landing_page": {
                    "description": "The human-friendly landing page for the dataset",
                    "type": "string",
                    "format": "uri"
                },

                "mdf-publication": {
                    "description": "The DOI(s) (in link form, ex. 'https://dx.doi.org/10.12345') for publications connected to the dataset",
                    "type": "array",
                    "items": {
                        "type": "string",
                        "format": "uri"
                    }
                },

                "mdf-dataset_doi": {
                    "description": "The DOI of the dataset itself (in link form)",
                    "type": "string",
                    "format": "uri"
                },

                "mdf-related_id": {
                    "description": "The mdf-id(s) of related entries, not including records from this dataset",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                }
            },
            # Any link not named above must be a data_link object.
            "additionalProperties": {
                "$ref": "#/definitions/data_link"
            },
            "required": [
                "mdf-landing_page"
            ]
        },

        "mdf-id": {
            "description": "A unique BSON ID for this entry",
            "type": "string"
        },

        "mdf-node_type": {
            "description": "The type of entry (dataset)",
            "type": "string"
        },

        "mdf-mrr": {
            "description": "Fields relating to the NIST Materials Resource Registry system",
            "type": "object",
            "properties": {}
        },

        "mdf-data_contributor": {
            "description": "The person contributing the tools to ingest the dataset (harvester, converter)",
            "type": "array",
            "items": {
                "$ref": "#/definitions/person"
            }
        }
    },
    "required": [
        "mdf-title",
        "mdf-acl",
        "mdf-source_name",
        "mdf-citation",
        "mdf-links",
        "mdf-data_contact",
        "mdf-ingest_date",
        "mdf-metadata_version",
        "mdf-id",
        "mdf-node_type"
    ]
}

## Write the dataset schema to file

In [3]:
# Persist the dataset schema as JSON text; the output file name embeds VERSION.
with open("dataset_" + VERSION + ".schema", mode="w") as ds_md_file:
    ds_md_file.write(json.dumps(dataset_schema))

## Validate an example dataset entry

In [4]:
# Minimal example dataset entry: every field listed in the schema's
# "required" array, plus the optional "mdf-mrr" block.
#
# The values follow the constraints documented in the schema:
#   * "mdf-source_name" uses underscores instead of spaces.
#   * "mdf-ingest_date" is an RFC 3339 date-time string, matching the
#     "date-time" format declared for the ingest date.
dataset = {
    "mdf-title": "test",
    "mdf-acl": ["blah"],
    "mdf-source_name": "source_name",
    "mdf-citation": ["abc"],
    "mdf-links": {
        "mdf-landing_page": "http://www.globus.org"
    },
    "mdf-data_contact": {
        "given_name": "Test",
        "family_name": "McTesterson"
    },
    "mdf-ingest_date": "2017-01-01T00:00:00Z",
    "mdf-metadata_version": "1.1",
    "mdf-id": "1",
    "mdf-node_type": "dataset",
    "mdf-mrr": {
        "data_acquisition": "abc"
    }
}

# Raises jsonschema.ValidationError if the example does not satisfy the schema.
validate(dataset, dataset_schema)

# Record schema

In [5]:
# JSON Schema (draft-06) describing a *record* entry in the Materials Data
# Facility.
#
# Layout:
#   * "definitions" holds the reusable sub-schemas "data_link" and "person",
#     referenced below through "$ref".
#   * "properties" declares every known "mdf-*" field of a record entry.
#   * "required" lists the fields that every record entry must provide.
#
# Every name listed in "required" must be spelled exactly like its key in
# "properties"; otherwise the field is required but its type/format
# constraints are silently never applied.
record_schema = {
    "$schema": "http://json-schema.org/draft-06/schema#",
    "title": "Record",
    "description": "A record in the Materials Data Facility",
    "type": "object",
    "definitions": {
        "data_link": {
            "description": "A link to a raw data file from the dataset",
            "type": "object",
            "properties": {

                "globus_endpoint": {
                    "description": "The ID of the Globus Endpoint hosting the file",
                    "type": "string"
                },

                "http_host": {
                    "description": "The fully-qualified HTTP hostname, including protocol, but without the path (for example, 'https://data.materialsdatafacility.org')",
                    "type": "string",
                    "format": "uri"
                },

                "path": {
                    "description": "The full path to the data file on the host",
                    "type": "string"
                }
            },
            # A link needs a path plus at least one way to reach the host.
            "anyOf": [
                {"required": ["path", "globus_endpoint"]},
                {"required": ["path", "http_host"]}
            ]
        },

        "person": {
            "description": "Identifying information for a person",
            "type": "object",
            "properties": {

                "given_name": {
                    "description": "The person's given (or first) name",
                    "type": "string"
                },

                "family_name": {
                    "description": "The person's family (or last) name",
                    "type": "string"
                },

                "email": {
                    "description": "The person's email address",
                    "type": "string"
                },

                "institution": {
                    "description": "The primary affiliation for the person",
                    "type": "string"
                }
            },
            "required": [
                "given_name",
                "family_name"
            ],
            # NOTE(review): as written, any extra field on a person must be an
            # object, and only a key literally named "additionalProperties"
            # inside that object is constrained to be a string. This looks
            # like a nesting mistake (the inner "additionalProperties" was
            # probably meant to sit next to "type", not inside "properties").
            # Behavior is left unchanged until the intent is confirmed.
            "additionalProperties": {
                "type": "object",
                "properties": {
                    "additionalProperties": {
                        "type": "string"
                    }
                }
            }
        }
    },
    "properties": {

        "mdf-title": {
            "description": "The title of the record",
            "type": "string"
        },

        "mdf-acl": {
            "description": "The UUIDs allowed to view this metadata, or 'public'",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        "mdf-collection": {
            "description": "The collection for the record, if different from the dataset",
            "type": "string"
        },

        "mdf-data_format": {
            "description": "The file format(s) of the data (for example, 'vasp'), if different from the dataset",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        "mdf-data_type": {
            "description": "The broad categorization(s) of the data (for example, DFT), if different from the dataset",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        "mdf-citation": {
            "description": "The full bibliographic citation(s) for the record, if different from the dataset",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        "mdf-license": {
            "description": "A link to the license for distribution of the record, if different from the dataset",
            "type": "string"
        },

        "mdf-author": {
            "description": "A list of the authors of this record, if different from the dataset",
            "type": "array",
            "items": {
                "$ref": "#/definitions/person"
            }
        },

        "mdf-data_contact": {
            "description": "The contact person/steward/custodian for the record, if different from the dataset",
            "$ref": "#/definitions/person"
        },

        "mdf-description": {
            "description": "A description of the record",
            "type": "string"
        },

        "mdf-tags": {
            "description": "Tags, keywords, or other specific descriptors for the record not in the dataset tags",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        "mdf-year": {
            "description": "The year of record creation, if different from the dataset",
            "type": "integer"
        },

        # Key spelled with an underscore so that it matches the
        # "mdf-ingest_date" entry in "required" below. It was previously
        # declared as "mdf-ingest-date", which left the required field
        # without any type/format constraint.
        "mdf-ingest_date": {
            "description": "The date this entry was created",
            "type": "string",
            "format": "date-time"
        },

        "mdf-metadata_version": {
            "description": "The metadata schema version this entry adheres to",
            "type": "string"
        },

        "mdf-links": {
            "description": "Links relating to the record",
            "type": "object",
            "properties": {

                "mdf-landing_page": {
                    "description": "The human-friendly landing page for the record",
                    "type": "string",
                    "format": "uri"
                },

                "mdf-publication": {
                    "description": "The DOI(s) (in link form, ex. 'https://dx.doi.org/10.12345') for publications connected to the record, if different from the dataset",
                    "type": "array",
                    "items": {
                        "type": "string",
                        "format": "uri"
                    }
                },

                "mdf-dataset_doi": {
                    "description": "The DOI of the record itself (in link form), if separate from the dataset",
                    "type": "string",
                    "format": "uri"
                },

                "mdf-parent_id": {
                    "description": "The mdf-id of this record's dataset",
                    "type": "string"
                },

                "mdf-related_id": {
                    "description": "The mdf-id(s) of related entries",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                }
            },
            # Any link not named above must be a data_link object.
            "additionalProperties": {
                "$ref": "#/definitions/data_link"
            },
            "required": [
                "mdf-landing_page",
                "mdf-parent_id"
            ]
        },

        "mdf-id": {
            "description": "A unique BSON ID for this entry",
            "type": "string"
        },

        "mdf-node_type": {
            "description": "The type of entry (record)",
            "type": "string"
        },

        # No "type" keyword: any JSON value is accepted for this field.
        "mdf-processing": {
            "description": "Processing information"
        },

        "mdf-composition": {
            "description": "Subject material composition, expressed in a chemical formula (ex. Bi2S3)",
            "type": "string"
        },

        "mdf-element": {
            "description": "The element(s) in the material composition",
            "type": "array",
            "items": {
                "type": "string"
            }
        },

        # No "type" keyword: any JSON value is accepted for this field.
        "mdf-structure": {
            "description": "Structure information"
        },

        "mdf-raw": {
            "description": "A string containing the record as JSON",
            "type": "string"
        },

        "mdf-mrr": {
            "description": "Fields relating to the NIST Materials Resource Registry system",
            "type": "object",
            "properties": {}
        },

        "mdf-data_contributor": {
            "description": "The person contributing the tools to ingest the dataset (harvester, converter)",
            "type": "array",
            "items": {
                "$ref": "#/definitions/person"
            }
        }
    },
    "required": [
        "mdf-title",
        "mdf-acl",
        "mdf-links",
        "mdf-ingest_date",
        "mdf-metadata_version",
        "mdf-id",
        "mdf-node_type"
    ]
}

## Write the record schema to file

In [6]:
# Persist the record schema as JSON text; the output file name embeds VERSION.
with open("record_" + VERSION + ".schema", mode="w") as rc_md_file:
    rc_md_file.write(json.dumps(record_schema))

## Validate an example record 

In [7]:
# Minimal example record entry: every field listed in the record schema's
# "required" array (including the "mdf-parent_id" link back to the owning
# dataset), plus a few optional/extra fields.
#
# The values follow the constraints documented in the schema:
#   * "mdf-node_type" is "record" (this entry is a record, not a dataset).
#   * "mdf-ingest_date" is an RFC 3339 date-time string, matching the
#     "date-time" format declared for the ingest date.
record = {
    "mdf-title": "test",
    "mdf-acl": ["blah"],
    "mdf-source_name": "source name",
    "mdf-citation": ["abc"],
    "mdf-links": {
        "mdf-landing_page": "http://www.globus.org",
        "mdf-parent_id": "abc"
    },
    "mdf-ingest_date": "2017-01-01T00:00:00Z",
    "mdf-metadata_version": "1.1",
    "mdf-id": "1",
    "mdf-node_type": "record",
    "mdf-mrr": {
        "data_acquisition": "abc"
    }
}

# Raises jsonschema.ValidationError if the example does not satisfy the schema.
validate(record, record_schema)