In [1]:
import json
from jsonschema import validate

from generate_schemas import write_schema

# Version string passed to write_schema() for every schema written in the last cell.
VERSION = "0.4.0"
# Used as the "maxProperties" limit of the user-supplied metadata blocks.
USER_KEY_LIMIT = 10

# Definitions

In [2]:
def _person_definition(description, email_required, extra_properties=None):
    """Build the JSON Schema definition shared by all "person_*" entries.

    Every person has the same five base fields; the variants differ only in
    their description, in whether "email" is required, and in optional
    role-specific fields.

    Args:
        description: Text for the definition's top-level "description".
        email_required: If True, "email" is tagged "REQ" in its description
            and added to "required"; otherwise it is tagged "RCM" and left
            out of "required".
        extra_properties: Optional dict of additional property schemas,
            appended after the base fields.

    Returns:
        A new dict (nothing is shared between calls) with the keys
        "description", "type", "properties", "required" and
        "additionalProperties", in that order.
    """
    email_tag = "REQ" if email_required else "RCM"
    properties = {
        "given_name": {
            "description": "REQ: The person's given (or first) name",
            "type": "string"
        },
        "family_name": {
            "description": "REQ: The person's family (or last) name",
            "type": "string"
        },
        "full_name": {
            "description": "INTERNAL: Given and family names, concatenated.",
            "type": "string"
        },
        "email": {
            "description": email_tag + ": The person's email address",
            "type": "string"
        },
        "institution": {
            "description": "RCM: The primary affiliation for the person",
            "type": "string"
        }
    }
    # Role-specific fields go last so the base field order stays stable.
    properties.update(extra_properties or {})

    required = ["given_name", "family_name", "full_name"]
    if email_required:
        required.append("email")

    return {
        "description": description,
        "type": "object",
        "properties": properties,
        "required": required,
        # Unlisted fields are accepted as long as their value is a string.
        "additionalProperties": {
            "type": "string"
        }
    }


# Shared sub-schemas, referenced from the block schemas via "#/definitions/<name>".
definitions = {
    "data_link": {
        "description": "RCM: Links to raw data files from the dataset (multiple allowed, field name should be data type)",
        "type": "object",
        "properties": {
            "globus_endpoint": {
                "description": "RCM: The ID of the Globus Endpoint hosting the file",
                "type": "string"
            },
            "http_host": {
                "description": "RCM: The fully-qualified HTTP hostname, including protocol, but without the path (for example, 'https://data.materialsdatafacility.org')",
                "type": "string",
                "format": "uri"
            },
            "path": {
                "description": "REQ: The full path to the data file on the host",
                "type": "string"
            }
        },
        # A link needs "path" plus at least one way to reach the host.
        "anyOf": [
            {"required": ["path", "globus_endpoint"]},
            {"required": ["path", "http_host"]}
        ]
    },
    "person_author": _person_definition(
        "Identifying information for an author",
        email_required=False
    ),
    "person_data_contact": _person_definition(
        "Identifying information for a data contact",
        email_required=True
    ),
    "person_data_contributor": _person_definition(
        "Identifying information for a data contributor",
        email_required=True,
        extra_properties={
            "github": {
                "description": "RCM: The person's GitHub username",
                "type": "string"
            }
        }
    )
}

# Dataset schema

## MDF Block - Dataset

In [3]:
# JSON Schema for the "mdf" block of a dataset entry; it is used as the "mdf"
# property of dataset_schema.
# Each description starts with a tag (REQ / RCM / OPT / INTERNAL).
# NOTE(review): presumably required / recommended / optional / set internally -
# the tags are not defined in this notebook, confirm against the project docs.
mdf_dataset = {
    "title": "MDF Block - Dataset",
    "description": "REQ: MDF-format dataset metadata",
    "type": "object",
    "properties": {
        "title": {
            "description": "REQ: The title of the dataset",
            "type": "string"
        },
        "acl": {
            "description": "REQ: The UUIDs allowed to view this metadata, or 'public'",
            "type": "array",
            "items": {
                "type": "string"
            }
        },
        "source_name": {
            "description": "REQ: A short version of the dataset name, for quick reference. Spaces and dashes will be replaced with underscores, and other non-alphanumeric characters will be removed.",
            "type": "string",
            # Rejects any source_name that begins with "mdf".
            "not": {
                "pattern": "^mdf"
            }
        },
        "data_contact": {
            "description": "REQ: The contact person/steward/custodian for the dataset",
            "$ref": "#/definitions/person_data_contact"
        },
        "data_contributor": {
            "description": "REQ: The person/people contributing the tools (harvester, this converter) to ingest the dataset",
            "type": "array",
            "items": {
                "$ref": "#/definitions/person_data_contributor"
            }
        },
        "citation": {
            "description": "RCM: The full bibliographic citation(s) for the dataset",
            "type": "array",
            "items": {
                "type": "string"
            }
        },
        "author": {
            "description": "RCM: A list of the authors of this dataset",
            "type": "array",
            "items": {
                "$ref": "#/definitions/person_author"
            }
        },
        "license": {
            "description": "RCM: A link to the license for distribution of the dataset",
            "type": "string"
        },
        "repository": {
            "description": "RCM: The repository (that should already be in MDF) holding the dataset",
            "type": "string"
        },
        "collection": {
            "description": "RCM: The collection for the dataset, commonly a portion of the title",
            "type": "string"
        },
        "tags": {
            "description": "RCM: Tags, keywords, or other general descriptors for the dataset",
            "type": "array",
            "items": {
                "type": "string"
            }
        },
        "description": {
            "description": "RCM: A description of the dataset",
            "type": "string"
        },
        "year": {
            "description": "RCM: The year of dataset creation",
            "type": "integer"
        },
        "links": {
            "description": "REQ: Links relating to the dataset",
            "type": "object",
            "properties": {
                "landing_page": {
                    "description": "REQ: The human-friendly landing page for the dataset",
                    "type": "string",
                    "format": "uri"
                },
                "publication": {
                    "description": "RCM: The DOI(s) (in link form, ex. 'https://dx.doi.org/10.12345') for publications connected to the dataset",
                    "type": "array",
                    "items": {
                        "type": "string",
                        "format": "uri"
                    }
                },
                "data_doi": {
                    "description": "RCM: The DOI of the dataset itself (in link form)",
                    "type": "string",
                    "format": "uri"
                },
                "related_id": {
                    "description": "OPT: The mdf-id(s) of related entries, not including records from this dataset",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "parent_id": {
                    "description": "INTERNAL: The mdf-id of this dataset's repository, if applicable",
                    "type": "string"
                }
            },
            # Any key in "links" other than the five above must be a data_link object.
            "additionalProperties": {
                "$ref": "#/definitions/data_link"
            },
            "required": [
                "landing_page"
            ]
        },
        "ingest_date": {
            "description": "INTERNAL: The date this entry was created",
            "type": "string",
            "format": "date-time"
        },
        "metadata_version": {
            "description": "INTERNAL: The metadata schema version this entry adheres to",
            "type": "string"
        },
        "mdf_id": {
            "description": "INTERNAL: A unique BSON ID for this entry",
            "type": "string"
        },
        "resource_type": {
            "description": "INTERNAL: The type of entry (dataset)",
            "type": "string"
        },
    },
    # Note that the INTERNAL fields are required too, not only the REQ ones.
    "required": [
        "title",
        "acl",
        "source_name",
        "links",
        "data_contact",
        "data_contributor",
        "ingest_date",
        "metadata_version",
        "mdf_id",
        "resource_type"
    ],
    # Keys not listed under "properties" are rejected inside the mdf block.
    "additionalProperties": False
}

## DC Block - Dataset

In [4]:
# Schema for the "dc" (DataCite-format) block of a dataset entry.
# No properties are declared or restricted, so any JSON object is accepted.
dc_dataset = dict(
    title="DC Block - Dataset",
    description="OPT: DataCite-format metadata",
    type="object",
    properties={},
)

## User-supplied fields

In [5]:
# Schema for the free-form, user-supplied block of a dataset entry.
# Any object is accepted as long as it has at most USER_KEY_LIMIT keys.
user_dataset = dict(
    title="User Block - Dataset",
    description="RCM: User-supplied metadata",
    type="object",
    properties={},
    maxProperties=USER_KEY_LIMIT,
)

## Full dataset schema

In [6]:
# Complete schema for a dataset entry: an "mdf" block, a "dc" block, and at
# most one extra top-level key holding user-supplied metadata.
dataset_schema = {
    "$schema": "http://json-schema.org/draft-06/schema#",
    "title": "MDF Dataset",
    "description": "A dataset in the Materials Data Facility",
    # Without an explicit type, a non-object instance (a string, a list, ...)
    # would pass: "properties", "required" and "minProperties"/"maxProperties"
    # only constrain instances that are objects.
    "type": "object",
    "definitions": definitions,
    "properties":{
        "mdf": mdf_dataset,
        "dc": dc_dataset
    },
    "required": [
        "mdf",
        "dc"
    ],
    # Any top-level key other than "mdf"/"dc" is validated as a user block.
    "additionalProperties": user_dataset,
    # "mdf" + "dc" are mandatory, so the limits allow zero or one user block.
    "minProperties": 2,
    "maxProperties": 3
}

## Validate an example dataset entry

In [7]:
# Minimal example entry: every required "mdf" field, an empty "dc" block, and
# one user-supplied block ("misc").
dataset = {
    "mdf": {
        "title": "test",
        "acl": ["blah"],
        "source_name": "source name",
        "citation": ["abc"],
        "links": {
            "landing_page": "http://www.globus.org"
        },
        "data_contact": {
            "given_name": "Test",
            "family_name": "McTesterson",
            "full_name": "Test McTesterson",
            "email": "test@example.com"
        },
        "data_contributor": [{
            "given_name": "Test",
            "family_name": "McTesterson",
            "full_name": "Test McTesterson",
            "email": "test@example.com"
        }],
        # The schema declares "format": "date-time" for this field, so the
        # example uses an RFC 3339 timestamp. validate() does not check
        # "format" unless a format_checker is passed, which is why a value
        # such as "Jan 1, 2017" would go unnoticed here.
        "ingest_date": "2017-01-01T00:00:00Z",
        "metadata_version": "1.1",
        "mdf_id": "1",
        "resource_type": "dataset"
    },
    "dc": {},
    "misc": {}
}

# Raises jsonschema.ValidationError if the example does not match the schema.
validate(dataset, dataset_schema)

# Record schema

## MDF Block - Record

In [8]:
# JSON Schema for the "mdf" block of a record entry; it is used as the "mdf"
# property of record_schema.
# Each description starts with a tag (REQ / RCM / OPT / INTERNAL).
# NOTE(review): presumably required / recommended / optional / set internally -
# the tags are not defined in this notebook, confirm against the project docs.
mdf_record = {
    "title": "MDF Block - Record",
    "description": "REQ: MDF-format record metadata",
    "type": "object",
    "properties": {
        "title": {
            "description": "REQ: The title of the record",
            "type": "string"
        },
        # NOTE(review): described as RCM with a default, yet listed under
        # "required" below - presumably the default is filled in before
        # validation; confirm.
        "acl": {
            "description": "RCM: The UUIDs allowed to view this metadata, or 'public' (defaults to the dataset ACL)",
            "type": "array",
            "items": {
                "type": "string"
            }
        },
        "composition": {
            "description": "RCM: Subject material composition, expressed in a chemical formula (ex. Bi2S3)",
            "type": "string"
        },
        "tags": {
            "description": "RCM: Tags, keywords, or other general descriptors for the record",
            "type": "array",
            "items": {
                "type": "string"
            }
        },
        "description": {
            "description": "RCM: A description of the record",
            "type": "string"
        },
        "raw": {
            "description": "RCM: The record as a JSON string (see json.dumps())",
            "type": "string"
        },
        "links": {
            "description": "REQ: Links relating to the record",
            "type": "object",
            "properties": {
                # NOTE(review): same RCM-but-required situation as "acl" above.
                "landing_page": {
                    "description": "RCM: The human-friendly landing page for the record (defaults to the dataset landing page)",
                    "type": "string",
                    "format": "uri"
                },
                "publication": {
                    "description": "RCM: The DOI(s) (in link form, ex. 'https://dx.doi.org/10.12345') for publications specific to this record",
                    "type": "array",
                    "items": {
                        "type": "string",
                        "format": "uri"
                    }
                },
                "data_doi": {
                    "description": "RCM: The DOI of the record itself (in link form)",
                    "type": "string",
                    "format": "uri"
                },
                "related_id": {
                    "description": "OPT: The mdf-id(s) of related entries, not including the dataset entry",
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "parent_id": {
                    "description": "INTERNAL: The mdf-id of this record's dataset",
                    "type": "string"
                },
            },
            # Any key in "links" other than the five above must be a data_link object.
            "additionalProperties": {
                "$ref": "#/definitions/data_link"
            },
            "required": [
                "landing_page",
                "parent_id"
            ]
        },
        "citation": {
            "description": "OPT: The full bibliographic citation(s) for the record, if different from the dataset",
            "type": "array",
            "items": {
                "type": "string"
            }
        },
        "data_contact": {
            "description": "OPT: The contact person/steward/custodian for the record, if different from the dataset",
            "$ref": "#/definitions/person_data_contact"
        },
        "author": {
            "description": "OPT: A list of the authors of this record, if different from the dataset",
            "type": "array",
            "items": {
                "$ref": "#/definitions/person_author"
            }
        },
        "year": {
            # Was "year of dataset creation, if different from the dataset",
            # a copy-paste slip from the dataset block.
            "description": "OPT: The year of record creation, if different from the dataset",
            "type": "integer"
        },
        # "processing" and "structure" declare no type, so any value is accepted.
        "processing": {
            "description": "Undefined: Processing information"
        },
        "structure": {
            "description": "Undefined: Structure information"
        },
        "collection": {
            "description": "INTERNAL: The collection for the dataset, commonly a portion of the title",
            "type": "string"
        },
        "source_name": {
            "description": "INTERNAL: A short version of the dataset name, for quick reference. Spaces and dashes will be replaced with underscores, and other non-alphanumeric characters will be removed.",
            "type": "string"
        },
        "ingest_date": {
            "description": "INTERNAL: The date this entry was created",
            "type": "string",
            "format": "date-time"
        },
        "metadata_version": {
            "description": "INTERNAL: The metadata schema version this entry adheres to",
            "type": "string"
        },
        "mdf_id": {
            "description": "INTERNAL: A unique BSON ID for this entry",
            "type": "string"
        },
        "resource_type": {
            # Was "(dataset)", copied from the dataset block.
            "description": "INTERNAL: The type of entry (record)",
            "type": "string"
        },
        "elements": {
            "description": "INTERNAL: A list of the elements in the composition",
            "type": "array",
            "items": {
                "type": "string"
            }
        },
        "scroll_id": {
            "description": "INTERNAL: A sequential, dataset-unique ID number for the record, for aggregation purposes",
            "type": "integer"
        }
    },
    "required": [
        "title",
        "acl",
        "links",
        "source_name",
        "ingest_date",
        "metadata_version",
        "mdf_id",
        "resource_type"
    ],
    # Keys not listed under "properties" are rejected inside the mdf block.
    "additionalProperties": False
}

## DC Block - Record

In [9]:
# Schema for the "dc" (DataCite-format) block of a record entry.
# No properties are declared or restricted, so any JSON object is accepted.
dc_record = dict(
    title="DC Block - Record",
    description="OPT: DataCite-format metadata",
    type="object",
    properties={},
)

## User-supplied fields

In [10]:
# Schema for the free-form, user-supplied block of a record entry.
# Any object is accepted as long as it has at most USER_KEY_LIMIT keys.
user_record = {
    "title": "User Block - Record",
    "description": "RCM: User-supplied metadata",
    "type": "object",
    "properties": {},
    "maxProperties": USER_KEY_LIMIT
}
# This cell originally stored the record block under the name `user_dataset`,
# reusing the name from the dataset section. The binding is kept so code that
# reads `user_dataset` after this point still gets the record block.
# NOTE: because the name is rebound here, re-running the dataset schema cell
# after this one would pick up the record block instead of the dataset one.
user_dataset = user_record

## Full record schema

In [11]:
# Complete schema for a record entry: an "mdf" block, a "dc" block, and at
# most one extra top-level key holding user-supplied metadata.
record_schema = {
    "$schema": "http://json-schema.org/draft-06/schema#",
    "title": "MDF Record",
    "description": "A record in the Materials Data Facility",
    # Without an explicit type, a non-object instance (a string, a list, ...)
    # would pass: "properties", "required" and "minProperties"/"maxProperties"
    # only constrain instances that are objects.
    "type": "object",
    "definitions": definitions,
    "properties":{
        "mdf": mdf_record,
        "dc": dc_record
    },
    "required": [
        "mdf",
        "dc"
    ],
    # Any top-level key other than "mdf"/"dc" is validated as a user block.
    # `user_dataset` is the record user block at this point: the name was
    # rebound in the preceding "User-supplied fields" cell.
    "additionalProperties": user_dataset,
    # "mdf" + "dc" are mandatory, so the limits allow zero or one user block.
    "minProperties": 2,
    "maxProperties": 3
}

## Validate an example record 

In [12]:
# Minimal example entry: every required "mdf" field, an empty "dc" block, and
# one user-supplied block ("misc").
record = {
    "mdf": {
        "title": "test",
        "acl": ["blah"],
        "source_name": "source name",
        "citation": ["abc"],
        "links": {
            "landing_page": "http://www.globus.org",
            "parent_id": "abc"
        },
        # The schema declares "format": "date-time" for this field, so the
        # example uses an RFC 3339 timestamp. validate() does not check
        # "format" unless a format_checker is passed, which is why a value
        # such as "Jan 1, 2017" would go unnoticed here.
        "ingest_date": "2017-01-01T00:00:00Z",
        "metadata_version": "1.1",
        "mdf_id": "1",
        # This example is a record, not a dataset.
        "resource_type": "record"
    },
    "dc": {},
    "misc": {}
}

# Raises jsonschema.ValidationError if the example does not match the schema.
validate(record, record_schema)

# Write schemas to file and generate templates

In [13]:
# Each call prints whatever write_schema() returns for that schema.
# Dataset
print(write_schema(dataset_schema, "dataset", VERSION))
# Record
print(write_schema(record_schema, "record", VERSION))
# Repository is effectively a dataset containing datasets, can use same schema.
# The dataset schema is round-tripped through its JSON text so the wording can
# be renamed everywhere at once. Both capitalisations are replaced: with only
# the lowercase form, titles such as "MDF Dataset" and "MDF Block - Dataset"
# would survive unchanged in the repository schema.
# NOTE(review): the blanket replacement also rewrites descriptions that
# mention a dataset on purpose (e.g. "... holding the dataset").
repo_schema_text = (
    json.dumps(dataset_schema)
    .replace("dataset", "repository")
    .replace("Dataset", "Repository")
)
repo_schema = json.loads(repo_schema_text)
print(write_schema(repo_schema, "repository", VERSION))

{'success': True}
{'success': True}
{'success': True}
