Skip to content

Commit

Permalink
Merge pull request #1281 from nextstrain/annotations-schema-updates
Browse files Browse the repository at this point in the history
Annotations schema updates
  • Loading branch information
jameshadfield committed Aug 20, 2023
2 parents 29bb674 + c530539 commit 37a0a07
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 63 deletions.
96 changes: 76 additions & 20 deletions augur/data/schema-annotations.json
Original file line number Diff line number Diff line change
@@ -1,34 +1,90 @@
{
"type" : "object",
"$schema": "http://json-schema.org/draft-06/schema#",
"title": "JSON object for the `annotations` key, typically produced by `augur translate`",
"description": "Coordinates etc of genes / genome",
"$id": "https://nextstrain.org/schemas/augur/annotations",
"title": "Schema for the 'annotations' property (node-data JSON) or the 'genome_annotations' property (auspice JSON)",
"properties": {
"nuc": {
"type": "object",
"allOf": [{ "$ref": "#/$defs/startend" }],
"properties": {
"start": {
"enum": [1],
"$comment": "nuc must begin at 1"
},
"strand": {
"type": "string",
"enum":["+"],
"description": "Strand is optional for nuc, as it should be +ve for all genomes (-ve strand genomes are reverse complemented)",
"$comment": "Auspice will not proceed if the JSON has strand='-'"
}
},
"additionalProperties": true,
"$comment": "All other properties are unused by Auspice."
}
},
"required": ["nuc"],
"patternProperties": {
"^[a-zA-Z0-9*_-]+$": {
"^(?!nuc)[a-zA-Z0-9*_-]+$": {
"$comment": "Each object here defines a single CDS",
"type": "object",
"oneOf": [{ "$ref": "#/$defs/startend" }, { "$ref": "#/$defs/segments" }],
"additionalProperties": true,
"required": ["strand"],
"properties": {
"seqid":{
"description": "Sequence on which the coordinates below are valid. Could be viral segment, bacterial contig, etc",
"$comment": "Unused by Auspice 2.0",
"type": "string"
"gene": {
"type": "string",
"description": "The name of the gene the CDS is from. Optional.",
"$comment": "Shown in on-hover infobox & influences default CDS colors"
},
"type": {
"description": "Type of the feature. could be mRNA, CDS, or similar",
"$comment": "Unused by Auspice 2.0",
"type": "string"
"strand": {
"description": "Strand of the CDS",
"type": "string",
"enum": ["-", "+"]
},
"start": {
"description": "Gene start position (one-based, following GFF format)",
"type": "number"
"color": {
"type": "string",
"description": "A CSS color or a color hex code. Optional."
},
"end": {
"description": "Gene end position (one-based closed, last position of feature, following GFF format)",
"type": "number"
"display_name": {
"type": "string",
"$comment": "Shown in the on-hover info box"
},
"strand": {
"description": "Positive or negative strand",
"description": {
"type": "string",
"enum": ["-","+"]
"$comment": "Shown in the on-hover info box"
}
}
}
},
"$defs": {
"startend": {
"type": "object",
"required": ["start", "end"],
"properties": {
"start": {
"type": "integer",
"minimum": 1,
"description": "Start position (one-based, following GFF format)"
},
"end": {
"type": "integer",
"minimum": 2,
"description": "End position (one-based, following GFF format). This value _must_ be greater than the start."
}
}
},
"segments": {
"type": "object",
"required": ["segments"],
"properties": {
"segments": {
"type": "array",
"minItems": 1,
"items": {
"type": "object",
"allOf": [{ "$ref": "#/$defs/startend" }]
}
}
}
}
Expand Down
39 changes: 1 addition & 38 deletions augur/data/schema-export-v2.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,44 +51,7 @@
}
},
"genome_annotations": {
"description": "Genome annotations (e.g. genes), relative to the reference genome",
"$comment": "Required for the entropy panel",
"type": "object",
"required": ["nuc"],
"additionalProperties": false,
"properties": {
"nuc": {
"type": "object",
"properties": {
"seqid":{
"description": "Sequence on which the coordinates below are valid. Could be viral segment, bacterial contig, etc",
"$comment": "currently unused by Auspice",
"type": "string"
},
"type": {
"description": "Type of the feature. could be mRNA, CDS, or similar",
"$comment": "currently unused by Auspice",
"type": "string"
},
"start": {
"description": "Gene start position (one-based, following GFF format)",
"type": "number"
},
"end": {
"description": "Gene end position (one-based closed, last position of feature, following GFF format)",
"type": "number"
},
"strand": {
"description": "Positive or negative strand",
"type": "string",
"enum": ["-","+"]
}
}
}
},
"patternProperties": {
"^[a-zA-Z0-9*_-]+$": {"$ref": "#/properties/meta/properties/genome_annotations/properties/nuc"}
}
"$ref": "https://nextstrain.org/schemas/augur/annotations"
},
"filters": {
"description": "These appear as filters in the footer of Auspice (which populates the displayed values based upon the tree)",
Expand Down
35 changes: 32 additions & 3 deletions augur/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class ValidateError(Exception):
pass


def load_json_schema(path):
def load_json_schema(path, refs=None):
'''
Load a JSON schema from the augur included set of schemas
(located in augur/data)
Expand All @@ -40,7 +40,28 @@ def load_json_schema(path):
Validator.check_schema(schema)
except jsonschema.exceptions.SchemaError as err:
raise ValidateError(f"Schema {path} is not a valid JSON Schema ({Validator.META_SCHEMA['$schema']}). Error: {err}")
return Validator(schema)

if refs:
# Make the validator aware of additional schemas
schema_store = {k: json.loads(resource_string(__package__, os.path.join("data", v))) for k,v in refs.items()}
resolver = jsonschema.RefResolver.from_schema(schema,store=schema_store)
schema_validator = Validator(schema, resolver=resolver)
else:
schema_validator = Validator(schema)

# By default $ref URLs which we don't define in a schema_store are fetched
# by jsonschema. This often indicates a typo (the $ref doesn't match the key
# of the schema_store) or we forgot to add a local mapping for a new $ref.
# Either way, Augur should not be accessing the network.
def resolve_remote(url):
# The exception type is not important as jsonschema will catch & re-raise as a RefResolutionError
raise Exception(f"The schema used for validation attempted to fetch the remote URL '{url!r}'. " +
"Augur should resolve schema references to local files, please check the schema used " +
"and update the appropriate schema_store as needed." )
schema_validator.resolver.resolve_remote = resolve_remote

return schema_validator


def load_json(path):
with open(path, 'rb') as fh:
Expand Down Expand Up @@ -163,7 +184,15 @@ def auspice_config_v2(config_json, **kwargs):
validate(config, schema, config_json)

def export_v2(main_json, **kwargs):
main_schema = load_json_schema("schema-export-v2.json")
# The main_schema uses references to other schemas, and the suggested use is
# to define these refs as valid URLs. Augur itself should not access schemas
# over the wire so we provide a mapping between URLs and filepaths here. The
# filepath is specified relative to ./augur/data (where all the schemas
# live).
refs = {
'https://nextstrain.org/schemas/augur/annotations': "schema-annotations.json"
}
main_schema = load_json_schema("schema-export-v2.json", refs)

if main_json.endswith("frequencies.json") or main_json.endswith("entropy.json") or main_json.endswith("sequences.json"):
raise ValidateError("This validation subfunction is for the main `augur export v2` JSON only.")
Expand Down
73 changes: 72 additions & 1 deletion tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
from augur.validate import (
validate_collection_config_fields,
validate_collection_display_defaults,
validate_measurements_config
validate_measurements_config,
load_json_schema,
validate_json,
ValidateError
)


Expand Down Expand Up @@ -88,3 +91,71 @@ def test_validate_measurements_config_invalid_default_collection(self, example_m
}
assert not validate_measurements_config(measurements)
assert capsys.readouterr().err == "ERROR: The default collection key 'invalid_collection' does not match any of the collections' keys.\n"


@pytest.fixture
def genome_annotation_schema():
return load_json_schema("schema-annotations.json")

class TestValidateGenomeAnnotations():
def test_negative_strand_nuc(self, capsys, genome_annotation_schema):
d = {"nuc": {"start": 1, "end": 200, "strand": "-"}}
with pytest.raises(ValidateError):
validate_json(d, genome_annotation_schema, "<test-json>")
capsys.readouterr() # suppress validation error printing

def test_nuc_not_starting_at_one(self, capsys, genome_annotation_schema):
d = {"nuc": {"start": 100, "end": 200, "strand": "+"}}
with pytest.raises(ValidateError):
validate_json(d, genome_annotation_schema, "<test-json>")
capsys.readouterr() # suppress validation error printing

def test_missing_nuc(self, capsys, genome_annotation_schema):
d = {"cds": {"start": 100, "end": 200, "strand": "+"}}
with pytest.raises(ValidateError):
validate_json(d, genome_annotation_schema, "<test-json>")
capsys.readouterr() # suppress validation error printing

def test_missing_properties(self, capsys, genome_annotation_schema):
d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": 20, "strand": "+"}}
with pytest.raises(ValidateError):
validate_json(d, genome_annotation_schema, "<test-json>")
capsys.readouterr() # suppress validation error printing

def test_not_stranded_cds(self, capsys, genome_annotation_schema):
# Strand . is for features that are not stranded (as per GFF spec), and thus they're not CDSs
d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": 18, "end": 20, "strand": "."}}
with pytest.raises(ValidateError):
validate_json(d, genome_annotation_schema, "<test-json>")
capsys.readouterr() # suppress validation error printing

def test_negative_coordinates(self, capsys, genome_annotation_schema):
d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": -2, "end": 10, "strand": "+"}}
with pytest.raises(ValidateError):
validate_json(d, genome_annotation_schema, "<test-json>")
capsys.readouterr() # suppress validation error printing

def test_valid_genome(self, capsys, genome_annotation_schema):
d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": 20, "end": 28, "strand": "+"}}
validate_json(d, genome_annotation_schema, "<test-json>")
capsys.readouterr() # suppress validation error printing

def test_valid_segmented_genome(self, capsys, genome_annotation_schema):
d = {"nuc": {"start": 1, "end": 100},
"cds": {"segments": [{"start": 20, "end": 28}], "strand": "+"}}
validate_json(d, genome_annotation_schema, "<test-json>")
capsys.readouterr() # suppress validation error printing

def test_invalid_segmented_genome(self, capsys, genome_annotation_schema):
d = {"nuc": {"start": 1, "end": 100},
"cds": {"segments": [{"start": 20, "end": 28}, {"start": 27}], "strand": "+"}}
with pytest.raises(ValidateError):
validate_json(d, genome_annotation_schema, "<test-json>")
capsys.readouterr() # suppress validation error printing

def test_string_coordinates(self, capsys, genome_annotation_schema):
d = {"nuc": {"start": 1, "end": 100},
"cds": {"segments": [{"start": 20, "end": 28}, {"start": "27", "end": "29"}], "strand": "+"}}
with pytest.raises(ValidateError):
validate_json(d, genome_annotation_schema, "<test-json>")
capsys.readouterr() # suppress validation error printing
2 changes: 1 addition & 1 deletion tests/util_support/test_node_data_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_validate_valid(self, build_node_data_file):
build_node_data_file(
f"""
{{
"annotations": {{ "a": {{ "start": 5 }} }},
"annotations": {{ "nuc": {{ "start": 1, "end": 100 }} }},
"generated_by": {{ "program": "augur", "version": "{__version__}" }},
"nodes": {{ "a": 5 }}
}}
Expand Down

0 comments on commit 37a0a07

Please sign in to comment.