Skip to content

Commit

Permalink
Merge branch 'main' into issue-84
Browse files Browse the repository at this point in the history
  • Loading branch information
wdduncan committed Jul 1, 2021
2 parents e1bdc1a + 2d82955 commit 881371b
Show file tree
Hide file tree
Showing 16 changed files with 1,257 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ docs/types/
*/my_schema.*
.env
.venv
/local
local/

# directory that holds the nmdc_schema pypi package
nmdc_schema/
Expand Down
23 changes: 19 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ deploy-testpypi:
## -- TEST/VALIDATE JSONSCHEMA

# datasets used to test/validate the schema
SHEMA_TEST_EXAMPLES := \
SCHEMA_TEST_EXAMPLES := \
biosample_test \
gold_project_test \
img_mg_annotation_objects \
Expand All @@ -159,12 +159,27 @@ SHEMA_TEST_EXAMPLES := \
mg_assembly_activities_test \
mg_assembly_data_objects_test \
nmdc_example_database \
study_test
# invalid_study_test # this is used to make sure invalid data is caught!
study_test \
functional_annotation_set

# Datasets that are expected to FAIL schema validation; each name N is checked
# through the validate-invalid-N target (see test-jsonschema_invalid).
# The last entry deliberately has no trailing backslash, so the assignment ends
# here and cannot absorb whatever line happens to follow it.
SCHEMA_TEST_EXAMPLES_INVALID := \
  invalid_study_test

# functional_annotation_set_invalid has invalid ID pattern but regex tests aren't applied yet? MAM 2021-06-24

.PHONY: test-jsonschema
test-jsonschema: $(foreach example, $(SHEMA_TEST_EXAMPLES), validate-$(example))
test-jsonschema: $(foreach example, $(SCHEMA_TEST_EXAMPLES), validate-$(example))

# .PHONY: test-jsonschema
# test-jsonschema: $(foreach example, $(SCHEMA_TEST_EXAMPLES), echo $(example))

# Negative test suite: depends on one validate-invalid-<name> target per dataset
# listed in SCHEMA_TEST_EXAMPLES_INVALID. The target has no recipe of its own.
.PHONY: test-jsonschema_invalid
test-jsonschema_invalid: $(addprefix validate-invalid-,$(SCHEMA_TEST_EXAMPLES_INVALID))

# Validate a single example dataset against the JSON Schema.
#   stem (%)      : dataset name, resolved to test/data/<name>.json
#   $<            : the dataset file (first prerequisite)
#   $(word 2, $^) : jsonschema/nmdc.schema.json (second prerequisite)
# The target fails when the jsonschema command exits with a non-zero status.
validate-%: test/data/%.json jsonschema/nmdc.schema.json
# util/validate_nmdc_json.py -i $< # example of validating data using the cli
	jsonschema -i $< $(word 2, $^)

# Negative test: validate a dataset that is EXPECTED to be rejected by the schema.
#   stem (%)      : dataset name, resolved to test/data/<name>.json
#   $<            : the dataset file (first prerequisite)
#   $(word 2, $^) : jsonschema/nmdc.schema.json (second prerequisite)
#
# Outcomes:
#   * jsonschema exits 0            -> the invalid data was accepted: FAIL.
#   * jsonschema exits 126 or above -> the shell could not run the validator
#                                      (126 = not executable, 127 = not found,
#                                      >128 = killed by a signal): FAIL, because
#                                      nothing was actually validated.
#   * any other non-zero status     -> the data was rejected: PASS.
# A bare "! jsonschema ..." would report PASS for all non-zero statuses,
# including the case where the validator is not installed.
validate-invalid-%: test/data/%.json jsonschema/nmdc.schema.json
	@echo $(word 2, $^)
	if jsonschema -i $< $(word 2, $^); then \
		echo "ERROR: $< passed validation but is expected to be invalid" >&2; \
		exit 1; \
	else \
		status=$$?; \
		if [ "$$status" -ge 126 ]; then \
			echo "ERROR: jsonschema could not be run (exit status $$status)" >&2; \
			exit "$$status"; \
		fi; \
	fi
86 changes: 82 additions & 4 deletions jsonschema/nmdc.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -908,14 +908,14 @@
"type": "array"
},
"metagenome_annotation_activity_set": {
"description": "This property links a database object to the set of metaproteomics analysis activities.",
"description": "This property links a database object to the set of metagenome annotation activities.",
"items": {
"$ref": "#/definitions/MetagenomeAnnotationActivity"
},
"type": "array"
},
"metagenome_assembly_set": {
"description": "This property links a database object to the set of metaproteomics analysis activities.\n ",
"description": "This property links a database object to the set of metagenome assembly activities.",
"items": {
"$ref": "#/definitions/MetagenomeAssembly"
},
Expand All @@ -928,6 +928,13 @@
},
"type": "array"
},
"metatranscriptome_activity_set": {
"description": "This property links a database object to the set of metatranscriptome analysis activities.",
"items": {
"$ref": "#/definitions/MetatranscriptomeActivity"
},
"type": "array"
},
"nmdc_schema_version": {
"description": "TODO",
"type": "string"
Expand Down Expand Up @@ -1737,6 +1744,70 @@
"title": "MetaproteomicsAnalysisActivity",
"type": "object"
},
"MetatranscriptomeActivity": {
"additionalProperties": false,
"description": "A metatranscriptome activity that e.g. pools assembly and annotation activity.",
"properties": {
"ended_at_time": {
"type": "string"
},
"execution_resource": {
"type": "string"
},
"git_url": {
"type": "string"
},
"has_input": {
"items": {
"type": "string"
},
"type": "array"
},
"has_output": {
"items": {
"type": "string"
},
"type": "array"
},
"id": {
"description": "A unique identifier for a thing. Must be either a CURIE shorthand for a URI or a complete URI",
"type": "string"
},
"name": {
"description": "A human readable label for an entity",
"type": "string"
},
"started_at_time": {
"type": "string"
},
"type": {
"type": "string"
},
"used": {
"type": "string"
},
"was_associated_with": {
"description": "the agent/entity associated with the generation of the file",
"type": "string"
},
"was_informed_by": {
"type": "string"
}
},
"required": [
"id",
"execution_resource",
"git_url",
"has_input",
"has_output",
"type",
"started_at_time",
"ended_at_time",
"was_informed_by"
],
"title": "MetatranscriptomeActivity",
"type": "object"
},
"MetatranscriptomeAnnotationActivity": {
"additionalProperties": false,
"description": "",
Expand Down Expand Up @@ -2902,14 +2973,14 @@
"type": "array"
},
"metagenome_annotation_activity_set": {
"description": "This property links a database object to the set of metaproteomics analysis activities.",
"description": "This property links a database object to the set of metagenome annotation activities.",
"items": {
"$ref": "#/definitions/MetagenomeAnnotationActivity"
},
"type": "array"
},
"metagenome_assembly_set": {
"description": "This property links a database object to the set of metaproteomics analysis activities.\n ",
"description": "This property links a database object to the set of metagenome assembly activities.",
"items": {
"$ref": "#/definitions/MetagenomeAssembly"
},
Expand All @@ -2922,6 +2993,13 @@
},
"type": "array"
},
"metatranscriptome_activity_set": {
"description": "This property links a database object to the set of metatranscriptome analysis activities.",
"items": {
"$ref": "#/definitions/MetatranscriptomeActivity"
},
"type": "array"
},
"nmdc_schema_version": {
"description": "TODO",
"type": "string"
Expand Down
55 changes: 50 additions & 5 deletions python/nmdc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Auto generated from nmdc.yaml by pythongen.py version: 0.9.0
# Generation date: 2021-06-24 19:48
# Generation date: 2021-06-30 12:54
# Schema: NMDC
#
# id: https://microbiomedata/schema
Expand Down Expand Up @@ -188,6 +188,10 @@ class MetatranscriptomeAnnotationActivityId(WorkflowExecutionActivityId):
pass


# Identifier type used for the `id` of MetatranscriptomeActivity and as the key
# type of Database.metatranscriptome_activity_set. It adds nothing to its base
# class; it exists only as a distinct type.
class MetatranscriptomeActivityId(WorkflowExecutionActivityId):
    pass


class MAGsAnalysisActivityId(WorkflowExecutionActivityId):
pass

Expand Down Expand Up @@ -251,6 +255,7 @@ class Database(YAMLRoot):
metaproteomics_analysis_activity_set: Optional[Union[Dict[Union[str, MetaproteomicsAnalysisActivityId], Union[dict, "MetaproteomicsAnalysisActivity"]], List[Union[dict, "MetaproteomicsAnalysisActivity"]]]] = empty_dict()
metagenome_annotation_activity_set: Optional[Union[Dict[Union[str, MetagenomeAnnotationActivityId], Union[dict, "MetagenomeAnnotationActivity"]], List[Union[dict, "MetagenomeAnnotationActivity"]]]] = empty_dict()
metagenome_assembly_set: Optional[Union[Dict[Union[str, MetagenomeAssemblyId], Union[dict, "MetagenomeAssembly"]], List[Union[dict, "MetagenomeAssembly"]]]] = empty_dict()
metatranscriptome_activity_set: Optional[Union[Dict[Union[str, MetatranscriptomeActivityId], Union[dict, "MetatranscriptomeActivity"]], List[Union[dict, "MetatranscriptomeActivity"]]]] = empty_dict()
read_QC_analysis_activity_set: Optional[Union[Dict[Union[str, ReadQCAnalysisActivityId], Union[dict, "ReadQCAnalysisActivity"]], List[Union[dict, "ReadQCAnalysisActivity"]]]] = empty_dict()
read_based_analysis_activity_set: Optional[Union[Dict[Union[str, ReadBasedAnalysisActivityId], Union[dict, "ReadBasedAnalysisActivity"]], List[Union[dict, "ReadBasedAnalysisActivity"]]]] = empty_dict()
nom_analysis_activity_set: Optional[Union[Dict[Union[str, NomAnalysisActivityId], Union[dict, "NomAnalysisActivity"]], List[Union[dict, "NomAnalysisActivity"]]]] = empty_dict()
Expand Down Expand Up @@ -280,6 +285,8 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):

self._normalize_inlined_as_dict(slot_name="metagenome_assembly_set", slot_type=MetagenomeAssembly, key_name="id", keyed=True)

self._normalize_inlined_as_dict(slot_name="metatranscriptome_activity_set", slot_type=MetatranscriptomeActivity, key_name="id", keyed=True)

self._normalize_inlined_as_dict(slot_name="read_QC_analysis_activity_set", slot_type=ReadQCAnalysisActivity, key_name="id", keyed=True)

self._normalize_inlined_as_dict(slot_name="read_based_analysis_activity_set", slot_type=ReadBasedAnalysisActivity, key_name="id", keyed=True)
Expand Down Expand Up @@ -2252,6 +2259,37 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
super().__post_init__(**kwargs)


@dataclass
class MetatranscriptomeActivity(WorkflowExecutionActivity):
    """
    A metatranscriptome activity that e.g. pools assembly and annotation activity.
    """
    # No slots are declared as inherited for this class.
    _inherited_slots: ClassVar[List[str]] = []

    # Class-level metadata (URI, CURIE, human-readable name, model URI).
    class_class_uri: ClassVar[URIRef] = URIRef("https://microbiomedata/schema/workflow_execution_activity/MetatranscriptomeActivity")
    class_class_curie: ClassVar[str] = None
    class_name: ClassVar[str] = "metatranscriptome activity"
    class_model_uri: ClassVar[URIRef] = NMDC.MetatranscriptomeActivity

    # Fields default to None so the dataclass can be constructed positionally or
    # by keyword; only `id` is checked for presence in this class's
    # __post_init__. NOTE(review): the remaining fields are presumably validated
    # by WorkflowExecutionActivity.__post_init__ -- confirm against the base class.
    id: Union[str, MetatranscriptomeActivityId] = None
    execution_resource: str = None
    git_url: str = None
    has_input: Union[Union[str, NamedThingId], List[Union[str, NamedThingId]]] = None
    has_output: Union[Union[str, NamedThingId], List[Union[str, NamedThingId]]] = None
    type: str = None
    started_at_time: str = None
    ended_at_time: str = None
    was_informed_by: Union[str, ActivityId] = None

    def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
        # `id` is mandatory: report it through MissingRequiredField when empty.
        if self._is_empty(self.id):
            self.MissingRequiredField("id")
        # Coerce a plain value into the class-specific identifier type.
        if not isinstance(self.id, MetatranscriptomeActivityId):
            self.id = MetatranscriptomeActivityId(self.id)

        # Delegate the rest of the post-initialisation to the parent class.
        super().__post_init__(**kwargs)


@dataclass
class MAGsAnalysisActivity(WorkflowExecutionActivity):
_inherited_slots: ClassVar[List[str]] = []
Expand Down Expand Up @@ -2736,7 +2774,7 @@ class FunctionalAnnotation(YAMLRoot):

was_generated_by: Optional[Union[str, MetagenomeAnnotationActivityId]] = None
subject: Optional[Union[str, GeneProductId]] = None
has_function: Optional[Union[str, FunctionalAnnotationTermId]] = None
has_function: Optional[str] = None
type: Optional[Union[str, OntologyClassId]] = None

def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
Expand All @@ -2746,8 +2784,8 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
if self.subject is not None and not isinstance(self.subject, GeneProductId):
self.subject = GeneProductId(self.subject)

if self.has_function is not None and not isinstance(self.has_function, FunctionalAnnotationTermId):
self.has_function = FunctionalAnnotationTermId(self.has_function)
if self.has_function is not None and not isinstance(self.has_function, str):
self.has_function = str(self.has_function)

if self.type is not None and not isinstance(self.type, OntologyClassId):
self.type = OntologyClassId(self.type)
Expand Down Expand Up @@ -2847,6 +2885,9 @@ class slots:
slots.metagenome_assembly_set = Slot(uri=NMDC.metagenome_assembly_set, name="metagenome assembly set", curie=NMDC.curie('metagenome_assembly_set'),
model_uri=NMDC.metagenome_assembly_set, domain=Database, range=Optional[Union[Dict[Union[str, MetagenomeAssemblyId], Union[dict, "MetagenomeAssembly"]], List[Union[dict, "MetagenomeAssembly"]]]])

slots.metatranscriptome_activity_set = Slot(uri=NMDC.metatranscriptome_activity_set, name="metatranscriptome activity set", curie=NMDC.curie('metatranscriptome_activity_set'),
model_uri=NMDC.metatranscriptome_activity_set, domain=Database, range=Optional[Union[Dict[Union[str, MetatranscriptomeActivityId], Union[dict, "MetatranscriptomeActivity"]], List[Union[dict, "MetatranscriptomeActivity"]]]])

slots.read_QC_analysis_activity_set = Slot(uri=NMDC.read_QC_analysis_activity_set, name="read QC analysis activity set", curie=NMDC.curie('read_QC_analysis_activity_set'),
model_uri=NMDC.read_QC_analysis_activity_set, domain=Database, range=Optional[Union[Dict[Union[str, ReadQCAnalysisActivityId], Union[dict, "ReadQCAnalysisActivity"]], List[Union[dict, "ReadQCAnalysisActivity"]]]])

Expand Down Expand Up @@ -5344,7 +5385,7 @@ class slots:
model_uri=NMDC.subject, domain=None, range=Optional[Union[str, GeneProductId]])

slots.has_function = Slot(uri="str(uriorcurie)", name="has function", curie=None,
model_uri=NMDC.has_function, domain=None, range=Optional[Union[str, FunctionalAnnotationTermId]])
model_uri=NMDC.has_function, domain=None, range=Optional[str])

slots.has_participants = Slot(uri="str(uriorcurie)", name="has participants", curie=None,
model_uri=NMDC.has_participants, domain=None, range=Optional[str])
Expand Down Expand Up @@ -5788,6 +5829,10 @@ class slots:
slots.reaction_participant_stoichiometry = Slot(uri=NMDC.stoichiometry, name="reaction participant_stoichiometry", curie=NMDC.curie('stoichiometry'),
model_uri=NMDC.reaction_participant_stoichiometry, domain=ReactionParticipant, range=Optional[int])

slots.functional_annotation_has_function = Slot(uri=NMDC.has_function, name="functional annotation_has function", curie=NMDC.curie('has_function'),
model_uri=NMDC.functional_annotation_has_function, domain=FunctionalAnnotation, range=Optional[str],
pattern=re.compile(r'^(KEGG.PATHWAY:\w{2,4}\d{5}|KEGG.REACTION:R\d+|RHEA:\d{5}|MetaCyc:[A-Za-z0-9+_.%-:]+|EC:\d{1,2}(\.\d{0,3}){0,3}|GO:\d{7}|MetaNetX:(MNXR\d+|EMPTY)|SEED:\w+|KEGG\.ORTHOLOGY:K\d+|EGGNOG:\w+|PFAM:PF\d{5}|TIGRFAM:TIGR\d+|SUPFAM:\w+|CATH:[1-6]\.[0-9]+\.[0-9]+\.[0-9]+|PANTHER.FAMILY:PTHR\d{5}(\:SF\d{1,3})?)$'))

slots.functional_annotation_type = Slot(uri=NMDC.type, name="functional annotation_type", curie=NMDC.curie('type'),
model_uri=NMDC.functional_annotation_type, domain=FunctionalAnnotation, range=Optional[Union[str, OntologyClassId]])

Expand Down
13 changes: 12 additions & 1 deletion src/schema/annotation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ classes:
An assignment of a function term (e.g. reaction or pathway) that is executed by a gene product, or which the gene product plays an active role in.
Functional annotations can be assigned manually by curators, or automatically in workflows. In the context of NMDC, all function annotation is performed
automatically, typically using HMM or Blast type methods
comments:
- "move id slot usage patterns to has_function slot usage?"
see_also:
- https://img.jgi.doe.gov/docs/functional-annotation.pdf
- https://github.com/microbiomedata/mg_annotation/blob/master/functional-annotation.wdl
Expand All @@ -192,6 +194,11 @@ classes:
- subject
- has function
slot_usage:
has function: # was: id
  # Accepted CURIE shapes for a functional annotation term.
  # The dots in the KEGG.PATHWAY, KEGG.REACTION and PANTHER.FAMILY prefixes are
  # escaped (as KEGG\.ORTHOLOGY already was) so that only a literal "." matches;
  # an unescaped "." would accept any character in that position.
  # NOTE(review): in the MetaCyc character class, "%-:" is a character RANGE
  # (from "%" to ":"), not three literals -- left unchanged; confirm intent.
  pattern: "^(KEGG\\.PATHWAY:\\w{2,4}\\d{5}|KEGG\\.REACTION:R\\d+|RHEA:\\d{5}|MetaCyc:[A-Za-z0-9+_.%-:]+|EC:\\d{1,2}(\\.\\d{0,3}){0,3}|GO:\\d{7}|MetaNetX:(MNXR\\d+|EMPTY)|SEED:\\w+|KEGG\\.ORTHOLOGY:K\\d+|EGGNOG:\\w+|PFAM:PF\\d{5}|TIGRFAM:TIGR\\d+|SUPFAM:\\w+|CATH:[1-6]\\.[0-9]+\\.[0-9]+\\.[0-9]+|PANTHER\\.FAMILY:PTHR\\d{5}(\\:SF\\d{1,3})?)$"
  comments:
    - "missing patterns for COG and RetroRules"
    - "these patterns aren't tied to the listed prefixes. a discussion about that possibility had been started, including the question of whether these lists are intended to be open examples or closed"
type:
range: ontology class
description: >-
Expand All @@ -210,7 +217,11 @@ slots:
range: gene product

has function:
range: functional annotation term
range: string # was: functional annotation term
comments:
- "the range for has_function was asserted as functional_annotation_term,"
- "but is actually taking string arguments in the Polyneme MongoDB,"
- "and those are frequently fulltext, not CURIEs. MAM 2021-06-23"

has participants:
abstract: true
Expand Down
18 changes: 14 additions & 4 deletions src/schema/nmdc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ description: >-
* the NMDC schema itself
license: https://creativecommons.org/publicdomain/zero/1.0/
version: 2021.06.16rc1
version: 2021.06.30

prefixes:
nmdc: https://microbiomedata/meta/
Expand Down Expand Up @@ -82,6 +82,7 @@ classes:
- metaproteomics analysis activity set
- metagenome annotation activity set
- metagenome assembly set
- metatranscriptome activity set
- read QC analysis activity set
- read based analysis activity set
- nom analysis activity set
Expand Down Expand Up @@ -508,7 +509,7 @@ slots:
multivalued: true
inlined: true
description: >-
This property links a database object to the set of metaproteomics analysis activities.
This property links a database object to the set of metagenome annotation activities.
metagenome assembly set:
mixins: object set
Expand All @@ -517,8 +518,17 @@ slots:
multivalued: true
inlined: true
description: >-
This property links a database object to the set of metaproteomics analysis activities.
This property links a database object to the set of metagenome assembly activities.
metatranscriptome activity set:
mixins: object set
domain: database
range: metatranscriptome activity
multivalued: true
inlined: true
description: >-
This property links a database object to the set of metatranscriptome analysis activities.
read QC analysis activity set:
mixins: object set
domain: database
Expand Down
7 changes: 7 additions & 0 deletions src/schema/workflow_execution_activity.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,13 @@ classes:
in_subset:
- workflow subset

metatranscriptome activity:
is_a: workflow execution activity
description: >-
A metatranscriptome activity that e.g. pools assembly and annotation activity.
in_subset:
- workflow subset

MAGs analysis activity:
is_a: workflow execution activity
in_subset:
Expand Down
Loading

0 comments on commit 881371b

Please sign in to comment.