-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add an annotation module with corresponding test cases and model tests. * Move test results to `pandas.DataFrame`s.
- Loading branch information
1 parent
752f2b6
commit 530805d
Showing
4 changed files
with
609 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# Copyright 2017 Novo Nordisk Foundation Center for Biosustainability, | ||
# Technical University of Denmark. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
"""Annotation tests performed on an instance of `cobra.Model`.""" | ||
|
||
from __future__ import absolute_import | ||
|
||
from warnings import warn | ||
from builtins import dict | ||
|
||
from pandas import DataFrame | ||
|
||
import memote.support.annotation as annotation | ||
from memote.support.helpers import df2dict | ||
|
||
|
||
def test_metabolite_annotation_presence(read_only_model, store): | ||
"""Expect all metabolites to have a non-empty annotation attribute.""" | ||
store["metabolites_without_annotation"] = [ | ||
met.id for met in annotation.find_components_without_annotation( | ||
read_only_model, "metabolites")] | ||
assert len(store["metabolites_without_annotation"]) == 0, \ | ||
"The following metabolites lack any form of annotation: " \ | ||
"{}".format(", ".join(store["metabolites_without_annotation"])) | ||
|
||
|
||
def test_reaction_annotation_presence(read_only_model, store): | ||
"""Expect all reactions to have a non-empty annotation attribute.""" | ||
store["reactions_without_annotation"] = [ | ||
rxn.id for rxn in annotation.find_components_without_annotation( | ||
read_only_model, "reactions")] | ||
assert len(store["reactions_without_annotation"]) == 0, \ | ||
"The following reactions lack any form of annotation: " \ | ||
"{}".format(", ".join(store["reactions_without_annotation"])) | ||
|
||
|
||
def test_metabolite_annotation_overview(read_only_model, store): | ||
""" | ||
Expect all metabolites to have annotations from common databases. | ||
The required databases are outlined in `annotation.py`. | ||
""" | ||
overview = annotation.generate_component_annotation_overview( | ||
read_only_model, "metabolites") | ||
store['met_annotation_overview'] = df2dict(overview) | ||
for db in annotation.METABOLITE_ANNOTATIONS: | ||
sub = overview.loc[~overview[db], db] | ||
assert len(sub) == 0, \ | ||
"The following metabolites lack annotation for {}: " \ | ||
"{}".format(db, ", ".join(sub.index)) | ||
|
||
|
||
def test_reaction_annotation_overview(read_only_model, store): | ||
""" | ||
Expect all reactions to have annotations from common databases. | ||
The required databases are outlined in `annotation.py`. | ||
""" | ||
overview = annotation.generate_component_annotation_overview( | ||
read_only_model, "reactions") | ||
store['rxn_annotation_overview'] = df2dict(overview) | ||
for db in annotation.REACTION_ANNOTATIONS: | ||
sub = overview.loc[~overview[db], db] | ||
assert len(sub) == 0, \ | ||
"The following reactions lack annotation for {}: " \ | ||
"{}".format(db, ", ".join(sub.index)) | ||
|
||
|
||
def test_metabolite_annotation_wrong_ids(read_only_model, store): | ||
""" | ||
Expect all annotations of metabolites to be in the correct format. | ||
The required formats, i.e., regex patterns are outlined in `annotation.py`. | ||
""" | ||
has_annotation = annotation.generate_component_annotation_overview( | ||
read_only_model, "metabolites") | ||
matches = annotation.generate_component_annotation_miriam_match( | ||
read_only_model, "metabolites") | ||
wrong = DataFrame(has_annotation.values & (~matches.values), | ||
index=has_annotation.index, | ||
columns=has_annotation.columns) | ||
store['met_wrong_annotation_ids'] = df2dict(wrong) | ||
for db in annotation.METABOLITE_ANNOTATIONS: | ||
sub = wrong.loc[wrong[db], db] | ||
assert len(sub) == 0, \ | ||
"The following metabolites use wrong IDs for {}: " \ | ||
"{}".format(db, ", ".join(sub.index)) | ||
|
||
|
||
def test_reaction_annotation_wrong_ids(read_only_model, store): | ||
""" | ||
Expect all annotations of reactions to be in the correct format. | ||
The required formats, i.e., regex patterns are outlined in `annotation.py`. | ||
""" | ||
has_annotation = annotation.generate_component_annotation_overview( | ||
read_only_model, "reactions") | ||
matches = annotation.generate_component_annotation_miriam_match( | ||
read_only_model, "reactions") | ||
wrong = DataFrame(has_annotation.values & (~matches.values), | ||
index=has_annotation.index, | ||
columns=has_annotation.columns) | ||
store["rxn_wrong_annotation_ids"] = df2dict(wrong) | ||
for db in annotation.REACTION_ANNOTATIONS: | ||
sub = wrong.loc[wrong[db], db] | ||
assert len(sub) == 0, \ | ||
"The following reactions use wrong IDs for {}: " \ | ||
"{}".format(db, ", ".join(sub.index)) | ||
|
||
|
||
def test_metabolite_id_namespace_consistency(read_only_model, store): | ||
"""Expect metabolite IDs to be from the same namespace.""" | ||
met_id_ns = annotation.generate_component_id_namespace_overview( | ||
read_only_model, "metabolites") | ||
distribution = met_id_ns.sum() | ||
store['met_ids_in_each_namespace'] = dict( | ||
(key, int(val)) for key, val in distribution.iteritems()) | ||
# The BioCyc regex is extremely general, we ignore it here. | ||
cols = list(distribution.index) | ||
cols.remove('biocyc') | ||
largest = distribution[cols].idxmax() | ||
if distribution[largest] == 0: | ||
largest = 'biocyc' | ||
if largest != 'bigg.metabolite': | ||
warn( | ||
'memote currently only supports syntax checks for BiGG identifiers.' | ||
' Please consider mapping your IDs from {} to BiGG' | ||
''.format(largest) | ||
) | ||
assert distribution[largest] == len(read_only_model.metabolites), \ | ||
"Metabolite IDs that don't belong to the largest fraction: {}"\ | ||
"".format(met_id_ns.loc[~met_id_ns[largest], largest].index) | ||
|
||
|
||
def test_reaction_id_namespace_consistency(read_only_model, store): | ||
"""Expect reaction IDs to be from the same namespace.""" | ||
rxn_id_ns = annotation.generate_component_id_namespace_overview( | ||
read_only_model, "reactions") | ||
distribution = rxn_id_ns.sum() | ||
store['rxn_ids_in_each_namespace'] = dict( | ||
(key, int(val)) for key, val in distribution.iteritems()) | ||
# The BioCyc regex is extremely general, we ignore it here. | ||
cols = list(distribution.index) | ||
cols.remove('biocyc') | ||
largest = distribution[cols].idxmax() | ||
if distribution[largest] == 0: | ||
largest = 'biocyc' | ||
if largest != 'bigg.reaction': | ||
warn( | ||
'memote currently only supports syntax checks for BiGG identifiers.' | ||
' Please consider mapping your IDs from {} to BiGG' | ||
''.format(largest) | ||
) | ||
assert distribution[largest] == len(read_only_model.metabolites), \ | ||
"Reaction IDs that don't belong to the largest fraction: {}" \ | ||
"".format(rxn_id_ns.loc[~rxn_id_ns[largest], largest].index) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# Copyright 2017 Novo Nordisk Foundation Center for Biosustainability, | ||
# Technical University of Denmark. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
"""Supporting functions for annotation checks performed on the model object.""" | ||
|
||
from __future__ import absolute_import | ||
|
||
import logging | ||
import re | ||
from future.utils import native_str | ||
|
||
import pandas as pd | ||
|
||
from collections import OrderedDict | ||
|
||
LOGGER = logging.getLogger(__name__) | ||
|
||
# MIRIAM (http://www.ebi.ac.uk/miriam/) styled identifiers for | ||
# common databases that are currently included are: | ||
# DB rxn,met,gen url | ||
# 'MetaNetX' ['rxn','met'] 'http://www.metanetx.org' | ||
# 'Kegg' ['rxn','met'] 'http://www.kegg.jp/' | ||
# 'SEED' ['met'] 'http://modelseed.org/' | ||
# 'InChIKey' ['met'] 'http://cactus.nci.nih.gov/chemical/structure' | ||
# 'ChEBI' ['met'] 'http://bioportal.bioontology.org/ontologies/CHEBI' | ||
# 'EnzymeNomenclature' ['rxn'] 'http://www.enzyme-database.org/' | ||
# 'BRENDA' ['rxn'] 'http://www.brenda-enzymes.org/' | ||
# 'RHEA' ['rxn'] 'http://www.rhea-db.org/' | ||
# 'HMDB' ['met'] 'http://www.hmdb.ca/' | ||
# 'BioCyc' ['rxn','met'] 'http://biocyc.org' | ||
# 'Reactome' ['met'] 'http://www.reactome.org/' | ||
# 'BiGG' ['rxn','met'] 'http://bigg.ucsd.edu/universal/' | ||
|
||
REACTION_ANNOTATIONS = [('rhea', re.compile(r"^\d{5}$")), | ||
('kegg.reaction', re.compile(r"^R\d+$")), | ||
('metanetx.reaction', re.compile(r"^MNXR\d+$")), | ||
('bigg.reaction', re.compile(r"^[a-z_A-Z0-9]+$")), | ||
('ec-code', re.compile( | ||
r"^\d+\.-\.-\.-|\d+\.\d+\.-\.-|" | ||
r"\d+\.\d+\.\d+\.-|" | ||
r"\d+\.\d+\.\d+\.(n)?\d+$") | ||
), | ||
('brenda', re.compile( | ||
r"^\d+\.-\.-\.-|\d+\.\d+\.-\.-|" | ||
r"\d+\.\d+\.\d+\.-|" | ||
r"\d+\.\d+\.\d+\.(n)?\d+$") | ||
), | ||
('biocyc', re.compile( | ||
r"^[A-Z-0-9]+(?<!CHEBI)" | ||
r"(\:)?[A-Za-z0-9+_.%-]+$") | ||
) | ||
] | ||
REACTION_ANNOTATIONS = OrderedDict(REACTION_ANNOTATIONS) | ||
|
||
|
||
METABOLITE_ANNOTATIONS = [('kegg.compound', re.compile(r"^C\d+$")), | ||
('seed.compound', re.compile(r"^cpd\d+$")), | ||
('inchikey', re.compile(r"^[A-Z]{14}\-" | ||
r"[A-Z]{10}(\-[A-Z])?")), | ||
('chebi', re.compile(r"^CHEBI:\d+$")), | ||
('hmdb', re.compile(r"^HMDB\d{5}$")), | ||
('reactome', re.compile(r"(^R-[A-Z]{3}-[0-9]+" | ||
r"(-[0-9]+)?$)|" | ||
r"(^REACT_\d+(\.\d+)?$)")), | ||
('metanetx.chemical', re.compile(r"^MNXM\d+$")), | ||
('bigg.metabolite', re.compile(r"^[a-z_A-Z0-9]+$")), | ||
('biocyc', re.compile(r"^[A-Z-0-9]+(?<!CHEBI)" | ||
r"(\:)?[A-Za-z0-9+_.%-]+$")) | ||
] | ||
METABOLITE_ANNOTATIONS = OrderedDict(METABOLITE_ANNOTATIONS) | ||
|
||
|
||
def find_components_without_annotation(model, components): | ||
""" | ||
Find model components with empty annotation attributes. | ||
Parameters | ||
---------- | ||
model : cobra.Model | ||
A cobrapy metabolic model. | ||
components : {"metabolites", "reactions", "genes"} | ||
A string denoting `cobra.Model` components. | ||
Returns | ||
------- | ||
list | ||
The components without any annotation. | ||
""" | ||
return [elem for elem in getattr(model, components) if | ||
elem.annotation is None or len(elem.annotation) == 0] | ||
|
||
|
||
def generate_component_annotation_overview(model, components): | ||
""" | ||
Tabulate which MIRIAM databases the component's annotation match. | ||
Parameters | ||
---------- | ||
model : cobra.Model | ||
A cobrapy metabolic model. | ||
components : {"metabolites", "reactions", "genes"} | ||
A string denoting `cobra.Model` components. | ||
Returns | ||
------- | ||
pandas.DataFrame | ||
The index of the table is given by the component identifiers. Each | ||
column corresponds to one MIRIAM database and a Boolean entry | ||
determines whether the annotation matches. | ||
""" | ||
databases = list({ | ||
"metabolites": METABOLITE_ANNOTATIONS, | ||
"reactions": REACTION_ANNOTATIONS | ||
}[components]) | ||
data = list() | ||
index = list() | ||
for elem in getattr(model, components): | ||
index.append(elem.id) | ||
data.append(tuple(db in elem.annotation for db in databases)) | ||
return pd.DataFrame(data, index=index, columns=databases) | ||
|
||
|
||
def generate_component_annotation_miriam_match(model, components): | ||
""" | ||
Tabulate which MIRIAM databases the component's annotation match. | ||
Parameters | ||
---------- | ||
model : cobra.Model | ||
A cobrapy metabolic model. | ||
components : {"metabolites", "reactions", "genes"} | ||
A string denoting `cobra.Model` components. | ||
Returns | ||
------- | ||
pandas.DataFrame | ||
The index of the table is given by the component identifiers. Each | ||
column corresponds to one MIRIAM database and a Boolean entry | ||
determines whether the annotation matches. | ||
""" | ||
def check_annotation(key, annotation): | ||
if key not in annotation: | ||
return False | ||
test = annotation[key] | ||
pattern = patterns[key] | ||
if isinstance(test, native_str): | ||
print("string match!") | ||
return pattern.match(test) is not None | ||
return all(pattern.match(elem) is not None for elem in test) | ||
|
||
patterns = { | ||
"metabolites": METABOLITE_ANNOTATIONS, | ||
"reactions": REACTION_ANNOTATIONS | ||
}[components] | ||
databases = list(patterns) | ||
data = list() | ||
index = list() | ||
for elem in getattr(model, components): | ||
index.append(elem.id) | ||
data.append(tuple(check_annotation(db, elem.annotation) | ||
for db in databases)) | ||
return pd.DataFrame(data, index=index, columns=databases) | ||
|
||
|
||
def generate_component_id_namespace_overview(model, components): | ||
""" | ||
Tabulate which MIRIAM databases the component's annotation match. | ||
Parameters | ||
---------- | ||
model : cobra.Model | ||
A cobrapy metabolic model. | ||
components : {"metabolites", "reactions", "genes"} | ||
A string denoting `cobra.Model` components. | ||
Returns | ||
------- | ||
pandas.DataFrame | ||
The index of the table is given by the component identifiers. Each | ||
column corresponds to one MIRIAM database and a Boolean entry | ||
determines whether the annotation matches. | ||
""" | ||
patterns = { | ||
"metabolites": METABOLITE_ANNOTATIONS, | ||
"reactions": REACTION_ANNOTATIONS | ||
}[components] | ||
databases = list(patterns) | ||
data = list() | ||
index = list() | ||
for elem in getattr(model, components): | ||
index.append(elem.id) | ||
data.append(tuple(patterns[db].match(elem.id) is not None | ||
for db in databases)) | ||
return pd.DataFrame(data, index=index, columns=databases) |
Oops, something went wrong.