Skip to content

Commit

Permalink
33 annotation tests (#103)
Browse files Browse the repository at this point in the history
* Add an annotation module with corresponding test cases and model tests.
* Move test results to `pandas.DataFrame`s.
  • Loading branch information
Christian Lieven authored and Midnighter committed Jul 6, 2017
1 parent 752f2b6 commit 530805d
Show file tree
Hide file tree
Showing 4 changed files with 609 additions and 0 deletions.
170 changes: 170 additions & 0 deletions memote/suite/tests/test_annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-

# Copyright 2017 Novo Nordisk Foundation Center for Biosustainability,
# Technical University of Denmark.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Annotation tests performed on an instance of `cobra.Model`."""

from __future__ import absolute_import

from warnings import warn
from builtins import dict

from pandas import DataFrame

import memote.support.annotation as annotation
from memote.support.helpers import df2dict


def test_metabolite_annotation_presence(read_only_model, store):
"""Expect all metabolites to have a non-empty annotation attribute."""
store["metabolites_without_annotation"] = [
met.id for met in annotation.find_components_without_annotation(
read_only_model, "metabolites")]
assert len(store["metabolites_without_annotation"]) == 0, \
"The following metabolites lack any form of annotation: " \
"{}".format(", ".join(store["metabolites_without_annotation"]))


def test_reaction_annotation_presence(read_only_model, store):
"""Expect all reactions to have a non-empty annotation attribute."""
store["reactions_without_annotation"] = [
rxn.id for rxn in annotation.find_components_without_annotation(
read_only_model, "reactions")]
assert len(store["reactions_without_annotation"]) == 0, \
"The following reactions lack any form of annotation: " \
"{}".format(", ".join(store["reactions_without_annotation"]))


def test_metabolite_annotation_overview(read_only_model, store):
"""
Expect all metabolites to have annotations from common databases.
The required databases are outlined in `annotation.py`.
"""
overview = annotation.generate_component_annotation_overview(
read_only_model, "metabolites")
store['met_annotation_overview'] = df2dict(overview)
for db in annotation.METABOLITE_ANNOTATIONS:
sub = overview.loc[~overview[db], db]
assert len(sub) == 0, \
"The following metabolites lack annotation for {}: " \
"{}".format(db, ", ".join(sub.index))


def test_reaction_annotation_overview(read_only_model, store):
"""
Expect all reactions to have annotations from common databases.
The required databases are outlined in `annotation.py`.
"""
overview = annotation.generate_component_annotation_overview(
read_only_model, "reactions")
store['rxn_annotation_overview'] = df2dict(overview)
for db in annotation.REACTION_ANNOTATIONS:
sub = overview.loc[~overview[db], db]
assert len(sub) == 0, \
"The following reactions lack annotation for {}: " \
"{}".format(db, ", ".join(sub.index))


def test_metabolite_annotation_wrong_ids(read_only_model, store):
"""
Expect all annotations of metabolites to be in the correct format.
The required formats, i.e., regex patterns are outlined in `annotation.py`.
"""
has_annotation = annotation.generate_component_annotation_overview(
read_only_model, "metabolites")
matches = annotation.generate_component_annotation_miriam_match(
read_only_model, "metabolites")
wrong = DataFrame(has_annotation.values & (~matches.values),
index=has_annotation.index,
columns=has_annotation.columns)
store['met_wrong_annotation_ids'] = df2dict(wrong)
for db in annotation.METABOLITE_ANNOTATIONS:
sub = wrong.loc[wrong[db], db]
assert len(sub) == 0, \
"The following metabolites use wrong IDs for {}: " \
"{}".format(db, ", ".join(sub.index))


def test_reaction_annotation_wrong_ids(read_only_model, store):
"""
Expect all annotations of reactions to be in the correct format.
The required formats, i.e., regex patterns are outlined in `annotation.py`.
"""
has_annotation = annotation.generate_component_annotation_overview(
read_only_model, "reactions")
matches = annotation.generate_component_annotation_miriam_match(
read_only_model, "reactions")
wrong = DataFrame(has_annotation.values & (~matches.values),
index=has_annotation.index,
columns=has_annotation.columns)
store["rxn_wrong_annotation_ids"] = df2dict(wrong)
for db in annotation.REACTION_ANNOTATIONS:
sub = wrong.loc[wrong[db], db]
assert len(sub) == 0, \
"The following reactions use wrong IDs for {}: " \
"{}".format(db, ", ".join(sub.index))


def test_metabolite_id_namespace_consistency(read_only_model, store):
"""Expect metabolite IDs to be from the same namespace."""
met_id_ns = annotation.generate_component_id_namespace_overview(
read_only_model, "metabolites")
distribution = met_id_ns.sum()
store['met_ids_in_each_namespace'] = dict(
(key, int(val)) for key, val in distribution.iteritems())
# The BioCyc regex is extremely general, we ignore it here.
cols = list(distribution.index)
cols.remove('biocyc')
largest = distribution[cols].idxmax()
if distribution[largest] == 0:
largest = 'biocyc'
if largest != 'bigg.metabolite':
warn(
'memote currently only supports syntax checks for BiGG identifiers.'
' Please consider mapping your IDs from {} to BiGG'
''.format(largest)
)
assert distribution[largest] == len(read_only_model.metabolites), \
"Metabolite IDs that don't belong to the largest fraction: {}"\
"".format(met_id_ns.loc[~met_id_ns[largest], largest].index)


def test_reaction_id_namespace_consistency(read_only_model, store):
"""Expect reaction IDs to be from the same namespace."""
rxn_id_ns = annotation.generate_component_id_namespace_overview(
read_only_model, "reactions")
distribution = rxn_id_ns.sum()
store['rxn_ids_in_each_namespace'] = dict(
(key, int(val)) for key, val in distribution.iteritems())
# The BioCyc regex is extremely general, we ignore it here.
cols = list(distribution.index)
cols.remove('biocyc')
largest = distribution[cols].idxmax()
if distribution[largest] == 0:
largest = 'biocyc'
if largest != 'bigg.reaction':
warn(
'memote currently only supports syntax checks for BiGG identifiers.'
' Please consider mapping your IDs from {} to BiGG'
''.format(largest)
)
assert distribution[largest] == len(read_only_model.metabolites), \
"Reaction IDs that don't belong to the largest fraction: {}" \
"".format(rxn_id_ns.loc[~rxn_id_ns[largest], largest].index)
212 changes: 212 additions & 0 deletions memote/support/annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
# -*- coding: utf-8 -*-

# Copyright 2017 Novo Nordisk Foundation Center for Biosustainability,
# Technical University of Denmark.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Supporting functions for annotation checks performed on the model object."""

from __future__ import absolute_import

import logging
import re
from future.utils import native_str

import pandas as pd

from collections import OrderedDict

LOGGER = logging.getLogger(__name__)

# MIRIAM (http://www.ebi.ac.uk/miriam/) styled identifiers for
# common databases that are currently included are:
# DB rxn,met,gen url
# 'MetaNetX' ['rxn','met'] 'http://www.metanetx.org'
# 'Kegg' ['rxn','met'] 'http://www.kegg.jp/'
# 'SEED' ['met'] 'http://modelseed.org/'
# 'InChIKey' ['met'] 'http://cactus.nci.nih.gov/chemical/structure'
# 'ChEBI' ['met'] 'http://bioportal.bioontology.org/ontologies/CHEBI'
# 'EnzymeNomenclature' ['rxn'] 'http://www.enzyme-database.org/'
# 'BRENDA' ['rxn'] 'http://www.brenda-enzymes.org/'
# 'RHEA' ['rxn'] 'http://www.rhea-db.org/'
# 'HMDB' ['met'] 'http://www.hmdb.ca/'
# 'BioCyc' ['rxn','met'] 'http://biocyc.org'
# 'Reactome' ['met'] 'http://www.reactome.org/'
# 'BiGG' ['rxn','met'] 'http://bigg.ucsd.edu/universal/'

REACTION_ANNOTATIONS = [('rhea', re.compile(r"^\d{5}$")),
('kegg.reaction', re.compile(r"^R\d+$")),
('metanetx.reaction', re.compile(r"^MNXR\d+$")),
('bigg.reaction', re.compile(r"^[a-z_A-Z0-9]+$")),
('ec-code', re.compile(
r"^\d+\.-\.-\.-|\d+\.\d+\.-\.-|"
r"\d+\.\d+\.\d+\.-|"
r"\d+\.\d+\.\d+\.(n)?\d+$")
),
('brenda', re.compile(
r"^\d+\.-\.-\.-|\d+\.\d+\.-\.-|"
r"\d+\.\d+\.\d+\.-|"
r"\d+\.\d+\.\d+\.(n)?\d+$")
),
('biocyc', re.compile(
r"^[A-Z-0-9]+(?<!CHEBI)"
r"(\:)?[A-Za-z0-9+_.%-]+$")
)
]
REACTION_ANNOTATIONS = OrderedDict(REACTION_ANNOTATIONS)


METABOLITE_ANNOTATIONS = [('kegg.compound', re.compile(r"^C\d+$")),
('seed.compound', re.compile(r"^cpd\d+$")),
('inchikey', re.compile(r"^[A-Z]{14}\-"
r"[A-Z]{10}(\-[A-Z])?")),
('chebi', re.compile(r"^CHEBI:\d+$")),
('hmdb', re.compile(r"^HMDB\d{5}$")),
('reactome', re.compile(r"(^R-[A-Z]{3}-[0-9]+"
r"(-[0-9]+)?$)|"
r"(^REACT_\d+(\.\d+)?$)")),
('metanetx.chemical', re.compile(r"^MNXM\d+$")),
('bigg.metabolite', re.compile(r"^[a-z_A-Z0-9]+$")),
('biocyc', re.compile(r"^[A-Z-0-9]+(?<!CHEBI)"
r"(\:)?[A-Za-z0-9+_.%-]+$"))
]
METABOLITE_ANNOTATIONS = OrderedDict(METABOLITE_ANNOTATIONS)


def find_components_without_annotation(model, components):
"""
Find model components with empty annotation attributes.
Parameters
----------
model : cobra.Model
A cobrapy metabolic model.
components : {"metabolites", "reactions", "genes"}
A string denoting `cobra.Model` components.
Returns
-------
list
The components without any annotation.
"""
return [elem for elem in getattr(model, components) if
elem.annotation is None or len(elem.annotation) == 0]


def generate_component_annotation_overview(model, components):
"""
Tabulate which MIRIAM databases the component's annotation match.
Parameters
----------
model : cobra.Model
A cobrapy metabolic model.
components : {"metabolites", "reactions", "genes"}
A string denoting `cobra.Model` components.
Returns
-------
pandas.DataFrame
The index of the table is given by the component identifiers. Each
column corresponds to one MIRIAM database and a Boolean entry
determines whether the annotation matches.
"""
databases = list({
"metabolites": METABOLITE_ANNOTATIONS,
"reactions": REACTION_ANNOTATIONS
}[components])
data = list()
index = list()
for elem in getattr(model, components):
index.append(elem.id)
data.append(tuple(db in elem.annotation for db in databases))
return pd.DataFrame(data, index=index, columns=databases)


def generate_component_annotation_miriam_match(model, components):
"""
Tabulate which MIRIAM databases the component's annotation match.
Parameters
----------
model : cobra.Model
A cobrapy metabolic model.
components : {"metabolites", "reactions", "genes"}
A string denoting `cobra.Model` components.
Returns
-------
pandas.DataFrame
The index of the table is given by the component identifiers. Each
column corresponds to one MIRIAM database and a Boolean entry
determines whether the annotation matches.
"""
def check_annotation(key, annotation):
if key not in annotation:
return False
test = annotation[key]
pattern = patterns[key]
if isinstance(test, native_str):
print("string match!")
return pattern.match(test) is not None
return all(pattern.match(elem) is not None for elem in test)

patterns = {
"metabolites": METABOLITE_ANNOTATIONS,
"reactions": REACTION_ANNOTATIONS
}[components]
databases = list(patterns)
data = list()
index = list()
for elem in getattr(model, components):
index.append(elem.id)
data.append(tuple(check_annotation(db, elem.annotation)
for db in databases))
return pd.DataFrame(data, index=index, columns=databases)


def generate_component_id_namespace_overview(model, components):
"""
Tabulate which MIRIAM databases the component's annotation match.
Parameters
----------
model : cobra.Model
A cobrapy metabolic model.
components : {"metabolites", "reactions", "genes"}
A string denoting `cobra.Model` components.
Returns
-------
pandas.DataFrame
The index of the table is given by the component identifiers. Each
column corresponds to one MIRIAM database and a Boolean entry
determines whether the annotation matches.
"""
patterns = {
"metabolites": METABOLITE_ANNOTATIONS,
"reactions": REACTION_ANNOTATIONS
}[components]
databases = list(patterns)
data = list()
index = list()
for elem in getattr(model, components):
index.append(elem.id)
data.append(tuple(patterns[db].match(elem.id) is not None
for db in databases))
return pd.DataFrame(data, index=index, columns=databases)
Loading

0 comments on commit 530805d

Please sign in to comment.