33 annotation tests (#103)

* Add an annotation module with corresponding test cases and model tests. * Move test results to `pandas.DataFrame`s.
opencobra · Jul 6, 2017 · 530805d · 530805d
1 parent 752f2b6
commit 530805d
Show file tree

Hide file tree

Showing 4 changed files with 609 additions and 0 deletions.
diff --git a/memote/suite/tests/test_annotation.py b/memote/suite/tests/test_annotation.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017 Novo Nordisk Foundation Center for Biosustainability,
+# Technical University of Denmark.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Annotation tests performed on an instance of `cobra.Model`."""
+
+from __future__ import absolute_import
+
+from warnings import warn
+from builtins import dict
+
+from pandas import DataFrame
+
+import memote.support.annotation as annotation
+from memote.support.helpers import df2dict
+
+
+def test_metabolite_annotation_presence(read_only_model, store):
+    """Expect all metabolites to have a non-empty annotation attribute."""
+    store["metabolites_without_annotation"] = [
+        met.id for met in annotation.find_components_without_annotation(
+            read_only_model, "metabolites")]
+    assert len(store["metabolites_without_annotation"]) == 0, \
+        "The following metabolites lack any form of annotation: " \
+        "{}".format(", ".join(store["metabolites_without_annotation"]))
+
+
+def test_reaction_annotation_presence(read_only_model, store):
+    """Expect all reactions to have a non-empty annotation attribute."""
+    store["reactions_without_annotation"] = [
+        rxn.id for rxn in annotation.find_components_without_annotation(
+            read_only_model, "reactions")]
+    assert len(store["reactions_without_annotation"]) == 0, \
+        "The following reactions lack any form of annotation: " \
+        "{}".format(", ".join(store["reactions_without_annotation"]))
+
+
+def test_metabolite_annotation_overview(read_only_model, store):
+    """
+    Expect all metabolites to have annotations from common databases.
+
+    The required databases are outlined in `annotation.py`.
+    """
+    overview = annotation.generate_component_annotation_overview(
+        read_only_model, "metabolites")
+    store['met_annotation_overview'] = df2dict(overview)
+    for db in annotation.METABOLITE_ANNOTATIONS:
+        sub = overview.loc[~overview[db], db]
+        assert len(sub) == 0, \
+            "The following metabolites lack annotation for {}: " \
+            "{}".format(db, ", ".join(sub.index))
+
+
+def test_reaction_annotation_overview(read_only_model, store):
+    """
+    Expect all reactions to have annotations from common databases.
+
+    The required databases are outlined in `annotation.py`.
+    """
+    overview = annotation.generate_component_annotation_overview(
+        read_only_model, "reactions")
+    store['rxn_annotation_overview'] = df2dict(overview)
+    for db in annotation.REACTION_ANNOTATIONS:
+        sub = overview.loc[~overview[db], db]
+        assert len(sub) == 0, \
+            "The following reactions lack annotation for {}: " \
+            "{}".format(db, ", ".join(sub.index))
+
+
+def test_metabolite_annotation_wrong_ids(read_only_model, store):
+    """
+    Expect all annotations of metabolites to be in the correct format.
+
+    The required formats, i.e., regex patterns are outlined in `annotation.py`.
+    """
+    has_annotation = annotation.generate_component_annotation_overview(
+        read_only_model, "metabolites")
+    matches = annotation.generate_component_annotation_miriam_match(
+        read_only_model, "metabolites")
+    wrong = DataFrame(has_annotation.values & (~matches.values),
+                      index=has_annotation.index,
+                      columns=has_annotation.columns)
+    store['met_wrong_annotation_ids'] = df2dict(wrong)
+    for db in annotation.METABOLITE_ANNOTATIONS:
+        sub = wrong.loc[wrong[db], db]
+        assert len(sub) == 0, \
+            "The following metabolites use wrong IDs for {}: " \
+            "{}".format(db, ", ".join(sub.index))
+
+
+def test_reaction_annotation_wrong_ids(read_only_model, store):
+    """
+    Expect all annotations of reactions to be in the correct format.
+
+    The required formats, i.e., regex patterns are outlined in `annotation.py`.
+    """
+    has_annotation = annotation.generate_component_annotation_overview(
+        read_only_model, "reactions")
+    matches = annotation.generate_component_annotation_miriam_match(
+        read_only_model, "reactions")
+    wrong = DataFrame(has_annotation.values & (~matches.values),
+                      index=has_annotation.index,
+                      columns=has_annotation.columns)
+    store["rxn_wrong_annotation_ids"] = df2dict(wrong)
+    for db in annotation.REACTION_ANNOTATIONS:
+        sub = wrong.loc[wrong[db], db]
+        assert len(sub) == 0, \
+            "The following reactions use wrong IDs for {}: " \
+            "{}".format(db, ", ".join(sub.index))
+
+
+def test_metabolite_id_namespace_consistency(read_only_model, store):
+    """Expect metabolite IDs to be from the same namespace."""
+    met_id_ns = annotation.generate_component_id_namespace_overview(
+        read_only_model, "metabolites")
+    distribution = met_id_ns.sum()
+    store['met_ids_in_each_namespace'] = dict(
+        (key, int(val)) for key, val in distribution.iteritems())
+    # The BioCyc regex is extremely general, we ignore it here.
+    cols = list(distribution.index)
+    cols.remove('biocyc')
+    largest = distribution[cols].idxmax()
+    if distribution[largest] == 0:
+        largest = 'biocyc'
+    if largest != 'bigg.metabolite':
+        warn(
+            'memote currently only supports syntax checks for BiGG identifiers.'
+            ' Please consider mapping your IDs from {} to BiGG'
+            ''.format(largest)
+        )
+    assert distribution[largest] == len(read_only_model.metabolites), \
+        "Metabolite IDs that don't belong to the largest fraction: {}"\
+        "".format(met_id_ns.loc[~met_id_ns[largest], largest].index)
+
+
+def test_reaction_id_namespace_consistency(read_only_model, store):
+    """Expect reaction IDs to be from the same namespace."""
+    rxn_id_ns = annotation.generate_component_id_namespace_overview(
+        read_only_model, "reactions")
+    distribution = rxn_id_ns.sum()
+    store['rxn_ids_in_each_namespace'] = dict(
+        (key, int(val)) for key, val in distribution.iteritems())
+    # The BioCyc regex is extremely general, we ignore it here.
+    cols = list(distribution.index)
+    cols.remove('biocyc')
+    largest = distribution[cols].idxmax()
+    if distribution[largest] == 0:
+        largest = 'biocyc'
+    if largest != 'bigg.reaction':
+        warn(
+            'memote currently only supports syntax checks for BiGG identifiers.'
+            ' Please consider mapping your IDs from {} to BiGG'
+            ''.format(largest)
+        )
+    assert distribution[largest] == len(read_only_model.metabolites), \
+        "Reaction IDs that don't belong to the largest fraction: {}" \
+        "".format(rxn_id_ns.loc[~rxn_id_ns[largest], largest].index)
diff --git a/memote/support/annotation.py b/memote/support/annotation.py
@@ -0,0 +1,212 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017 Novo Nordisk Foundation Center for Biosustainability,
+# Technical University of Denmark.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Supporting functions for annotation checks performed on the model object."""
+
+from __future__ import absolute_import
+
+import logging
+import re
+from future.utils import native_str
+
+import pandas as pd
+
+from collections import OrderedDict
+
+LOGGER = logging.getLogger(__name__)
+
+# MIRIAM (http://www.ebi.ac.uk/miriam/) styled identifiers for
+# common databases that are currently included are:
+# DB    rxn,met,gen url
+# 'MetaNetX'  ['rxn','met'] 'http://www.metanetx.org'
+# 'Kegg'  ['rxn','met'] 'http://www.kegg.jp/'
+# 'SEED'  ['met']  'http://modelseed.org/'
+# 'InChIKey'  ['met'] 'http://cactus.nci.nih.gov/chemical/structure'
+# 'ChEBI' ['met'] 'http://bioportal.bioontology.org/ontologies/CHEBI'
+# 'EnzymeNomenclature'    ['rxn'] 'http://www.enzyme-database.org/'
+# 'BRENDA'    ['rxn']    'http://www.brenda-enzymes.org/'
+# 'RHEA'  ['rxn']  'http://www.rhea-db.org/'
+# 'HMDB'  ['met'] 'http://www.hmdb.ca/'
+# 'BioCyc'  ['rxn','met'] 'http://biocyc.org'
+# 'Reactome'    ['met'] 'http://www.reactome.org/'
+# 'BiGG'    ['rxn','met']   'http://bigg.ucsd.edu/universal/'
+
+REACTION_ANNOTATIONS = [('rhea', re.compile(r"^\d{5}$")),
+                        ('kegg.reaction', re.compile(r"^R\d+$")),
+                        ('metanetx.reaction', re.compile(r"^MNXR\d+$")),
+                        ('bigg.reaction', re.compile(r"^[a-z_A-Z0-9]+$")),
+                        ('ec-code', re.compile(
+                            r"^\d+\.-\.-\.-|\d+\.\d+\.-\.-|"
+                            r"\d+\.\d+\.\d+\.-|"
+                            r"\d+\.\d+\.\d+\.(n)?\d+$")
+                         ),
+                        ('brenda', re.compile(
+                            r"^\d+\.-\.-\.-|\d+\.\d+\.-\.-|"
+                            r"\d+\.\d+\.\d+\.-|"
+                            r"\d+\.\d+\.\d+\.(n)?\d+$")
+                         ),
+                        ('biocyc', re.compile(
+                            r"^[A-Z-0-9]+(?<!CHEBI)"
+                            r"(\:)?[A-Za-z0-9+_.%-]+$")
+                         )
+                        ]
+REACTION_ANNOTATIONS = OrderedDict(REACTION_ANNOTATIONS)
+
+
+METABOLITE_ANNOTATIONS = [('kegg.compound', re.compile(r"^C\d+$")),
+                          ('seed.compound', re.compile(r"^cpd\d+$")),
+                          ('inchikey', re.compile(r"^[A-Z]{14}\-"
+                                                  r"[A-Z]{10}(\-[A-Z])?")),
+                          ('chebi', re.compile(r"^CHEBI:\d+$")),
+                          ('hmdb', re.compile(r"^HMDB\d{5}$")),
+                          ('reactome', re.compile(r"(^R-[A-Z]{3}-[0-9]+"
+                                                  r"(-[0-9]+)?$)|"
+                                                  r"(^REACT_\d+(\.\d+)?$)")),
+                          ('metanetx.chemical', re.compile(r"^MNXM\d+$")),
+                          ('bigg.metabolite', re.compile(r"^[a-z_A-Z0-9]+$")),
+                          ('biocyc', re.compile(r"^[A-Z-0-9]+(?<!CHEBI)"
+                                                r"(\:)?[A-Za-z0-9+_.%-]+$"))
+                          ]
+METABOLITE_ANNOTATIONS = OrderedDict(METABOLITE_ANNOTATIONS)
+
+
+def find_components_without_annotation(model, components):
+    """
+    Find model components with empty annotation attributes.
+
+    Parameters
+    ----------
+    model : cobra.Model
+        A cobrapy metabolic model.
+    components : {"metabolites", "reactions", "genes"}
+        A string denoting `cobra.Model` components.
+
+    Returns
+    -------
+    list
+        The components without any annotation.
+
+    """
+    return [elem for elem in getattr(model, components) if
+            elem.annotation is None or len(elem.annotation) == 0]
+
+
+def generate_component_annotation_overview(model, components):
+    """
+    Tabulate which MIRIAM databases the component's annotation match.
+
+    Parameters
+    ----------
+    model : cobra.Model
+        A cobrapy metabolic model.
+    components : {"metabolites", "reactions", "genes"}
+        A string denoting `cobra.Model` components.
+
+    Returns
+    -------
+    pandas.DataFrame
+        The index of the table is given by the component identifiers. Each
+        column corresponds to one MIRIAM database and a Boolean entry
+        determines whether the annotation matches.
+
+    """
+    databases = list({
+        "metabolites": METABOLITE_ANNOTATIONS,
+        "reactions": REACTION_ANNOTATIONS
+    }[components])
+    data = list()
+    index = list()
+    for elem in getattr(model, components):
+        index.append(elem.id)
+        data.append(tuple(db in elem.annotation for db in databases))
+    return pd.DataFrame(data, index=index, columns=databases)
+
+
+def generate_component_annotation_miriam_match(model, components):
+    """
+    Tabulate which MIRIAM databases the component's annotation match.
+
+    Parameters
+    ----------
+    model : cobra.Model
+        A cobrapy metabolic model.
+    components : {"metabolites", "reactions", "genes"}
+        A string denoting `cobra.Model` components.
+
+    Returns
+    -------
+    pandas.DataFrame
+        The index of the table is given by the component identifiers. Each
+        column corresponds to one MIRIAM database and a Boolean entry
+        determines whether the annotation matches.
+
+    """
+    def check_annotation(key, annotation):
+        if key not in annotation:
+            return False
+        test = annotation[key]
+        pattern = patterns[key]
+        if isinstance(test, native_str):
+            print("string match!")
+            return pattern.match(test) is not None
+        return all(pattern.match(elem) is not None for elem in test)
+
+    patterns = {
+        "metabolites": METABOLITE_ANNOTATIONS,
+        "reactions": REACTION_ANNOTATIONS
+    }[components]
+    databases = list(patterns)
+    data = list()
+    index = list()
+    for elem in getattr(model, components):
+        index.append(elem.id)
+        data.append(tuple(check_annotation(db, elem.annotation)
+                          for db in databases))
+    return pd.DataFrame(data, index=index, columns=databases)
+
+
+def generate_component_id_namespace_overview(model, components):
+    """
+    Tabulate which MIRIAM databases the component's annotation match.
+
+    Parameters
+    ----------
+    model : cobra.Model
+        A cobrapy metabolic model.
+    components : {"metabolites", "reactions", "genes"}
+        A string denoting `cobra.Model` components.
+
+    Returns
+    -------
+    pandas.DataFrame
+        The index of the table is given by the component identifiers. Each
+        column corresponds to one MIRIAM database and a Boolean entry
+        determines whether the annotation matches.
+
+    """
+    patterns = {
+        "metabolites": METABOLITE_ANNOTATIONS,
+        "reactions": REACTION_ANNOTATIONS
+    }[components]
+    databases = list(patterns)
+    data = list()
+    index = list()
+    for elem in getattr(model, components):
+        index.append(elem.id)
+        data.append(tuple(patterns[db].match(elem.id) is not None
+                          for db in databases))
+    return pd.DataFrame(data, index=index, columns=databases)