Merge pull request #693 from opencobra/fix-find-biomass

Fix find biomass
opencobra · Aug 10, 2020 · cdeb804
2 parents 4c6739e + c7a9b47
commit cdeb804
Show file tree

Hide file tree

Showing 5 changed files with 127 additions and 38 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -4,20 +4,17 @@ root = true
 [*]
 charset = utf-8
 indent_style = space
-indent_size = 2
+indent_size = 4
 end_of_line = lf
 insert_final_newline = true
 trim_trailing_whitespace = true
+max_line_length = 80
 
-[*.{py,pyi}]
-indent_size = 4
-
-[*.{ini,cfg}]
-indent_size = 4
+[*.{json,yml}]
+indent_size = 2
 
 [*.{md,rst}]
 trim_trailing_whitespace = false
 
 [Makefile]
 indent_style = tab
-indent_size = 4
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -3,6 +3,7 @@ History
 
 Next Release
 ------------
+* Change the logic for identifying biomass reactions to be less eager.
 
 0.11.0 (2020-06-26)
 -------------------

diff --git a/src/memote/experimental/tabular.py b/src/memote/experimental/tabular.py
@@ -55,7 +55,7 @@ def read_tabular(filename, dtype_conversion=None):
             filename, sep="\t", dtype=dtype_conversion, encoding="utf-8"
         )
     elif "xls" in ext or "xlsx" in ext:
-        df = pd.read_excel(filename, dtype=dtype_conversion, encoding="utf-8")
+        df = pd.read_excel(filename, dtype=dtype_conversion)
     # TODO: Add a function to parse ODS data into a pandas data frame.
     else:
         raise ValueError("Unknown file format '{}'.".format(ext))

diff --git a/src/memote/support/helpers.py b/src/memote/support/helpers.py
@@ -277,42 +277,74 @@ def find_converting_reactions(model, pair):
     return frozenset(hits)
 
 
-def filter_biomass(
-    component,
-    sbo_term,
-    buzzwords,
-):
+def filter_sbo_term(component, sbo_term):
     """
-    Return True if the component matches a biomass description.
+    Return true if the component is annotated with the given SBO term.
 
     Parameters
     ----------
     component : cobra.Reaction or cobra.Metabolite
         Either a reaction or a metabolite instance.
     sbo_term : str
         The term for either biomass production or biomass.
-    buzzwords : collection of patterns
-        One or more regular expression patterns to match against the name or
-        identifier of the component.
+
+    """
+    return component.annotation.get("sbo", "") == sbo_term
+
+
+def filter_match_name(component, buzzwords):
+    """
+    Return whether the component's name matches a biomass description.
+
+    Notes
+    -----
+    Regex patterns are necessary here to prevent, for example, 'non-growth' from
+    matching.
+
+    Parameters
+    ----------
+    component : cobra.Reaction or cobra.Metabolite
+        Either a reaction or a metabolite instance.
+    buzzwords : collection of regex patterns
+        One or more regular expression patterns to match against the name of the
+        component.
 
     Returns
     -------
     bool
         True if there was any match at all.
 
     """
-    if component.annotation is not None and 'sbo' in component.annotation and \
-            component.annotation['sbo'] == sbo_term:
-        return True
-    if component.name is not None:
-        name = component.name.lower()
-        if any(b.match(name) for b in buzzwords):
-            return True
-    if component.id is not None:
-        identifier = component.id.lower()
-        if any(b.match(identifier) for b in buzzwords):
-            return True
-    return False
+    if component.name is None:
+        return False
+    name = component.name.lower()
+    return any(b.match(name) for b in buzzwords)
+
+
+def filter_identifier(component, buzzwords):
+    """
+    Return whether the component's identifier contains a biomass description.
+
+    Notes
+    -----
+    We check substring presence here because identifiers are often prefixed with
+    ``M_`` or ``R_``.
+
+    Parameters
+    ----------
+    component : cobra.Reaction or cobra.Metabolite
+        Either a reaction or a metabolite instance.
+    buzzwords : iterable of str
+        One or more buzzwords that the identifier should contain.
+
+    Returns
+    -------
+    bool
+        True if there was any match at all.
+
+    """
+    identifier = component.id.lower()
+    return any(b in identifier for b in buzzwords)
 
 
 @lrudecorator(size=2)
@@ -342,22 +374,59 @@ def find_biomass_reaction(model):
         Identified biomass reactions (if any).
 
     """
-    # Patterns are necessary here to prevent, for example, 'non-growth' from
-    # matching.
-    buzzwords = (
+    boundary = frozenset(model.boundary)
+
+    # 1.
+    candidates = {
+        r for r in model.reactions
+        if filter_sbo_term(r, 'SBO:0000629')
+    }
+    candidates.difference_update(boundary)
+    if candidates:
+        return sorted(candidates, key=attrgetter('id'))
+
+    # 2.
+    name_buzzwords = (
         re.compile(r'\bbiomass'), re.compile(r'\bgrowth'), re.compile(r'bof')
     )
+    id_buzzwords = ('biomass',)
     candidates = {
         r for r in model.reactions
-        if filter_biomass(r, 'SBO:0000629', buzzwords)
+        if filter_match_name(r, name_buzzwords) or
+        filter_identifier(r, id_buzzwords)
+    }
+    candidates.difference_update(boundary)
+    if candidates:
+        return sorted(candidates, key=attrgetter('id'))
+
+    # 3.
+    name_buzzwords = (re.compile(r'\bbiomass'),)
+    id_buzzwords = ('biomass',)
+    sbo_metabolites = {
+        m for m in model.metabolites
+        if filter_sbo_term(m, 'SBO:0000649')
     }
-    buzzwords = (re.compile(r'\bbiomass'),)
     metabolites = {
         m for m in model.metabolites
-        if filter_biomass(m, 'SBO:0000649', buzzwords)
+        if filter_match_name(m, name_buzzwords) or
+        filter_identifier(m, id_buzzwords)
     }
-    candidates.update({r for m in metabolites for r in m.reactions})
-    return sorted(candidates.difference(model.boundary), key=attrgetter('id'))
+    # Many metabolites may match 'SBO:0000649', we filter those further by name
+    # and ID.
+    sbo_metabolites.intersection_update(metabolites)
+    if sbo_metabolites:
+        candidates = {r for m in sbo_metabolites for r in m.reactions}
+        candidates.difference_update(boundary)
+        if candidates:
+            return sorted(candidates, key=attrgetter('id'))
+
+    # 4.
+    candidates = {r for m in metabolites for r in m.reactions}
+    candidates.difference_update(boundary)
+    if candidates:
+        return sorted(candidates, key=attrgetter('id'))
+
+    return []
 
 
 @lrudecorator(size=2)

diff --git a/tests/test_for_support/test_for_helpers.py b/tests/test_for_support/test_for_helpers.py
@@ -456,12 +456,33 @@ def biomass_sbo(base):
 
 @register_with(MODEL_REGISTRY)
 def biomass_metabolite(base):
-    """Provide a model with a reaction that will be identified as biomass."""
+    """Provide a model with a metabolite that will be identified as biomass."""
+    a = cobra.Metabolite("Protein_c", compartment="c")
+    b = cobra.Metabolite("DNA_c", compartment="c")
+    c = cobra.Metabolite("RNA_c", compartment="c")
+    d = cobra.Metabolite("GAM_c", compartment="c")
+    e = cobra.Metabolite("Biomass_c", compartment="c")
+    rxn1 = cobra.Reaction("R0001")
+    rxn1.add_metabolites({a: -1, b: -1, c: -1, d: -1, e: 1})
+    rxn2 = cobra.Reaction("EX_Biomass")
+    rxn2.add_metabolites({e: -1})
+    base.add_reactions([rxn1, rxn2])
+    return base
+
+
+@register_with(MODEL_REGISTRY)
+def biomass_sbo_metabolite(base):
+    """Provide a model with a metabolite that will be identified as biomass."""
     a = cobra.Metabolite("Protein_c", compartment="c")
+    a.annotation["sbo"] = "SBO:0000649"
     b = cobra.Metabolite("DNA_c", compartment="c")
+    b.annotation["sbo"] = "SBO:0000649"
     c = cobra.Metabolite("RNA_c", compartment="c")
+    c.annotation["sbo"] = "SBO:0000649"
     d = cobra.Metabolite("GAM_c", compartment="c")
+    d.annotation["sbo"] = "SBO:0000649"
     e = cobra.Metabolite("Biomass_c", compartment="c")
+    e.annotation["sbo"] = "SBO:0000649"
     rxn1 = cobra.Reaction("R0001")
     rxn1.add_metabolites({a: -1, b: -1, c: -1, d: -1, e: 1})
     rxn2 = cobra.Reaction("EX_Biomass")
@@ -653,6 +674,7 @@ def test_largest_compartment_id_met(model, expected):
     ("biomass_buzzwords", 1),
     ("biomass_sbo", 1),
     ("biomass_metabolite", 1),
+    ("biomass_sbo_metabolite", 1),
 ], indirect=["model"])
 def test_find_biomass_reaction(model, expected):
     """