Skip to content

Commit

Permalink
Merge pull request #144 from marrink-lab/fix-140
Browse files Browse the repository at this point in the history
Fix #140
  • Loading branch information
jbarnoud committed Oct 18, 2018
2 parents 9f7ee10 + cc4de41 commit 5d835c5
Show file tree
Hide file tree
Showing 2 changed files with 247 additions and 30 deletions.
84 changes: 54 additions & 30 deletions vermouth/processors/canonicalize_modifications.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Provides a Processor that identifies unexpected atoms such as PTMs and
protonations, and canonicalizes their attributes based on modifications known
in the forcefield.
"""

from collections import defaultdict
import itertools
Expand All @@ -28,6 +33,9 @@


class PTMGraphMatcher(nx.isomorphism.GraphMatcher):
"""
Implements matching logic for PTMs
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

Expand All @@ -52,7 +60,7 @@ def semantic_feasibility(self, node1, node2):
return False


def find_PTM_atoms(molecule):
def find_ptm_atoms(molecule):
"""
Finds all atoms in molecule that have the node attribute ``PTM_atom`` set
to a value that evaluates to ``True``. ``molecule`` will be traversed
Expand All @@ -77,7 +85,7 @@ def find_PTM_atoms(molecule):
# In addition, unrecognized atoms have been labeled with the PTM attribute.
extra_atoms = set(n_idx for n_idx in molecule
if molecule.nodes[n_idx].get('PTM_atom', False))
PTMs = []
ptms = []
while extra_atoms:
# First PTM atom we'll look at
first = next(iter(extra_atoms))
Expand All @@ -102,12 +110,19 @@ def find_PTM_atoms(molecule):
if not to_see:
# We've traversed the interesting bit of the tree
break
# Although we know how far our tree spans we may still have work to do
# for terminal nodes. There has to be a more elegant solution though.
for node in to_see:
if node in extra_atoms:
atoms.add(node)
else:
anchors.add(node)
extra_atoms -= atoms
PTMs.append((atoms, anchors))
return PTMs
ptms.append((atoms, anchors))
return ptms


def identify_ptms(residue, residue_ptms, known_PTMs):
def identify_ptms(residue, residue_ptms, known_ptms):
"""
Identifies all PTMs in ``known_PTMs`` nescessary to describe all PTM atoms in
``residue_ptms``. Will take PTMs such that all PTM atoms in ``residue``
Expand All @@ -128,7 +143,7 @@ def identify_ptms(residue, residue_ptms, known_PTMs):
The nodes in the graph must have the `PTM_atom` attribute (True or
False). It should be True for atoms that are not part of the PTM
itself, but describe where it is attached to the molecule.
In addition, it's nodes must have the `atomname` attribute, which will
In addition, its nodes must have the `atomname` attribute, which will
be used to recognize where the PTM is anchored, or to correct the
atomnames. Lastly, the nodes may have a `replace` attribute, which
is a dictionary of ``{attribute_name: new_value}`` pairs. The special
Expand All @@ -148,34 +163,43 @@ def identify_ptms(residue, residue_ptms, known_PTMs):
KeyError
Not all PTM atoms in ``residue`` can be covered with ``known_PTMs``.
"""
# BASECASE: residue_ptms is empty
if not any(res_ptm[0] for res_ptm in residue_ptms):
to_cover = set()
for res_ptm in residue_ptms:
to_cover.update(res_ptm[0])
to_cover.update(res_ptm[1])
return _cover_graph(residue, to_cover, known_ptms)


def _cover_graph(graph, to_cover, fragments):
# BASECASE: to_cover is empty
if not to_cover:
return []
# REDUCTION: Apply one of known_PTMs, remove those atoms from residue_ptms

# All non-PTM atoms in residue are always available for matching...
available = set(n_idx for n_idx in graph
if not graph.nodes[n_idx].get('PTM_atom', False))
# ... and add those we still need to cover
available.update(to_cover)

# REDUCTION: Apply one of fragments, remove those atoms from to_cover
# COMBINATION: add the applied option to the output.
for idx, option in enumerate(known_PTMs):
ptm, matcher = option
for idx, option in enumerate(fragments):
graphlet, matcher = option
matches = list(matcher.subgraph_isomorphisms_iter())
# Matches: [{res_idxs: ptm_idxs}, {...}, ...]
# All non-PTM atoms in residue are always available for matching...
available = set(n_idx for n_idx in residue
if not residue.nodes[n_idx].get('PTM_atom', False))
# ... and only add non consumed PTM atoms
for res_ptm in residue_ptms:
available.update(res_ptm[0])
# Matches: [{graph_idxs: fragment_idxs}, {...}, ...]
for match in matches:
matching = set(match.keys())
has_anchor = any(m in r[1] for m in matching for r in residue_ptms)
if matching.issubset(available) and has_anchor:
new_res_ptms = []
for res_ptm in residue_ptms:
new_res_ptms.append((res_ptm[0] - matching, res_ptm[1]))
# TODO: one of the matching atoms must be an anchor. Should be
# handled by PTMGraphMatcher already, assuming every PTM graph has
# at least one non-ptm atom specified
if matching <= available:
# Continue with the remaining ptm atoms, and try just this
# option and all smaller.
try:
return [(ptm, match)] + identify_ptms(residue, new_res_ptms, known_PTMs[idx:])
rest_cover = _cover_graph(graph, to_cover - matching, fragments[idx:])
except KeyError:
continue
return [(graphlet, match)] + rest_cover
raise KeyError('Could not identify PTM')


Expand Down Expand Up @@ -219,21 +243,21 @@ def fix_ptm(molecule):
could not be recognized must be labeled with the attribute
PTM_atom=True.
'''
PTM_atoms = find_PTM_atoms(molecule)
ptm_atoms = find_ptm_atoms(molecule)

def key_func(ptm_atoms):
node_idxs = ptm_atoms[-1] # The anchors
return sorted(molecule.nodes[idx]['resid'] for idx in node_idxs)

ptm_atoms = sorted(PTM_atoms, key=key_func)
ptm_atoms = sorted(ptm_atoms, key=key_func)

resid_to_idxs = defaultdict(list)
for n_idx in molecule:
residx = molecule.nodes[n_idx]['resid']
resid_to_idxs[residx].append(n_idx)
resid_to_idxs = dict(resid_to_idxs)

known_PTMs = molecule.force_field.modifications
known_ptms = molecule.force_field.modifications

for resids, res_ptms in itertools.groupby(ptm_atoms, key_func):
# How to solve this graph covering problem
Expand All @@ -257,14 +281,14 @@ def key_func(ptm_atoms):
# TODO: Maybe use graph_utils.make_residue_graph? Or rewrite that
# function?
residue = molecule.subgraph(n_idxs)
options = allowed_ptms(residue, res_ptms, known_PTMs)
options = allowed_ptms(residue, res_ptms, known_ptms)
# TODO/FIXME: This includes anchors in sorting by size.
options = sorted(options, key=lambda opt: len(opt[0]), reverse=True)
try:
identified = identify_ptms(residue, res_ptms, options)
except KeyError:
LOGGER.exception('Could not identify the modifications for'
' residues {}, involving atoms {}',
' residues {}, involving atoms {}',
['{resname}{resid}'.format(**molecule.nodes[resid_to_idxs[resid][0]])
for resid in sorted(set(resids))],
['{atomid}-{atomname}'.format(**molecule.nodes[idx])
Expand Down Expand Up @@ -297,6 +321,7 @@ def key_func(ptm_atoms):
type='remove-atom')
molecule.remove_node(mol_idx)
n_idxs.remove(mol_idx)
resid_to_idxs[mol_node['resid']].remove(mol_idx)
break
if mol_node.get(attr_name) != val:
fmt = 'Changing attribute {} from {} to {} for atom {}'
Expand All @@ -307,7 +332,6 @@ def key_func(ptm_atoms):
for n_idx in n_idxs:
molecule.nodes[n_idx]['modifications'] = molecule.nodes[n_idx].get('modifications', [])
molecule.nodes[n_idx]['modifications'].append(ptm)
return None


class CanonicalizeModifications(Processor):
Expand Down
193 changes: 193 additions & 0 deletions vermouth/tests/test_ptm_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# -*- coding: utf-8 -*-
# Copyright 2018 University of Groningen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Test PTM detection and canonicalisation.
"""

import networkx as nx
import pytest

import vermouth
import vermouth.processors.canonicalize_modifications as canmod

# pylint: disable=redefined-outer-name


@pytest.fixture
def known_ptm_graphs():
"""
Provide some known PTMs in a defined order
"""
ptm_graphs = []

nh_ptm = nx.Graph(name='NH')
nh_ptm.add_nodes_from([
(0, {'atomname': 'H', 'PTM_atom': True, 'element': 'H'}),
(1, {'atomname': 'N', 'PTM_atom': False, 'element': 'N'}),
])
nh_ptm.add_edge(0, 1)
ptm_graphs.append(nh_ptm)

cooc_ptm = nx.Graph(name='COOC')
cooc_ptm.add_nodes_from([
(0, {'atomname': 'C', 'PTM_atom': False, 'element': 'C'}),
(1, {'atomname': 'O', 'PTM_atom': True, 'element': 'O'}),
(2, {'atomname': 'O', 'PTM_atom': True, 'element': 'O'}),
(3, {'atomname': 'C', 'PTM_atom': False, 'element': 'C'}),
])
cooc_ptm.add_edges_from([(0, 1), (1, 2), (2, 3)])
ptm_graphs.append(cooc_ptm)

return sorted(ptm_graphs, key=len, reverse=True)


def make_molecule(atoms, edges):
"""
Makes molecules from atoms and edges
"""
mol = vermouth.molecule.Molecule()
mol.add_nodes_from(atoms.items())
mol.add_edges_from(edges)
return mol


@pytest.mark.parametrize('atoms, edges, expected', [
({}, [], []),
(
{
0: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 1},
1: {'atomname': 'H', 'PTM_atom': True, 'element': 'H', 'resid': 1},
2: {'atomname': 'N', 'PTM_atom': True, 'element': 'N', 'resid': 1},
},
[(0, 1), (1, 2)],
[({1, 2}, {0})]
),
(
{
0: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 1},
1: {'atomname': 'H', 'PTM_atom': False, 'element': 'H', 'resid': 1},
2: {'atomname': 'N', 'PTM_atom': True, 'element': 'N', 'resid': 1},
},
[(0, 1), (1, 2)],
[({2}, {1})]
),
(
{
0: {'atomname': 'N', 'PTM_atom': True, 'element': 'N', 'resid': 1},
1: {'atomname': 'H', 'PTM_atom': False, 'element': 'H', 'resid': 1},
2: {'atomname': 'N', 'PTM_atom': True, 'element': 'N', 'resid': 1},
},
[(0, 1), (1, 2)],
[({0}, {1}), ({2}, {1})]
),
(
{
0: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 1},
1: {'atomname': 'H', 'PTM_atom': True, 'element': 'H', 'resid': 1},
2: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 2},
},
[(0, 1), (1, 2)],
[({1}, {0, 2})]
),
(
{
0: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 1},
1: {'atomname': 'H', 'PTM_atom': True, 'element': 'H', 'resid': 1},
2: {'atomname': 'N', 'PTM_atom': True, 'element': 'N', 'resid': 2},
3: {'atomname': 'CA', 'PTM_atom': False, 'element': 'N', 'resid': 2},
},
[(0, 1), (1, 2), (2, 3)],
[({1, 2}, {0, 3})]
),
])
def test_ptm_groups(atoms, edges, expected):
"""
Make sure PTM atoms are grouped correctly with appropriate anchors
"""
molecule = make_molecule(atoms, edges)

found = canmod.find_ptm_atoms(molecule)
assert expected == found


@pytest.mark.parametrize('atoms, edges, expected', [
pytest.param(
# This needs to raise a KeyError, because not all the anchors are
# covered. This is the root of #140
{
0: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 1},
1: {'atomname': 'H', 'PTM_atom': True, 'element': 'H', 'resid': 1},
2: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 2},
},
[(0, 1), (1, 2)],
[('NH', {1: 0, 2: 1})],
marks=pytest.mark.xfail(raises=KeyError, strict=True)
),
(
# Simplest case: one PTM atom for 1 residue
{
0: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 1},
1: {'atomname': 'H', 'PTM_atom': True, 'element': 'H', 'resid': 1},
},
[(0, 1)],
[('NH', {0: 1, 1: 0})]
),
(
# Two PTM atoms with a shared anchor
{
0: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 1},
1: {'atomname': 'H', 'PTM_atom': True, 'element': 'H', 'resid': 1},
2: {'atomname': 'H', 'PTM_atom': True, 'element': 'H', 'resid': 1},
},
[(0, 1), (0, 2)],
[('NH', {0: 1, 1: 0}), ('NH', {0: 1, 2: 0})]
),
(
# Two PTM atoms with two anchors covered (?) by 2 fragments
{
0: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 1},
1: {'atomname': 'H', 'PTM_atom': True, 'element': 'H', 'resid': 1},
2: {'atomname': 'H', 'PTM_atom': True, 'element': 'H', 'resid': 2},
3: {'atomname': 'N', 'PTM_atom': False, 'element': 'N', 'resid': 2},
},
[(0, 1), (1, 2), (2, 3)],
[('NH', {0: 1, 1: 0}), ('NH', {2: 0, 3: 1})]
),
(
# Two PTM atoms with two anchors covered by 1 fragment
{
0: {'atomname': 'C', 'PTM_atom': False, 'element': 'C', 'resid': 1},
1: {'atomname': 'O', 'PTM_atom': True, 'element': 'O', 'resid': 1},
2: {'atomname': 'O', 'PTM_atom': True, 'element': 'O', 'resid': 2},
3: {'atomname': 'C', 'PTM_atom': False, 'element': 'C', 'resid': 2},
},
[(0, 1), (1, 2), (2, 3)],
[('COOC', {0: 0, 1: 1, 2: 2, 3: 3})]
),
])
def test_identify_ptms(known_ptm_graphs, atoms, edges, expected):
"""
Make sure PTMs are identified correctly.
"""
molecule = make_molecule(atoms, edges)

ptms = canmod.find_ptm_atoms(molecule)
known_ptms = [(ptm_graph, canmod.PTMGraphMatcher(molecule, ptm_graph))
for ptm_graph in known_ptm_graphs]

found = canmod.identify_ptms(molecule, ptms, known_ptms)
found = [(ptm.name, match) for ptm, match in found]
assert found == expected

0 comments on commit 5d835c5

Please sign in to comment.