Skip to content

Commit

Permalink
Merge pull request #155 from marrink-lab/ismags
Browse files Browse the repository at this point in the history
Implement ISMAGS
  • Loading branch information
jbarnoud committed Feb 1, 2019
2 parents b726dda + c5759f3 commit 7f862f4
Show file tree
Hide file tree
Showing 9 changed files with 1,682 additions and 619 deletions.
1 change: 1 addition & 0 deletions requirements-tests.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
pytest
coverage
pytest-cov
pylint
hypothesis
hypothesis-networkx
codecov
203 changes: 3 additions & 200 deletions vermouth/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import defaultdict
import itertools
import networkx as nx

from .ismags import ISMAGS
from .utils import maxes, first_alpha


Expand All @@ -43,7 +43,8 @@ def categorical_cartesian_product(graph1, graph2, attributes=tuple()):
for idx1, idx2 in itertools.product(graph1, graph2):
node1 = graph1.nodes[idx1]
node2 = graph2.nodes[idx2]
if all(attr in node1 and attr in node2 and node1[attr] == node2[attr] for attr in attributes):
if all((attr in node1 and attr in node2 and node1[attr] == node2[attr])
or (attr not in node1 and attr not in node2) for attr in attributes):
attrs = {}
for attr in set(node1.keys()) | set(node2.keys()):
attrs[attr] = (node1.get(attr, None), node2.get(attr, None))
Expand Down Expand Up @@ -83,204 +84,6 @@ def categorical_maximum_common_subgraph(graph1, graph2, attributes=tuple()):
return matches


def maximum_common_subgraph(graph1, graph2, attributes=tuple()):
product = nx.Graph()
# First, find the MCS between all nodes of degree != 1, such as the carbons
# Nothing new or exciting here.
for g1_node, g2_node in itertools.product(graph1, graph2):
node1 = graph1.nodes[g1_node]
node2 = graph2.nodes[g2_node]
if all(attr in node1 and attr in node2 and node1[attr] == node2[attr] for attr in attributes):
if graph1.degree(g1_node) != 1 and graph2.degree(g2_node) != 1:
product.add_node((g1_node, g2_node))
for (g1_node1, g2_node1), (g1_node2, g2_node2) in itertools.combinations(product.nodes(), 2):
both_edge = graph1.has_edge(g1_node1, g1_node2) and graph2.has_edge(g2_node1, g2_node2)
neither_edge = not graph1.has_edge(g1_node1, g1_node2) and not graph2.has_edge(g2_node1, g2_node2)
# Effectively: not (graph1.has_edge(g1_node1, g1_node2) xor graph2.has_edge(g2_node1, g2_node2))
if g1_node1 != g1_node2 and g2_node1 != g2_node2 and (both_edge or neither_edge):
product.add_edge((g1_node1, g2_node1), (g1_node2, g2_node2))
cliques = nx.find_cliques(product)
# largest = maxes(cliques, key=len)
# We can't do maxes, because it might still grow to be as large. Does make
# things slower though... We could say that it can grow to be at most the
# current size plus the number of degree-1 nodes.
largest = cliques

# Add an empty match in case nothing of degree > 1 matches. In that case we
# still need to do the loop below.
largest = itertools.chain([[]], largest)

# Now, for every MCS we found, look at the nodes of degree 1. The
# attributes still need to match. In addition, they need to have the same
# (mapped) neighbour, or the neighbour must be missing from the graph2 graph
all_cliques = []
for clique in largest:
match = dict(clique)
product = nx.Graph()
product.add_nodes_from(clique)
for g1_node, g2_node in itertools.product(graph1, graph2):
node1 = graph1.nodes[g1_node]
node2 = graph2.nodes[g2_node]
# We can't do this above, because we need the match to translate
# nodes from graph graph1 to graph graph2 to see whether their neighbours
# correspond.
if (graph1.degree(g1_node) <= 1 or graph2.degree(g2_node) <= 1) and\
all(attr in node1 and attr in node2 and node1[attr] == node2[attr] for attr in attributes):
g1_neighbors = [match.get(n, None) for n in graph1.neighbors(g1_node)]
# If no neighbors are found for g1_node, or if any of them are
# the same in graph2, they're compatible.

# FIXME?
# This eliminates some nodes from the MCS, since it's possible
# the MCS does not include *all* nodes in match. This means
# that some nodes should be considered compatible, even if they
# have different neighbours, but only if that neighbor is not
# part of the final MCS. This makes testing a little tricky,
# since categorical_maximum_common_subgraph *does* find them.
# It'll be a cornercase anyway.
if not g1_neighbors or None in g1_neighbors or any(n in g1_neighbors for n in graph2.neighbors(g2_node)):
product.add_node((g1_node, g2_node))
for (g1_node1, g2_node1), (g1_node2, g2_node2) in itertools.combinations(product.nodes(), 2):
both_edge = graph1.has_edge(g1_node1, g1_node2) and graph2.has_edge(g2_node1, g2_node2)
neither_edge = not graph1.has_edge(g1_node1, g1_node2) and not graph2.has_edge(g2_node1, g2_node2)
# Effectively: not (graph1.has_edge(g1_node1, g1_node2) xor graph2.has_edge(g2_node1, g2_node2))
if g1_node1 != g1_node2 and g2_node1 != g2_node2 and (both_edge or neither_edge):
product.add_edge((g1_node1, g2_node1), (g1_node2, g2_node2))
# TODO: This duplicates a lot of effort. Maybe create the compatibility
# graph first from all cliques, and find the cliques only once?
this_pass = nx.find_cliques(product)
all_cliques.append(this_pass)
# And finally, find the largest MCS in all cliques graph2.
largest = maxes(itertools.chain(*all_cliques), key=len)
matches = set(frozenset(m) for m in largest) # remove duplicates
matches = [dict(clique) for clique in matches]

return matches


def isomorphism(reference, residue):
"""
Finds matching atoms between ``reference`` and ``residue``. ``residue`` should be
a subgraph of ``reference``. Matchin is done based on connectivity and
the ``element`` attribute of the nodes.
The subgraph isomorphism is first calculated using non-hydrogen atoms only.
These matches are then extended to include one option for the hydrogen
isomorphism. This is done because otherwise a combinatorial problem is
created: take for example an alkane chain: the carbon atoms match in one
way. Then, for every :math:``CH_2`` group there are two, independent options
for the hydrogen isomorphism. This would result in :math:``2^n`` subgraph
isomorphisms for :math:``n`` carbon atoms.
This means that the matches found will not be optimal for the hydrogens.
This is acceptable, since hydrogrens are supposed to be equal. Let's say
you have some sort of chiral atom with two hydrogens: it's not chiral and
the hydrogen atoms are equal. Let's now say one of the two is a deuterium:
in that case you should have a proper 'element' header, and the subgraph
will be matched correctly.
Parameters
----------
reference : networkx.Graph
The reference graph.
residue : networkx.Graph
The graph to match to ``reference``.
Returns
-------
matches : list[dict]
The matches found. The dictionaries have node indices of ``reference`` as
keys and node indices of ``residue`` as values. Is an empty list if
``residue`` is not a subgraph of ``reference``.
"""
# TODO: refactor this thing to accept node and edge compatibility checkers
matches = []
# H_idxs = [idx for idx in residue if residue.node[idx]['element'] == 'H']
H_idxs = [idx for idx in residue if residue.degree(idx) == 1]
heavy_res = nx.Graph(residue).copy()
heavy_res.remove_nodes_from(H_idxs)

# ref_H_idxs = [idx for idx in reference if reference.degree(idx) == 1]
# heavy_ref = nx.Graph(reference).copy()
# heavy_ref.remove_nodes_from(ref_H_idxs)
# First, generate all the isomorphisms on heavy atoms. For each of these
# we'll find *something* where the hydrogens match.
GM = ElementGraphMatcher(reference, heavy_res)
first_matches = list(GM.subgraph_isomorphisms_iter())
for match in first_matches:
reverse_match = {v: k for k, v in match.items()}
for res_H_idx in H_idxs:
# We know which parent atom this hydrogen is bound to, and we know
# how it matches to the reference. We're going to find all the
# neighboring atoms of the reference parent, and see if the name
# of this hydrogen atom matches with any of those neighbors. If so,
# we extend the match that way.
# It should be noted that in exceptional cases where atomnames are
# very wrong, this might cause a problem?
res_neighbor = list(residue[res_H_idx].keys())[0]
if res_neighbor not in reverse_match:
continue
ref_neighbor = reverse_match[res_neighbor]
H_names = defaultdict(list)
for idx in reference[ref_neighbor]:
if reference.degree(idx) == 1:
H_names[reference.nodes[idx]['atomname']].append(idx)
H_names = dict(H_names)
res_H_name = residue.nodes[res_H_idx]['atomname']
if res_H_name in H_names:
if len(H_names[res_H_name]) != 1:
continue
ref_H_idx = H_names[res_H_name][0]
if ref_H_idx not in match and reference.nodes[ref_H_idx]['element'] == residue.nodes[res_H_idx]['element']:
reverse_match[res_H_idx] = ref_H_idx
match[ref_H_idx] = res_H_idx
GM_large = ElementGraphMatcher(reference, residue)
# Put the knowledge from the heavy atom isomorphism back in. Note that
# ElementGraphMatched is modified to enable this and is no longer
# re-entrant.
# Indices in match do not have to be changed to account for interlaced
# hydrogens: the node-indices in heavy_res and residue are the same.
GM_large.core_1 = match # pylint: disable=attribute-defined-outside-init
GM_large.core_2 = reverse_match # pylint: disable=attribute-defined-outside-init
outcome = GM_large.subgraph_isomorphisms_iter()
# Take just the first match found, otherwise it becomes a combinatorics
# problem (consider an alkane chain). This is fine though, since
# hydrogrens are supposed to be equal. Let's say you have some sort of
# chiral atom with two hydrogens: It's not chiral. Let's now say one of
# the two is a deuterium: in that case you should have a proper
# 'element' header, and the subgraph will be matched correctly.
# So worst case scenario we rename all hydrogens. This is acceptable
# since they're equal.
# And do islice since there may be none.
# This will fail for e.g. oxygens which have degree 1 in the reference,
# but are substituted with PTMs in the actual molecule. In that case
# the atomnames might be flipped, and make PTM identification
# troublesome. For example: C(=O)OH. That's why we extend the match to
# include degree-1 nodes above.
if match:
matches.extend(itertools.islice(outcome, 1))
else:
matches.extend(outcome)
matches = sorted(matches,
key=lambda m: rate_match(reference, residue, m),
reverse=True)
return matches


class ElementGraphMatcher(nx.isomorphism.GraphMatcher):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
super().initialize()

def initialize(self):
return

def semantic_feasibility(self, node1, node2):
# TODO: implement (partial) wildcards
elem1 = self.G1.node[node1]['element']
elem2 = self.G2.node[node2]['element']
return elem1 == elem2


def blockmodel(G, partitions, **attrs):
"""
Analogous to networkx.blockmodel, but can deal with incomplete partitions,
Expand Down

0 comments on commit 7f862f4

Please sign in to comment.