Add substructure 1:1 node matching

When you want to preserve some property indexed to the nodes, it's helpful to construct the node map. Documentation here isn't complete because `atommatch` and `bondmatch` are undocumented, but hopefully it sends users in the right direction.
mojaie · Dec 1, 2020 · 63c1c20 · 63c1c20
1 parent 8844432
commit 63c1c20
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 2 deletions.
diff --git a/src/graph/isomorphism/vf2.jl b/src/graph/isomorphism/vf2.jl
@@ -325,12 +325,17 @@ issubgraphmatch(G, H; kwargs...) = subgraphmatch(G, H; kwargs...) !== nothing
 # Edge induced subgraph isomorphism
 
 """
-    edgesubgraphmatch(G::AbstractGraph, H::AbstractGraph; kwargs...) -> Iterator
+    edgesubgraphmatch(G::AbstractGraph, H::AbstractGraph;
+                      nodematcher=(g,h)->true , edgematcher=(g,h)->true,
+                      kwargs...) -> Iterator
 
 Generate edge induced subgraph isomorphism mappings between `G` and `H`.
 The returned iterator has `ig => ih` pairs that correspond to the indices of matching
 edges in `G` and `H`, respectively.
 
+`nodematcher` and `edgematcher` control the features needed to be counted as a match.
+[`atommatch`](@ref) and [`bondmatch`](@ref) can be used to construct these functions.
+
 See [`MolecularGraph.edgesubgraph`](@ref) to construct the subgraphs that result from the match.
 """
 edgesubgraphmatches(G, H; kwargs...

diff --git a/src/substructure.jl b/src/substructure.jl
@@ -107,6 +107,42 @@ function querymatch(mol::UndirectedGraph, query::QueryMol; kwargs...)
     return
 end
 
+"""
+    nmap = emaptonmap(emap, mol, query)
+
+Convert an edge-based mapping, of the form returned by [`edgesubgraphmatches`](@ref), into
+a map between nodes. Commonly, `nmap[i]` is a length-1 vector `[j]`, where `i=>j` is the mapping
+from `nodeattr(query, i)` to `nodeattr(mol, j)`. In cases where the mapping is ambiguous,
+`nmap[i]` may be multivalued.
+"""
+function emaptonmap(emap, mol::UndirectedGraph, query::QueryMol)
+    nmol, nq = nodecount(mol), nodecount(query)
+    nq <= nmol || throw(ArgumentError("query must be a substructure of mol"))
+    # Each node in the query edges can map to either of two nodes in mol
+    qnodeoptions = [Tuple{Int,Int}[] for _ = 1:nq]
+    for (edgeidmol, edgeidq) in emap
+        edgemol, edgeq = getedge(mol, edgeidmol), getedge(query, edgeidq)
+        for nq in edgeq
+            push!(qnodeoptions[nq], edgemol)
+        end
+    end
+    # For nodes connected to two or more other nodes, intersection results in a unique answer
+    assignment = [intersect(nodeops...) for nodeops in qnodeoptions]
+    # For the singly-connected nodes, assign them by eliminating ones already taken
+    taken = falses(nmol)
+    for a in assignment
+        if length(a) == 1
+            taken[a[1]] = true
+        end
+    end
+    for a in assignment
+        if length(a) > 1
+            deleteat!(a, findall(taken[a]))
+        end
+    end
+    return assignment
+end
+
 """
     isquerymatch(mol, query; kwargs...)
 

diff --git a/test/substructure.jl b/test/substructure.jl
@@ -2,7 +2,7 @@
 using MolecularGraph.Graph: lgnodematcher, lgedgematcher
 using MolecularGraph:
     fastidentityfilter, fastsubstrfilter,
-    atommatch, bondmatch
+    atommatch, bondmatch, emaptonmap
 
 @testset "substructure" begin
 
@@ -108,4 +108,49 @@ end
     @test !isquerymatch(hetero4, disconn)
 end
 
+@testset "node matching" begin
+    function nodematch(mol, query, idx=1, len=1)
+        afunc = atommatch(mol, query)
+        bfunc = bondmatch(mol, query)
+        emaps = edgesubgraphmatches(mol, query, nodematcher=afunc, edgematcher=bfunc)
+        @test length(emaps) == len
+        emap = emaps[idx]
+        return emaptonmap(emap, mol, query)
+    end
+    function single(v)   # like `only` on Julia 1.4+
+        length(v) == 1 || error("expected a single entry")
+        return v[1]
+    end
+    # 6-membered carbon ring fused to a 5-membered carbon ring with an attached oxygen
+    query = smartstomol("[#6]~1~[#6]~[#6]~[#6]~2~[#6](~[#6]~1)~[#6]~[#6]~[#6]~2~[#8]")
+    # The carbon numbering looks like this:
+    #         3     9
+    #      2     4     8
+    #      1     5     7
+    #         6
+    # 1-Indanone, PubChem CID6735
+    mol = smilestomol("C1CC(=O)C2=CC=CC=C21")
+    nmap = nodematch(mol, query)
+    @test single.(nmap) == [8,7,6,5,10,9,1,2,3,4]  # indices in mol corresponding to atoms in query
+    # 1-Acenaphthenone, PubChem CID75229
+    mol = smilestomol("C1C2=CC=CC3=C2C(=CC=C3)C1=O")
+    nmap = nodematch(mol, query)
+    @test single.(nmap) == [11,10,9,8,7,6,2,1,12,13]
+    # Ninhydrin, PubChem CID10236 (2 matches)
+    mol = smilestomol("C1=CC=C2C(=C1)C(=O)C(C2=O)(O)O")
+    nmaps = ([1,2,3,4,5,6,7,9,10,11], [2,1,6,5,4,3,10,9,7,8])
+    nmap = nodematch(mol, query, 1, 2)
+    snmap = single.(nmap)
+    @test snmap ∈ nmaps
+    nmap = nodematch(mol, query, 2, 2)
+    snmap = single.(nmap)
+    @test snmap ∈ nmaps
+
+    # Ambiguous match
+    query = smartstomol("[H][H]")
+    mol = smilestomol("[H][H]")
+    nmap = nodematch(mol, query)
+    @test nmap == [[1,2], [1,2]]
+end
+
 end # substructure