From bdbee7b10b924d3ff1651d4127a35800c6796f09 Mon Sep 17 00:00:00 2001
From: Fabian Gruenewald <f.grunewald@rug.nl>
Date: Fri, 3 May 2024 15:52:05 +0200
Subject: [PATCH] EZ isomerism and chirality

---
 pysmiles/read_smiles.py   | 39 +++++++++++++++--
 pysmiles/smiles_helper.py | 80 +++++++++++++++++++++++++++++++++--
 tests/test_read_smiles.py | 88 +++++++++++++++++++++++++++++++++------
 3 files changed, 188 insertions(+), 19 deletions(-)

diff --git a/pysmiles/read_smiles.py b/pysmiles/read_smiles.py
index 79e9634..673b189 100644
--- a/pysmiles/read_smiles.py
+++ b/pysmiles/read_smiles.py
@@ -24,7 +24,8 @@
 
 from .smiles_helper import (add_explicit_hydrogens, remove_explicit_hydrogens,
                             parse_atom, fill_valence, mark_aromatic_edges,
-                            mark_aromatic_atoms)
+                            mark_aromatic_atoms, mark_chiral_atoms,
+                            annotate_ez_isomers)
 
 LOGGER = logging.getLogger(__name__)
 
@@ -37,6 +38,7 @@ class TokenType(enum.Enum):
     BRANCH_END = 4
     RING_NUM = 5
     EZSTEREO = 6
+    CHIRAL = 7
 
 
 def _tokenize(smiles):
@@ -129,6 +131,10 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
     next_bond = None
     branches = []
     ring_nums = {}
+    current_ez = None
+    prev_token = None
+    prev_type = None
+    ez_isomer_pairs = []
     for tokentype, token in _tokenize(smiles):
         if tokentype == TokenType.ATOM:
             mol.add_node(idx, **parse_atom(token))
@@ -180,10 +186,30 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
                 ring_nums[token] = (idx - 1, next_bond)
                 next_bond = None
         elif tokentype == TokenType.EZSTEREO:
-            LOGGER.warning('E/Z stereochemical information, which is specified by "%s", will be discarded', token)
+            # we found the second ez reference and
+            # annotate the molecule
+            if current_ez:
+                ez_isomer_pairs.append((current_ez, (idx, anchor, token)))
+                current_ez = None
+            # current_ez is formatted as:
+            # ligand, anchor, token where ligand is the atom defining CIS/TRANS
+            # we found a token belonging to a branch (e.g. C(\F))
+            elif prev_type == TokenType.BRANCH_START:
+                current_ez = (idx, anchor, token)
+#            elif prev_type == TokenType.BRANCH_STOP:
+#                raise ValueError(f"The E/Z token {token} may not follow a closing bracket.")
+            else:
+                current_ez = (anchor, idx, token)
+
+        prev_token = token
+        prev_type = tokentype
+
     if ring_nums:
         raise KeyError('Unmatched ring indices {}'.format(list(ring_nums.keys())))
 
+    if current_ez:
+        raise ValueError('There is an unmatched stereochemical token.')
+
     # Time to deal with aromaticity. This is a mess, because it's not super
     # clear what aromaticity information has been provided, and what should be
     # inferred. In addition, to what extend do we want to provide a "sane"
@@ -197,7 +223,6 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
         if mol.nodes[n_idx].get('aromatic', False):
             raise ValueError("You specified an aromatic atom outside of a"
                              " ring. This is impossible")
-    
     mark_aromatic_edges(mol)
     fill_valence(mol)
     if reinterpret_aromatic:
@@ -213,4 +238,12 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
         add_explicit_hydrogens(mol)
     else:
         remove_explicit_hydrogens(mol)
+
+    # post-processing of E/Z isomerism
+    annotate_ez_isomers(mol, ez_isomer_pairs)
+
+    # post-processing of chiral atoms
+    # potentially we need all hydrogen in place
+    mark_chiral_atoms(mol)
+
     return mol
diff --git a/pysmiles/smiles_helper.py b/pysmiles/smiles_helper.py
index 0c13c19..c112bc7 100644
--- a/pysmiles/smiles_helper.py
+++ b/pysmiles/smiles_helper.py
@@ -99,9 +99,8 @@ def parse_atom(atom):
 
     if out.get('element') == 'H' and out.get('hcount', 0):
         raise ValueError("A hydrogen atom can't have hydrogens")
-
-    if 'stereo' in out:
-        LOGGER.warning('Atom "%s" contains stereochemical information that will be discarded.', atom)
+#    if 'stereo' in out:
+#        LOGGER.warning('Atom "%s" contains stereochemical information that will be discarded.', atom)
 
     return out
 
@@ -579,3 +578,78 @@ def increment_bond_orders(molecule, max_bond_order=3):
         molecule.edges[idx, jdx]['order'] = new_order
         missing_bonds[idx] -= edge_missing
         missing_bonds[jdx] -= edge_missing
+
+def mark_chiral_atoms(molecule):
+    """
+    For all nodes tagged as chiral, figure out the three
+    substiuents and annotate the node with a tuple that
+    has the order in which to rotate. This essentially
+    corresponds to the definition of an improper dihedral
+    angle centered on the chiral atom.
+    """
+    chiral_nodes = nx.get_node_attributes(molecule, 'stereo')
+    for node, direction in chiral_nodes.items():
+        # discard the node with the lowest index because we
+        # are looking down that node; it is not part of the
+        # stereo projection
+        neighbors = sorted(molecule.neighbors(node))[1:]
+        if len(neighbors) != 3:
+            n = len(neighbors) + 1
+            msg = (f"Chiral node {node} has {n} neighbors, which "
+                    "is different than the four expected for "
+                    "tetrahedral chirality. If the chiral center "
+                    "contains hydrogen you need to set explicit_hydrogen "
+                    "to True.")
+            raise ValueError(msg)
+        # the default is anti-clockwise sorting indicated by '@'
+        # in this case the nodes are sorted with increasing
+        # node index; however @@ means clockwise and the
+        # order of nodes is reversed (i.e. with decreasing index)
+        if direction == '@@':
+            neighbors = [neighbors[0], neighbors[2], neighbors[1]]
+        molecule.nodes[node]['stereo'] = (node, *neighbors)
+
+def annotate_ez_isomers(molecule, ez_pairs):
+    for first, second in ez_pairs:
+        ligand_first, anchor_first, ez_first = first
+        ligand_second, anchor_second, ez_second = second
+        # here come all the ridcilous cases of EZ in smiles
+        if ligand_first < anchor_first:
+            # case 1 F/C=C/F is trans
+            if ez_first == '/' and ez_second == '/':
+                ez_isomer = 'trans'
+            # case 2 F\C=C/F
+            elif ez_first == '\\' and ez_second == '/':
+                ez_isomer = 'cis'
+            # case 3 F/C=C\F
+            elif ez_first == '/' and ez_second == '\\':
+                ez_isomer = 'cis'
+            # case 4 F\C=C\F
+            elif ez_fist == '\\' and ez_second == '\\':
+                ez_isomer = 'trans'
+        # as in C(/F)
+        elif ligand_first > anchor_first:
+            # case 5 C(\F)=C/F is trans
+            if ez_first == '\\' and ez_second == '/':
+                ez_isomer = 'trans'
+            # case 6 C(/F)=C/F
+            elif ez_first == '/' and ez_second == '/':
+                ez_isomer = 'cis'
+            # case 7 C(/F)=C\F
+            elif ez_first == '/' and ez_second == '\\':
+                ez_isomer = 'trans'
+            # case 4 C(\F)=C\F
+            elif ez_fist == '\\' and ez_second == '\\':
+                ez_isomer = 'cis'
+        # annotate ligands
+        print(ligand_first, ligand_second)
+        molecule.nodes[ligand_first]['ez_isomer'] = (ligand_first,
+                                                     anchor_first,
+                                                     anchor_second,
+                                                     ligand_second,
+                                                     ez_isomer)
+        molecule.nodes[ligand_second]['ez_isomer'] = (ligand_second,
+                                                      anchor_second,
+                                                      anchor_first,
+                                                      ligand_first,
+                                                      ez_isomer)
diff --git a/tests/test_read_smiles.py b/tests/test_read_smiles.py
index 73196d3..8a836ee 100644
--- a/tests/test_read_smiles.py
+++ b/tests/test_read_smiles.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 import pytest
-
+import networkx as nx
 from pysmiles import read_smiles
 from pysmiles.testhelper import assertEqualGraphs, make_mol
 
@@ -517,6 +517,64 @@
          (1, 2, {'order': 1.5}),
          (2, 0, {'order': 1.5}),],
         False
+    ),
+    # chiral center S/L alanine
+    (   'C[C@@H](C(=O)O)N',
+        [(0, {'element': 'C', 'charge': 0, 'aromatic': False}),
+         (1, {'charge': 0, 'aromatic': False, 'element': 'C', 'stereo': (1, 2, 9, 5)}),
+         (2, {'element': 'C', 'charge': 0, 'aromatic': False}),
+         (3, {'element': 'O', 'charge': 0, 'aromatic': False}),
+         (4, {'element': 'O', 'charge': 0, 'aromatic': False}),
+         (5, {'element': 'N', 'charge': 0, 'aromatic': False}),
+         (6, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (7, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (8, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (9, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (10, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (11, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (12, {'charge': 0, 'aromatic': False, 'element': 'H'})],
+        [(0, 1, {'order': 1}),
+         (0, 6, {'order': 1}),
+         (0, 7, {'order': 1}),
+         (0, 8, {'order': 1}),
+         (1, 2, {'order': 1}),
+         (1, 5, {'order': 1}),
+         (1, 9, {'order': 1}),
+         (2, 3, {'order': 2}),
+         (2, 4, {'order': 1}),
+         (4, 10, {'order': 1}),
+         (5, 11, {'order': 1}),
+         (5, 12, {'order': 1})],
+        True
+    ),
+    # chiral center R/D alanine
+    (   'C[C@H](C(=O)O)N',
+        [(0, {'element': 'C', 'charge': 0, 'aromatic': False}),
+         (1, {'charge': 0, 'aromatic': False, 'element': 'C', 'stereo': (1, 2, 5, 9)}),
+         (2, {'element': 'C', 'charge': 0, 'aromatic': False}),
+         (3, {'element': 'O', 'charge': 0, 'aromatic': False}),
+         (4, {'element': 'O', 'charge': 0, 'aromatic': False}),
+         (5, {'element': 'N', 'charge': 0, 'aromatic': False}),
+         (6, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (7, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (8, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (9, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (10, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (11, {'charge': 0, 'aromatic': False, 'element': 'H'}),
+         (12, {'charge': 0, 'aromatic': False, 'element': 'H'})],
+        [(0, 1, {'order': 1}),
+         (0, 6, {'order': 1}),
+         (0, 7, {'order': 1}),
+         (0, 8, {'order': 1}),
+         (1, 2, {'order': 1}),
+         (1, 5, {'order': 1}),
+         (1, 9, {'order': 1}),
+         (2, 3, {'order': 2}),
+         (2, 4, {'order': 1}),
+         (4, 10, {'order': 1}),
+         (5, 11, {'order': 1}),
+         (5, 12, {'order': 1})],
+        True,
     )
 ))
 def test_read_smiles(smiles, node_data, edge_data, explicit_h):
@@ -543,16 +601,20 @@ def test_invalid_smiles(smiles, error_type):
         read_smiles(smiles)
 
 
-@pytest.mark.parametrize('smiles, n_records',[
-    (r'F/C=C/F', 2),
-    (r'C(\F)=C/F', 2),
-    (r'F\C=C/F', 2),
-    (r'C(/F)=C/F', 2),
-    ('NC(Br)=[C@]=C(O)C', 1),
-    ('c1ccccc1', 0)
+@pytest.mark.parametrize('smiles, records',[
+    (r'F/C=C/F', [(0, 1, 2, 3, 'trans'), (3, 2, 1, 0, 'trans')]),
+    (r'C(\F)=C/F', [(1, 0, 2, 3, 'trans'), (3, 2, 0, 1, 'trans')]),
+    (r'F\C=C/F', [(0, 1, 2, 3, 'cis'), (3, 2, 1, 0, 'cis')]),
+    (r'C(/F)=C/F', [(1, 0, 2, 3, 'cis'), (3, 2, 0, 1, 'cis')]),
+    (r'F/C(CC)=C/F', [(0, 1, 4, 5, 'trans'), (5, 4, 1, 0, 'trans')]),
+    (r'F/C=C=C=C/F', [(0, 1, 4, 5, 'trans'), (5, 4, 1, 0, 'trans')]),
+    (r'F/C=C=C=C\F', [(0, 1, 4, 5, 'cis'), (5, 4, 1, 0, 'cis')]),
+    ('c1ccccc1', None)
 ])
-def test_stereo_logging(caplog, smiles, n_records):
-    read_smiles(smiles, explicit_hydrogen=False)
-    assert len(caplog.records) == n_records
-    for record in caplog.records:
-        assert record.levelname == "WARNING"
+def test_stereo_eading(smiles, records):
+    mol = read_smiles(smiles, explicit_hydrogen=False)
+    if records:
+        for record in records:
+            assert mol.nodes[record[0]]['ez_isomer'] == record
+    else:
+        assert len(nx.get_node_attributes(mol, 'ez_isomer')) == 0