Skip to content

Commit

Permalink
EZ isomerism and chirality
Browse files Browse the repository at this point in the history
  • Loading branch information
fgrunewald committed May 3, 2024
1 parent 2308e24 commit bdbee7b
Show file tree
Hide file tree
Showing 3 changed files with 188 additions and 19 deletions.
39 changes: 36 additions & 3 deletions pysmiles/read_smiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@

from .smiles_helper import (add_explicit_hydrogens, remove_explicit_hydrogens,
parse_atom, fill_valence, mark_aromatic_edges,
mark_aromatic_atoms)
mark_aromatic_atoms, mark_chiral_atoms,
annotate_ez_isomers)

LOGGER = logging.getLogger(__name__)

Expand All @@ -37,6 +38,7 @@ class TokenType(enum.Enum):
BRANCH_END = 4
RING_NUM = 5
EZSTEREO = 6
CHIRAL = 7


def _tokenize(smiles):
Expand Down Expand Up @@ -129,6 +131,10 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
next_bond = None
branches = []
ring_nums = {}
current_ez = None
prev_token = None
prev_type = None
ez_isomer_pairs = []
for tokentype, token in _tokenize(smiles):
if tokentype == TokenType.ATOM:
mol.add_node(idx, **parse_atom(token))
Expand Down Expand Up @@ -180,10 +186,30 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
ring_nums[token] = (idx - 1, next_bond)
next_bond = None
elif tokentype == TokenType.EZSTEREO:
LOGGER.warning('E/Z stereochemical information, which is specified by "%s", will be discarded', token)
# we found the second ez reference and
# annotate the molecule
if current_ez:
ez_isomer_pairs.append((current_ez, (idx, anchor, token)))
current_ez = None
# current_ez is formatted as:
# ligand, anchor, token where ligand is the atom defining CIS/TRANS
# we found a token belonging to a branch (e.g. C(\F))
elif prev_type == TokenType.BRANCH_START:
current_ez = (idx, anchor, token)
# elif prev_type == TokenType.BRANCH_STOP:
# raise ValueError(f"The E/Z token {token} may not follow a closing bracket.")
else:
current_ez = (anchor, idx, token)

prev_token = token
prev_type = tokentype

if ring_nums:
raise KeyError('Unmatched ring indices {}'.format(list(ring_nums.keys())))

if current_ez:
raise ValueError('There is an unmatched stereochemical token.')

# Time to deal with aromaticity. This is a mess, because it's not super
# clear what aromaticity information has been provided, and what should be
# inferred. In addition, to what extend do we want to provide a "sane"
Expand All @@ -197,7 +223,6 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
if mol.nodes[n_idx].get('aromatic', False):
raise ValueError("You specified an aromatic atom outside of a"
" ring. This is impossible")

mark_aromatic_edges(mol)
fill_valence(mol)
if reinterpret_aromatic:
Expand All @@ -213,4 +238,12 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
add_explicit_hydrogens(mol)
else:
remove_explicit_hydrogens(mol)

# post-processing of E/Z isomerism
annotate_ez_isomers(mol, ez_isomer_pairs)

# post-processing of chiral atoms
# potentially we need all hydrogen in place
mark_chiral_atoms(mol)

return mol
80 changes: 77 additions & 3 deletions pysmiles/smiles_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,8 @@ def parse_atom(atom):

if out.get('element') == 'H' and out.get('hcount', 0):
raise ValueError("A hydrogen atom can't have hydrogens")

if 'stereo' in out:
LOGGER.warning('Atom "%s" contains stereochemical information that will be discarded.', atom)
# if 'stereo' in out:
# LOGGER.warning('Atom "%s" contains stereochemical information that will be discarded.', atom)

return out

Expand Down Expand Up @@ -579,3 +578,78 @@ def increment_bond_orders(molecule, max_bond_order=3):
molecule.edges[idx, jdx]['order'] = new_order
missing_bonds[idx] -= edge_missing
missing_bonds[jdx] -= edge_missing

def mark_chiral_atoms(molecule):
"""
For all nodes tagged as chiral, figure out the three
substiuents and annotate the node with a tuple that
has the order in which to rotate. This essentially
corresponds to the definition of an improper dihedral
angle centered on the chiral atom.
"""
chiral_nodes = nx.get_node_attributes(molecule, 'stereo')
for node, direction in chiral_nodes.items():
# discard the node with the lowest index because we
# are looking down that node; it is not part of the
# stereo projection
neighbors = sorted(molecule.neighbors(node))[1:]
if len(neighbors) != 3:
n = len(neighbors) + 1
msg = (f"Chiral node {node} has {n} neighbors, which "
"is different than the four expected for "
"tetrahedral chirality. If the chiral center "
"contains hydrogen you need to set explicit_hydrogen "
"to True.")
raise ValueError(msg)
# the default is anti-clockwise sorting indicated by '@'
# in this case the nodes are sorted with increasing
# node index; however @@ means clockwise and the
# order of nodes is reversed (i.e. with decreasing index)
if direction == '@@':
neighbors = [neighbors[0], neighbors[2], neighbors[1]]
molecule.nodes[node]['stereo'] = (node, *neighbors)

def annotate_ez_isomers(molecule, ez_pairs):
for first, second in ez_pairs:
ligand_first, anchor_first, ez_first = first
ligand_second, anchor_second, ez_second = second
# here come all the ridcilous cases of EZ in smiles
if ligand_first < anchor_first:
# case 1 F/C=C/F is trans
if ez_first == '/' and ez_second == '/':
ez_isomer = 'trans'
# case 2 F\C=C/F
elif ez_first == '\\' and ez_second == '/':
ez_isomer = 'cis'
# case 3 F/C=C\F
elif ez_first == '/' and ez_second == '\\':
ez_isomer = 'cis'
# case 4 F\C=C\F
elif ez_fist == '\\' and ez_second == '\\':
ez_isomer = 'trans'
# as in C(/F)
elif ligand_first > anchor_first:
# case 5 C(\F)=C/F is trans
if ez_first == '\\' and ez_second == '/':
ez_isomer = 'trans'
# case 6 C(/F)=C/F
elif ez_first == '/' and ez_second == '/':
ez_isomer = 'cis'
# case 7 C(/F)=C\F
elif ez_first == '/' and ez_second == '\\':
ez_isomer = 'trans'
# case 4 C(\F)=C\F
elif ez_fist == '\\' and ez_second == '\\':
ez_isomer = 'cis'
# annotate ligands
print(ligand_first, ligand_second)
molecule.nodes[ligand_first]['ez_isomer'] = (ligand_first,
anchor_first,
anchor_second,
ligand_second,
ez_isomer)
molecule.nodes[ligand_second]['ez_isomer'] = (ligand_second,
anchor_second,
anchor_first,
ligand_first,
ez_isomer)
88 changes: 75 additions & 13 deletions tests/test_read_smiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# limitations under the License.

import pytest

import networkx as nx
from pysmiles import read_smiles
from pysmiles.testhelper import assertEqualGraphs, make_mol

Expand Down Expand Up @@ -517,6 +517,64 @@
(1, 2, {'order': 1.5}),
(2, 0, {'order': 1.5}),],
False
),
# chiral center S/L alanine
( 'C[C@@H](C(=O)O)N',
[(0, {'element': 'C', 'charge': 0, 'aromatic': False}),
(1, {'charge': 0, 'aromatic': False, 'element': 'C', 'stereo': (1, 2, 9, 5)}),
(2, {'element': 'C', 'charge': 0, 'aromatic': False}),
(3, {'element': 'O', 'charge': 0, 'aromatic': False}),
(4, {'element': 'O', 'charge': 0, 'aromatic': False}),
(5, {'element': 'N', 'charge': 0, 'aromatic': False}),
(6, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(7, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(8, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(9, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(10, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(11, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(12, {'charge': 0, 'aromatic': False, 'element': 'H'})],
[(0, 1, {'order': 1}),
(0, 6, {'order': 1}),
(0, 7, {'order': 1}),
(0, 8, {'order': 1}),
(1, 2, {'order': 1}),
(1, 5, {'order': 1}),
(1, 9, {'order': 1}),
(2, 3, {'order': 2}),
(2, 4, {'order': 1}),
(4, 10, {'order': 1}),
(5, 11, {'order': 1}),
(5, 12, {'order': 1})],
True
),
# chiral center R/D alanine
( 'C[C@H](C(=O)O)N',
[(0, {'element': 'C', 'charge': 0, 'aromatic': False}),
(1, {'charge': 0, 'aromatic': False, 'element': 'C', 'stereo': (1, 2, 5, 9)}),
(2, {'element': 'C', 'charge': 0, 'aromatic': False}),
(3, {'element': 'O', 'charge': 0, 'aromatic': False}),
(4, {'element': 'O', 'charge': 0, 'aromatic': False}),
(5, {'element': 'N', 'charge': 0, 'aromatic': False}),
(6, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(7, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(8, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(9, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(10, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(11, {'charge': 0, 'aromatic': False, 'element': 'H'}),
(12, {'charge': 0, 'aromatic': False, 'element': 'H'})],
[(0, 1, {'order': 1}),
(0, 6, {'order': 1}),
(0, 7, {'order': 1}),
(0, 8, {'order': 1}),
(1, 2, {'order': 1}),
(1, 5, {'order': 1}),
(1, 9, {'order': 1}),
(2, 3, {'order': 2}),
(2, 4, {'order': 1}),
(4, 10, {'order': 1}),
(5, 11, {'order': 1}),
(5, 12, {'order': 1})],
True,
)
))
def test_read_smiles(smiles, node_data, edge_data, explicit_h):
Expand All @@ -543,16 +601,20 @@ def test_invalid_smiles(smiles, error_type):
read_smiles(smiles)


@pytest.mark.parametrize('smiles, n_records',[
(r'F/C=C/F', 2),
(r'C(\F)=C/F', 2),
(r'F\C=C/F', 2),
(r'C(/F)=C/F', 2),
('NC(Br)=[C@]=C(O)C', 1),
('c1ccccc1', 0)
@pytest.mark.parametrize('smiles, records',[
(r'F/C=C/F', [(0, 1, 2, 3, 'trans'), (3, 2, 1, 0, 'trans')]),
(r'C(\F)=C/F', [(1, 0, 2, 3, 'trans'), (3, 2, 0, 1, 'trans')]),
(r'F\C=C/F', [(0, 1, 2, 3, 'cis'), (3, 2, 1, 0, 'cis')]),
(r'C(/F)=C/F', [(1, 0, 2, 3, 'cis'), (3, 2, 0, 1, 'cis')]),
(r'F/C(CC)=C/F', [(0, 1, 4, 5, 'trans'), (5, 4, 1, 0, 'trans')]),
(r'F/C=C=C=C/F', [(0, 1, 4, 5, 'trans'), (5, 4, 1, 0, 'trans')]),
(r'F/C=C=C=C\F', [(0, 1, 4, 5, 'cis'), (5, 4, 1, 0, 'cis')]),
('c1ccccc1', None)
])
def test_stereo_logging(caplog, smiles, n_records):
read_smiles(smiles, explicit_hydrogen=False)
assert len(caplog.records) == n_records
for record in caplog.records:
assert record.levelname == "WARNING"
def test_stereo_eading(smiles, records):
mol = read_smiles(smiles, explicit_hydrogen=False)
if records:
for record in records:
assert mol.nodes[record[0]]['ez_isomer'] == record
else:
assert len(nx.get_node_attributes(mol, 'ez_isomer')) == 0

0 comments on commit bdbee7b

Please sign in to comment.