From bdbee7b10b924d3ff1651d4127a35800c6796f09 Mon Sep 17 00:00:00 2001 From: Fabian Gruenewald Date: Fri, 3 May 2024 15:52:05 +0200 Subject: [PATCH] EZ isomerism and chirality --- pysmiles/read_smiles.py | 39 +++++++++++++++-- pysmiles/smiles_helper.py | 80 +++++++++++++++++++++++++++++++++-- tests/test_read_smiles.py | 88 +++++++++++++++++++++++++++++++++------ 3 files changed, 188 insertions(+), 19 deletions(-) diff --git a/pysmiles/read_smiles.py b/pysmiles/read_smiles.py index 79e9634..673b189 100644 --- a/pysmiles/read_smiles.py +++ b/pysmiles/read_smiles.py @@ -24,7 +24,8 @@ from .smiles_helper import (add_explicit_hydrogens, remove_explicit_hydrogens, parse_atom, fill_valence, mark_aromatic_edges, - mark_aromatic_atoms) + mark_aromatic_atoms, mark_chiral_atoms, + annotate_ez_isomers) LOGGER = logging.getLogger(__name__) @@ -37,6 +38,7 @@ class TokenType(enum.Enum): BRANCH_END = 4 RING_NUM = 5 EZSTEREO = 6 + CHIRAL = 7 def _tokenize(smiles): @@ -129,6 +131,10 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True, next_bond = None branches = [] ring_nums = {} + current_ez = None + prev_token = None + prev_type = None + ez_isomer_pairs = [] for tokentype, token in _tokenize(smiles): if tokentype == TokenType.ATOM: mol.add_node(idx, **parse_atom(token)) @@ -180,10 +186,30 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True, ring_nums[token] = (idx - 1, next_bond) next_bond = None elif tokentype == TokenType.EZSTEREO: - LOGGER.warning('E/Z stereochemical information, which is specified by "%s", will be discarded', token) + # we found the second ez reference and + # annotate the molecule + if current_ez: + ez_isomer_pairs.append((current_ez, (idx, anchor, token))) + current_ez = None + # current_ez is formatted as: + # ligand, anchor, token where ligand is the atom defining CIS/TRANS + # we found a token belonging to a branch (e.g. C(\F)) + elif prev_type == TokenType.BRANCH_START: + current_ez = (idx, anchor, token) +# elif prev_type == TokenType.BRANCH_STOP: +# raise ValueError(f"The E/Z token {token} may not follow a closing bracket.") + else: + current_ez = (anchor, idx, token) + + prev_token = token + prev_type = tokentype + if ring_nums: raise KeyError('Unmatched ring indices {}'.format(list(ring_nums.keys()))) + if current_ez: + raise ValueError('There is an unmatched stereochemical token.') + # Time to deal with aromaticity. This is a mess, because it's not super # clear what aromaticity information has been provided, and what should be # inferred. In addition, to what extend do we want to provide a "sane" @@ -197,7 +223,6 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True, if mol.nodes[n_idx].get('aromatic', False): raise ValueError("You specified an aromatic atom outside of a" " ring. This is impossible") - mark_aromatic_edges(mol) fill_valence(mol) if reinterpret_aromatic: @@ -213,4 +238,12 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True, add_explicit_hydrogens(mol) else: remove_explicit_hydrogens(mol) + + # post-processing of E/Z isomerism + annotate_ez_isomers(mol, ez_isomer_pairs) + + # post-processing of chiral atoms + # potentially we need all hydrogen in place + mark_chiral_atoms(mol) + return mol diff --git a/pysmiles/smiles_helper.py b/pysmiles/smiles_helper.py index 0c13c19..c112bc7 100644 --- a/pysmiles/smiles_helper.py +++ b/pysmiles/smiles_helper.py @@ -99,9 +99,8 @@ def parse_atom(atom): if out.get('element') == 'H' and out.get('hcount', 0): raise ValueError("A hydrogen atom can't have hydrogens") - - if 'stereo' in out: - LOGGER.warning('Atom "%s" contains stereochemical information that will be discarded.', atom) +# if 'stereo' in out: +# LOGGER.warning('Atom "%s" contains stereochemical information that will be discarded.', atom) return out @@ -579,3 +578,78 @@ def increment_bond_orders(molecule, max_bond_order=3): molecule.edges[idx, jdx]['order'] = new_order missing_bonds[idx] -= edge_missing missing_bonds[jdx] -= edge_missing + +def mark_chiral_atoms(molecule): + """ + For all nodes tagged as chiral, figure out the three + substiuents and annotate the node with a tuple that + has the order in which to rotate. This essentially + corresponds to the definition of an improper dihedral + angle centered on the chiral atom. + """ + chiral_nodes = nx.get_node_attributes(molecule, 'stereo') + for node, direction in chiral_nodes.items(): + # discard the node with the lowest index because we + # are looking down that node; it is not part of the + # stereo projection + neighbors = sorted(molecule.neighbors(node))[1:] + if len(neighbors) != 3: + n = len(neighbors) + 1 + msg = (f"Chiral node {node} has {n} neighbors, which " + "is different than the four expected for " + "tetrahedral chirality. If the chiral center " + "contains hydrogen you need to set explicit_hydrogen " + "to True.") + raise ValueError(msg) + # the default is anti-clockwise sorting indicated by '@' + # in this case the nodes are sorted with increasing + # node index; however @@ means clockwise and the + # order of nodes is reversed (i.e. with decreasing index) + if direction == '@@': + neighbors = [neighbors[0], neighbors[2], neighbors[1]] + molecule.nodes[node]['stereo'] = (node, *neighbors) + +def annotate_ez_isomers(molecule, ez_pairs): + for first, second in ez_pairs: + ligand_first, anchor_first, ez_first = first + ligand_second, anchor_second, ez_second = second + # here come all the ridcilous cases of EZ in smiles + if ligand_first < anchor_first: + # case 1 F/C=C/F is trans + if ez_first == '/' and ez_second == '/': + ez_isomer = 'trans' + # case 2 F\C=C/F + elif ez_first == '\\' and ez_second == '/': + ez_isomer = 'cis' + # case 3 F/C=C\F + elif ez_first == '/' and ez_second == '\\': + ez_isomer = 'cis' + # case 4 F\C=C\F + elif ez_fist == '\\' and ez_second == '\\': + ez_isomer = 'trans' + # as in C(/F) + elif ligand_first > anchor_first: + # case 5 C(\F)=C/F is trans + if ez_first == '\\' and ez_second == '/': + ez_isomer = 'trans' + # case 6 C(/F)=C/F + elif ez_first == '/' and ez_second == '/': + ez_isomer = 'cis' + # case 7 C(/F)=C\F + elif ez_first == '/' and ez_second == '\\': + ez_isomer = 'trans' + # case 4 C(\F)=C\F + elif ez_fist == '\\' and ez_second == '\\': + ez_isomer = 'cis' + # annotate ligands + print(ligand_first, ligand_second) + molecule.nodes[ligand_first]['ez_isomer'] = (ligand_first, + anchor_first, + anchor_second, + ligand_second, + ez_isomer) + molecule.nodes[ligand_second]['ez_isomer'] = (ligand_second, + anchor_second, + anchor_first, + ligand_first, + ez_isomer) diff --git a/tests/test_read_smiles.py b/tests/test_read_smiles.py index 73196d3..8a836ee 100644 --- a/tests/test_read_smiles.py +++ b/tests/test_read_smiles.py @@ -14,7 +14,7 @@ # limitations under the License. import pytest - +import networkx as nx from pysmiles import read_smiles from pysmiles.testhelper import assertEqualGraphs, make_mol @@ -517,6 +517,64 @@ (1, 2, {'order': 1.5}), (2, 0, {'order': 1.5}),], False + ), + # chiral center S/L alanine + ( 'C[C@@H](C(=O)O)N', + [(0, {'element': 'C', 'charge': 0, 'aromatic': False}), + (1, {'charge': 0, 'aromatic': False, 'element': 'C', 'stereo': (1, 2, 9, 5)}), + (2, {'element': 'C', 'charge': 0, 'aromatic': False}), + (3, {'element': 'O', 'charge': 0, 'aromatic': False}), + (4, {'element': 'O', 'charge': 0, 'aromatic': False}), + (5, {'element': 'N', 'charge': 0, 'aromatic': False}), + (6, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (7, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (8, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (9, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (10, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (11, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (12, {'charge': 0, 'aromatic': False, 'element': 'H'})], + [(0, 1, {'order': 1}), + (0, 6, {'order': 1}), + (0, 7, {'order': 1}), + (0, 8, {'order': 1}), + (1, 2, {'order': 1}), + (1, 5, {'order': 1}), + (1, 9, {'order': 1}), + (2, 3, {'order': 2}), + (2, 4, {'order': 1}), + (4, 10, {'order': 1}), + (5, 11, {'order': 1}), + (5, 12, {'order': 1})], + True + ), + # chiral center R/D alanine + ( 'C[C@H](C(=O)O)N', + [(0, {'element': 'C', 'charge': 0, 'aromatic': False}), + (1, {'charge': 0, 'aromatic': False, 'element': 'C', 'stereo': (1, 2, 5, 9)}), + (2, {'element': 'C', 'charge': 0, 'aromatic': False}), + (3, {'element': 'O', 'charge': 0, 'aromatic': False}), + (4, {'element': 'O', 'charge': 0, 'aromatic': False}), + (5, {'element': 'N', 'charge': 0, 'aromatic': False}), + (6, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (7, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (8, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (9, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (10, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (11, {'charge': 0, 'aromatic': False, 'element': 'H'}), + (12, {'charge': 0, 'aromatic': False, 'element': 'H'})], + [(0, 1, {'order': 1}), + (0, 6, {'order': 1}), + (0, 7, {'order': 1}), + (0, 8, {'order': 1}), + (1, 2, {'order': 1}), + (1, 5, {'order': 1}), + (1, 9, {'order': 1}), + (2, 3, {'order': 2}), + (2, 4, {'order': 1}), + (4, 10, {'order': 1}), + (5, 11, {'order': 1}), + (5, 12, {'order': 1})], + True, ) )) def test_read_smiles(smiles, node_data, edge_data, explicit_h): @@ -543,16 +601,20 @@ def test_invalid_smiles(smiles, error_type): read_smiles(smiles) -@pytest.mark.parametrize('smiles, n_records',[ - (r'F/C=C/F', 2), - (r'C(\F)=C/F', 2), - (r'F\C=C/F', 2), - (r'C(/F)=C/F', 2), - ('NC(Br)=[C@]=C(O)C', 1), - ('c1ccccc1', 0) +@pytest.mark.parametrize('smiles, records',[ + (r'F/C=C/F', [(0, 1, 2, 3, 'trans'), (3, 2, 1, 0, 'trans')]), + (r'C(\F)=C/F', [(1, 0, 2, 3, 'trans'), (3, 2, 0, 1, 'trans')]), + (r'F\C=C/F', [(0, 1, 2, 3, 'cis'), (3, 2, 1, 0, 'cis')]), + (r'C(/F)=C/F', [(1, 0, 2, 3, 'cis'), (3, 2, 0, 1, 'cis')]), + (r'F/C(CC)=C/F', [(0, 1, 4, 5, 'trans'), (5, 4, 1, 0, 'trans')]), + (r'F/C=C=C=C/F', [(0, 1, 4, 5, 'trans'), (5, 4, 1, 0, 'trans')]), + (r'F/C=C=C=C\F', [(0, 1, 4, 5, 'cis'), (5, 4, 1, 0, 'cis')]), + ('c1ccccc1', None) ]) -def test_stereo_logging(caplog, smiles, n_records): - read_smiles(smiles, explicit_hydrogen=False) - assert len(caplog.records) == n_records - for record in caplog.records: - assert record.levelname == "WARNING" +def test_stereo_eading(smiles, records): + mol = read_smiles(smiles, explicit_hydrogen=False) + if records: + for record in records: + assert mol.nodes[record[0]]['ez_isomer'] == record + else: + assert len(nx.get_node_attributes(mol, 'ez_isomer')) == 0