From fc9e4af95ddce230cc138cdae9edce27e34d709c Mon Sep 17 00:00:00 2001 From: Laura Malo <44496034+laumalo@users.noreply.github.com> Date: Tue, 3 Nov 2020 16:43:13 +0100 Subject: [PATCH 1/7] Add check PDB function --- peleffy/topology/molecule.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/peleffy/topology/molecule.py b/peleffy/topology/molecule.py index c84403d1..0d3923d1 100644 --- a/peleffy/topology/molecule.py +++ b/peleffy/topology/molecule.py @@ -571,6 +571,36 @@ def _initialize(self): from peleffy.forcefield.parameters import BaseParameterWrapper self._parameters = BaseParameterWrapper() + def _PDB_checkup(self, path): + """ + Safety check for PDB files in order to properly handle exceptions + related with its format prior running the parameterization. + + Parameters + ---------- + path : str + The path to a PDB with the molecule structure + """ + # Parse PDB file + atom_id, res_name, res_id = ([] for i in range(3)) + for line in open(path): + if not len(line.strip()) == 0: + list = line.split() + id = list[0] + if id == 'HETATM': + atom_id.append(list[2]) + res_name.append(list[3]) + res_id.append(list[4]) + + # Handle exceptions related with the PDB file format + assert res_id[:-1] == res_id[1:], \ + 'A single ligand with immutable residue ids is expected' + assert res_name[:-1] == res_name[1:], \ + 'A single ligand with immutable residue names is expected' + assert len(atom_id) == len(set(atom_id)), \ + 'Ligand in input PDB has no unique atom names' + + def _initialize_from_pdb(self, path): """ It initializes a molecule with the molecule structure read from @@ -585,6 +615,9 @@ def _initialize_from_pdb(self, path): logger.info(' - Initializing molecule from PDB') self._initialize() + #Validate PDB + self._PDB_checkup(path) + logger.info(' - Loading molecule from RDKit') rdkit_toolkit = RDKitToolkitWrapper() self._rdkit_molecule = rdkit_toolkit.from_pdb(path) From 15c14a603d409c2ac0fd920d61d1713abcd55ce9 Mon Sep 17 00:00:00 2001 From: Laura Malo <44496034+laumalo@users.noreply.github.com> Date: Wed, 4 Nov 2020 11:42:30 +0100 Subject: [PATCH 2/7] Fix some errors --- peleffy/topology/molecule.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/peleffy/topology/molecule.py b/peleffy/topology/molecule.py index 0d3923d1..8358fbea 100644 --- a/peleffy/topology/molecule.py +++ b/peleffy/topology/molecule.py @@ -584,13 +584,10 @@ def _PDB_checkup(self, path): # Parse PDB file atom_id, res_name, res_id = ([] for i in range(3)) for line in open(path): - if not len(line.strip()) == 0: - list = line.split() - id = list[0] - if id == 'HETATM': - atom_id.append(list[2]) - res_name.append(list[3]) - res_id.append(list[4]) + if not len(line.strip()) == 0 and line.startswith('HETATM'): + atom_id.append(line[13:16]) + res_name.append(line[18:20]) + res_id.append(line[23:26]) # Handle exceptions related with the PDB file format assert res_id[:-1] == res_id[1:], \ From 752a3e3843e898f18f1f77fcf4592464df245480 Mon Sep 17 00:00:00 2001 From: Laura Malo <44496034+laumalo@users.noreply.github.com> Date: Wed, 4 Nov 2020 14:46:16 +0100 Subject: [PATCH 3/7] Add unit test and pdbs --- peleffy/data/tests/ethylene_error1.pdb | 10 ++++++++++ peleffy/data/tests/ethylene_error2.pdb | 10 ++++++++++ peleffy/data/tests/ethylene_error3.pdb | 10 ++++++++++ peleffy/tests/test_molecule.py | 21 +++++++++++++++++++++ peleffy/topology/molecule.py | 25 +++++++++++++++++-------- 5 files changed, 68 insertions(+), 8 deletions(-) create mode 100644 peleffy/data/tests/ethylene_error1.pdb create mode 100644 peleffy/data/tests/ethylene_error2.pdb create mode 100644 peleffy/data/tests/ethylene_error3.pdb diff --git a/peleffy/data/tests/ethylene_error1.pdb b/peleffy/data/tests/ethylene_error1.pdb new file mode 100644 index 00000000..c5956c58 --- /dev/null +++ b/peleffy/data/tests/ethylene_error1.pdb @@ -0,0 +1,10 @@ +HETATM 1 C1 UNK 1 0.634 -0.083 0.007 1.00 0.00 C +HETATM 2 C1 UNK 1 -0.671 0.098 -0.007 1.00 0.00 C +HETATM 3 H1 UNK 1 1.354 0.739 0.049 1.00 0.00 H +HETATM 4 H2 UNK 1 1.082 -1.071 -0.022 1.00 0.00 H +HETATM 5 H3 UNK 1 -1.096 1.088 0.022 1.00 0.00 H +HETATM 6 H4 UNK 1 -1.302 -0.770 -0.049 1.00 0.00 H +CONECT 1 2 2 3 4 +CONECT 2 5 6 +END + diff --git a/peleffy/data/tests/ethylene_error2.pdb b/peleffy/data/tests/ethylene_error2.pdb new file mode 100644 index 00000000..a6ab5294 --- /dev/null +++ b/peleffy/data/tests/ethylene_error2.pdb @@ -0,0 +1,10 @@ +HETATM 1 C1 UNK 1 0.634 -0.083 0.007 1.00 0.00 C +HETATM 2 C2 UNK 1 -0.671 0.098 -0.007 1.00 0.00 C +HETATM 3 H1 UNK 1 1.354 0.739 0.049 1.00 0.00 H +HETATM 4 H2 UNK 1 1.082 -1.071 -0.022 1.00 0.00 H +HETATM 5 H3 UNK 2 -1.096 1.088 0.022 1.00 0.00 H +HETATM 6 H4 UNK 1 -1.302 -0.770 -0.049 1.00 0.00 H +CONECT 1 2 2 3 4 +CONECT 2 5 6 +END + diff --git a/peleffy/data/tests/ethylene_error3.pdb b/peleffy/data/tests/ethylene_error3.pdb new file mode 100644 index 00000000..cc63a601 --- /dev/null +++ b/peleffy/data/tests/ethylene_error3.pdb @@ -0,0 +1,10 @@ +HETATM 1 C1 UNK 1 0.634 -0.083 0.007 1.00 0.00 C +HETATM 2 C2 UNK 1 -0.671 0.098 -0.007 1.00 0.00 C +HETATM 3 H1 UNN 1 1.354 0.739 0.049 1.00 0.00 H +HETATM 4 H2 UNK 1 1.082 -1.071 -0.022 1.00 0.00 H +HETATM 5 H3 UNK 1 -1.096 1.088 0.022 1.00 0.00 H +HETATM 6 H4 UNK 1 -1.302 -0.770 -0.049 1.00 0.00 H +CONECT 1 2 2 3 4 +CONECT 2 5 6 +END + diff --git a/peleffy/tests/test_molecule.py b/peleffy/tests/test_molecule.py index ac4b53bd..eb2f1097 100644 --- a/peleffy/tests/test_molecule.py +++ b/peleffy/tests/test_molecule.py @@ -238,3 +238,24 @@ def check_residue_name(name): assert molecule.tag == 'BNZ', 'Unexpected molecule tag' molecule.to_pdb_file('molecule.pdb') check_residue_name('BNZ') + + def test_PDB_checkup(self): + """ + It tests the safety check function for PDB files. + """ + + LIGAND_ERROR1 = get_data_file_path('tests/ethylene_error1.pdb') + LIGAND_ERROR2 = get_data_file_path('tests/ethylene_error2.pdb') + LIGAND_ERROR3 = get_data_file_path('tests/ethylene_error3.pdb') + + # All atom names need to be unique + with pytest.raises(Exception): + _ = Molecule(LIGAND_ERROR1) + + # All residue ids must match + with pytest.raises(Exception): + _ = Molecule(LIGAND_ERROR2) + + # All residue names must match + with pytest.raises(Exception): + _ = Molecule(LIGAND_ERROR3) diff --git a/peleffy/topology/molecule.py b/peleffy/topology/molecule.py index 8358fbea..d3ef9f38 100644 --- a/peleffy/topology/molecule.py +++ b/peleffy/topology/molecule.py @@ -581,22 +581,31 @@ def _PDB_checkup(self, path): path : str The path to a PDB with the molecule structure """ + # Parse PDB file atom_id, res_name, res_id = ([] for i in range(3)) + connectivity = False for line in open(path): - if not len(line.strip()) == 0 and line.startswith('HETATM'): + if line.startswith('HETATM'): atom_id.append(line[13:16]) res_name.append(line[18:20]) res_id.append(line[23:26]) + if line.startswith('CONECT'): + connectivity = True # Handle exceptions related with the PDB file format - assert res_id[:-1] == res_id[1:], \ - 'A single ligand with immutable residue ids is expected' - assert res_name[:-1] == res_name[1:], \ - 'A single ligand with immutable residue names is expected' - assert len(atom_id) == len(set(atom_id)), \ - 'Ligand in input PDB has no unique atom names' - + if not res_id[:-1] == res_id[1:]: raise Exception( \ + 'A single ligand with immutable residue ids is expected') + if not res_name[:-1] == res_name[1:]: raise Exception( \ + 'A single ligand with immutable residue names is expected') + if not len(atom_id) == len(set(atom_id)): raise Exception( \ + 'Ligand in input PDB has no unique atom names') + if not connectivity: + import logging + logging.warning( \ + "Input PDB has no information about the connectivity and" + + " this could result in an unexpected bond assignment" + ) def _initialize_from_pdb(self, path): """ From 092e045dfa16e42f8bf5feb260f38078dd90d08c Mon Sep 17 00:00:00 2001 From: Laura Malo <44496034+laumalo@users.noreply.github.com> Date: Wed, 4 Nov 2020 21:17:15 +0100 Subject: [PATCH 4/7] Minor changes --- peleffy/topology/molecule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/peleffy/topology/molecule.py b/peleffy/topology/molecule.py index d3ef9f38..397b6770 100644 --- a/peleffy/topology/molecule.py +++ b/peleffy/topology/molecule.py @@ -601,8 +601,8 @@ def _PDB_checkup(self, path): if not len(atom_id) == len(set(atom_id)): raise Exception( \ 'Ligand in input PDB has no unique atom names') if not connectivity: - import logging - logging.warning( \ + log = Logger() + log.warning( \ "Input PDB has no information about the connectivity and" + " this could result in an unexpected bond assignment" ) From 2d15986c2f9f2dddbf942d8e5e0c7a9e2d1dde54 Mon Sep 17 00:00:00 2001 From: Marti Municoy Date: Fri, 6 Nov 2020 11:45:38 +0100 Subject: [PATCH 5/7] Minor changes and new test for PDB checkup --- peleffy/tests/test_molecule.py | 36 ++++++++++++++++++++++++++++++---- peleffy/topology/molecule.py | 32 ++++++++++++++++-------------- 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/peleffy/tests/test_molecule.py b/peleffy/tests/test_molecule.py index eb2f1097..2eaf1656 100644 --- a/peleffy/tests/test_molecule.py +++ b/peleffy/tests/test_molecule.py @@ -239,14 +239,17 @@ def check_residue_name(name): molecule.to_pdb_file('molecule.pdb') check_residue_name('BNZ') - def test_PDB_checkup(self): - """ - It tests the safety check function for PDB files. - """ + def test_PDB_checkup(self): + """It tests the safety check function for PDB files.""" + LIGAND_GOOD = get_data_file_path('ligands/ethylene.pdb') LIGAND_ERROR1 = get_data_file_path('tests/ethylene_error1.pdb') LIGAND_ERROR2 = get_data_file_path('tests/ethylene_error2.pdb') LIGAND_ERROR3 = get_data_file_path('tests/ethylene_error3.pdb') + LIGAND_ERROR4 = get_data_file_path('tests/ethylene_error4.pdb') + + # This should work without any complain + _ = Molecule(LIGAND_GOOD) # All atom names need to be unique with pytest.raises(Exception): @@ -259,3 +262,28 @@ def test_PDB_checkup(self): # All residue names must match with pytest.raises(Exception): _ = Molecule(LIGAND_ERROR3) + + # Check warning message in the logger when connectivity is missing + import io + from peleffy.utils import Logger + import logging + from importlib import reload + logging.shutdown() + reload(logging) + + log = Logger() + log.set_level('WARNING') + + # Catch logger messages to string buffer + with io.StringIO() as buf: + log_handler = logging.StreamHandler(buf) + log._logger.handlers = list() + log._logger.addHandler(log_handler) + + _ = Molecule(LIGAND_ERROR4) + + output = buf.getvalue() + + assert output == "Input PDB has no information about the " \ + + "connectivity and this could result in an unexpected " \ + + "bond assignment\n" diff --git a/peleffy/topology/molecule.py b/peleffy/topology/molecule.py index 397b6770..afdb9612 100644 --- a/peleffy/topology/molecule.py +++ b/peleffy/topology/molecule.py @@ -573,39 +573,41 @@ def _initialize(self): def _PDB_checkup(self, path): """ - Safety check for PDB files in order to properly handle exceptions + Safety check for PDB files in order to properly handle exceptions related with its format prior running the parameterization. Parameters ---------- path : str The path to a PDB with the molecule structure - """ + """ # Parse PDB file - atom_id, res_name, res_id = ([] for i in range(3)) + atom_id, res_name, res_id = ([] for i in range(3)) connectivity = False for line in open(path): if line.startswith('HETATM'): atom_id.append(line[13:16]) res_name.append(line[18:20]) res_id.append(line[23:26]) - if line.startswith('CONECT'): + if line.startswith('CONECT'): connectivity = True # Handle exceptions related with the PDB file format - if not res_id[:-1] == res_id[1:]: raise Exception( \ - 'A single ligand with immutable residue ids is expected') - if not res_name[:-1] == res_name[1:]: raise Exception( \ - 'A single ligand with immutable residue names is expected') - if not len(atom_id) == len(set(atom_id)): raise Exception( \ - 'Ligand in input PDB has no unique atom names') - if not connectivity: + if not res_id[:-1] == res_id[1:]: + raise Exception( + 'A single ligand with immutable residue ids is expected') + if not res_name[:-1] == res_name[1:]: + raise Exception( + 'A single ligand with immutable residue names is expected') + if not len(atom_id) == len(set(atom_id)): + raise Exception( + 'Ligand in input PDB has no unique atom names') + if not connectivity: log = Logger() - log.warning( \ + log.warning( "Input PDB has no information about the connectivity and" - + " this could result in an unexpected bond assignment" - ) + + " this could result in an unexpected bond assignment") def _initialize_from_pdb(self, path): """ @@ -621,7 +623,7 @@ def _initialize_from_pdb(self, path): logger.info(' - Initializing molecule from PDB') self._initialize() - #Validate PDB + # Validate PDB self._PDB_checkup(path) logger.info(' - Loading molecule from RDKit') From b13db43c1c86da83bdf7fc4e163d83ce036adae9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=AD=20Municoy?= Date: Fri, 6 Nov 2020 11:49:39 +0100 Subject: [PATCH 6/7] Update releasehistory.rst --- docs/releasehistory.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/releasehistory.rst b/docs/releasehistory.rst index b40ac3d8..11e9ad8e 100644 --- a/docs/releasehistory.rst +++ b/docs/releasehistory.rst @@ -15,10 +15,12 @@ This minor release integrates the parameterization of OBC charges and scale fact New features """""""""""" - `PR #88 `_: New method to retrieve atom degrees with RDKit. +- `PR #86 `_: New method to check the input PDB prior building the molecule. Tests added """"""""""" - `PR #88 `_: Adds tests to validate the atom degrees getter. +- `PR #86 `_: Adds tests to validate the PDB check up. 1.0.0 - Full compatibility for OPLS2005 and OpenFF dihedrals and rotamer library improvements From 0a7d5bbe06ee70b1af76bb9e1ddecade5c71043c Mon Sep 17 00:00:00 2001 From: Marti Municoy Date: Fri, 6 Nov 2020 11:51:04 +0100 Subject: [PATCH 7/7] Add missing file for tests --- peleffy/data/tests/ethylene_error4.pdb | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 peleffy/data/tests/ethylene_error4.pdb diff --git a/peleffy/data/tests/ethylene_error4.pdb b/peleffy/data/tests/ethylene_error4.pdb new file mode 100644 index 00000000..34dc377f --- /dev/null +++ b/peleffy/data/tests/ethylene_error4.pdb @@ -0,0 +1,8 @@ +HETATM 1 C1 UNK 1 0.634 -0.083 0.007 1.00 0.00 C +HETATM 2 C2 UNK 1 -0.671 0.098 -0.007 1.00 0.00 C +HETATM 3 H1 UNK 1 1.354 0.739 0.049 1.00 0.00 H +HETATM 4 H2 UNK 1 1.082 -1.071 -0.022 1.00 0.00 H +HETATM 5 H3 UNK 1 -1.096 1.088 0.022 1.00 0.00 H +HETATM 6 H4 UNK 1 -1.302 -0.770 -0.049 1.00 0.00 H +END +