/
energies.py
314 lines (273 loc) · 10 KB
/
energies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#!/usr/bin/env python3
from Bio import PDB
import pandas as pd
import argparse
import os
import time
from math import floor
from collections import defaultdict
from itertools import combinations
from multiprocessing import Pool
################################################################################
# PARAMETERS
################################################################################
"""
Special cases, defined as list of (amino acid, atom)-tuples.
All greek letters are substituted by these following ascii equivalents:
-alpha: A
-beta: B
-gamma: G
-delta: D
-epsilon: E
-eta: H
-zeta: Z
See table 1 in paper.
"""
specialcodes = {"GCA": [("GLY", "GA")],
"CB": [("PRO", "CG"), ("PRO", "CD")],
"KNZ": [("LYS", "CE"), ("LYS", "NZ")],
"KCD": [("LYS", "CD")],
"DOD": [("ASP", "CG"), ("ASP", "OD1"), ("ASP", "OD2"),
("GLU", "CD"), ("GLU", "OE1"), ("GLU", "OE2")],
"RNH": [("ARG", "CZ"), ("ARG", "NH1"), ("ARG", "NH2")],
"NND": [("ASN", "CG"), ("ASN", "OD1"), ("ASN", "ND2"),
("GLN", "CD"), ("GLN", "OE1"), ("GLN", "NE2")],
"RNE": [("ARG", "CD"), ("ARG", "NE")],
"SOG": [("SER", "CB"), ("SER", "OG"), ("THR", "OG1"),
("TYR", "OH")],
"HNE": [("HIS", "CG"), ("HIS", "ND1"), ("HIS", "CD2"),
("HIS", "CE1"), ("HIS", "NE2"), ("TRP", "NE1")],
"YCZ": [("TYR", "CE1"), ("TYR", "CE2"), ("TYR", "CZ")],
"FCZ": [("ARG", "CG"), ("GLN", "CG"), ("GLU", "CG"),
("ILE", "CG1"), ("LEU", "CG"), ("LYS", "CG"),
("MET", "CG"), ("MET", "SD"), ("PHE", "CG"),
("PHE", "CD1"), ("PHE", "CD2"), ("PHE", "CE1"),
("PHE", "CE2"), ("PHE", "CZ"), ("THR", "CG2"),
("TRP", "CG"), ("TRP", "CD1"), ("TRP", "CD2"),
("TRP", "CE2"), ("TRP", "CE3"), ("TRP", "CZ2"),
("TRP", "CZ3"), ("TRP", "CH2"), ("TYR", "CG"),
("TYR", "CD1"), ("TYR", "CD2")],
"LCD": [("ILE", "CG2"),
# The paper suggests the nomenclature CD for Isoleucin,
# but it seems that CD1 is the standard.
# See: http://www.bmrb.wisc.edu/ref_info/atom_nom.tbl
# ("ILE", "CD"),
("ILE", "CD1"),
#
("LEU", "CD1"),
("LEU", "CD2"), ("MET", "CE"), ("VAL", "CG1"),
("VAL", "CG2")],
"CSG": [("CYS", "SG")]
}
# Values of the connectivity matrix as defined in the paper.
connectivitymatrix = pd.DataFrame([[3, 3, 2, 2],
[3, 3, 3, 3],
[4, 3, 3, 3],
[3, 3, 2, 2]],
index=[1, 2, 3, 4], columns=[1, 2, 3, 4])
# Class of atoms for connectivity matrix, default is side-chain, class 4.
connectivityclass = defaultdict(lambda: 4,
{"N": 1,
"CA": 2,
"C": 3,
"O": 3})
# Load contact energies
contactenergies = pd.DataFrame.from_csv("contactenergies.csv")
# Fill missing values such that the matrix is symmetric
contactenergies = contactenergies.fillna(contactenergies.T)
################################################################################
# //PARAMETERS
################################################################################
# VT100 controls
CURSOR_UP_ONE = "\x1b[1A"
CURSOR_DOWN_ONE = "\x1b[1B"
ERASE_LINE = "\x1b[2K"
DELETE_LINE = "\x1b[1M"
"""
Pre-process special cases for performance boost.
"""
processed_specialcases = dict()
for code, caseslist in specialcodes.items():
for case in caseslist:
processed_specialcases[case] = code
def atomtype(atom):
"""
Atom type for atom. (Table 1)
:param atom: PDB.Atom
:return: str
"""
try:
# Constant time lookup of special cases
return processed_specialcases[(atom.parent.resname, atom.name)]
except KeyError:
# For backbone atoms, atom.name is code
return atom.name
def ligandfilter(pdb):
"""
Remove water and other ligands from pdb.
:param pdb: PDB.Structure.Structure
:return: None
"""
# Remove non amino acid residues
# To upkeep the integrity due to detaching, iterate over child_list copy!
for model in pdb.child_list[:]:
for chain in model.child_list[:]:
for res in chain.child_list[:]:
if not PDB.is_aa(res):
chain.detach_child(res.id)
if len(chain) == 0:
model.detach_child(chain)
if len(model) == 0:
pdb.detach_child(model)
# if the pdb still has more than one model, it's probably an NMR structure
# simply keep the first model
if len(pdb) > 1:
for model in pdb.child_list[1:]:
pdb.detach_child(model.id)
if len(pdb.child_list[0]) > 1:
model = pdb.child_list[0]
for chain in model.child_list[1:]:
model.detach_child(chain.id)
# There is only one model left
assert len(pdb) == 1
# This model has only one chain
assert len(pdb.child_list[0]) == 1
def iscontactpair(atom1, atom2):
"""
Determine whether two atoms are possible contact pairs based on their
connection class. It is assumed that the atoms are on the same chain.
:param atom1: PDB.Atom
:param atom2: PDB.Atom
:return: bool
"""
if atom1 - atom2 > 6:
# Exclusion due to big distance
return False
# Keep if residue position is distant enough
code1 = connectivityclass[atom1.name]
code2 = connectivityclass[atom2.name]
respos1 = atom1.parent.id[1]
respos2 = atom2.parent.id[1]
return respos2 - respos1 > connectivitymatrix.loc[code1, code2]
def energylookup(atom1, atom2):
"""
Lookup energy codes for a pair of atoms.
:param atom1: PDBAtom
:param atom12: PDBAtom
:return: float
"""
return contactenergies.loc[atomtype(atom1), atomtype(atom2)]
def allpairs(pdb):
"""
Iterator over all heavy atom pairs in a PDB structure.
PDB consists of only one model and only one chain.
:param pdb: PDB.Structure.Structure
:return: iterator
"""
# Filter out hydrogen and C-Terminal hydroxyl-group
heavyatoms = []
for atom in pdb.get_atoms():
if atom.element != "H" and atom.name != "OXT":
heavyatoms.append(atom)
# Generator of pairs
for atom1, atom2 in combinations(heavyatoms, 2):
yield (atom1, atom2)
def worker(pair):
"""
Check whether a pair is a valid contact pair, and compute energies.
:param pair: tuple(PDBAtom, PDBAtom)
:return: flaot
"""
if iscontactpair(*pair):
try:
return energylookup(*pair)
except KeyError:
# Some annoying pdb files use unconventional nomenclature for
# C-Terminal atoms
return 0
else:
return 0
def computecontactenergy(pdbpath, pool):
"""
Compute contact pairs energy for a PDB file.
:param pdbpath: str
:param pool: multiporecessing.Pool
:return: float (energy in kcal/mol)
"""
parser = PDB.PDBParser(QUIET=True)
pdb = parser.get_structure("", pdbpath)
ligandfilter(pdb)
# Compute pairwise energy on multiple cores
async = pool.map_async(worker, allpairs(pdb))
# Wating wheel for user
waiting(async)
energies = async.get()
# Convert to kcal/mol
return sum(energies) / 21
def waiting(async):
"""
Show waiting symbol as long as async is still running.
:param async: Async object
:return: None
"""
print()
state = 0
chars = r"|/-\."
while not async.ready():
print(CURSOR_UP_ONE + DELETE_LINE + chars[state])
state = (state + 1) % len(chars)
time.sleep(0.5)
# Back to line of progressbar
print(DELETE_LINE + CURSOR_UP_ONE + DELETE_LINE + CURSOR_UP_ONE)
def updateprogress(current, ratio):
"""
Let user know how much work has been completed so far, and what the current
file is.
:param current: str
:param ratio: flaot
:return: None
"""
# Delete last progressbar
print(CURSOR_UP_ONE * 2 + DELETE_LINE * 2, end="")
# Print new one
print("[{: >#6.2f}% {: <50}]".format(ratio * 100, "-" * floor(ratio * 50)))
print("Current PDB: {}".format(current))
def main(path, out, cores):
"""
Compute contact energies for each pdb in path and write results to 'out'.
:param path: str
:param out: str
:param cores: int
:return: None
"""
# Find all pdbs in path
workload = []
for file in os.listdir(path):
if os.path.splitext(file)[1].lower() == ".pdb":
workload.append(file)
# Print few newlines to prevent progressbar from messing up the shell
print("\n\n")
# Compute energies
pool = Pool(processes=cores)
results = []
for (nr, pdb) in enumerate(workload):
updateprogress(pdb, nr / len(workload))
e = computecontactenergy(os.path.join(path, pdb), pool)
results.append((pdb, e))
pool.close()
# Make 100% to appear
updateprogress("Finished", 1)
# Store output
with open(out, "w") as handler:
handler.write("PDB,Energy in kcal/mol\n")
for pair in results:
handler.write("{},{}\n".format(*pair))
if __name__ == "__main__":
shell = argparse.ArgumentParser()
shell.add_argument("path",
help="Path to directory that contains the PDB files",
type=str)
shell.add_argument("out", help="Output-file for the computed energies")
shell.add_argument("cores", help="Nr. of cores", type=int, default=4)
args = shell.parse_args()
main(args.path, args.out, args.cores)