Skip to content

Commit

Permalink
Added Knotscope to package
Browse files Browse the repository at this point in the history
  • Loading branch information
monteirotorres committed Jun 27, 2018
1 parent f0486ae commit b80fc19
Show file tree
Hide file tree
Showing 3 changed files with 305 additions and 1 deletion.
8 changes: 8 additions & 0 deletions Structuralia/GlobalVars.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,11 @@
progressbar.Bar(),
progressbar.Percentage(),
' (', progressbar.AdaptiveETA(), ') ']

clrs = {'r': "\033[1;91m",
'g': "\033[1;92m",
'y': "\033[1;93m",
'b': "\033[1;94m",
'p': "\033[1;95m",
'c': "\033[1;96m",
'n': "\033[1;0m"}
264 changes: 264 additions & 0 deletions Structuralia/KnotScope.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
#!/usr/bin/env python3
# Imports
###############################################################################
import os
import sys
import argparse
import collections
import pandas as pd
import textwrap as tw
import itertools as it
import Structuralia.Toolbox as strtools
from Structuralia.GlobalVars import *
import Bio.PDB.Polypeptide as bpp_poly

# LICENSE
###############################################################################
'''
Structuralia: A suite of python scripts to easily manipulate PDBs
Copyright (C) 2018 Pedro H. M. Torres
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact info:
Department Of Biochemistry
University of Cambridge
80 Tennis Court Road
Cambridge CB2 1GA
E-mail address: monteirotorres@gmail.com
'''

# Description
###############################################################################
'''
A simple biopython-based code to detect and output unrealistic aplha
carbon distances in pdb files.
'''
# Dictionaries
###############################################################################

# Global Variables
###############################################################################

# Classes
###############################################################################

# Functions
###############################################################################
def printv(text, verbosity):
'''
Function to control print verbosity.
Called by: get_knots()
'''
if verbosity == 1:
print(text)


def make_diagonal(core, entry):
'''
Outputs a list of 3 elements to act as the diagonal of an identity matrix.
the first and two elements are equal and will serve as the row and column
indexes, while the third element is a dash.
Called by: get_knots()
'''
line = []
line.append(core[entry].get_parent().id[1])
line.append(core[entry].get_parent().id[1])
line.append('-')
return line


def get_knots(pdb, cutoff, cluster_cutoff, verbosity):
'''
Main routine, uses biopython and pandas to detect knots and cluster them
through the implementation of the average linkage algorithm
Called by: main()
'''
if (pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz") or pdb.endswith(".pdb1") or pdb.endswith(".pdb1.gz") or pdb.endswith(".pdb.gz")) and not pdb.startswith('CONTACTS-'):
pdb_name, structure, nchains = strtools.parse_pdb_structure(pdb)
print(str('\n'+clrs['p']+pdb+clrs['n']))
with open('KnotScope.log', 'a') as log:
log.write(str('>'+pdb+'\n'))
mainchain = [atom for atom in bpp.Selection.unfold_entities(structure[0], 'A') if bpp_poly.is_aa(atom.get_parent(), standard=True) and (atom.id == 'CA')]# or atom.id == 'N' or atom.id == 'O')]
contacts = []
core = []
for atom in mainchain:
distances = []
ns = bpp.NeighborSearch(mainchain)
center = atom.get_coord()
neighbors = [neighbor for neighbor in ns.search(center, cutoff) if (neighbor.get_parent().id[1] - atom.get_parent().id[1]) > abs(3)]
if neighbors:
for neighbor in neighbors:
d = neighbor - atom
distances.append(d)
if d <= cutoff:
printv(clrs['y']+'Unlikely proximity'+clrs['n']+' between residues '+clrs['y']+str(atom.get_parent().id[1])+clrs['n']+' and '+clrs['y']+str(neighbor.get_parent().id[1])+clrs['n']+'!', verbosity)
printv(str(d), verbosity)
with open('KnotScope.log', 'a') as log:
log.write(str(atom.get_parent().id[1])+','+str(neighbor.get_parent().id[1])+','+str(d)+'\n')
contacts.append(neighbor.get_parent())
contacts.append(atom.get_parent())
if atom not in core:
core.append(atom)
if neighbor not in core:
core.append(neighbor)
# Save contacts to pdb file if they exist
if contacts:
io.set_structure(structure)
io.save('CONTACTS-'+pdb, strtools.SelectResidues(contacts))
# Start cluster analysis to separate knots
pairwisedist = []
# Measure pairwise distances of every CA involved in knots and record in vertical list
if len(core) > 1:
for a, b in it.combinations(core, 2):
d = a - b
pairwisedist.append([a.get_parent().id[1],b.get_parent().id[1],d])
# Add values for diagonal
for entry in range(len(core)):
line=make_diagonal(core, entry)
pairwisedist.append([line[0],line[1],line[2]])
# Create pandas dataframe, make it a square and symmetric matrix
df = pd.DataFrame(pairwisedist, index=None, columns=None)
df = pd.crosstab(index=df[0],columns=df[1], values=df[2], aggfunc='sum' , dropna=True).fillna(0)
df = df+df.T
# Start average linkage algorithm
reslist = list(df.columns)
clusters = []
row_index = -1
col_index = -1
array = []
for n in range(df.shape[0]):
array.append(n)
clusters.append(array.copy())
for k in range(1, df.shape[0]):
min_val = sys.maxsize
for i in range(0, df.shape[0]):
for j in range(0, df.shape[1]):
#print(str(df.iloc[i,j]))
if type(df.iloc[i, j]) != str:
if(df.iloc[i, j] <= min_val):
min_val = df.iloc[i,j]
row_index = i
col_index = j

for i in range(0,df.shape[0]):
if(i != col_index and i!=row_index):
temp = (df.iloc[col_index,i]+df.iloc[row_index,i])/2
df.iloc[col_index,i] = temp
df.iloc[i,col_index] = temp
for i in range (0,df.shape[0]):
df.iloc[row_index,i] = sys.maxsize
df.iloc[i,row_index] = sys.maxsize
minimum = min(row_index,col_index)
maximum = max(row_index,col_index)
for n in range(len(array)):
if(array[n] == maximum):
array[n] = minimum
clusters.append(array.copy())
# Stop iterations when minimum pairwise distance in the matrix is greater than 22
if min_val > cluster_cutoff:
break
# Get the clusters from last iteration and 'count' elements
clustered_res = clusters[-1]
counter = collections.Counter(clustered_res)
# Combine residue and cluster information and print them user-friendly
clusterdict = dict(zip(reslist, clustered_res))
print(clrs['y']+'\nLikely '+str(len(set(clusters[-1])))+' knot(s) found in structure under chosen criteria...'+clrs['n'])
temp = []
n = 0
for cl in set(clusterdict.values()):
n += 1
cluster_residues = []
print('\nKnot '+clrs['y']+str(n)+clrs['n']+' (Cluster id: '+str(cl)+') involves '+clrs['y']+str(list(counter.values())[n-1])+clrs['n']+' residues:')
for res in clusterdict:
if clusterdict[res] == cl:
cluster_residues.append(res)
print(clrs['y']+', '.join([str(a) for a in cluster_residues])+clrs['n'])
else:
print(clrs['g']+'No CA distances under '+str(cutoff)+' angstrons found'+clrs['n']+'!\n')
with open('KnotScope.log', 'a') as log:
log.write('No contacts found!\n')

del pdb_name, structure, nchains, contacts
elif pdb.startswith('CONTACTS-'):
pass
else:
print(clrs['y']+pdb+clrs['n']+' not a pdb-related structure format.'+clrs['r']+' SKIPPING!'+clrs['n'])


# Main Function
###############################################################################
def main():
print(tw.dedent("""
###################################################################
########################### KnotScope #############################
###################################################################
Copyright (C) 2018 Torres, P.H.M.; Blundell, T.L.
[The University of Cambridge]
This program comes with ABSOLUTELY NO WARRANTY
A simple biopython-based code to detect and output unrealistic aplha
carbon distances in pdb files.
###################################################################
"""))

parser = argparse.ArgumentParser()

parser.add_argument(dest='files',
metavar='files',
help='Either pdb file or directory containing pdbs\n')

parser.add_argument('-v', '--verbose',
dest='verbosity',
action='count',
default=0,
help='Controls verbosity\n')

parser.add_argument('-c', '--cutoff',
dest='cutoff',
type=float, default=3.2,
metavar='float',
help='Cutoff to be used in neighborsearching (default = 3.2)\n')

parser.add_argument('-l', '--cluster',
dest='cluster_cutoff',
type=float, default=20.0,
metavar='float',
help='Cutoff to be used in knot clustering (default = 20)\n')

args = parser.parse_args()

files = args.files
verbosity = args.verbosity
cutoff = args.cutoff
cluster_cutoff = args.cluster_cutoff

with open('KnotScope.log', 'w+'):
pass
if os.path.isdir(args.files):
for pdb_file in os.listdir(args.files):
get_knots(pdb_file, cutoff, cluster_cutoff, verbosity)
elif os.path.isfile(args.files):
get_knots(args.files, cutoff, cluster_cutoff, verbosity)
else:
raise FormatError('Please provide either a pdb file or a valid path to files')


# Execute
###############################################################################
main()
34 changes: 33 additions & 1 deletion Structuralia/Toolbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,16 @@ def accept_residue(self, residue):
return 0


class SelectAtoms(bpp.Select):
def __init__(self, atomlist):
self.atomlist = atomlist

def accept_atom(self, atom):
if atom in self.atomlist:
return 1
else:
return 0

# Functions
###############################################################################

Expand Down Expand Up @@ -173,7 +183,7 @@ def download_pdblist(pdb_dir, list_file):
pdbl.download_pdb_files(pdb_codes, pdir=pdb_dir, file_format='pdb')

def parse_pdb(pdb_dir, pdb):
if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz"):
if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz") or pdb.endswith(".pdb1") or pdb.endswith(".pdb1.gz") or pdb.endswith(".pdb.gz"):
pdb_name = pdb.split('.')[0].split("/")[-1]
if pdb.endswith(".ent.gz"):
pdb_file = gzip.open(pdb_dir+'/'+pdb, 'rt')
Expand All @@ -185,8 +195,30 @@ def parse_pdb(pdb_dir, pdb):
structure = p.get_structure(pdb_name, pdb_file)
except:
print("Structure "+pdb_name+" could not be strictly parsed.")
else:
raise FormatError('Looks like you have not selected a pdb-related structure format.')

return pdb_name, pdb_file, structure, contents


def parse_pdb_structure(pdb):
if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz") or pdb.endswith(".pdb1") or pdb.endswith(".pdb1.gz") or pdb.endswith(".pdb.gz"):
pdb_name = pdb.split('.')[0].split("/")[-1][-4:]
if pdb.endswith(".gz"):
pdb_file = gzip.open(pdb, 'rt')
else:
pdb_file = open(pdb)
try:
structure = p.get_structure(pdb_name, pdb_file)
nchains = len(bpp.Selection.unfold_entities(structure, 'C'))
except:
print("Structure "+pdb_name+" could not be strictly parsed.")
return pdb_name, structure, nchains
else:
print(clrs['y']+pdb+clrs['n']+' not a pdb-related structure format.'+clrs['r']+' SKIPPING!'+clrs['n'])



# @timed
def extract_seqs(structure, defmodel):
'''
Expand Down

0 comments on commit b80fc19

Please sign in to comment.