Added Knotscope to package

monteirotorres · Jun 27, 2018 · b80fc19 · b80fc19
1 parent f0486ae
commit b80fc19
Show file tree

Hide file tree

Showing 3 changed files with 305 additions and 1 deletion.
diff --git a/Structuralia/GlobalVars.py b/Structuralia/GlobalVars.py
@@ -54,3 +54,11 @@
            progressbar.Bar(),
            progressbar.Percentage(),
            ' (', progressbar.AdaptiveETA(), ') ']
+
+clrs = {'r': "\033[1;91m",
+        'g': "\033[1;92m",
+        'y': "\033[1;93m",
+        'b': "\033[1;94m",
+        'p': "\033[1;95m",
+        'c': "\033[1;96m",
+        'n': "\033[1;0m"}
diff --git a/Structuralia/KnotScope.py b/Structuralia/KnotScope.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python3
+# Imports
+###############################################################################
+import os
+import sys
+import argparse
+import collections
+import pandas as pd
+import textwrap as tw
+import itertools as it
+import Structuralia.Toolbox as strtools
+from Structuralia.GlobalVars import *
+import Bio.PDB.Polypeptide as bpp_poly
+
+# LICENSE
+###############################################################################
+'''
+Structuralia: A suite of python scripts to easily manipulate PDBs
+Copyright (C) 2018  Pedro H. M. Torres
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Contact info:
+Department Of Biochemistry
+University of Cambridge
+80 Tennis Court Road
+Cambridge CB2 1GA
+E-mail address: monteirotorres@gmail.com
+'''
+
+# Description
+###############################################################################
+'''
+ A simple biopython-based code to detect and output unrealistic aplha
+ carbon distances in pdb files.
+'''
+# Dictionaries
+###############################################################################
+
+# Global Variables
+###############################################################################
+
+# Classes
+###############################################################################
+
+# Functions
+###############################################################################
+def printv(text, verbosity):
+    '''
+    Function to control print verbosity.
+    Called by: get_knots()
+    '''
+    if verbosity == 1:
+        print(text)
+
+
+def make_diagonal(core, entry):
+    '''
+    Outputs a list of 3 elements to act as the diagonal of an identity matrix.
+    the first and two elements are equal and will serve as the row and column
+    indexes, while the third element is a dash.
+    Called by: get_knots()
+    '''
+    line = []
+    line.append(core[entry].get_parent().id[1])
+    line.append(core[entry].get_parent().id[1])
+    line.append('-')
+    return line
+
+
+def get_knots(pdb, cutoff, cluster_cutoff, verbosity):
+    '''
+    Main routine, uses biopython and pandas to detect knots and cluster them
+    through the implementation of the average linkage algorithm
+    Called by: main()
+    '''
+    if (pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz") or pdb.endswith(".pdb1") or pdb.endswith(".pdb1.gz") or pdb.endswith(".pdb.gz")) and not pdb.startswith('CONTACTS-'):
+        pdb_name, structure, nchains = strtools.parse_pdb_structure(pdb)
+        print(str('\n'+clrs['p']+pdb+clrs['n']))
+        with open('KnotScope.log', 'a') as log:
+            log.write(str('>'+pdb+'\n'))
+        mainchain = [atom for atom in bpp.Selection.unfold_entities(structure[0], 'A') if bpp_poly.is_aa(atom.get_parent(), standard=True) and (atom.id == 'CA')]# or atom.id == 'N' or atom.id == 'O')]
+        contacts = []
+        core = []
+        for atom in mainchain:
+            distances = []
+            ns = bpp.NeighborSearch(mainchain)
+            center = atom.get_coord()
+            neighbors = [neighbor for neighbor in ns.search(center, cutoff) if (neighbor.get_parent().id[1] - atom.get_parent().id[1]) > abs(3)]
+            if neighbors:
+                for neighbor in neighbors:
+                    d = neighbor - atom
+                    distances.append(d)
+                    if d <= cutoff:
+                        printv(clrs['y']+'Unlikely proximity'+clrs['n']+' between residues '+clrs['y']+str(atom.get_parent().id[1])+clrs['n']+' and '+clrs['y']+str(neighbor.get_parent().id[1])+clrs['n']+'!', verbosity)
+                        printv(str(d), verbosity)
+                        with open('KnotScope.log', 'a') as log:
+                            log.write(str(atom.get_parent().id[1])+','+str(neighbor.get_parent().id[1])+','+str(d)+'\n')
+                        contacts.append(neighbor.get_parent())
+                        contacts.append(atom.get_parent())
+                        if atom not in core:
+                            core.append(atom)
+                        if neighbor not in core:
+                            core.append(neighbor)
+        # Save contacts to pdb file if they exist
+        if contacts:
+            io.set_structure(structure)
+            io.save('CONTACTS-'+pdb, strtools.SelectResidues(contacts))
+        # Start cluster analysis to separate knots
+        pairwisedist = []
+        # Measure pairwise distances of every CA involved in knots and record in vertical list
+        if len(core) > 1:
+            for a, b in it.combinations(core, 2):
+                d = a - b
+                pairwisedist.append([a.get_parent().id[1],b.get_parent().id[1],d])
+            # Add values for diagonal
+            for entry in range(len(core)):
+                line=make_diagonal(core, entry)
+                pairwisedist.append([line[0],line[1],line[2]])
+            # Create pandas dataframe, make it a square and symmetric matrix
+            df = pd.DataFrame(pairwisedist, index=None, columns=None)
+            df = pd.crosstab(index=df[0],columns=df[1], values=df[2], aggfunc='sum' , dropna=True).fillna(0)
+            df = df+df.T
+            # Start average linkage algorithm
+            reslist = list(df.columns)
+            clusters = []
+            row_index = -1
+            col_index = -1
+            array = []
+            for n in range(df.shape[0]):
+                array.append(n)
+            clusters.append(array.copy())
+            for k in range(1, df.shape[0]):
+                min_val = sys.maxsize
+                for i in range(0, df.shape[0]):
+                    for j in range(0, df.shape[1]):
+                        #print(str(df.iloc[i,j]))
+                        if type(df.iloc[i, j]) != str:
+                            if(df.iloc[i, j] <= min_val):
+                                min_val = df.iloc[i,j]
+                                row_index = i
+                                col_index = j
+
+                for i in range(0,df.shape[0]):
+                    if(i != col_index and i!=row_index):
+                        temp = (df.iloc[col_index,i]+df.iloc[row_index,i])/2
+                        df.iloc[col_index,i] = temp
+                        df.iloc[i,col_index] = temp
+                for i in range (0,df.shape[0]):
+                    df.iloc[row_index,i] = sys.maxsize
+                    df.iloc[i,row_index] = sys.maxsize
+                minimum = min(row_index,col_index)
+                maximum = max(row_index,col_index)
+                for n in range(len(array)):
+                    if(array[n] == maximum):
+                        array[n] = minimum
+                clusters.append(array.copy())
+                # Stop iterations when minimum pairwise distance in the matrix is greater than 22
+                if min_val > cluster_cutoff:
+                    break
+            # Get the clusters from last iteration and 'count' elements
+            clustered_res = clusters[-1]
+            counter = collections.Counter(clustered_res)
+            # Combine residue and cluster information and print them user-friendly
+            clusterdict = dict(zip(reslist, clustered_res))
+            print(clrs['y']+'\nLikely '+str(len(set(clusters[-1])))+' knot(s) found in structure under chosen criteria...'+clrs['n'])
+            temp = []
+            n = 0
+            for cl in set(clusterdict.values()):
+                n += 1
+                cluster_residues = []
+                print('\nKnot '+clrs['y']+str(n)+clrs['n']+' (Cluster id: '+str(cl)+') involves '+clrs['y']+str(list(counter.values())[n-1])+clrs['n']+' residues:')
+                for res in clusterdict:
+                    if clusterdict[res] == cl:
+                        cluster_residues.append(res)
+                print(clrs['y']+', '.join([str(a) for a in cluster_residues])+clrs['n'])
+        else:
+            print(clrs['g']+'No CA distances under '+str(cutoff)+' angstrons found'+clrs['n']+'!\n')
+            with open('KnotScope.log', 'a') as log:
+                log.write('No contacts found!\n')
+
+        del pdb_name, structure, nchains, contacts
+    elif pdb.startswith('CONTACTS-'):
+        pass
+    else:
+        print(clrs['y']+pdb+clrs['n']+' not a pdb-related structure format.'+clrs['r']+' SKIPPING!'+clrs['n'])
+
+
+# Main Function
+###############################################################################
+def main():
+    print(tw.dedent("""
+         ###################################################################
+         ########################### KnotScope #############################
+         ###################################################################
+
+                 Copyright (C) 2018  Torres, P.H.M.; Blundell, T.L.
+                          [The University of Cambridge]
+
+                   This program comes with ABSOLUTELY NO WARRANTY
+
+         A simple biopython-based code to detect and output unrealistic aplha
+         carbon distances in pdb files.
+
+         ###################################################################
+
+          """))
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(dest='files',
+                        metavar='files',
+                        help='Either pdb file or directory containing pdbs\n')
+
+    parser.add_argument('-v', '--verbose',
+                        dest='verbosity',
+                        action='count',
+                        default=0,
+                        help='Controls verbosity\n')
+
+    parser.add_argument('-c', '--cutoff',
+                        dest='cutoff',
+                        type=float, default=3.2,
+                        metavar='float',
+                        help='Cutoff to be used in neighborsearching (default = 3.2)\n')
+
+    parser.add_argument('-l', '--cluster',
+                        dest='cluster_cutoff',
+                        type=float, default=20.0,
+                        metavar='float',
+                        help='Cutoff to be used in knot clustering (default = 20)\n')
+
+    args = parser.parse_args()
+
+    files = args.files
+    verbosity = args.verbosity
+    cutoff = args.cutoff
+    cluster_cutoff = args.cluster_cutoff
+
+    with open('KnotScope.log', 'w+'):
+        pass
+    if os.path.isdir(args.files):
+        for pdb_file in os.listdir(args.files):
+            get_knots(pdb_file, cutoff, cluster_cutoff, verbosity)
+    elif os.path.isfile(args.files):
+        get_knots(args.files, cutoff, cluster_cutoff, verbosity)
+    else:
+        raise FormatError('Please provide either a pdb file or a valid path to files')
+
+
+# Execute
+###############################################################################
+main()
diff --git a/Structuralia/Toolbox.py b/Structuralia/Toolbox.py
@@ -100,6 +100,16 @@ def accept_residue(self, residue):
             return 0
 
 
+class SelectAtoms(bpp.Select):
+    def __init__(self, atomlist):
+        self.atomlist = atomlist
+
+    def accept_atom(self, atom):
+        if atom in self.atomlist:
+            return 1
+        else:
+            return 0
+
 # Functions
 ###############################################################################
 
@@ -173,7 +183,7 @@ def download_pdblist(pdb_dir, list_file):
     pdbl.download_pdb_files(pdb_codes, pdir=pdb_dir, file_format='pdb')
 
 def parse_pdb(pdb_dir, pdb):
-    if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz"):
+    if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz") or pdb.endswith(".pdb1") or pdb.endswith(".pdb1.gz") or pdb.endswith(".pdb.gz"):
         pdb_name = pdb.split('.')[0].split("/")[-1]
         if pdb.endswith(".ent.gz"):
             pdb_file = gzip.open(pdb_dir+'/'+pdb, 'rt')
@@ -185,8 +195,30 @@ def parse_pdb(pdb_dir, pdb):
             structure = p.get_structure(pdb_name, pdb_file)
         except:
             print("Structure "+pdb_name+" could not be strictly parsed.")
+    else:
+        raise FormatError('Looks like you have not selected a pdb-related structure format.')
+
     return pdb_name, pdb_file, structure, contents
 
+
+def parse_pdb_structure(pdb):
+    if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz") or pdb.endswith(".pdb1") or pdb.endswith(".pdb1.gz") or pdb.endswith(".pdb.gz"):
+        pdb_name = pdb.split('.')[0].split("/")[-1][-4:]
+        if pdb.endswith(".gz"):
+            pdb_file = gzip.open(pdb, 'rt')
+        else:
+            pdb_file = open(pdb)
+        try:
+            structure = p.get_structure(pdb_name, pdb_file)
+            nchains = len(bpp.Selection.unfold_entities(structure, 'C'))
+        except:
+            print("Structure "+pdb_name+" could not be strictly parsed.")
+        return pdb_name, structure, nchains
+    else:
+        print(clrs['y']+pdb+clrs['n']+' not a pdb-related structure format.'+clrs['r']+' SKIPPING!'+clrs['n'])
+
+
+
 # @timed
 def extract_seqs(structure, defmodel):
     '''