### Imports

In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import os

### Example file

In [2]:
# Load one example CSV from the clean_dssp_csv directory and preview
# its first ten rows.
df = pd.read_csv('../clean_dssp_csv/1i3n.csv')
df.head(10)

Unnamed: 0,DSSP,PDB,CHAIN,AA,SS,3H,4H,5H,BEND,CHIR,...,ON2I,ON2E,TCO,KAPPA,ALPHA,PHI,PSI,X-CA,Y-CA,Z-CA
0,1,2.0,A,A,C,?,?,?,?,?,...,79,-0.2,0.0,360.0,360.0,360.0,-1.4,-4.8,7.2,22.5
1,2,3.0,A,E,C,?,?,?,?,-,...,79,-0.2,-0.681,360.0,-126.5,-88.8,145.6,-1.4,8.7,21.7
2,3,4.0,A,K,E,?,?,?,?,-,...,80,-0.9,-0.3,2.4,-131.5,-88.8,174.4,1.3,7.7,24.2
3,4,5.0,A,V,E,?,?,?,?,-,...,2,-0.4,-0.993,20.7,-141.5,-123.8,122.5,3.8,9.6,26.3
4,5,6.0,A,L,E,?,?,?,?,-,...,2,-0.5,-0.735,14.2,-164.6,-85.6,131.1,7.4,8.4,26.1
5,6,7.0,A,V,E,>,?,?,?,-,...,3,-0.8,-0.945,3.6,-159.8,-118.7,106.8,9.2,8.5,29.5
6,7,8.0,A,T,E,3,?,?,S,+,...,25,-0.2,-0.651,79.5,12.3,-85.0,145.9,13.0,8.2,29.3
7,8,9.0,A,G,T,>,?,?,S,+,...,6,-0.4,0.792,83.3,153.9,62.7,25.1,14.9,7.1,32.4
8,9,10.0,A,G,T,<,?,?,?,+,...,23,-0.1,0.541,56.7,65.3,-66.2,-9.4,11.4,6.1,33.8
9,10,11.0,A,A,T,3,?,?,S,+,...,28,-0.2,0.578,88.6,88.7,-89.4,-9.4,12.8,3.4,36.1


In [3]:
# Keep only the two key columns, AA and SS
# (presumably amino acid and secondary-structure label -- TODO confirm).
data = df[['AA', 'SS']]
data.head()

Unnamed: 0,AA,SS
0,A,C
1,E,C
2,K,E
3,V,E
4,L,E


In [4]:
# Pull the AA column out as a plain numpy array; this is the sequence
# that the windowing function below operates on.
seq = data['AA'].values
seq

array(['A', 'E', 'K', 'V', 'L', 'V', 'T', 'G', 'G', 'A', 'G', 'Y', 'I',
       'G', 'S', 'H', 'T', 'V', 'L', 'E', 'L', 'L', 'E', 'A', 'G', 'Y',
       'L', 'P', 'V', 'V', 'I', 'D', 'N', 'F', 'H', 'N', 'A', 'F', 'R',
       'G', 'G', 'G', 'S', 'L', 'P', 'E', 'S', 'L', 'R', 'R', 'V', 'Q',
       'E', 'L', 'T', 'G', 'R', 'S', 'V', 'E', 'F', 'E', 'E', 'M', 'D',
       'I', 'L', 'D', 'Q', 'G', 'A', 'L', 'Q', 'R', 'L', 'F', 'K', 'K',
       'Y', 'S', 'F', 'M', 'A', 'V', 'I', 'H', 'F', 'A', 'G', 'L', 'K',
       'A', 'M', 'G', 'E', 'S', 'V', 'Q', 'K', 'P', 'L', 'D', 'Y', 'Y',
       'R', 'V', 'N', 'L', 'T', 'G', 'T', 'I', 'Q', 'L', 'L', 'E', 'I',
       'M', 'K', 'A', 'H', 'G', 'V', 'K', 'N', 'L', 'V', 'F', 'S', 'S',
       'S', 'A', 'T', 'V', 'Y', 'G', 'N', 'P', 'Q', 'Y', 'L', 'P', 'L',
       'D', 'E', 'A', 'H', 'P', 'T', 'G', 'G', 'C', 'T', 'N', 'P', 'Y',
       'G', 'K', 'S', 'K', 'F', 'F', 'I', 'E', 'E', 'M', 'I', 'R', 'D',
       'L', 'C', 'Q', 'A', 'D', 'K', 'T', 'W', 'N', 'V', 'V', 'L

In [5]:
def construct_positional(seq, w=5, header=True):
    """
    Build a sliding-window ("positional") representation of a sequence.

    INPUT:
        seq    -- sequence (list / array) of amino-acid symbols
        w      -- window size (default 5). The half-width is w // 2, so
                  every row holds 2 * (w // 2) + 1 entries; an even w
                  therefore yields w + 1 columns.
        header -- when True (default) the first row of the result holds
                  the column labels 'AA-s', ..., 'AA0', ..., 'AAs'
                  where s = w // 2.
    OUTPUT: 2D numpy array, known as a positional protein, with one row
            per element of seq (plus the optional header row). Row k is
            the window centred on seq[k]; positions that fall off either
            end of the sequence are filled with '-'.
    RAISES: ValueError if w is negative.

    Contains helper function (construct_positional_header) to make the
    header row.
    """
    if w < 0:
        # a negative half-width would silently produce ragged rows
        raise ValueError('window size w must be non-negative, got %r' % (w,))

    s = w // 2
    # pad both ends so the first and last residues still get a full window
    pad = ['-'] * s + list(seq) + ['-'] * s
    n = len(seq)
    rows = []

    def construct_positional_header():
        # range (not xrange) keeps this working on both Python 2 and 3
        return np.asarray(['AA' + str(i) for i in range(-s, s + 1)])

    if header:
        rows.append(construct_positional_header())

    # i indexes the centre of each window inside the padded list
    for i in range(s, n + s):
        rows.append(pad[i - s:i + s + 1])

    return np.asarray(rows)

In [6]:
# Build the positional array for the example sequence using the
# function defaults (w=5, header row included).
positional = construct_positional(seq)

In [7]:
# First ten rows: the header row followed by the first nine windows.
positional[:10]

array([['AA-2', 'AA-1', 'AA0', 'AA1', 'AA2'],
       ['-', '-', 'A', 'E', 'K'],
       ['-', 'A', 'E', 'K', 'V'],
       ['A', 'E', 'K', 'V', 'L'],
       ['E', 'K', 'V', 'L', 'V'],
       ['K', 'V', 'L', 'V', 'T'],
       ['V', 'L', 'V', 'T', 'G'],
       ['L', 'V', 'T', 'G', 'G'],
       ['V', 'T', 'G', 'G', 'A'],
       ['T', 'G', 'G', 'A', 'G']], 
      dtype='|S4')

In [8]:
# Last eleven rows: the trailing windows, padded with '-' where the
# window runs past the end of the sequence.
positional[-11:]

array([['W', 'R', 'W', 'Q', 'K'],
       ['R', 'W', 'Q', 'K', 'Q'],
       ['W', 'Q', 'K', 'Q', 'N'],
       ['Q', 'K', 'Q', 'N', 'P'],
       ['K', 'Q', 'N', 'P', 'S'],
       ['Q', 'N', 'P', 'S', 'G'],
       ['N', 'P', 'S', 'G', 'F'],
       ['P', 'S', 'G', 'F', 'G'],
       ['S', 'G', 'F', 'G', 'T'],
       ['G', 'F', 'G', 'T', '-'],
       ['F', 'G', 'T', '-', '-']], 
      dtype='|S4')

In [9]:
# Wrap the array in a DataFrame: row 0 supplies the column names and
# the remaining rows are the data.
pos_df = pd.DataFrame(positional[1:], columns=positional[0])
pos_df.head(10)

Unnamed: 0,AA-2,AA-1,AA0,AA1,AA2
0,-,-,A,E,K
1,-,A,E,K,V
2,A,E,K,V,L
3,E,K,V,L,V
4,K,V,L,V,T
5,V,L,V,T,G
6,L,V,T,G,G
7,V,T,G,G,A
8,T,G,G,A,G
9,G,G,A,G,Y


### Now run over all spacings and proteins

In [12]:
from __future__ import print_function

In [13]:
# Take only the first os.walk entry, i.e. the files directly inside
# ../clean_dssp_csv/ (subdirectories are not descended into), and
# preview the first ten file names.
topdir, _, files = next(os.walk('../clean_dssp_csv/'))
files[:10]

['1a3n.csv',
 '1b3n.csv',
 '1c3n.csv',
 '1d3n.csv',
 '1g3n.csv',
 '1h3n.csv',
 '1i3n.csv',
 '1j3n.csv',
 '1k3n.csv',
 '1l3n.csv']

In [14]:
# For every input file and every odd window size from 5 to 19, build the
# positional table and write it to ../positional_dssp_csv/(<w>)_<file>.

# denominator for the progress percentage; max() avoids a division by
# zero when the input directory holds exactly one file
n = max(len(files) - 1, 1)

out_dir = '../positional_dssp_csv/'

# make sure the output directory exists so to_csv does not fail on a
# fresh checkout
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

for i, fi in enumerate(files):
    # calculate fraction of total
    per = int(i*100.0/n)

    # print progress ('\r' rewrites the same output line each iteration)
    print('\rprogress: '+str(per)+'%', end='')

    # get the data
    df = pd.read_csv(os.path.join(topdir, fi))

    # get seq
    seq = df['AA'].values

    # construct the positional df for each spacing
    # (range, not xrange, so the cell runs on both Python 2 and 3)
    for spacing in range(5, 20, 2):

        # construct positional array
        positional = construct_positional(seq, w=spacing, header=True)

        # create new pos df: row 0 is the header, the rest is data
        pos_df = pd.DataFrame(positional[1:], columns=positional[0])

        # construct filename
        fn = out_dir + '(' + str(spacing) + ')_' + fi

        # write to file
        pos_df.to_csv(fn, index=False)

print('\ncompleted')

progress: 100%
completed


In [15]:
# Sanity check: read back one of the files written above
# (window size 5, protein 1i3n) and inspect its first ten rows.
ex_pos_df = pd.read_csv('../positional_dssp_csv/(5)_1i3n.csv')
ex_pos_df.head(10)

Unnamed: 0,AA-2,AA-1,AA0,AA1,AA2
0,-,-,A,E,K
1,-,A,E,K,V
2,A,E,K,V,L
3,E,K,V,L,V
4,K,V,L,V,T
5,V,L,V,T,G
6,L,V,T,G,G
7,V,T,G,G,A
8,T,G,G,A,G
9,G,G,A,G,Y


In [16]:
# ...and its last ten rows, to confirm the end-of-sequence '-' padding
# survived the round trip through CSV.
ex_pos_df.tail(10)

Unnamed: 0,AA-2,AA-1,AA0,AA1,AA2
682,R,W,Q,K,Q
683,W,Q,K,Q,N
684,Q,K,Q,N,P
685,K,Q,N,P,S
686,Q,N,P,S,G
687,N,P,S,G,F
688,P,S,G,F,G
689,S,G,F,G,T
690,G,F,G,T,-
691,F,G,T,-,-
