### Imports

In [1]:
import pandas as pd
import numpy as np
import os
import string # need this for the amino acid feature space

### Single file example

In [2]:
# Load one positional table as a worked example and preview its first ten rows.
# The columns shown in the preview are the residue window (AA-2 .. AA2) plus
# the SS label.
df = pd.read_csv('../positional_dssp_csv/(5)_1i3n.csv')
df.head(10)

Unnamed: 0,AA-2,AA-1,AA0,AA1,AA2,SS
0,-,-,A,E,K,C
1,-,A,E,K,V,C
2,A,E,K,V,L,E
3,E,K,V,L,V,E
4,K,V,L,V,T,E
5,V,L,V,T,G,E
6,L,V,T,G,G,E
7,V,T,G,G,A,T
8,T,G,G,A,G,T
9,G,G,A,G,Y,T


In [3]:
# Feature window: every column except the label. drop() already returns a
# new frame, so the source table is left untouched.
data = df.drop('SS', axis=1)

# Label kept as a one-column DataFrame (not a Series).
ss = df.loc[:, ['SS']]

In [4]:
# Preview the feature window columns (SS label removed).
data.head()

Unnamed: 0,AA-2,AA-1,AA0,AA1,AA2
0,-,-,A,E,K
1,-,A,E,K,V
2,A,E,K,V,L
3,E,K,V,L,V
4,K,V,L,V,T


In [5]:
# Preview the SS label column.
ss.head()

Unnamed: 0,SS
0,C
1,C
2,E
3,E
4,E


In [6]:
# Alphabet of symbols observed at position AA-1 in this file.
# 'X' is included for consistency with the DSSP program, since 'X' is the
# undetermined symbol. A set union is used so 'X' is not listed twice when
# the file already contains it.
aminos = sorted(set(df['AA-1']) | {'X'})

# function-call form prints the same text on Python 2 and Python 3
print(aminos)

['-', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y']


### Single row example

In [16]:
def construct_verbose(row):
    """
    Turn a positional row into its verbose form.

    Each symbol in the row is suffixed with its window offset. Offsets run
    from -m to +m, where m is half the row length (integer division), so
    ['-', '-', 'A', 'E', 'K'] becomes ['--2', '--1', 'A0', 'E1', 'K2'].

    Returns a list of strings, one per symbol in the row.
    """
    half = len(row) // 2
    offsets = range(-half, half + 1)
    return [symbol + str(offset) for symbol, offset in zip(row, offsets)]

In [17]:
# First row of the feature table as a plain array of symbols.
# .iloc does the positional lookup; the older .ix indexer is deprecated and
# no longer exists in current pandas releases.
p0 = data.iloc[0].values
vb = construct_verbose(p0)
vb

['--2', '--1', 'A0', 'E1', 'K2']

In [18]:
def construct_features(n=5):
    """
    Build the flat feature space for a window of n positions.

    Every symbol (the gap '-', the upper-case letters listed below including
    'X', and all lower-case ASCII letters) is paired with every offset from
    -(n//2) to +(n//2). Names are ordered symbol first, then offset:
    '--2', '--1', '-0', '-1', '-2', 'A-2', 'A-1', ...

    Returns a 1-D numpy array of feature-name strings.
    """
    half = n // 2
    offsets = [str(p) for p in range(-half, half + 1)]

    upper = [
        '-', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
        'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y',
    ]
    symbols = upper + list(string.ascii_lowercase)

    # one name per (symbol, offset) pair, symbol varying slowest
    return np.asarray([sym + off for sym in symbols for off in offsets])

In [19]:
# Build the default (n=5) feature space once, then report its size and
# contents. print() is called as a function so the cell also runs on Python 3.
fs = construct_features()
print(len(fs))
print(fs)

240
['--2' '--1' '-0' '-1' '-2' 'A-2' 'A-1' 'A0' 'A1' 'A2' 'C-2' 'C-1' 'C0'
 'C1' 'C2' 'D-2' 'D-1' 'D0' 'D1' 'D2' 'E-2' 'E-1' 'E0' 'E1' 'E2' 'F-2'
 'F-1' 'F0' 'F1' 'F2' 'G-2' 'G-1' 'G0' 'G1' 'G2' 'H-2' 'H-1' 'H0' 'H1' 'H2'
 'I-2' 'I-1' 'I0' 'I1' 'I2' 'K-2' 'K-1' 'K0' 'K1' 'K2' 'L-2' 'L-1' 'L0'
 'L1' 'L2' 'M-2' 'M-1' 'M0' 'M1' 'M2' 'N-2' 'N-1' 'N0' 'N1' 'N2' 'P-2'
 'P-1' 'P0' 'P1' 'P2' 'Q-2' 'Q-1' 'Q0' 'Q1' 'Q2' 'R-2' 'R-1' 'R0' 'R1' 'R2'
 'S-2' 'S-1' 'S0' 'S1' 'S2' 'T-2' 'T-1' 'T0' 'T1' 'T2' 'V-2' 'V-1' 'V0'
 'V1' 'V2' 'W-2' 'W-1' 'W0' 'W1' 'W2' 'X-2' 'X-1' 'X0' 'X1' 'X2' 'Y-2'
 'Y-1' 'Y0' 'Y1' 'Y2' 'a-2' 'a-1' 'a0' 'a1' 'a2' 'b-2' 'b-1' 'b0' 'b1' 'b2'
 'c-2' 'c-1' 'c0' 'c1' 'c2' 'd-2' 'd-1' 'd0' 'd1' 'd2' 'e-2' 'e-1' 'e0'
 'e1' 'e2' 'f-2' 'f-1' 'f0' 'f1' 'f2' 'g-2' 'g-1' 'g0' 'g1' 'g2' 'h-2'
 'h-1' 'h0' 'h1' 'h2' 'i-2' 'i-1' 'i0' 'i1' 'i2' 'j-2' 'j-1' 'j0' 'j1' 'j2'
 'k-2' 'k-1' 'k0' 'k1' 'k2' 'l-2' 'l-1' 'l0' 'l1' 'l2' 'm-2' 'm-1' 'm0'
 'm1' 'm2' 'n-2' 'n-1' 'n0' 'n1' 'n2' 'o-2' 'o-1

In [24]:
def construct_full(verb, fs):
    """
    One-hot encode a verbose row against the feature space.

    Parameters
    ----------
    verb : iterable of str
        Verbose entries such as 'A0' or '--2'.
    fs : array-like of str
        Feature names; an entry's index here is its index in the output.

    Returns
    -------
    numpy.ndarray
        Integer vector of length len(fs), with 1 at the first index where
        each entry of verb occurs in fs and 0 everywhere else.

    Raises
    ------
    IndexError
        If an entry of verb does not occur in fs.
    """
    # accept lists as well as arrays so the == below is always elementwise
    fs = np.asarray(fs)

    # start with empty array
    full = np.zeros(len(fs), dtype=int)

    for v in verb:
        hits = np.where(fs == v)[0]
        if len(hits) == 0:
            # name the offending entry instead of failing on an empty lookup
            raise IndexError(
                'verbose entry %r is not in the feature space' % (v,))
        # set the index where we have this entry
        full[hits[0]] = 1

    return full
    

In [25]:
# Verbose form of the first row, produced by construct_verbose earlier.
vb

['--2', '--1', 'A0', 'E1', 'K2']

In [26]:
# One-hot encode the example verbose row against the feature space fs.
r0 = construct_full(vb, fs)

In [27]:
# Display the encoded vector: one slot per feature name, 1 where the row has that entry.
r0

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Run over all rows 

In [28]:
def construct_verbose_df(data, n=5):
    """
    Convert a positional DataFrame into its one-hot verbose DataFrame.

    Each row of data is expanded with construct_verbose and then encoded
    with construct_full against the feature space for window size n.

    Returns a DataFrame with one column per feature name and one row per
    row of data.
    """
    feature_names = construct_features(n=n)

    encoded_rows = [
        construct_full(construct_verbose(positional), feature_names)
        for positional in data.values
    ]

    return pd.DataFrame(encoded_rows, columns=feature_names)

In [29]:
# Encode the whole example table with the default window size (n=5)
# and preview the first ten encoded rows.
full_df = construct_verbose_df(data)
full_df.head(10)

Unnamed: 0,--2,--1,-0,-1,-2,A-2,A-1,A0,A1,A2,...,y-2,y-1,y0,y1,y2,z-2,z-1,z0,z1,z2
0,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Now run over all files

In [30]:
from __future__ import print_function

In [31]:
# List the files directly inside the positional CSV directory. next() takes
# only the first (top-level) entry that os.walk yields, so subdirectories are
# not descended into. Then preview the first ten names.
topdir, _, files = next(os.walk('../positional_dssp_csv/'))
files[:10]

['(11)_1a3n.csv',
 '(11)_1b3n.csv',
 '(11)_1c3n.csv',
 '(11)_1d3n.csv',
 '(11)_1g3n.csv',
 '(11)_1h3n.csv',
 '(11)_1i3n.csv',
 '(11)_1j3n.csv',
 '(11)_1k3n.csv',
 '(11)_1l3n.csv']

In [32]:
# Convert every positional CSV into its verbose (one-hot) counterpart.
# Only *.csv entries are processed, so stray files in the directory cannot
# break the window-size parse below.
csv_files = [fi for fi in files if fi.endswith('.csv')]
n = len(csv_files)

# make sure the destination exists before the first write
outdir = '../verbose_dssp_csv/'
if not os.path.isdir(outdir):
    os.makedirs(outdir)

for i, fi in enumerate(csv_files):
    # calculate fraction of total
    per = int((i+1)*100.0/n)

    # print progress ('\r' returns to the line start so the text is overwritten)
    print('\rprogress: '+str(per)+'%', end='')

    # get window spacing from the "(w)" prefix of the file name
    w = int(fi[fi.find('(') + 1: fi.find(')')])

    # get the data; the SS label column is not part of the feature window
    df = pd.read_csv(os.path.join(topdir, fi))
    data = df.drop(['SS'], axis=1)

    # make the df
    full_df = construct_verbose_df(data, n=w)

    # write the df under the same file name in the output directory
    fn = os.path.join(outdir, fi)
    full_df.to_csv(fn, index=False)

progress: 100%