In [30]:
import pandas as pd
import numpy as np
import os

In [3]:
# take for example
with open('../dssp.data/1a3n.dssp') as f:
    data = f.read()

In [4]:
protein = data[data.find('#'):].split('\n')
protein

['#  RESIDUE AA STRUCTURE BP1 BP2  ACC     N-H-->O    O-->H-N    N-H-->O    O-->H-N    TCO  KAPPA ALPHA  PHI   PSI    X-CA   Y-CA   Z-CA ',
 '    1    1 A V              0   0  147      0, 0.0     2,-0.6     0, 0.0   428,-0.0   0.000 360.0 360.0 360.0 157.5   10.2   20.8    6.8',
 '    2    2 A L        -     0   0   13     71,-0.1   122,-0.0   125,-0.1     0, 0.0  -0.785 360.0-142.6 -89.6 124.7    6.6   21.5    7.8',
 '    3    3 A S     >  -     0   0   42     -2,-0.6     4,-3.0     1,-0.1     5,-0.3  -0.260  26.2-103.3 -75.8 167.2    4.8   23.2    4.9',
 '    4    4 A P  H  > S+     0   0  106      0, 0.0     4,-2.3     0, 0.0     5,-0.2   0.929 126.4  49.2 -56.6 -43.6    2.3   26.0    5.3',
 '    5    5 A A  H  > S+     0   0   59      2,-0.2     4,-2.6     1,-0.2     5,-0.2   0.920 109.3  52.7 -66.7 -37.7   -0.5   23.4    4.5',
 '    6    6 A D  H  > S+     0   0   13      1,-0.2     4,-2.7     2,-0.2     5,-0.2   0.917 110.1  47.8 -60.9 -43.1    1.1   21.0    7.1',
 '    7    7 A

In [7]:
# construct the header row
header = protein[0].split()
header = [header[0]] + [header[1]+" "+str(i) for i in range(1,3)] + [header[2]] + ['STRUCTURE '+str(i) for i in range(1, 4)] + header[4:]
header[0] = 'DSSP RESIDUE #'
header[1] = 'PDB RESIDUE #'
header[2] = 'PDB CHAIN ID'

['DSSP RESIDUE #',
 'PDB RESIDUE #',
 'PDB CHAIN ID',
 'AA',
 'STRUCTURE 1',
 'STRUCTURE 2',
 'STRUCTURE 3',
 'BP1',
 'BP2',
 'ACC',
 'N-H-->O',
 'O-->H-N',
 'N-H-->O',
 'O-->H-N',
 'TCO',
 'KAPPA',
 'ALPHA',
 'PHI',
 'PSI',
 'X-CA',
 'Y-CA',
 'Z-CA']

In [15]:
def row_splitter(l):
    
    ind = [(0, 5),     # known as '#'
           (5, 10),    # known as RESIDUE
           (10, 12),   # known as AA
           (12, 14),   # known as STRUCTURE 1
           (14, 17),   # known as STRUCTURE 2
           (17, 20),   # known as STRUCTURE 3
           (20, 23),   # known as STRUCTURE 4
           (23, 29),   # known as BP1
           (29, 33),   # known as BP2
           (33, 38),   # known as ACC
           (38, 50),   # known as N-H-->O 1
           (50, 61),   # known as O-->H-N 1
           (61, 72),   # known as N-H-->O 2
           (72, 83),   # known as O-->H-N 2
           (83, 91),   # known as TCO
           (91, 97),   # known as KAPPA
           (97, 103),  # known as ALPHA
           (103, 109), # known as PHI
           (109, 115), # known as PSI
           (115, 122), # known as X-CA
           (122, 129), # known as Y-CA
           (129, 136)] # known as Z-CA
    
    row = []
    
    for s, e in ind: 
        entry = l[s:e]
        entry = entry.replace(' ', '')
        row.append(entry)
    
    return row

In [16]:
protein[0]

'#  RESIDUE AA STRUCTURE BP1 BP2  ACC     N-H-->O    O-->H-N    N-H-->O    O-->H-N    TCO  KAPPA ALPHA  PHI   PSI    X-CA   Y-CA   Z-CA '

In [17]:
protein[142]

'  142        !*             0   0    0      0, 0.0     0, 0.0     0, 0.0     0, 0.0   0.000 360.0 360.0 360.0 360.0    0.0    0.0    0.0'

In [18]:
row_splitter(protein[142])

['142',
 '',
 '',
 '!',
 '*',
 '',
 '',
 '0',
 '0',
 '0',
 '0,0.0',
 '0,0.0',
 '0,0.0',
 '0,0.0',
 '0.000',
 '360.0',
 '360.0',
 '360.0',
 '360.0',
 '0.0',
 '0.0',
 '0.0']

In [19]:
clean_protein = [row_splitter(r) for r in protein[1:]]

In [26]:
protein_df = pd.DataFrame(clean_protein, columns=header, index=range(1, len(clean_protein)+1))

In [27]:
protein_df.head()

Unnamed: 0,DSSP RESIDUE #,PDB RESIDUE #,PDB CHAIN ID,AA,STRUCTURE 1,STRUCTURE 2,STRUCTURE 3,BP1,BP2,ACC,...,N-H-->O,O-->H-N,TCO,KAPPA,ALPHA,PHI,PSI,X-CA,Y-CA,Z-CA
1,1,1,A,V,,,,0,0,147,...,00.0,"428,-0.0",0.0,360.0,360.0,360.0,157.5,10.2,20.8,6.8
2,2,2,A,L,,,-,0,0,13,...,"125,-0.1",00.0,-0.785,360.0,-142.6,-89.6,124.7,6.6,21.5,7.8
3,3,3,A,S,,>,-,0,0,42,...,"1,-0.1","5,-0.3",-0.26,26.2,-103.3,-75.8,167.2,4.8,23.2,4.9
4,4,4,A,P,H,>,S+,0,0,106,...,00.0,"5,-0.2",0.929,126.4,49.2,-56.6,-43.6,2.3,26.0,5.3
5,5,5,A,A,H,>,S+,0,0,59,...,"1,-0.2","5,-0.2",0.92,109.3,52.7,-66.7,-37.7,-0.5,23.4,4.5


### Now run on all of the DSSP files

In [37]:
topdir, _, files = next(os.walk('../dssp.data/'))

In [43]:
files[1][:files[0].find('.')]

'1b3n'

In [50]:
for i, fi in enumerate(files):
    # show progress
    print(i, sep='', end='\r', flush=True)
    # open the .dssp file
    with open(topdir+fi) as f:
        data = f.read()
    # find the sequence section
    protein = data[data.find('#'):].split('\n')
    # go through and get each column of the row of data
    clean_protein = [row_splitter(r) for r in protein[1:]]
    # construct the dataframe
    protein_df = pd.DataFrame(clean_protein, columns=header, index=range(1, len(clean_protein)+1))
    # construct filename
    fn = '../dssp_csv/' + fi[:fi.find('.')] + '.csv'
    # write to csv
    protein_df.to_csv(fn, index=False)



In [51]:
ex_df = pd.read_csv('../dssp_csv/1i3n.csv')
ex_df.head()

Unnamed: 0,DSSP RESIDUE #,PDB RESIDUE #,PDB CHAIN ID,AA,STRUCTURE 1,STRUCTURE 2,STRUCTURE 3,BP1,BP2,ACC,...,N-H-->O.1,O-->H-N.1,TCO,KAPPA,ALPHA,PHI,PSI,X-CA,Y-CA,Z-CA
0,1.0,2.0,A,A,,,,0,0.0,133,...,00.0,"79,-0.2",0.0,360.0,360.0,360.0,-1.4,-4.8,7.2,22.5
1,2.0,3.0,A,E,,,-,0,0.0,105,...,"77,-0.1","79,-0.2",-0.681,360.0,-126.5,-88.8,145.6,-1.4,8.7,21.7
2,3.0,4.0,A,K,E,,-,a81,0.0,A46,...,"-2,-0.3","80,-0.9",-0.3,2.4,-131.5,-88.8,174.4,1.3,7.7,24.2
3,4.0,5.0,A,V,E,,-,ab83,28.0,A0,...,"77,-0.2","2,-0.4",-0.993,20.7,-141.5,-123.8,122.5,3.8,9.6,26.3
4,5.0,6.0,A,L,E,,-,ab84,29.0,A0,...,"-2,-0.5","2,-0.5",-0.735,14.2,-164.6,-85.6,131.1,7.4,8.4,26.1
