In [30]:
import pandas as pd
import numpy as np
import os

In [3]:
# take for example
with open('../dssp.data/1a3n.dssp') as f:
    data = f.read()

In [82]:
protein = data[data.find('#'):].split('\n')
protein

['#  RESIDUE AA STRUCTURE BP1 BP2  ACC     N-H-->O    O-->H-N    N-H-->O    O-->H-N    TCO  KAPPA ALPHA  PHI   PSI    X-CA   Y-CA   Z-CA ',
 '    1    2 A Y              0   0  132      0, 0.0     2,-0.6     0, 0.0   337,-0.0   0.000 360.0 360.0 360.0 125.9   36.3    3.3   32.0',
 '    2    3 A D        -     0   0  106      1,-0.1     3,-0.3     2,-0.0     4,-0.2  -0.801 360.0-178.6 -89.2 117.9   32.9    4.9   32.2',
 '    3    4 A F        +     0   0    3    395,-0.9    -1,-0.1    -2,-0.6   396,-0.1   0.306  61.4  93.2 -98.8   9.3   33.1    8.6   31.2',
 '    4    5 A K  S    S+     0   0   94    395,-0.2    -1,-0.2     1,-0.2   395,-0.1   0.799  97.1  27.7 -70.0 -29.0   29.4    9.2   31.7',
 '    5    6 A K  S    S+     0   0  171     -3,-0.3    -1,-0.2   394,-0.0    -2,-0.1   0.456  87.5 120.8-117.1   0.8   29.9   10.4   35.3',
 '    6    7 A I        -     0   0   36     -4,-0.2     2,-0.6     1,-0.1    -3,-0.0  -0.193  54.3-136.3 -66.2 159.5   33.4   11.9   35.5',
 '    7    8 A

In [7]:
# construct the header row
header = protein[0].split()
header = [header[0]] + [header[1]+" "+str(i) for i in range(1,3)] + [header[2]] + ['STRUCTURE '+str(i) for i in range(1, 4)] + header[4:]
header[0] = 'DSSP RESIDUE #'
header[1] = 'PDB RESIDUE #'
header[2] = 'PDB CHAIN ID'

['DSSP RESIDUE #',
 'PDB RESIDUE #',
 'PDB CHAIN ID',
 'AA',
 'STRUCTURE 1',
 'STRUCTURE 2',
 'STRUCTURE 3',
 'BP1',
 'BP2',
 'ACC',
 'N-H-->O',
 'O-->H-N',
 'N-H-->O',
 'O-->H-N',
 'TCO',
 'KAPPA',
 'ALPHA',
 'PHI',
 'PSI',
 'X-CA',
 'Y-CA',
 'Z-CA']

In [108]:
def row_splitter(l):
    
    ind = [(0, 5),     # known as 'DSSP RESIDUE #'
           (5, 10),    # known as 'PDB RESIDUE #'
           (10, 12),   # known as 'CHAIN ID'
           (12, 14),   # known as 'AA'
           (14, 17),   # known as 'SECONDARY STRUCTURE'
           (17, 19),   # known as '3-HELIX'
           (19, 20),   # known as '4-HELIX'
           (20, 21),   # known as '5-HELIX'
           (21, 22),   # known as 'BEND'
           (22, 23),   # known as 'CHIRALITY'
           (23, 24),   # known as 'BETA BRIDGE 1'
           (24, 25),   # known as 'BETA BRIDGE 2'
           (25, 29),   # known as 'BP1'
           (29, 33),   # known as 'BP2'
           (33, 34),   # known as 'BSL'
           (34, 38),   # known as 'ACC'
           (38, 45),   # known as N-H-->O BF1 I
           (46, 50),   # known as N-H-->O BF1 E
           (50, 56),   # known as O-->H-N BF1 I
           (57, 61),   # known as O-->H-N BF1 E
           (61, 67),   # known as N-H-->O BF2 I
           (68, 72),   # known as N-H-->O BF2 E
           (72, 78),   # known as O-->H-N BF2 I 
           (79, 83),   # known as O-->H-N BF2 E
           (83, 91),   # known as TCO
           (91, 97),   # known as KAPPA
           (97, 103),  # known as ALPHA
           (103, 109), # known as PHI
           (109, 115), # known as PSI
           (115, 122), # known as X-CA
           (122, 129), # known as Y-CA
           (129, 136)] # known as Z-CA
    
    row = []
    
    for s, e in ind: 
        entry = l[s:e]
        entry = entry.replace(' ', '')
        row.append(entry)
    
    return row

In [115]:
def make_header_row():
    row = []
    row.append('DSSP')
    row.append('PDB')
    row.append('CHAIN')
    row.append('AA')
    row.append('SS')
    row.append('3H')
    row.append('4H')
    row.append('5H')
    row.append('BEND')
    row.append('CHIR')
    row.append('BB1')
    row.append('BB2')
    row.append('BP1')
    row.append('BP2')
    row.append('BSL')
    row.append('ACC')
    row.append('NO1I')
    row.append('NO1E')
    row.append('ON1I')
    row.append('ON1E')
    row.append('NO2I')
    row.append('NO2E')
    row.append('ON2I')
    row.append('ON2E')
    row.append('TCO')
    row.append('KAPPA')
    row.append('ALPHA')
    row.append('PHI')
    row.append('PSI')
    row.append('X-CA')
    row.append('Y-CA')
    row.append('Z-CA')    
    return row

In [109]:
protein[0]

'#  RESIDUE AA STRUCTURE BP1 BP2  ACC     N-H-->O    O-->H-N    N-H-->O    O-->H-N    TCO  KAPPA ALPHA  PHI   PSI    X-CA   Y-CA   Z-CA '

In [117]:
make_header_row()

32

In [110]:
protein[142]

'  142  143 A D     >  -     0   0   22      1,-0.2     4,-2.2    -2,-0.2     5,-0.2  -0.365  43.7-136.9 -46.9 120.6   38.7   14.2   36.9'

In [119]:
row_splitter(protein[142])

['142',
 '143',
 'A',
 'D',
 '',
 '',
 '>',
 '',
 '',
 '-',
 '',
 '',
 '0',
 '0',
 '',
 '22',
 '1',
 '-0.2',
 '4',
 '-2.2',
 '-2',
 '-0.2',
 '5',
 '-0.2',
 '-0.365',
 '43.7',
 '-136.9',
 '-46.9',
 '120.6',
 '38.7',
 '14.2',
 '36.9']

### Trying a harder example

In [112]:
hard_ex = '  355    8 B T  E 3  S+mn 434 379G   0     78,-2.4    80,-1.5    -2,-0.5    25,-0.2  -0.608  80.2  11.2 -80.1 146.3   39.6   17.5   10.4'

In [113]:
hard_ex[72:79]

'    25,'

In [114]:
row_splitter(hard_ex)

['355',
 '8',
 'B',
 'T',
 'E',
 '3',
 '',
 '',
 'S',
 '+',
 'm',
 'n',
 '434',
 '379',
 'G',
 '0',
 '78',
 '-2.4',
 '80',
 '-1.5',
 '-2',
 '-0.5',
 '25',
 '-0.2',
 '-0.608',
 '80.2',
 '11.2',
 '-80.1',
 '146.3',
 '39.6',
 '17.5',
 '10.4']

### Running on the whole protein

In [120]:
clean_protein = [row_splitter(r) for r in protein[1:]]

In [121]:
protein_df = pd.DataFrame(clean_protein, columns=make_header_row(), index=range(1, len(clean_protein)+1))

In [122]:
protein_df.head()

Unnamed: 0,DSSP,PDB,CHAIN,AA,SS,3H,4H,5H,BEND,CHIR,...,ON2I,ON2E,TCO,KAPPA,ALPHA,PHI,PSI,X-CA,Y-CA,Z-CA
1,1,2,A,Y,,,,,,,...,337,-0.0,0.0,360.0,360.0,360.0,125.9,36.3,3.3,32.0
2,2,3,A,D,,,,,,-,...,4,-0.2,-0.801,360.0,-178.6,-89.2,117.9,32.9,4.9,32.2
3,3,4,A,F,,,,,,+,...,396,-0.1,0.306,61.4,93.2,-98.8,9.3,33.1,8.6,31.2
4,4,5,A,K,S,,,,S,+,...,395,-0.1,0.799,97.1,27.7,-70.0,-29.0,29.4,9.2,31.7
5,5,6,A,K,S,,,,S,+,...,-2,-0.1,0.456,87.5,120.8,-117.1,0.8,29.9,10.4,35.3


### Now run on all of the DSSP files

In [37]:
topdir, _, files = next(os.walk('../dssp.data/'))

In [43]:
files[1][:files[0].find('.')]

'1b3n'

In [50]:
for i, fi in enumerate(files):
    # show progress
    print(i, sep='', end='\r', flush=True)
    # open the .dssp file
    with open(topdir+fi) as f:
        data = f.read()
    # find the sequence section
    protein = data[data.find('#'):].split('\n')
    # go through and get each column of the row of data
    clean_protein = [row_splitter(r) for r in protein[1:]]
    # construct the dataframe
    protein_df = pd.DataFrame(clean_protein, columns=header, index=range(1, len(clean_protein)+1))
    # construct filename
    fn = '../dssp_csv/' + fi[:fi.find('.')] + '.csv'
    # write to csv
    protein_df.to_csv(fn, index=False)



In [51]:
ex_df = pd.read_csv('../dssp_csv/1i3n.csv')
ex_df.head()

Unnamed: 0,DSSP RESIDUE #,PDB RESIDUE #,PDB CHAIN ID,AA,STRUCTURE 1,STRUCTURE 2,STRUCTURE 3,BP1,BP2,ACC,...,N-H-->O.1,O-->H-N.1,TCO,KAPPA,ALPHA,PHI,PSI,X-CA,Y-CA,Z-CA
0,1.0,2.0,A,A,,,,0,0.0,133,...,00.0,"79,-0.2",0.0,360.0,360.0,360.0,-1.4,-4.8,7.2,22.5
1,2.0,3.0,A,E,,,-,0,0.0,105,...,"77,-0.1","79,-0.2",-0.681,360.0,-126.5,-88.8,145.6,-1.4,8.7,21.7
2,3.0,4.0,A,K,E,,-,a81,0.0,A46,...,"-2,-0.3","80,-0.9",-0.3,2.4,-131.5,-88.8,174.4,1.3,7.7,24.2
3,4.0,5.0,A,V,E,,-,ab83,28.0,A0,...,"77,-0.2","2,-0.4",-0.993,20.7,-141.5,-123.8,122.5,3.8,9.6,26.3
4,5.0,6.0,A,L,E,,-,ab84,29.0,A0,...,"-2,-0.5","2,-0.5",-0.735,14.2,-164.6,-85.6,131.1,7.4,8.4,26.1
