Secondary structure metrics with BioPython/DSSP 

In [10]:
import os

from Bio import BiopythonWarning
import warnings

# Silence BioPython warnings for the rest of the session.
# The filter is installed at module level on purpose: when it is set inside a
# `warnings.catch_warnings()` block, the previous filter list is restored as
# soon as the block exits, so nothing executed afterwards is affected by it.
warnings.simplefilter('ignore', BiopythonWarning)

from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
from Bio.PDB.DSSP import ss_to_index

import functools
from collections import Counter
import pandas as pd
import numpy as np

from pathlib import Path
#from multiprocessing import Pool

In [11]:
#cpus = 80

In [None]:
# Name of the dataset whose structures are processed in this notebook.
dataset_name = 'vhh_tsd'

# Directory holding that dataset's structure files, and the names of the
# entries found in it.
structure_path = f'/reduced_strucs/reduced_{dataset_name}/'
structures = os.listdir(structure_path)

In [23]:
# Secondary-structure codes that are always present in the result, with a
# count of 0 when DSSP did not assign them to any residue.
_SS_CODES = ('H', 'B', 'E', 'G', 'I', 'T', 'S', '-')

# Names given, in order, to the per-residue DSSP columns from index 3 onwards.
_DSSP_VALUE_KEYS = (
    'relative_ASA', 'phi', 'psi', 'NH_O_1_relidx',
    'NH_O_1_energy', 'O_NH_1_relidx', 'O_NH_1_energy',
    'NH_O_2_relidx', 'NH_O_2_energy', 'O_NH_2_relidx',
    'O_NH_2_energy',
)


def dssp_to_dict(structure, path = structure_path):
    """Summarise the DSSP output for one structure file as a flat dict.

    Parameters
    ----------
    structure : str
        File name of the structure, relative to ``path``.
    path : str
        Directory that contains ``structure``.

    Returns
    -------
    dict
        First the number of residues per secondary-structure code (the eight
        codes in ``_SS_CODES`` are always present; any other code found in
        the DSSP output is appended), then the mean over all residues of each
        column named in ``_DSSP_VALUE_KEYS``.

        If anything goes wrong (parsing, running DSSP, numeric conversion),
        the error is printed and a dict with the same fixed keys, all set to
        0, is returned instead of raising.
    """
    try:
        # os.path.join works whether or not `path` ends with a separator.
        path_to_pdb = os.path.join(path, structure)

        # Only the first model of the file is analysed.
        model = PDBParser(QUIET=True).get_structure("", path_to_pdb)[0]
        dssp = DSSP(model, path_to_pdb)

        # One row per residue. Column 2 is used as the secondary-structure
        # code and columns 3+ as the numeric properties.
        data = np.array(list(dict(dssp).values()))

        # Start from zero counts so every expected code appears in the
        # output, then overwrite with the counts actually observed.
        ss_d = dict.fromkeys(_SS_CODES, 0)
        ss_d.update(Counter(data[..., 2]))

        # Per-column mean over residues. If any value cannot be parsed as a
        # float, astype() raises and the whole structure takes the fallback
        # path below.
        # NOTE(review): angles are averaged arithmetically, including any
        # placeholder value used for undefined phi/psi - confirm intended.
        values = np.mean(data[..., 3:].astype(np.float32), axis=0)
        ss_d.update(zip(_DSSP_VALUE_KEYS, values))

        return ss_d

    except Exception as err:
        # Best effort: one bad structure must not abort the whole batch.
        # The cause is printed so failures can be told apart and diagnosed.
        print('Error:', structure[:-4], repr(err))
        return dict.fromkeys(_SS_CODES + _DSSP_VALUE_KEYS, 0)

In [24]:
# Map each structure ID (the file name minus its last four characters) to the
# DSSP summary computed for that file.
all_results = {
    fname[:-4]: dssp_to_dict(fname, path=structure_path)
    for fname in structures
}

In [26]:
# Outer keys (structure IDs) become columns and inner keys (metric names)
# become the row index; reset_index() then moves the metric names into an
# ordinary 'index' column.
df = pd.DataFrame(data=all_results)
df = df.reset_index()
#print(df.shape)

#df.insert(0, 'ID', [s[:-4] for s in structures])
# df.to_csv('./ABB_paired_dssp_2.csv', index=None)
#print(df.shape)

In [None]:
# Write the table to the working directory without the positional row index.
df.to_csv(path_or_buf='results_dssp.csv', index=False)