In [1]:
from Bio.PDB import PDBParser, DSSP
import numpy as np

def DSSP_structure_extractor(pdb_file, dssp_exec=None):
    '''
    Use DSSP to assign secondary structure for the first model of a PDB file
    and collapse the DSSP codes onto a three-symbol alphabet.

    Parameters:
        pdb_file (str): Path to the PDB file. It is parsed with PDBParser and
            also handed to DSSP as its input file.
        dssp_exec (str, optional): Name or path of the DSSP executable
            (e.g. 'mkdssp' or '/usr/local/bin/mkdssp'). This is the
            platform-specific setting. When None (default) Biopython's own
            default executable lookup is used, exactly as before.

    Returns:
        np.ndarray: One label per residue reported by DSSP, where
            'H', 'P', 'G', 'I'   -> 'H'
            'B', 'E'             -> 'S'
            'T', 'S', '-', ' '   -> '-'

    Raises:
        KeyError: If DSSP reports a code that is not in the mapping above.
    '''

    p = PDBParser()
    structure = p.get_structure("PDB_file", pdb_file)
    # Only the first model of the structure is analysed.
    model = structure[0]

    # Only forward the executable when the caller asked for a specific one,
    # so the default call is unchanged.
    dssp_kwargs = {} if dssp_exec is None else {'dssp': dssp_exec}
    dssp = DSSP(model, pdb_file, **dssp_kwargs)

    simplify_dict = {'H' : 'H', 'P' : 'H', 'B': 'S', 'E': 'S', 'G': 'H', 'I': 'H', 'T': '-', 'S': '-', '-': '-', ' ': '-'}

    # Index 2 of each DSSP record is the secondary-structure code.
    secondary_struct = [simplify_dict[dssp[key][2]] for key in dssp.keys()]

    return np.asarray(secondary_struct)

In [2]:
# try:
from pdbfixer import PDBFixer
# except ImportError:
#     !conda install -c conda-forge pdbfixer

# try:
from openmm.app import PDBFile
# except ImportError:
#     !conda install -c omnia openmm


# Load the raw BPTI structure into PDBFixer.
fixer = PDBFixer(filename='validation_data/BPTI/1qlq.pdb')

# Locate missing residues and atoms first, then add the missing atoms.
fixer.findMissingResidues()
fixer.findMissingAtoms()
fixer.addMissingAtoms()

# Use a context manager so the output file is flushed and closed before
# later cells read 1qlq-fixed.pdb back in.
with open('validation_data/BPTI/1qlq-fixed.pdb', 'w') as fixed_pdb:
    PDBFile.writeFile(fixer.topology, fixer.positions, fixed_pdb)

In [3]:
# Simplified (H / S / -) secondary-structure labels for BPTI.
# NOTE(review): this reads the original 1qlq.pdb, while later cells read
# 1qlq-fixed.pdb - confirm both files yield the same number of residues.
bpti_ss = DSSP_structure_extractor('validation_data/BPTI/1qlq.pdb')

In [4]:
def remove_isolated_segments(secondary_structure, min_length=3):
    """
    Replace short 'H' or 'S' segments with the linker symbol '-'.

    A segment is a maximal run of identical labels. Every 'H' run shorter than
    ``min_length`` is blanked first, then every 'S' run shorter than
    ``min_length``. Because runs are maximal, a short run can never be
    directly adjacent to another run of the same label, so there is nothing to
    merge it with: short runs are always turned into '-'.

    Parameters:
        secondary_structure (np.array): Array of secondary structure labels
            ('H', 'S', '-'). It is not modified.
        min_length (int): Shortest run length that is kept. Defaults to 3.

    Returns:
        np.array: Copy of the input with short 'H'/'S' runs replaced by '-'.
    """
    def blank_short_runs(seq, char):
        # Walk the array once, measuring each maximal run of `char`.
        i = 0
        n = len(seq)
        while i < n:
            if seq[i] != char:
                i += 1
                continue
            start = i
            while i < n and seq[i] == char:
                i += 1
            # i now points one past the end of the run.
            if i - start < min_length:
                seq[start:i] = '-'

    # Copy the input array to avoid modifying the original
    modified_structure = np.copy(secondary_structure)

    # Helices first, then sheets (the two passes touch disjoint positions).
    for label in ('H', 'S'):
        blank_short_runs(modified_structure, label)

    return modified_structure


In [5]:
def remove_isolated_elements(secondary, struct_sym):

    """
    Replace isolated (single-residue) occurrences of a structure symbol with
    the linker symbol '-'.

    An occurrence is isolated when neither of its neighbours carries the same
    symbol. Runs of two or more are left untouched.

        Example (struct_sym='H')

        ---H---SSSSS--- =:> -------SSSSS---

        Example (struct_sym='S')

        HHHHH---S-----S =:> HHHHH----------

    NOTE(review): it is unclear how a single-residue helix arises upstream,
    given that a helix assignment spans several CAs - worth confirming.

    Parameters:
    secondary (numpy.ndarray): 1-D array (or sequence) of secondary structure
        symbols (HS- format). It is not modified; a copy is returned.
    struct_sym (str): The structural symbol to be targeted for removal.

    Returns:
    numpy.ndarray: Copy of ``secondary`` with isolated ``struct_sym`` entries
        replaced by '-'.
    """

    # Work on a copy so the caller's array (often already displayed in an
    # earlier notebook cell) keeps its original contents.
    labels = np.array(secondary, copy=True)

    is_sym = labels == struct_sym

    # Pad with False on both ends so the first and last positions have
    # well-defined "neighbours" and empty input needs no special case.
    padded = np.concatenate(([False], is_sym, [False]))
    isolated = padded[1:-1] & ~padded[:-2] & ~padded[2:]

    labels[isolated] = '-'

    return labels

In [6]:
# Show the simplified DSSP labels held in bpti_ss.
bpti_ss

array(['-', '-', 'H', 'H', 'H', 'H', '-', 'H', 'H', '-', '-', '-', '-',
       '-', '-', '-', '-', 'S', 'S', 'S', 'S', 'S', 'S', 'S', '-', '-',
       '-', '-', 'S', 'S', 'S', 'S', 'S', 'S', 'S', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', 'S', '-', '-', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', '-', '-', '-'], dtype='<U1')

In [7]:
# Drop single-residue sheet and helix assignments.
# Pass a copy so bpti_ss keeps the raw DSSP labels whether or not the helper
# modifies its argument; this also makes the cell safe to re-run.
ss = remove_isolated_elements(bpti_ss.copy(), 'S')
ss = remove_isolated_elements(ss, 'H')
ss

array(['-', '-', 'H', 'H', 'H', 'H', '-', 'H', 'H', '-', '-', '-', '-',
       '-', '-', '-', '-', 'S', 'S', 'S', 'S', 'S', 'S', 'S', '-', '-',
       '-', '-', 'S', 'S', 'S', 'S', 'S', 'S', 'S', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', '-', '-', '-'], dtype='<U1')

In [8]:
import CarbonaraDataTools as cdt

In [9]:
# From here on the fixed PDB written by the PDBFixer cell is used.
pdb_file = 'validation_data/BPTI/1qlq-fixed.pdb'

# Unpack the four values returned by cdt.pull_structure_from_pdb.
# NOTE(review): going by the names these are per-chain coordinates, sequences,
# secondary structure and missing residues - confirm against CarbonaraDataTools.
coords_chains, sequence_chains, secondary_structure_chains, missing_residues_chains = cdt.pull_structure_from_pdb(pdb_file) 

In [10]:
# Load the fixed PDB through cdt.pdb_2_biobox.
M = cdt.pdb_2_biobox(pdb_file)
# Distinct values of the 'chain' column (not used by the cells shown here).
chains = M.data['chain'].unique()
# Boolean mask selecting rows whose atom name is 'CA'.
ca_cond = M.data['name']=='CA'

# CA rows of the first entry in M.coordinates.
# NOTE(review): presumably index 0 is the first model/frame - confirm.
coords = M.coordinates[0][ca_cond]

In [11]:
# Hand the CA coordinates to cdt.write_coordinates_file for the BPTI folder.
# The return value is kept because cdt.auto_select_varying_linker takes it later.
# NOTE(review): presumably the path of the written file - confirm.
coords_file = cdt.write_coordinates_file(coords, working_path='validation_data/BPTI/', carb_index=1)

In [12]:
# Call cdt.write_mixture_file for the BPTI working folder and keep its return value.
mixture_file = cdt.write_mixture_file(working_path='validation_data/BPTI/')

In [13]:
# Write the fingerprint for a single chain. The cleaned labels `ss` are wrapped
# in a one-element list, while `sequence_chains` is passed as returned by
# cdt.pull_structure_from_pdb.
fingerprint_file = cdt.write_fingerprint_file(number_chains = 1, sequence = sequence_chains,
                           secondary_structure = [ss], working_path = 'validation_data/BPTI/')

In [14]:
# Sanity check: number of residues in the cleaned label array.
ss.shape

(58,)

In [15]:
# Let cdt.auto_select_varying_linker choose linker indices from the coordinates
# and fingerprint files.
# NOTE: a later cell overwrites allowed_linkers_indices with a hard-coded list.
allowed_linkers_indices = cdt.auto_select_varying_linker(coords_file, fingerprint_file)

100%|██████████| 6/6 [00:01<00:00,  3.93it/s]


In [23]:
# Manual override: replaces the value returned by cdt.auto_select_varying_linker.
# NOTE(review): the reason for choosing these three indices is not recorded - confirm.
allowed_linkers_indices = [4,8,10]
# Writing the varysections file is currently disabled:
# varying_file = cdt.write_varysections_file(allowed_linkers_indices, 'validation_data/BPTI/')

In [24]:
def visualise_varying_sections(coords_chains, secondary_structure_chains, varying_linker_chains, chain_index=1):
    """
    Show an interactive plot of one chain with its varying linker sections.

    The entry at ``chain_index`` is taken from each of the three per-chain
    inputs, passed through ``cdt.smooth_me_varying`` (oversample=5), and the
    resulting x/y/z/colour lists are drawn with ``cdt.line_plotly``.

    Parameters:
        coords_chains: Indexable of per-chain coordinates.
        secondary_structure_chains: Indexable of per-chain secondary structure labels.
        varying_linker_chains: Indexable of per-chain varying linker indices.
        chain_index (int): Which entry of the three inputs to plot. Defaults to 1.

    Returns:
        None. The figure is displayed with a height of 600.
    """
    # Pick out the data for the requested chain.
    chain_coords = coords_chains[chain_index]
    chain_ss = secondary_structure_chains[chain_index]
    chain_linkers = varying_linker_chains[chain_index]

    xs, ys, zs, colours = cdt.smooth_me_varying(
        chain_coords,
        chain_ss,
        chain_linkers,
        oversample=5,
    )

    fig = cdt.line_plotly(xs, ys, zs, colours, outline=True)
    fig.update_layout(height=600)
    fig.show()

# Plot the single BPTI chain: the cleaned labels and the chosen linker indices
# are each wrapped in a one-element list, so chain_index must be 0.
visualise_varying_sections(coords_chains, [ss], [allowed_linkers_indices], 0)

In [18]:
# Show the cleaned secondary-structure labels.
ss

array(['-', '-', 'H', 'H', 'H', 'H', '-', 'H', 'H', '-', '-', '-', '-',
       '-', '-', '-', '-', 'S', 'S', 'S', 'S', 'S', 'S', 'S', '-', '-',
       '-', '-', 'S', 'S', 'S', 'S', 'S', 'S', 'S', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', '-', '-', '-'], dtype='<U1')