# Extract secondary structure from PDB files easily

<a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons Licence" style="border-width:0" src="https://licensebuttons.net/l/by-sa/4.0/88x31.png" title='This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.' align="right"/></a>

Author: Dr Antonia Mey   
Email: antonia.mey@ed.ac.uk

In [4]:
# Imports
import pandas as pd
import numpy as np
import glob 
import os
import urllib
from collections import Counter

In [None]:
def download_alpha_fold_pdbs(uniprot_id_list, workdir = '.'):
    """ downloads the AlphaFold model PDB file for each UniProt ID
    Parameters:
    -----------
    uniprot_id_list : list of String
        UniProt IDs to fetch

    workdir : String
        directory the PDB files are written to; it is created if missing

    Notes:
    ------
    Each file is fetched from
    https://alphafold.ebi.ac.uk/files/AF-<id>-F1-model_v2.pdb and stored under
    the same file name inside workdir. Progress is printed for every entry.
    An ID for which the server answers with an HTTP error is reported with
    "No such file." and skipped; other errors propagate to the caller.
    """
    # A bare `import urllib` does not load these submodules, so import them
    # explicitly instead of relying on another package having done so.
    import urllib.error
    import urllib.request

    # urlretrieve cannot create the target directory itself.
    if workdir:
        os.makedirs(workdir, exist_ok=True)

    total = len(uniprot_id_list)
    for counter, uniprot_id in enumerate(uniprot_id_list):
        print(f"At entry {counter}/{total}")
        print(f"ID: {uniprot_id}")
        pdb_name = f'AF-{uniprot_id}-F1-model_v2.pdb'
        download = f"https://alphafold.ebi.ac.uk/files/{pdb_name}"
        file_name = os.path.join(workdir, pdb_name)

        try:
            urllib.request.urlretrieve(download, file_name)
        except urllib.error.HTTPError:
            print("No such file.")

In [None]:
def _parse_dssp_lines(lines):
    """ collects the one-letter secondary structure codes from DSSP output lines """
    secstruct = []
    in_residue_table = False
    for line in lines:
        if in_residue_table:
            record = line.rstrip("\r\n")
            # Skip truncated lines and chain-break markers ('!' or '!*' in
            # column 14); a blank code in column 17 is reported as '-'.
            if len(record) > 16 and record[13] != "!":
                ss = record[16]
                secstruct.append("-" if ss == " " else ss)

        # The residue table starts after the header line containing '#'.
        if "#" in line:
            in_residue_table = True
    return secstruct


def get_sec_struct(fname, dssp_path):

    """ returns secondary structure of protein file
    Parameters:
    -----------
    fname : String
        name of alpha fold pdb file

    dssp_path : String
        path to the dssp executable

    Returns:
    --------
    secstruct : np.array
        array containing secondary structure information, one entry per
        residue line of the DSSP output ('-' where DSSP assigns no code)

    Raises:
    -------
    Exception
        if DSSP cannot be run or its output cannot be read

    Cheat sheet of secondary structure information:
    -----------------------------------------------
    H = α-helix
    B = residue in isolated β-bridge
    E = extended strand, participates in β ladder
    G = 3-helix (310 helix)
    I = 5 helix (π-helix)
    T = hydrogen bonded turn
    S = bend
    """
    import subprocess
    import tempfile

    # Write the DSSP output into a private temporary directory so that runs
    # cannot clobber each other and the file is removed even on failure.
    with tempfile.TemporaryDirectory() as tmp_dir:
        dssp_output = os.path.join(tmp_dir, "result.dssp")
        try:
            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through verbatim.
            subprocess.check_call([dssp_path, fname, "-o", dssp_output])
            with open(dssp_output, "r") as fin:
                secstruct = _parse_dssp_lines(fin)
        except (OSError, subprocess.SubprocessError) as e:
            raise Exception("Could not calculate secondary structure! %s"%e) from e

    return np.array(secstruct)

In [None]:
def select_high_confidence_structures(pdb_list, threshold=0.9):
    """ returns the pdb files whose mean per-atom confidence exceeds a threshold
    Parameters:
    -----------
    pdb_list : list of String
        paths of alpha fold pdb files

    threshold : float
        minimum mean confidence as a fraction of the 0-100 score stored in
        the B-factor column; the default 0.9 keeps files with a mean score
        above 90

    Returns:
    --------
    high_confidence_pdbs : pd.Series
        names of the files that pass the filter, indexed by their position
        in pdb_list
    """
    cutoff = threshold * 100.0

    confidence_info = []
    for pdb_file in pdb_list:
        with open(pdb_file) as f:
            # Read the B-factor from its fixed columns (61-66) rather than by
            # whitespace splitting: adjacent fields may touch, e.g. an
            # occupancy of 1.00 followed by a score of 100.00.
            confidence_score = [
                float(line[60:66]) for line in f if line.startswith('ATOM')
            ]
        has_scores = len(confidence_score) > 0
        confidence_info.append({
            # Files without ATOM records get NaN and never pass the filter.
            'mean': np.mean(confidence_score) if has_scores else np.nan,
            'std': np.std(confidence_score) if has_scores else np.nan,
            'name': pdb_file,
        })

    if not confidence_info:
        return pd.Series([], dtype=object, name='name')

    confidence_df = pd.DataFrame(confidence_info, columns=['mean', 'std', 'name'])
    filtering = confidence_df['mean'] > cutoff
    high_confidence_pdbs = confidence_df.loc[filtering, 'name']
    return high_confidence_pdbs

In [None]:
# UniProt IDs of the proteins to process (these could also be read from a file)
uniprot_ids = [
    "B7HIJ2",
    "F5WVJ2",
    "C4LF00",
    "O68891",
]

In [None]:
# Download alpha fold structures
# urlretrieve cannot create the target directory, so make sure it exists first
os.makedirs('data', exist_ok=True)
download_alpha_fold_pdbs(uniprot_ids, workdir='data')

In [None]:
# glob returns files in arbitrary order; sort so downstream output is reproducible
pdbs_alpha = sorted(glob.glob('data/*.pdb'))

In [5]:
# Now write everything out, one line per structure:
#   <uniprot id>,<secondary structure string>
# Feel free to change file name
f_name = 'secondary_struc.csv'
# Location of the DSSP executable; adjust to your installation
dssp_executable = '/srv/conda/envs/notebook/bin/mkdssp'
secstruc_data = []
# The context manager closes the file even if DSSP fails part-way through
with open(f_name, 'w') as f:
    for p in pdbs_alpha:
        sec_struc_output = get_sec_struct(p, dssp_executable)
        # Take the ID from the file name only, so a '-' in the directory
        # part of the path cannot shift the fields
        uniprot_id = os.path.basename(p).split('-')[1]
        row = uniprot_id + ',' + ''.join(sec_struc_output)
        secstruc_data.append(row)
        f.write(row + '\n')