# Extract secondary structure from PDB files easily

<a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons Licence" style="border-width:0" src="https://licensebuttons.net/l/by-sa/4.0/88x31.png" title='This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.' align="right"/></a>

Author: Dr Antonia Mey   
Email: antonia.mey@ed.ac.uk

In [50]:
# Imports
import pandas as pd
import numpy as np
import glob 
import os
import urllib
from collections import Counter

In [51]:
def download_alpha_fold_pdbs(uniprot_id_list, workdir = '.', model_version = 2):
    """ downloads AlphaFold model PDB files for a list of UniProt IDs
    Parameters:
    -----------
    uniprot_id_list : list of String
        UniProt accession IDs to download

    workdir : String
        directory the PDB files are written to; created if it does not exist

    model_version : int
        version number used in the AlphaFold file name
        (``AF-<id>-F1-model_v<version>.pdb``); defaults to 2

    Returns:
    --------
    None
        files are written to workdir as a side effect; IDs for which the
        server answers with an HTTP error are reported and skipped
    """
    # A bare `import urllib` does not guarantee that the `request` and `error`
    # submodules are loaded, so import them explicitly here.
    import urllib.error
    import urllib.request

    os.makedirs(workdir, exist_ok=True)

    total = len(uniprot_id_list)
    for counter, uniprot_id in enumerate(uniprot_id_list):
        print(f"At entry {counter}/{total}")
        print(f"ID: {uniprot_id}")
        pdb_name = f"AF-{uniprot_id}-F1-model_v{model_version}.pdb"
        download = f"https://alphafold.ebi.ac.uk/files/{pdb_name}"
        file_name = os.path.join(workdir, pdb_name)

        try:
            urllib.request.urlretrieve(download, file_name)
        except urllib.error.HTTPError:
            # Best effort: a missing entry must not abort the remaining downloads.
            print("No such file.")

In [52]:
def _parse_dssp_records(dssp_lines):
    """ extracts the per-residue secondary structure codes from DSSP output lines """
    secstruct = []
    in_records = False
    for line in dssp_lines:
        if in_records:
            # Lines too short to hold a structure code are skipped, as are
            # lines flagged with '!' at index 13 (DSSP break markers).
            if len(line) > 16 and line[13] != '!':
                code = line[16]
                secstruct.append("-" if code == " " else code)
        # The column header line contains '#'; residue records start after it.
        if "#" in line:
            in_records = True
    return secstruct


def get_sec_struct(fname, dssp_path):

    """ returns secondary structure of protein file
    Parameters:
    -----------
    fname : String
        name of alpha fold pdb file

    dssp_path : String
        path to the installation of dssp (the executable itself)

    Returns:
    --------
    secstruct : np.array
        array containing secondary structure information, one single
        character code per residue record in the DSSP output

    Raises:
    -------
    Exception
        if DSSP cannot be run or its output cannot be read

    Cheat sheet of secondary structure information:
    -----------------------------------------------
    H = α-helix
    B = residue in isolated β-bridge
    E = extended strand, participates in β ladder
    G = 3-helix (310 helix)
    I = 5 helix (π-helix)
    T = hydrogen bonded turn
    S = bend
    - = no assignment (blank in the DSSP output)
    """
    import subprocess
    import tempfile

    # call DSSP
    try:
        # A private temporary directory avoids clashes between concurrent runs
        # and guarantees clean-up even when something fails half-way.
        with tempfile.TemporaryDirectory() as tmp_dir:
            dssp_out = os.path.join(tmp_dir, "result.dssp")
            # Argument list instead of a shell string: paths containing spaces
            # or shell metacharacters are passed through unchanged.
            subprocess.check_call([dssp_path, fname, "-o", dssp_out])
            with open(dssp_out, "r") as fin:
                dssp_lines = fin.readlines()
    except Exception as e:
        raise Exception("Could not calculate secondary structure! %s"%e) from e

    # parse output
    return np.array(_parse_dssp_records(dssp_lines))

In [53]:
def select_high_confidence_structures(pdb_list, threshold=0.9):
    """ returns the pdb files whose mean per-atom confidence exceeds a threshold
    Parameters:
    -----------
    pdb_list : list of String
        paths to pdb files

    threshold : float
        minimum mean confidence given as a fraction; it is multiplied by 100
        before being compared with the scores in the file, so the default of
        0.9 corresponds to a cutoff of 90.0

    Returns:
    --------
    high_confidence_pdbs : pd.Series
        names of the files whose mean confidence is strictly above the cutoff
        (empty if pdb_list is empty)
    """
    cutoff = threshold * 100.0

    confidence_info = []
    for p in pdb_list:
        with open(p) as f:
            # The score is read from the second-to-last whitespace separated
            # field of every ATOM record.
            # NOTE(review): presumably the B-factor column holding the
            # AlphaFold confidence on a 0-100 scale; confirm for other inputs.
            confidence_score = [
                float(line.split()[-2]) for line in f if line.startswith('ATOM')
            ]
        confidence_info.append({
            'mean': np.mean(confidence_score),
            'std': np.std(confidence_score),
            'name': p,
        })

    if not confidence_info:
        # No input files: return an empty selection rather than failing on
        # the missing 'mean' column.
        return pd.Series([], dtype=object, name='name')

    confidence_df = pd.DataFrame.from_dict(confidence_info)
    is_confident = confidence_df['mean'] > cutoff
    high_confidence_pdbs = confidence_df.loc[is_confident, 'name']
    return high_confidence_pdbs

In [120]:
def get_alpha_helix_length_and_location(secondary_struc, min_length=6):
    """ returns the positions of all sufficiently long α-helices
    Parameters:
    -----------
    secondary_struc : sequence of String
        per-residue secondary structure codes, 'H' marking α-helix

    min_length : int
        minimum number of consecutive 'H' entries for a stretch to be
        reported; defaults to 6

    Returns:
    --------
    helix_regions : list of list of int
        one list per helix holding the consecutive 0-based indices into
        secondary_struc that belong to it, in order of appearance
    """
    helix_regions = []
    curr_helix = []
    for position, code in enumerate(secondary_struc):
        if code == 'H':
            curr_helix.append(position)
            continue
        # A non-helix residue closes the current run.
        if len(curr_helix) >= min_length:
            helix_regions.append(curr_helix)
        curr_helix = []
    # A helix running up to the last residue is judged by the same length
    # rule as any other helix.
    if len(curr_helix) >= min_length:
        helix_regions.append(curr_helix)
    return helix_regions

In [None]:
def check_if_helix_form_part_of_known_folds(helix_regions):
    """ placeholder for checking helices against known folds
    Parameters:
    -----------
    helix_regions : list of list of int
        helix index lists as returned by get_alpha_helix_length_and_location

    Raises:
    -------
    NotImplementedError
        always; the check has not been written yet
    """
    # A def without a body is a syntax error; fail loudly until implemented.
    raise NotImplementedError(
        "check_if_helix_form_part_of_known_folds is not implemented yet"
    )

In [55]:
# UniProt IDs of the proteins to process (these could also be read from a file).
# Example with several entries:
#uniprot_ids = ['B7HIJ2', 'F5WVJ2', 'C4LF00', 'O68891']
uniprot_ids = ['P01106']

In [56]:
# Download the AlphaFold structures for all IDs in uniprot_ids into the 'data' directory
download_alpha_fold_pdbs(uniprot_ids, workdir='data')

At entry 0/1
ID: P01106


In [57]:
# Collect the paths of all PDB files found directly inside the 'data' directory
pdbs_alpha = glob.glob('data/*.pdb')

In [60]:
# Now write everything out: one line per structure in the form
# "<uniprot id>,<secondary structure string>"
# Feel free to change file name
f_name = 'secondary_struc.csv'
# Location of the mkdssp executable on this machine; adjust for your installation
dssp_executable = '/Users/toni_brain/miniconda3/envs/dssp//bin/mkdssp'
secstruc_data = []
sec_struc_output = 0
# The with-block closes the file even if DSSP fails for one of the structures
with open(f_name, 'w') as f:
    for p in pdbs_alpha:
        sec_struc_output = get_sec_struct(p, dssp_executable)
        print(sec_struc_output)
        # Take the ID from the file name only, so that a '-' in a directory
        # name cannot shift the fields (files are named AF-<id>-F1-model_v2.pdb)
        uniprot_id = os.path.basename(p).split('-')[1]
        f.write(uniprot_id + ',' + ''.join(sec_struc_output) + '\n')

['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'S' 'S' 'S' 'S' 'S' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'T' 'T' 'S' 'T' 'T' 'T' 'G' 'G'
 'G' 'S' '-' 'S' '-' '-' '-' '-' '-' '-' 'G' 'G' 'G' 'T' 'T' 'T' 'T' 'T'
 'T' 'S' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-'
 'S' '-' '-' '-' 'S' '-' '-' '-' '-' '-' 'T' 'T' '-' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'S' 'T' 'T' 'T' 'S' '-' '-' '-' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' 'S' '-' 'S' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'T' 'T' 'S' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'T' 'T' '-' '-' '-' '-' 'H' 'H' 'H' 'H' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-

In [122]:
# Locate the α-helices in the secondary structure of the last processed PDB file
helix_regions = get_alpha_helix_length_and_location(sec_struc_output)


In [106]:
# Number of entries in the secondary structure array of the last processed PDB file
len(sec_struc_output)

439

## Playing around with loading JSON

In [43]:
import json

In [44]:
# Load the JSON document into a dictionary; the with-block makes sure the
# file handle is closed again once it has been read.
with open('test.json') as f:
    data = json.load(f)

In [27]:
# Show the top-level keys of the loaded JSON document
data.keys()

dict_keys(['entryType', 'primaryAccession', 'secondaryAccessions', 'uniProtkbId', 'entryAudit', 'annotationScore', 'organism', 'proteinExistence', 'proteinDescription', 'genes', 'comments', 'features', 'keywords', 'references', 'uniProtKBCrossReferences', 'sequence', 'extraAttributes'])

In [45]:
# Show the value stored under the 'primaryAccession' key
data['primaryAccession']

'P01106'

In [127]:
# Print the start and end value of the location of the second entry in data['features']
print(data['features'][1]['location']['start']['value'])
print(data['features'][1]['location']['end']['value'])

354
406


In [128]:
# Print the type of every feature; for the selected types also print the full record
for d in data['features']:
    print(d['type'])
    if d['type'] in ('Domain', 'Region', 'Motif', 'Helix'):
        print(d)

Chain
Domain
{'type': 'Domain', 'location': {'start': {'value': 354, 'modifier': 'EXACT'}, 'end': {'value': 406, 'modifier': 'EXACT'}}, 'description': 'bHLH', 'evidences': [{'evidenceCode': 'ECO:0000255', 'source': 'PROSITE-ProRule', 'id': 'PRU00981'}]}
Region
{'type': 'Region', 'location': {'start': {'value': 204, 'modifier': 'EXACT'}, 'end': {'value': 295, 'modifier': 'EXACT'}}, 'description': 'Disordered', 'evidences': [{'evidenceCode': 'ECO:0000256', 'source': 'SAM', 'id': 'MobiDB-lite'}]}
Region
{'type': 'Region', 'location': {'start': {'value': 413, 'modifier': 'EXACT'}, 'end': {'value': 434, 'modifier': 'EXACT'}}, 'description': 'Leucine-zipper'}
Motif
{'type': 'Motif', 'location': {'start': {'value': 100, 'modifier': 'EXACT'}, 'end': {'value': 108, 'modifier': 'EXACT'}}, 'description': '9aaTAD', 'evidences': [{'evidenceCode': 'ECO:0000269', 'source': 'PubMed', 'id': '34342803'}]}
Compositional bias
Modified residue
Modified residue
Modified residue
Modified residue
Modified res