# Extract secondary structure from PDB files easily

<a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons Licence" style="border-width:0" src="https://licensebuttons.net/l/by-sa/4.0/88x31.png" title='This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.' align="right"/></a>

Author: Dr Antonia Mey   
Email: antonia.mey@ed.ac.uk

In [50]:
# Imports
import pandas as pd
import numpy as np
import glob 
import os
import urllib
from collections import Counter

In [52]:
def get_sec_struct(fname, dssp_path):
    """Return the per-residue secondary structure of a protein file.

    Runs the DSSP executable on ``fname`` and collects the one-letter
    secondary-structure code of every residue record in its output.

    Parameters:
    -----------
    fname : String
        name of alpha fold pdb file

    dssp_path : String
        path to the installation of dssp

    Returns:
    --------
    secstruct : np.array
        array containing secondary structure information, one entry per
        residue record; residues without an assignment are reported as '-'

    Raises:
    -------
    Exception
        if DSSP cannot be executed or its output file cannot be read

    Cheat sheet of secondary structure information:
    -----------------------------------------------
    H = α-helix
    B = residue in isolated β-bridge
    E = extended strand, participates in β ladder
    G = 3-helix (310 helix)
    I = 5 helix (π-helix)
    T = hydrogen bonded turn
    S = bend
    """
    import subprocess
    import tempfile

    # A private scratch directory keeps concurrent runs from clobbering each
    # other's output and is removed automatically, even when DSSP fails.
    with tempfile.TemporaryDirectory() as scratch_dir:
        result_file = os.path.join(scratch_dir, "result.dssp")
        try:
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact; expanduser keeps '~' paths working.
            subprocess.check_call([
                os.path.expanduser(dssp_path),
                os.path.expanduser(fname),
                "-o",
                result_file,
            ])
            with open(result_file, "r") as fin:
                dssp_lines = fin.readlines()
        except Exception as e:
            raise Exception("Could not calculate secondary structure! %s" % e) from e

    return np.array(_parse_dssp_secstruct(dssp_lines))


def _parse_dssp_secstruct(lines):
    """Extract the secondary-structure code of each residue record from DSSP output lines."""
    codes = []
    in_records = False
    for line in lines:
        if in_records:
            # Records shorter than 17 characters carry no structure column,
            # and a '!' in column 13 marks a break rather than a residue.
            if len(line) > 16 and line[13] != "!":
                codes.append("-" if line[16] == " " else line[16])
        # The residue table starts on the line after the '#' header line.
        if "#" in line:
            in_records = True
    return codes

In [53]:
def select_high_confidence_structures(pdb_list, threshold=0.9):
    """Select the PDB files whose mean per-atom confidence exceeds a cutoff.

    The confidence of an atom is read from the second-to-last
    whitespace-separated field of every ``ATOM`` record (presumably the
    pLDDT score that AlphaFold stores in the B-factor column, on a 0-100
    scale -- confirm for other file sources).

    Parameters:
    -----------
    pdb_list : list of String
        paths of the pdb files to inspect

    threshold : float
        cutoff on the mean confidence. Values up to 1 are read as a
        fraction of the 0-100 scale, so the default 0.9 gives the cutoff
        of 90.0 that used to be hard-coded; larger values are used as-is.

    Returns:
    --------
    high_confidence_pdbs : pd.Series
        names of the files whose mean confidence is strictly above the
        cutoff, indexed by their position in ``pdb_list``
    """
    confidence_info = []
    for pdb_file in pdb_list:
        with open(pdb_file) as handle:
            scores = [
                float(line.split()[-2])
                for line in handle
                if line.startswith('ATOM')
            ]
        # A file without ATOM records gets NaN statistics and is therefore
        # never selected (NaN compares False against any cutoff).
        confidence_info.append({
            'mean': np.mean(scores) if scores else np.nan,
            'std': np.std(scores) if scores else np.nan,
            'name': pdb_file,
        })

    if not confidence_info:
        return pd.Series([], dtype=object, name='name')

    confidence_df = pd.DataFrame(confidence_info, columns=['mean', 'std', 'name'])
    cutoff = threshold * 100.0 if threshold <= 1 else threshold
    high_confidence_pdbs = confidence_df.loc[confidence_df['mean'] > cutoff, 'name']
    return high_confidence_pdbs

In [252]:
def get_alpha_helix_length_and_location(secondary_struc, min_length=7):
    """Locate every alpha helix of at least ``min_length`` residues.

    Parameters:
    -----------
    secondary_struc : sequence of String
        one secondary-structure code per residue; 'H' marks an alpha helix

    min_length : int
        minimum number of consecutive 'H' residues for a helix to be kept

    Returns:
    --------
    helix_regions : list of list of int
        one list per helix holding the consecutive 0-based indices of its
        residues, in order of appearance
    """
    helix_regions = []
    run_start = None
    # The trailing None acts as a sentinel so that a helix reaching the
    # final residue is closed by the same code path as any other helix.
    for position, code in enumerate(list(secondary_struc) + [None]):
        if code == 'H':
            if run_start is None:
                run_start = position
        elif run_start is not None:
            if position - run_start >= min_length:
                helix_regions.append(list(range(run_start, position)))
            run_start = None
    return helix_regions

In [None]:
def get_number_of_serines(sequence):
    """Return how many serine residues ``sequence`` contains.

    Parameters:
    -----------
    sequence : String
        amino-acid sequence in one-letter code

    Returns:
    --------
    n_serine : int
        number of occurrences of 'S' (serine) in ``sequence``
    """
    return sequence.count('S')

In [None]:
def check_if_helix_form_part_of_known_folds(helix_regions):
    """Placeholder: check whether helices belong to known folds.

    The function was declared without a body, which is a syntax error.
    Until the check is written it fails loudly instead.

    Parameters:
    -----------
    helix_regions : list
        helix regions to check (presumably the output of
        get_alpha_helix_length_and_location -- TODO confirm)

    Raises:
    -------
    NotImplementedError
        always, because the check has not been implemented yet
    """
    raise NotImplementedError(
        "check_if_helix_form_part_of_known_folds is not implemented yet"
    )

In [55]:
# UniProt accessions to process (these could also be read from a file).
# Alternative set of accessions, currently disabled:
#uniprot_ids = ['B7HIJ2', 'F5WVJ2', 'C4LF00', 'O68891']
uniprot_ids = ['P01106']

In [56]:
# Download the AlphaFold structure of every accession in uniprot_ids
# into the 'data' directory.
download_alpha_fold_pdbs(uniprot_ids, workdir='data')

At entry 0/1
ID: P01106


In [57]:
# Collect the paths of all PDB files found in the 'data' directory.
pdbs_alpha = glob.glob('data/*.pdb')

In [60]:
# Now write everything out: one line per structure in the form
# <UniProt accession>,<secondary-structure string>
# Feel free to change file name
f_name = 'secondary_struc.csv'
secstruc_data = []
sec_struc_output = 0
# The context manager closes the file even if DSSP fails part-way through.
with open(f_name, 'w') as f:
    for p in pdbs_alpha:
        sec_struc_output = get_sec_struct(p, '/Users/toni_brain/miniconda3/envs/dssp//bin/mkdssp')
        print(sec_struc_output)
        #check_alpha_helix_of_length_seven_or_more(sec_struc_output)
        # Files are named AF-<accession>-F1-model_<version>.pdb; split only the
        # file name so that a hyphen in a directory name cannot shift the fields.
        uniprot_id = os.path.basename(p).split('-')[1]
        f.write(uniprot_id + ',' + ''.join(sec_struc_output) + '\n')

['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'S' 'S' 'S' 'S' 'S' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'T' 'T' 'S' 'T' 'T' 'T' 'G' 'G'
 'G' 'S' '-' 'S' '-' '-' '-' '-' '-' '-' 'G' 'G' 'G' 'T' 'T' 'T' 'T' 'T'
 'T' 'S' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-'
 'S' '-' '-' '-' 'S' '-' '-' '-' '-' '-' 'T' 'T' '-' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'S' 'T' 'T' 'T' 'S' '-' '-' '-' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' 'S' '-' 'S' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'T' 'T' 'S' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'T' 'T' '-' '-' '-' '-' 'H' 'H' 'H' 'H' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-

In [122]:
# Helix regions of the most recently processed structure, using the
# function's default minimum helix length.
helix_regions = get_alpha_helix_length_and_location(sec_struc_output)


In [106]:
# Number of residues that received a secondary-structure code.
len(sec_struc_output)

439

## Playing around with loading json

In [43]:
import json

In [44]:
# Parse the JSON file into a dictionary; the context manager
# closes the file as soon as it has been read.
with open('test.json') as f:
    data = json.load(f)

In [27]:
# Top-level keys of the parsed JSON record.
data.keys()

dict_keys(['entryType', 'primaryAccession', 'secondaryAccessions', 'uniProtkbId', 'entryAudit', 'annotationScore', 'organism', 'proteinExistence', 'proteinDescription', 'genes', 'comments', 'features', 'keywords', 'references', 'uniProtKBCrossReferences', 'sequence', 'extraAttributes'])

In [45]:
# Primary accession stored in the record.
data['primaryAccession']

'P01106'

In [127]:
# Start and end position of the second entry (index 1) in the feature list.
print(data['features'][1]['location']['start']['value'])
print(data['features'][1]['location']['end']['value'])

354
406


In [160]:
# Gather the annotated features of the record, grouped by feature type.
# Each entry is a [start, end, description] triple. Only Domain, Region
# and Motif features are collected; Helix features are currently ignored.
known_things = {feature_type: [] for feature_type in ('Domain', 'Region', 'Motif')}

for d in data['features']:
    bucket = known_things.get(d['type'])
    if bucket is None:
        # not one of the feature types we keep track of
        continue
    span = d['location']
    bucket.append([span['start']['value'], span['end']['value'], d['description']])

In [161]:
# Display the collected features.
known_things

{'Domain': [[354, 406, 'bHLH']],
 'Region': [[204, 295, 'Disordered'], [413, 434, 'Leucine-zipper']],
 'Motif': [[100, 108, '9aaTAD']]}

In [162]:
# now we want to match up the known information with the predicted helices:

In [163]:
def closest_value(input_list, input_value):
    """Return the element of ``input_list`` that lies nearest to ``input_value``.

    Parameters:
    -----------
    input_list : array-like of numbers
        candidate values

    input_value : number
        value to compare the candidates against

    Returns:
    --------
    closest : number
        the candidate with the smallest absolute difference to
        ``input_value``; on a tie the first such candidate is returned
    """
    candidates = np.asarray(input_list)
    distances = np.abs(candidates - input_value)
    return candidates[distances.argmin()]

In [168]:
# Compare every detected helix with every known feature and report whether
# the feature starts or ends inside the helix, or lies within five positions
# of it at both ends.
# NOTE(review): helix entries are 0-based indices while the feature positions
# are presumably 1-based residue numbers -- confirm before relying on exact
# matches.
for key, features in known_things.items():
    print(key, features)
    for items in features:
        for helix in helix_regions:
            print('helix to check is:',helix)
            # distance from the feature start / end to the nearest helix index
            difference_beginning = np.abs(closest_value(helix,items[0])-items[0])
            difference_end = np.abs(closest_value(helix,items[1])-items[1])
            print(difference_beginning)
            print(difference_end)
            if items[0] in helix:
                ## check if begin and end are in list
                print('begginning is part of helix')
            elif items[1] in helix:
                print('end is part of helix')
            # Both distances must be compared with the tolerance; the previous
            # `a and b <= 5` only tested whether `a` was non-zero.
            elif difference_beginning <= 5 and difference_end <= 5:
                print('helix close to known starts')
                print(items[0], items[1])
    print(10*'=')

            

Domain [[354, 406, 'bHLH']]
helix to check is: [94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106]
248
300
helix to check is: [121, 122, 123, 124, 125, 126, 127, 128, 129, 130]
224
276
helix to check is: [139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153]
201
253
helix to check is: [175, 176, 177, 178, 179, 180, 181, 182, 183]
171
223
helix to check is: [347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377]
0
29
begginning is part of helix
helix to check is: [391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436]
37
0
end is part of helix
Region [[204, 295, 'Disordered'], [413, 434, 'Leucine-zipper']]
helix to check is: [94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106]
98
1

In [169]:
# Display the full parsed JSON record.
data

{'entryType': 'UniProtKB reviewed (Swiss-Prot)',
 'primaryAccession': 'P01106',
 'secondaryAccessions': ['A8WFE7', 'P01107', 'Q14026'],
 'uniProtkbId': 'MYC_HUMAN',
 'entryAudit': {'firstPublicDate': '1986-07-21',
  'lastAnnotationUpdateDate': '2022-12-14',
  'lastSequenceUpdateDate': '1987-08-13',
  'entryVersion': 265,
  'sequenceVersion': 1},
 'annotationScore': 5.0,
 'organism': {'scientificName': 'Homo sapiens',
  'commonName': 'Human',
  'taxonId': 9606,
  'lineage': ['Eukaryota',
   'Metazoa',
   'Chordata',
   'Craniata',
   'Vertebrata',
   'Euteleostomi',
   'Mammalia',
   'Eutheria',
   'Euarchontoglires',
   'Primates',
   'Haplorrhini',
   'Catarrhini',
   'Hominidae',
   'Homo']},
 'proteinExistence': '1: Evidence at protein level',
 'proteinDescription': {'recommendedName': {'fullName': {'value': 'Myc proto-oncogene protein'}},
  'alternativeNames': [{'fullName': {'value': 'Class E basic helix-loop-helix protein 39'},
    'shortNames': [{'value': 'bHLHe39'}]},
   {'fullN

## Trying to download uniprot json files and alphafold files

In [170]:
import urllib

In [233]:
def download_uniprot_json_file(uni_prot_id, workdir = '.'):
    """Download the UniProt JSON record of a protein and save it to disk.

    Parameters:
    -----------
    uni_prot_id : String
        UniProt accession of the protein

    workdir : String
        directory in which ``<uni_prot_id>.json`` is written; it is
        created if it does not exist yet

    Raises:
    -------
    Exception
        if the record cannot be downloaded or written
    """
    # `import urllib` alone does not guarantee that the `request`
    # submodule is loaded, so import it explicitly.
    import urllib.request

    url_2 = 'https://www.uniprot.org/uniprot/' + uni_prot_id + '.json'
    target = os.path.join(workdir,uni_prot_id)+'.json'
    try:
        with urllib.request.urlopen(url_2) as html_2:
            payload = html_2.read()
        if workdir:
            os.makedirs(workdir, exist_ok=True)
        # now try and write to file
        with open(target, 'w', encoding='utf-8') as f:
            f.write(payload.decode('utf-8'))
    except Exception as e:
        raise Exception('Failed to obtain UNIPROT data. %s'%e) from e
    


In [51]:
def download_alpha_fold_pdbs(uniprot_id_list, workdir = '.', model_version = 'v2'):
    """Download the AlphaFold PDB model of every given UniProt accession.

    Progress is printed for each accession. An accession for which the
    server reports an HTTP error is reported and skipped.

    Parameters:
    -----------
    uniprot_id_list : list of String
        UniProt accessions to download

    workdir : String
        directory in which the files are stored as
        ``AF-<accession>-F1-model_<model_version>.pdb``

    model_version : String
        model version tag used in the file name and download URL
    """
    # `import urllib` alone does not guarantee that these submodules
    # are loaded, so import them explicitly.
    import urllib.error
    import urllib.request

    n_entries = len(uniprot_id_list)
    for counter, uniprot_id in enumerate(uniprot_id_list):
        print(f"At entry {counter}/{n_entries}")
        print(f"ID: {uniprot_id}")
        pdb_name = f'AF-{uniprot_id}-F1-model_{model_version}.pdb'
        download = f"https://alphafold.ebi.ac.uk/files/{pdb_name}"
        file_name = os.path.join(workdir, pdb_name)
        try:
            urllib.request.urlretrieve(download, file_name)
        except urllib.error.HTTPError:
            print("No such file.")

## Let's try the workflow on one uniprot id

In [236]:
# Download the UniProt JSON record and the AlphaFold structure
# of P01106 into the 'temp' directory.
download_uniprot_json_file('P01106', 'temp')
download_alpha_fold_pdbs(['P01106'], workdir ='temp')

At entry 0/1
ID: P01106


In [237]:
# Run DSSP on the downloaded AlphaFold file to get one
# secondary-structure code per residue.
p = 'temp/AF-P01106-F1-model_v2.pdb'
sec_struc_output = get_sec_struct(p, '/Users/toni_brain/miniconda3/envs/dssp//bin/mkdssp')
print(sec_struc_output)


['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'S' 'S' 'S' 'S' 'S' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'T' 'T' 'S' 'T' 'T' 'T' 'G' 'G'
 'G' 'S' '-' 'S' '-' '-' '-' '-' '-' '-' 'G' 'G' 'G' 'T' 'T' 'T' 'T' 'T'
 'T' 'S' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-'
 'S' '-' '-' '-' 'S' '-' '-' '-' '-' '-' 'T' 'T' '-' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'S' 'T' 'T' 'T' 'S' '-' '-' '-' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' 'S' '-' 'S' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' 'T' 'T' 'S' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'T' 'T' '-' '-' '-' '-' 'H' 'H' 'H' 'H' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-

In [256]:
# Process uniprot file
with open('temp/P01106.json') as f:
    data = json.load(f)
# Some basics:
uni_id = data['primaryAccession']
print(uni_id)
gene_name = data['genes'][0]['geneName']['value']
print(gene_name)
# 0 marks "no RefSeq cross-reference found"; when several RefSeq entries
# exist, the last one listed wins.
ref_seq_id = 0
# Read the cross-references from the record loaded above (`data`); the
# previous `new_data` is not defined anywhere in this workflow.
for db in data['uniProtKBCrossReferences']:
    if db['database'] == "RefSeq":
        ref_seq_id = (db['id'])
print(ref_seq_id)
sequence = data['sequence']['value']
print(sequence)

P01106
MYC
NP_002458.2
MPLNVSFTNRNYDLDYDSVQPYFYCDEEENFYQQQQQSELQPPAPSEDIWKKFELLPTPPLSPSRRSGLCSPSYVAVTPFSLRGDNDGGGGSFSTADQLEMVTELLGGDMVNQSFICDPDDETFIKNIIIQDCMWSGFSAAAKLVSEKLASYQAARKDSGSPNPARGHSVCSTSSLYLQDLSAAASECIDPSVVFPYPLNDSSSPKSCASQDSSAFSPSSDSLLSSTESSPQGSPEPLVLHEETPPTTSSDSEEEQEDEEEIDVVSVEKRQAPGKRSESGSPSAGGHSKPPHSPLVLKRCHVSTHQHNYAAPPSTRKDYPAAKRVKLDSVRVLRQISNNRKCTSPRSSDTEENVKRRTHNVLERQRRNELKRSFFALRDQIPELENNEKAPKVVILKKATAYILSVQAEEQKLISEEDLLRKRREQLKHKLEQLRNSCA


## TODO: check that `sequence` has the same length as `sec_struc_output`!

In [254]:
# Let's get the helix regions: one list of 0-based residue indices
# per detected alpha helix.
alpha_helix_index_list = get_alpha_helix_length_and_location(sec_struc_output, min_length=7)

In [259]:
# 1-based positions of the first and last residue of the first detected
# helix (the stored indices are 0-based, hence the +1).
firstAA_position_in_HELIDR = alpha_helix_index_list[0][0]+1
lastAA_position_in_HELIDR = alpha_helix_index_list[0][-1]+1

In [270]:
# Amino-acid sequence covered by the first helix; the +1 makes the
# slice include the last helix residue.
HELIDR_seq = sequence[alpha_helix_index_list[0][0]:alpha_helix_index_list[0][-1]+1]

In [303]:
# Up to ten residues on either side of the first helix.
# NOTE(review): index [-1]+1 is already the first residue after the helix,
# so starting at +2 also skips that residue -- confirm this gap is intended.
# Slicing past the end of the sequence is safe (the slice is just shorter).
down_stream_seq = sequence[alpha_helix_index_list[0][-1]+2:alpha_helix_index_list[0][-1]+12]
# Clamp the start at 0: a negative start would count from the end of the
# sequence and return the wrong (usually empty) slice for a helix that
# begins within ten residues of the N-terminus.
up_stream_seq = sequence[max(0, alpha_helix_index_list[0][0]-10):alpha_helix_index_list[0][0]]

In [312]:
# Show the two flanking sequences.
print('down_stream_seq',down_stream_seq)
print('up_stream_seq', up_stream_seq)

down_stream_seq DMVNQSFICD
up_stream_seq DNDGGGGSFS


In [315]:
def count_serines(sequence):
    """Flag whether ``sequence`` contains at least two serines.

    Parameters:
    -----------
    sequence : String
        amino-acid sequence in one-letter code

    Returns:
    --------
    flag : int
        1 if 'S' occurs two or more times in ``sequence``, otherwise 0
    """
    has_two_serines = sequence.count('S') >= 2
    return 1 if has_two_serines else 0

In [322]:
# 1/0 flags: do the first five downstream residues, and the upstream
# residues from index 5 onwards, contain at least two serines?
Two_S5P_down = count_serines(down_stream_seq[:5])
Two_S5P_up = count_serines(up_stream_seq[5:])

In [323]:
# Show the two serine flags.
print(Two_S5P_down)
print(Two_S5P_up)

0
1


In [311]:
# Display the upstream flanking sequence.
up_stream_seq

'DNDGGGGSFS'

In [297]:
# Secondary-structure codes of the first helix region, last residue included.
sec_struc_output[alpha_helix_index_list[0][0]:alpha_helix_index_list[0][-1]+1]

array(['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H'],
      dtype='<U1')