# Extract secondary structure from PDB files easily and generate spreadsheet

<a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons Licence" style="border-width:0" src="https://licensebuttons.net/l/by-sa/4.0/88x31.png" title='This work is licensed under a Creative Commons Attribution 4.0 International License.' align="right"/></a>

Author: Dr Antonia Mey   
Email: antonia.mey@ed.ac.uk

In [5]:
# Imports
import pandas as pd
import numpy as np
import glob 
import os
import urllib
from collections import Counter
import urllib

In [2]:
def get_sec_struct(fname, dssp_path):
    
    """ returns secondary structure of protein file
    Parameters:
    -----------
    fname : String
        name of alpha fold pdb file
        
    dssp_path : String
        path to the installation of dssp
        
    Returns:
    --------
    secstruct : np.array
        array containing secondary structure information
    
    Cheat sheet of secondary structure information:
    -----------------------------------------------
    H = α-helix
    B = residue in isolated β-bridge
    E = extended strand, participates in β ladder
    G = 3-helix (310 helix)
    I = 5 helix (π-helix)
    T = hydrogen bonded turn
    S = bend
    """
    
    # call DSSP
    try:
        import subprocess
        subprocess.check_call("%s %s -o result.dssp"%(dssp_path, fname), shell=True)
        fin=open("result.dssp","r")
    except Exception as e:
        raise Exception("Could not calculate secondary structure! %s"%e)

    # parse output
    readit=False
    secstruct=[]
    for line in fin:

        if readit:
            try:
                if line[13:15] == '!*' or line[13] == '!':
                    continue
                else:
                    ss = line[16]
                    if line[16] == " ":
                        ss = "-"

                    secstruct.append(ss)
            except:
                continue

        if "#" in line:
            readit=True

    fin.close()

    # clean temporary files
    os.remove("result.dssp")

    return np.array(secstruct)

In [3]:
def select_high_confidence_structures(pdb_list, threshold=0.9):
    
    confidence_info=[]
    for p in pdb_list:
        f = open(p)
        data = f.readlines()
        f.close()
        confidence_score = []
        conf_dic = {}
        for l in data:
            if l.startswith('ATOM'):
                curr_line =l.split()
                confidence_score.append(float(curr_line[-2]))
        conf_dic['mean'] = np.mean(confidence_score)
        conf_dic['std'] = np.std(confidence_score)
        conf_dic['name'] = p
        confidence_info.append(conf_dic)
    confidence_df = pd.DataFrame.from_dict(confidence_info)
    filtering = confidence_df["mean"]>90.0
    high_confidence_pdbs = confidence_df.where(filfilteringter).dropna()['name']
    return high_confidence_pdbs

In [40]:
def get_alpha_helix_length_and_location(secondary_struc, min_length=8):
    helix_regions = []
    counter = 0
    curr_helix = []
    indexes = [i for i, x in enumerate(list(secondary_struc)) if x == 'H']
    for i in range(len(indexes)-1):
        difference = indexes[i+1]-indexes[i]
        if difference == 1:
            curr_helix.append(indexes[i])
            if i == len(indexes)-2:
                if len(curr_helix)>=min_length-1:
                    curr_helix.append(indexes[i+1])
                    helix_regions.append(curr_helix)
        else:
            curr_helix.append(indexes[i])
            if len(curr_helix)>=min_length-1:
                helix_regions.append(curr_helix)
            curr_helix = []
    return helix_regions

In [41]:
def download_uniprot_json_file(uni_prot_id, workdir = '.'):
    #check if there is uniprot information available for the protein
    try:
        url_2 = 'https://www.uniprot.org/uniprot/' + uni_prot_id + '.json'
        html_2 = urllib.request.urlopen(url_2)
        lines = html_2.readlines()[0]
        # now try and write to file
        f = open(os.path.join(workdir,uni_prot_id)+'.json', 'w')
        f.write(lines.decode('utf-8'))
        f.close()
        
    except Exception as e:
        raise Exception('Failed to obtain UNIPROT data. %s'%e)
    

    return html_2

In [42]:
def download_alpha_fold_pdbs(uniprot_id_list, workdir = '.'):
    counter = 0
    for uniprot_id in uniprot_id_list:
        print(f"At entry {counter}/{len(uniprot_id_list)}")
        print(f"ID: {uniprot_id}")
        download = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v2.pdb"
        
        try:
            file_name = os.path.join(workdir,f'AF-{uniprot_id}-F1-model_v2.pdb')
            #file_name = f"../data/alphafold/pdb_files/AF-{uniprot_id}-F1-model_v2.pdb"
            urllib.request.urlretrieve(download, file_name)
        except urllib.error.HTTPError:
            print("No such file.")
        counter = counter+1

In [43]:
def count_serines(sequence):
    n_serine = sequence.count('S')
    if n_serine >=2:
        return 1
    else:
        return 0

In [44]:
import pandas as pd
import json

In [45]:
def generate_table_rows_for_uniprot_id(uni_prot_id, df, work_dir = 'temp'):
    print('Working on uniprot ID', uni_prot_id)
    # Downloading uniprot file and alphafold file
    download_uniprot_json_file(uni_prot_id, work_dir)
    download_alpha_fold_pdbs([uni_prot_id], workdir =work_dir)
    p = 'temp/AF-'+uni_prot_id+'-F1-model_v2.pdb'
    sec_struc_output = get_sec_struct(p, '/Users/toni_brain/miniconda3/envs/dssp//bin/mkdssp')
    print('got secondary structure')
    print(sec_struc_output)
    
    # Process uniprot file
    f = open('temp/'+uni_prot_id+'.json')
    data = json.load(f)
    f.close()
    
    print('read json file successfully')
    
    # Some basics:
    uni_id = data['primaryAccession'] 
    print('uni_id',uni_id)
    print(data['genes'])
    gene_name = data['genes'][0]['geneName']['value']
    print('gene_name',gene_name)
    ref_seq_id = 0
    
    # getting the nucleotide sequenceID
    for db in data['uniProtKBCrossReferences']:
        if db['database'] == "RefSeq":
            if '.' in db['properties'][0]['value']:
                ref_seq_id = db['properties'][0]['value'].split('.')[0]
            else:
                ref_seq_id = db['properties'][0]['value']
    print('ref_seq_id',ref_seq_id)
    sequence = data['sequence']['value']
    print('sequence',sequence)
    
    if len(sequence) != len(sec_struc_output):
        print("there is an incompatibility between the sequence and alpha fold structure")
    
    # Now lets get the helices:
    alpha_helix_index_list = get_alpha_helix_length_and_location(sec_struc_output, min_length=7)
    
    # Missing: Filter the alpha_helix_index_list
    # TODO
    
    for i in range(len(alpha_helix_index_list)):
        firstAA_position_in_HELIDR = alpha_helix_index_list[i][0]+1
        lastAA_position_in_HELIDR = alpha_helix_index_list[i][-1]+1
        HELIDR_seq = sequence[alpha_helix_index_list[i][0]:alpha_helix_index_list[i][-1]+1]
        down_stream_seq = ''
        up_stream_seq = ''
        if alpha_helix_index_list[i][0]-10 >= 0 and alpha_helix_index_list[i][-1]+12 < len(sequence):
            # Note the +2 here does not include the last helix AA. 
            # We have to make sure here that we check the arrays are not out of bounds!
            down_stream_seq = sequence[alpha_helix_index_list[i][-1]+2:alpha_helix_index_list[i][-1]+12]
            up_stream_seq = sequence[alpha_helix_index_list[i][0]-10:alpha_helix_index_list[i][0]]
        elif alpha_helix_index_list[i][0]-10 <= 0:
            # Do we want a shorter version? 
            down_stream_seq = sequence[alpha_helix_index_list[i][-1]+2:alpha_helix_index_list[i][-1]+12]
            up_stream_seq = sequence[0:alpha_helix_index_list[i][0]]
        elif alpha_helix_index_list[i][-1]+12 > len(sequence):
            down_stream_seq = sequence[alpha_helix_index_list[i][-1]+2:len(sequence)+1]
            up_stream_seq = sequence[alpha_helix_index_list[i][0]-10:alpha_helix_index_list[i][0]]
        
        # This may need fixing if we have shorter upstream and downstream strings
        if len(down_stream_seq)==10 and len(up_stream_seq)==10: 
            Two_S5P_down = count_serines(down_stream_seq[:5])
            Two_S5P_up = count_serines(up_stream_seq[5:])
            # NOW we assemble the row:
            new_row = [uni_id,gene_name,ref_seq_id,firstAA_position_in_HELIDR,lastAA_position_in_HELIDR,up_stream_seq,HELIDR_seq,
                      down_stream_seq,Two_S5P_up,Two_S5P_down,'','','','','','','','','']
            df.loc[len(df)] = new_row
        else:
            # We may want to revisit this continue here
            continue
    return df

In [46]:
        
df = pd.DataFrame(columns=['uniprot_id','gene_name','refseq_id','firstAA_position_in_HELIDR','lastAA_position_in_HELIDR','HELIDR_upstream_seq'
                           ,'HELIDR_seq', 'HELIDR_downstream_seq', '2S5P_up', '2S5P_down', '2S5P1_up','2S5P1_down','2S5P1_helix','HEK293T_expressed','NonTMD[3]_TMD[2]_SEC[1]',
                           'Non_TMD_classification','4 compartments','TG_CY','TG_SR_nonS'])


In [47]:

#uniprot_ids = ['B7HIJ2', 'F5WVJ2', 'C4LF00', 'O68891']
uniprot_ids= ['A0A5B9','A0AV02','A0AV96','A0AVF1']
for ids in uniprot_ids:
    df = generate_table_rows_for_uniprot_id(ids,df)

Working on uniprot ID A0A5B9
At entry 0/1
ID: A0A5B9
got secondary structure
['-' 'G' 'G' 'G' '-' 'B' '-' '-' 'E' 'E' 'E' 'E' 'E' '-' '-' '-' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'S' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'B'
 'S' 'S' '-' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'T' 'T' 'E' 'E' 'E' '-' 'T' 'T'
 'E' 'E' 'E' '-' 'S' 'S' '-' 'E' 'E' 'S' 'S' 'T' 'T' '-' 'T' 'T' '-' '-'
 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'H' 'H' 'H' 'H' 'T' '-' 'T' 'T'
 '-' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'E' '-' '-' 'B' '-' 'T' 'T' 'S' '-' '-'
 '-' 'S' 'S' 'S' '-' '-' '-' 'B' 'S' 'E' 'E' 'E' 'E' 'E' 'E' 'E' 'E' '-'
 '-' '-' 'S' 'S' 'S' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' '-']
read json file successfully
uni_id A0A5B9
[{'geneName': {'evidences': [{'evidenceCode': 'ECO:0000303', 'source': 'Reference', 'id': 'Ref.5'}], 'value': 'TRBC2'}, 'synonyms': [{'evidences': [{'evidence

At entry 0/1
ID: A0AVF1
got secondary structure
['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' 'T' 'T' '-' '-' '-' '-' '-' '-' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'T'
 'T' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' '-'
 '-' '-' 'T' 'T' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' '-'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' 'S' 'T' 'T' '-' '-'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' '-' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'S' '-' 'S' '-' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'T' 'T' '-' '-' 'S' 'S' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'T' 'T' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 '-' 'T' 'T' '-' 'T' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T'
 'T' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' 'T' '-' 'T'
 'T' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'T' '-'
 'H

In [48]:
df

Unnamed: 0,uniprot_id,gene_name,refseq_id,firstAA_position_in_HELIDR,lastAA_position_in_HELIDR,HELIDR_upstream_seq,HELIDR_seq,HELIDR_downstream_seq,2S5P_up,2S5P_down,2S5P1_up,2S5P1_down,2S5P1_helix,HEK293T_expressed,NonTMD[3]_TMD[2]_SEC[1],Non_TMD_classification,4 compartments,TG_CY,TG_SR_nonS
0,A0A5B9,TRBC2,0,17,23,PPKVAVFEPS,EAEISHT,KATLVCLATG,0,0,,,,,,,,,
1,A0AV02,SLC12A8,NM_024628,26,31,QQDALAQPQP,WWKTQL,MWEPVLFGTW,0,0,,,,,,,,,
2,A0AV02,SLC12A8,NM_024628,47,53,VLFGTWDGVF,TSCMINI,GVVLFLRTGW,0,0,,,,,,,,,
3,A0AV02,SLC12A8,NM_024628,56,96,FTSCMINIFG,VVLFLRTGWLVGNTGVLLGMFLVSFVILVALVTVLSGIGVG,RSSIGSGGVY,0,1,,,,,,,,,
4,A0AV02,SLC12A8,NM_024628,106,114,GERSSIGSGG,VYSMISSVL,GQTGGTIGLL,0,0,,,,,,,,,
5,A0AV02,SLC12A8,NM_024628,116,148,VYSMISSVLG,GQTGGTIGLLYVFGQCVAGAMYITGFAESISDL,GLGNIWAVRG,1,0,,,,,,,,,
6,A0AV02,SLC12A8,NM_024628,154,174,SISDLLGLGN,IWAVRGISVAVLLALLGINLA,VKWIIRLQLL,0,0,,,,,,,,,
7,A0AV02,SLC12A8,NM_024628,176,203,LALLGINLAG,VKWIIRLQLLLLFLLAVSTLDFVVGSFT,LDPEHGFIGY,0,0,,,,,,,,,
8,A0AV02,SLC12A8,NM_024628,216,223,DPEHGFIGYS,PELLQNNT,PDYSPGESFF,0,0,,,,,,,,,
9,A0AV02,SLC12A8,NM_024628,233,240,TLPDYSPGES,FFTVFGVF,PAATGVMAGF,1,0,,,,,,,,,
