# Extract secondary structure from AF2 PDB files and save secondary structure info

<a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons Licence" style="border-width:0" src="https://licensebuttons.net/l/by-sa/4.0/88x31.png" title='This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.' align="right"/></a>

Author: Dr Antonia Mey   
Email: antonia.mey@ed.ac.uk

In [1]:
# Imports
import pandas as pd
import numpy as np
import glob 
import os
import urllib
from collections import Counter
import urllib
import json

In [2]:
def get_sec_struct(fname, dssp_path):

    """ returns secondary structure of protein file

    Runs the DSSP executable on ``fname`` and parses the per-residue
    secondary structure column from the DSSP-format output.

    Parameters:
    -----------
    fname : String
        name of alpha fold pdb file

    dssp_path : String
        path to the installation of dssp

    Returns:
    --------
    secstruct : np.array
        array containing secondary structure information, one character
        per residue; residues without an assignment are stored as "-"

    Raises:
    -------
    Exception
        if DSSP could not be executed or its output could not be read

    Cheat sheet of secondary structure information:
    -----------------------------------------------
    H = α-helix
    B = residue in isolated β-bridge
    E = extended strand, participates in β ladder
    G = 3-helix (310 helix)
    I = 5 helix (π-helix)
    T = hydrogen bonded turn
    S = bend
    """

    import subprocess
    import tempfile

    # Write the DSSP output into a private temporary directory so that
    # concurrent runs (or a stale file in the working directory) cannot
    # interfere with each other; the directory is removed automatically.
    with tempfile.TemporaryDirectory() as tmp_dir:
        dssp_out = os.path.join(tmp_dir, "result.dssp")
        try:
            # Argument list + no shell: paths containing spaces or shell
            # metacharacters are passed through unchanged.
            subprocess.check_call([dssp_path, fname, "-o", dssp_out])
            with open(dssp_out, "r") as fin:
                lines = fin.readlines()
        except (subprocess.SubprocessError, OSError) as e:
            raise Exception("Could not calculate secondary structure! %s" % e) from e

    # parse output
    secstruct = []
    in_residue_table = False
    for line in lines:
        if in_residue_table:
            # Lines too short to hold the structure column (index 16) are
            # skipped, as are chain-break markers ("!" / "!*" at index 13).
            if len(line) >= 17 and line[13] != "!":
                ss = line[16]
                secstruct.append("-" if ss == " " else ss)

        # The residue table starts on the line after the "#" header line.
        if "#" in line:
            in_residue_table = True

    return np.array(secstruct)

In [3]:
def download_alpha_fold_pdbs(uniprot_id_list, workdir = '.'):
    """ downloads AlphaFold (model_v2) pdb files for a list of Uniprot IDs

    Parameters:
    -----------
    uniprot_id_list : list of String
        Uniprot IDs for which the AlphaFold model should be fetched

    workdir : String
        directory into which the pdb files are written; it is created
        if it does not exist yet

    Returns:
    --------
    None
        files are written to ``workdir`` as ``AF-<id>-F1-model_v2.pdb``;
        IDs for which the server returns an HTTP error are reported with
        a printed message and skipped
    """
    # `import urllib` on its own does not guarantee that the `request` and
    # `error` submodules are loaded, so import them explicitly here.
    import urllib.error
    import urllib.request

    if workdir:
        os.makedirs(workdir, exist_ok=True)

    for uniprot_id in uniprot_id_list:
        pdb_name = f'AF-{uniprot_id}-F1-model_v2.pdb'
        download = f"https://alphafold.ebi.ac.uk/files/{pdb_name}"
        file_name = os.path.join(workdir, pdb_name)
        try:
            urllib.request.urlretrieve(download, file_name)
        except urllib.error.HTTPError:
            print("No such file.")

In [4]:
# All Uniprot IDS, one per line in the text file.
# Blank lines are skipped so that no empty ID is passed on to the download step.
with open('unique_ids_from_spreadsheet.txt', 'r') as id_file:
    ids = [line.strip() for line in id_file if line.strip()]

In [5]:
# Now running everything to get the DSSP information of each protein structure
dssp_executable = '/Users/toni_brain/miniconda3/envs/dssp//bin/mkdssp'

# Make sure the scratch and output directories exist before the long run starts
os.makedirs('temp1', exist_ok=True)
os.makedirs('dssp_info', exist_ok=True)

for counter, i in enumerate(ids):
    if counter%500==0:
        print(f"At entry {counter}/{len(ids)}")
        print(f"ID: {i}")
    download_alpha_fold_pdbs([i], workdir ='temp1')
    p = 'temp1/AF-'+i+'-F1-model_v2.pdb'

    # The download step only prints a message when no model exists for an ID;
    # skip such entries instead of letting DSSP fail on a missing file and
    # abort the whole run.
    if not os.path.exists(p):
        continue

    try:
        sec_struc_output = get_sec_struct(p, dssp_executable)
        np.save(f'dssp_info/{i}.npy', sec_struc_output)
    finally:
        # remove pdb file, even if DSSP failed on it
        os.remove(p)

At entry 0/16236
ID: O00305
At entry 500/16236
ID: Q8N6Y2
At entry 1000/16236
ID: P24928
At entry 1500/16236
ID: Q92903
At entry 2000/16236
ID: Q8WTP8
At entry 2500/16236
ID: Q96SL8
At entry 3000/16236
ID: P06401
At entry 3500/16236
ID: Q6ZNG9
At entry 4000/16236
ID: P07954
At entry 4500/16236
ID: Q3ZCQ3
At entry 5000/16236
ID: Q8IX06
At entry 5500/16236
ID: Q8N4L1
At entry 6000/16236
ID: P35249
At entry 6500/16236
ID: O00748
At entry 7000/16236
ID: Q96B01
At entry 7500/16236
ID: Q86XK3
At entry 8000/16236
ID: Q96B26
At entry 8500/16236
ID: P29692
At entry 9000/16236
ID: Q9UKV3
At entry 9500/16236
ID: Q15544
At entry 10000/16236
ID: Q8N5Y2
At entry 10500/16236
ID: O75593
At entry 11000/16236
ID: Q8WUK0
At entry 11500/16236
ID: Q14232
At entry 12000/16236
ID: Q9P2M7
At entry 12500/16236
ID: Q9BYQ8
At entry 13000/16236
ID: P25789
At entry 13500/16236
ID: Q92896
At entry 14000/16236
ID: Q16611
At entry 14500/16236
ID: Q9NP77
At entry 15000/16236
ID: P52294
At entry 15500/16236
ID: Q6ZV50


----

## Backup functions

In [None]:
def _atom_line_confidence(line):
    """ returns the B-factor column (pLDDT in AlphaFold pdb files) of an ATOM line """
    try:
        # PDB is a fixed-width format: the temperature factor occupies
        # columns 61-66. Splitting on whitespace fails when it touches the
        # occupancy column (e.g. "1.00100.00").
        return float(line[60:66])
    except ValueError:
        # Fall back to whitespace splitting for loosely formatted files.
        return float(line.split()[-2])


def select_high_confidence_structures(pdb_list, threshold=0.9):
    """ selects the pdb files whose mean per-atom confidence exceeds a threshold

    Parameters:
    -----------
    pdb_list : list of String
        paths of alpha fold pdb files

    threshold : float
        confidence cutoff. Values <= 1 are interpreted as a fraction and
        scaled to the 0-100 range of the B-factor column (the default 0.9
        corresponds to the previously hard-coded cutoff of 90.0); values
        > 1 are used as they are.

    Returns:
    --------
    high_confidence_pdbs : pd.Series
        names of the files whose mean confidence is strictly greater than
        the cutoff, indexed by their position in ``pdb_list``
    """
    cutoff = threshold * 100.0 if threshold <= 1.0 else threshold

    confidence_info = []
    for p in pdb_list:
        with open(p) as f:
            confidence_score = [
                _atom_line_confidence(l) for l in f if l.startswith('ATOM')
            ]
        confidence_info.append({
            'mean': np.mean(confidence_score),
            'std': np.std(confidence_score),
            'name': p,
        })

    # An empty input would otherwise produce a frame without a "mean" column.
    if not confidence_info:
        return pd.Series([], dtype=object, name='name')

    confidence_df = pd.DataFrame(confidence_info)
    filtering = confidence_df["mean"] > cutoff
    high_confidence_pdbs = confidence_df.loc[filtering, 'name']
    return high_confidence_pdbs

In [9]:
# Scratch check: position of the Uniprot ID 'Q13523' within the ids list
ids.index('Q13523')

8707

In [8]:
# Scratch check: show the two Uniprot IDs stored at positions 3473 and 3474
ids[3473:3475]

['Q6NZ36', 'P35573']