# Extract secondary structure from PDB files easily and generate spreadsheet

<a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons Licence" style="border-width:0" src="https://licensebuttons.net/l/by-sa/4.0/88x31.png" title='This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.' align="right"/></a>

Author: Dr Antonia Mey   
Email: antonia.mey@ed.ac.uk

In [1]:
# Imports
import pandas as pd
import numpy as np
import glob 
import os
import urllib
from collections import Counter
import urllib
import json

In [2]:
def get_sec_struct(fname, dssp_path):
    
    """ returns secondary structure of protein file

    Runs the DSSP executable on ``fname`` and reads the secondary structure
    column of the residue table it writes.

    Parameters:
    -----------
    fname : String
        name of alpha fold pdb file
        
    dssp_path : String
        path to the dssp executable (passed to the OS as a single program
        name, so it may contain spaces)
        
    Returns:
    --------
    secstruct : np.array
        array containing secondary structure information, one single-letter
        code per parsed residue line; a blank assignment is stored as "-"

    Raises:
    -------
    Exception
        if DSSP cannot be started, exits with an error, or its output file
        cannot be read
    
    Cheat sheet of secondary structure information:
    -----------------------------------------------
    H = α-helix
    B = residue in isolated β-bridge
    E = extended strand, participates in β ladder
    G = 3-helix (310 helix)
    I = 5 helix (π-helix)
    T = hydrogen bonded turn
    S = bend
    """
    import subprocess
    import tempfile

    # DSSP writes into a private scratch directory instead of a fixed
    # "result.dssp" in the working directory: concurrent calls cannot clobber
    # each other and the file is removed even when something fails.
    with tempfile.TemporaryDirectory() as scratch_dir:
        dssp_file = os.path.join(scratch_dir, "result.dssp")
        try:
            # Argument list, no shell: paths containing spaces or shell
            # metacharacters reach DSSP unchanged.
            subprocess.check_call([dssp_path, fname, "-o", dssp_file])
            with open(dssp_file, "r") as fin:
                dssp_lines = fin.readlines()
        except (OSError, subprocess.SubprocessError) as e:
            raise Exception("Could not calculate secondary structure! %s"%e) from e

    # parse output
    secstruct = []
    in_table = False
    for line in dssp_lines:
        if not in_table:
            # The residue table starts on the line after the "#" header line.
            in_table = "#" in line
            continue

        # Lines too short to hold the structure column (index 16) and break
        # markers ("!" / "!*" at index 13) carry no assignment.
        if len(line) < 17 or line[13] == "!":
            continue

        secstruct.append("-" if line[16] == " " else line[16])

    return np.array(secstruct)

In [3]:
def download_alpha_fold_pdbs(uniprot_id_list, workdir = '.', model_version = 2):
    """ downloads alpha fold pdb files for a list of uniprot ids

    Parameters:
    -----------
    uniprot_id_list : iterable of String
        uniprot ids to fetch from alphafold.ebi.ac.uk

    workdir : String
        directory the pdb files are written to; created if it does not exist

    model_version : int
        version number used in the remote and local file name
        (``AF-<id>-F1-model_v<model_version>.pdb``)

    Returns:
    --------
    None
        files are written to ``workdir``; an id for which the server answers
        with an HTTP error is reported with a printed message and skipped
    """
    # `import urllib` alone does not guarantee the submodules are loaded.
    import urllib.error
    import urllib.request

    os.makedirs(workdir, exist_ok=True)

    for uniprot_id in uniprot_id_list:
        pdb_name = f"AF-{uniprot_id}-F1-model_v{model_version}.pdb"
        download = f"https://alphafold.ebi.ac.uk/files/{pdb_name}"
        file_name = os.path.join(workdir, pdb_name)

        try:
            urllib.request.urlretrieve(download, file_name)
        except urllib.error.HTTPError:
            print("No such file.")

In [4]:
# Read the raw lines of the ID file; the context manager closes the file
# even if reading fails.
with open('unique_ids_from_spreadsheet.txt', 'r') as id_file:
    f_content = id_file.readlines()

# All Uniprot IDS
# Surrounding whitespace is stripped and blank lines are dropped so that an
# empty string never ends up being treated as an ID.
ids = [entry.strip() for entry in f_content if entry.strip()]

In [30]:
# Now running everything to get the DSSP information of each protein structure
dssp_executable = '/Users/toni_brain/miniconda3/envs/dssp//bin/mkdssp'

# Make sure the scratch and output directories exist before the first write.
os.makedirs('temp1', exist_ok=True)
os.makedirs('dssp_info', exist_ok=True)

for counter, i in enumerate(ids):
    if counter%500==0:
        print(f"At entry {counter}/{len(ids)}")
        print(f"ID: {i}")
    download_alpha_fold_pdbs([i], workdir ='temp1')
    p = 'temp1/AF-'+i+'-F1-model_v2.pdb'

    # The downloader only prints a message when an entry is unavailable;
    # skip such IDs instead of letting DSSP fail on a missing file and
    # abort the whole batch.
    if not os.path.exists(p):
        continue

    try:
        sec_struc_output = get_sec_struct(p, dssp_executable)
        np.save(f'dssp_info/{i}.npy', sec_struc_output)
    finally:
        # remove pdb file, also when DSSP or saving fails
        os.remove(p)

At entry 0/16236
ID: O00305
At entry 5/16236
ID: Q9HBG6
At entry 10/16236
ID: Q9NR28
At entry 15/16236
ID: Q6P444


In [19]:
# Write the array currently held in sec_struc_output to data.npy.
np.save("data.npy", sec_struc_output)

In [20]:
# Read the array back from data.npy; as the cell's last expression it is echoed below.
np.load('data.npy')

array(['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 'S',
       'S', 'S', 'S', 'S', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', 'T', 'T', 'S', 'T', 'T', 'T', 'G', 'G', 'G', 'S', '-',
       'S', '-', '-', '-', '-', '-', '-', 'G', 'G', 'G', 'T', 'T', 'T',
       'T', 'T', 'T', 'S', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', '-', 'S', '-', '-', '-', 'S', '-', '-', '-', '-',
       '-', 'T', 'T', '-', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'S', 'T', 'T', 'T', 'S', '-', '-', '-', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'T', 'T',
       'S', '-', 'S', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', 'T', 'T', 'S', 'H', 'H', 'H', 'H', 'H', 'H