# Algorithms in Comp Bio & BioInformatics
## 3rd_Assignment--GOR III Secondary Structure Prediction
# Moamin Abdulkareem

In [2]:
import numpy as np
import math 


In [3]:
def load_data(filename):
    
    f = open(filename)
    
    L = list()
    
    for line in f:
        
        line = line.split()
        
        if not line:
            continue
        
        L.append(line)
    
    L = np.asarray(L)
    return L

In [4]:
dssp_file = 'dssp_info.txt'
stride_file = 'stride_info.txt'
cath_file = 'cath_info.txt'
stride_dataset = load_data(stride_file)
dssp_dataset = load_data(dssp_file)
cath_dataset = load_data(cath_file)

print(len(stride_dataset))
print(len(dssp_dataset))
print(dssp_dataset)

71077
71391
[['1w0n' 'A' '12' 'ILE' 'Coil']
 ['1w0n' 'A' '13' 'THR' 'Beta']
 ['1w0n' 'A' '14' 'LYS' 'Beta']
 ...
 ['1ow4' 'A' '116' 'ASN' 'Helix']
 ['1ow4' 'A' '117' 'SER' 'Coil']
 ['1ow4' 'A' '118' 'TYR' 'Coil']]


In [5]:
# pre-process the datasets
amino_acids = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY','HIS','ILE', 'LEU', 'LYS', 'MET','PHE', 'PRO','SER', 'THR', 'TRP', 'TYR', 'VAL']

def change_other_to_coil(dataset, amino_set):
    for i in range(len(dataset)):
        
        if dataset[i][4] == 'Other':
            dataset[i][4] = 'Coil'
    
    return dataset
stride = change_other_to_coil(stride_dataset, amino_acids)
dssp = change_other_to_coil(dssp_dataset, amino_acids)



amino_acids_letters = 'A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V'
amino_acids_letters = amino_acids_letters.split(',')

indices = list()
def remove_non_amino(dataset,amino_list):
    
    for i in range(len(dataset)):
        if dataset[i][3] not in amino_acids:
            indices.append(i)
    dataset = np.delete(dataset, indices, axis=0)
    return dataset
stride = remove_non_amino(stride, amino_acids)
dssp = remove_non_amino(dssp, amino_acids)


## Implementation of GOR III algorithm
### We will compute the self information first according to the formula below
<img src="http://latex.codecogs.com/gif.latex?\dpi{120}&space;I(\Delta&space;S_j;&space;R_1,...,&space;R_n)&space;=&space;log(f_{S_j,&space;R_j&plus;m,&space;R_j}&space;/&space;f_{n-S_j,&space;R_j&plus;m,&space;R_j})&space;&plus;&space;log(f_{n-S_j,R_j}&space;/&space;f_{S_j,R_j})" title="I(\Delta S_j; R_1,..., R_n) = log(f_{S_j, R_j+m, R_j} / f_{n-S_j, R_j+m, R_j}) + log(f_{n-S_j,R_j} / f_{S_j,R_j})" />


In [43]:
# since the dataset is huge so i made this function to extract a specific number of proteins
# then we can apply the algorithm on this portion of the dataset
def get_n_proteins(dataset, n):
    
    protein_count = 0
    start = dataset[0][0]
    
    alist = list()
    proteins_names = list()
    j = 0
    i = 0
    while i < len(dataset):
        
        protein_count += 1
        proteins_names.append(start)
        while j < len(dataset)-1 and dataset[j][0] == start and protein_count < n:
            alist.append(dataset[j])
            
            j += 1
        start = dataset[j][0]
        i += j
        if protein_count == n:
            
            break
    
    return alist, proteins_names
dssp_30, proteins_names_dssp = get_n_proteins(dssp, 30)
print(len(dssp_30))

2877


In [7]:
# Here I start with the implementation of the algorithm based on the equation above
# We have to compute the self information and then the directional information using these formulas above

dssp_20, first_20_proteins = get_n_proteins(dssp, 20)
# Now i will use this function to get the frequencies per protein so 
def get_frequencies_of_conformations(dataset):
    
    Helix_freq = 0
    Beta_freq = 0
    Coil_freq = 0
    
    for i in range(len(dataset)):
            
        if dataset[i][4] == 'Helix':
            
            Helix_freq +=1
            
        elif dataset[i][4] == 'Beta':
            
            Beta_freq += 1
            
        else:
            Coil_freq += 1
        
        # The output of this function will be the frequencies of every conformation
        # And also the frequencies fn-s which is observing non specific conformation
        
    total_freq = Helix_freq + Beta_freq + Coil_freq
    non_helix_freq = total_freq - Helix_freq
    non_Beta_freq = total_freq - Beta_freq
    non_coil_freq = total_freq - Coil_freq
        
    return Helix_freq, non_helix_freq, Beta_freq, non_Beta_freq, Coil_freq, non_coil_freq




In [8]:
def get_f_s_r(dataset):
    
    freq_dict = {}
        
    for amino_acid in amino_acids:
        count_helix = 0
        count_sheet = 0
        count_coil = 0
        count_total = 0
        for line in dataset:
            if line[3] == amino_acid and line[4] == 'Helix':
                count_helix +=1
            elif line[3] == amino_acid and line[4] == 'Beta':
                count_sheet +=1
            elif line[3] == amino_acid and line[4] == 'Coil':
                count_coil +=1
            count_total = count_helix + count_sheet + count_coil
        
        # we counted the total so we can get fn-s_r also in the output
        # the output will be a tuple which consists of two values
        # the first thing is the count of a specific conformation for a specific amino_acid
        # the second element in the tuple is the number of times we don't encounter a specific
        # conformation for a specific aminoacid
        freq_dict[(amino_acid, 'Helix')] = (count_helix, count_total - count_helix)
        freq_dict[(amino_acid, 'Beta')] = (count_sheet, count_total - count_sheet)
        freq_dict[(amino_acid, 'Coil')] = (count_coil, count_total - count_coil)
    
    # to count number of times we see a specific conformation and also the number of times that we don't observe it
        
    return freq_dict 
f_s_r = get_f_s_r(dssp)

print(f_s_r[('ALA', 'Helix')])


(2896, 3065)


In [36]:
# Now we will compute the self information and then the directional information

dssp_20, first_20_proteins = get_n_proteins(dssp,20)
stride_20, first_20_stride = get_n_proteins(stride, 20)



def get_self_information(dataset, n):
    
    # we have to get the frequencies of the conformations in the dataset first
    
    self_information = list()
    
    i = 0
    j = 0
    
    f_helix, f_non_helix, f_beta, f_non_beta, f_coil, f_non_coil = get_frequencies_of_conformations(dataset)
    
    f_s_r = get_f_s_r(dataset)


    dataset, protein_list = get_n_proteins(dataset,n)
    
    
    
    while i < len(protein_list):
        
        
        while j < len(dataset):
            
            
            
            
            
            if dataset[j][0] == protein_list[i]:
                
                amino_acid = dataset[j][3]
               
                I_helix = math.log(f_s_r[(amino_acid, 'Helix')][0] / f_s_r[(amino_acid, 'Helix')][1] ) + math.log(f_non_helix/f_helix)
                I_beta = math.log(f_s_r[(amino_acid, 'Beta')][0] / f_s_r[(amino_acid, 'Beta')][1]) + math.log(f_non_beta/ f_beta)
                I_coil = math.log(f_s_r[(amino_acid, 'Coil')][0] / f_s_r[(amino_acid, 'Coil')][1]) + math.log(f_non_coil / f_coil)
                
                self_information.append([I_helix,I_beta, I_coil])
                
                j += 1
                
                
            else:
                 
            
                break
                
        i += 1
        
    return self_information
        
    
self_information_dssp = get_self_information(dssp, 20)

print(len(self_information_dssp))


1835


### We'll use the following formula to get the pair information

<img src="http://latex.codecogs.com/gif.latex?\dpi{120}&space;I(\Delta&space;S_j;&space;R_1&space;,&space;...&space;,&space;R_n)&space;\approx&space;I(\Delta&space;S_j&space;,&space;R_j)&space;&plus;&space;\sum_{m=-8,&space;m!=&space;0}^{m=8}&space;I(\Delta&space;S_j&space;;R_{j&plus;m}|R_j)&space;{\'}~where&space;I(\Delta&space;S_j&space;;R_{j&plus;m}|R_j)&space;=&space;log(f_{S_j,R_{j&plus;m},R_j&space;}&space;/&space;f_{n-S_j,R_{j&plus;m},R_j})&space;&plus;&space;log(f_{n-S_j,R_j}&space;/&space;f_{S_j,R_j})" title="I(\Delta S_j; R_1 , ... , R_n) \approx I(\Delta S_j , R_j) + \sum_{m=-8, m!= 0}^{m=8} I(\Delta S_j ;R_{j+m}|R_j) {\'}~where I(\Delta S_j ;R_{j+m}|R_j) = log(f_{S_j,R_{j+m},R_j } / f_{n-S_j,R_{j+m},R_j}) + log(f_{n-S_j,R_j} / f_{S_j,R_j})" />

In [41]:
def get_pair_information(dataset,n):
    
    
    self_information = get_self_information(dataset, n)
    
    dataset, protein_list = get_n_proteins(dataset,n)

    
    pair_information = list()
    final_prediction = list()

    for j in range(len(dataset)):
        sumh=self_information[j][0]
        sumb=self_information[j][1]
        sumc=self_information[j][2]
        
        if j > 8 and j < len(dataset) - 8:
          
            for k in range(1,9):
            
                sumh+=self_information[j+k][0]
                sumh+=self_information[j-k][0]
                sumb+=self_information[j+k][1]
                sumb+=self_information[j-k][1]
                sumc+=self_information[j+k][2]
                sumc+=self_information[j-k][2]
    
        pair_information.append([sumh, sumb, sumc])
        if max(sumh, sumb, sumc) == sumh:
            final_prediction.append('H')
        elif max(sumh, sumb, sumc) == sumb:
            final_prediction.append('B')
        else:
            final_prediction.append('C')
        
    return final_prediction


pair_information_dssp = get_pair_information(dssp,30)
#dssp_20, protiens = get_n_proteins(dssp, 20)
#self_info = get_self_information(dssp, 20)


print(pair_information_dssp)

['B', 'B', 'H', 'B', 'H', 'H', 'H', 'C', 'H', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'B', 'B', 'B', 'H', 'H', 'H', 'B', 'C', 'C', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'B', 'C', 'C', 'C', 'C', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'B', 'C', 'C', 'C', 'C', 'B', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'B', 'B', 'H', 'H', 'H', 'H', 'H', 'H', 'C', 'C', 'C', 'C', 'C', 'C', 'H', 'H', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'H', 'B', 'B', 'B', 'B', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'H', 'B', 'B', 'B', 'B', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',