# Algorithms in Comp Bio & BioInformatics
## 3rd_Assignment--GOR III Secondary Structure Prediction
# Moamin Abdulkareem

In [1]:
import numpy as np
import math 


In [2]:
def load_data(filename):
    
    f = open(filename)
    
    L = list()
    
    for line in f:
        
        line = line.split()
        
        if not line:
            continue
        
        L.append(line)
    
    L = np.asarray(L)
    return L

In [3]:
dssp_file = 'dssp_info.txt'
stride_file = 'stride_info.txt'
cath_file = 'cath_info.txt'
stride_dataset = load_data(stride_file)
dssp_dataset = load_data(dssp_file)
cath_dataset = load_data(cath_file)

print(len(stride_dataset))
print(len(dssp_dataset))
print(dssp_dataset)

print(cath_dataset)

71077
71391
[['1w0n' 'A' '12' 'ILE' 'Coil']
 ['1w0n' 'A' '13' 'THR' 'Beta']
 ['1w0n' 'A' '14' 'LYS' 'Beta']
 ...
 ['1ow4' 'A' '116' 'ASN' 'Helix']
 ['1ow4' 'A' '117' 'SER' 'Coil']
 ['1ow4' 'A' '118' 'TYR' 'Coil']]
[['1w0n' 'A' 'Beta']
 ['2gpi' 'A' 'Alpha/beta']
 ['1vbw' 'A' 'Alpha/beta']
 ...
 ['1n7s' 'C' 'Alpha']
 ['3epw' 'A' 'Alpha/beta']
 ['1ow4' 'A' 'Alpha']]


## Pre-Processing part
- First we change 'Other' secondary structre to 'Coil'
- Second we remove the rows if the amino acid doesn't belong to 20 amino acids


In [4]:
# pre-process the datasets
amino_acids = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY','HIS','ILE', 'LEU', 'LYS', 'MET','PHE', 'PRO','SER', 'THR', 'TRP', 'TYR', 'VAL']

def change_other_to_coil(dataset, amino_set):
    for i in range(len(dataset)):
        
        if dataset[i][4] == 'Other':
            dataset[i][4] = 'Coil'
    
    return dataset
stride = change_other_to_coil(stride_dataset, amino_acids)
dssp = change_other_to_coil(dssp_dataset, amino_acids)



amino_acids_letters = 'A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V'
amino_acids_letters = amino_acids_letters.split(',')

indices = list()
def remove_non_amino(dataset,amino_list):
    
    for i in range(len(dataset)):
        if dataset[i][3] not in amino_acids:
            indices.append(i)
    dataset = np.delete(dataset, indices, axis=0)
    return dataset
stride = remove_non_amino(stride, amino_acids)
dssp = remove_non_amino(dssp, amino_acids)


## Implementation of GOR III algorithm
### We will compute the self information first according to the formula below
<img src="http://latex.codecogs.com/gif.latex?\dpi{120}&space;I(\Delta&space;S_j;&space;R_1,...,&space;R_n)&space;=&space;log(f_{S_j,&space;R_j&plus;m,&space;R_j}&space;/&space;f_{n-S_j,&space;R_j&plus;m,&space;R_j})&space;&plus;&space;log(f_{n-S_j,R_j}&space;/&space;f_{S_j,R_j})" title="I(\Delta S_j; R_1,..., R_n) = log(f_{S_j, R_j+m, R_j} / f_{n-S_j, R_j+m, R_j}) + log(f_{n-S_j,R_j} / f_{S_j,R_j})" />


In [5]:
# since the dataset is huge so i made this function to extract a specific number of proteins
# then we can apply the algorithm on this portion of the dataset
def get_n_proteins(dataset, n):
    
    protein_count = 0
    start = dataset[0][0]
    
    alist = list()
    proteins_names = list()
    j = 0
    i = 0
    while i < len(dataset):
        
        protein_count += 1
        proteins_names.append(start)
        while j < len(dataset)-1 and dataset[j][0] == start and protein_count < n:
            alist.append(dataset[j])
            
            j += 1
        start = dataset[j][0]
        i += j
        if protein_count == n:
            
            break
    
    return alist, proteins_names
dssp_30, proteins_names_dssp = get_n_proteins(dssp, 30)
print(len(dssp_30))

2877


In [6]:
# Here I start with the implementation of the algorithm based on the equation above
# We have to compute the self information and then the directional information using these formulas above

dssp_20, first_20_proteins = get_n_proteins(dssp, 20)
# Now i will use this function to get the frequencies per protein so 
def get_frequencies_of_conformations(dataset):
    
    Helix_freq = 0
    Beta_freq = 0
    Coil_freq = 0
    
    for i in range(len(dataset)):
            
        if dataset[i][4] == 'Helix':
            
            Helix_freq +=1
            
        elif dataset[i][4] == 'Beta':
            
            Beta_freq += 1
            
        else:
            Coil_freq += 1
        
        # The output of this function will be the frequencies of every conformation in the whole dataset
        # And also the frequencies fn-s which is observing non specific conformation
        
    total_freq = Helix_freq + Beta_freq + Coil_freq
    non_helix_freq = total_freq - Helix_freq
    non_Beta_freq = total_freq - Beta_freq
    non_coil_freq = total_freq - Coil_freq
        
    return Helix_freq, non_helix_freq, Beta_freq, non_Beta_freq, Coil_freq, non_coil_freq




In [7]:

def get_f_s_r(dataset):
    
    # This function returns a dictionary where we will use it later in the calculation of the self information
    
    freq_dict = {}
    
    
    for amino_acid in amino_acids:
        count_helix = 0
        count_sheet = 0
        count_coil = 0
        count_total = 0
        for line in dataset:
            if line[3] == amino_acid and line[4] == 'Helix':
                count_helix +=1
            elif line[3] == amino_acid and line[4] == 'Beta':
                count_sheet +=1
            elif line[3] == amino_acid and line[4] == 'Coil':
                count_coil +=1
            count_total = count_helix + count_sheet + count_coil
        
        # we counted the total so we can get fn-s_r also in the output
        # the output will be a tuple which consists of two values
        # the first thing is the count of a specific conformation for a specific amino_acid
        # the second element in the tuple is the number of times we don't encounter a specific
        # conformation for a specific aminoacid
        freq_dict[(amino_acid, 'Helix')] = (count_helix, count_total - count_helix)
        freq_dict[(amino_acid, 'Beta')] = (count_sheet, count_total - count_sheet)
        freq_dict[(amino_acid, 'Coil')] = (count_coil, count_total - count_coil)
    
    # to count number of times we see a specific conformation and also the number of times that we don't observe it
        
    return freq_dict

f_s_r = get_f_s_r(dssp)

print(f_s_r[('ALA', 'Helix')])


(2896, 3065)


In [8]:
# Now we will compute the self information and then the directional information
# we will apply the functions above in the formula of calculating the self information


def get_self_information(dataset, n):
    
    # we have to get the frequencies of the conformations in the dataset first
    
    self_information = list()
    
    i = 0
    j = 0
    
    f_helix, f_non_helix, f_beta, f_non_beta, f_coil, f_non_coil = get_frequencies_of_conformations(dataset)
    
    f_s_r = get_f_s_r(dataset)


    dataset, protein_list = get_n_proteins(dataset,n)
    
    
    
    while i < len(protein_list):
        
        
        while j < len(dataset):
            
            
            
            
            
            if dataset[j][0] == protein_list[i]:
                
                amino_acid = dataset[j][3]
               
                I_helix = math.log(f_s_r[(amino_acid, 'Helix')][0] / f_s_r[(amino_acid, 'Helix')][1] ) + math.log(f_non_helix/f_helix)
                I_beta = math.log(f_s_r[(amino_acid, 'Beta')][0] / f_s_r[(amino_acid, 'Beta')][1]) + math.log(f_non_beta/ f_beta)
                I_coil = math.log(f_s_r[(amino_acid, 'Coil')][0] / f_s_r[(amino_acid, 'Coil')][1]) + math.log(f_non_coil / f_coil)
                
                self_information.append([I_helix,I_beta, I_coil])
                
                j += 1
                
                
            else:
                 
            
                break
                
        i += 1
        
    return self_information
        
    
self_information_dssp = get_self_information(dssp, 20)

print(len(self_information_dssp))


1835


### We'll use the following formula to get the Information which is the self and the pair information

<img src="http://latex.codecogs.com/gif.latex?\dpi{120}&space;I(\Delta&space;S_j;&space;R_1&space;,&space;...&space;,&space;R_n)&space;\approx&space;I(\Delta&space;S_j&space;,&space;R_j)&space;&plus;&space;\sum_{m=-8,&space;m!=&space;0}^{m=8}&space;I(\Delta&space;S_j&space;;R_{j&plus;m}|R_j)&space;{\'}~where&space;I(\Delta&space;S_j&space;;R_{j&plus;m}|R_j)&space;=&space;log(f_{S_j,R_{j&plus;m},R_j&space;}&space;/&space;f_{n-S_j,R_{j&plus;m},R_j})&space;&plus;&space;log(f_{n-S_j,R_j}&space;/&space;f_{S_j,R_j})" title="I(\Delta S_j; R_1 , ... , R_n) \approx I(\Delta S_j , R_j) + \sum_{m=-8, m!= 0}^{m=8} I(\Delta S_j ;R_{j+m}|R_j) {\'}~where I(\Delta S_j ;R_{j+m}|R_j) = log(f_{S_j,R_{j+m},R_j } / f_{n-S_j,R_{j+m},R_j}) + log(f_{n-S_j,R_j} / f_{S_j,R_j})" />

In [9]:
def get_total_information(dataset,n):
    
    
    self_information = get_self_information(dataset, n)
    
    dataset, protein_list = get_n_proteins(dataset,n)

    
    information = list()
    final_prediction = list()

    for j in range(len(dataset)):
        
        # If the indeices are less than 8, these values will be appended directly the information 
        sumh=self_information[j][0]
        sumb=self_information[j][1]
        sumc=self_information[j][2]
        
        if j > 8 and j < len(dataset) - 8:
          
            for m in range(1,9):
            
                sumh+=self_information[j+m][0]
                sumh+=self_information[j-m][0]
                sumb+=self_information[j+m][1]
                sumb+=self_information[j-m][1]
                sumc+=self_information[j+m][2]
                sumc+=self_information[j-m][2]
    
        information.append([sumh, sumb, sumc])
        if max(sumh, sumb, sumc) == sumh:
            final_prediction.append('H')
        elif max(sumh, sumb, sumc) == sumb:
            final_prediction.append('E')
        else:
            final_prediction.append('C')
        
    return final_prediction


information_dssp_30 = get_total_information(dssp,30)
information_stride_30 = get_total_information(stride, 30)

print(information_dssp_30)
print(information_stride_30)


['E', 'E', 'H', 'E', 'H', 'H', 'H', 'C', 'H', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E', 'E', 'H', 'H', 'H', 'E', 'C', 'C', 'E', 'E', 'E', 'E', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'C', 'C', 'C', 'C', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'C', 'E', 'C', 'C', 'C', 'C', 'E', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E', 'H', 'H', 'H', 'H', 'H', 'H', 'C', 'C', 'C', 'C', 'C', 'C', 'H', 'H', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'H', 'E', 'E', 'E', 'E', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'H', 'E', 'E', 'E', 'E', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',

### To get the Q3 measure we will use the following equation:
 

<img src="http://latex.codecogs.com/gif.latex?\dpi{120}&space;\LARGE&space;Q_3&space;=&space;\frac{N_{residues&space;~correctly&space;~classified}}{N_{residues~total}}" title="\LARGE Q_3 = \frac{N_{residues ~correctly ~classified}}{N_{residues~total}}" />
### We will also check the protein family from the 'Cath' file and print the output

In [26]:
def get_q3_score(dataset, n):
    
    # To get the q3_score we need to lists that have the same number of entries 
    # the predicted and the actual

    # that means we need to loop over each protein and then we make the loop on the dataset
    predicted = get_total_information(dataset, n)
    x = get_total_information(dataset, n)
    predicted = np.asarray(predicted)
    data, proteins_list = get_n_proteins(dataset, n)
    q3_scores = list()
    family = list()
    i = 0
    j = 0
    while i < len(proteins_list):
        
        # The list of actual contains the protein from the original dataset
        # And it is generated every time with a new protein
    
        actual = list()
        indices = list()
        protein_id = proteins_list[i]
        Helix_count = 0
        Beta_count = 0
        Coil_count = 0
        
        while j < len(data):
            
            if data[j][0] == protein_id:
            
                if data[j][4] == 'Helix':
            
                    actual.append('H')
            
                elif data[j][4] == 'Beta':
            
                    actual.append('E')
            
                else:
            
            
                    actual.append('C')
                # I am using x here because predicted is minimized after each iteration of the proteins
                # This will be used to check the protein family and we will check that from cath file that is provided
                if x[j] == 'H':
                    
                    Helix_count += 1
                
                elif x[j] == 'E':
                    
                    Beta_count += 1
                    
                else:
                    
                    Coil_count += 1
                    
                j += 1
            
            else:
                
                max_s = max(Helix_count,Beta_count,Coil_count)
                    
                if max_s == Helix_count:
                    
                    family.append((proteins_list[i],'Predicted: Alpha', 'Actual from Cath: ',cath_dataset[i][2]))
                elif max_s == Beta_count:
                    family.append((proteins_list[i], 'Predicted: Beta', 'Actual from Cath: ', cath_dataset[i][2]))
                else:
                    family.append((proteins_list[i], 'Predicted: Coil', 'Actual from Cath: ', cath_dataset[i][2]))
                break
                
            # we will use this later when we remove these indices from predicted list
            indices.append(j)
            
        
        length_actual = len(actual)
        
        count=0
            
        # Now we count the correctly predicted values and then we get q3 score
        
        for n in range(len(actual)):
            
            if actual[n] == predicted[n]:
                count += 1
        try:
            
            q3_score = count / len(actual)
            
        except ZeroDivisionError:
            
            q3_score = 0
        
        # appending q3scores with the protein name
        
        q3_scores.append((protein_id,predicted[:length_actual], q3_score))
        
        # Now we remove the indices from the predicted so we can make the comparison again with the next protein
        
        predicted = np.delete(predicted,indices,axis=0)
        
            
        
        i += 1
            
        
    return q3_scores, family

print('The proteins with the corresponding q3 scores and family structre for dssp dataset',get_q3_score(dssp,30))

The proteins with the corresponding q3 scores and family structre for dssp dataset ([('1w0n', array(['E', 'E', 'H', 'E', 'H', 'H', 'H', 'C', 'H', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'E', 'E', 'E', 'H', 'H', 'H', 'E', 'C', 'C',
       'E', 'E', 'E', 'E', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'E', 'C', 'C', 'C', 'C', 'E', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'C', 'E', 'C', 'C', 'C', 'C', 'E', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E',
       'H', 'H', 'H', 'H', 'H', 'H', 'C', 'C', 'C', 'C', 'C', 'C', 'H',
       'H', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E',
       'E', 'E', 'E'], dtype='<U1'), 0.36666666666666664), ('2gpi', array(['E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'H', 'E', 'E',
       'E', 'E', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'H', 'E', 'E',



In [27]:
print('The proteins with the corresponding q3 score and family structre for stride dataset', get_q3_score(stride,494))

The proteins with the corresponding q3 score and family structre for stride dataset ([('1w0n', array(['E', 'E', 'H', 'E', 'H', 'H', 'H', 'C', 'H', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'E', 'E', 'E', 'H', 'H', 'H', 'E', 'C', 'C',
       'E', 'E', 'E', 'E', 'C', 'E', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'E', 'C', 'C', 'C', 'C', 'E', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'C', 'E', 'C', 'C', 'C', 'C', 'E', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E',
       'H', 'H', 'H', 'H', 'H', 'H', 'C', 'C', 'C', 'C', 'C', 'C', 'H',
       'H', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E',
       'E', 'E', 'E'], dtype='<U1'), 0.375), ('2gpi', array(['E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'H', 'E', 'E',
       'E', 'E', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'H', 'E', 'E', 'E', 'E', 'H



### The formula for calculating Mathews correlation coefficient(MCC)
<img src="http://latex.codecogs.com/gif.latex?\dpi{120}&space;MCC&space;=&space;\frac{TP&space;\times&space;TN&space;-&space;FP&space;\times&space;FN&space;}{\sqrt{(TP&space;&plus;&space;FP)&space;(TP&space;&plus;&space;FN)(TN&plus;FP)(TN&plus;FN)}&space;}" title="MCC = \frac{TP \times TN - FP \times FN }{\sqrt{(TP + FP) (TP + FN)(TN+FP)(TN+FN)} }" />

In [39]:
# To compute MCC score we will have to compute the true positives and false positives and true and false negatives
# We can use the confusion matrix to get the values for the equation
from sklearn.metrics import confusion_matrix

def get_mcc(dataset, n):
    
    # We will do the same thing as in the q3 method 
    # we will loop through the proteins list and then through the dataset to extract the actual values.
    # And then we compare them with the predicted ones
    
    predicted = get_total_information(dataset, n)
    predicted = np.asarray(predicted)
    data, proteins_list = get_n_proteins(dataset,n)
    mcc_list = list()
    i = 0
    j = 0
    
    while i < len(proteins_list):
        
        actual = list()
        indices = list()
        protein_id = proteins_list[i]
        
        while j < len(data):
        
            if data[j][0] == protein_id:
                
        
                if data[j][4] == 'Helix':
                
                    actual.append('H')
                    
                elif data[j][4] == ('Beta'):
                    
                    actual.append('E')
                    
                else:
                    actual.append('C')
                    
                j+=1
                
                indices.append(j)
            else:
                # We have three classes and the resulting matrix will be 3 by 3 
                # The true positives will be the diagonal and the false negatives for each class is the sum of the values in the row
                # except the true positive of that class. The false positive of a class is the sum of the colum excluding the true positives.
                # The total True negatives of a class will be the sum of the rows and columns except the row and the column of the class.
                # By getting these values we can sum everything up and use it in the formula above to get the score for each protein
                
                cm = confusion_matrix(actual, predicted[:len(actual)])
                TP_H, TP_B, TP_C = cm[0][0], cm[1][1], cm[2][2]
                FN_H, FN_B, FN_C = cm[0][1] + cm[0][2],  cm[1][0] + cm[1][2], cm[2][0] + cm[2][1]
                FP_H, FP_B, FP_C = cm[1][0] + cm[2][0], cm[0][1] + cm[2][1], cm[0][2] + cm[1][2]
                TN_H = cm[1][1] + cm[1][2] + cm[2][1] + cm[2][2]
                TN_B = cm[0][0] + cm[0][2] + cm[2][0] + cm[2][2]
                TN_C = cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]
                TP = TP_H + TP_B + TP_C
                FN = FN_H + FN_B + FN_C
                FP = FP_H + FP_B + FP_C
                TN = TN_H + TN_B + TN_C
                mcc = ((TP * TN) - (FP * FN)) / (math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)))
                
                # Appending the protein along with the mcc score
                
                mcc_list.append((protein_id,predicted[:len(actual)], mcc))
                
                # Removing the protein from the predicted data to start with a new one
                predicted = np.delete(predicted,indices,axis=0)
                
                break
        i += 1
                
                
    return mcc_list

print('The mcc score for the first 30 proteins of the dssp dataset: ',get_mcc(dssp, 30))


The mcc score for the first 30 proteins of the dssp dataset:  [('1w0n', array(['E', 'E', 'H', 'E', 'H', 'H', 'H', 'C', 'H', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'E', 'E', 'E', 'H', 'H', 'H', 'E', 'C', 'C',
       'E', 'E', 'E', 'E', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'E', 'C', 'C', 'C', 'C', 'E', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'C', 'E', 'C', 'C', 'C', 'C', 'E', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E',
       'H', 'H', 'H', 'H', 'H', 'H', 'C', 'C', 'C', 'C', 'C', 'C', 'H',
       'H', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E',
       'E', 'E', 'E'], dtype='<U1'), 0.05), ('2gpi', array(['E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'H', 'E', 'E',
       'E', 'E', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'H', 'E', 'E', 'E', 'E', 'H', 'H', 'H',
       'H',



In [40]:
print('The mcc score for the first 30 proteins of the stride dataset: ', get_mcc(stride,30))

the mcc score for the first 30 proteins of the stride dataset:  [('1w0n', array(['E', 'E', 'H', 'E', 'H', 'H', 'H', 'C', 'H', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'E', 'E', 'E', 'H', 'H', 'H', 'E', 'C', 'C',
       'E', 'E', 'E', 'E', 'C', 'E', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'E', 'C', 'C', 'C', 'C', 'E', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'C', 'E', 'C', 'C', 'C', 'C', 'E', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E',
       'H', 'H', 'H', 'H', 'H', 'H', 'C', 'C', 'C', 'C', 'C', 'C', 'H',
       'H', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'E', 'E',
       'E', 'E', 'E'], dtype='<U1'), 0.0625), ('2gpi', array(['E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'H', 'E', 'E',
       'E', 'E', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'H', 'E', 'E', 'E', 'E', 'H', 'H', 'H',
       

