Test is a test notebook where I develop the GOR3 algorithm.

Things to do:

- The first thing we should do is import the data and clean the data to make it ready for use. 
- Understand the GOR 3 Algorithm
- Understand how we should deal with edges 
- Understand what we need to do with invalid entries and how we deal with it in algorithm
- Understand how we should make it optimised leave-one-out. (don't recount everything with every new protein sequence).
- How to summarise the results of the GOR3 (per data set)
- How to summarise the results per protein family.


In [1]:
import numpy as np

In [202]:
# Algorithm to evaluate a protein. 
# Create a random protein sequence. Where the amino acids are code between 0,20
Proteins = np.random.randint(3,size=100)
SeqLen = len(Proteins)

In [203]:
# Add some invalid entries and pad the sides with -1's which represent the edges.
Proteins[[15,74,87]] = -1 
#Proteins = np.concatenate((-np.ones(8),Proteins,-np.ones(8))).astype(int)

In [188]:
# Create for now a dummy frequency tensor:
freq = np.ones(shape=(3,17,3,3),dtype=float)
freq[0,8,0,0] = 7
freq[0,8,1,1] = 10
freq[0,8,2,2] = 13
freq[1,8,0,0] = 70
freq[1,8,1,1] = 100
freq[1,8,2,2] = 130


CountSSperR = np.ones(shape=(3,20))
CountSS = np.sum(CountSSperR,-1)
otherSS = np.array([[1,2],[0,2],[0,1]]) # Have the index of the other Secundary Structure such that we can sum of the other SS
logDiff = np.log(CountSS/np.sum(CountSS[otherSS],axis=-1))
logDiffRj = np.log(CountSSperR/np.sum(CountSSperR[otherSS],axis=-2)) # -2 since -1 are the amino acids

In [208]:
Proteins

array([ 1,  2,  0,  2,  2,  1,  0,  2,  2,  0,  1,  1,  0,  2,  1, -1,  2,
        0,  1,  0,  2,  2,  0,  1,  0,  2,  1,  2,  1,  1,  2,  0,  0,  1,
        1,  1,  2,  2,  1,  1,  2,  2,  1,  0,  1,  2,  1,  1,  1,  0,  1,
        2,  0,  2,  0,  2,  2,  2,  0,  1,  1,  0,  2,  1,  0,  2,  0,  1,
        1,  0,  2,  1,  2,  2, -1,  1,  2,  1,  1,  2,  2,  1,  1,  1,  2,
        1,  2, -1,  1,  1,  2,  1,  2,  0,  0,  0,  2,  1,  0,  0])

In [190]:
CountSS

array([20., 20., 20.])

In [169]:
valid = np.array([0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1]).astype(bool)
seq = np.arange(17)
vseq = seq[valid]
mCorrect = np.where(valid)[0]
freq[0,mCorrect,vseq,0]

IndexError: index 8 is out of bounds for axis 2 with size 3

In [196]:
testarray = np.array([4,8,7])
if 8 in testarray:
    print("Yes")

Yes


In [220]:
import time
startTime = time.time()
for i in range(500):
    out = predictSS(Proteins,freq,logDiff,logDiffRj)
Diff = time.time() - startTime
print(f"{np.round(Diff,2)} seconds")

45.34 seconds


In [219]:
def predictSS(protSequence,freq,logDiff,logDiffRj):
    """
    Input:
        - protSequence : 1D-numpy array, Represents the proteins sequence, where the aminoacids are labeled as integers from 0 to 19
        - freq : 4D-numpy array shape = (3,17,20,20) representing the frequency count of the data set.
            - axis0 = Secundary structure
            - axis1 = relative position m from the window center positioned at j in the protein sequence
            - axis2 = amino Acid at the position m. = R_{j+m}
            - axis3 = amino Acid at the centre. = R_j
        - logDiff: 1D-numpy array of length 3. This is the logarithm of the relative SS structure count: log(f_s/f_{not_s})
        - logDiffRj: 2D-numpy array of shape (3,20). This is the logarithm of the relative SS structure count per amino accid: log(f_{s,R}/f_{not_s,R})
    
    Output:
        - SS_result: 1D-numpy array with the predicted SS of the protein sequence
        - ISS: 2D-numpy array with the information per SS per amino accid in the protein sequence.
        
    """
    # Add 8 "-1"'s at the beginning and end of the protein sequence. 
    # This is done such that we can have running window of lenght 8 at each side at the edges of the protein sequences. 
    # Use -1 since we offcourse vieuw these as invalid entries.
    paddedSequence = np.concatenate((-np.ones(8),protSequence,-np.ones(8))).astype(int)
    SeqLen = len(protSequence)
    
    # Construct the output value ISS
    ISS = -np.ones(shape=(3,SeqLen)) # use -1 to later on detect invalid entries in the protein sequence.
    
    for j, r_j in enumerate(paddedSequence):
        if j-8 > SeqLen: # We have reached the end.
            break
        if r_j<0: # We skip over invalid entries. (These are the padded edges and invalid entries)
            continue
        # r_j is valid so set the ISS level equalt to zero. Unvalid r_j are kept -1
        ISS[:,j-8] = np.zeros(3)             
        for SS in range(3):
            for m, r_jm in enumerate(Proteins[j-8:j+9]): # m in [0,16],
                if r_jm<0:
                    continue
                # Compute the logSum
                ISS[SS,j-8] += np.log(freq[SS,m,r_jm,r_j]/np.sum(freq[otherSS[SS],m,r_jm,r_j]))
                if m==8: 
                    ISS[SS,j-8] += logDiff[SS]
                else:
                    ISS[SS,j-8] += logDiffRj[SS,r_j]

    SS_result = np.argmax(ISS,axis=0)

    ###################################################
    ##### Deal with invalid amino acid entries ######
    ##################################################

    InvalidIndex = np.where(ISS[0]==-1)[0]
    for index in InvalidIndex:
        if index == 0: # The invalid is the first element
            indexToCopy = 1
            indexIsNotValid = np.any(Invalidindex==indexToCopy)
            while indexIsNotValid: # Keep searching until we are sure that the next index is also not an invalid one. 
                indexToCopy+=1
                nextIsValid = np.any(InvalidIndex==indexToCopy)
            SS_result[index] = SS_result[indexToCopy] # Set it equal to the first valid element. 
        else: # In all other cases we pick the one below 
            SS_result[index] = SS_result[index-1]
        # The reason why we pick the one is clarified by the logic that we would have else used:

#       elif (index-1)==SeqLen # The invalid index is the last element
#           SS_result[index] = SS_result[index-1] # Set equal to the element before.
#       elif SS_result[index-1]==SS_result[index+1] # SS around in valid point is the same
#           SS_result[index] = SS_result[index-1]
#       else: # By default just pick the one below, this way we are sure the entry exist. 
#           SS_result[index]=SS_result[index-1] 

        # And so in all other cases it would have been pick the one below so. We can just simplify it with an easy else. 

    return SS_result, ISS # Return the found SS and the information values.

------
Left to do now is reading in the data file and computing the frequency tables.

In [247]:
def freqTableOfProtein(protSequence,SS):
    """
    Input:
        - protSequence : 1D-numpy array, Represents the proteins sequence, where the aminoacids are labeled as integers from 0 to 19
        - SS : 1D-numpy array holding the know secundary structure of the 
    Output:
    - freq : 4D-numpy array shape = (3,17,20,20) representing the frequency count of the protein sequence.
            - axis0 = Secundary structure
            - axis1 = relative position m from the window center positioned at j in the protein sequence
            - axis2 = amino Acid at the position m. = R_{j+m}
            - axis3 = amino Acid at the centre. = R_j
    - freq :
    """
    freqTable = np.zeros(shape=(3,17,20,20))
    
    # Pad the sequence with 8 -1's before and after such that we can easily perform move a window around. 
    paddedSequence = np.concatenate((-np.ones(8),protSequence,-np.ones(8))).astype(int)
    paddedSS = np.concatenate((-np.ones(8),SS,-np.ones(8))).astype(int)
    
    seqLen = len(protSequence)
    
    #Fill freqTable
    for j, (r_j, S_j) in enumerate(zip(paddedSequence,paddedSS)):
        if j-8 > SeqLen: # We have reached the end.
            break
        if r_j<0: # We skip over invalid entries. (These are the padded edges and invalid entries)
            continue
        for m in np.arange(-8,9):#  m=[-8,...,8]
            r_jm = paddedSequence[j+m]
            if r_jm<0:
                continue
            freqTable[S_j,m+8,r_jm,r_j]+=1
            
    countSSperR = np.zeros()
    
    ##### THIS IS NOT FINISHED!!! ####
    
    #return freqTable, countSS, countSSperR

In [252]:
testSeq = np.arange(20)
testSS = np.ones(len(testSeq))
np.unique(testSeq,return_counts=True)[1]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
def leaveOneOutAnalysis(dataSet = "dssp", protFamily = None):
    """
        Estimate the Q3 and CVV of the GOR method for a specific dataset using a crossvalidation of leave-one-out
    """
    totalFreqTable = np.zeros(shape=(3,17,20,20))
    totalCountSS = np.zeros(3)
    totalCountSSperR = np.zeros(shape=(3,20))
    otherSS = np.array([[1,2],[0,2],[0,1]]) # Have the index of the other Secundary Structure such that we can sum of the other SS
    
    # Select the correct proteinTuple based on dssp protFamily
    
    #First compute the full freqTable
    for proteinSequence, trueSS in proteinTuple:
        freqTableFromProt, countSSFromProt, countSSperRFromProt = freqTableOfProtein(proteinSequence, trueSS)
        totalFreqTable += freqTableFromProt
        totalCountSS += countSSFromProt
        totalCountSSperR += countSSperRFromProt
        
    
    numberProteins = len(AllProteins)
    
    Q3 = np.zeros(shape=(3,number))
    CVV = np.zeros(shape=(3,number))
    
    # Now do the leave-one-out:
    for i, (proteinSequence, trueSS) in enumerate(proteinTuple):
        # Remove the frequency counts of this protein.
        freqTableFromProt, countSSFromProt, countSSperRFromProt = freqTableOfProtein(proteinSequence, trueSS)
        tempFreqTable = totalFreqTable - freqTableFromProt
        tempCountSS = totalCountSS - countSSFromProt
        tempCountSSperR = totalCountSSperR - countSSperRFromProt
        
        templogDiff = np.log(tempCountSS/np.sum(tempCountSS[otherSS],axis=-1))
        templogDiffRj = np.log(tempCountSSperR/np.sum(tempCountSSperR[otherSS],axis=-2)) # -2 since -1 are the amino acids
        
        # Set Frequency that are equal to zero = 0.00001 inorder to avoid infinities.
        tempFreqTable[tempFreqTable = 0] = 0.00001
        
        # Perform the GOR3 prediction.
        SS_result, _ = predictSS(proteinSequence, tempFreqTable, templogDiff, templogDiffRj)
        
        Q3[i] = proteinQ3(SS_result, trueSS)
        CVV[i] = proteinCVV(SS_result, trueSS)
    
    return Q3, CVV

In [228]:
def proteinQ3(predictedSS, trueSS):
    _, (numberHelix, numberSheet, numberCoil) = np.unique(trueSS, return_counts=True)
    TPHelix = np.sum(np.logical_and(predictedSS==0, trueSS==0)) #Only look where trueSS = Helix, see if they are then equal.
    TPSheet = np.sum(np.logical_and(predictedSS==1, trueSS==1))
    TPCoil =  np.sum(np.logical_and(predictedSS==2, trueSS==2))
    return np.array([TPHelix/numberHelix, TPSheet/numberSheet, TPCoil/numberCoil])

def computeCVV(TP,TN,FP,FN):
    numerator = TP*TN-FP*FN
    denominator = np.sqrt((TP+FP) * (TP+FN) * (TN+FP) * (TN+FN))
    return numerator/denominator

def proteinCVV(predictedSS, trueSS):
    #Helix
    TPHelix = np.sum(np.logical_and(predictedSS==0, trueSS==0))
    TNHelix = np.sum(np.logical_and(predictedSS!=0, trueSS!=0))
    FPHelix = np.sum(np.logical_and(predictedSS==0, trueSS!=0))
    FNHelix = np.sum(np.logical_and(predictedSS!=0, trueSS==0))
    CVVHelix = computeCVV(TPHelix,TNHelix,FPHelix,FNHelix)
    #Sheet
    TPSheet = np.sum(np.logical_and(predictedSS==1, trueSS==1))
    TNSheet = np.sum(np.logical_and(predictedSS!=1, trueSS!=1))
    FPSheet = np.sum(np.logical_and(predictedSS==1, trueSS!=1))
    FNSheet = np.sum(np.logical_and(predictedSS!=1, trueSS==1))
    CVVSheet = computeCVV(TPSheet,TNSheet,FPSheet,FNSheet)
    #Coil
    TPCoil = np.sum(np.logical_and(predictedSS==2, trueSS==2))
    TNCoil = np.sum(np.logical_and(predictedSS!=2, trueSS!=2))
    FPCoil = np.sum(np.logical_and(predictedSS==2, trueSS!=2))
    FNCoil = np.sum(np.logical_and(predictedSS!=2, trueSS==2))
    CVVCoil = computeCVV(TPCoil,TNCoil,FPCoil,FNCoil)
    return np.array([CVVHelix, CVVSheet, CVVCoil])