In [1]:
#!/usr/bin/env python3

########################################################################
# File: problem22.ipynb

# Author: Nicholas Chan
# History: 11/25/2021 Created
########################################################################

# Assignment 6: Problem 22
<br>
For this assignment, we were given an emission sequence, the hidden states, the <br>
emitted symbols, a transition table, and an emission table of an HMM. From this information <br>
we compute Pr(x), the probability that the HMM emits the given sequence of symbols x. <br>

# Forward Class
<br>
The Forward class creates an object that stores the hidden states, emitted symbols, <br>
emission sequence, transition table, and emission table read from input. A Forward <br>
object can then call the method computeForward to compute the Forward value <br>
of the sink node. The Forward value of the sink node equals the probability of <br>
emitting the sequence x, because it sums the joint probability Pr(x, pi) <br>
over every possible hidden path pi. This sum is therefore Pr(x). <br>

In [2]:
import numpy as np
# use long double floating point type for all computation
# particularly, initialize the emission and transition tables to this
class Forward:
    '''
    The Forward Class stores the description of an HMM together with an
    emitted sequence x, and computes the Forward value of the sink node.
    That value is Pr(x): the total probability of emitting x, summed over
    every possible hidden path.
    '''
    def __init__(self, hStates, eSymbols, emissionSeq, trTable, emTable):
        '''
        Forward objects are initialized with a list of hidden states, a
        list of emitted symbols, a sequence of emitted symbols, a
        transition table dictionary, and an emission table dictionary.

        The transition table is looked up with the key prevState + state and
        the emission table with the key state + symbol (concatenation), so
        both dictionaries must be keyed that way.
        '''
        self.hS = hStates # List of hidden states
        self.eS = eSymbols # List of emitted symbols (stored, not used by computeForward)
        self.emS = emissionSeq # Sequence of emitted symbols, indexed one symbol at a time
        self.trT = trTable # Transition probabilities keyed by prevState + state
        self.emT = emTable # Emission probabilities keyed by state + symbol

    def computeForward(self):
        '''
        Computes Pr(x) for the stored emission sequence with the Forward algorithm.

        The first column gives every state k the value (1/|states|) * emission(k, x[0]),
        i.e. the source node transitions to each state with equal probability.
        Every later column i gives state k the value
            sum over previous states l of forward(l, i-1) * emission(k, x[i]) * transition(l, k).
        The Forward of the sink node is the sum of the last column.

        Only the previous column is kept, since no earlier column is ever read again.

        Returns:
            Pr(x). An empty emission sequence yields 1.0 (the only sequence of
            length zero), and a model with no hidden states yields 0.
        Raises:
            KeyError if a required state/symbol or state/state pair is missing
            from the emission or transition table.
        '''
        if len(self.hS) == 0:
            return 0 # No hidden states: there is no hidden path that could emit anything
        if len(self.emS) == 0:
            return 1.0 # Nothing to emit: the empty sequence is emitted with certainty

        initialTransition = 1/len(self.hS) # Uniform transition from the source node
        prevForward = [initialTransition*self.emT[state + self.emS[0]] for state in self.hS] # Forwards of the first column

        for i in range(1, len(self.emS)): # Iterates over the remaining columns
            currForward = [] # Holds the Forwards of the current column
            for state in self.hS: # Iterates over all hidden states in curr column
                currEmission = self.emT[state + self.emS[i]] # Emission for curr state, identical for every previous state
                currForward.append(sum(
                    prevScore * currEmission * self.trT[prevState + state] # subForward from previous state l to curr state k
                    for prevState, prevScore in zip(self.hS, prevForward)))
            prevForward = currForward # The current column becomes the previous one
        return sum(prevForward) # Forward of the sink node


# Main Function
<br>
Parses an input text file into the arguments that the Forward class requires. <br>
The input is assumed to contain, in order, an emitted sequence of symbols, a list of possible <br>
emission symbols, a list of possible hidden states, a transition table, and an emission table. 

In [3]:
def main(infile, outfile='', inCL=None):
    '''
    The main function parses input text to find an emission sequence string, a list
    of emitted symbols, a list of hidden states, a transition table, and an emission
    table. With this input it creates a Forward object, calls computeForward, and
    prints the probability of emitting the recorded sequence in scientific notation.

    Expected file layout (fields separated by tabs or spaces, sections separated
    by a line of dashes):
        emission sequence
        --------
        emitted symbols
        --------
        hidden states
        --------
        transition table: one header line, then one row per hidden state
        --------
        emission table: one header line, then one row per hidden state

    Table rows are matched to hidden states, and columns to states/symbols, by
    position; the row and column labels themselves are not checked.

    infile is the path of the input file. outfile and inCL are accepted but unused.

    Raises ValueError if the file ends before the separator that closes the
    transition table.
    '''
    def parseRow(row):
        '''Drops the leading row label and converts the remaining fields to long doubles.'''
        return [np.longdouble(field) for field in row.split()[1:]]

    with open(infile,'r') as myfile:
        emSeq = myfile.readline().strip() # Reads emission sequence string
        myfile.readline() # Clears over spacer line
        emSymbols = myfile.readline().split() # Reads list of emitted symbols
        myfile.readline() # Clears over spacer line
        hStates = myfile.readline().split() # Reads list of hidden states
        myfile.readline() # Clears over spacer line
        myfile.readline() # Clears over the column header of the transition table

        # TRANSITION TABLE PARSING
        trRows = []
        while True:
            trInRow = myfile.readline()
            if trInRow == '': # readline returns '' only at end of file; without this check the loop never ends
                raise ValueError('unexpected end of file while reading the transition table of ' + str(infile))
            trInRow = trInRow.strip()
            if trInRow.startswith('-'): # Spacer line; data rows start with a state label, so values like 1e-05 are not mistaken for it
                break
            if trInRow: # Blank lines carry no row
                trRows.append(parseRow(trInRow))

        myfile.readline() # Clears over the column header of the emission table

        # EMISSION TABLE PARSING
        emRows = [parseRow(line) for line in myfile if line.strip()] # Blank lines carry no row

    # Populate transition table probabilities where each key is the string state1+state2
    trTable = dict()
    for stateIdx1, state1 in enumerate(hStates): # Iterate over source states and their idxes
        for stateIdx2, state2 in enumerate(hStates): # Iterate over destination states and their idxes
            trTable[state1+state2] = trRows[stateIdx1][stateIdx2]

    # Populate emission table probabilities where each key is the string state+symbol
    emTable = dict()
    for stateIdx, state in enumerate(hStates): # Iterate over hidden states and their idxes
        for symbolIdx, symbol in enumerate(emSymbols): # Iterate over symbols and their idxes
            emTable[state+symbol] = emRows[stateIdx][symbolIdx]

    myFor = Forward(hStates, emSymbols, emSeq, trTable, emTable)
    prob = myFor.computeForward()
    print(np.format_float_scientific(prob, 11))
        
if __name__ == "__main__":
    # Runs the solver on one input file and prints Pr(x).
    # Other inputs listed here before (swap the path to use them):
    #     "data/p22-simple-input.txt"
    #     "data/p22-ex-input.txt"
    main("data/rosalind_ba10d.txt")


4.77544546059e-49


In [4]:
# INSPECTION

# INSPECTION TEAM
# Jodi Lee
# Nabil Mohammed

# RESPONSES
# - Work on implementing main function
# - Remove redundant code, finding the path isn't necessary as Pr(X) is independent of path
# - Write more markdown comments
# - Clean up code
# - Add more docstrings and inline comments

# CORRECTIONS
# - Finished main function
# - Fixed redundant code
# - Wrote markdown comments
# - Cleaned code
# - Added more docstrings and inline comments