In [1]:
#!/usr/bin/env python3

########################################################################
# File: problem26.ipynb

# Author: Nicholas Chan
# History: 12/06/2021 Created
########################################################################

# Assignment 7: Problem 26
<br>
For this assignment, we were given input for the number of iterations of learning to execute, <br>
an emission sequence, emitted symbols, hidden states, an initial transition table, and an initial <br>
emission table. With this information we were to compute a new set of parameters for the HMM <br>
through iterations of the Expectation and Maximization steps of Baum-Welch Learning. <br>

# BaumWelch Class
<br>
The BaumWelch class stores the number of iterations of learning to execute, the <br> 
emission sequence, emission symbols, hidden states, transition table, and emission  <br>
table given as input. With this information, we can generate the likelihood maximizing path
through the Node and Edge Responsibility Matrices, which probabilistically represent <br>
a hidden path. We can then estimate new parameters which can be used to re-compute <br>
the Node and Edge Responsibility Matrices.

In [2]:
import numpy as np
# use long double floating point type for all computation
# particularly, initialize the emission and transition tables to this

class BaumWelch:
    '''
    Baum-Welch (soft expectation-maximization) learning for a hidden Markov model.

    An object stores the hidden states, the emission alphabet, the observed
    emission sequence and the current transition/emission tables. From these it
    derives (E-step) the Forward and Backward matrices, Pr(x), the Node
    Responsibility matrix (Pi*) and the Edge Responsibility matrix (Pi**), and
    then (M-step) a re-estimated transition table and emission table.

    Data layout used throughout the class:
      * tables are dictionaries keyed by string concatenation, i.e.
        trT[state1 + state2] and emT[state + symbol]
        (NOTE: this presumes the concatenated keys are unambiguous, e.g.
        single-character state and symbol names -- confirm for other inputs);
      * every matrix is a list of columns, one column per position of the
        emission sequence (per edge for erMat); each column is a list of
        [value, label] pairs ordered like self.hS (or like the nested
        "current state, next state" loop over self.hS for erMat).
    '''
    def __init__(self, hStates, eSymbols, emissionSeq, trTable, emTable):
        '''
        Store the model description and run one full E-step/M-step.

        hStates     -- list of hidden states
        eSymbols    -- list of emitted symbols
        emissionSeq -- string (sequence) of emitted symbols
        trTable     -- transition table dictionary keyed by state1 + state2
        emTable     -- emission table dictionary keyed by state + symbol

        After construction self.newTrT / self.newEmT already hold the
        parameters estimated from the given tables.
        '''
        self.hS = hStates # List of hidden states
        self.eS = eSymbols # List of emitted symbols
        self.emS = emissionSeq # Sequence of emitted symbols
        self.revEmS = emissionSeq[::-1] # Reversed emission sequence, used by the Backward pass
        self.trT = trTable # Current transition table
        self.emT = emTable # Current emission table
        self._refresh()

    def _refresh(self):
        '''
        Recompute every derived attribute from the current self.trT / self.emT:
        fMat, bMat, prob, nrMat, nrConditionals, erMat, erConditionals,
        newEmT and newTrT (in that order, since each depends on the previous).
        '''
        self.fMat = self.computeForwardMatrix() # Forward matrix
        self.bMat = self.computeBackwardMatrix() # Backward matrix
        self.prob = self.computeSeqProb() # Emission sequence probability Pr(x)
        self.nrMat = self.nodalResponsibilityMatrix() # Node Responsibility matrix (Pi*)
        # Per-state sums of Pi* over all positions; index matches self.hS
        self.nrConditionals = [sum([col[stateIdx][0] for col in self.nrMat])
                               for stateIdx in range(len(self.hS))]
        self.erMat = self.edgeResponsibilityMatrix() # Edge Responsibility matrix (Pi**)
        # Transition labels in the same order edgeResponsibilityMatrix emits them
        # (0 -> AA, 1 -> AB, ...). They are built from self.hS instead of being
        # read from self.erMat[0], so a one-symbol sequence (no edges, empty
        # erMat) yields zero sums instead of raising IndexError.
        edgeLabels = [currState + nextState for currState in self.hS for nextState in self.hS]
        # Per-transition sums of Pi** over all edges, stored as [sum, label]
        self.erConditionals = [[sum([col[edgeIdx][0] for col in self.erMat]), label]
                               for edgeIdx, label in enumerate(edgeLabels)]
        self.newEmT = self.genNewEmissionTable()
        self.newTrT = self.genNewTransitionTable()

    def computeForwardMatrix(self):
        '''
        Build the Forward matrix for the whole emission sequence.

        The first column uses 1/|states| as the transition weight out of the
        source, so each entry is (1/|states|) * emission(state, first symbol).
        Every later entry for a state k is the sum, over all previous states l,
        of Forward(l) * emission(k, symbol) * transition(l, k).

        Returns a list of columns; each column is a list of [forward, state].
        '''
        firstScores = [[(1/len(self.hS))*self.emT[state + self.emS[0]], state] for state in self.hS]
        allForwardList = [firstScores]
        for i in range(1, len(self.emS)): # One column per remaining emission index
            prevForward = allForwardList[i-1] # Forwards of the previous column
            forwardCol = []
            for state in self.hS:
                currEmission = self.emT[state + self.emS[i]] # Same for every predecessor, so looked up once
                subForwards = [prevForward[prevStateIdx][0] * currEmission * self.trT[prevState + state]
                               for prevStateIdx, prevState in enumerate(self.hS)]
                forwardCol.append([sum(subForwards), state])
            allForwardList.append(forwardCol)
        return allForwardList

    def computeBackwardMatrix(self):
        '''
        Build the Backward matrix for the whole emission sequence.

        The matrix is filled from the last position towards the first using the
        reversed emission sequence: the last column is all ones, and each
        earlier entry for a state k is the sum, over all next states l, of
        Backward(l) * emission(l, next symbol) * transition(k, l).

        Returns the columns in forward (left-to-right) order; each column is a
        list of [backward, state].
        '''
        allBackwardList = [[[1, state] for state in self.hS]] # Column for the last emission index
        for i in range(1, len(self.revEmS)):
            nextBackward = allBackwardList[i-1] # Column one step closer to the sink
            nextSymbol = self.revEmS[i-1] # Symbol emitted at that next position
            backwardCol = []
            for state in self.hS:
                subBackwards = [nextBackward[nextStateIdx][0] * self.emT[nextState + nextSymbol] * self.trT[state + nextState]
                                for nextStateIdx, nextState in enumerate(self.hS)]
                backwardCol.append([sum(subBackwards), state])
            allBackwardList.append(backwardCol)
        return allBackwardList[::-1] # Built right-to-left, so flip into sequence order

    def computeSeqProb(self):
        '''
        Return Forward(sink) = Pr(x): the sum of the Forward values in the last
        column of the Forward matrix.
        '''
        return sum([entry[0] for entry in self.fMat[-1]])

    def nodalResponsibilityMatrix(self):
        '''
        Return the Node Responsibility matrix (Pi*): for each emission index i
        and state k, Forward(k, i) * Backward(k, i) / Pr(x), stored as
        [probability, state].

        When Pr(x) is 0 every entry is set to 0 (same convention as
        edgeResponsibilityMatrix) instead of dividing by zero.
        '''
        nrMatrix = []
        for emIdx in range(len(self.emS)):
            currCol = []
            for sIdx, state in enumerate(self.hS):
                if self.prob == 0: # Handles div by 0 when normalizing
                    cond = 0
                else:
                    cond = (self.fMat[emIdx][sIdx][0] * self.bMat[emIdx][sIdx][0]) / self.prob
                currCol.append([cond, state])
            nrMatrix.append(currCol)
        return nrMatrix

    def printNRMat(self):
        '''
        Return a tab-separated string of the Node Responsibility matrix: a
        header row of states, then one row per emission index with each value
        rounded to 4 decimals.
        '''
        outString = "\t".join(self.hS) + "\n"
        for emIdx in range(len(self.emS)):
            rowString = ""
            for sIdx in range(len(self.hS)):
                rowString += str(np.round(self.nrMat[emIdx][sIdx][0], decimals=4)) + "\t"
            outString += rowString + "\n"
        return outString

    def edgeResponsibilityMatrix(self):
        '''
        Return the Edge Responsibility matrix (Pi**): for each index i with a
        following position and each edge (l, k),
        Forward(l, i) * transition(l, k) * emission(k, x_{i+1}) * Backward(k, i+1) / Pr(x),
        stored as [probability, l + k]. Edges within a column are ordered by
        current state, then next state. Entries are 0 when Pr(x) is 0.
        '''
        erMatrix = []
        for emIdx in range(len(self.emS)-1): # One column per edge position (i, i+1)
            currCol = []
            for currStateIdx, currState in enumerate(self.hS):
                for nextStateIdx, nextState in enumerate(self.hS):
                    if self.prob == 0: # Handles div by 0 when normalizing
                        cond = 0
                    else:
                        cond = (self.fMat[emIdx][currStateIdx][0] * self.bMat[emIdx+1][nextStateIdx][0] * self.emT[nextState+self.emS[emIdx+1]] * self.trT[currState + nextState]) / self.prob
                    currCol.append([cond, currState+nextState])
            erMatrix.append(currCol)
        return erMatrix

    def genNewEmissionTable(self):
        '''
        Estimate a new emission table from Pi*.

        For every (state, symbol) pair the responsibilities of that state at
        the positions emitting that symbol are summed, then divided by the
        state's total responsibility (self.nrConditionals). A state whose total
        is 0 gets 0 for every symbol.
        '''
        newEmT = {st + em: 0 for st in self.hS for em in self.eS}
        for emIdx, emission in enumerate(self.emS):
            for stateIdx, state in enumerate(self.hS):
                newEmT[state + emission] += self.nrMat[emIdx][stateIdx][0]
        for hsIdx, hs in enumerate(self.hS):
            stateTotal = self.nrConditionals[hsIdx]
            for ems in self.eS:
                if stateTotal == 0: # Handles div by 0 when normalizing
                    newEmT[hs + ems] = 0
                else:
                    newEmT[hs + ems] /= stateTotal
        return newEmT

    def genNewTransitionTable(self):
        '''
        Estimate a new transition table from Pi**.

        Each transition receives its summed edge responsibility
        (self.erConditionals), then every row (fixed starting state) is divided
        by its row sum. A row whose sum is 0 is set to all zeros.
        '''
        newTrT = {currSt + nextSt: 0 for currSt in self.hS for nextSt in self.hS}
        for total, transition in self.erConditionals: # Pairs of [sum, transition label]
            newTrT[transition] = total
        # All row sums are taken before any entry is normalized
        norms = []
        for currState in self.hS:
            norm = 0
            for nextState in self.hS:
                norm += newTrT[currState + nextState]
            norms.append(norm)
        for currStateIdx, currState in enumerate(self.hS):
            for nextState in self.hS:
                if norms[currStateIdx] == 0: # Handles div by 0 when normalizing
                    newTrT[currState + nextState] = 0
                else:
                    newTrT[currState + nextState] /= norms[currStateIdx]
        return newTrT

    def maximizeIter(self, num):
        '''
        Run Baum-Welch learning for a fixed number of rounds.

        Each round adopts the most recent estimates (self.newTrT / self.newEmT)
        as the current parameters and recomputes every derived attribute.
        Because the constructor already performed the first estimation, after
        this call self.trT / self.emT hold the parameters obtained from `num`
        estimation steps (for num <= 0 nothing changes).
        '''
        remaining = num
        while remaining > 0:
            self.trT = self.newTrT
            self.emT = self.newEmT
            self._refresh()
            remaining -= 1
        return

    def maximizeZero(self):
        '''
        Run Baum-Welch learning until Pr(x) stops strictly increasing.

        The loop adopts the latest estimates and recomputes the derived
        attributes for as long as the newly computed Pr(x) is greater than the
        previous one (the first comparison is against 0).
        '''
        prevProb = 0
        currProb = self.prob
        while currProb-prevProb > 0:
            self.trT = self.newTrT
            self.emT = self.newEmT
            prevProb = currProb # Remember the last computed Pr(x)
            self._refresh()
            currProb = self.prob # Pr(x) under the newly adopted parameters
        return

    def printParams(self):
        '''
        Return a string with the current transition table, a "--------"
        separator line and the current emission table, tab-separated with
        values rounded to 3 decimals. Call this after a maximize method to get
        the learned parameters.
        '''
        outString = ""
        for hs in self.hS:
            outString += hs + "\t"
        outString += "\n"
        for hs1 in self.hS:
            outString += hs1 + "\t"
            for hs2 in self.hS:
                outString += str(np.around(self.trT[hs1+hs2], decimals=3)) + "\t"
            outString += "\n"
        outString += "-------- \n"
        outString += "\t".join(self.eS) + "\n"
        for hs1 in self.hS:
            outString += hs1 + "\t"
            for em2 in self.eS:
                outString += str(np.around(self.emT[hs1+em2], decimals=3)) + "\t"
            outString += "\n"
        return outString
            

# Main Function
<br>
Parses an input text file as an int for the number of learning iterations to perform, <br>
a string for the emission sequence, a list of emission symbols, <br>
a list for the hidden states, a dictionary for a transition table, and a dictionary for an <br>
emission table which the BaumWelch class requires as input. The main function also initializes <br>
a BaumWelch object with the given input to compute our parameters to be printed to output.

In [3]:
def main(infile, outfile='', inCL=None):
    '''
    Parse a Baum-Welch learning problem from a text file, run the learner and
    print the learned parameters.

    Expected file layout (sections separated by single spacer lines):
    number of iterations, emission sequence, tab-separated emitted symbols,
    tab-separated hidden states, the transition table (a header line, then one
    tab-separated row per state whose first field is the row label), a
    delimiter line made only of '-' characters, and the emission table in the
    same header-plus-rows form.

    infile  -- path of the input text file
    outfile -- accepted for interface compatibility; not used by this function
    inCL    -- accepted for interface compatibility; not used by this function

    All probabilities are parsed as np.longdouble. The result of
    BaumWelch.printParams() is printed to standard output.

    Raises ValueError if the file ends before the delimiter that terminates
    the transition table is found.
    '''
    def isDelimiter(text):
        # Only a line consisting solely of dashes ends the transition table.
        # A plain "'-' in text" test would also match values written in
        # scientific notation such as 1e-05 and cut the table short.
        return set(text) == {'-'}

    def parseRow(text):
        # Drop the leading row label and parse the remaining fields.
        return [np.longdouble(field) for field in text.split('\t')[1:]]

    with open(infile, 'r') as myfile:
        iters = int(myfile.readline()) # Number of learning iterations to execute
        myfile.readline() # Clears over spacer line
        emSeq = myfile.readline().rstrip() # Emission sequence string
        myfile.readline() # Clears over spacer line
        emSymbols = myfile.readline().rstrip().split('\t') # List of emitted symbols
        myfile.readline() # Clears over spacer line
        hStates = myfile.readline().rstrip().split('\t') # List of hidden states
        myfile.readline() # Clears over spacer line
        myfile.readline() # Clears over the transition table header line

        # TRANSITION TABLE PARSING
        trRows = []
        while True:
            rawLine = myfile.readline()
            if rawLine == '': # readline() returns '' only at end of file
                raise ValueError("unexpected end of file while reading the transition table")
            trInRow = rawLine.rstrip()
            if isDelimiter(trInRow):
                break
            if trInRow: # Skip blank lines instead of storing empty rows
                trRows.append(parseRow(trInRow))
        trTable = dict() # Keys are the string state1 + state2
        for stateIdx1, state1 in enumerate(hStates):
            for stateIdx2, state2 in enumerate(hStates):
                trTable[state1+state2] = np.longdouble(trRows[stateIdx1][stateIdx2])

        myfile.readline() # Clears over the emission table header line

        # EMISSION TABLE PARSING
        emRows = [parseRow(line.rstrip()) for line in myfile if line.strip()]
        emTable = dict() # Keys are the string state + symbol
        for stateIdx, state in enumerate(hStates):
            for symbolIdx, symbol in enumerate(emSymbols):
                emTable[state+symbol] = np.longdouble(emRows[stateIdx][symbolIdx])

    # INITIALIZE OBJECT AND LEARN
    myBM = BaumWelch(hStates, emSymbols, emSeq, trTable, emTable)
    myBM.maximizeIter(iters) # Maximize on number of iterations from input
    print(myBM.printParams())

if __name__ == "__main__":
    # Runs the learner on the Rosalind BA10K dataset. Other inputs used during
    # development: "data/p26-simple-input.txt", "data/p26-ex-input.txt".
    main(infile="data/rosalind_ba10k.txt")


A	B	C	D	
A	0.832	0.168	0.0	0.0	
B	0.0	0.0	1.0	0.0	
C	0.0	0.0	0.0	1.0	
D	0.0	0.492	0.508	0.0	
-------- 
x	y	z
A	0.0	1.0	0.0	
B	0.664	0.336	0.0	
C	0.385	0.0	0.615	
D	0.132	0.577	0.291	



In [4]:
# INSPECTION

# INSPECTION TEAM
# Jodi Lee
# Nabil Mohammed

# RESPONSES
# - Work on implementing main function
# - Fix Edge Responsibility Matrix method
# - Write more markdown comments
# - Clean up code
# - Add more docstrings and inline comments

# CORRECTIONS
# - Finished main function
# - Found that I was accidentally leaving out emission probabilities in my method for computing edge responsibility
# - Wrote markdown comments
# - Cleaned code
# - Added more docstrings and inline comments