In [1]:
#!/usr/bin/env python3

########################################################################
# File: problem24.ipynb

# Author: Nicholas Chan
# History: 12/06/2021 Created
########################################################################

# Assignment 7: Problem 24
<br>
For this assignment, we were given an emission sequence, the emitted symbols, the <br>
hidden states, an initial transition table, and an initial emission table. With this <br>
information we compute the Node Responsibility Matrix, which contains the <br>
conditional probabilities of paths passing through any particular state at a particular <br>
time. The computed Node Responsibility Matrix is a probabilistic way of estimating a <br>
hidden path for an HMM. <br>


# SoftDecode Class
<br>
The SoftDecode class stores the emission sequence, emission symbols, hidden states, <br>
transition table, and emission table given as input. With this information, we compute <br>
a matrix of Forward probabilities and a matrix of Backward probabilities. Using the <br>
Forward and Backward matrices, we can then compute the Node Responsibility Matrix as <br>
$\Pr(\pi_i = k \mid x) = \dfrac{\text{forward}_k(i)\cdot\text{backward}_k(i)}{\Pr(x)}$. <br>

In [2]:
import numpy as np
# use long double floating point type for all computation
# particularly, initialize the emission and transition tables to this
class SoftDecode:
    '''
    The SoftDecode Class stores the description of an HMM together with an
    emitted sequence and computes the Node Responsibility Matrix for it.

    The Forward and Backward probabilities of every hidden state at every
    emission index are computed first; the Forward-Backward algorithm then
    combines them into Pr(pi_i = k | x) for every state k and index i.

    Every matrix is stored as a list of columns (one per emission index);
    each column is a list of [value, state] pairs in the order of the
    hidden-state list.
    '''
    def __init__(self, hStates, eSymbols, emissionSeq, trTable, emTable):
        '''
        SoftDecode objects are initialized with a list of hidden states, a
        list of emitted symbols, a sequence of emitted symbols, a transition
        table dictionary keyed by state + state, and an emission table
        dictionary keyed by state + symbol.
        The Forward matrix, the Backward matrix, Pr(x) and the Node
        Responsibility Matrix are all computed on initialization.
        '''
        self.hS = hStates # List of hidden states
        self.eS = eSymbols # List of emitted symbols
        self.emS = emissionSeq # Sequence of emitted symbols
        self.revEmS = emissionSeq[::-1] # Reverse of emission sequence, used for Backward
        self.trT = trTable # Transition table, keyed by prevState + nextState
        self.emT = emTable # Emission table, keyed by state + symbol
        self.fMat = self.computeForwardMatrix() # Forward matrix
        self.bMat = self.computeBackwardMatrix() # Backward matrix
        self.prob = self.computeSeqProb() # Emission sequence probability Pr(x)
        self.nrMat = self.nodalResponsibilityMatrix() # Node Responsibility Matrix

    def computeForwardMatrix(self):
        '''
        Computes the Forward value of every hidden state at every index of
        the emission sequence.

        The first column uses 1/|states| as the transition out of the source
        node, so each entry is that value times the emission probability of
        the first symbol. Every later entry is the sum, over all states l of
        the previous column, of forward(l) * emission(k, symbol) *
        transition(l -> k).

        Returns a list of columns of [forward, state] pairs; the list is
        empty when the emission sequence is empty.
        '''
        if not self.emS: # Nothing was emitted, so there are no columns to fill
            return []
        firstScores = [[(1/len(self.hS))*self.emT[state + self.emS[0]], state] for state in self.hS]
        allForwardList = [firstScores] # One sublist per column of the HMM graph
        for symbol in self.emS[1:]:
            prevForward = allForwardList[-1] # Forwards of the previous column
            forwardCol = []
            for state in self.hS:
                emission = self.emT[state + symbol] # Same for every previous state, so looked up once
                total = sum(prevScore * emission * self.trT[prevState + state]
                            for prevScore, prevState in prevForward)
                forwardCol.append([total, state])
            allForwardList.append(forwardCol)
        return allForwardList

    def computeBackwardMatrix(self):
        '''
        Computes the Backward value of every hidden state at every index of
        the emission sequence.

        The last column is initialized to 1 for every state. Moving towards
        the start of the sequence, each entry is the sum, over all states l
        of the following column, of backward(l) * emission(l, next symbol) *
        transition(k -> l).

        Returns a list of columns of [backward, state] pairs in emission
        order; the list is empty when the emission sequence is empty.
        '''
        if not self.emS: # Keep the column count equal to the Forward matrix
            return []
        lastScores = [[1, state] for state in self.hS] # Column of the final emission
        allBackwardList = [lastScores] # Built from the end of the sequence backwards
        for nextSymbol in self.revEmS[:-1]: # Symbol emitted by the column already computed
            nextBackward = allBackwardList[-1]
            backwardCol = []
            for state in self.hS:
                total = sum(nextScore * self.emT[nextState + nextSymbol] * self.trT[state + nextState]
                            for nextScore, nextState in nextBackward)
                backwardCol.append([total, state])
            allBackwardList.append(backwardCol)
        return allBackwardList[::-1] # Flip so that column i matches emission index i

    def computeSeqProb(self):
        '''
        Computes Forward(sink) = Pr(x) = probability of emitting sequence x = sum of Forwards
        from the last column of the Forward matrix. An empty emission sequence
        has no columns and is treated as having probability 1.
        '''
        if not self.fMat:
            return 1.0
        return sum(score for score, _ in self.fMat[-1])

    def nodalResponsibilityMatrix(self):
        '''
        Returns the matrix of conditional probabilities Pr(pi_i=k | x), where
        k ranges over the hidden states and i over the indices of the emission
        sequence x. Each entry is forward * backward / Pr(x).

        The division is not guarded: a sequence whose Pr(x) is zero has no
        defined responsibilities.
        '''
        return [
            [[fwd * bwd / self.prob, state] for (fwd, state), (bwd, _) in zip(fCol, bCol)]
            for fCol, bCol in zip(self.fMat, self.bMat)
        ]

    def printNRMat(self):
        '''
        Returns string representation of nodal responsibility matrix: a
        tab-separated header of the hidden states followed by one line per
        emission index with the values rounded to 4 decimals. Every value,
        including the last one of a row, is followed by a tab.
        '''
        lines = ["\t".join(self.hS) + "\n"]
        for col in self.nrMat:
            cells = "".join(str(np.round(value, decimals=4)) + "\t" for value, _ in col)
            lines.append(cells + "\n")
        return "".join(lines)
        
                
        


# Main Function
<br>
Parses an input text file into the emission sequence string, the list of emission symbols, <br>
the list of hidden states, the transition table, and the emission table, which the SoftDecode class <br>
requires as input. Input is assumed to contain an emitted sequence of symbols, a list of <br>
possible emission symbols, a list of possible hidden states, a transition table, and an emission table, <br>
in that order, with a separator line of dashes between consecutive sections.

In [3]:
def main(infile, outfile='', inCL=None):
    '''
    The main function parses input text to find an emission sequence string, a list
    of emitted symbols, a list of hidden states, a transition table, and an emission
    table. With this input we can generate the Node Responsibility Matrix which contains
    the conditional probabilities of paths passing through any particular state at a
    particular time. The matrix is printed to standard output.

    infile  -- path of the tab-separated input file. Sections are expected in the
               order: emission sequence, emission symbols, hidden states,
               transition table, emission table, each pair separated by a line
               that starts with '-'.
    outfile -- accepted for interface compatibility; currently unused.
    inCL    -- accepted for interface compatibility; currently unused.
    '''
    with open(infile, 'r') as myfile:
        emSeq = myfile.readline().rstrip() # Emission sequence string
        myfile.readline() # Clears over spacer line
        emSymbols = myfile.readline().rstrip().split('\t') # List of emitted symbols
        myfile.readline() # Clears over spacer line
        hStates = myfile.readline().rstrip().split('\t') # List of hidden states
        myfile.readline() # Clears over spacer line
        myfile.readline() # Clears over header line of the transition table

        # TRANSITION TABLE PARSING
        # iter(readline, '') stops at end of file, so a file that ends before
        # the next spacer line can no longer make this loop spin forever.
        trRows = []
        for line in iter(myfile.readline, ''):
            row = line.rstrip()
            if row.lstrip().startswith('-'): # Spacer line ends the table; a '-' inside a value (e.g. 1e-05) does not
                break
            trRows.append(np.longdouble(row.split('\t')[1:])) # Drop the row label, keep the probabilities

        myfile.readline() # Clears over header line of the emission table

        # EMISSION TABLE PARSING
        # Every remaining line is treated as a row of the emission table.
        emRows = [np.longdouble(line.rstrip().split('\t')[1:]) for line in myfile]

    # Transition probabilities keyed by the string state1 + state2
    trTable = {
        state1 + state2: np.longdouble(trRows[rowIdx][colIdx])
        for rowIdx, state1 in enumerate(hStates)
        for colIdx, state2 in enumerate(hStates)
    }
    # Emission probabilities keyed by the string state + symbol
    emTable = {
        state + symbol: np.longdouble(emRows[rowIdx][colIdx])
        for rowIdx, state in enumerate(hStates)
        for colIdx, symbol in enumerate(emSymbols)
    }

    mySD = SoftDecode(hStates, emSymbols, emSeq, trTable, emTable)
    print(mySD.printNRMat())
        
if __name__ == "__main__":
    # Runs on the Rosalind BA10J dataset; pass "data/p24-simple-input.txt"
    # instead to try the small sample input.
    main(infile="data/rosalind_ba10j.txt")


A	B	C	D
0.3484	0.2459	0.1313	0.2744	
0.3199	0.1124	0.3579	0.2098	
0.2285	0.1175	0.3004	0.3536	
0.2869	0.3267	0.0706	0.3158	
0.3323	0.4511	0.0533	0.1634	
0.3123	0.5141	0.0913	0.0823	
0.3098	0.5088	0.0864	0.095	
0.313	0.5063	0.0873	0.0935	
0.3287	0.4879	0.0905	0.0929	
0.3415	0.4399	0.1243	0.0942	



In [4]:
# INSPECTION

# INSPECTION TEAM
# Jodi Lee
# Nabil Mohammed

# RESPONSES
# - Work on implementing main function
# - Write more markdown comments
# - Fix Forward-Backward Algorithm 
# - Add more docstrings and inline comments

# CORRECTIONS
# - Finished main function
# - Wrote markdown comments
# - Fixed Forward-Backward Algorithm implementation
# - Added more docstrings and inline comments