In [None]:
#!/usr/bin/env python3

########################################################################
# File: problem21.ipynb

# Author: Nicholas Chan
# History: 11/25/2021 Created
########################################################################

# Assignment 6: Problem 21
<br>
For this assignment, we were given input for an emission sequence, hidden states, <br>
emitted symbols, a transition table, and an emission table. With this information <br>
we were to compute the most likely path which generated the sequence of emission  <br>
symbols observed from input.

# Viterbi Class
<br>
The Viterbi class creates an object that stores the hidden states, emitted symbols, <br>
emission sequence, transition table, and emission table read from input. With this, <br>
a Viterbi Class object can call on the method computePath to compute the most <br>
likely path that yielded our observed sequence of emissions. <br>

In [1]:
import numpy as np
# use long double floating point type for all computation
# particularly, initialize the emission and transition tables to this
class Viterbi:
    '''
    Viterbi decoder for a hidden Markov model (HMM).

    An object stores a list of hidden states, a list of emission symbols, an observed
    emission sequence, a transition table, and an emission table. The computePath method
    returns the most likely path of hidden states for the emission sequence, i.e. the path
    pi that maximizes Pr(x, pi), the joint probability of the emission sequence x and the path.
    '''
    def __init__(self, hStates, eSymbols, emissionSeq, trTable, emTable):
        '''
        Store the HMM description and the observed sequence.

        hStates     - list of hidden state names (strings)
        eSymbols    - list of emission symbols (strings)
        emissionSeq - observed emission sequence; element i is appended to a state name
                      to build the emission-table key for column i
        trTable     - dict mapping "{fromState}{toState}" to a transition probability
        emTable     - dict mapping "{state}{symbol}" to an emission probability
        '''
        self.hS = hStates # List of hidden states
        self.eS = eSymbols # List of emitted symbols
        self.emS = emissionSeq # stores sequence of emitted symbols
        self.trT = trTable # stores transition table parsed from input
        self.emT = emTable # stores emission table parsed from input

    def maxState(self, scores):
        '''
        Return [{maximal hidden state}, {corresponding maximal score}].

        scores - one score per hidden state, in the same order as the hidden state list.
        The first maximal entry wins when several scores are tied.
        '''
        return [self.hS[np.argmax(scores)], max(scores)]

    def computePath(self):
        '''
        Return the most likely hidden path for the stored emission sequence.

        Score recurrence: Score(k, i) = max over l of Score(l, i-1) * emission(k, x_i) * transition(l, k)

        The first column uses a previous score of 1 and replaces the transition probability
        with 1/|states| for every state. For each later column, every state keeps only its
        maximal score together with the index of the previous-column state that produced it
        (ties go to the earliest state in the hidden state list). After the last column the
        state with the highest score is selected and the stored indices are followed back
        to the first column.

        The path is assembled as a list of whole state names and joined at the end, so state
        names longer than one character are returned intact.

        Returns: the concatenated state names of the path, or "" for an empty emission sequence.
        Raises:  ValueError if the hidden state list is empty; KeyError if a needed
                 transition/emission key is missing from the tables.
        '''
        if len(self.emS) == 0: # Nothing was emitted, so the hidden path is empty as well
            return ""
        if len(self.hS) == 0:
            raise ValueError("cannot compute a hidden path without hidden states")

        stateIdxs = range(len(self.hS))
        startProb = 1/len(self.hS) # Uniform transition out of the source node

        # Scores of the first column (nodes directly after the source node)
        prevScores = [startProb*self.emT[state + self.emS[0]] for state in self.hS]

        # backPointers[i-1][k] is the index of the best predecessor of state k in column i
        backPointers = []
        for i in range(1, len(self.emS)):
            symbol = self.emS[i]
            currScores = []
            currPointers = []
            for state in self.hS:
                currEmission = self.emT[state + symbol] # Same for every predecessor, so looked up once
                # Candidate scores of reaching `state` from each state of the previous column
                candidates = [prevScores[prevIdx] * currEmission * self.trT[prevState + state]
                              for prevIdx, prevState in enumerate(self.hS)]
                bestPrevIdx = max(stateIdxs, key = candidates.__getitem__) # max keeps the first maximum
                currScores.append(candidates[bestPrevIdx])
                currPointers.append(bestPrevIdx)
            backPointers.append(currPointers)
            prevScores = currScores

        # Start backtracking from the best scoring state of the last column
        pathIdxs = [max(stateIdxs, key = prevScores.__getitem__)]
        for pointers in reversed(backPointers):
            pathIdxs.append(pointers[pathIdxs[-1]])
        pathIdxs.reverse() # Backtracking collected the path from last column to first

        return "".join(self.hS[stateIdx] for stateIdx in pathIdxs)


# Main Function
<br>
Parses an input text file into the arguments that the Viterbi class requires as input. <br>
Input is assumed to contain an emitted sequence of symbols, a list of possible hidden states, <br>
a list of possible emission symbols, a transition table, and an emission table. 

In [3]:
def main(infile, outfile='', inCL=None):
    '''
    Parse an HMM description from a text file, compute the most likely hidden path with a
    Viterbi class object, print it, and return it.

    Expected layout of infile (fields within a line are tab separated):
        line 1                 emission sequence string
        spacer line
        emission symbols
        spacer line
        hidden states
        spacer line
        transition table       header row, then one row per state: {state} {prob} {prob} ...
        spacer line            (a line starting with '-')
        emission table         header row, then one row per state up to the end of the file

    infile  - path of the input text file
    outfile - optional path; when non-empty the path is also written there followed by a newline
    inCL    - not used by this function; kept so existing calls keep working

    Returns: the most likely hidden path as a string.
    Raises:  ValueError if the file ends before the spacer line that closes the transition table.
    '''
    with open(infile,'r') as myfile:
        emSeq = myfile.readline().rstrip()  # READS EMISSION SEQUENCE STRING
        myfile.readline() # Clears over spacer line
        emSymbols = myfile.readline().rstrip().split('\t') # Reads list of emitted symbols
        myfile.readline() # Clears over spacer line
        hStates = myfile.readline().rstrip().split('\t') # Reads list of hidden states
        myfile.readline() # Clears over spacer line
        myfile.readline() # Clears over header row of the transition table

        # TRANSITION TABLE PARSING
        trRows = []
        while True:
            rawLine = myfile.readline()
            if not rawLine: # readline returns '' only at end of file; without this check the loop never ends
                raise ValueError("input ended before the spacer line that closes the transition table")
            trInRow = rawLine.rstrip()
            # Data rows begin with a state name, so only a leading '-' marks the spacer;
            # a '-' inside a row (e.g. a value written as 1e-05) must not end the table
            if trInRow.startswith('-'):
                break
            trRows.append(np.longdouble(trInRow.split('\t')[1:])) # Drop the row label, keep the probabilities

        myfile.readline() # Clears over header row of the emission table

        # EMISSION TABLE PARSING
        emRows = [np.longdouble(line.rstrip().split('\t')[1:]) for line in myfile]

    # Transition probabilities where each key is a string: "{state1}{state2}"
    trTable = dict()
    for stateIdx1, state1 in enumerate(hStates): # Iterate over source states and their idxes
        for stateIdx2, state2 in enumerate(hStates): # Iterate over destination states and their idxes
            trTable[state1+state2] = np.longdouble(trRows[stateIdx1][stateIdx2])

    # Emission probabilities where each key is a string: "{state}{symbol}"
    emTable = dict()
    for stateIdx, state in enumerate(hStates): # Iterate over hidden states and their idxes
        for symbolIdx, symbol in enumerate(emSymbols): # Iterate over symbols and their idxes
            emTable[state+symbol] = np.longdouble(emRows[stateIdx][symbolIdx])

    myVit = Viterbi(hStates, emSymbols, emSeq, trTable, emTable)
    path = myVit.computePath()
    print(path)
    if outfile: # Default '' means "print only"
        with open(outfile, 'w') as outHandle:
            outHandle.write(path + '\n')
    return path
        
# Entry point: decode the dataset stored at data/rosalind_ba10c.txt (path is relative to the
# current working directory) and print the resulting hidden path.
if __name__ == "__main__":
    main("data/rosalind_ba10c.txt")


BBBBBBBBBBBBAAABBBBBAAABBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB


In [None]:
# INSPECTION

# INSPECTION TEAM
# Jodi Lee
# Nabil Mohammed

# RESPONSES
# - Work on implementing main function
# - Fix Backtracking algorithm, slightly off for some reason
# - Write more markdown comments
# - Clean up code
# - Add more docstrings and inline comments

# CORRECTIONS
# - Finished main function
# - Fixed backtracking
# - Wrote markdown comments
# - Cleaned code
# - Added more docstrings and inline comments