In [1]:
#!/usr/bin/env python3

########################################################################
# File: problem20.ipynb

# Author: Nicholas Chan
# History: 11/25/2021 Created
########################################################################

# Assignment 6: Problem 20
<br>
For this assignment problem I found the conditional probability of <br>
an emission sequence occurring given some path of hidden states. <br>
This was done by taking the product of all emission probabilities <br>
throughout the hidden path and emission sequence (both are the same length). <br>
Each emission is already a conditional probability of some symbol being <br>
emitted given some hidden state, so finding the same conditional <br>
probability over the entire emission sequence required the product of <br>
all such individual conditional probabilities. <br>

# EmissionSeqProb Class
<br>
The EmissionSeqProb class creates objects that store a path of hidden states,
an emission sequence, and the emission table of some HMM. With these, an EmissionSeqProb object
can call its computeProb method to compute the conditional probability of the emission
sequence occurring given the hidden path.

In [3]:
import numpy as np
# use long double floating point type for all computation
# particularly, initialize the emission and transition tables to this
class EmissionSeqProb:
    '''
    Conditional probability of an emission sequence given a hidden path.

    An EmissionSeqProb object stores a hidden path, an emission sequence, and
    an emission table. Its computeProb method multiplies together the emission
    probability looked up for every (state, symbol) position of the two
    sequences.
    '''
    def __init__(self, hiddenPath, emissionSeq, emTable):
        '''
        Stores the three inputs as given (no copying, no validation).

        hiddenPath  -- sequence of hidden states, one entry per position
        emissionSeq -- sequence of emitted symbols, one entry per position
        emTable     -- mapping whose keys are the concatenation state+symbol
                       and whose values are emission probabilities
        '''
        self.hP = hiddenPath # stores path of states
        self.emS = emissionSeq # stores sequence of emitted symbols
        self.emT = emTable # stores emission table, keys are of the form: {state}+{symbol}

    def computeProb(self):
        '''
        Computes the probability of the emission sequence given the hidden path.

        Walks the hidden path and the emission sequence in lockstep, looks up
        each state+symbol key in the emission table, and returns the product
        of all looked-up probabilities (np.prod, so empty inputs give 1.0).

        Raises ValueError if the hidden path and the emission sequence differ
        in length, since positions could not be paired one-to-one and the
        result would otherwise be silently computed on a truncated sequence.
        Raises KeyError if a state+symbol key is missing from the emission table.
        '''
        if len(self.hP) != len(self.emS):
            raise ValueError(
                "hidden path and emission sequence must have the same length "
                "(got {} and {})".format(len(self.hP), len(self.emS))
            )
        # One emission probability per position; key is "{state}+{symbol}" concatenated
        emissionProbs = [self.emT[state + symbol] for state, symbol in zip(self.hP, self.emS)]
        return np.prod(emissionProbs) # product of the per-position emission probabilities

# Main Function
<br>
Parses an input text file into the hidden path, emission sequence, and emission table that the EmissionSeqProb class requires as input. <br>
Input is assumed to contain an emitted sequence of symbols, a list of possible emission symbols, <br>
a path of hidden states, a list of possible hidden states, and an emission table. 

In [13]:
import numpy as np
# use long double floating point type for all computation
# particularly, initialize the emission and transition tables to this
def main(infile, outfile='', inCL=None):
    '''
    Parses an HMM input file, computes the probability of the emission
    sequence given the hidden path, prints it to stdout, and returns it.

    Expected layout of infile (every other line is a spacer that is skipped):
        line 1: emission sequence string
        line 3: tab-separated list of emission symbols
        line 5: hidden path string
        line 7: tab-separated list of hidden states
        line 9: header row of the emission table (skipped)
        rest  : one tab-separated row per hidden state; the first field is the
                row label and the remaining fields are the probabilities, in
                the same order as the emission symbols

    infile  -- path of the input file
    outfile -- optional path; when non-empty, the formatted probability is
               also written there as a single line
    inCL    -- unused; kept so existing callers keep working

    Returns the probability as computed by EmissionSeqProb.computeProb.
    '''
    with open(infile, 'r') as myfile:
        emSeq = myfile.readline().rstrip()  # READS EMISSION SEQUENCE STRING
        myfile.readline() # Clears over spacer line
        emSymbols = myfile.readline().rstrip().split('\t') # READS LIST OF EMISSION SYMBOLS
        myfile.readline() # Clears over spacer line
        hPath = myfile.readline().rstrip() # READS HIDDEN PATH STRING
        myfile.readline() # Clears over spacer line
        hStates = myfile.readline().rstrip().split('\t') # READS LIST OF HIDDEN STATES
        myfile.readline() # Clears over spacer line
        myfile.readline() # Clears over the table's header row of emission symbols

        # EMISSION TABLE PARSING
        emRows = []
        for line in myfile:
            emInRow = line.rstrip()
            if not emInRow:
                continue # blank lines would otherwise become empty rows and shift row/state alignment
            emRows.append(np.longdouble(emInRow.split('\t')[1:])) # drop the row label, keep probabilities

    # The file is fully read at this point, so everything below runs with it closed.
    emTable = dict()
    for stateIdx, state in enumerate(hStates): # Iterate over hidden states and their idxes
        for symbolIdx, symbol in enumerate(emSymbols): # Iterate over symbols and their idxes
            # Populate emission table probabilities where each key is a string: "{state}+{symbol}"
            emTable[state+symbol] = np.longdouble(emRows[stateIdx][symbolIdx])

    myESP = EmissionSeqProb(hPath, emSeq, emTable)
    prob = myESP.computeProb()
    formatted = np.format_float_scientific(prob, 11)
    print(formatted)
    if outfile: # honor the optional output path instead of silently ignoring it
        with open(outfile, 'w') as out:
            out.write(formatted + '\n')
    return prob
        
if __name__ == "__main__":
    # Alternative input file, kept for reference:
    #     main("data/rosalind_ba10b.txt")
    main(infile="testCases/test cases/20.txt")


6.53119591630e-36


In [4]:
# INSPECTION

# INSPECTION TEAM
# Jodi Lee
# Nabil Mohammed

# RESPONSES
# - Work on implementing main function
# - Write more markdown comments
# - Clean up code
# - Add more docstrings and inline comments

# CORRECTIONS
# - Finished main function
# - Wrote markdown comments
# - Cleaned code
# - Added more docstrings and inline comments