In [1]:
#!/usr/bin/env python3

########################################################################
# File: problem23.ipynb

# Author: Nicholas Chan
# History: 12/06/2021 Created
########################################################################

# Assignment 7: Problem 23
<br>
For this assignment, we were given input for an emission sequence, emitted symbols, <br>
hidden path, and hidden states. With this information we were to estimate the parameters of the HMM. <br>
These parameters were the transition and emission probabilities of the HMM.


# Params Class
<br>
The Params class stores the emission sequence, emission symbols, hidden path, and hidden <br>
states given as input. From this information, the emission and transition probabilities <br>
are estimated by the computeEmissions() and computeTransitions() methods.

In [2]:
import numpy as np
class Params:
    '''
    Estimates the emission and transition probabilities of an HMM from an
    emitted sequence and the hidden path that produced it.

    Both estimates are stored as dictionaries keyed by the concatenation of
    two labels: state+symbol for emissions and state+state for transitions.
    Every stored probability is rounded to three decimals. The tables can be
    rendered as tab-separated text with printEmTable() and printTrTable().

    Attributes:
        emS: list of emission symbols.
        hS: list of hidden states.
        emSeq: emitted sequence (a string, or any indexable sequence of symbols).
        hP: hidden path (a string, or any indexable sequence of states).
        emissionTable: dict of estimated emission probabilities.
        transitionTable: dict of estimated transition probabilities.

    Note: because keys are plain concatenations, labels whose concatenations
    are ambiguous (e.g. states "A" and "AA") would share table entries.
    '''
    def __init__(self, eSymbols, hStates, eSequence, hPath):
        '''
        Stores the inputs and immediately computes both probability tables.

        Parameters:
            eSymbols: list of emission symbols.
            hStates: list of hidden states.
            eSequence: emitted sequence, one symbol per position.
            hPath: hidden path, one state per position of eSequence.
        '''
        self.emS = eSymbols
        self.hS = hStates
        self.emSeq = eSequence
        self.hP = hPath
        self.emissionTable = self.computeEmissions()
        self.transitionTable = self.computeTransitions()

    def _normalizeRows(self, countTable, colLabels):
        '''
        Converts a count table into row-wise conditional probabilities, in place.

        For every hidden state (row), each count is divided by the row total and
        rounded to three decimals. A row with a total of zero (state never
        observed) is filled with the uniform value 1/len(colLabels) instead, which
        avoids dividing by zero.

        Returns the same dictionary that was passed in.
        '''
        for state in self.hS:
            rowKeys = [state + label for label in colLabels]
            rowSum = sum(countTable[key] for key in rowKeys)
            for key in rowKeys:
                if rowSum == 0: # unobserved state: fall back to a uniform row
                    countTable[key] = np.round(1/len(colLabels), decimals=3)
                else:
                    countTable[key] = np.round(countTable[key]/rowSum, decimals=3)
        return countTable

    def _formatTable(self, table, colLabels):
        '''
        Renders a probability table as tab-separated text: a header line of
        column labels, then one line per hidden state. Every line, including
        the last, ends with a newline.
        '''
        lines = ["\t" + "\t".join(colLabels)]
        for state in self.hS:
            cells = "".join(f"\t{table[state+label]}" for label in colLabels)
            lines.append(f"{state}" + cells)
        return "\n".join(lines) + "\n"

    def computeEmissions(self):
        '''
        Counts how often each hidden state emitted each symbol and converts the
        counts to conditional probabilities P(symbol | state).

        Returns a dict keyed by state+symbol.
        '''
        emCountTable = {state + symbol: np.longdouble(0)
                        for state in self.hS for symbol in self.emS}
        # Position idx of the hidden path is the state that emitted symbol idx.
        for idx, emission in enumerate(self.emSeq):
            emCountTable[self.hP[idx] + emission] += 1
        return self._normalizeRows(emCountTable, self.emS)

    def printEmTable(self):
        '''
        Returns the emission table as tab-separated text (rows are states,
        columns are emission symbols).
        '''
        return self._formatTable(self.emissionTable, self.emS)

    def computeTransitions(self):
        '''
        Counts every pair of consecutive states in the hidden path and converts
        the counts to conditional probabilities P(next state | source state).

        Returns a dict keyed by sourceState+nextState.
        '''
        hsCountTable = {source + target: np.longdouble(0)
                        for source in self.hS for target in self.hS}
        for idx in range(1, len(self.hP)):
            # Build the key from the two path elements rather than slicing the
            # path, so a list path works as well as a string path.
            hsCountTable[self.hP[idx-1] + self.hP[idx]] += 1
        return self._normalizeRows(hsCountTable, self.hS)

    def printTrTable(self):
        '''
        Returns the transition table as tab-separated text (rows are source
        states, columns are destination states).
        '''
        return self._formatTable(self.transitionTable, self.hS)
        

# Main Function
<br>
Parses an input text file into a string for the emission sequence, a list of emission symbols, <br>
a string for the hidden path, and a list of hidden states, which the Params class <br>
requires as input. The input is assumed to contain, in order, the emitted sequence of symbols, <br>
the list of possible emission symbols, the hidden path, and the list of possible hidden states, with a spacer line between consecutive entries.

In [3]:
def main(infile):
    '''
    Parses an input file, estimates the HMM parameters with a Params object,
    and prints the estimated transition and emission tables.

    The file is read as seven lines: emission sequence, spacer, emission
    symbols, spacer, hidden path, spacer, hidden states. The symbol and state
    lines are split on any run of whitespace, so both tab-separated and
    space-separated alphabets are accepted.

    Parameters:
        infile: path of the text file to read.

    Output (stdout): the transition table, a "--------" divider, then the
    emission table.
    '''
    with open(infile, 'r') as myfile:
        eSequence = myfile.readline().rstrip() # emission sequence line
        myfile.readline() # skip spacer line
        eSymbols = myfile.readline().split() # emission symbols line -> list
        myfile.readline() # skip spacer line
        hPath = myfile.readline().rstrip() # hidden state path line
        myfile.readline() # skip spacer line
        hStates = myfile.readline().split() # hidden states line -> list
    # Everything needed has been read; the file is closed before computing.
    myParams = Params(eSymbols, hStates, eSequence, hPath)
    print(myParams.printTrTable(), end = "")
    print("--------")
    print(myParams.printEmTable())
        
if __name__ == "__main__":
    # Entry point: estimate and print the HMM tables for the dataset below.
    # Alternate (example) input, currently disabled:
#     main("data/p23-ex-input.txt")
    main("data/rosalind_ba10h.txt")

	A	B
A	0.51	0.49
B	0.542	0.458
--------
	x	y	z
A	0.385	0.288	0.327
B	0.375	0.292	0.333



In [4]:
# INSPECTION

# INSPECTION TEAM
# Jodi Lee
# Nabil Mohammed

# RESPONSES
# - Work on implementing main function
# - Clean up code
# - Add more docstrings and inline comments

# CORRECTIONS
# - Finished main function
# - Cleaned code
# - Added more docstrings and inline comments