# Rosalind Problem 23

Nicholas Rose

BME 205

Due Date: December 6, 2021

### HMM Parameter Estimation Problem:

Given: A sequence of emitted symbols x = x1 . . . xn in an alphabet Σ and a path π = π1 . . . πn generated by a k-state HMM with unknown transition and emission probabilities.

Return: A matrix of transition probabilities Transition and a matrix of emission probabilities Emission that maximize Pr(x,π) over all possible matrices of transition and emission probabilities.

In [1]:
import numpy as np
import itertools

In [2]:
class InfileParse():
    '''
    Reader for an HMM parameter estimation input file.
    The file is consumed as seven lines: four data lines,
    each pair separated by one line that is skipped unread.
    Attributes after construction:
    self.x (string) - first data line, stripped,
    self.char (string) - second data line with tabs/spaces removed,
    self.pi (string) - third data line, stripped,
    self.states (string) - fourth data line with tabs/spaces removed.
    Because self.char and self.states end up as strings, iterating
    them yields one character per symbol / state.
    '''

    def __init__(self, infile):

        # Placeholders, overwritten below once the file has been read.
        self.x, self.pi = '', ''
        self.char, self.states = [], []

        # Translation table that deletes every tab and space character.
        blanks = str.maketrans('', '', '\t ')

        with open(infile) as handle:
            emitted = handle.readline()
            next(handle)    # separator line, content ignored
            alphabet = handle.readline()
            next(handle)    # separator line, content ignored
            path = handle.readline()
            next(handle)    # separator line, content ignored
            labels = handle.readline()

        self.x = emitted.strip()
        self.char = alphabet.strip().translate(blanks)
        self.pi = path.strip()
        self.states = labels.strip().translate(blanks)

In [3]:
class TransitionTable():
    '''
    Object 'TransitionTable'.
    Built from a hidden path (pi) and the states that
    may occur in it (states).
    Attributes:
    self.states - the states exactly as passed in,
    self.stateCount (dictionary) - for each state, how many
        times it occurs in pi with a successor (i.e. the last
        position of pi is not counted),
    self.transitions (dictionary) - for each two-character key
        'from' + 'to', how many times that adjacent pair occurs in pi.
    Method(s):
    matrix(self).
    '''

    def __init__(self, pi, states):

        self.states = states
        # Every state starts at zero so unvisited states are still present.
        self.stateCount = dict.fromkeys(states, 0)
        # One zeroed entry per ordered pair, in row-major order.
        self.transitions = {
            src + dst: 0
            for src, dst in itertools.product(''.join(states), repeat=2)
        }

        # Walk every position that has a successor.
        for start in range(len(pi) - 1):
            self.stateCount[pi[start]] += 1
            self.transitions[pi[start:start + 2]] += 1

    def matrix(self):
        '''
        Turn the stored counts into transition probabilities.
        Each pair count is divided by the count of its source state;
        a source state that was never counted gets a uniform row.
        Returns a numpy array of shape (len(states), len(states)),
        in the insertion order of self.transitions.
        '''

        size = len(self.states)
        flat = []
        for pair, seen in self.transitions.items():
            outgoing = self.stateCount[pair[0]]
            if outgoing == 0:
                # No observations for this source state: spread evenly.
                flat.append(1 / size)
            else:
                flat.append(seen / outgoing)

        return np.asarray(flat).reshape((size, size))

In [4]:
class EmissionTable():
    '''
    Object 'EmissionTable'.
    Built from an emitted sequence (x), the symbols that may
    occur in it (char), a hidden path (pi) and the states that
    may occur in the path (states).
    Attributes:
    self.states - the states exactly as passed in,
    self.char - the symbols exactly as passed in,
    self.stateCount (dictionary) - occurrences of each state in pi,
    self.emissions (dictionary) - for each key 'state' + 'symbol',
        how many positions pair that state with that symbol.
    Method(s):
    matrix(self).
    '''

    def __init__(self, x, char, pi, states):

        self.states = states
        self.char = char
        # Every state starts at zero so unvisited states are still present.
        self.stateCount = dict.fromkeys(states, 0)
        # One zeroed entry per (state, symbol) pair, in row-major order.
        self.emissions = {
            label + symbol: 0
            for label in self.states
            for symbol in char
        }

        # Position k of the path is paired with position k of the sequence.
        for position, label in enumerate(pi):
            self.stateCount[label] += 1
            self.emissions[label + x[position]] += 1

    def matrix(self):
        '''
        Turn the stored counts into emission probabilities.
        Each (state, symbol) count is divided by the count of its
        state; a state that was never counted gets a uniform row.
        Returns a numpy array of shape (len(states), len(char)),
        in the insertion order of self.emissions.
        '''

        rows, cols = len(self.states), len(self.char)
        flat = []
        for key, seen in self.emissions.items():
            visits = self.stateCount[key[0]]
            if visits == 0:
                # No observations for this state: spread evenly.
                flat.append(1 / cols)
            else:
                flat.append(seen / visits)

        return np.asarray(flat).reshape((rows, cols))

## Main

In [7]:
def main(infile, outfile='rosalind_23.txt.out'):
    '''
    The main method. Parses 'infile' with InfileParse, builds the
    transition and emission matrices with TransitionTable and
    EmissionTable, and emits both as tab-separated tables divided
    by a '--------' line.
    The report is written to 'outfile' (default
    'rosalind_23.txt.out') and also printed to standard output.
    Each header row starts with one empty cell so that the column
    labels sit above their data columns (data rows start with the
    state label).
    '''

    data = InfileParse(infile)
    transTable = TransitionTable(data.pi, data.states).matrix()
    emTable = EmissionTable(data.x, data.char, data.pi, data.states).matrix()

    def tabbed(*cells):
        # str() gives the same text print() would produce for each cell.
        return '\t'.join(str(cell) for cell in cells)

    # Build the report once so the file and the console cannot drift apart.
    lines = [tabbed('', *data.states)]
    lines.extend(tabbed(label, *row)
                 for label, row in zip(data.states, transTable))
    lines.append('--------')
    lines.append(tabbed('', *data.char))
    lines.extend(tabbed(label, *row)
                 for label, row in zip(data.states, emTable))
    report = '\n'.join(lines)

    with open(outfile, 'w') as out:
        print(report, file=out)
    print(report)
    
    
# Entry point: runs main() on a single dataset file.
# NOTE(review): the input path is an absolute path on one specific
# machine; point it at your own dataset before running elsewhere.
if __name__ == "__main__":
    main('/home/nick_rose/Downloads/rosalind_ba10h (2).txt')

A	B
A	0.40476190476190477	0.5952380952380952
B	0.43859649122807015	0.5614035087719298
--------
		x	y	z
A	0.30952380952380953	0.2857142857142857	0.40476190476190477
B	0.3620689655172414	0.2413793103448276	0.39655172413793105
