# Rosalind problem25

Nicholas Rose

BME 205

Due Date: December 6, 2021

### Viterbi learning:

Given: A sequence of emitted symbols x = x1 ... xn in an alphabet A, generated by a k-state HMM with unknown transition and emission probabilities, together with initial Transition and Emission matrices and a number of iterations i.

Return: A matrix of transition probabilities Transition and a matrix of emission probabilities Emission that together maximize Pr(x, π) over all possible transition and emission matrices and over all hidden paths π.

In [2]:
import math
import numpy as np
import itertools

In [4]:
class InfileParse():
    '''
    This class object parses a given infile.
    The file is read as: iteration count, emitted string, alphabet,
    states, transition table and emission table, with one divider
    line between consecutive sections. Each table is a header line
    followed by one row per state.
    Includes attributes:
    self.i (string, the iteration count exactly as read),
    self.x (string),
    self.char (string, tabs and spaces removed),
    self.states (string, tabs and spaces removed),
    self.transTable (list of token lists, header first),
    self.emTable (list of token lists, header first)
    '''
    
    def __init__(self, infile):
        '''
        Read infile and populate the attributes listed on the class.
        Blank lines in the table section are ignored so that a
        trailing empty line does not become an empty row of
        self.emTable (which createTable cannot size).
        '''
        
        self.i = 0
        self.x = ''
        self.char = []
        self.states = []
        self.transTable = []
        self.emTable = []
        
        with open(infile) as f:
            self.i = f.readline().strip()
            next(f)  # divider line
            self.x = f.readline().strip()
            next(f)  # divider line
            self.char = f.readline().strip().replace("\t", "").replace(" ", "")
            next(f)  # divider line
            self.states = f.readline().strip().replace("\t", "").replace(" ", "")
            next(f)  # divider line
            tableLines = [line.strip().split() for line in f if line.strip()]
        
        # Each table occupies a header plus one row per state; the single
        # line between the two tables is the divider and is dropped.
        rowsPerTable = len(self.states) + 1
        self.transTable = tableLines[:rowsPerTable]
        self.emTable = tableLines[rowsPerTable + 1:]
    
    
    def createTable(self, table):
        '''
        Method initialized by a list (table)
        of tokenized file lines, representing the header and rows
        of a matrix. The header (first entry) and the row label
        (first token of every row) are discarded; the remaining
        tokens are converted to float.
        Returns a numpy array containing table data.
        '''
        
        rows = [[float(value) for value in line[1:]] for line in table[1:]]
        if not rows:
            # No data rows: return an empty matrix whose column count is
            # derived from the last (only) line of the table.
            return np.zeros((0, len(table[-1]) - 1))
        
        return np.array(rows, dtype=float)

In [242]:
class Viterbi():
    '''
    Object 'Viterbi'.
    Runs the Viterbi dynamic program for an emitted string (x) over
    an alphabet (char) and a collection of hidden states (states).
    Includes attributes:
    self.x, self.char, self.states (stored as passed in),
    self.nodes (numpy array with len(x) rows and len(states) columns
    holding the best score ending in each state at each position),
    self.edges (list with one dictionary per position after the first,
    mapping each state to the list of candidate scores coming from
    every previous state)
    and methods:
    self.algorithm(self, transitionTable, emissionTable) and
    self.hiddenPath(self, states)
    '''
    
    def __init__(self, x, char, states):
    
        self.x = x
        self.char = char
        self.states = states
        self.nodes = np.zeros((len(x), len(states)))
        self.edges = []
        
       
    def algorithm(self, transitionTable, emissionTable):
        '''
        Method which accepts a transition table and an emission
        table (both indexable as table[row][column]) and fills
        self.nodes and self.edges for string self.x.
        The first position gives every state the start weight
        1 / len(self.states) times its emission probability. Every
        later position keeps, for each state, the maximum over all
        previous states of (previous score * transition * emission).
        Returns nothing; results are read through self.nodes,
        self.edges and self.hiddenPath.
        NOTE: scores are plain products of probabilities (no log
        transform), so they shrink with the length of self.x.
        '''
        
        # Start from a clean slate so that calling this method again on the
        # same object does not leave stale columns for the backtrack.
        self.edges = []
        stateCount = len(self.states)
        
        for i, nucleo in enumerate(self.x):
            symbol = self.char.index(nucleo)  # looked up once per position
            if i == 0:
                start = 1 / stateCount
                for j in range(stateCount):
                    self.nodes[i][j] = start * emissionTable[j][symbol]
                continue
            
            rowEdges = {}
            for j, state in enumerate(self.states):
                emit = emissionTable[j][symbol]
                candidates = [self.nodes[i - 1][k] * transitionTable[k][j] * emit
                              for k in range(stateCount)]
                rowEdges[state] = candidates
                self.nodes[i][j] = max(candidates)
            self.edges.append(rowEdges)
                
    
    def hiddenPath(self, states):
        '''
        Method that accepts a list of states,
        and returns the path read back from self.nodes and self.edges:
        it starts at the best-scoring state of the last position and,
        walking the positions backwards, follows the previous state
        with the highest candidate score (first one on ties).
        Returns an empty string when self.x is empty.
        '''
        
        if len(self.nodes) == 0:
            return ''
        
        last = self.states[np.argmax(self.nodes[-1])]
        path = [last]
        
        for column in reversed(self.edges):
            scores = column[last]
            last = states[scores.index(max(scores))]
            path.append(last)
        
        # The path was collected from the end of x to its start.
        return ''.join(reversed(path))

In [243]:
class TransitionTable():
    '''
    Object 'TransitionTable'.
    Built from a hidden path (pi) and the states that may occur
    in it (states); counting happens while the object is created.
    Includes attributes:
    self.states (as passed in),
    self.stateCount (dictionary: state -> number of positions of pi,
    excluding the last one, that hold the state),
    self.transitions (dictionary: two-state string -> number of times
    that pair occurs at adjacent positions of pi).
    Includes method(s):
    matrix(self).
    '''
    
    def __init__(self, pi, states):
        
        self.states = states
        self.stateCount = {state: 0 for state in states}
        self.transitions = {
            source + target: 0
            for source, target in itertools.product(''.join(states), repeat=2)
        }
        
        # Every position except the last one starts a transition.
        for pos in range(len(pi) - 1):
            self.stateCount[pi[pos]] += 1
            self.transitions[pi[pos:pos + 2]] += 1
            
            
    def matrix(self):
        '''
        Method that turns the stored counts into probabilities:
        each transition count is divided by the count of its source
        state. A source state that was never counted gets the uniform
        value 1 / len(self.states) in every column.
        Returns a numpy array with len(self.states) rows and columns.
        '''
        
        size = len(self.states)
        probabilities = []
        for pair, count in self.transitions.items():
            sourceTotal = self.stateCount[pair[0]]
            if sourceTotal == 0:
                probabilities.append(1 / size)
            else:
                probabilities.append(count / sourceTotal)
        
        return np.reshape(probabilities, (size, size))

In [244]:
class EmissionTable():
    '''
    Object 'EmissionTable'.
    Built from an emitted sequence (x), the characters that may
    occur in x (char), a hidden path (pi) and the states that may
    occur in pi (states); counting happens while the object is created.
    Includes attributes:
    self.states (as passed in),
    self.char (as passed in),
    self.stateCount (dictionary: state -> number of positions of pi
    that hold the state),
    self.emissions (dictionary: state + character -> number of
    positions where that state is paired with that character).
    Includes method(s):
    matrix(self).
    '''
    
    def __init__(self, x, char, pi, states):
        
        self.states = states
        self.char = char
        self.stateCount = {state: 0 for state in states}
        self.emissions = {
            state + symbol: 0
            for state in self.states
            for symbol in char
        }
        
        # Position by position, pair the hidden state with the emitted symbol.
        for pos, state in enumerate(pi):
            self.stateCount[state] += 1
            self.emissions[state + x[pos]] += 1
            
            
    def matrix(self):
        '''
        Method that turns the stored counts into probabilities:
        each emission count is divided by the count of its state.
        A state that was never counted gets the uniform value
        1 / len(self.char) in every column.
        Returns a numpy array with len(self.states) rows and
        len(self.char) columns.
        '''
        
        probabilities = []
        for pair, count in self.emissions.items():
            stateTotal = self.stateCount[pair[0]]
            if stateTotal == 0:
                probabilities.append(1 / len(self.char))
            else:
                probabilities.append(count / stateTotal)
        
        return np.reshape(probabilities, (len(self.states), len(self.char)))

In [269]:
class Iterate():
    '''
    Object 'Iterate'.
    Repeats the Viterbi-learning step: decode a hidden path with the
    current tables, then re-estimate both tables from that path.
    Includes attributes:
    self.data (InfileParse),
    self.transitionTableStart (np.array)
    self.emissionTableStart (np.array)
    self.transitionTableFinal (0 until iterate() has run, then np.array)
    self.emissionTableFinal (0 until iterate() has run, then np.array)
    and method:
    self.iterate(self)
    '''
    
    def __init__(self, data, transitionTable, emissionTable):
        
        self.data = data
        self.transitionTableStart = transitionTable
        self.emissionTableStart = emissionTable
        self.transitionTableFinal = 0
        self.emissionTableFinal = 0
        
        
    def iterate(self):
        '''
        Method that runs exactly i (self.data.i) learning rounds.
        Each round builds a Viterbi object for self.data.x, decodes
        the hidden path with the current tables and replaces the
        tables with the ones estimated from that path.
        The tables of the last round (or the start tables when
        i is 0) are stored in self.transitionTableFinal and
        self.emissionTableFinal. Returns nothing.
        '''
        
        rounds = int(self.data.i)
        transitionTable = self.transitionTableStart
        emissionTable = self.emissionTableStart
        
        # range(rounds) gives exactly i rounds; a "<=" comparison against a
        # counter starting at 0 would perform one round too many.
        for _ in range(rounds):
            viterbi = Viterbi(self.data.x, self.data.char, self.data.states)
            viterbi.algorithm(transitionTable, emissionTable)
            hiddenPath = viterbi.hiddenPath(self.data.states)
            transitionTable = TransitionTable(hiddenPath, self.data.states).matrix()
            emissionTable = EmissionTable(self.data.x, self.data.char, hiddenPath, self.data.states).matrix()
        
        self.transitionTableFinal = transitionTable
        self.emissionTableFinal = emissionTable

## Main

In [275]:
def _matrixLines(columnLabels, rowLabels, matrix):
    '''
    Helper that renders a labelled matrix as tab-separated text lines.
    The header starts with one empty cell so that the column labels
    sit above the values rather than above the row labels.
    '''
    
    lines = ['\t'.join(['', *map(str, columnLabels)])]
    for label, row in zip(rowLabels, matrix):
        lines.append('\t'.join(map(str, [label, *row])))
    
    return lines


def main(infile, outfile='rosalind_25.txt.out'):
    '''
    The main method. This method takes file containing
    a number of iterations, a sequence, a character list, 
    possible hidden states, a transition table, and an emission table.
    This method parses the file and runs the above methods.
    The learned transition matrix, a divider line and the learned
    emission matrix are printed below and written to an output file
    (outfile, 'rosalind_25.txt.out' unless another name is given).
    '''
    
    data = InfileParse(infile)
    transitionTable = data.createTable(data.transTable)
    emissionTable = data.createTable(data.emTable)
    result = Iterate(data, transitionTable, emissionTable)
    result.iterate()
    
    # Build the report once so the console and the file get identical text.
    lines = _matrixLines(data.states, data.states, result.transitionTableFinal)
    lines.append('--------')
    lines.extend(_matrixLines(data.char, data.states, result.emissionTableFinal))
    report = '\n'.join(lines)
    
    print(report)
    with open(outfile, 'w') as out:
        print(report, file=out)
    
# Entry point: run main() on a single Rosalind input file.
# NOTE(review): the path below is an absolute path on one machine;
# point it at a local copy of the dataset before running elsewhere.
if __name__ == "__main__":
    main('/home/nick_rose/Downloads/rosalind_ba10i.txt')

A	B	C
A	0.0	0.3333333333333333	0.6666666666666666
B	0.0	0.125	0.875
C	0.3	0.0	0.7
--------
		x	y	z
A	1.0	0.0	0.0
B	1.0	0.0	0.0
C	0.0	0.4788732394366197	0.5211267605633803
