# Rosalind problem24

Nicholas Rose

BME 205

Due Date: December 6, 2021

### Soft Decoding Problem:
    
Given: A string x, followed by the alphabet Σ from which x was constructed, followed by the states States, transition matrix Transition, and emission matrix Emission of an HMM (Σ, States, Transition, Emission).

Return: The probability Pr(π_i = k | x) that the HMM was in state k at step i (for each state k and each step i).

In [1]:
import numpy as np

In [2]:
class InfileParse():
    '''
    Parse an HMM input file made of sections separated by single
    separator lines: the emitted string, the alphabet, the states,
    the transition matrix and the emission matrix.

    Attributes:
    self.x (string): the emitted string,
    self.char (string): alphabet symbols with spaces/tabs removed,
    self.states (string): state labels with spaces/tabs removed,
    self.transTable (list): header + rows of the transition matrix,
        each a list of whitespace-separated fields,
    self.emTable (list): header + rows of the emission matrix,
        each a list of whitespace-separated fields
    '''

    def __init__(self, infile):

        self.x = ''
        self.char = ''
        self.states = ''
        self.transTable = []
        self.emTable = []

        with open(infile) as f:
            # Each section is followed by one separator line, skipped by next(f).
            self.x = f.readline().strip()
            next(f)
            self.char = f.readline().strip().replace("\t", "").replace(" ", "")
            next(f)
            self.states = f.readline().strip().replace("\t", "").replace(" ", "")
            next(f)
            # Drop blank lines (e.g. trailing newlines at the end of the
            # file) so they can neither shift the row indices nor end up
            # as empty rows inside the emission table.
            rows = [fields for fields in (line.split() for line in f) if fields]

        # The transition block is one header line plus one row per state;
        # the line right after it is the separator, everything that
        # follows belongs to the emission block.
        boundary = len(self.states) + 1
        self.transTable = rows[:boundary]
        self.emTable = rows[boundary + 1:]


    def createTable(self, table):
        '''
        Method initialized by a list (table) of split file lines,
        the first being the header and the rest the rows of a matrix.
        The first field of every row (its label) is discarded and the
        remaining fields are converted to floats.
        Returns a 2-D numpy array containing the table data; a table
        holding only a header yields an array with zero rows.
        '''

        # Ignore empty rows so a stray blank line cannot break the shape.
        rows = [line for line in table if line]
        body = [[float(value) for value in line[1:]] for line in rows[1:]]
        if not body:
            return np.zeros((0, len(rows[-1]) - 1))

        return np.array(body, dtype=float)

In [3]:
class Viterbi():
    '''
    Object 'Viterbi'. Despite the name, this class performs soft
    decoding of an HMM with the forward and backward algorithms.

    Includes attributes:
    self.x (string), self.states (string/list),
    self.fNodes, self.bNodes, self.piStar (numpy arrays with one row
    per position of x and one column per state),
    self.prob (int/float): total probability of x, set by forward()
    and methods:
    self.forward(self, char, transitionTable, emissionTable)
    self.backward(self, char, transitionTable, emissionTable)
    self.softDecode(self)
    '''

    def __init__(self, x, states):

        self.x = x
        self.states = states
        self.fNodes = np.zeros((len(x), len(states)))
        self.bNodes = np.zeros((len(x), len(states)))
        self.piStar = np.zeros((len(x), len(states)))
        self.prob = 0


    def forward(self, char, transitionTable, emissionTable):
        '''
        Forward algorithm. Accepts the alphabet (char), a transition
        table and an emission table. Every state is given the same
        starting probability, 1 / number of states.
        Fills and returns self.fNodes, where row i holds, for each
        state, the probability of emitting x[0..i] and being in that
        state at position i. The total probability of x (the sum of
        the last row) is stored in self.prob.
        Raises ValueError if x contains a symbol missing from char.
        '''

        transitionTable = np.asarray(transitionTable)
        emissionTable = np.asarray(emissionTable)

        for i, nucleo in enumerate(self.x):
            emitted = emissionTable[:, char.index(nucleo)]
            if i == 0:
                start = 1 / len(self.states)
                self.fNodes[i] = start * emitted
            else:
                # Sum over every predecessor state, then emit x[i].
                self.fNodes[i] = np.matmul(self.fNodes[i-1], transitionTable) * emitted

        if len(self.x) > 0:
            self.prob = float(np.sum(self.fNodes[-1]))

        return self.fNodes


    def backward(self, char, transitionTable, emissionTable):
        '''
        Backward algorithm. Accepts the alphabet (char), a transition
        table and an emission table.
        Fills and returns self.bNodes, where row i holds, for each
        state, the probability of emitting x[i+1..] given that the HMM
        is in that state at position i. The last row is all ones.
        Raises ValueError if x[1:] contains a symbol missing from char.
        '''

        transitionTable = np.asarray(transitionTable)
        emissionTable = np.asarray(emissionTable)

        # Fill the rows from the last position to the first, directly in
        # their final order, so no reversed copy of x or of the array
        # is needed.
        last = len(self.x) - 1
        for i in range(last, -1, -1):
            if i == last:
                self.bNodes[i] = 1
            else:
                nextNucleo = self.x[i+1]
                a = self.bNodes[i+1] * emissionTable[:, char.index(nextNucleo)]
                self.bNodes[i] = np.matmul(transitionTable, a)

        return self.bNodes


    def softDecode(self):
        '''
        Method which returns the likelihood of a sequence
        passing through each hidden state at position i.
        This is done by multiplying the forward and backward
        values at i, and dividing by the total probability
        given by the last row of the forward algorithm.
        forward() and backward() must have been run first.
        These values are returned as a numpy array (self.piStar).
        '''

        # Nothing to decode for an empty string; fNodes has no last row.
        if len(self.x) == 0:
            return self.piStar

        # The total probability does not depend on i, so compute it once.
        totalProb = np.sum(self.fNodes[-1])
        self.piStar[:] = (self.fNodes * self.bNodes) / totalProb

        return self.piStar

## Main

In [5]:
def main(infile, outfile='rosalind_24.txt.out'):
    '''
    The main method. This method takes file containing
    a sequence, a character list, possible hidden states,
    a transition table, and an emission table.
    This method parses the file, runs the forward and backward
    algorithms and soft-decodes the sequence.
    Output (a header of state labels followed by one tab-separated
    row of probabilities per position) is printed below and to the
    output file 'outfile', which defaults to 'rosalind_24.txt.out'.
    '''

    data = InfileParse(infile)
    transitionTable = data.createTable(data.transTable)
    emissionTable = data.createTable(data.emTable)
    viterbi = Viterbi(data.x, data.states)
    # forward() and backward() fill the tables that softDecode() reads;
    # their return values are not needed here.
    viterbi.forward(data.char, transitionTable, emissionTable)
    viterbi.backward(data.char, transitionTable, emissionTable)
    decode = viterbi.softDecode()

    with open(outfile, 'w') as out:
        print(*data.states, sep='\t', file=out)
        print(*data.states, sep='\t')
        for line in decode:
            print(*line, sep='\t', file=out)
            print(*line, sep='\t')
    
# Script entry point: runs main() on a hard-coded absolute path to a
# local input file; edit the path to point at your own dataset.
if __name__ == "__main__":
    main('/home/nick_rose/Downloads/rosalind_ba10j.txt')

A	B
0.5826159462937879	0.4173840537062126
0.15561784049585506	0.8443821595041454
0.5625023610245109	0.43749763897548943
0.20060239025777954	0.7993976097422208
0.41873200110256964	0.5812679988974306
0.40971958568641365	0.5902804143135866
0.45426247261880226	0.545737527381198
0.3694358685214039	0.6305641314785962
0.27628065397115115	0.723719346028849
0.5734915621015685	0.42650843789843146
