In [232]:
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import HMM as hmm
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment_HMM as alignment_hmm
from Comparative_Analysis import Alignment_Analysis as alignment_analysis
from Comparative_Analysis import Alignment as align
from numpy.random import default_rng
import numpy as np
from scipy.stats import norm
from scipy.stats import binom
from scipy import optimize as opt
import seaborn as sns
import math
from tqdm import tqdm

In [233]:
rng = default_rng()

In [234]:
def create_transition_matrix(params):
    a = params[0]
    b = (1-params[0])*(params[1])
    c = 1-a-b
    e = params[2]
    d = (1-params[2])*(params[3])
    f = 1-e-d
    i = params[4]
    g = (1-params[4])*(params[5])
    h = 1 - i - g
    transition_probabilities = np.array([[a,b,c],[d,e,f],[g,h,i]])
    mutation_probabilities = params[6:]
    return transition_probabilities, mutation_probabilities

In [235]:
def sim_multinomial(probs):
    a = np.where(rng.multinomial(1, probs) == 1)[0][0]
    return a

In [236]:
def sum_logs(p, q):
        if p>9999 and q>99999:
            ans = math.log(math.exp(p) + math.exp(q))
        else:
            if p > q:
                ans =  p + math.log(1 + math.exp(q - p))
            else:
                ans =  q + math.log(1 + math.exp(p - q))
        return ans

In [237]:
def normal_draw(state, means):
    return rng.standard_normal() + means[state]

In [238]:
def binomial_draw(state, probs, size):
    return rng.binomial(size, probs[state])

##### Simulate from HMM

In [239]:
sample_size = 10000

In [240]:
num_states = 3
num_comparison_sequences = 10
means = [2, 7, 10]
mutation_probs = [0.9, 0.5, 0.1]
initial_probs = [0.333, 0.333, 0.334]
transition_matrix = np.array([[0.9, 0.075, 0.025], [0.7, 0.2, 0.1], [0.5, 0.3, 0.2]])

In [267]:
def calculate_observation_probabilities(observations, mutation_probs):
    observation_probs = np.zeros((num_states, sample_size))
    for i in range(sample_size):
        for state in range(num_states):
            observation_probs[state, i] = binom.pmf(observations[i],  num_comparison_sequences, mutation_probs[state])
    return observation_probs 

In [268]:
states = np.zeros(sample_size)
observation_probabilities = np.zeros((num_states, sample_size))
for i in range(sample_size):
    if i == 0:
        current_state = sim_multinomial(initial_probs)
    else:
        current_state = sim_multinomial(transition_matrix[current_state,:])
    states[i] = current_state
    #observations[i] = normal_draw(current_state, means)
    observations[i] = binomial_draw(current_state, mutation_probs, num_comparison_sequences)
    
observation_probabilities = calculate_observation_probabilities(observations, mutation_probs)

In [278]:
def calculate_likelihood(params):
    trans_matrix, mutation_probabilities = create_transition_matrix(params)
    observation_probabilities = calculate_observation_probabilities(observations, mutation_probabilities)
    hmm_model = hmm.HMM(initial_probs, trans_matrix, observation_probabilities)
    hmm_model.calculate_probabilities()
    print(trans_matrix)
    print(mutation_probabilities)
    print (hmm_model.forward_ll * -1)

    return hmm_model.forward_ll * -1

In [279]:
params = [0.95, 0.5, 0.95, 0.5, 0.95, 0.5, 0.8, 0.7, 0.6]
bound_tuple = [(0.001,0.999),(0.001,0.999),(0.001,0.999),(0.001,0.999),(0.001,0.999),(0.001,0.999),(0.001,0.999),(0.001,0.999),(0.001,0.999)]

In [None]:
res = opt.minimize(calculate_likelihood, params, method = 'Nelder-Mead', bounds = bound_tuple)

In [275]:
for iter in tqdm(range(1000)):
    if iter == 0:
        transition_probabilities, mutation_probabilities = create_transition_matrix(params)
    else:
        transition_probabilities = transition_counts
        mutation_probabilities = mutation_counts
    observation_probabilities = calculate_observation_probabilities(observations, mutation_probabilities)
    hm_model = hmm.HMM(initial_probs, transition_probabilities, observation_probabilities)
    hm_model.calculate_probabilities()
    if iter > 1 and abs(total_probability - (hm_model.forward_ll * -1)) < 0.001:
        break
    total_probability = hm_model.forward_ll * -1
    prob_observation = hm_model.forward_ll
    transition_counts = np.zeros((num_states, num_states))
    mutation_counts = np.zeros(num_states)
    for s in range(num_states):
        for t in range(num_states):
            temp = 0
            for i in range(sample_size - 1):
                if i == 0:
                    temp = hm_model.forward_probabilities[s, i] + math.log(transition_probabilities[s, t]) + math.log(observation_probabilities[t, i+1]) + hm_model.backward_probabilities[t, i+1]
                else:
                    temp = sum_logs(temp, hm_model.forward_probabilities[s, i] + math.log(transition_probabilities[s, t]) + math.log(observation_probabilities[t, i+1]) + hm_model.backward_probabilities[t, i+1])
            transition_counts[s, t] += math.exp(temp - prob_observation)

    for s in range(num_states):
        temp_1 = 0
        for t in range(num_states):
            temp_1 += transition_counts[s, t]
        for t in range(num_states):
            transition_counts[s, t] = transition_counts[s, t] / temp_1
  
    for s in range(num_states):
        temp_1 = 0; temp_2 = 0
        for i in range(sample_size - 1):
            temp_1 += hm_model.state_probabilities[s][i] * observations[i] / num_comparison_sequences
            temp_2 += hm_model.state_probabilities[s][i]
        mutation_counts[s] = temp_1 / temp_2
    if iter % 10 == 0:
        print(total_probability)  
        print(transition_counts, mutation_counts)

print("Final Fit....")
print(total_probability)  
print(transition_counts, mutation_counts)


  0%|          | 1/1000 [00:02<39:39,  2.38s/it]

22232.19254363934
[[0.97192581 0.00471853 0.02335566]
 [0.19262002 0.75072301 0.05665697]
 [0.25544959 0.0167872  0.72776321]] [0.87406331 0.69427762 0.42284982]


  1%|          | 11/1000 [00:25<39:18,  2.38s/it]

16961.843846126554
[[0.91009311 0.03224297 0.05766392]
 [0.53320957 0.36691274 0.09987769]
 [0.59230943 0.10614246 0.30154811]] [0.90060053 0.62378989 0.25242024]


  2%|▏         | 21/1000 [00:49<39:10,  2.40s/it]

16780.43588142488
[[0.89633606 0.0750235  0.02864044]
 [0.66348832 0.22637764 0.11013404]
 [0.51057288 0.27829126 0.21113586]] [0.90367557 0.54448347 0.12846531]


  3%|▎         | 31/1000 [01:13<37:42,  2.33s/it]

16769.75251913011
[[0.90079981 0.07437643 0.02482375]
 [0.66834419 0.22653725 0.10511856]
 [0.50781562 0.30169048 0.1904939 ]] [0.90250563 0.51534633 0.10576632]


  4%|▍         | 41/1000 [01:36<37:34,  2.35s/it]

16769.478040851387
[[0.90182612 0.0738225  0.02435138]
 [0.66878782 0.22680629 0.10440589]
 [0.5084575  0.30428473 0.18725777]] [0.9022224  0.51013739 0.10285184]


  4%|▍         | 43/1000 [01:43<38:26,  2.41s/it]

Final Fit....
16769.474684126897
[[0.90188417 0.07379109 0.02432474]
 [0.66881476 0.22681802 0.10436722]
 [0.50849212 0.30443953 0.18706835]] [0.90220615 0.50984078 0.10268869]





In [251]:
observation_probabilities

array([[0.31849633, 0.31849633, 0.00237329, ..., 0.31849633, 0.1593297 ,
        0.29263245],
       [0.31849633, 0.31849633, 0.00237329, ..., 0.31849633, 0.1593297 ,
        0.29263245],
       [0.31849598, 0.31849598, 0.00237331, ..., 0.31849598, 0.15933003,
        0.29263259]])

array([ 7.,  8., 10., 10.,  8.,  9.,  4.,  5.,  9.,  8., 10.,  8.,  9.,
        6.,  7.,  9., 10.,  9., 10.,  8., 10.,  9.,  8.,  9.,  9., 10.,
        8.,  8.,  1.,  3., 10.,  9.,  9.,  8.,  9.,  9., 10., 10.,  9.])