In [1]:
import numpy as np
import random
import math
import matplotlib.pyplot as plt
from scipy import optimize as opt
import Sequence_Analysis_Routines as sar
import ete3



In [2]:
# Locations of the project inputs/outputs and the accession of the reference species.
project_dir = 'D:/Project_Data/Project_3'
output_dir = f'{project_dir}/Output/Close_Species'
non_cds_output_dir = f'{output_dir}/Multiple_Alignment_Data/Non_CDS'
tb_species = 'GCF_000195955.2'

In [3]:
# Each alignment file is named '<integer id>.<extension>'; keep the integer part as the group id.
file_ids = sar.list_files(f'{non_cds_output_dir}/')
ids = [int(file_name.partition('.')[0]) for file_name in file_ids]
#ids.remove(1559)  #Contains S in alignment!

# Load the concatenated tree and remove the outgroup node from it before use.
outgroup_species = 'NC_008596.1'
master_tree = ete3.Tree(f'{output_dir}/Trees/Concatenated_JC_Tree.treefile')
outgroup = master_tree.search_nodes(name=outgroup_species)[0]
outgroup.delete()

In [61]:
# Load and pre-process the alignments for a fixed slice of group ids once,
# so that fit_hmm can reuse them on every objective evaluation.
group_ids = ids[1000:1300]
align_dict = {}
for group_id in group_ids:
    alignment = sar.Alignment(f'{non_cds_output_dir}/{group_id}.fasta', tb_species, 'NT')
    alignment.modify_sequence(1, True, True)
    align_dict[group_id] = alignment

In [62]:
def fit_hmm(params):
    """Objective function: summed negative Viterbi log probability over the cached alignments.

    Parameters
    ----------
    params : sequence of float
        ``params[0]`` is the probability of moving between any two distinct hidden
        states.  The remaining ``len(params) - 1`` entries (one per hidden state) are
        passed to ``sar.mutation_probs`` to build the observation probabilities.

    Returns
    -------
    The sum, over every group in ``group_ids`` that is not skipped, of
    ``-viterbi_log_probability`` of an HMM built from ``params``.

    Notes
    -----
    Reads the notebook-level names ``group_ids``, ``align_dict`` and ``master_tree``.
    The first and last 50 columns of each alignment are dropped, and alignments whose
    trimmed first sequence has fewer than 10 columns are skipped.  The parameters and
    the resulting total are printed on every call.
    """
    n_symbols = 4    # Inserts are randomised
    n_states = len(params) - 1
    start_probs = [1.0 / n_states] * n_states
    neg_log_prob_total = 0
    for gid in group_ids:
        aln = align_dict[gid]
        sequences = aln.modified_sequence_list
        names = aln.sequence_names
        n_columns = len(sequences[0])
        # Keep only the interior of the alignment (50 columns removed at each end).
        trimmed = [seq[50:n_columns - 50] for seq in sequences]
        if len(trimmed[0]) < 10:
            continue
        # Symmetric transition matrix: the same switching probability off the diagonal,
        # with the diagonal set so that each row sums to one.
        switch_prob = params[0]
        transitions = np.full((n_states, n_states), switch_prob)
        np.fill_diagonal(transitions, 1 - (n_states - 1) * switch_prob)
        emissions = sar.mutation_probs(params[1:n_states + 1], trimmed, names, master_tree, n_symbols)
        candidate = sar.HMM(start_probs, transitions, emissions)
        candidate.viterbi()
        neg_log_prob_total += -candidate.viterbi_log_probability
    print(params, neg_log_prob_total)
    return neg_log_prob_total

In [65]:
# Evaluate the objective once at a fixed parameter vector:
# first entry is the switching probability, the rest are handed to sar.mutation_probs.
fit_hmm(
    [0.02087488, 0.02126647, 4.64647873, 1.10655882]
)
#fit_hmm([0.02087488, 4.64647873, 1.10655882])

[0.02087488, 0.02126647, 4.64647873, 1.10655882] 18373.483313731693


18373.483313731693

In [66]:
# Fit the HMM by minimising fit_hmm (summed negative Viterbi log probability).
# Parameter layout: [switching probability, one value per hidden state].
#
# fit_hmm sets the transition-matrix diagonal to 1 - (num_states - 1) * p.  With four
# parameters there are three hidden states, so the diagonal is 1 - 2p and p must stay
# below 0.5; otherwise the matrix contains negative "probabilities".  The upper bound
# on p is therefore 0.499 rather than 0.999.
res = opt.minimize(fit_hmm, (0.01, 2, 1, 1), method = 'Nelder-Mead', bounds = ((0.001,0.499),(0.1,10),(0.1,10), (0.1, 10)))
print(res.x)
# Two-state variant (three parameters):
#res = opt.minimize(fit_hmm, (0.02, 4, 1), method = 'Nelder-Mead', bounds = ((0.001,0.499),(0.1,10),(0.1,10)))
#print(res.x)


[0.01 2.   1.   1.  ] 18718.479335616077
[0.0105 2.     1.     1.    ] 18721.39692718164
[0.01 2.1  1.   1.  ] 18692.432197706687
[0.01 2.   1.05 1.  ] 18716.286292144872
[0.01 2.   1.   1.05] 18716.286292144872
[0.0095 2.05   1.025  1.025 ] 18710.7708977154
[0.00975 2.075   1.0375  1.0375 ] 18710.357143013396
[0.009625 2.1125   1.05625  0.98125 ] 18677.571198312377
[0.0094375 2.16875   1.084375  0.946875 ] 18650.018416977095
[0.00934375 2.196875   1.0234375  1.0046875 ] 18668.101947490235
[0.00976563 2.2203125  1.04765625 0.96953125] 18650.84235486504
[0.00952344 2.26796875 1.04023437 0.92304688] 18623.00144776652
[0.00941016 2.36445313 1.04160156 0.86582031] 18584.850905222385
[0.00897852 2.37519531 1.09853516 0.89345703] 18590.34022178709
[0.00945215 2.36748047 1.11264648 0.8331543 ] 18571.514022105755
[0.00950635 2.4527832  1.15725098 0.7473877 ] 18530.678591604104
[0.00890063 2.46027832 1.1432251  0.75723877] 18530.72078310075
[0.00896033 2.65760498 1.1359314  0.6850769 ] 18487.12

KeyboardInterrupt: 

In [None]:
# Decode one alignment with the fitted model and plot its Viterbi state path.
# The model is rebuilt exactly as fit_hmm builds it, so the parameters are
# interpreted the same way they were during fitting:
#   fitted_parameters[0]  - probability of switching between any two distinct states
#   fitted_parameters[1:] - one value per hidden state, passed to sar.mutation_probs
fitted_parameters = res.x
num_symbols = 4    # same symbol count as fit_hmm
num_states = len(fitted_parameters) - 1
initial_state_probabilities = [1.0/num_states]*num_states
transition_probabilities = np.full((num_states, num_states), fitted_parameters[0])
np.fill_diagonal(transition_probabilities, 1 - (num_states-1)*fitted_parameters[0])

group_id = 1167  # other groups previously inspected: 1569, 1505
alignment = sar.Alignment(non_cds_output_dir+'/'+str(group_id)+'.fasta', tb_species, 'NT')
# Same pre-processing as the alignments used for fitting (see the align_dict cell).
alignment.modify_sequence(1, True, True)
alignment_list = alignment.modified_sequence_list
alignment_names = alignment.sequence_names
# Drop the first and last 50 columns, as fit_hmm does before scoring an alignment.
# NOTE(review): alignments shorter than ~100 columns become empty here - fit_hmm skips those.
alignment_length = len(alignment_list[0])
non_cds = [x[50:alignment_length - 50] for x in alignment_list]

observation_probabilities = sar.mutation_probs(fitted_parameters[1:num_states+1], non_cds, alignment_names, master_tree, num_symbols)
fitted_hmm = sar.HMM(initial_state_probabilities, transition_probabilities, observation_probabilities)
fitted_hmm.viterbi()
print(fitted_hmm.viterbi_log_probability)
plt.plot(fitted_hmm.viterbi_path);