In [1]:
import numpy as np
import random
import math
import matplotlib.pyplot as plt
from scipy import optimize as opt
import Sequence_Analysis_Routines as sar
import ete3



In [2]:
# Root folder holding all data for this project (absolute path on the analysis machine).
project_dir = 'D:/Project_Data/Project_3'
# Outputs of the close-species run live below the project root.
output_dir = f'{project_dir}/Output/Close_Species'
# Folder of non-CDS multiple alignments; fit_hmm reads '<group_id>.fasta' files from here.
non_cds_output_dir = f'{output_dir}/Multiple_Alignment_Data/Non_CDS'
# Accession passed to sar.Alignment as its second argument
# (presumably the reference species of each alignment - TODO confirm).
tb_species = 'GCF_000195955.2'

In [3]:
# Module-level state count. NOTE(review): fit_hmm assigns its own local
# num_states = 4, so this global value is not the one used during fitting.
num_states = 5

In [12]:
# List the non-CDS alignment files; the text before the first '.' of each
# file name is parsed as the integer group id.
file_ids = sar.list_files(f'{non_cds_output_dir}/')
ids = [int(file_name.partition('.')[0]) for file_name in file_ids]
#ids.remove(1559)  #Contains S in alignment!

# Load the concatenated tree and remove the outgroup node from it, so that
# master_tree only contains the remaining species from here on.
outgroup_species = 'NC_008596.1'
master_tree = ete3.Tree(f'{output_dir}/Trees/Concatenated_JC_Tree.treefile')
outgroup = master_tree.search_nodes(name=outgroup_species)[0]  # first node with that name
outgroup.delete()

In [13]:
def fit_hmm(params, group_ids=None):
    """Objective function: summed negative Viterbi log-probability over alignments.

    For every group id, the matching '<group_id>.fasta' alignment is loaded from
    ``non_cds_output_dir``, trimmed at both ends, and scored with a ``sar.HMM``
    built from ``params``. The negated ``viterbi_log_probability`` of each HMM
    is accumulated and the total is returned, so the function can be handed
    directly to a minimiser.

    Parameters
    ----------
    params : sequence of 4 numbers
        ``params[0]`` and ``params[1]`` are the off-diagonal entries of the
        2x2 transition matrix (the diagonal is one minus those values).
        ``params[2]`` and ``params[3]`` are forwarded unchanged to
        ``sar.mutation_probs``.
    group_ids : iterable of int, optional
        Alignment groups to include. ``None`` (the default) uses the
        module-level ``ids`` list, i.e. every alignment. Pass a slice such as
        ``ids[200:250]`` to fit on a subset without editing this function.

    Returns
    -------
    number
        Sum of ``-viterbi_log_probability`` over all alignments that were not
        skipped; ``0`` if nothing was processed.

    Side effects
    ------------
    Prints ``params`` and the total once per call.
    """
    num_states = 4    # Inserts are randomised
    flank_length = 50   # columns dropped from each end of every alignment
    min_columns = 10    # alignments shorter than this after trimming are skipped

    # `is None` (not truthiness) so that an explicitly empty selection stays empty.
    if group_ids is None:
        group_ids = ids

    initial_state_probabilities = [1.0/num_states]*num_states
    total_probability = 0
    for group_id in group_ids:
        alignment = sar.Alignment(non_cds_output_dir+'/'+str(group_id)+'.fasta', tb_species, 'NT')
        alignment.modify_sequence(1, False, True)
        align_list = alignment.modified_sequence_list
        align_names = alignment.sequence_names
        # The trim window is derived from the first sequence's length and applied to all.
        len_align_list = len(align_list[0])
        non_cds = [x[flank_length:len_align_list - flank_length] for x in align_list]
        if len(non_cds[0]) < min_columns:
            continue
        # Built per alignment on purpose: sar.HMM is opaque here, so every HMM
        # gets its own fresh matrix rather than a shared one.
        transition_probabilities = np.array([[1-params[0],params[0]],[params[1],1-params[1]]])
        observation_probabilities = sar.mutation_probs(params[2], params[3], non_cds, align_names, master_tree, num_states)
        trial_hmm = sar.HMM(initial_state_probabilities, transition_probabilities, observation_probabilities)
        trial_hmm.viterbi()
        # Negate so that a more probable path gives a smaller objective value.
        total_probability += trial_hmm.viterbi_log_probability * -1
    print(params, total_probability)
    return total_probability

In [14]:
# Evaluate the objective once at a fixed parameter vector; fit_hmm prints the
# parameters with the total and returns the total (shown as the cell output).
fit_hmm([0.02087488, 0.02126647, 4.64647873, 1.10655882])

[0.02087488, 0.02126647, 4.64647873, 1.10655882] 1261069.6399328616


1261069.6399328616

In [11]:
# Minimise fit_hmm with Nelder-Mead from the starting point (0.03, 0.03, 2, 1).
# Bounds are given per parameter, in the same order as `params` in fit_hmm.
res = opt.minimize(
    fit_hmm,
    x0=(0.03, 0.03, 2, 1),
    method='Nelder-Mead',
    bounds=(
        (0.001, 0.999),
        (0.001, 0.999),
        (0.1, 10),
        (0.1, 10),
    ),
)
print(res.x)
# Alternative global optimiser over the same bounds, currently disabled:
#res = opt.shgo(fit_hmm, bounds = ((0.001,0.999),(0.001,0.999),(0.1,10),(0.1,10)))
#print(res.x)

[0.03 0.03 2.   1.  ] 29133.63654637549
[0.0315 0.03   2.     1.    ] 29230.914584989703
[0.03   0.0315 2.     1.    ] 29267.740648424395
[0.03 0.03 2.1  1.  ] 29095.46159030211
[0.03 0.03 2.   1.05] 29215.728195910397
[0.03075 0.0285  2.05    1.025  ] 29222.721620764754
[0.028875 0.02925  2.075    1.0375  ] 29127.432464958474
[0.0286875 0.031125  2.0375    1.01875  ] 29074.97542078415
[0.02765625 0.0324375  2.03125    1.015625  ] 29144.515736525445
[0.02878125 0.0301875  2.10625    0.978125  ] 29073.594825893448
[0.02817187 0.03028125 2.159375   0.9421875 ] 28904.71411549865
[0.02786719 0.03032812 2.1859375  0.99921875] 28943.599489806686
[0.02848828 0.03161719 2.16640625 0.94257812] 28989.750331780142
[0.02660742 0.03167578 2.17460937 0.95136719] 28838.88019212654
[0.02491113 0.03251367 2.21191406 0.92705078] 28862.183078661827
[0.02687988 0.03082617 2.30566406 0.89892578] 28813.819559220115
[0.02597607 0.03067676 2.43974609 0.83901367] 28705.689819061507


KeyboardInterrupt: 

In [None]:
# Build an HMM from the optimiser's parameters for one alignment, run Viterbi,
# then print the log-probability and plot the decoded path.
# NOTE(review): this needs `res` from the opt.minimize cell; if that run was
# interrupted before returning, `res` is undefined (or left over from an earlier run).
fitted_parameters = res.x
# `initial_state_probabilities` is only ever created as a local inside fit_hmm,
# so it has to be defined here or this cell raises NameError on a fresh kernel.
# The value mirrors the one fit_hmm uses (uniform over its local num_states = 4).
fit_num_states = 4
initial_state_probabilities = [1.0/fit_num_states]*fit_num_states
transition_probabilities = np.array([[1-fitted_parameters[0],fitted_parameters[0]],[fitted_parameters[1],1-fitted_parameters[1]]])
group_id =   1167 #1569 #1505    #  1167
alignment = sar.Alignment(non_cds_output_dir+'/'+str(group_id)+'.fasta', tb_species, 'NT')
# NOTE(review): fit_hmm calls modify_sequence(1, False, True), trims 50 columns
# from each end and passes num_states to sar.mutation_probs; this cell does none
# of those - confirm the difference is intended.
alignment.modify_sequence(consensus=1)
alignment_list =  alignment.modified_sequence_list
alignment_names = alignment.sequence_names
observation_probabilities = sar.mutation_probs(fitted_parameters[2], fitted_parameters[3], alignment_list, alignment_names, master_tree)
fitted_hmm = sar.HMM(initial_state_probabilities, transition_probabilities, observation_probabilities)
fitted_hmm.viterbi()
print(fitted_hmm.viterbi_log_probability)
plt.plot(fitted_hmm.viterbi_path);