##### Import modules and set up file locations

In [1]:
import Sequence_Analysis_Routines as sar
import random
from joblib import Parallel, delayed
from scipy import optimize as opt
from tqdm import tqdm
import matplotlib.pyplot as plt
import logomaker as lm
import math
import pandas as pd
import subprocess
import numpy as np
import ete3;
import pickle



In [2]:
# Project root and the names of the two Sonic Paranoid runs (without / with the outgroup genome).
project_dir = 'D:/Project_Data/Project_3'
sonic_paranoid_run_name = 'Run_Without_Outgroup'
outgroup_sonic_paranoid_run_name = 'Run_With_Outgroup'

# Input genome folders (the trailing slash is part of the value) and the root of all generated output.
genome_datasets_dir = f'{project_dir}/Datasets/NCBI_Datasets_Close_Species/'
output_dir = f'{project_dir}/Output/Close_Species'

# Protein FASTA folders, one per Sonic Paranoid run.
protein_fasta_output_loc = f'{output_dir}/Protein_Sequences'
outgroup_protein_fasta_output_loc = f'{output_dir}/Protein_Sequences_With_Outgroup'

# Sonic Paranoid output folder and the ortholog tables of each run.
sonic_paranoid_output_loc = f'{output_dir}/Sonic_Paranoid_Output'
ortholog_file_ref = f'{sonic_paranoid_output_loc}/runs/{sonic_paranoid_run_name}/ortholog_groups/flat.ortholog_groups.tsv'
outgroup_ortholog_file_ref = f'{sonic_paranoid_output_loc}/runs/{outgroup_sonic_paranoid_run_name}/ortholog_groups/flat.ortholog_groups.tsv'
single_copy_ortholog_file_ref = f'{sonic_paranoid_output_loc}/runs/{sonic_paranoid_run_name}/ortholog_groups/single-copy_groups.tsv'
outgroup_single_copy_ortholog_file_ref = f'{sonic_paranoid_output_loc}/runs/{outgroup_sonic_paranoid_run_name}/ortholog_groups/single-copy_groups.tsv'

# Multiple-alignment folders.
non_cds_output_dir = f'{output_dir}/Multiple_Alignment_Data/Non_CDS'
cds_output_dir = f'{output_dir}/Multiple_Alignment_Data/CDS'
outgroup_cds_output_dir = f'{output_dir}/Multiple_Alignment_Data/CDS_With_Outgroup'
outgroup_concatenated_cds_output_dir = f'{output_dir}/Multiple_Alignment_Data/CDS_With_Outgroup_Concatenated'

In [3]:
# Assembly accessions of the reference genome and of the outgroup genome
# (the outgroup is removed from genome_ids and deleted from the tree later on).
tb_species = 'GCF_000195955.2'
outgroup_species = 'GCF_000696675.2'

# Number of work chunks handed to the parallel helpers.
num_cores = 16
# Flank width: the later cells use it to mark the first and last `offset` alignment columns.
offset = 50

##### Tree and HMM parameters from fitted phylo-HMM

In [4]:
# Two alternative parameter sets; the labels presumably name the fitting method used (TODO confirm).
#fitted_parameters = [0.98576229, 0.97154437, 8.54931842, 1.26858705]     #Viterbi
fitted_parameters = [0.97249065, 0.9457994, 8.99643063, 1.27164373]  #Forward

# The first two entries fill the transition matrix later on; one state per remaining entry.
num_states = len(fitted_parameters[2:])
num_symbols = 4

##### Determine genomes in ortholog family, generate protein files and run Sonic Paranoid (both with and without outgroup - outgroup needed for tree building)

In [5]:
# Full genome list, including the outgroup.
genome_ids_with_outgroup = sar.list_dirs(genome_datasets_dir)
num_ids_with_outgroup = len(genome_ids_with_outgroup)

# Independent second listing with the outgroup taken out
# (raises ValueError if the outgroup folder is not present).
genome_ids = sar.list_dirs(genome_datasets_dir)
genome_ids.remove(outgroup_species)
num_ids = len(genome_ids)

In [None]:
# Write one protein FASTA per genome into each Sonic Paranoid input folder:
# first the set without the outgroup, then the set that includes it.
# Uses the notebook's own `tqdm` import rather than reaching into `sar` for it.
for folder_ids, fasta_output_loc in ((genome_ids, protein_fasta_output_loc),
                                     (genome_ids_with_outgroup, outgroup_protein_fasta_output_loc)):
    for folder in tqdm(folder_ids):
        sar.generate_protein_file(genome_datasets_dir + '/' + folder + '/genomic.gbff', fasta_output_loc + '/' + folder + '.faa')

In [None]:
# Sonic Paranoid invocations for the run without and the run with the outgroup.
# Left commented out - presumably the runs were done once and are not repeated on
# every notebook execution (TODO confirm); uncomment to regenerate the ortholog groups.
#sar.run_sonic_paranoid(protein_fasta_output_loc, sonic_paranoid_output_loc, sonic_paranoid_run_name)
#sar.run_sonic_paranoid(outgroup_protein_fasta_output_loc, sonic_paranoid_output_loc, outgroup_sonic_paranoid_run_name)

##### Generate ortholog object and object containing sequence information for each ortholog group / species

In [6]:
# Build the ortholog groupings from the flat ortholog tables of both runs
# (without the outgroup first, then with it).
orthologs, outgroup_orthologs = (
    sar.Ortholog_Grouping(table_path)
    for table_path in (ortholog_file_ref, outgroup_ortholog_file_ref)
)

100%|██████████| 84312/84312 [00:03<00:00, 22637.00it/s]
100%|██████████| 84312/84312 [00:03<00:00, 22689.66it/s]


In [7]:
# Sequence datasets for both ortholog groupings; each pairs a grouping with the
# genome list that matches it (without / with the outgroup).
seq_data, outgroup_seq_data = (
    sar.Ortholog_Sequence_Dataset(grouping, genome_datasets_dir, run_genome_ids, offset, tb_species)
    for grouping, run_genome_ids in ((orthologs, genome_ids), (outgroup_orthologs, genome_ids_with_outgroup))
)
print(outgroup_seq_data.species_info())

100%|██████████| 11/11 [00:24<00:00,  2.19s/it]
100%|██████████| 12/12 [00:26<00:00,  2.23s/it]

                                    name          species
0      Mycobacterium kansasii ATCC 12478  GCF_000157895.3
0       Mycobacterium tuberculosis H37Rv  GCF_000195955.2
0    Mycobacterium haemophilum DSM 44634  GCF_000340435.2
0          Rhodococcus erythropolis R138  GCF_000696675.2
0         Mycolicibacterium vaccae 95051  GCF_001655245.1
0           Mycobacterium intracellulare  GCF_002285675.1
0                   Mycobacterium cookii  GCF_010727945.1
0             Mycobacterium shinjukuense  GCF_010730055.1
0                    Mycobacterium lacus  GCF_010731535.1
0                  Mycobacterium marinum  GCF_016745295.1
0  Mycolicibacterium hassiacum DSM 44199  GCF_900603025.1
0    Mycolicibacterium smegmatis MC2 155      NC_008596.1





##### Perform CDS and non-CDS alignments for each full ortholog group and save to folders

In [8]:
# Only ortholog groups present in every genome are aligned.
min_species = num_ids
min_species_with_outgroup = num_ids_with_outgroup
core_numbers = list(range(1, num_cores+1))

# Random permutations of the full ortholog groups, so that each core receives a
# comparable mix of fast and slow alignments. Each list is sampled with its OWN
# length: the outgroup list previously borrowed the length of the non-outgroup list,
# which fails or drops groups when the two differ.
# NOTE: no random seed is set, so the permutation differs between runs.
groups = random.sample(orthologs.full_ortholog_groups, len(orthologs.full_ortholog_groups))
outgroup_groups = random.sample(outgroup_orthologs.full_ortholog_groups, len(outgroup_orthologs.full_ortholog_groups))

#par = Parallel(n_jobs=-1)(delayed(sar.align_and_build)(outgroup_groups, num_cores, core_number, outgroup_seq_data.sequence_data, 'cds_length', 'cds_seq', outgroup_cds_output_dir+'/', min_species_with_outgroup) for core_number in tqdm(core_numbers))
#par = Parallel(n_jobs=-1)(delayed(sar.align_and_build)(groups, num_cores, core_number, seq_data.sequence_data, 'cds_length', 'cds_seq', cds_output_dir+'/', min_species) for core_number in tqdm(core_numbers))
#par = Parallel(n_jobs=-1)(delayed(sar.align_and_build)(groups, num_cores, core_number, seq_data.sequence_data, 'non_cds_offset_length', 'non_cds_offset_seq', non_cds_output_dir+'/', min_species) for core_number in tqdm(core_numbers))

##### Run IQTree on concatenated CDS alignments to generate tree

In [9]:
# Concatenate every CDS alignment (outgroup included) into a single FASTA,
# which is the input file named in the IQTree command below.
concatenated_cds_fasta = outgroup_concatenated_cds_output_dir + '/concatenated_cds.fasta'
alignment_names = sar.list_files(outgroup_cds_output_dir)
sar.concatenate_fasta(outgroup_cds_output_dir, alignment_names, concatenated_cds_fasta)
#subprocess.run('cd \\users\\nicho\\IQTree & bin\\iqtree2 -s ' + outgroup_concatenated_cds_output_dir + '/concatenated_cds.fasta' + ' --prefix '+ output_dir + '/Trees/Concatenated_JC_Tree -m JC -B 1000 -T AUTO -o ' + outgroup_species, shell=True)

##### Calibrate phylo-HMM

In [10]:
# Settings passed to sar.fit_phylo_hmm during calibration.
# num_symbols and num_states repeat the values set in the parameter cell near the top.
minimum_fit_length = 10
num_states = 2
num_symbols = 4

In [11]:
# Load the tree file written under the prefix used in the IQTree command, then
# drop the outgroup node so the tree only holds the genomes of the main run.
# Indexing with [0] raises IndexError if no node carries the outgroup name.
tree = ete3.Tree(f'{output_dir}/Trees/Concatenated_JC_Tree.treefile')
outgroup = tree.search_nodes(name=outgroup_species)[0]
outgroup.delete()

In [12]:
# Alignment files are named "<group id>.fasta"; keep the integer ids.
alignment_ids = [int(file_name.split('.')[0]) for file_name in sar.list_files(non_cds_output_dir+'/')]

# Load every non-CDS alignment, keyed by ortholog group id.
alignment_dict = {}
for group_id in tqdm(alignment_ids):
    alignment = sar.Alignment(f'{non_cds_output_dir}/{group_id}.fasta', tb_species, 'NT')
    # Argument meaning is defined in sar; per the section heading further down this
    # presumably strips columns gapped in the TB sequence - TODO confirm.
    alignment.modify_sequence(1, False, True)
    alignment_dict[group_id] = alignment

100%|██████████| 1327/1327 [00:24<00:00, 54.31it/s]


In [13]:
def parallel_fit_hmm (params):
    """Objective function for the optimiser: evaluate `params` across all cores.

    Runs sar.fit_phylo_hmm once per core number (1..num_cores) through joblib,
    adds up the per-core results, prints the parameters next to the total and
    returns the total. The returned value is what opt.minimize minimises -
    presumably a negative log-likelihood (TODO confirm against sar).
    """
    jobs = (
        delayed(sar.fit_phylo_hmm)(tree, num_symbols, num_states, params, alignment_ids, alignment_dict,
                                   num_cores, core_number, offset, minimum_fit_length)
        for core_number in range(1, num_cores+1)
    )
    per_core_values = Parallel(n_jobs=-1)(jobs)
    total = sum(per_core_values)
    print(params, total)
    return total

In [15]:
# Sanity check: evaluate the fit for a single chunk (core number 1 of num_cores)
# with a hand-picked parameter vector. num_cores replaces the former literal 16 so
# the chunking always matches the one used by parallel_fit_hmm.
trial_parameters = [0.97, 0.97, 8,0.2,0.5,0.5]
sar.fit_phylo_hmm(tree, num_symbols, num_states, trial_parameters, alignment_ids, alignment_dict, num_cores, 1, offset, minimum_fit_length)

23234.200657866102

In [None]:
# Bounded Nelder-Mead search over the six phylo-HMM parameters.
parameter_bounds = ((0.001,0.999),(0.001,0.999),(0.001,10),(0.001,10),(0.001,0.999), (0.001,0.999))
# The raw starting point has its last two entries (1, 1) above their upper bound of 0.999.
# Clip it into the bounds explicitly so the optimiser starts from a valid point.
# NOTE(review): the single-chunk check uses 0.5 for these two entries - confirm which start is intended.
initial_guess = np.clip(
    (0.97, 0.97, 8,0.2,1,1 ),
    [lower for lower, upper in parameter_bounds],
    [upper for lower, upper in parameter_bounds],
)
res = opt.minimize(parallel_fit_hmm, initial_guess, method = 'Nelder-Mead', bounds = parameter_bounds)

In [None]:
# Choice of parameter vector for the downstream HMM cells. The statements run in
# order, so the last assignment wins: the hard-coded "Forward" values on the next
# line are immediately replaced by the optimiser result `res.x`.
# This cell therefore needs the minimisation cell to have been run (otherwise `res` is undefined).
#fitted_parameters = [0.98576229, 0.97154437, 8.54931842, 1.26858705]     #Viterbi
fitted_parameters = [0.96731688, 0.92605748, 8.4243919,  1.26477696]  #Forward
fitted_parameters = res.x

##### Remove the portions of each alignment that are gapped in the TB sequence, then prepare the data needed to plot entropies and sequence logos and to estimate conserved regions, using the HMM calibrated on all non-CDS alignments (see separate calibration notebook)

In [None]:
# Published UTR table, indexed by the value of its 'upstream' column.
# Each entry is [utr, start-1, stop-1, strand, downstream]; the -1 presumably
# converts 1-based coordinates to 0-based (TODO confirm against the source table).
utrs = pd.read_csv(project_dir + '/Datasets/Data_From_Publications/strict_3UTRs.csv', header=0)
utr_upstream_dict = {
    row['upstream']: [row['utr'], row['start']-1, row['stop']-1, row['strand'], row['downstream']]
    for _, row in tqdm(utrs.iterrows(), total=utrs.shape[0])
}

In [None]:
# For every non-CDS alignment: load it twice (with different modify_sequence flags),
# look up the annotation of the reference gene, place any known UTR on the alignment
# coordinates and run the two-state HMM. Everything is collected per group id.
alignment_info_dict = {}
file_ids = sar.list_files(non_cds_output_dir+'/')
ids = [int(file_name.split('.')[0]) for file_name in file_ids]

for group_id in tqdm(ids):
    fasta_path = f'{non_cds_output_dir}/{group_id}.fasta'

    # Version used for entropies and the HMM.
    alignment = sar.Alignment(fasta_path, tb_species, 'NT')
    alignment.modify_sequence(1,False,True)
    alignment.calculate_entropies(mvave_len = 10)
    # Second version with the last flag off; the plotting cell reads its insertion list.
    alignment_2 = sar.Alignment(fasta_path, tb_species, 'NT')
    alignment_2.modify_sequence(1, False, False)
    alignment_list = alignment.modified_sequence_list
    alignment_names = alignment.sequence_names

    # The first and last `offset` columns are the flanks drawn as CDS in the plot.
    cds_end_pos = offset - 1
    non_cds_end_pos = alignment.modified_sequence_length - offset

    # Annotation of the reference-species member of this group.
    non_cds_offset_start, non_cds_offset_stop, locus_tag, cds_strand = (
        seq_data.master_species_info(group_id, field)
        for field in ('non_cds_offset_start', 'non_cds_offset_stop', 'locus_tag', 'strand')
    )

    # The downstream neighbour is the next gene on strand 1, the previous one otherwise.
    # NOTE(review): 'prev_strand' does not follow the 'previous_...' naming of the
    # locus-tag field - confirm the column name against sar.
    if cds_strand == 1:
        tag_field, strand_field = 'next_locus_tag', 'next_strand'
    else:
        tag_field, strand_field = 'previous_locus_tag', 'prev_strand'
    downstream_locus_tag = seq_data.master_species_info(group_id, tag_field)
    downstream_locus_strand = seq_data.master_species_info(group_id, strand_field)

    # UTR position relative to the start of the region; both stay 0 when no UTR is listed.
    utr_start_pos = utr_end_pos = 0
    if locus_tag in utr_upstream_dict:
        utr_data = utr_upstream_dict[locus_tag]
        if cds_strand == 1:
            utr_start_pos = utr_data[1] - non_cds_offset_start
            utr_end_pos = utr_data[2] - non_cds_offset_start
        else:
            # Other strand: measure from the stop coordinate and swap the two ends.
            utr_start_pos = non_cds_offset_stop - utr_data[2]
            utr_end_pos = non_cds_offset_stop - utr_data[1]

    # HMM set-up: uniform start, self-transition probabilities from the first two
    # fitted parameters, observation probabilities from the remaining ones.
    initial_state_probabilities = [1.0/num_states]*num_states
    stay_0, stay_1 = fitted_parameters[0], fitted_parameters[1]
    transition_probabilities = np.array([[stay_0, 1-stay_0],[1-stay_1, stay_1]])
    observation_probabilities = sar.mutation_probs(fitted_parameters[2:], alignment_list, alignment_names, tree, num_symbols)
    hmm = sar.HMM(initial_state_probabilities, transition_probabilities, observation_probabilities)
    hmm.viterbi()
    hmm.forward()
    hmm.backward()

    # The order of this list is relied on by the cells that read the dictionary back.
    alignment_info_dict[group_id] = [
        alignment, alignment_2, cds_end_pos, non_cds_end_pos, non_cds_offset_start, non_cds_offset_stop,
        locus_tag, cds_strand, downstream_locus_tag, downstream_locus_strand,
        utr_start_pos, utr_end_pos, hmm,
    ]

In [None]:
# Cache the per-group results on disk so the plotting cells can reload them.
with open(f'{output_dir}/conservation_info_dictionary.pkl', 'wb') as pickle_file:
    pickle.dump(alignment_info_dict, pickle_file)

In [None]:
# Reload the cached per-group results.
# NOTE: pickle.load executes code embedded in the file - only load files this notebook wrote.
with open(f'{output_dir}/conservation_info_dictionary.pkl', 'rb') as pickle_file:
    conservation_info_dict = pickle.load(pickle_file)

##### Plot graphics to show sequence and HMM regions

In [None]:
# Draw a relative-entropy sequence logo for one ortholog group, annotated with the
# flanking CDS regions, insertions, the UTR (if any) and the Viterbi state path.
group_id = 772   #752   756   758    772   791   786  805 shows non insertions
data = conservation_info_dict[group_id]
# Same order as the list stored per group in the conservation dictionary.
(alignment, alignment_2, cds_end_pos, non_cds_end_pos, non_cds_offset_start, non_cds_offset_stop, locus_tag,
 cds_strand, downstream_locus_tag, downstream_locus_strand, utr_start_pos, utr_end_pos, hmm) = data

# Per-position symbol counts (gaps ignored), forced to float so the entropies can be stored in place.
counts_df = lm.alignment_to_matrix(sequences = alignment.modified_sequence_list, to_type = 'counts', characters_to_ignore = '-', pseudocount=0).astype(float)
background_probs = [0.25, 0.25, 0.25, 0.25]
num_sequences = alignment.num_sequences

# Replace the counts by per-symbol relative entropy (bits) against the uniform background.
# Gapped sequences are spread over the symbols according to the background.
# The values are collected first and written back with one positional assignment:
# writing through the row objects yielded by iterrows() is not guaranteed to reach the frame.
relent_rows = []
for counts_row in counts_df.iloc[:, :4].to_numpy():
    num_gaps = num_sequences
    for k in range(4):
        num_gaps = num_gaps - counts_row[k]
    relent_row = []
    for k in range(4):
        ct = counts_row[k] + num_gaps*background_probs[k]
        if ct == 0:
            relent_row.append(0)
        else:
            relent_row.append((ct /num_sequences) * math.log((ct /num_sequences)/background_probs[k],2))
    relent_rows.append(relent_row)
counts_df.iloc[:, :4] = np.array(relent_rows, dtype=float).reshape(-1, 4)

y = -1
seqlogo = lm.Logo(counts_df, figsize = [25,1])
# Flanking regions at both ends of the alignment.
seqlogo.ax.plot([0, cds_end_pos], [y,y], color='skyblue', linewidth=10, solid_capstyle='butt')
# One red tick per insertion entry; the line width scales with the entry's second value.
for insertion in alignment_2.master_species_modified_sequence_insertions:
    seqlogo.ax.plot([insertion[0], insertion[0]+1], [y-2,y-2], color='red', linewidth=3*insertion[1], solid_capstyle='butt')
seqlogo.ax.plot([non_cds_end_pos, alignment.modified_sequence_length], [y,y], color='skyblue', linewidth=10, solid_capstyle='butt')
seqlogo.ax.plot([utr_start_pos, utr_end_pos],[y-0.5, y-0.5], color='mediumslateblue', linewidth=10, solid_capstyle='butt')
# Shade positions by Viterbi state.
# NOTE(review): the HMM is built with two states, so the `state == 2` branch may never fire - confirm the state labelling.
for i, state in enumerate(hmm.viterbi_path):
    if state == 1:
        seqlogo.highlight_position_range(pmin=i, pmax=i, color='rosybrown')
    if state == 2:
        seqlogo.highlight_position_range(pmin=i, pmax=i, color='skyblue')
seqlogo.ax.text(0,4.2*y,locus_tag + ' ('+str(cds_strand)+')',fontsize=12)
seqlogo.ax.text(alignment.modified_sequence_length - offset/2,4.2*y,downstream_locus_tag+ ' ('+str(downstream_locus_strand)+')',fontsize=12)
seqlogo.ax.text(0, 4.5*y,int(non_cds_offset_start), verticalalignment='top', horizontalalignment='left')
seqlogo.style_spines(visible=False)
seqlogo.style_spines(spines=['left'], visible=True, bounds=[0, 2])
seqlogo.ax.set_xticks([])
seqlogo.ax.set_yticks([0,2])
seqlogo.ax.set_ylim([-4, 2])
seqlogo.ax.axhline(y, color = 'k', linewidth = 1)
seqlogo;

In [None]:
# Moving-average relative entropy of the group selected in the logo cell, with the
# two flank boundaries marked in red and an HMM-derived curve drawn on top.
#plt.plot(alignment.relative_entropy);
plt.plot(alignment.mvave_relative_entropy);
# axvline takes ymin/ymax as axes fractions between 0 and 1; the defaults (0 and 1)
# already span the full height, so the former out-of-range ymax=2 is dropped.
plt.axvline(x=cds_end_pos, color='r');
plt.axvline(x=non_cds_end_pos, color='r');
# Sum of the forward and backward entries at index 1, rescaled by the forward
# log-likelihood and exponentiated - presumably the posterior probability of state 1
# per position. Assumes both entries are numpy arrays so that `+` adds elementwise (TODO confirm).
fwd_bwd_state_1 = hmm.forward_probabilities[1] + hmm.backward_probabilities[1]
scaled_state_1 = [math.exp(x - hmm.forward_ll) for x in fwd_bwd_state_1]
plt.plot(scaled_state_1);

In [None]:
# Show the sequence-data rows recorded for one locus tag.
temp = seq_data.sequence_data
locus_mask = temp['locus_tag'] == 'Rv1249c'
print(temp[locus_mask])

In [None]:
# List groups whose alignment is long and has a sizeable conserved stretch.
# Printed per group: id, sequence length, conserved length, conserved fraction, number of insertions.
file_ids = sar.list_files(non_cds_output_dir+'/')
ids = [int(i.split('.')[0]) for i in file_ids]
# The loop variable is not called `id`, so the builtin id() stays usable in later cells.
for ortholog_id in tqdm(ids):
    if ortholog_id not in conservation_info_dict:
        continue
    group_info = conservation_info_dict[ortholog_id]
    viterbi_path = group_info[12].viterbi_path
    # Sum the Viterbi states between the two flanks. The flank width comes from
    # `offset` (formerly a literal 50) so it follows the configuration cell.
    conserved_length = sum(viterbi_path[offset:len(viterbi_path) - offset])
    num_insertions = len(group_info[1].master_species_modified_sequence_insertions)
    sequence_length = group_info[0].modified_sequence_length
    if sequence_length > 150 and conserved_length > 50:
        print (ortholog_id, sequence_length, conserved_length, conserved_length/sequence_length, num_insertions)