##### Import modules and set up file locations

In [1]:
# Master switch for this notebook: the cells guarded by `if full_run == True:`
# (protein file generation, Sonic Paranoid runs, alignments, IQTree) only
# execute their bodies when this is True.
full_run = False

In [2]:
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import HMM as hmm
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment_HMM as alignment_hmm
from Comparative_Analysis import Alignment_Analysis as alignment_analysis
from Comparative_Analysis import Alignment as align
import random
from joblib import Parallel, delayed
from scipy import optimize as opt
from tqdm import tqdm
import matplotlib.pyplot as plt
import logomaker as lm
import math
import pandas as pd
import subprocess
import numpy as np
import ete3;
import pickle



In [3]:
# Root locations: input genome datasets and the output tree for this run.
project_dir = 'D:/Project_Data/Project_3'
genome_datasets_dir = project_dir + '/Datasets/NCBI_Datasets_Close_Species/'
output_dir = project_dir + '/Output/Close_Species'

# Protein FASTA folders (without / with the outgroup genome).
protein_fasta_output_loc = output_dir + '/Protein_Sequences'
outgroup_protein_fasta_output_loc = output_dir + '/Protein_Sequences_With_Outgroup'

# Sonic Paranoid run names and the flat ortholog-group table of each run.
sonic_paranoid_run_name = 'Run_Without_Outgroup'
outgroup_sonic_paranoid_run_name = 'Run_With_Outgroup'
sonic_paranoid_output_loc = output_dir + '/Sonic_Paranoid_Output'
_runs_dir = sonic_paranoid_output_loc + '/runs/'
_ortholog_groups_file = '/ortholog_groups/flat.ortholog_groups.tsv'
ortholog_file_ref = _runs_dir + sonic_paranoid_run_name + _ortholog_groups_file
outgroup_ortholog_file_ref = _runs_dir + outgroup_sonic_paranoid_run_name + _ortholog_groups_file

# One folder per kind of multiple alignment, all under a common parent.
_alignment_data_dir = output_dir + '/Multiple_Alignment_Data'
non_cds_output_dir = _alignment_data_dir + '/Non_CDS'
upstream_non_cds_output_dir = _alignment_data_dir + '/Upstream_Non_CDS'
cds_output_dir = _alignment_data_dir + '/CDS'
extended_cds_output_dir = _alignment_data_dir + '/Extended_CDS'
outgroup_cds_output_dir = _alignment_data_dir + '/CDS_With_Outgroup'
outgroup_concatenated_cds_output_dir = _alignment_data_dir + '/CDS_With_Outgroup_Concatenated'

In [4]:
# Genome identifiers: the species passed to the dataset/alignment classes as
# `tb_species`, and the outgroup that is removed from the main genome list.
tb_species = 'GCF_000195955.2'
outgroup_species = 'GCF_000696675.2'

# Offsets (non_cds_offset is handed to the sequence dataset and HMM calls).
non_cds_offset = 50
extended_cds_offset = 100

# Work is split into `num_cores` chunks addressed by 1-based chunk numbers.
num_cores = 16
core_numbers = [*range(1, num_cores + 1)]

##### Determine genomes in ortholog family, generate protein files and run Sonic Paranoid (both with and without outgroup - outgroup needed for tree building)

In [5]:
# Every sub-directory of the datasets folder is treated as one genome id.
genome_ids_with_outgroup = util.list_dirs(genome_datasets_dir)

# Second, independent listing so that dropping the outgroup does not mutate
# the list above.
genome_ids = util.list_dirs(genome_datasets_dir)
genome_ids.remove(outgroup_species)

num_ids_with_outgroup = len(genome_ids_with_outgroup)
num_ids = len(genome_ids)

In [6]:
if full_run:
    # (genome ids, destination folder) for the run without and with the outgroup.
    protein_file_jobs = (
        (genome_ids, protein_fasta_output_loc),
        (genome_ids_with_outgroup, outgroup_protein_fasta_output_loc),
    )
    for job_genome_ids, fasta_output_loc in protein_file_jobs:
        for folder in sar.tqdm(job_genome_ids):
            # One protein FASTA per genome, written from its GenBank flat file.
            sar.generate_protein_file(genome_datasets_dir + '/' + folder + '/genomic.gbff',
                                      fasta_output_loc + '/' + folder + '.faa')

In [7]:
if full_run:
    # (protein FASTA folder, run name): first without, then with the outgroup.
    sonic_paranoid_jobs = (
        (protein_fasta_output_loc, sonic_paranoid_run_name),
        (outgroup_protein_fasta_output_loc, outgroup_sonic_paranoid_run_name),
    )
    for fasta_loc, run_name in sonic_paranoid_jobs:
        sar.run_sonic_paranoid(fasta_loc, sonic_paranoid_output_loc, run_name)

##### Generate objects containing orthologs and sequence information for each ortholog group / species

In [8]:
# Ortholog groups read from the run without the outgroup genome.
orthologs = sar.Ortholog_Grouping(
    ortholog_file_ref,
)
# Ortholog groups read from the run that includes the outgroup genome.
outgroup_orthologs = sar.Ortholog_Grouping(
    outgroup_ortholog_file_ref,
)

100%|██████████| 75075/75075 [00:03<00:00, 22220.59it/s]
100%|██████████| 84312/84312 [00:03<00:00, 22428.93it/s]


In [9]:
# Sequence information per ortholog group / species, without the outgroup.
seq_data = sar.Ortholog_Sequence_Dataset(
    orthologs,
    genome_datasets_dir,
    genome_ids,
    non_cds_offset,
    tb_species,
)
# Same dataset built from the groupings and genome list that include the outgroup.
outgroup_seq_data = sar.Ortholog_Sequence_Dataset(
    outgroup_orthologs,
    genome_datasets_dir,
    genome_ids_with_outgroup,
    non_cds_offset,
    tb_species,
)
# Debug aid: print(outgroup_seq_data.species_info())

100%|██████████| 16/16 [00:00<00:00, 315.01it/s]
100%|██████████| 16/16 [00:00<?, ?it/s]


##### Perform CDS and non-CDS alignments for each full ortholog group and save to folders

In [10]:
# The species threshold passed to align_and_build equals the number of genomes.
min_species = num_ids
min_species_with_outgroup = num_ids_with_outgroup

# Permutation ensures even distribution of processing speeds.
# (The order of the two random.sample calls is kept so the RNG stream is unchanged.)
single_copy_groups = orthologs.full_single_copy_ortholog_groups
groups = random.sample(single_copy_groups, len(single_copy_groups))
outgroup_single_copy_groups = outgroup_orthologs.full_single_copy_ortholog_groups
outgroup_groups = random.sample(outgroup_single_copy_groups, len(outgroup_single_copy_groups))

if full_run:
    main_sequences = seq_data.sequence_data
    # Each row: (groups, sequence table, length column, sequence column,
    #            output folder, species threshold).
    alignment_jobs = [
        (outgroup_groups, outgroup_seq_data.sequence_data, 'cds_length', 'cds_seq',
         outgroup_cds_output_dir+'/', min_species_with_outgroup),
        (groups, main_sequences, 'cds_length', 'cds_seq',
         cds_output_dir+'/', min_species),
        (groups, main_sequences, 'non_cds_offset_length', 'non_cds_offset_seq',
         non_cds_output_dir+'/', min_species),
        (groups, main_sequences, 'upstream_non_cds_offset_length', 'upstream_non_cds_offset_seq',
         upstream_non_cds_output_dir+'/', min_species),
        (groups, main_sequences, 'cds_extended_region_length', 'cds_extended_region_seq',
         extended_cds_output_dir+'/', min_species),
    ]
    for job_groups, job_sequences, length_field, seq_field, job_output_dir, job_min_species in alignment_jobs:
        # One task per chunk number; every task receives the full group list
        # together with (num_cores, core_number).
        par = Parallel(n_jobs=-1)(
            delayed(align.align_and_build)(job_groups, num_cores, core_number, job_sequences,
                                           length_field, seq_field, job_output_dir, job_min_species)
            for core_number in tqdm(core_numbers)
        )

##### Run IQTree on concatenated CDS alignments to generate tree

In [11]:
if full_run:
    # Concatenate all outgroup-inclusive CDS alignments into a single FASTA.
    alignment_names = util.list_files(outgroup_cds_output_dir)
    concatenated_fasta = outgroup_concatenated_cds_output_dir + '/concatenated_cds.fasta'
    util.concatenate_fasta(outgroup_cds_output_dir, alignment_names, concatenated_fasta)

    # Shell command: change into the local IQTree install, then run iqtree2 on
    # the concatenated alignment with the outgroup species passed via -o.
    tree_prefix = output_dir + '/Trees/Concatenated_JC_Tree'
    iqtree_command = (
        'cd \\users\\nicho\\IQTree & bin\\iqtree2'
        + ' -s ' + concatenated_fasta
        + ' --prefix ' + tree_prefix
        + ' -m JC -B 1000 -T AUTO'
        + ' -o ' + outgroup_species
    )
    subprocess.run(iqtree_command, shell=True)

##### Fit Alignment HMM

In [12]:
# HMM dimensions handed to the Alignment_HMM constructor.
num_states = 3
num_symbols = 4
# Passed as the final argument of the likelihood and EM calls.
minimum_fit_length = 10

In [13]:
# Alignment HMM built over the alignments stored in the non-CDS output folder.
Alignment_HMM_Model = alignment_hmm.Alignment_HMM(
    num_symbols,
    num_states,
    non_cds_output_dir,
    tb_species,
)

100%|██████████| 1621/1621 [00:01<00:00, 1481.82it/s]


In [14]:
def parallel_alignment_hmm_log_likelihood(params):
    """Objective function for the optimiser: evaluate the alignment HMM for `params`.

    Delegates to ``Alignment_HMM_Model.alignment_hmm_log_likelihood`` and prints
    the parameter vector together with the resulting value, so progress can be
    followed while an optimiser is calling this function repeatedly.

    Args:
        params: Parameter vector forwarded unchanged to the model.

    Returns:
        The value returned by ``alignment_hmm_log_likelihood``.

    NOTE(review): the second and third arguments are hard-coded to (16, 1).
    Elsewhere in this notebook the same method is called with (1, 1), and the
    alignment helpers take (num_cores, core_number) in those positions, so this
    presumably evaluates only chunk 1 of 16 rather than the whole data set.
    Confirm that is intended; a whole-data evaluation would presumably sum the
    values over every chunk number.
    """
    a = Alignment_HMM_Model.alignment_hmm_log_likelihood(params, 16, 1, non_cds_offset, minimum_fit_length)
    print(params, a)
    return a

In [15]:
# Starting parameter vector: three repeated (0.95, 0.5) pairs followed by three
# further values (presumably transition then emission parameters - TODO confirm).
params = [0.95, 0.5] * 3 + [0.56370018, 0.52131172, 0.33906948]
# Every parameter is constrained to the same open-ish unit interval.
bound_tuple = [(0.001, 0.999) for _ in params]

In [16]:
# Evaluate the model once at the starting parameters, with (1, 1) as the
# subset arguments; the value is shown as the cell output.
Alignment_HMM_Model.alignment_hmm_log_likelihood(
    params,
    1,
    1,
    non_cds_offset,
    minimum_fit_length,
)

403770.8512953706

In [None]:
# Run the model's EM update from the starting parameters, again with (1, 1)
# as the subset arguments.
Alignment_HMM_Model.EM_update(
    1,
    1,
    params,
    non_cds_offset,
    minimum_fit_length,
)
    

  1%|          | 1/100 [00:27<46:08, 27.97s/it]

[[0.95536874 0.00842696 0.0362043 ]
 [0.0428727  0.86855507 0.08857223]
 [0.0154444  0.00711191 0.97744369]] [0.7510488  0.54540054 0.13411747] 403770.8512953706


 11%|█         | 11/100 [05:13<43:45, 29.50s/it]

[[0.7971284  0.17621397 0.02665763]
 [0.09457692 0.81925978 0.0861633 ]
 [0.01137906 0.06881068 0.91981026]] [0.89012777 0.38006913 0.0340621 ] 218450.73177903588


 14%|█▍        | 14/100 [06:39<41:40, 29.08s/it]

In [None]:
# Bounded Nelder-Mead search starting from `params`. Guarded by `full_run`
# like the other long-running steps: the result `res` is only consumed when
# `full_run` is set, so running the optimiser unconditionally just burned time.
if full_run:
    res = opt.minimize(
        parallel_alignment_hmm_log_likelihood,
        params,
        method='Nelder-Mead',
        bounds=bound_tuple,
    )

In [None]:
# Stored parameter vector used when the optimiser result is not taken
# (presumably the outcome of an earlier fit - TODO confirm).
fitted_parameters = [
    0.79185005, 0.960987, 0.83863594,
    0.751462, 0.9568103, 0.1157162,
    0.85319079, 0.30944991, 0.02530253,
]
if full_run:
    # `res` is the scipy result object produced by the optimisation cell.
    fitted_parameters = res.x

##### Analyse ortholog groups for conservation and other features and output to dictionary

In [None]:
def parallel_build_analysis_dictionary(num_subsets, subset_num, ids, analysis_type):
    """Analyse one chunk of ortholog groups and return (group_id, analysis) pairs.

    ``util.chunk_list`` picks the portion of `ids` addressed by
    (`num_subsets`, `subset_num`); for each group id in that portion the
    alignment FASTA is loaded from the module-level ``alignment_dir`` and wrapped
    in an ``Alignment_Analysis`` of the requested `analysis_type`.

    Besides ``alignment_dir`` this reads the module-level names ``tb_species``,
    ``num_states``, ``non_cds_offset``, ``fitted_parameters``, ``project_dir``,
    ``Alignment_HMM_Model`` and ``seq_data``, all of which must be set before
    the function is called.

    Returns:
        A list of ``(group_id, Alignment_Analysis)`` tuples in chunk order.
    """
    def analyse_group(group_id):
        # Load the nucleotide ('NT') alignment stored under this group's id.
        group_alignment = align.Alignment(alignment_dir+'/'+str(group_id)+'.fasta', tb_species, 'NT')
        return alignment_analysis.Alignment_Analysis(
            analysis_type, group_alignment, num_states, non_cds_offset, group_id,
            fitted_parameters, project_dir, Alignment_HMM_Model, seq_data)

    chunk_ids = util.chunk_list(ids, num_subsets, subset_num)
    return [(group_id, analyse_group(group_id)) for group_id in chunk_ids]

In [None]:
# (analysis type, alignment folder, pickle base name) for each direction.
analysis_settings = (
    ('Downstream', non_cds_output_dir, 'downstream_conservation_info_dictionary'),
    ('Upstream', upstream_non_cds_output_dir, 'upstream_conservation_info_dictionary'),
)
# NOTE: `alignment_dir` is deliberately bound at module level by this loop,
# because parallel_build_analysis_dictionary reads it as a global.
for analysis_type, alignment_dir, dict_name in analysis_settings:
    # Alignment files are named '<group id>.fasta'; recover the integer ids.
    file_ids = util.list_files(alignment_dir+'/')
    ids = [int(file_name.split('.')[0]) for file_name in file_ids]

    # One task per chunk number; each task returns a list of (group, analysis).
    parallel_output = Parallel(n_jobs=-1)(
        delayed(parallel_build_analysis_dictionary)(num_cores, core_number, ids, analysis_type)
        for core_number in tqdm(core_numbers)
    )
    dictionary_list = [pair for chunk_result in parallel_output for pair in chunk_result]

    alignment_info_dict = {}
    for (group, analysis) in dictionary_list:
        alignment_info_dict[group] = analysis

    pickle_path = output_dir + '/' + dict_name + '.pkl'
    with open(pickle_path, 'wb') as f:
        pickle.dump(alignment_info_dict, f)

In [None]:
# Reload the dictionaries written by the analysis cell (upstream first).
upstream_pickle_path = output_dir + '/upstream_conservation_info_dictionary.pkl'
with open(upstream_pickle_path, 'rb') as f:
    upstream_conservation_info_dictionary = pickle.load(f)

downstream_pickle_path = output_dir + '/downstream_conservation_info_dictionary.pkl'
with open(downstream_pickle_path, 'rb') as f:
    downstream_conservation_info_dictionary = pickle.load(f)

##### Plot graphics to show sequence and HMM regions

In [None]:
group_id = 2131  # other ids tried here: 1337, 1007
# Scratch notes on ortholog group ids worth inspecting:
#   1120 - can see the labels work!
#   1161, 1115, 1116, 758, 1337 (1525?) - list of sRNA in
#   1129
#   1169 - shows upstream start in DeJesus
#   2131 - not very well conserved in Arnvig
#   1214 - massive!
#   1009 - not much conservation
#   993  - possible??
# Show the stored upstream and downstream analyses for the chosen group.
upstream_conservation_info_dictionary[group_id].display_analysis()
downstream_conservation_info_dictionary[group_id].display_analysis()

In [None]:
# Plot the upstream analysis of the currently selected group.
data = upstream_conservation_info_dictionary[group_id]

# Moving-average relative entropy along the alignment
# (data.alignment.relative_entropy holds the other, non-"mvave" series).
plt.plot(data.alignment.mvave_relative_entropy)

# Red vertical markers at the buffer and target end positions. axvline takes
# ymin/ymax as fractions of the axes height in [0, 1]; the defaults (0 and 1)
# already span the full height, so the earlier out-of-range ymax=2 is dropped.
for boundary in (data.buffer_end, data.target_end):
    plt.axvline(x=boundary, color='r')

# Overlay the HMM state probability tracks; extend the list to show more states.
for state in [0]:
    plt.plot(data.hmm_model.state_probabilities[state])

In [None]:
# Find the ortholog group containing locus tag 'Rv0243' and show all rows of
# that group (the final expression is rendered as the cell output).
temp = seq_data.sequence_data
locus_rows = temp[temp['locus_tag'] == 'Rv0243']
group_id = locus_rows.iloc[0]['group_id']
temp[temp['group_id'] == group_id]