##### Import modules and set up file locations

In [1]:
# Master switch for the expensive steps: cells guarded by `if full_run == True:`
# only regenerate their outputs when this is True. When it is False, the
# pairwise-probability cell loads its previously pickled result instead.
full_run = True

In [2]:
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import HMM as hmm
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment_HMM as alignment_hmm
from Comparative_Analysis import Alignment_Analysis as alignment_analysis
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Master_Alignment_HMM as master_alignment_hmm
from Comparative_Analysis import Multi_Species_Master_Alignment_HMM as multi_species_master_alignment_hmm
from Comparative_Analysis import Arneson_Ernst_HMM as ae_hmm
import random
from joblib import Parallel, delayed
from scipy import optimize as opt
from tqdm import tqdm
import matplotlib.pyplot as plt
import logomaker as lm
import math
import pandas as pd
import subprocess
import numpy as np
import ete3;
import pickle
import copy
import seaborn as sns



In [3]:
# --- Input locations -------------------------------------------------------
project_dir = 'D:/Project_Data/Project_6'
# One sub-directory per genome; each is read as <genome id>/genomic.gbff.
genome_datasets_dir = project_dir + '/Datasets/NCBI_Datasets'
# Excel workbooks with published annotations (read with pd.read_excel later on).
literature_datasets_dir = project_dir + '/Datasets/Data_From_Publications'
# --- Output locations (everything below lives under output_dir) --------------
output_dir = project_dir + '/Output'
# Protein FASTA files written by sar.generate_protein_file, without / with the outgroup.
protein_fasta_output_loc = output_dir + '/Protein_Sequences'
outgroup_protein_fasta_output_loc = output_dir + '/Protein_Sequences_With_Outgroup'
# Run names passed to sar.run_sonic_paranoid; they also appear in the ortholog directory paths.
sonic_paranoid_run_name = 'Run_Without_Outgroup'
outgroup_sonic_paranoid_run_name = 'Run_With_Outgroup'
sonic_paranoid_output_loc = output_dir + '/Sonic_Paranoid_Output'
# Directories handed to sar.Ortholog_Grouping.
ortholog_dir = sonic_paranoid_output_loc + '/runs/' + sonic_paranoid_run_name + '/ortholog_groups'
outgroup_ortholog_dir = sonic_paranoid_output_loc + '/runs/' + outgroup_sonic_paranoid_run_name + '/ortholog_groups'
# Multiple-alignment folders, one FASTA per ortholog group (<group id>.fasta).
non_cds_output_dir = output_dir + '/Multiple_Alignment_Data/Downstream_Non_CDS'
upstream_non_cds_output_dir = output_dir + '/Multiple_Alignment_Data/Upstream_Non_CDS'
cds_output_dir = output_dir + '/Multiple_Alignment_Data/CDS'
extended_cds_output_dir = output_dir + '/Multiple_Alignment_Data/Extended_CDS'
extended_non_cds_regions_output_dir = output_dir + '/Multiple_Alignment_Data/Extended_Non_CDS'
outgroup_cds_output_dir = output_dir + '/Multiple_Alignment_Data/CDS_With_Outgroup'
outgroup_concatenated_cds_output_dir = output_dir + '/Multiple_Alignment_Data/CDS_With_Outgroup_Concatenated'
# Pickled HMM parameter fits and intermediate probabilities.
hmm_parameters_output_dir = output_dir +'/HMM_Model_Parameters'
# Pickled per-model conservation analysis dictionaries.
conservation_analysis_output_dir = output_dir + '/Conservation_Analysis'

In [4]:
# Number of work chunks; every parallel job receives one entry of core_numbers (1..num_cores).
num_cores = 16
core_numbers = list(range(1, num_cores+1))
# Genome directory name used as the reference species (passed as tb_species to datasets, alignments and models).
tb_species = 'GCF_000195955.2'
# Genome directory name excluded from genome_ids and passed to IQ-TREE's -o option.
outgroup_species = 'GCF_000696675.2'
# Passed to sar.Ortholog_Sequence_Dataset; presumably an offset in bases for non-CDS regions - TODO confirm.
non_cds_offset = 50

##### Determine genomes in ortholog family, generate protein files and run Sonic Paranoid (both with and without outgroup - outgroup needed for tree building)

In [5]:
# Scan the genome directory once and derive every ID list from that single
# listing, so all three lists are guaranteed to describe the same genomes
# (previously the directory was listed three separate times).
genome_ids_with_outgroup = util.list_dirs(genome_datasets_dir)

# Every genome except the outgroup. remove() raises ValueError if the outgroup
# directory is missing, which is the desired early failure.
genome_ids = copy.copy(genome_ids_with_outgroup)
genome_ids.remove(outgroup_species)

# Comparison genomes: neither the outgroup nor the reference species.
non_target_genome_ids = copy.copy(genome_ids)
non_target_genome_ids.remove(tb_species)

num_ids = len(genome_ids)
num_ids_with_outgroup = len(genome_ids_with_outgroup)

In [None]:
if full_run == True:
    # (genome IDs, destination folder) for the run without the outgroup first,
    # then the run that includes it.
    protein_export_jobs = (
        (genome_ids, protein_fasta_output_loc),
        (genome_ids_with_outgroup, outgroup_protein_fasta_output_loc),
    )
    for export_ids, export_dir in protein_export_jobs:
        for genome_folder in sar.tqdm(export_ids):
            genbank_path = genome_datasets_dir + '/' + genome_folder + '/genomic.gbff'
            protein_path = export_dir + '/' + genome_folder + '.faa'
            sar.generate_protein_file(genbank_path, protein_path)

In [None]:
if full_run == True:
    # Same output location for both runs; only the protein folder and run name differ.
    for protein_dir, run_name in (
        (protein_fasta_output_loc, sonic_paranoid_run_name),
        (outgroup_protein_fasta_output_loc, outgroup_sonic_paranoid_run_name),
    ):
        sar.run_sonic_paranoid(protein_dir, sonic_paranoid_output_loc, run_name)

##### Generate objects containing orthologs and sequence information for each ortholog group / species

In [6]:
# Build the grouping without the outgroup first, then the one that includes it.
orthologs, outgroup_orthologs = [
    sar.Ortholog_Grouping(grouping_dir)
    for grouping_dir in (ortholog_dir, outgroup_ortholog_dir)
]

100%|██████████| 75075/75075 [00:03<00:00, 23123.21it/s]
100%|██████████| 84312/84312 [00:03<00:00, 23730.84it/s]


In [7]:
# Sequence datasets built from the ortholog groupings. All three share the genome
# directory, non-CDS offset and reference species; they differ in the grouping and
# genome list supplied and, for the last one, in passing single_copy = False.
seq_data = sar.Ortholog_Sequence_Dataset(orthologs, genome_datasets_dir, genome_ids, non_cds_offset, tb_species) 
outgroup_seq_data = sar.Ortholog_Sequence_Dataset(outgroup_orthologs, genome_datasets_dir, genome_ids_with_outgroup, non_cds_offset, tb_species) 
all_copy_seq_data = sar.Ortholog_Sequence_Dataset(orthologs, genome_datasets_dir, genome_ids, non_cds_offset, tb_species, single_copy = False) 
#print(outgroup_seq_data.species_info())

100%|██████████| 16/16 [00:00<00:00, 94.35it/s]
100%|██████████| 16/16 [00:00<?, ?it/s]
100%|██████████| 16/16 [00:00<?, ?it/s]


In [None]:
# Diagnostic plots. Only the unassigned-gene count plot is currently enabled;
# the other three calls are kept commented out so they can be switched on by hand.
#seq_data.generate_synteny_plot()
#seq_data.generate_ortholog_count_plot()
#all_copy_seq_data.generate_master_count_plot()
seq_data.generate_unassigned_gene_count_plot()

##### Perform CDS and extended CDS (including intergenic regions) alignments for each full ortholog group and save to folders

In [None]:
# Species-count thresholds passed to align.align_and_build in the (currently disabled)
# alignment runs below: one for the genome set without the outgroup, one with it.
min_species = num_ids
min_species_with_outgroup = num_ids_with_outgroup
groups = random.sample(orthologs.full_single_copy_ortholog_groups, len(orthologs.full_single_copy_ortholog_groups))  #Permutation ensures even distribution of processing speeds
outgroup_groups = random.sample(outgroup_orthologs.full_single_copy_ortholog_groups, len(outgroup_orthologs.full_single_copy_ortholog_groups))  #Permutation ensures even distribution of processing speeds
if full_run:
    # The three alignment runs are disabled. Uncomment the ones that need to be regenerated.
    #par = Parallel(n_jobs=-1)(delayed(align.align_and_build)(outgroup_groups, num_cores, core_number, outgroup_seq_data.sequence_data, 'cds_length', 'cds_seq', outgroup_cds_output_dir+'/', min_species_with_outgroup) for core_number in tqdm(core_numbers))
    #par = Parallel(n_jobs=-1)(delayed(align.align_and_build)(groups, num_cores, core_number, seq_data.sequence_data, 'cds_length', 'cds_seq', cds_output_dir+'/', min_species) for core_number in tqdm(core_numbers))
    #par = Parallel(n_jobs=-1)(delayed(align.align_and_build)(groups, num_cores, core_number, seq_data.sequence_data, 'cds_extended_region_length', 'cds_extended_region_seq', extended_cds_output_dir+'/', min_species) for core_number in tqdm(core_numbers))
    # A block containing only comments is a syntax error, so keep an explicit no-op
    # while every run above is commented out.
    pass

In [None]:
# Every extended-CDS alignment file is named <group id>.<extension>; recover the
# integer group IDs and extract the non-CDS regions of each alignment.
extended_region_ids = util.list_files(extended_cds_output_dir + '/')
e_ids = []
for alignment_file in extended_region_ids:
    numeric_stem = alignment_file.split('.')[0]
    e_ids.append(int(numeric_stem))
for group_id in tqdm(e_ids):
    # NOTE(review): the meaning of the literals 1250 and 10 is defined by
    # align.extract_non_cds_regions_from_alignment - confirm there before changing them.
    align.extract_non_cds_regions_from_alignment(
        seq_data.sequence_data,
        tb_species,
        1250,
        extended_cds_output_dir,
        extended_non_cds_regions_output_dir,
        group_id,
        10,
    )

##### Run IQTree on concatenated CDS alignments to generate tree and rename with full names

In [None]:
if full_run == True:
    # Concatenate every CDS-with-outgroup alignment into one FASTA for IQ-TREE.
    alignment_names = util.list_files(outgroup_cds_output_dir)
    iqtree_input_fasta = outgroup_concatenated_cds_output_dir + '/concatenated_cds.fasta'
    util.concatenate_fasta(outgroup_cds_output_dir, alignment_names, iqtree_input_fasta)

    # The command changes into a machine-specific IQ-TREE install directory and runs
    # iqtree2 with the JC model, 1000 bootstrap replicates (-B) and the outgroup as root (-o).
    iqtree_command = ('cd \\users\\nicho\\IQTree & bin\\iqtree2 -s ' + iqtree_input_fasta + ' --prefix ' + output_dir
                      + '/Trees/Concatenated_JC_Tree -m JC -B 1000 -T AUTO -o ' + outgroup_species)
    # NOTE(review): the exit status is not checked, so a failed IQ-TREE run only
    # surfaces when the tree file is read below (or a stale tree file is reused).
    subprocess.run(iqtree_command, shell=True)

    # Replace each leaf's genome ID with the name stored in organism_dict and save a second copy.
    master_tree = ete3.Tree(output_dir + '/Trees/Concatenated_JC_Tree.treefile')
    for tree_node in master_tree.traverse():
        if not tree_node.is_leaf():
            continue
        tree_node.name = outgroup_seq_data.organism_dict[tree_node.name]
    master_tree.write(format=0, outfile=output_dir + '/Trees/Concatenated_JC_Tree_Full_Names.treefile')

In [None]:
# Reload the tree whose leaves are still named by genome ID (not the renamed copy)
# and record the tree distance from each comparison genome to the reference species.
master_tree = ete3.Tree(output_dir + '/Trees/Concatenated_JC_Tree.treefile')
phylogenetic_distance_dict = {
    comparison_id: master_tree.get_distance(comparison_id, tb_species)
    for comparison_id in non_target_genome_ids
}

##### Fit overall Alignment and Pairwise HMMs using EM

In [8]:
# Settings shared by the Alignment_HMM fits below (passed to the model constructor and EM_update).
num_symbols = 4
num_states = 3
minimum_fit_length = 10
# Starting point handed to EM_update.
# NOTE(review): presumably per-state transition parameters followed by per-state
# mutation parameters - confirm against Alignment_HMM.alignment_hmm_model_inputs.
initial_params = [0.95, 0.5, 0.95, 0.5, 0.95, 0.5, 0.56370018, 0.52131172, 0.33906948]

In [None]:
if full_run == True:
    # Fit one HMM against all species at once and keep element 3 of the EM result
    # as the fitted parameters.
    Alignment_HMM_Model = alignment_hmm.Alignment_HMM(
        num_symbols,
        num_states,
        extended_non_cds_regions_output_dir,
        tb_species,
        species_order=genome_ids,
    )
    parameter_fits = Alignment_HMM_Model.EM_update(num_cores, initial_params, minimum_fit_length)
    fitted_parameters = parameter_fits[3]
    with open(hmm_parameters_output_dir + '/' + 'full_parameter_fit.pkl', 'wb') as full_fit_file:
        pickle.dump(fitted_parameters, full_fit_file)

In [None]:
if full_run:
    # One (genome id, fitted parameters) pair per comparison genome, each fitted against
    # the reference species only. Element 3 of the EM result holds the fitted parameters.
    # Built with a comprehension so no loop variable is left behind at notebook scope
    # (the previous loop variable was named `id` and shadowed the builtin).
    # Relies on Alignment_HMM_Model created by the full-fit cell.
    parameter_fits = [
        (
            comparison_id,
            Alignment_HMM_Model.EM_update(
                num_cores, initial_params, minimum_fit_length,
                all_species=False, comparison_species=comparison_id,
            )[3],
        )
        for comparison_id in non_target_genome_ids
    ]
    with open(hmm_parameters_output_dir + '/' + 'pairwise_parameter_fits.pkl', 'wb') as f:
        pickle.dump(parameter_fits, f)

##### Fit overall HMM to pairwise HMMs

In [9]:
def parallel_generate_pairwise_state_probabilities(num_subsets, subset_num, ids, num_states, pairwise_fitted_parameters):
    """Compute pairwise-HMM state probabilities for one chunk of alignments.

    Args:
        num_subsets: Number of chunks the ID list is divided into.
        subset_num: Which chunk this call handles (forwarded to util.chunk_list).
        ids: Group IDs; each names a '<id>.fasta' file in
            extended_non_cds_regions_output_dir.
        num_states: Number of HMM states; used for the uniform initial state distribution.
        pairwise_fitted_parameters: Sequence of entries whose element 0 is a comparison
            species and element 1 the fitted parameters for that species.

    Returns:
        A list with one entry per alignment in the chunk. Each entry is a list holding
        the HMM state probabilities for every pairwise fit, in the order of
        pairwise_fitted_parameters.

    Reads the notebook-level names Alignment_HMM_Model, tb_species, genome_ids and
    extended_non_cds_regions_output_dir.
    """
    chunk_ids = util.chunk_list(ids, num_subsets, subset_num)
    chunk_results = []
    for group_id in chunk_ids:
        fasta_path = extended_non_cds_regions_output_dir + '/' + str(group_id) + '.fasta'
        alignment = align.Alignment(fasta_path, tb_species, 'NT', species_order=genome_ids)
        alignment.modify_sequence(1, False, False)

        per_species_probabilities = []
        for fit_entry in pairwise_fitted_parameters:
            comparison_species = fit_entry[0]
            species_parameters = fit_entry[1]
            transition_probabilities, mutation_probabilities = Alignment_HMM_Model.alignment_hmm_model_inputs(species_parameters)
            observation_probabilities = Alignment_HMM_Model.calculate_observation_probs(
                mutation_probabilities,
                alignment.modified_sequence_list,
                alignment,
                all_species=False,
                comparison_species=comparison_species,
            )
            # A fresh uniform start distribution is built for every model.
            initial_state_probabilities = [1.0 / num_states] * num_states
            hmm_model = hmm.HMM(initial_state_probabilities, transition_probabilities, observation_probabilities)
            hmm_model.calculate_probabilities()
            per_species_probabilities.append(hmm_model.state_probabilities)
        chunk_results.append(per_species_probabilities)
    return chunk_results

In [10]:
if full_run == True:
    # Recompute the per-alignment, per-species state probabilities in parallel and cache them.
    with open(hmm_parameters_output_dir + '/' + 'pairwise_parameter_fits.pkl', 'rb') as fits_file:
        pairwise_fitted_parameters = pickle.load(fits_file)
    file_ids = util.list_files(extended_non_cds_regions_output_dir + '/')
    ids = [file_name.split('.')[0] for file_name in file_ids]
    parallel_output = Parallel(n_jobs=-1)(
        delayed(parallel_generate_pairwise_state_probabilities)(num_cores, core_number, ids, num_states, pairwise_fitted_parameters)
        for core_number in core_numbers
    )
    # Flatten the per-chunk result lists into a single list.
    pairwise_observation_probabilities = []
    for chunk_result in parallel_output:
        pairwise_observation_probabilities.extend(chunk_result)
    with open(hmm_parameters_output_dir + '/' + 'pairwise_observation_probabilities.pkl', 'wb') as cache_file:
        pickle.dump(pairwise_observation_probabilities, cache_file)
else:
    # Reuse the result cached by a previous full run.
    with open(hmm_parameters_output_dir + '/' + 'pairwise_observation_probabilities.pkl', 'rb') as cache_file:
        pairwise_observation_probabilities = pickle.load(cache_file)

FileNotFoundError: [Errno 2] No such file or directory: 'D:/Project_Data/Project_6/Output/Multiple_Alignment_Data/Extended_Non_CDS/10000.fasta'

In [None]:
if full_run == True:
    # NOTE: this rebinds the notebook-level initial_params, so re-running an earlier
    # Alignment_HMM fitting cell afterwards would pick up these four values instead.
    initial_params = [0.8, 0.5, 0.9, 0.3]
    Master_Alignment_HMM_Model = master_alignment_hmm.Master_Alignment_HMM(pairwise_observation_probabilities)
    parameter_fits = Master_Alignment_HMM_Model.EM_update(num_cores, initial_params, minimum_fit_length)
    # Element 3 of the EM result holds the fitted parameters.
    fitted_parameters = parameter_fits[3]
    print(fitted_parameters)
    with open(hmm_parameters_output_dir + '/' + 'master_parameter_fit.pkl', 'wb') as master_fit_file:
        pickle.dump(fitted_parameters, master_fit_file)

##### Fit overall species-specific HMM to pairwise HMMs

In [None]:
if full_run == True:
    # One pair of species-specific starting values per comparison species.
    num_comparison_species = len(pairwise_observation_probabilities[0])
    initial_params = [0.8, 0.5, [0.9]*num_comparison_species, [0.3]*num_comparison_species]
    Multi_Species_Master_Alignment_HMM_Model = multi_species_master_alignment_hmm.Multi_Species_Master_Alignment_HMM(pairwise_observation_probabilities)
    parameter_fits = Multi_Species_Master_Alignment_HMM_Model.EM_update(num_cores, initial_params, minimum_fit_length)
    #print(parameter_fits)
    fitted_parameters = parameter_fits[3]
    print(fitted_parameters)
    with open(hmm_parameters_output_dir + '/' + 'multi_species_master_parameter_fit.pkl', 'wb') as fit_file:
        pickle.dump(fitted_parameters, fit_file)

    # Used to plot against phylogenetic distance below: value i of parameter_fits[1][0]
    # is stored under the i-th comparison genome.
    species_conservation_parameter_dict = {
        genome_id: parameter_fits[1][0][position]
        for position, genome_id in enumerate(non_target_genome_ids)
    }
    with open(hmm_parameters_output_dir + '/' + 'multi_species_conservation_parameter_dict.pkl', 'wb') as dict_file:
        pickle.dump(species_conservation_parameter_dict, dict_file)

In [None]:
# Scatter plot of -log(conservation parameter) against tree distance to the reference
# species, one point per comparison genome.
with open(hmm_parameters_output_dir + '/' + 'multi_species_conservation_parameter_dict.pkl', 'rb') as conservation_file:
    species_conservation_parameter_dict = pickle.load(conservation_file)

species_name_labels = list(non_target_genome_ids)
log_conservation = [
    -1*math.log(species_conservation_parameter_dict[genome_id])
    for genome_id in non_target_genome_ids
]
phylo_distance = [phylogenetic_distance_dict[genome_id] for genome_id in non_target_genome_ids]

conservations_df = pd.DataFrame({
    'Species': species_name_labels,
    '-Log conservation probability': log_conservation,
    'Phylogenetic distance': phylo_distance,
})
ax = sns.scatterplot(
    data=conservations_df,
    x='Phylogenetic distance',
    y='-Log conservation probability',
    s=20,
    hue='Species',
)
# Shrink the legend so the species list fits next to the plot.
species_legend = ax.get_legend()
plt.setp(species_legend.get_texts(), fontsize='6') # for legend text
plt.setp(species_legend.get_title(), fontsize='8') # for legend title
plt.show()

##### Fit Arneson-Ernst HMM to alignments

In [None]:
num_states = 3
num_symbols = 4
minimum_fit_length = 10
# Random starting mutation probabilities of shape (num_states, num_ids, 3), normalised
# so that the three values for every (state, species) pair sum to 1. The vectorised
# division replaces an element-by-element triple loop and leaves no stray loop
# variables behind at notebook scope.
initial_mutation_probs = np.random.rand(num_states, num_ids, 3)
initial_mutation_probs /= initial_mutation_probs.sum(axis=2, keepdims=True)
# Six scalar starting values followed by the mutation-probability array.
initial_params = [0.95, 0.5, 0.95, 0.5, 0.95, 0.5, initial_mutation_probs]

In [None]:
if full_run == True:
    # Fit the Arneson-Ernst style model on the extended non-CDS alignments and
    # store element 3 of the EM result as the fitted parameters.
    AE_HMM_Model = ae_hmm.Alignment_HMM(
        num_symbols,
        num_states,
        extended_non_cds_regions_output_dir,
        tb_species,
        species_order=genome_ids,
    )
    parameter_fits = AE_HMM_Model.EM_update(num_cores, initial_params, minimum_fit_length)
    print(parameter_fits)
    fitted_parameters = parameter_fits[3]
    with open(hmm_parameters_output_dir + '/' + 'AE_parameter_fit.pkl', 'wb') as ae_fit_file:
        pickle.dump(fitted_parameters, ae_fit_file)

##### Load HMM parameters

In [None]:
def _load_fitted_parameters(file_name):
    """Unpickle one parameter file from hmm_parameters_output_dir and return its contents."""
    with open(hmm_parameters_output_dir + '/' + file_name, 'rb') as parameter_file:
        return pickle.load(parameter_file)


# Parameter sets written by the fitting cells above, loaded in the same order they were saved.
full_fitted_parameters = _load_fitted_parameters('full_parameter_fit.pkl')
pairwise_fitted_parameters = _load_fitted_parameters('pairwise_parameter_fits.pkl')
master_fitted_parameters = _load_fitted_parameters('master_parameter_fit.pkl')
multi_species_master_fitted_parameters = _load_fitted_parameters('multi_species_master_parameter_fit.pkl')
AE_fitted_parameters = _load_fitted_parameters('AE_parameter_fit.pkl')

##### Analyse ortholog groups for conservation and other features and output to dictionary

In [None]:
# Model objects and literature tables used by the conservation analysis below.
Alignment_HMM_Model = alignment_hmm.Alignment_HMM(
    4, 3, extended_non_cds_regions_output_dir, tb_species, species_order=genome_ids
)
Master_Alignment_HMM_Model = master_alignment_hmm.Master_Alignment_HMM(pairwise_observation_probabilities)
Multi_Species_Master_Alignment_HMM_Model = multi_species_master_alignment_hmm.Multi_Species_Master_Alignment_HMM(
    pairwise_observation_probabilities
)
# NOTE(review): this model is built on non_cds_output_dir, whereas the AE parameters
# were fitted on extended_non_cds_regions_output_dir - confirm this is intended.
AE_HMM_Model = ae_hmm.Alignment_HMM(
    4, 3, non_cds_output_dir, tb_species, species_order=genome_ids
)
literature_annotations_df_list = [
    pd.read_excel(literature_datasets_dir + workbook)
    for workbook in ('/Mycobrowser_Release_4.xlsx', '/DeJesus2013.xlsx')
]

In [None]:
def parallel_build_analysis_dictionary(num_subsets, subset_num, ids, analysis_type, model_type):
    """Run the conservation analysis for one chunk of ortholog-group alignments.

    Args:
        num_subsets: Number of chunks the ID list is divided into.
        subset_num: Which chunk this call handles (forwarded to util.chunk_list).
        ids: Group IDs; each names a '<id>.fasta' file in the notebook-level alignment_dir.
        analysis_type: Label forwarded unchanged to Alignment_Analysis.
        model_type: One of 'Simple', 'Multi_Species' or 'AE'; selects which fitted
            parameters and which master model are handed to Alignment_Analysis.

    Returns:
        A list of (group_id, analysis) tuples, one per alignment in the chunk.

    Raises:
        ValueError: If model_type is not one of the supported labels. Previously an
            unknown label fell through silently and failed later with an
            UnboundLocalError on `analysis`.

    Reads the notebook-level names alignment_dir, seq_data, tb_species, genome_ids,
    pairwise_fitted_parameters, pairwise_observation_probabilities,
    extended_non_cds_regions_output_dir, Alignment_HMM_Model,
    literature_annotations_df_list and the per-model parameters and models.
    """
    # Resolve the model-specific arguments first so a bad label fails before any work
    # is done. Only the selected branch touches its notebook-level names.
    # NOTE(review): the integer (2 or 3) is passed straight after the model label;
    # presumably the state count of the selected model - confirm in Alignment_Analysis.
    if model_type == 'Simple':
        model_int_arg, model_parameters, model_object = 2, master_fitted_parameters, Master_Alignment_HMM_Model
    elif model_type == 'Multi_Species':
        model_int_arg, model_parameters, model_object = 2, multi_species_master_fitted_parameters, Multi_Species_Master_Alignment_HMM_Model
    elif model_type == 'AE':
        model_int_arg, model_parameters, model_object = 3, AE_fitted_parameters, AE_HMM_Model
    else:
        raise ValueError(
            'Unknown model_type ' + repr(model_type) + "; expected 'Simple', 'Multi_Species' or 'AE'"
        )

    output_list = []
    for group_id in util.chunk_list(ids, num_subsets, subset_num):
        alignment = align.Alignment(alignment_dir+'/'+str(group_id)+'.fasta', tb_species, 'NT')
        analysis = alignment_analysis.Alignment_Analysis(
            analysis_type, alignment, seq_data, group_id, 3, pairwise_fitted_parameters, model_type, model_int_arg,
            model_parameters, extended_non_cds_regions_output_dir, tb_species, genome_ids, pairwise_observation_probabilities,
            Alignment_HMM_Model, model_object, literature_annotations_df_list)
        output_list.append((group_id, analysis))
    return output_list

In [None]:
# Build and pickle one conservation-analysis dictionary per model type.
# This cell is deliberately not gated on full_run (the guard was disabled by hand), so
# it always runs; the former always-true `if 1==1:` placeholder has been removed.
analysis_type = 'Extended_CDS'
# alignment_dir is also read as a notebook-level name by parallel_build_analysis_dictionary.
alignment_dir = extended_cds_output_dir
# The alignment folder does not depend on the model, so list it once.
file_ids = util.list_files(alignment_dir+'/')
ids = [int(i.split('.')[0]) for i in file_ids]
for model_type in ['Simple', 'Multi_Species', 'AE']:
    dict_name = model_type + '_' + 'extended_cds_conservation_info_dictionary'
    parallel_output = Parallel(n_jobs=-1)(delayed(parallel_build_analysis_dictionary)(num_cores, core_number, ids, analysis_type, model_type) for core_number in tqdm(core_numbers))
    # Flatten the per-chunk lists of (group id, analysis) pairs.
    dictionary_list = [item for sublist in parallel_output for item in sublist]
    # Key each analysis by the reference species' locus tag for that group.
    alignment_info_dict = {
        seq_data.master_species_info(group, 'locus_tag'): analysis
        for (group, analysis) in dictionary_list
    }
    with open(conservation_analysis_output_dir + '/' + dict_name + '.pkl', 'wb') as f:
        pickle.dump(alignment_info_dict, f)