#### Directories etc

In [35]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from Comparative_Analysis import ORF_Functions as orffn
from Comparative_Analysis import Alignment as alignfn
from random import sample
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
project_dir = 'F:/Project_Data/E_Pump_Project'
seq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
sequence_dirs = util.list_dirs(seq_dir)
tb_species = 'NC_000962.3' 
tb_annotation_dirname = 'GCF_000195955.2'
num_cores = 16
core_numbers = list(range(1, num_cores+1))
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'
full_build = False

#### Create file with all CDS for species and create BLAST databases for TB CDS and All CDS (to do reciprocal best hits)

In [7]:
def generate_protein_dataset(num_subsets, subset_num, dir_list): 
    sequence_dirs = util.chunk_list(dir_list, num_subsets, subset_num)
    all_cds = []
    all_tb_cds = []
    names = []
    sequences = []
    locations = []
    for dirname in (sequence_dirs):
        for genome_record in SeqIO.parse(seq_dir + '/' + dirname + '/genomic.gbff', "genbank"):
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
            names.append([accession_ver, genome_record.annotations['organism']])
            full_sequence = str(genome_record.seq)
            sequences.append([accession_ver, full_sequence])
            for feature in genome_record.features:
                a = feature.qualifiers
                if feature.type == 'CDS' and a.get("translation") != None:
                    locus_tag = a.get("locus_tag")[0]
                    accession_locus = accession_ver + '@' + locus_tag
                    translation = a.get("translation")[0]
                    (start, stop, strand) = (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
                    locations.append([accession_locus, (start, stop, strand)])
                    all_cds.append([accession_locus, translation])
                    if dirname == tb_annotation_dirname:
                        all_tb_cds.append([accession_locus, translation])
    return (all_cds, all_tb_cds, names, locations, sequences)           

In [8]:
#if full_build == True:
if 1==1:
    parallel_output = Parallel(n_jobs=-1)(delayed(generate_protein_dataset)(num_cores, core_number, sequence_dirs) for core_number in core_numbers)
    names_dict = {}
    locations_dict = {}
    sequence_dict = {}
    all_cds = []
    all_tb_cds = []
    for x in parallel_output:
        all_cds += x[0]
        all_tb_cds += x[1]
        for temp in x[2]:
            names_dict[temp[0]] = temp[1]
        for temp in x[3]:
            locations_dict[temp[0]] = temp[1]
        for temp in x[4]:
            sequence_dict[temp[0]] = temp[1]

In [None]:
with open(project_dir + '/names_dict.pkl', 'wb') as f:
        pickle.dump(names_dict, f) 

In [None]:
if full_build == True:
    util.produce_fasta_file(all_cds, project_dir + '/all_cds.fasta')
    util.produce_fasta_file(all_tb_cds, project_dir + '/all_tb_cds.fasta')
    blastfn.build_blast_db(project_dir, 'all_cds.fasta', 'Mycobacteriaceae_Refseq', 'F:/Datasets/BLAST/Mycobacteriaceae_Refseq')
    blastfn.build_blast_db(project_dir, 'all_tb_cds.fasta', 'all_tb_cds', 'F:/Datasets/BLAST/all_tb_cds')

In [None]:
if full_build == True:
    blastfn.run_blastp('F:/Datasets/BLAST/Mycobacteriaceae_Refseq', 'all_tb_cds.fasta', 'Mycobacteriaceae_Refseq', 'all_tb_cds_hits.csv', e_value = 1e-10)
    blastfn.run_blastp('F:/Datasets/BLAST/all_tb_cds', 'all_cds.fasta', 'all_tb_cds', 'reverse_hits.csv', e_value = 1e-10)

In [None]:
if full_build == True:
#if 1==1:
    a = blastfn.process_blast_output('F:/Datasets/BLAST/Mycobacteriaceae_Refseq/all_tb_cds_hits.csv', names_dict, top_hit_only = False)
    a = a.loc[a.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()]     # Utility top hit method uses accession ver which can differ if multiple sets exist per species in fragmented annotations
    b = blastfn.process_blast_output('F:/Datasets/BLAST/all_tb_cds/reverse_hits.csv', names_dict, top_hit_only = False)
    b = b.loc[b.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()] 
    rbh =  blastfn.keep_reciprocal_best_hits(a, b)
    rbh.to_csv(project_dir + '/reciprocal_best_hits.csv')
else:
    rbh = pd.read_csv(project_dir + '/reciprocal_best_hits.csv')

#### Subset based on loci of interest, and filter orthologues based on percent identity, synteny

In [None]:
loci = ['Rv3855', 'Rv3854c']    # [TetR,  divergent_gene]
target_loci = [tb_species+ '@' + x for x in loci]

In [None]:
rbh = pd.read_csv(project_dir + '/reciprocal_best_hits.csv')
rbh = rbh[rbh['query_ref'].isin(target_loci)]
rbh['target_loc'] = rbh['target_ref'].map(locations_dict)
rbh['cds_count'] = rbh.groupby('target_species_name')['query_ref'].transform('size')
rbh['min_pct_id'] = rbh.groupby('target_species_name')['percent_identical_matches'].transform('min')

In [None]:
rbh = rbh[rbh['cds_count'] == 2]
for i, r in rbh.iterrows():
    (start, stop, strand) = r['target_loc']
    rbh.at[i,'start'] = start
    rbh.at[i,'stop'] =  stop
    rbh.at[i,'strand'] = strand
rbh['min_start'] = rbh.groupby('target_species_name')['start'].transform('min')
rbh['max_stop'] = rbh.groupby('target_species_name')['stop'].transform('max')
rbh['region_size'] = rbh['max_stop'] - rbh['min_start']
rbh = rbh[rbh['region_size'] < 5000]
rbh = rbh[rbh['min_pct_id'] >= 70]

#### Generate FASTA file containing intergenic regions in orthologous species and run Muscle / R-scape

In [None]:
intergenic_regions = []
tetR_regions = []
for target_species in list(rbh['target_species_name'].unique()):
    temp = rbh[rbh['target_species_name'] == target_species]
    for i, r in temp.iterrows():
        target_species_accession = r['target_species']
        if r['query_ref'].split('@')[1] == loci[1]:
            divgene_coords = r['target_loc']
        if r['query_ref'].split('@')[1] == loci[0]:
            tetR_coords = r['target_loc']
    if divgene_coords[2] == 1:
        intergenic_region = (tetR_coords[1], divgene_coords[0]+3, 1)
    else:
        intergenic_region = (divgene_coords[1] - 3, tetR_coords[0], -1)
    if intergenic_region[2] == 1:
        intergenic_sequence = sequence_dict[target_species_accession][intergenic_region[0]: intergenic_region[1]]
    else:
        intergenic_sequence = util.reverse_complement(sequence_dict[target_species_accession][intergenic_region[0]: intergenic_region[1]])
    intergenic_regions.append([target_species.replace(" ", "_"), intergenic_sequence])
    
    if tetR_coords[2]==1:    
        tetR_sequence = sequence_dict[target_species_accession][tetR_coords[0]: tetR_coords[1]]
    else:
        tetR_sequence = util.reverse_complement(sequence_dict[target_species_accession][tetR_coords[0]: tetR_coords[1]]) 
    tetR_regions.append([target_species.replace(" ", "_"), tetR_sequence])
    
util.produce_fasta_file(intergenic_regions, project_dir +'/intergenic_regions.fasta')
cline = MuscleCommandline(muscle_exe, input= project_dir +'/intergenic_regions.fasta', out=project_dir + '/intergenic_region_'+loci[0]+'_'+loci[1] + '_alignment.fasta')
result = cline();
#blastfn.convert_fasta_to_stockholm(project_dir, '/intergenic_region_'+loci[0]+'_'+loci[1] + '_alignment.fasta', '/intergenic_region_'+loci[0]+'_'+loci[1] + '_alignment.sto)

#util.produce_fasta_file(tetR_regions, project_dir +'/tetR_regions.fasta')
#cline = MuscleCommandline(muscle_exe, input= project_dir +'/'+'tetR_regions.fasta', out=project_dir + '/tetR_region_alignment3.fasta')
#result = cline();
#blastfn.convert_fasta_to_stockholm(project_dir, 'tetR_region_alignment3.fasta', 'tetR_region_alignment3.sto')

In [None]:
blastfn.run_rscape(project_dir, 'intergenic_region_alignment3.sto', 'rscape_output')

#### Produce full ortholog file for phylogenetic tree

In [9]:
cds_dict = {}
for (x, y) in tqdm(all_cds):
    cds_dict[x] = y

100%|██████████| 1116061/1116061 [00:00<00:00, 2371354.64it/s]


In [10]:
rbh = pd.read_csv(project_dir + '/reciprocal_best_hits.csv')

In [11]:
query_refs = list(rbh['query_ref'].unique())

In [12]:
full_ortholog_refs = []
for ref in tqdm(query_refs):
    temp = rbh[rbh['query_ref'] == ref]
    if len(temp) == len(set(names_dict.values())):
        full_ortholog_refs.append(ref)

100%|██████████| 3896/3896 [01:07<00:00, 57.33it/s]


In [13]:
len(full_ortholog_refs)

325

In [43]:
concatenated_alignment_dict = {}
for ref in tqdm(full_ortholog_refs):
    temp_seq = []
    temp = rbh[rbh['query_ref'] == ref]
    for i, r in temp.iterrows():
        temp_seq.append([r['target_species_name'].replace(' ','_'),cds_dict[r['target_ref']]])
    util.produce_fasta_file(temp_seq, project_dir +'/temp_seq.fasta')    
    cline = MuscleCommandline(muscle_exe, input= project_dir +'/temp_seq.fasta', out=project_dir +'/temp_seq_alignment.fasta')
    result = cline();    
    alignment = util.read_fasta_to_array(project_dir +'/temp_seq_alignment.fasta')    
    for (name, sequence) in zip(alignment[0], alignment[1]):
        if name in concatenated_alignment_dict:
            temp2 = concatenated_alignment_dict[name]
            concatenated_alignment_dict[name] = temp2 + sequence
        else:
            concatenated_alignment_dict[name] = sequence
temp  = []
for k, v in concatenated_alignment_dict.items():
    temp.append([k, v])
util.produce_fasta_file(temp, project_dir + '/concatenated_alignment.fasta')

  0%|          | 0/325 [00:00<?, ?it/s]
100%|██████████| 215/215 [00:00<00:00, 21503.61it/s]
  0%|          | 1/325 [00:06<37:29,  6.94s/it]
100%|██████████| 215/215 [00:00<00:00, 35910.14it/s]
  1%|          | 2/325 [00:12<32:19,  6.00s/it]
100%|██████████| 215/215 [00:00<00:00, 43023.63it/s]
  1%|          | 3/325 [00:17<29:54,  5.57s/it]
100%|██████████| 215/215 [00:00<00:00, 23970.64it/s]
  1%|          | 4/325 [00:28<41:05,  7.68s/it]
100%|██████████| 215/215 [00:00<00:00, 14330.50it/s]
  2%|▏         | 5/325 [00:49<1:07:59, 12.75s/it]
100%|██████████| 215/215 [00:00<00:00, 105298.38it/s]
  2%|▏         | 6/325 [00:51<47:27,  8.92s/it]  
100%|██████████| 215/215 [00:00<00:00, 71393.82it/s]
  2%|▏         | 7/325 [00:54<37:45,  7.12s/it]
100%|██████████| 215/215 [00:00<00:00, 23871.65it/s]
  2%|▏         | 8/325 [01:03<40:04,  7.59s/it]
100%|██████████| 215/215 [00:00<00:00, 30475.68it/s]
  3%|▎         | 9/325 [01:09<37:03,  7.04s/it]
100%|██████████| 215/215 [00:00<00:00, 35818.8

In [45]:
subprocess.run('cd c:\\users\\nicho\\IQTree & bin\\iqtree2 -s ' +  'concatenated_alignment.fasta' + ' --prefix Concatenated_Mycobacteria_Tree -m JTT -B 1000 -T AUTO ', shell=True)
   

CompletedProcess(args='cd c:\\users\\nicho\\IQTree & bin\\iqtree2 -s concatenated_alignment.fasta --prefix Concatenated_Mycobacteria_Tree -m JTT -B 1000 -T AUTO ', returncode=0)

In [58]:
i=0
dist_names = []
distances = []
with open(project_dir + '/Concatenated_Mycobacteria_Tree.mldist','r') as ofile: 
    sequence_names = []
    sequence_list = []
    first_seq = 0
    for l in ofile:
        m = l.strip('\n')
        if i > 0:
            dist_names.append(m.split(' ')[0])
            if 'tuberculosis' in m.split(' ')[0]:
                temp= (m.split(' ')[1:])
                for x in temp:
                    if len(x) > 0:
                        distances.append(float(x))
        i+=1
dist_dict = {}
for (name, dist) in zip(dist_names, distances):
    dist_dict[name] = dist
