#### Import packages, set directories and parameters

In [103]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2
from Comparative_Analysis import Alignment as alignfn
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [114]:
# Run configuration shared by the cells below.
# Root folder for the input spreadsheets and for every MEME/FIMO output directory.
project_dir = 'F:/Project_Data/mabR_Project'
# Folder whose sub-directories each may hold a 'genomic.gbff' GenBank file.
mycobacteria_seq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_Complete_Annot_20230511/data'
# NOTE(review): not referenced in the visible cells; presumably the M. tb accession - confirm.
tb_species = 'AL123456.3'
# Sub-directory of mycobacteria_seq_dir that holds the M. tb GenBank record.
tb_annotation_dirname = 'GCA_000195955.2'
# Upstream regions shorter than this (start codon excluded) are not written to the MEME input.
min_upstream_region_length = 7
# Gate for the expensive protein-dataset build (a later cell resets it to True before testing it).
full_build = False
# Number of chunks the genome directories are split into for the parallel build.
num_cores = 16
# 1-based chunk identifiers, one per parallel job.
core_numbers = list(range(1, num_cores+1))
# Path to the MUSCLE executable used for the ortholog alignments.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

##### Produce full sequence file of M. tb for FIMO searches and also list of all feature co-ordinates and inter-feature co-ordinates for locating hits

In [69]:
# Take the first record of the M. tb GenBank file and dump its whole sequence
# as a one-entry FASTA file ('H37Rv') for the FIMO searches further down.
genome_record = next(
    SeqIO.parse(f'{mycobacteria_seq_dir}/{tb_annotation_dirname}/genomic.gbff', "genbank")
)
full_sequence = str(genome_record.seq)
util.produce_fasta_file(
    [['H37Rv', full_sequence]],
    f'{project_dir}/MEME_Output/mtb_seq',
)

100%|██████████| 1/1 [00:00<00:00, 11.98it/s]


In [44]:
# Build `annotated_regions`: one row per annotated feature of the M. tb record
# plus one 'Inter-feature' row per stretch not covered by any kept feature.
# Row layout: (locus, product, feature type, start, stop, strand-as-string).
# Inter-feature rows join the flanking loci/products/strands with ':'.
annotated_regions = []
intergenic_regions = []
record = next(SeqIO.parse(mycobacteria_seq_dir + '/' + tb_annotation_dirname + '/genomic.gbff', "genbank"))
for feature in record.features:
    start, stop = int(feature.location.start), int(feature.location.end)
    # Skip container features and anything empty or spanning 1 Mb or more.
    if feature.type in ('source', 'gene') or not (0 < stop - start < 1000000):
        continue
    qualifiers = feature.qualifiers
    # Default to '' so that a feature without a /product qualifier neither
    # raises NameError nor silently inherits the previous feature's product.
    product = qualifiers.get("product", [''])[0]
    # Features without a locus tag are labelled by their feature type.
    locus_tag = qualifiers.get("locus_tag", [feature.type])[0]
    annotated_regions.append((locus_tag, product, feature.type, start, stop, str(feature.location.strand)))

# The gap sweep must visit features in START order: only then is everything
# between max_stop and the current start guaranteed to be uncovered, because
# no later feature can begin before the current one.
annotated_regions.sort(key=lambda region: region[3])
prev_strand = 0
prev_locus = ''
prev_product = ''
max_stop = 0
for locus, product, feature_type, start, stop, strand in annotated_regions:
    if start > max_stop:
        intergenic_regions.append([
            prev_locus + ':' + locus,
            prev_product + ':' + product,
            'Inter-feature',
            max_stop,
            start,
            str(prev_strand) + ':' + str(strand),
        ])
    if stop > max_stop:
        # This feature now reaches furthest right, so it is the left flank of the next gap.
        prev_locus = locus
        prev_product = product
        prev_strand = strand
        max_stop = stop

annotated_regions.extend(intergenic_regions)
# Final table is ordered by stop coordinate.
annotated_regions.sort(key=lambda region: region[4])

In [11]:
# Map each CDS locus tag to the sequence between the nearest preceding feature
# and the CDS start, with the 3-base start codon kept at the end of the string.
# Minus-strand entries are reverse-complemented so every entry reads 5'->3'
# towards the gene.
tb_upstream_dict = {}
all_features = []
genome_record = next(
    SeqIO.parse(f'{mycobacteria_seq_dir}/{tb_annotation_dirname}/genomic.gbff', "genbank")
)
full_sequence = str(genome_record.seq)
len_full_sequence = len(full_sequence)
for feature in genome_record.features:
    if feature.type in ('gene', 'source'):
        continue
    locus_tags = feature.qualifiers.get("locus_tag")
    location = feature.location
    all_features.append([
        locus_tags[0] if locus_tags is not None else '',
        feature.type,
        int(location.start),
        int(location.end),
        int(location.strand),
    ])

# Plus strand: walk left to right, remembering the furthest stop seen so far.
all_features.sort(key=lambda row: row[2])
max_stop = 0
for locus, feature_type, start, stop, strand in all_features:
    gap = start - max_stop
    # The 100 kb cap avoids joins where biopython interprets inconsistently.
    if feature_type == 'CDS' and strand == 1 and 0 < gap < 100000:
        tb_upstream_dict[locus] = full_sequence[max_stop:start + 3]
    max_stop = max(max_stop, stop)

# Minus strand: walk right to left, remembering the smallest start seen so far.
all_features.sort(key=lambda row: row[3], reverse=True)
min_start = len_full_sequence - 1
for locus, feature_type, start, stop, strand in all_features:
    gap = min_start - stop
    if feature_type == 'CDS' and strand == -1 and 0 < gap < 100000:
        tb_upstream_dict[locus] = util.reverse_complement(full_sequence[stop - 3:min_start])
    min_start = min(min_start, start)

##### Import list of significantly up/downregulated genes and produce fasta file of upstream sequences

In [50]:
# Load the differential-expression results.
#   gene_list            - loci from the down- then up-regulated sheets, in sheet order
#   regulation_type_dict - locus -> 'Downregulated' / 'Upregulated'
#   p_val_dict           - locus -> adjusted p-value, or 999 when none is available
regulation_type_dict = {}
p_val_dict = {}
# NOTE(review): the downregulated sheet is read from a different workbook than
# the other two sheets - confirm this is intentional.
downreg_genes = pd.read_excel(project_dir + '/2022-10-02_data_NU_NAs_renamed.xlsx', sheet_name='significant genes downregulated')
upreg_genes = pd.read_excel(project_dir + '/2022-10-02_data.xlsx', sheet_name='significant genes upregulated')
all_genes = pd.read_excel(project_dir + '/2022-10-02_data.xlsx', sheet_name='all genes')
gene_list = []
for genes, label in ((downreg_genes, 'Downregulated'), (upreg_genes, 'Upregulated')):
    for locus in genes['Locus']:
        gene_list.append(locus)
        regulation_type_dict[locus] = label
for locus, padj in zip(all_genes['Locus'], all_genes['padj']):
    # pandas parses "NA" cells as NaN by default, so a plain == 'NA' comparison
    # never matches; test for missing values as well as the literal string.
    if pd.isna(padj) or padj == 'NA':
        p_val_dict[locus] = 999
    else:
        p_val_dict[locus] = padj

In [19]:
# Collect [locus, upstream sequence] for every regulated gene that has an
# upstream entry, then write them out as the MEME input file.
significant_upstream_tb_regions = []
for gene in gene_list:
    locus = str(gene)
    if locus not in tb_upstream_dict:
        continue
    # Drop the trailing start codon of the downstream gene.
    upstream_region = tb_upstream_dict[locus][:-3]
    if len(upstream_region) < min_upstream_region_length:
        continue
    significant_upstream_tb_regions.append([locus, upstream_region])
util.produce_fasta_file(
    significant_upstream_tb_regions,
    project_dir + '/significant_upstream_tb_regions.faa',
)

100%|██████████| 177/177 [00:00<00:00, 177097.28it/s]


##### Functions to run MEME and FIMO

In [22]:
def run_meme(search_regions_loc, output_dir, min_width, min_sites):
    """Run MEME motif discovery through WSL.

    Builds one shell command that extends PATH inside WSL and then calls
    ``meme`` in DNA mode on both strands (``-dna -revcomp``) with the
    ``anr`` site model, an E-value threshold of 0.01 and a maximum motif
    width of 200.

    Args:
        search_regions_loc: path of the FASTA file to search; passed through
            ``util.wslname`` (presumably a Windows-to-WSL path conversion -
            TODO confirm).
        output_dir: output directory given to ``-oc``; also passed through
            ``util.wslname``.
        min_width: value for ``-minw``.
        min_sites: value for ``-minsites``.

    Returns:
        None. The ``subprocess.run`` result is discarded, so a failing
        command is not reported to the caller.
    """
    command_parts = [
        'wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; meme ',
        util.wslname(search_regions_loc),
        ' -oc ',
        util.wslname(output_dir),
        ' -dna -evt 0.01 -revcomp -mod anr -brief 4000 -minw ',
        str(min_width),
        ' -maxw 200 -minsites ',
        str(min_sites),
    ]
    subprocess.run(''.join(command_parts), shell=True)

In [24]:
def run_fimo(motif_file, sequence_to_search_file, output_dir):
    """Run FIMO through WSL to scan a sequence file for the given motifs.

    Args:
        motif_file: motif file handed to ``fimo`` (the callers in this file
            pass a MEME ``meme.txt``).
        sequence_to_search_file: FASTA file to scan.
        output_dir: output directory given to ``-oc``.

    All three paths are passed through ``util.wslname`` (presumably a
    Windows-to-WSL path conversion - TODO confirm).

    Returns:
        None. The ``subprocess.run`` result is discarded.
    """
    command_parts = [
        'wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; fimo -oc ',
        util.wslname(output_dir),
        ' ',
        util.wslname(motif_file),
        ' ',
        util.wslname(sequence_to_search_file),
    ]
    subprocess.run(''.join(command_parts), shell=True)

##### Run MEME, search against tb using FIMO

In [23]:
# First MEME pass: motif discovery on the upstream regions of the regulated genes.
run_meme(
    search_regions_loc=project_dir + '/significant_upstream_tb_regions.faa',
    output_dir=project_dir + '/MEME_Output',
    min_width=3,
    min_sites=5,
)

In [25]:
# Scan the whole M. tb sequence with the motifs found by the first MEME pass.
run_fimo(
    motif_file=project_dir + '/MEME_Output/meme.txt',
    sequence_to_search_file=project_dir + '/MEME_Output/mtb_seq',
    output_dir=project_dir + '/FIMO_Output',
)

In [26]:
# Load the tab-separated hit table written by the first FIMO run.
fimo_hits = pd.read_csv(
    f'{project_dir}/FIMO_Output/fimo.tsv',
    sep='\t',
)

In [30]:
# Keep the strongest MEME-1 hits (q-value < 1e-5), named '<start>_<stop>', and
# write their matched sequences as the input of the second MEME pass.
# Rows whose motif_id contains '#' are skipped before any other column is read.
matched_sequences = [
    [str(r['start']) + '_' + str(r['stop']), r['matched_sequence']]
    for _, r in fimo_hits.iterrows()
    if '#' not in r['motif_id']
    and float(r['q-value']) < 1e-5
    and r['motif_alt_id'] == 'MEME-1'
]
util.produce_fasta_file(
    matched_sequences,
    project_dir + '/fimo_hit_sequences.faa',
)

100%|██████████| 113/113 [00:00<00:00, 113089.08it/s]


In [31]:
# Second MEME pass: rediscover the motif from the genome-wide FIMO hit sequences.
run_meme(
    search_regions_loc=project_dir + '/fimo_hit_sequences.faa',
    output_dir=project_dir + '/Second_MEME_Output',
    min_width=3,
    min_sites=5,
)

In [32]:
# Scan the M. tb sequence again, this time with the second-pass motifs.
run_fimo(
    motif_file=project_dir + '/Second_MEME_Output/meme.txt',
    sequence_to_search_file=project_dir + '/MEME_Output/mtb_seq',
    output_dir=project_dir + '/Second_FIMO_Output',
)

In [34]:
# Load the tab-separated hit table written by the second FIMO run.
second_fimo_hits = pd.read_csv(
    f'{project_dir}/Second_FIMO_Output/fimo.tsv',
    sep='\t',
)

In [35]:
def count_total_inter_feature(temp):
    """Sum the overlap percentages of the 'Inter-feature' rows in *temp*.

    Each row is indexed positionally: item 2 is the feature type and item 3
    is a percentage string such as '50%'. Rows of any other type are
    ignored. Returns the integer 0 when no row qualifies, otherwise a float.
    """
    total = 0
    inter_feature_rows = (row for row in temp if row[2] == 'Inter-feature')
    for row in inter_feature_rows:
        total = total + float(row[3].rstrip('%'))
    return total

In [48]:
# Annotate every MEME-1 FIMO hit with q-value < 0.1:
#   - which annotated / inter-feature regions it overlaps, and by how much,
#   - which regulated genes are mentioned in those regions,
#   - which genes the hit lies upstream of.
# Results: filtered_hits (full record), matched_sequences (name, sequence,
# upstream genes) and upstream_hit_summary (upstream genes, q-value).
upstream_hit_summary = []
seq_hits = []
filtered_hits = []
fimo_hit_positions = []
matched_sequences = []
for i, r in fimo_hits.iterrows():
    if '#' in r['motif_id']:
        continue
    fimo_hit_positions.append((int(r['start']), int(r['stop']), float(r['q-value']), r['motif_alt_id'], r['matched_sequence']))
# Sort once, after all rows are collected (best q-value first). The sort is
# stable, so the order is the same as re-sorting after every append.
fimo_hit_positions.sort(key=lambda x: x[2])

# String form of each regulated gene, computed once for the substring test below.
gene_names = [(gene, str(gene)) for gene in gene_list]

for (start1, end1, q, alt_id, matched_sequence) in fimo_hit_positions:
    # Only MEME-1 hits below the q-value cut-off are reported, so skip the
    # overlap scan for everything else.
    if not (q < 1e-1 and alt_id == 'MEME-1'):
        continue
    feature_matches = []
    for (locus, product, feature, start, stop, strand) in annotated_regions:
        if start < end1 and stop > start1:
            # Share of the hit covered by this region, as a whole-number percentage.
            overlap = str(int(100*(min(end1, stop) - max(start1, start)) / (end1-start1)))+'%'
            feature_matches.append([locus, product, feature, overlap, strand])
    match_list = []
    upstream_genes = []
    for x in feature_matches:
        genes_in_feature = x[0].split(':')
        strands_in_feature = x[4].split(':')
        # Two strands means an inter-feature row 'left:right'. The gap is
        # upstream of the left gene if that gene is on the minus strand and
        # upstream of the right gene if that gene is on the plus strand.
        if len(strands_in_feature) == 2:
            if strands_in_feature[0] == '-1':
                upstream_genes.append(genes_in_feature[0])
            if strands_in_feature[1] == '1':
                upstream_genes.append(genes_in_feature[1])
        # NOTE(review): this is a substring test against every field of the
        # row (locus, product, type, overlap, strand), not only the locus.
        for y in x:
            field_text = str(y)
            for gene, gene_name in gene_names:
                if gene_name in field_text:
                    match_list.append(gene)
    output_info = (alt_id, q, match_list, start1, end1, feature_matches, matched_sequence, upstream_genes)
    matched_sequences.append([str(start1)+'_'+str(end1), matched_sequence, upstream_genes])
    filtered_hits.append(output_info)
    if len(upstream_genes) > 0:
        upstream_hit_summary.append((upstream_genes, q))
#pd.DataFrame(filtered_hits, columns = ['alt_id','q-value', 'up/down_regulated_matches', 'start', 'end', 'feature_positions', 'matched_sequence', 'upstream_genes']).to_csv(project_dir + '/filtered_fimo_hits.csv')

In [None]:
# For every hit that lies upstream of at least one gene, print the hit summary
# together with the adjusted p-value of each of those genes (-1 when the gene
# has no entry in p_val_dict).
for x in upstream_hit_summary:
    p_vals = [p_val_dict.get(gene, -1) for gene in x[0]]
    # Print once per hit, after all of its p-values have been collected.
    print(x, p_vals)

##### Number of motif hits in different species

In [98]:
# Count strong MEME-1 hits (q-value < 1e-7) in every genome directory that
# contains a 'genomic.gbff' file. Each record of a file is scanned separately
# with FIMO and the hits of all its records are pooled.
hit_info = []
for dirname in util.list_dirs(mycobacteria_seq_dir):
# for dirname in [tb_annotation_dirname]:   # restrict the run to M. tb
    gbff_path = mycobacteria_seq_dir + '/' + dirname + '/genomic.gbff'
    if not os.path.exists(gbff_path):
        continue
    temp_fimo_hit_positions = []
    organism_name = None
    for genome_record in SeqIO.parse(gbff_path, "genbank"):
        organism_name = genome_record.annotations['organism']
        full_sequence = str(genome_record.seq)
        # The temporary FASTA file and FIMO output directory are reused for every record.
        util.produce_fasta_file([['temp', full_sequence]], project_dir + '/MEME_Output/temp_seq')
        run_fimo(project_dir + '/MEME_Output/meme.txt', project_dir + '/MEME_Output/temp_seq', project_dir + '/Temp_FIMO_Output')
        temp_fimo_hits = pd.read_csv(project_dir + '/Temp_FIMO_Output/fimo.tsv', sep='\t')
        for i, r in temp_fimo_hits.iterrows():
            if '#' in r['motif_id']:
                continue
            if r['motif_alt_id'] == 'MEME-1' and float(r['q-value']) < 1e-7:
                temp_fimo_hit_positions.append((int(r['start']), int(r['stop']), float(r['q-value']), r['motif_alt_id'], r['matched_sequence']))
    if organism_name is None:
        # The file held no records: nothing to report, and no name to report it under.
        continue
    temp_fimo_hit_positions.sort(key=lambda x: x[2])
    hit_info.append([organism_name.replace(' ', '_'), 'Num_hits', len(temp_fimo_hit_positions)])
    print(organism_name.replace(' ', '_'), len(temp_fimo_hit_positions))

100%|██████████| 1/1 [00:00<00:00, 12.94it/s]


Mycobacterium tuberculosis H37Rv 91


100%|██████████| 1/1 [00:00<00:00, 15.46it/s]


Mycolicibacterium brumae 0


100%|██████████| 1/1 [00:00<00:00, 11.22it/s]


Mycolicibacterium thermoresistibile 1


100%|██████████| 1/1 [00:00<00:00, 12.99it/s]


Mycolicibacter terrae 4


100%|██████████| 1/1 [00:00<00:00, 11.17it/s]


Mycobacterium basiliense 0


100%|██████████| 1/1 [00:00<00:00, 10.71it/s]


Mycolicibacterium hassiacum DSM 44199 24


100%|██████████| 1/1 [00:00<00:00, 10.65it/s]


Mycolicibacterium aurum 0


100%|██████████| 1/1 [00:00<00:00, 11.28it/s]


Mycolicibacterium chitae 0


100%|██████████| 1/1 [00:00<00:00, 14.24it/s]
100%|██████████| 1/1 [00:00<00:00, 995.80it/s]


Mycolicibacillus parakoreensis 2


100%|██████████| 1/1 [00:00<00:00, 10.73it/s]
100%|██████████| 1/1 [00:00<00:00, 333.28it/s]
100%|██████████| 1/1 [00:00<00:00, 992.97it/s]


Mycobacterium rufum 0


100%|██████████| 1/1 [00:00<00:00, 10.99it/s]


Mycolicibacter virginiensis 35


100%|██████████| 1/1 [00:00<00:00, 11.36it/s]


Mycobacterium paraterrae 1


100%|██████████| 1/1 [00:00<00:00, 12.91it/s]
100%|██████████| 1/1 [00:00<00:00, 328.58it/s]


Candidatus Mycobacterium methanotrophicum 0


100%|██████████| 1/1 [00:00<00:00,  9.52it/s]
100%|██████████| 1/1 [00:00<00:00, 165.65it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]


Mycolicibacterium crocinum 3


100%|██████████| 1/1 [00:00<00:00,  7.38it/s]


Mycobacterium goodii 77


100%|██████████| 1/1 [00:00<00:00, 10.48it/s]
100%|██████████| 1/1 [00:00<00:00, 502.37it/s]


Mycobacterium ulcerans 0


100%|██████████| 1/1 [00:00<00:00, 12.20it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]


Mycobacterium intracellulare 2


100%|██████████| 1/1 [00:00<00:00, 11.57it/s]


Mycobacterium ostraviense 8


100%|██████████| 1/1 [00:00<00:00, 11.05it/s]


Mycolicibacter heraklionensis 20


100%|██████████| 1/1 [00:00<00:00, 10.68it/s]


Mycolicibacterium holsaticum DSM 44478 = JCM 12374 0


100%|██████████| 1/1 [00:00<00:00, 11.83it/s]


Mycobacterium malmoense 13


100%|██████████| 1/1 [00:00<00:00, 10.45it/s]


Mycolicibacterium senegalense 19


100%|██████████| 1/1 [00:00<00:00,  9.79it/s]
100%|██████████| 1/1 [00:00<00:00, 501.17it/s]
100%|██████████| 1/1 [00:00<00:00, 501.17it/s]
100%|██████████| 1/1 [00:00<00:00, 951.09it/s]


Mycolicibacterium farcinogenes 24


100%|██████████| 1/1 [00:00<00:00,  9.60it/s]
100%|██████████| 1/1 [00:00<00:00, 1003.18it/s]


Mycobacterium paraintracellulare 2


100%|██████████| 1/1 [00:00<00:00, 10.58it/s]


Mycobacterium heckeshornense 19


100%|██████████| 1/1 [00:00<00:00, 10.35it/s]


Mycobacterium spongiae 0


100%|██████████| 1/1 [00:00<00:00, 11.30it/s]


Mycolicibacterium neoaurum 2


100%|██████████| 1/1 [00:00<00:00, 10.35it/s]


Mycolicibacterium diernhoferi 0


100%|██████████| 1/1 [00:00<00:00, 10.92it/s]
100%|██████████| 1/1 [00:00<00:00, 1000.55it/s]
100%|██████████| 1/1 [00:00<00:00, 994.15it/s]


Mycolicibacterium pallens 5


100%|██████████| 1/1 [00:00<00:00, 11.69it/s]
100%|██████████| 1/1 [00:00<00:00, 198.65it/s]


Mycolicibacterium arabiense 0


100%|██████████| 1/1 [00:00<00:00, 12.56it/s]


Mycolicibacillus koreensis 0


100%|██████████| 1/1 [00:00<00:00, 11.49it/s]


Mycolicibacterium anyangense 3


100%|██████████| 1/1 [00:00<00:00, 11.97it/s]


Mycolicibacterium sarraceniae 0


100%|██████████| 1/1 [00:00<00:00, 10.52it/s]


Mycolicibacterium helvum 0


100%|██████████| 1/1 [00:00<00:00, 10.69it/s]


Mycobacterium vicinigordonae 29


100%|██████████| 1/1 [00:00<00:00, 11.29it/s]


Mycobacterium stomatepiae 0


100%|██████████| 1/1 [00:00<00:00, 10.05it/s]


Mycolicibacterium sediminis 0


100%|██████████| 1/1 [00:00<00:00, 11.83it/s]


Mycolicibacter minnesotensis 0


100%|██████████| 1/1 [00:00<00:00, 10.91it/s]


Mycobacterium mantenii 0


100%|██████████| 1/1 [00:00<00:00, 11.30it/s]


Mycolicibacterium celeriflavum 4


100%|██████████| 1/1 [00:00<00:00, 10.68it/s]


Mycolicibacterium monacense 25


100%|██████████| 1/1 [00:00<00:00, 11.03it/s]


Mycobacterium seoulense 0


100%|██████████| 1/1 [00:00<00:00, 11.40it/s]


Mycolicibacterium insubricum 0


100%|██████████| 1/1 [00:00<00:00, 12.25it/s]


Mycobacterium noviomagense 117


100%|██████████| 1/1 [00:00<00:00,  9.84it/s]


Mycobacterium paraseoulense 0


100%|██████████| 1/1 [00:00<00:00, 10.81it/s]


Mycolicibacterium litorale 3


100%|██████████| 1/1 [00:00<00:00, 10.58it/s]


Mycobacterium florentinum 8


100%|██████████| 1/1 [00:00<00:00, 10.36it/s]


Mycobacterium parmense 0


100%|██████████| 1/1 [00:00<00:00, 11.83it/s]


Mycobacterium heidelbergense 2


100%|██████████| 1/1 [00:00<00:00, 10.15it/s]


Mycolicibacterium phocaicum 2


100%|██████████| 1/1 [00:00<00:00, 10.15it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]


Mycolicibacterium boenickei 29


100%|██████████| 1/1 [00:00<00:00, 12.90it/s]


Mycobacterium lacus 153


100%|██████████| 1/1 [00:00<00:00, 12.17it/s]


Mycolicibacter hiberniae 1


100%|██████████| 1/1 [00:00<00:00, 11.54it/s]
100%|██████████| 1/1 [00:00<00:00, 500.10it/s]


Mycolicibacterium madagascariense 2


100%|██████████| 1/1 [00:00<00:00, 10.36it/s]


Mycolicibacterium confluentis 0


100%|██████████| 1/1 [00:00<00:00, 13.89it/s]


Mycobacterium shinjukuense 105


100%|██████████| 1/1 [00:00<00:00, 10.70it/s]


Mycobacterium conspicuum 12


100%|██████████| 1/1 [00:00<00:00, 10.93it/s]


Mycobacterium shottsii 1


100%|██████████| 1/1 [00:00<00:00, 11.30it/s]
100%|██████████| 1/1 [00:00<00:00, 124.99it/s]


Mycobacterium branderi 0


100%|██████████| 1/1 [00:00<00:00,  8.89it/s]


Mycolicibacterium gadium 22


100%|██████████| 1/1 [00:00<00:00, 10.69it/s]


Mycobacterium saskatchewanense 24


100%|██████████| 1/1 [00:00<00:00, 11.96it/s]


Mycolicibacterium psychrotolerans 0


100%|██████████| 1/1 [00:00<00:00,  9.30it/s]
100%|██████████| 1/1 [00:00<00:00, 249.90it/s]


Mycolicibacterium alvei 8


100%|██████████| 1/1 [00:00<00:00,  7.09it/s]


Mycolicibacterium mageritense 19


100%|██████████| 1/1 [00:00<00:00, 11.17it/s]


Mycobacterium simiae 0


100%|██████████| 1/1 [00:00<00:00, 11.47it/s]


Mycobacterium cookii 0


100%|██████████| 1/1 [00:00<00:00, 13.24it/s]


Mycobacterium doricum 3


100%|██████████| 1/1 [00:00<00:00,  9.84it/s]
100%|██████████| 1/1 [00:00<00:00, 499.92it/s]


Mycolicibacterium poriferae 0


100%|██████████| 1/1 [00:00<00:00, 11.29it/s]


Mycolicibacterium tokaiense 0


100%|██████████| 1/1 [00:00<00:00, 11.10it/s]


Mycolicibacterium moriokaense 67


100%|██████████| 1/1 [00:00<00:00, 11.05it/s]


Mycolicibacterium aichiense 2


100%|██████████| 1/1 [00:00<00:00, 13.07it/s]


Mycobacterium novum 47


100%|██████████| 1/1 [00:00<00:00, 12.42it/s]


Mycolicibacterium duvalii 1


100%|██████████| 1/1 [00:00<00:00, 13.60it/s]


Mycolicibacterium fallax 0


100%|██████████| 1/1 [00:00<00:00, 13.69it/s]


Mycobacteroides salmoniphilum 0


100%|██████████| 1/1 [00:00<00:00, 11.17it/s]


Mycolicibacterium mucogenicum DSM 44124 1


100%|██████████| 1/1 [00:00<00:00, 10.80it/s]
100%|██████████| 1/1 [00:00<00:00, 987.59it/s]


Mycobacterium grossiae 0


100%|██████████| 1/1 [00:00<00:00, 11.42it/s]


Mycolicibacterium parafortuitum 0


100%|██████████| 1/1 [00:00<00:00, 11.30it/s]


Mycolicibacterium pulveris 53


100%|██████████| 1/1 [00:00<00:00, 12.12it/s]


Mycobacterium shigaense 0


100%|██████████| 1/1 [00:00<00:00, 12.66it/s]


[Mycobacterium] stephanolepidis 0


100%|██████████| 1/1 [00:00<00:00, 16.22it/s]


Mycobacterium leprae 0


100%|██████████| 1/1 [00:00<00:00, 11.05it/s]


Mycobacterium pseudoshottsii JCM 15466 0


100%|██████████| 1/1 [00:00<00:00,  9.75it/s]
100%|██████████| 1/1 [00:00<00:00, 332.17it/s]
100%|██████████| 1/1 [00:00<00:00, 500.51it/s]
100%|██████████| 1/1 [00:00<00:00, 987.59it/s]
100%|██████████| 1/1 [00:00<00:00, 1002.22it/s]


Mycobacterium paragordonae 24


100%|██████████| 1/1 [00:00<00:00, 10.78it/s]
100%|██████████| 1/1 [00:00<00:00, 998.17it/s]


Mycobacteroides abscessus 0


100%|██████████| 1/1 [00:00<00:00,  9.39it/s]


Mycobacteroides immunogenum 3


100%|██████████| 1/1 [00:00<00:00, 10.81it/s]


Mycobacteroides chelonae CCUG 47445 0


100%|██████████| 1/1 [00:00<00:00,  8.02it/s]


Mycolicibacterium vaccae 95051 2


100%|██████████| 1/1 [00:00<00:00,  9.22it/s]
100%|██████████| 1/1 [00:00<00:00, 500.16it/s]
100%|██████████| 1/1 [00:00<00:00, 499.86it/s]
100%|██████████| 1/1 [00:00<00:00, 1003.42it/s]
100%|██████████| 1/1 [00:00<00:00, 999.83it/s]


Mycobacterium dioxanotrophicus 2


100%|██████████| 1/1 [00:00<00:00,  9.85it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]


Mycobacterium marseillense 6


100%|██████████| 1/1 [00:00<00:00, 11.84it/s]


Mycolicibacter sinensis 57


100%|██████████| 1/1 [00:00<00:00, 11.11it/s]


Mycobacterium canettii CIPT 140010059 86


100%|██████████| 1/1 [00:00<00:00, 13.22it/s]


Mycobacterium haemophilum DSM 44634 0


100%|██████████| 1/1 [00:00<00:00,  9.57it/s]
100%|██████████| 1/1 [00:00<00:00, 499.62it/s]


Mycobacterium marinum E11 1


100%|██████████| 1/1 [00:00<00:00, 15.74it/s]


Mycobacterium lepromatosis 0


100%|██████████| 1/1 [00:00<00:00,  7.63it/s]


Mycolicibacterium fortuitum 9


100%|██████████| 1/1 [00:00<00:00, 11.75it/s]


Mycobacteroides saopaulense 8


100%|██████████| 1/1 [00:00<00:00, 10.43it/s]


Mycolicibacterium vanbaalenii PYR-1 1


100%|██████████| 1/1 [00:00<00:00, 10.16it/s]
100%|██████████| 1/1 [00:00<00:00, 497.96it/s]


Mycobacterium kansasii ATCC 12478 9


100%|██████████| 1/1 [00:00<00:00, 10.03it/s]
100%|██████████| 1/1 [00:00<00:00, 503.16it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]


Mycolicibacterium gilvum Spyr1 0


In [108]:
# Genome directories that actually contain a 'genomic.gbff' file.
# A comprehension keeps the loop variable local, so the builtin `dir` is not
# overwritten at module level.
mycobacteria_dirs = [
    dirname
    for dirname in util.list_dirs(mycobacteria_seq_dir)
    if os.path.exists(mycobacteria_seq_dir + '/' + dirname + '/genomic.gbff')
]

In [81]:
# Load the precomputed reciprocal-best-hit table (M. tb query vs. target species).
tb_mycobacteria_rbh = pd.read_csv(
    f'{project_dir}/tb_mycobacteria_reciprocal_best_hits.csv'
)

In [104]:
# Number of distinct target species names in the reciprocal-best-hit table;
# the ortholog filter below requires a query to have exactly this many hits.
max_len = len(tb_mycobacteria_rbh['target_species_name'].unique())

In [106]:
# Queries kept for the phylogeny: a hit count equal to the number of target
# species and a minimum identity above 85% across all of their hits.
temp = (
    tb_mycobacteria_rbh
    .groupby('query_ref')
    .agg({'target_ref': "count", 'percent_identical_matches': "min"})
    .reset_index()
)
temp = temp.query('target_ref == @max_len and percent_identical_matches > 85')
full_ortholog_refs = [r['query_ref'] for _, r in temp.iterrows()]

In [110]:
def generate_protein_dataset(num_subsets, subset_num, dir_list, seqdir):
    """Extract names, sequences and CDS data from one chunk of genome directories.

    Args:
        num_subsets: number of chunks ``dir_list`` is divided into.
        subset_num: which chunk this call handles (both are forwarded to
            ``util.chunk_list``; the exact chunking rule lives there).
        dir_list: directory names, each expected to contain 'genomic.gbff'.
        seqdir: parent folder of those directories.

    Returns:
        A 4-tuple ``(all_cds, names, locations, sequences)`` of lists:
          all_cds   - [accession@locus_tag, protein translation]
          names     - [accession.version, organism name]
          locations - [accession@locus_tag, (start, stop, strand)]
          sequences - [accession.version, full nucleotide sequence]
        Only CDS features carrying both a translation and a locus tag are
        included in ``all_cds`` and ``locations``.
    """
    all_cds, names, locations, sequences = [], [], [], []
    for dirname in util.chunk_list(dir_list, num_subsets, subset_num):
        for genome_record in SeqIO.parse(seqdir + '/' + dirname + '/genomic.gbff', "genbank"):
            annotations = genome_record.annotations
            accession_ver = annotations['accessions'][0] + '.' + str(annotations['sequence_version'])
            names.append([accession_ver, annotations['organism']])
            sequences.append([accession_ver, str(genome_record.seq)])
            for feature in genome_record.features:
                if feature.type != 'CDS':
                    continue
                translations = feature.qualifiers.get("translation")
                locus_tags = feature.qualifiers.get("locus_tag")
                if translations is None or locus_tags is None:
                    continue
                accession_locus = accession_ver + '@' + locus_tags[0]
                translation = translations[0]
                location = feature.location
                position = (int(location.start), int(location.end), int(location.strand))
                locations.append([accession_locus, position])
                all_cds.append([accession_locus, translation])

    return (all_cds, names, locations, sequences)

In [115]:
# Parse all genome directories in parallel (one job per chunk) and merge the
# per-chunk lists into lookup dictionaries keyed by accession or accession@locus.
full_build = True
if full_build:
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_protein_dataset)(num_cores, core_number, mycobacteria_dirs, mycobacteria_seq_dir)
        for core_number in core_numbers
    )
    names_dict, locations_dict, sequence_dict, protein_dict = {}, {}, {}, {}
    for cds_chunk, name_chunk, location_chunk, sequence_chunk in parallel_output:
        # Every chunk entry is a [key, value] pair, so it can feed dict.update directly.
        protein_dict.update(cds_chunk)
        names_dict.update(name_chunk)
        locations_dict.update(location_chunk)
        sequence_dict.update(sequence_chunk)


In [None]:
# For every fully conserved ortholog group: write its proteins (one per target
# species) to a temporary FASTA file, align them with MUSCLE, and append each
# aligned row to that species' running concatenation. The concatenated
# alignment is written out once all groups are processed.
concatenated_alignment_dict = {}
for ref in tqdm(full_ortholog_refs):
    temp = tb_mycobacteria_rbh[tb_mycobacteria_rbh['query_ref'] == ref]
    temp_seq = [
        [r['target_species_name'].replace(' ', '_'), protein_dict[r['target_ref']]]
        for _, r in temp.iterrows()
    ]
    util.produce_fasta_file(temp_seq, project_dir + '/temp_seq.fasta')
    cline = MuscleCommandline(
        muscle_exe,
        input=project_dir + '/temp_seq.fasta',
        out=project_dir + '/temp_seq_alignment.fasta',
    )
    result = cline()
    # alignment[0] holds the names and alignment[1] the aligned sequences.
    alignment = util.read_fasta_to_array(project_dir + '/temp_seq_alignment.fasta')
    for name, sequence in zip(alignment[0], alignment[1]):
        if name not in concatenated_alignment_dict:
            concatenated_alignment_dict[name] = sequence
        else:
            concatenated_alignment_dict[name] = concatenated_alignment_dict[name] + sequence
temp = [[k, v] for k, v in concatenated_alignment_dict.items()]
util.produce_fasta_file(temp, project_dir + '/concatenated_alignment.fasta')

54