#### Import packages, set directories and parameters

In [120]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2, binom
from Comparative_Analysis import Alignment as alignfn
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
# Directories and run parameters shared by the cells below.
project_dir = 'F:/Project_Data/mabR_Project'  # input spreadsheets/CSVs are read from here and all outputs are written here
mycobacteria_seq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_Complete_Annot_20230511/data'  # one sub-directory per genome, each expected to hold genomic.gbff
tb_species = 'AL123456.3'  # accession.version key used to pick the TB entry out of hit_dict
tb_annotation_dirname = 'GCA_000195955.2'  # sub-directory of mycobacteria_seq_dir holding the TB GenBank file
min_region_length = 7  # minimum length of an upstream/downstream region written to the MEME input FASTA
full_build = False  # NOTE: reassigned to True in the cell that builds protein_dict
num_cores = 16  # number of subsets the genome directory list is chunked into for parallel jobs
core_numbers = list(range(1, num_cores+1))  # subset numbers 1..num_cores passed to the chunked worker functions
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'  # MUSCLE executable used for the ortholog alignments
full_run = False  # when True, the combined all-genome FASTA (all_seq) is rebuilt

In [3]:
# Keep only the genome directories that actually contain a GenBank flat file.
# The iteration variable is called `dirname` so the builtin `dir` is not shadowed.
mycobacteria_dirs = [
    dirname
    for dirname in util.list_dirs(mycobacteria_seq_dir)
    if os.path.exists(mycobacteria_seq_dir + '/' + dirname + '/genomic.gbff')
]

In [4]:
# Load the pre-computed TB-vs-mycobacteria reciprocal-best-hit table. Cells below read its
# query_ref, target_ref, target_species_name and percent_identical_matches columns.
tb_mycobacteria_rbh = pd.read_csv(project_dir + '/tb_mycobacteria_reciprocal_best_hits.csv')

#### Produce reference FASTA files (a) TB and (b) all mycobacteria for searching against motifs

In [39]:
# Write the first record of the TB GenBank file to a single-entry FASTA file
# (project_dir/mtb_seq), labelled with its accession.version.
tb_genbank_path = f"{mycobacteria_seq_dir}/{tb_annotation_dirname}/genomic.gbff"
genome_record = next(SeqIO.parse(tb_genbank_path, "genbank"))
full_sequence = str(genome_record.seq)
tb_annotations = genome_record.annotations
accession_ver = tb_annotations['accessions'][0] + '.' + str(tb_annotations['sequence_version'])
util.produce_fasta_file([[accession_ver, full_sequence]], project_dir + '/mtb_seq')


100%|██████████| 1/1 [00:00<00:00, 11.87it/s]
100%|██████████| 1/1 [00:00<00:00, 12.98it/s]


In [7]:
# Optionally rebuild the combined FASTA (project_dir/all_seq) containing every record of
# every genome directory that has a genomic.gbff file. Each entry is labelled with the
# record's accession.version.
if full_run:
    temp = []
    # `dirname` rather than `dir`, so the builtin is not shadowed.
    for dirname in util.list_dirs(mycobacteria_seq_dir):
        gbff_path = mycobacteria_seq_dir + '/' + dirname + '/genomic.gbff'
        if not os.path.exists(gbff_path):
            continue
        for genome_record in SeqIO.parse(gbff_path, "genbank"):
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
            full_sequence = str(genome_record.seq)
            temp.append([accession_ver, full_sequence])
    util.produce_fasta_file(temp, project_dir + '/all_seq')

#### Functions to locate arbitrary region in organism

In [8]:
def annotated_regions_dataset(num_subsets, subset_num, dir_list, seqdir):
    """Build, for each GenBank record, a list of annotated and inter-feature regions.

    Parameters
    ----------
    num_subsets, subset_num
        Passed to ``util.chunk_list`` to select the part of ``dir_list`` that this
        call processes (used to spread the work over parallel jobs).
    dir_list
        Directory names under ``seqdir``; each must contain ``genomic.gbff``.
    seqdir
        Base directory holding the genome directories.

    Returns
    -------
    list
        One ``[accession_ver, regions]`` entry per GenBank record. ``regions`` is
        sorted by end coordinate and holds
        ``(locus_tag, product, feature_type, start, end, strand)`` entries for
        annotated features plus ``'Inter-feature'`` entries for the gaps between them.
    """
    output = []
    sequence_dirs = util.chunk_list(dir_list, num_subsets, subset_num)
    for dirname in sequence_dirs:
        for record in SeqIO.parse(seqdir + '/' + dirname + '/genomic.gbff', "genbank"):
            # Start from empty lists for every record so that a multi-record file
            # does not mix the coordinates of different sequences into one list.
            annotated_regions = []
            intergenic_regions = []
            accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
            for feature in record.features:
                if feature.type in ['source', 'gene']:
                    continue
                start = int(feature.location.start)
                end = int(feature.location.end)
                # Skip empty/inverted spans and implausibly long (>= 1 Mb) features.
                if not (start < end and (end - start) < 1000000):
                    continue
                qualifiers = feature.qualifiers
                # Default to '' so a feature without a product never inherits the
                # product of the previously processed feature.
                product_values = qualifiers.get("product")
                product = product_values[0] if product_values else ''
                locus_values = qualifiers.get("locus_tag")
                # Features without a locus tag are identified by their feature type.
                locus_tag = locus_values[0] if locus_values else feature.type
                annotated_regions.append((locus_tag, product, feature.type, start, end, str(feature.location.strand)))
            annotated_regions.sort(key=lambda x: x[4])

            # Sweep the features and emit one inter-feature region for every gap
            # between the furthest end seen so far and the next feature start.
            prev_strand = 0
            prev_locus = ''
            prev_product = ''
            max_stop = 0
            for (locus, product, feature_type, start, stop, strand) in annotated_regions:
                if start > max_stop:
                    intergenic_regions.append([prev_locus + ':' + locus, prev_product + ':' + product, 'Inter-feature', max_stop, start, str(prev_strand) + ':' + str(strand)])
                if stop > max_stop:
                    # This feature now defines the left flank of the next gap.
                    prev_locus = locus
                    prev_product = product
                    prev_strand = strand
                max_stop = max(max_stop, stop)
            annotated_regions.extend(intergenic_regions)
            annotated_regions.sort(key=lambda x: x[4])
            output.append([accession_ver, annotated_regions])
    return output

In [9]:
# Extract the regions in parallel (one job per subset of the genome directories) and
# index the per-record region lists by accession.version.
parallel_output = Parallel(n_jobs=-1)(
    delayed(annotated_regions_dataset)(num_cores, core_number, mycobacteria_dirs, mycobacteria_seq_dir)
    for core_number in core_numbers
)
annotated_regions_dict = {
    record_accession: record_regions
    for subset_result in parallel_output
    for record_accession, record_regions in subset_result
}

In [101]:
def location(accession_ver, start1, end1):
    """Return the regions of ``accession_ver`` that overlap the span start1..end1.

    Looks the record up in the module-level ``annotated_regions_dict``. Each match is
    ``[locus, product, feature, overlap, strand]``, where ``overlap`` is the overlapping
    length as an integer percentage of ``end1 - start1``, formatted like ``'100%'``.
    """
    feature_matches = []
    for region in annotated_regions_dict[accession_ver]:
        locus, product, feature, start, stop, strand = region
        if not (start < end1 and stop > start1):
            continue
        overlap_length = min(end1, stop) - max(start1, start)
        percent = int(100 * overlap_length / (end1 - start1))
        feature_matches.append([locus, product, feature, str(percent) + '%', strand])
    return feature_matches

#### Functions to run MEME and FIMO

In [11]:
def run_meme(search_regions_loc, output_dir, min_width, min_sites):
    """Run MEME (through WSL) on a FASTA file of search regions.

    ``search_regions_loc`` and ``output_dir`` are converted with ``util.wslname``
    before being placed on the command line. ``min_width`` and ``min_sites`` are
    passed as ``-minw`` and ``-minsites``; all other MEME options are fixed.
    """
    # NOTE(review): the PATH entry 'usr/bin' has no leading slash - confirm whether
    # '/usr/bin' was intended.
    command_parts = [
        'wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; meme ',
        util.wslname(search_regions_loc),
        ' -oc ',
        util.wslname(output_dir),
        ' -dna -evt 0.01 -revcomp -mod anr -brief 4000 -minw ',
        str(min_width),
        ' -maxw 200 -minsites ',
        str(min_sites),
    ]
    subprocess.run(''.join(command_parts), shell=True)

In [12]:
def run_fimo(motif_file, sequence_to_search_file, output_dir):
    """Run FIMO (through WSL) to search a sequence file for the motifs in ``motif_file``.

    All three paths are converted with ``util.wslname``; results are written to
    ``output_dir`` (passed via ``-oc``).
    """
    # NOTE(review): the PATH entry 'usr/bin' has no leading slash - confirm whether
    # '/usr/bin' was intended.
    command_parts = [
        'wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; fimo -oc ',
        util.wslname(output_dir),
        ' ',
        util.wslname(motif_file),
        ' ',
        util.wslname(sequence_to_search_file),
    ]
    subprocess.run(''.join(command_parts), shell=True)

#### Produce dictionary with upstream and downstream sequences relative to CDS locus id

In [13]:
# Collect [locus_tag, feature_type, start, stop, strand] for every feature of the first
# TB GenBank record, ignoring 'gene' and 'source' features. Features without a
# locus_tag qualifier get an empty locus tag.
tb_upstream_dict = {}
tb_downstream_dict = {}
all_features = []
genome_record = next(SeqIO.parse(f"{mycobacteria_seq_dir}/{tb_annotation_dirname}/genomic.gbff", "genbank"))
full_sequence = str(genome_record.seq)
len_full_sequence = len(full_sequence)
for feature in genome_record.features:
    feature_type = feature.type
    if feature_type == 'gene' or feature_type == 'source':
        continue
    locus_tags = feature.qualifiers.get("locus_tag")
    locus_tag = '' if locus_tags is None else locus_tags[0]
    feature_location = feature.location
    all_features.append([
        locus_tag,
        feature_type,
        int(feature_location.start),
        int(feature_location.end),
        int(feature_location.strand),
    ])

# Four sweeps over all_features fill tb_upstream_dict / tb_downstream_dict for CDS features.
# Each stored string is the gap between the CDS and the nearest other feature on that
# side, plus 3 nt of the CDS itself. Gaps of 100000 nt or more are skipped.

# Positive strand upstream 
# Sweep left to right; max_stop is the furthest feature end seen so far. The stored
# slice ends with the first 3 nt of the CDS (start: start+3).
all_features.sort(key = lambda x: x[2])
max_stop = 0
for (locus, feature_type, start, stop, strand) in all_features:
    if max_stop < start and feature_type == 'CDS' and strand == 1 and start - max_stop < 100000:    #Avoid joins where biopython interprets inconsistently 
        tb_upstream_dict[locus] = full_sequence[max_stop: start+3]
    max_stop = max(max_stop, stop)
# Negative strand upstream
# Sweep right to left; min_start is the smallest feature start seen so far. The slice
# begins with the last 3 genomic nt of the CDS (stop-3: stop), so after reverse
# complementing those 3 nt are at the end of the stored string.
# NOTE(review): min_start is initialised to len(full_sequence)-1 rather than
# len(full_sequence) - confirm whether dropping the final base is intended.
all_features.sort(key = lambda x: x[3], reverse = True)
min_start = len(full_sequence)-1
for (locus, feature_type, start, stop, strand) in all_features:
    if stop < min_start and feature_type == 'CDS' and strand == -1 and min_start - stop < 100000:
        tb_upstream_dict[locus] = util.reverse_complement(full_sequence[stop-3: min_start])
    min_start = min(min_start, start)
    
# Positive strand downstream
# Same right-to-left sweep for strand 1; the stored string begins with the last 3 nt
# of the CDS (stop-3: stop).
all_features.sort(key = lambda x: x[3], reverse = True)
min_start = len(full_sequence)-1
for (locus, feature_type, start, stop, strand) in all_features:
    if stop < min_start and feature_type == 'CDS' and strand == 1 and min_start - stop < 100000:
        tb_downstream_dict[locus] = full_sequence[stop-3: min_start]
    min_start = min(min_start, start)
# Negative strand downstream 
# Same left-to-right sweep for strand -1; the slice ends with the first 3 genomic nt of
# the CDS (start: start+3), which the reverse complement moves to the beginning.
all_features.sort(key = lambda x: x[2])
max_stop = 0
for (locus, feature_type, start, stop, strand) in all_features:
    if max_stop < start and feature_type == 'CDS' and strand == -1 and start - max_stop < 100000:    #Avoid joins where biopython interprets inconsistently 
        tb_downstream_dict[locus] = util.reverse_complement(full_sequence[max_stop: start+3])
    max_stop = max(max_stop, stop)    


#### Import list of significantly up/downregulated genes and produce fasta file of upstream sequences

In [14]:
# Read the differential-expression sheets and build:
#   gene_list            - loci of all significantly down- and up-regulated genes
#   regulation_type_dict - locus -> 'Downregulated' / 'Upregulated'
#   p_val_dict           - locus -> adjusted p-value (999 when it is missing)
regulation_type_dict = {}
p_val_dict = {}
# NOTE(review): the downregulated sheet is read from a different workbook
# ('..._NU_NAs_renamed.xlsx') than the other two sheets - confirm this is intended.
downreg_genes = pd.read_excel(project_dir + '/2022-10-02_data_NU_NAs_renamed.xlsx', sheet_name = 'significant genes downregulated')
upreg_genes = pd.read_excel(project_dir + '/2022-10-02_data.xlsx', sheet_name = 'significant genes upregulated')
all_genes = pd.read_excel(project_dir + '/2022-10-02_data.xlsx', sheet_name = 'all genes')
gene_list = []
for i, r in downreg_genes.iterrows():
    gene_list.append(r['Locus'])
    regulation_type_dict[r['Locus']] = 'Downregulated'
for i, r in upreg_genes.iterrows():
    gene_list.append(r['Locus'])
    regulation_type_dict[r['Locus']] = 'Upregulated'
for i, r in all_genes.iterrows():
    padj = r['padj']
    # pandas converts 'NA' cells to NaN when reading (default na_values), so a plain
    # comparison with the string 'NA' never matches; test for missing values too.
    if pd.isna(padj) or padj == 'NA':
        p_val_dict[r['Locus']] = 999
    else:
        p_val_dict[r['Locus']] = padj

In [15]:
# Write the upstream and downstream regions of the significantly regulated genes to
# FASTA files for MEME. Regions shorter than min_region_length are left out.
significant_upstream_tb_regions = []
significant_downstream_tb_regions = []
for gene in gene_list:
    locus = str(gene)
    if locus in tb_upstream_dict:
        # Upstream strings END with the first 3 nt of the gene itself (its start codon).
        upstream_region = tb_upstream_dict[locus][:-3]
        if len(upstream_region) >= min_region_length:
            significant_upstream_tb_regions.append([locus, upstream_region])
    if locus in tb_downstream_dict:
        # Downstream strings BEGIN with the last 3 nt of the gene itself (its stop
        # codon), so trim the front; trimming the end would keep the codon and drop
        # 3 nt of the actual downstream region.
        downstream_region = tb_downstream_dict[locus][3:]
        if len(downstream_region) >= min_region_length:
            significant_downstream_tb_regions.append([locus, downstream_region])
util.produce_fasta_file(significant_upstream_tb_regions, project_dir + '/significant_upstream_tb_regions.faa')
util.produce_fasta_file(significant_downstream_tb_regions, project_dir + '/significant_downstream_tb_regions.faa')

100%|██████████| 177/177 [00:00<00:00, 177055.05it/s]
100%|██████████| 145/145 [00:00<00:00, 144975.94it/s]


#### Run MEME, search against tb using FIMO

In [16]:
# First MEME pass on the regulated genes' upstream and downstream regions
# (min_width=3, min_sites=5).
run_meme(project_dir + '/significant_upstream_tb_regions.faa',project_dir + '/MEME_Upstream_Output',3,5)
run_meme(project_dir + '/significant_downstream_tb_regions.faa',project_dir + '/MEME_Downstream_Output',3,5)

In [20]:
# Search the TB genome FASTA (project_dir/mtb_seq) for the first-pass MEME motifs.
run_fimo(project_dir + '/MEME_Upstream_Output/meme.txt', project_dir + '/mtb_seq' , project_dir + '/FIMO_Upstream_Output')
run_fimo(project_dir + '/MEME_Downstream_Output/meme.txt', project_dir + '/mtb_seq' , project_dir + '/FIMO_Downstream_Output')

In [18]:
# Load the tab-separated FIMO result tables of the first-pass searches.
fimo_upstream_hits = pd.read_csv(project_dir + '/FIMO_Upstream_Output/fimo.tsv', sep='\t')
fimo_downstream_hits = pd.read_csv(project_dir + '/FIMO_Downstream_Output/fimo.tsv', sep='\t')

In [130]:
# MEME run on a FASTA file taken from a hard-coded path outside project_dir; the result
# goes to project_dir/Temp_MEME_Output.
# NOTE(review): presumably an ad-hoc run on one intergenic region (Rv2248) - confirm the input.
run_meme('F:/Project_Data/Intergenic_Region_Comparative_Analysis/Rv2248/intergenic_regions_1.fasta',project_dir + '/Temp_MEME_Output',3,5)

In [146]:
# Search the combined all-genome FASTA for the Temp_MEME_Output motifs. The all_seq file
# is only (re)written by the cell guarded by full_run, so it must already exist.
run_fimo(project_dir + '/Temp_MEME_Output/meme.txt', project_dir + '/all_seq' , project_dir + '/Temp_FIMO_Output')

In [23]:
def produce_matched_sequence_file(hit_df, output_file):
    """Write the matched sequences of significant FIMO hits to a FASTA file.

    Rows of ``hit_df`` whose ``motif_id`` contains ``'#'`` are ignored. Of the remaining
    rows, those with ``q-value`` below 1e-3 are written via ``util.produce_fasta_file``,
    each labelled ``'<start>_<stop>'``.
    """
    matched_sequences = []
    for _, hit in hit_df.iterrows():
        is_comment_row = '#' in hit['motif_id']
        if is_comment_row or not float(hit['q-value']) < 1e-3:
            continue
        label = str(hit['start']) + '_' + str(hit['stop'])
        matched_sequences.append([label, hit['matched_sequence']])
    util.produce_fasta_file(matched_sequences, output_file)

In [24]:
# Write the significant first-pass FIMO hit sequences to FASTA files for a second MEME pass.
produce_matched_sequence_file(fimo_upstream_hits, project_dir + '/fimo_upstream_hit_sequences.faa')
produce_matched_sequence_file(fimo_downstream_hits, project_dir + '/fimo_downstream_hit_sequences.faa')

100%|██████████| 162/162 [00:00<?, ?it/s]
100%|██████████| 150/150 [00:00<?, ?it/s]


In [25]:
# Second MEME pass, this time on the sequences matched by the first-pass FIMO search.
run_meme(project_dir + '/fimo_upstream_hit_sequences.faa', project_dir + '/Second_MEME_Upstream_Output',3,5)
run_meme(project_dir + '/fimo_downstream_hit_sequences.faa', project_dir + '/Second_MEME_Downstream_Output',3,5)

In [49]:
# Search the combined all-genome FASTA (project_dir/all_seq) for the second-pass motifs.
run_fimo(project_dir + '/Second_MEME_Upstream_Output/meme.txt', project_dir + '/all_seq' , project_dir + '/Second_FIMO_Upstream_Output')
run_fimo(project_dir + '/Second_MEME_Downstream_Output/meme.txt', project_dir + '/all_seq' , project_dir + '/Second_FIMO_Downstream_Output')

#### Final motif and searches

In [147]:
# Build hit_dict: sequence name -> list of
# [start, stop, strand, q-value, main_location, matched_sequence] for every 'MEME-1' hit
# with q-value < 1e-4, sorted by start. main_location is the overlapping region with the
# largest overlap percentage, or '' when no region overlaps the hit.
temp_fimo_hits = pd.read_csv(project_dir + '/Temp_FIMO_Output/fimo.tsv', sep='\t')
hit_dict = {}
for _, hit in temp_fimo_hits.iterrows():
    if '#' in hit['motif_id']:
        continue
    organism = hit['sequence_name']
    motif_id = hit['motif_alt_id']
    if not (float(hit['q-value']) < 1e-4 and motif_id == 'MEME-1'):
        continue
    overlapping_regions = location(hit['sequence_name'], int(hit['start']), int(hit['stop']))
    if len(overlapping_regions) > 0:
        # max() returns the first of several equally large overlaps, like a stable
        # descending sort followed by taking element 0.
        main_location = max(overlapping_regions, key=lambda x: float(x[3].strip('%')))
    else:
        main_location = ''
    info = [
        int(hit['start']),
        int(hit['stop']),
        hit['strand'],
        float(hit['q-value']),
        main_location,
        hit['matched_sequence'],
    ]
    hit_dict.setdefault(organism, []).append(info)

for organism_hits in hit_dict.values():
    organism_hits.sort(key=lambda x: x[0])
  

In [137]:
# Collect the loci named in the main location of every TB hit (inter-feature locations
# name two loci separated by ':') and intersect them with the regulated gene list.
genes = []
v = hit_dict[tb_species]
for (start, stop, strand, q, loc, matched_seq) in v:
    if not loc:
        # Hits without any overlapping region carry '' as location; there is no
        # locus to extract and loc[0] would raise IndexError.
        continue
    genes.extend(loc[0].split(':'))
b = set(genes)
inters = b.intersection(set(gene_list))

In [142]:
# Number of distinct loci at TB hit locations, how many of them are in gene_list, and
# the length of gene_list.
print(len(b), len(inters), len(gene_list))

134 17 236


In [149]:
# NOTE: names_dict is only assigned in a later cell of this file; evaluating it before
# that cell has run raises NameError.
names_dict

NameError: name 'names_dict' is not defined

In [139]:
# Loci found both at TB hit locations (b) and in the regulated gene list.
b.intersection(set(gene_list))

{'Rv0154c',
 'Rv0211',
 'Rv0467',
 'Rv0468',
 'Rv0711',
 'Rv0860',
 'Rv0896',
 'Rv1058',
 'Rv1180',
 'Rv2845c',
 'Rv2846c',
 'Rv3140',
 'Rv3319',
 'Rv3383c',
 'Rv3416',
 'Rv3792',
 'Rv3795'}

In [145]:
# Upper binomial tail for the overlap: len(b) trials, success probability len(gene_list)/3900.
# NOTE(review): 1 - cdf(k) is P(X > k). If the intended p-value is P(X >= k), use
# binom.sf(len(inters) - 1, ...) instead - confirm.
# NOTE(review): 3900 is hard-coded, presumably the approximate number of TB genes - confirm.
1- binom.cdf(len(inters), len(b),(len(gene_list))/3900)

0.0012562765454423586

In [None]:
# Write hit_info to Hit_Counts.csv.
# NOTE: hit_info is only created (as an empty list) in a later cell of this file, and
# the only line that appends to it there is commented out.
pd.DataFrame(hit_info, columns = ['name', 'variable', 'value']).to_csv(project_dir + '/Hit_Counts.csv')

In [153]:
# Dump every sequence name together with its full list of hit records.
for k, v in hit_dict.items():
    print(k, v)

AL123456.3 [[183270, 183291, '+', 4.64e-05, ['Rv0154c:Rv0155', 'Probable acyl-CoA dehydrogenase FadE2:Probable NAD(P) transhydrogenase (subunit alpha) PntAa [first part; catalytic part] (pyridine nucleotide transhydrogenase subunit alpha) (nicotinamide nucleotide transhydrogenase subunit alpha)', 'Inter-feature', '100%', '-1:1'], 'AGGGGGTTTTGCGTCTGCTCGC'], [234447, 234468, '-', 9.96e-07, ['Rv0197', 'Possible oxidoreductase', 'CDS', '100%', '1'], 'GGGCGATTCTGCGTCTGCTCGG'], [253604, 253625, '-', 2.3e-05, ['Rv0211:Rv0212c', 'Probable iron-regulated phosphoenolpyruvate carboxykinase [GTP] PckA (phosphoenolpyruvate carboxylase) (PEPCK)(pep carboxykinase):Possible transcriptional regulatory protein NadR (probably AsnC-family)', 'Inter-feature', '100%', '1:-1'], 'GGGGGCTTATGCGTCTGCTCGC'], [279535, 279556, '-', 2.89e-07, ['Rv0233:Rv0234c', 'Ribonucleoside-diphosphate reductase (beta chain) NrdB (ribonucleotide reductase small chain):Succinate-semialdehyde dehydrogenase [NADP+] dependent (SSDH)

#### Build phylogenetic tree

In [None]:
# Number of distinct target species in the reciprocal-best-hit table.
max_len = len(tb_mycobacteria_rbh['target_species_name'].unique())

In [None]:
# Select the query proteins whose number of reciprocal best hits equals the number of
# distinct target species (max_len) and whose lowest percent identity exceeds 85.
temp = (
    tb_mycobacteria_rbh
    .groupby('query_ref')
    .agg({'target_ref': "count", 'percent_identical_matches': "min"})
    .reset_index()
)
temp = temp[(temp['target_ref'] == max_len) & (temp['percent_identical_matches'] > 85)]
full_ortholog_refs = [row['query_ref'] for _, row in temp.iterrows()]

In [None]:
def generate_protein_dataset(num_subsets, subset_num, dir_list, seqdir):
    """Extract CDS protein and nucleotide data from a subset of the genome directories.

    Parameters
    ----------
    num_subsets, subset_num
        Passed to ``util.chunk_list`` to select the part of ``dir_list`` that this
        call processes (used to spread the work over parallel jobs).
    dir_list
        Directory names under ``seqdir``; each must contain ``genomic.gbff``.
    seqdir
        Base directory holding the genome directories.

    Returns
    -------
    tuple
        ``(all_cds, names, locations, sequences, all_cds_nt, all_cds_200_up_nt)``.
        ``names`` and ``sequences`` hold one ``[accession_ver, value]`` pair per
        record. The other lists hold one ``['<accession_ver>@<locus_tag>', value]``
        pair per CDS that has both a translation and a locus tag: the translation,
        the ``(start, stop, strand)`` tuple, the coding sequence, and up to 200 nt
        upstream of the CDS (the last two given on the coding strand).
    """
    sequence_dirs = util.chunk_list(dir_list, num_subsets, subset_num)
    all_cds = []
    names = []
    sequences = []
    locations = []
    all_cds_nt = []
    all_cds_200_up_nt = []
    for dirname in sequence_dirs:
        for genome_record in SeqIO.parse(seqdir + '/' + dirname + '/genomic.gbff', "genbank"):
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
            names.append([accession_ver, genome_record.annotations['organism']])
            full_sequence = str(genome_record.seq)
            sequences.append([accession_ver, full_sequence])
            for feature in genome_record.features:
                qualifiers = feature.qualifiers
                if feature.type != 'CDS' or qualifiers.get("translation") is None or qualifiers.get("locus_tag") is None:
                    continue
                locus_tag = qualifiers.get("locus_tag")[0]
                accession_locus = accession_ver + '@' + locus_tag
                translation = qualifiers.get("translation")[0]
                (start, stop, strand) = (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
                if strand == 1:
                    nt_sequence = full_sequence[start: stop]
                    # Clamp at 0: for a CDS within 200 nt of the sequence start, a
                    # negative slice start would count from the END of the sequence
                    # and give an empty or wrong upstream region.
                    nt_sequence_200_up = full_sequence[max(0, start - 200): start]
                else:
                    nt_sequence = util.reverse_complement(full_sequence[start:stop])
                    # Slicing past the end of the string is truncated automatically.
                    nt_sequence_200_up = util.reverse_complement(full_sequence[stop:stop+200])
                locations.append([accession_locus, (start, stop, strand)])
                all_cds.append([accession_locus, translation])
                all_cds_nt.append([accession_locus, nt_sequence])
                all_cds_200_up_nt.append([accession_locus, nt_sequence_200_up])
    return (all_cds, names, locations, sequences, all_cds_nt, all_cds_200_up_nt)

In [None]:
# Build protein_dict ('<accession_ver>@<locus_tag>' -> translation) from all genome
# directories in parallel. full_build is forced to True here, overriding the value set
# in the parameter cell.
full_build = True
if full_build:
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_protein_dataset)(num_cores, core_number, mycobacteria_dirs, mycobacteria_seq_dir)
        for core_number in core_numbers
    )
    protein_dict = {
        accession_locus: translation
        for subset_result in parallel_output
        for accession_locus, translation in subset_result[0]
    }

In [None]:
# For every selected ortholog group: write its member proteins to a temporary FASTA file,
# align them with MUSCLE, and append each aligned sequence to the running concatenation
# kept per sequence name. The concatenated alignment is then written out as FASTA.
concatenated_alignment_dict = {}
temp_fasta_path = project_dir +'/temp_seq.fasta'
temp_alignment_path = project_dir +'/temp_seq_alignment.fasta'
for ref in tqdm(full_ortholog_refs):
    temp = tb_mycobacteria_rbh[tb_mycobacteria_rbh['query_ref'] == ref]
    # Sequences are labelled with the species name, spaces replaced by underscores.
    temp_seq = [
        [row['target_species_name'].replace(' ','_'), protein_dict[row['target_ref']]]
        for _, row in temp.iterrows()
    ]
    util.produce_fasta_file(temp_seq, temp_fasta_path)
    cline = MuscleCommandline(muscle_exe, input=temp_fasta_path, out=temp_alignment_path)
    result = cline()
    alignment = util.read_fasta_to_array(temp_alignment_path)
    for name, sequence in zip(alignment[0], alignment[1]):
        if name not in concatenated_alignment_dict:
            concatenated_alignment_dict[name] = sequence
        else:
            concatenated_alignment_dict[name] = concatenated_alignment_dict[name] + sequence
temp = [[name, sequence] for name, sequence in concatenated_alignment_dict.items()]
util.produce_fasta_file(temp, project_dir + '/concatenated_alignment.fasta')

#### Count motif hits per species and output them so they can be merged in R and displayed against the phylogenetic tree

In [151]:
# Map the accession.version of every record in every genome directory to its organism name.
names_dict = {}
for dirname in mycobacteria_dirs:
    gbff_path = mycobacteria_seq_dir + '/' + dirname + '/genomic.gbff'
    for genome_record in SeqIO.parse(gbff_path, "genbank"):
        record_annotations = genome_record.annotations
        accession_ver = record_annotations['accessions'][0] + '.' + str(record_annotations['sequence_version'])
        names_dict[accession_ver] = record_annotations['organism']

In [152]:
# Print the organism name and the number of hits for every sequence in hit_dict.
# NOTE: hit_info stays empty because the only append to it is commented out below.
hit_info = []
for k, v in hit_dict.items():
    print(names_dict[k], len(v))
    
    
    #hit_info.append([organism_name.replace(' ','_'), 'Num_hits', len(temp_fimo_hit_positions)])
    #    print(organism_name.replace(' ','_'), len(temp_fimo_hit_positions))

Mycobacterium tuberculosis H37Rv 116
Mycobacterium canettii CIPT 140010059 118
Mycobacterium ulcerans 48
Mycolicibacterium anyangense 27
Mycobacterium paragordonae 82
Mycobacterium basiliense 45
Mycobacterium heidelbergense 111
Mycolicibacterium vaccae 95051 26
Mycolicibacter hiberniae 40
Mycobacterium heckeshornense 58
Mycolicibacter terrae 55
Mycobacterium pseudoshottsii JCM 15466 46
Mycobacterium marinum E11 56
Mycolicibacterium vanbaalenii PYR-1 29
Mycobacterium shottsii 50
Mycobacterium ostraviense 71
Mycobacterium lacus 164
Candidatus Mycobacterium methanotrophicum 61
Mycobacterium malmoense 117
Mycobacterium marseillense 22
Mycobacterium branderi 13
Mycobacterium kansasii ATCC 12478 66
Mycobacterium intracellulare 5
Mycobacterium dioxanotrophicus 7
Mycobacterium saskatchewanense 28
Mycobacterium florentinum 14
Mycobacterium shinjukuense 104
Mycobacterium rufum 38
Mycolicibacterium confluentis 33
Mycolicibacterium litorale 16
Mycolicibacterium aurum 6
Mycolicibacterium gilvum Spy