#### Import packages, set directories and parameters

In [84]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import shutil
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2, binom
from Comparative_Analysis import Alignment as alignfn
from Bio.Align.Applications import MuscleCommandline
import subprocess
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None  # default='warn'

In [85]:
# Root folder for everything this notebook writes (FASTA files, FIMO and BLAST output).
project_dir = 'F:/Project_Data/mabR_Project/MTBC'
# Folder holding one sub-directory per genome, each containing a 'sequence.gb' GenBank file.
seq_dir = 'F:/Datasets/Other_Genomes/MTBC'
# MEME output file passed to FIMO as the motif definition.
motif_loc = 'F:/Project_Data/mabR_Project/Second_MEME_Upstream_Output/meme.txt'
# NOTE(review): not referenced by any visible cell; presumably meant to gate the expensive steps — confirm.
full_run = True
# MUSCLE executable handed to MuscleCommandline for the alignments.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

In [91]:
# Read every GenBank record under seq_dir and index it by versioned accession
# (e.g. 'ACCESSION.1'): one dict for the organism name, one for the full sequence.
tbc_seq_dict = {}
tbc_name_dict = {}
tbc_species_dirs = util.list_dirs(seq_dir)
for sdir in tbc_species_dirs:
    for genome_record in SeqIO.parse(f"{seq_dir}/{sdir}/sequence.gb", "genbank"):
        accession_ver = f"{genome_record.annotations['accessions'][0]}.{genome_record.annotations['sequence_version']}"
        full_sequence = str(genome_record.seq)
        tbc_name_dict[accession_ver] = genome_record.annotations['organism']
        tbc_seq_dict[accession_ver] = full_sequence
    

In [95]:
# Sanity check: list each loaded organism with the length of its sequence.
for accession, genome_sequence in tbc_seq_dict.items():
    organism = tbc_name_dict[accession]
    print(organism, len(genome_sequence))

Mycobacterium orygis 4352140
Mycobacterium tuberculosis variant caprae 4324961
Mycobacterium tuberculosis variant bovis AF2122/97 4349904
Mycobacterium tuberculosis H37Rv 4411532
Mycobacterium tuberculosis variant microti 4384561
Mycobacterium tuberculosis variant africanum GM041182 4389314
Mycobacterium canettii CIPT 140010059 4482059


In [96]:
# Write all genomes into a single FASTA file, one record per versioned accession.
util.produce_fasta_file(
    [[accession, genome_sequence] for accession, genome_sequence in tbc_seq_dict.items()],
    project_dir + '/tbc_sequences.faa',
)

100%|██████████| 7/7 [00:00<00:00, 10.88it/s]


In [97]:
# Run the project's FIMO wrapper with the motif file against the combined genome FASTA;
# the next cell reads fimo.tsv from the tbc_FIMO_Hits folder passed here.
blastfn.run_fimo(motif_loc, project_dir + '/tbc_sequences.faa' , project_dir + '/tbc_FIMO_Hits')

In [98]:
# Load the tab-separated FIMO hit table into a DataFrame.
tbc_fimo_hits_df = pd.read_csv(project_dir + '/tbc_FIMO_Hits/fimo.tsv', sep='\t')

In [147]:
# Build a flanked genomic window (motif hit +/- 100 bases) for every significant
# MEME-1 FIMO hit and write the windows to FASTA as queries for the BLAST step.
# Each record is named '<accession>@<start>_<stop>_<strand>'.
temp = []
for i, r in tbc_fimo_hits_df.iterrows():
    # Skip '#' comment rows, hits for motifs other than MEME-1, and hits with q-value above 1e-3.
    if '#' in r['motif_id'] or not(r['motif_alt_id'] == 'MEME-1') or r['q-value'] > 1e-3:
        continue
    # Clamp at 0: a negative start would make the slice wrap around the end of
    # the genome and silently return an empty or unrelated sequence.
    start = max(0, int(r['start']) - 100)
    stop = int(r['stop']) + 100
    # FIMO writes the strand as the text '+' or '-'; comparing against the number 1
    # never matched, so every hit used to be labelled '-'. The numeric 1 is still
    # accepted for backward compatibility.
    strand = '+' if r['strand'] in ('+', 1) else '-'
    # NOTE(review): the window is always sliced from the stored (forward) sequence,
    # whatever the strand label says - confirm this is intended.
    temp.append([r['sequence_name']+'@'+str(start)+'_'+str(stop)+'_'+strand, tbc_seq_dict[r['sequence_name']][start:stop]])

util.produce_fasta_file(temp, project_dir + '/mtbc_fimo_hits.faa')
print(len(temp))

100%|██████████| 1049/1049 [00:00<00:00, 59253.71it/s]

1049





In [148]:
# Rebuild the nucleotide BLAST database from the combined genomes and BLAST the
# FIMO hit windows against it. Gated on the notebook-level `full_run` flag instead
# of an always-true condition, so the expensive step can be skipped on a re-run.
if full_run:
    blastfn.build_blast_db(project_dir, 'tbc_sequences.faa', 'MTBC', project_dir + '/BLAST/MTBC', 'nucl')
    # The query FASTA is copied into the BLAST working folder, where run_blastn is pointed.
    shutil.copy(project_dir + '/mtbc_fimo_hits.faa', project_dir + '/BLAST/MTBC/mtbc_fimo_hits.faa')
    blastfn.run_blastn(project_dir + '/BLAST/MTBC', 'mtbc_fimo_hits.faa', 'MTBC', 'motif_mtbc_blast_hits.csv', e_value = 1e-1)

In [188]:
# Load the BLAST hits, keep the best hit per (query window, target genome) pair and,
# for each one, project the whole query window onto the target genome so that the
# full-length corresponding target region can be extracted.
blast_results_df = pd.read_csv(project_dir + '/BLAST/MTBC/motif_mtbc_blast_hits.csv', header = None)
blast_results_df.columns = ['query_ref', 'target_ref', 'query_length', 'subject_length', 'percent_identical_matches','alignment_length', 'number_mismatches', 'number_of_gap_openings', 
                         'query_start_alignment', 'query_end_alignment', 'target_start_alignment', 'target_end_alignment', 'e_value', 'bit_score']
# Highest bit score wins when a query hits the same target more than once.
blast_results_df = blast_results_df.loc[blast_results_df.groupby(['query_ref','target_ref'])['bit_score'].idxmax()]
for i, r in blast_results_df.iterrows():
    # Query names look like '<accession>@<start>_<stop>_<strand>'; the part before '@' is the genome key.
    target_ref = r['target_ref'].split('@')[0]
    query_ref = r['query_ref'].split('@')[0]
    blast_results_df.at[i,'query_organism'] = tbc_name_dict[query_ref]
    blast_results_df.at[i,'target_organism'] = tbc_name_dict[target_ref]
    query_coords = r['query_ref'].split('@')[1].split('_')
    query_start = int(query_coords[0])
    query_end = int(query_coords[1])
    full_query_sequence = tbc_seq_dict[query_ref][query_start:query_end]
    # Actual length of the query window; this replaces the previously hard-coded 221
    # so the projection stays correct if the motif width or flank size changes.
    query_window_length = len(full_query_sequence)
    target_start_alignment = int(r['target_start_alignment'])
    target_end_alignment = int(r['target_end_alignment'])
    query_start_alignment = int(r['query_start_alignment'])
    query_end_alignment = int(r['query_end_alignment'])
    # Extend the aligned target span by the unaligned query bases on either side.
    # Clamp at 0 so a hit near the start of the target cannot produce a negative
    # (wrap-around) slice index.
    # NOTE(review): assumes plus-strand hits (target start < target end) - confirm how
    # minus-strand BLAST hits should be handled.
    full_target_alignment_start = max(0, target_start_alignment - query_start_alignment)
    full_target_alignment_end = target_end_alignment + query_window_length - query_end_alignment
    blast_results_df.at[i, 'full_target_alignment_start'] =  full_target_alignment_start
    blast_results_df.at[i, 'full_target_alignment_end'] =  full_target_alignment_end
    blast_results_df.at[i, 'full_query_sequence'] =  full_query_sequence
    blast_results_df.at[i, 'full_target_sequence'] =  tbc_seq_dict[target_ref][full_target_alignment_start: full_target_alignment_end]
    blast_results_df.at[i, 'full_target_sequence_ref'] = target_ref + '_'+ str(full_target_alignment_start) + '_' + str(full_target_alignment_end)

In [176]:
# Save the annotated BLAST hit table to CSV in the project folder.
blast_results_df.to_csv(project_dir + '/processed_blast_results.csv')

In [196]:
# Pick one representative query per genomic locus among the hits whose alignment
# stops before query position 217. A query is kept only if none of its target
# regions overlaps a region already claimed by a previously kept query.
temp_df = blast_results_df.query('query_end_alignment < 217')
distinct_clusters = []
regions_considered = []
for _, hit in temp_df.iterrows():
    same_query_hits = temp_df[temp_df['query_ref'] == hit['query_ref']]
    candidate_regions = [
        (row['target_ref'], row['full_target_alignment_start'], row['full_target_alignment_end'])
        for _, row in same_query_hits.iterrows()
    ]
    # Two regions overlap when they are on the same target and their intervals intersect.
    overlaps_known_region = any(
        candidate_target == species and candidate_start < stop and candidate_end > start
        for (candidate_target, candidate_start, candidate_end) in candidate_regions
        for (species, start, stop) in regions_considered
    )
    if overlaps_known_region:
        continue
    distinct_clusters.append(hit['query_ref'])
    regions_considered.extend(candidate_regions)

In [182]:
# For every distinct cluster, write the extended target regions from all genomes
# to a FASTA file and align them with MUSCLE.
# Fixes: the loop iterated over the misspelled name `distint_clusters` (NameError),
# and a hard-coded debug `ref` overwrote the loop variable, so every iteration
# processed the same cluster.
regions_dir = project_dir + '/Regions_of_Interest'
# Make sure the output folder exists before writing into it.
os.makedirs(regions_dir, exist_ok=True)
for ref in distinct_clusters:
    region_of_interest = blast_results_df.query('query_ref == @ref')
    temp = [[r['full_target_sequence_ref'], r['full_target_sequence']] for i, r in region_of_interest.iterrows()]
    util.produce_fasta_file(temp, regions_dir + '/' + ref + '.faa')
    cline = MuscleCommandline(muscle_exe, input= regions_dir + '/' + ref + '.faa', out = regions_dir + '/' + ref + '_aligned.fasta')
    stdout, stderr = cline()

100%|██████████| 7/7 [00:00<?, ?it/s]


In [183]:
# Run MUSCLE on the Regions_of_Interest FASTA for whichever `ref` is currently bound.
# NOTE(review): `ref` is not assigned in this cell, so the call depends on a value left
# over from an earlier cell; the input and output paths follow the same pattern as the
# alignment call inside the per-cluster loop.
cline = MuscleCommandline(muscle_exe, input= project_dir + '/Regions_of_Interest/' + ref + '.faa', out = project_dir + '/Regions_of_Interest/' + ref + '_aligned.fasta')
stdout, stderr = cline()

In [160]:
# Write one H37Rv window and one bovis window to a two-record FASTA file.
# NOTE(review): `tb_sequence` and `bovis_sequence` are not defined in any visible cell - confirm where they are set.
# NOTE(review): the bovis header says 2505723_2505831 but the slice taken is 2505623:2505931
# (100 bases wider on each side) - confirm which coordinates are intended.
util.produce_fasta_file([['H37RV_2522064_2522460',tb_sequence[2522064:2522460]],['AF221297_2505723_2505831',bovis_sequence[2505623:2505931]]], project_dir + '/bovis_tb_difference.faa')

100%|██████████| 2/2 [00:00<?, ?it/s]


In [83]:
# Align the two records in bovis_tb_difference.faa with MUSCLE and write the result to bovis_tb_alignment.fasta.
cline = MuscleCommandline(muscle_exe, input= project_dir + '/bovis_tb_difference.faa', out = project_dir + '/bovis_tb_alignment.fasta')
stdout, stderr = cline()

In [76]:
# Display the bovis window (same slice as written to bovis_tb_difference.faa) for inspection.
bovis_sequence[2505623:2505931]

'CGACGAGAAGATCGACCCGGCGCATACTCGCAGCAAGCTCACCGAGGCGCTGGCGCAGGCTCCGGCACGGCGCGGCCGCCACAAGAACATCCCGCTGTAGTTCTGACCGCGAGCCGCTCCTCGCATGCTCGAACGGTGCCTACCGACGCGCTAACAATTCTCGAGAAGGCCGGCGGGTTCGCCACCACCGCGCAATTGCTCACGGTCATGACCCGCCAACAGCTCGACGTCCAAGTGAAAAACGGCGGCCTCGTTCGCGTTTGGTACGGGGTCTACGCGGCACAAGAGCCGGACCTGTTGGGCCGCTT'

In [165]:
# Count how many retained BLAST hits each query window has; the result has the
# columns 'query_ref' and 'Count'.
counts_df = (
    blast_results_df
    .groupby(['query_ref'])
    .size()
    .reset_index(name='Count')
)