#### Import packages, set directories and parameters

In [2]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import shutil
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
import collections
from scipy.stats import chi2, binom
from Comparative_Analysis import Alignment as alignfn
from Bio.Align.Applications import MuscleCommandline
import subprocess
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Working directory of this analysis: the cells below read and write their files under it.
project_dir = 'F:/Project_Data/mabR_Project/MTBC_Strains'
# Directory that is scanned for one sub-directory of FASTA files per strain.
seq_dir = 'F:/Datasets/Other_Genomes/MTBC_Strains'
# MEME output file handed to FIMO as the motif definition.
motif_loc = 'F:/Project_Data/mabR_Project/Second_MEME_Upstream_Output/meme.txt'
# NOTE(review): not referenced by any visible cell - presumably a switch for the expensive steps; confirm.
full_run = True
# Path of the MUSCLE executable used to align the extracted regions.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

In [4]:
def reverse_complement(seq_string):
    """Return the reverse complement of a DNA string.

    Only upper-case ``A``, ``C``, ``G`` and ``T`` are complemented. Every
    other character (lower-case bases, ``N``, gaps, ...) is emitted as
    ``'A'``.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(pairing.get(base, 'A') for base in reversed(seq_string))

In [None]:
# Load the first sequence of the first file found in every strain directory.
# Strains are keyed by a synthetic id 'MTB<n>', where n is the directory's
# position in the listing (skipped strains leave gaps in the numbering).
tbc_seq_dict = {}
tbc_name_dict = {}
tbc_species_dirs = util.list_dirs(seq_dir)
species_name_dict = {}
accession_dict = {}
for n, sdir in tqdm(enumerate(tbc_species_dirs)):
    strain_dir = f'{seq_dir}/{sdir}'
    fname = util.list_files(strain_dir)[0]
    fasta_file = util.read_fasta_to_array(f'{strain_dir}/{fname}')
    first_sequence = fasta_file[1][0]
    # Anything shorter than 4 Mb is reported and left out of the analysis.
    if len(first_sequence) < 4000000:
        print(fname)
        continue
    strain_key = f'MTB{n}'
    species_name_dict[strain_key] = fasta_file[0][0]
    tbc_seq_dict[strain_key] = first_sequence
    # The accession is the file name without its last 11 characters.
    accession_dict[strain_key] = fname[:-11]

In [None]:
# Derive a short label per strain from its FASTA header: commas are removed,
# the boilerplate words below are dropped, and the remaining words are joined
# with underscores.
species_sname_dict = {}
boilerplate_words = {'Mycobacterium', 'tuberculosis', 'complete', 'sequence', 'chromosome', 'genome'}
for strain_key, header in species_name_dict.items():
    kept_words = [word for word in header.replace(',', '').split(' ') if word not in boilerplate_words]
    species_sname_dict[strain_key] = '_'.join(kept_words)

In [None]:
# Write every retained genome to a single FASTA file as [strain id, sequence] records.
util.produce_fasta_file(list(map(list, tbc_seq_dict.items())), f'{project_dir}/tbc_sequences.faa')

In [None]:
# Scan the combined genome FASTA with the MEME motif file; results go to the tbc_FIMO_Hits folder.
blastfn.run_fimo(motif_loc, f'{project_dir}/tbc_sequences.faa', f'{project_dir}/tbc_FIMO_Hits')

In [None]:
# Load FIMO's tab-separated hit table.
tbc_fimo_hits_df = pd.read_csv(f'{project_dir}/tbc_FIMO_Hits/fimo.tsv', sep='\t')

In [None]:
# Cut out every significant MEME-1 hit together with 100 bp of flanking
# sequence on each side and write the windows to a FASTA file. Record names
# have the form '<strain id>@<start>_<stop>_<strand>'.
temp = []
# tb_hits was appended to without ever being created, which raised a
# NameError on a fresh run; it is now initialised alongside temp.
tb_hits = []
for i, r in tbc_fimo_hits_df.iterrows():
    # Skip the '#' comment rows of the FIMO table, hits of other motifs and
    # hits that do not reach q <= 1e-3.
    if '#' in r['motif_id'] or not(r['motif_alt_id'] == 'MEME-1') or r['q-value'] > 1e-3:
        continue
    # Clamp at 0: a negative slice start would be counted from the END of the
    # genome and silently return the wrong (usually empty) window.
    start = max(int(r['start']) - 100, 0)
    stop = int(r['stop']) + 100
    # FIMO reports the strand as '+' / '-'; the previous comparison against
    # the integer 1 never matched, so every hit was labelled '-'. The numeric
    # form is still accepted for backward compatibility.
    strand = '+' if r['strand'] in ('+', 1) else '-'
    hit_name = f"{r['sequence_name']}@{start}_{stop}_{strand}"
    hit_sequence = tbc_seq_dict[r['sequence_name']][start:stop]
    temp.append([hit_name, hit_sequence])
    tb_hits.append([hit_name, hit_sequence])
util.produce_fasta_file(temp, project_dir + '/mtbc_fimo_hits.faa')
print(len(temp))

In [None]:
# BLAST step, deliberately switched off (the condition is always false):
# build a nucleotide database from the genomes and BLAST the motif windows
# against it.
if False:
    blast_dir = f'{project_dir}/BLAST/MTBC'
    blastfn.build_blast_db(project_dir, 'tbc_sequences.faa', 'MTBC', blast_dir, 'nucl')
    shutil.copy(f'{project_dir}/mtbc_fimo_hits.faa', f'{blast_dir}/mtbc_fimo_hits.faa')
    blastfn.run_blastn(blast_dir, 'mtbc_fimo_hits.faa', 'MTBC', 'motif_mtbc_blast_hits.csv', e_value = 1e-1)

In [None]:
# Post-process the BLAST hit table: keep the best hit per (query window,
# target genome) and extract the target sequence corresponding to the whole
# query window, not just the aligned part.
# The CSV has no header row; the 14 column names are assigned below.
blast_results_df = pd.read_csv(project_dir + '/BLAST/MTBC/motif_mtbc_blast_hits.csv', header = None)

blast_results_df.columns = ['query_ref', 'target_ref', 'query_length', 'subject_length', 'percent_identical_matches','alignment_length', 'number_mismatches', 'number_of_gap_openings', 
                         'query_start_alignment', 'query_end_alignment', 'target_start_alignment', 'target_end_alignment', 'e_value', 'bit_score']
# query_organism is the part of query_ref before '@'; target_organism is a copy of target_ref.
for i, r in blast_results_df.iterrows():
    blast_results_df.at[i, 'query_organism'] = r['query_ref'].split('@')[0]
    blast_results_df.at[i, 'target_organism'] = r['target_ref']

# For every (query_ref, target_organism) pair keep only the row with the highest bit score.
blast_results_df = blast_results_df.loc[blast_results_df.groupby(['query_ref','target_organism'])['bit_score'].idxmax()]
for i, r in blast_results_df.iterrows():
    target_ref = r['target_ref'].split('@')[0]
    query_ref = r['query_ref'].split('@')[0]
    # query_ref looks like '<id>@<start>_<stop>_<strand>'; only start and stop are used here.
    temp = r['query_ref'].split('@')[1].split('_')
    query_start = int(temp[0])
    query_end = int(temp[1])
    target_start_alignment = int(r['target_start_alignment'])
    target_end_alignment = int(r['target_end_alignment'])
    query_start_alignment = int(r['query_start_alignment'])
    query_end_alignment = int(r['query_end_alignment'])
    
    # Target start < end is treated as a forward-strand match, otherwise as a
    # reverse-strand match. In both cases the aligned target position is
    # shifted by the query offset so the window covers the whole query.
    # NOTE(review): 221 is hard-coded - presumably the query window length
    # (motif plus the two 100 bp flanks); confirm against the motif width.
    # NOTE(review): BLAST coordinates are presumably 1-based while the slices
    # below are 0-based; verify there is no off-by-one.
    if target_start_alignment < target_end_alignment:
        target_sense = 1
        full_target_alignment_start = target_start_alignment - query_start_alignment 
        full_target_alignment_end = full_target_alignment_start + 221
    else:
        target_sense = -1
        full_target_alignment_end = target_start_alignment + query_start_alignment 
        full_target_alignment_start = full_target_alignment_end - 221
    
    
    blast_results_df.at[i, 'full_target_alignment_start'] =  full_target_alignment_start
    blast_results_df.at[i, 'full_target_alignment_end'] =  full_target_alignment_end
    blast_results_df.at[i, 'full_query_sequence'] =  tbc_seq_dict[query_ref][query_start:query_end]
    # NOTE(review): if full_target_alignment_start is negative (hit near the
    # start of the genome) the slices below count from the end of the sequence.
    if target_sense == 1:
        blast_results_df.at[i, 'full_target_sequence'] =  tbc_seq_dict[target_ref][full_target_alignment_start: full_target_alignment_end]
        blast_results_df.at[i, 'full_target_sequence_ref'] = target_ref + '_'+ str(full_target_alignment_start) + '_' + str(full_target_alignment_end)
    else:
        # Reverse-strand match: store the reverse complement, and write the
        # label with end before start.
        blast_results_df.at[i, 'full_target_sequence'] =  reverse_complement(tbc_seq_dict[target_ref][full_target_alignment_start: full_target_alignment_end])
        blast_results_df.at[i, 'full_target_sequence_ref'] = target_ref + '_'+ str(full_target_alignment_end) + '_' + str(full_target_alignment_start)


In [None]:
# Save the annotated best-hit table (index included) for later inspection.
blast_results_df.to_csv(f'{project_dir}/processed_blast_results.csv')

In [None]:
# Greedy de-duplication of query windows. A query becomes a new cluster
# representative unless its FIRST hit row overlaps a target region already
# claimed by an earlier representative; a new representative then claims the
# target regions of all its hits.
query_refs = list(blast_results_df['query_ref'].unique())
temp_df = blast_results_df
distinct_clusters = []
regions_considered = []
for query_ref in tqdm(query_refs):
    query_hits = temp_df[temp_df['query_ref'] == query_ref]
    already_covered = any(
        first_hit['target_ref'] == species
        and first_hit['full_target_alignment_start'] < stop
        and first_hit['full_target_alignment_end'] > start
        for _, first_hit in query_hits.head(1).iterrows()
        for (species, start, stop) in regions_considered
    )
    if already_covered:
        continue
    distinct_clusters.append(query_ref)
    regions_considered.extend(
        (hit['target_ref'], hit['full_target_alignment_start'], hit['full_target_alignment_end'])
        for _, hit in query_hits.iterrows()
    )

In [None]:
# Persist the list of cluster representatives.
with open(f'{project_dir}/distinct_clusters.pkl', 'wb') as pickle_handle:
    pickle.dump(distinct_clusters, pickle_handle)

In [None]:
# For every cluster representative, write the full target windows of its hits
# (named by short strain name) to a FASTA file and align them with MUSCLE.
for ref in distinct_clusters:
    region_of_interest = blast_results_df.query('query_ref == @ref')
    temp = [
        [species_sname_dict[r['target_ref']], r['full_target_sequence']]
        for i, r in region_of_interest.iterrows()
    ]
    region_prefix = f'{project_dir}/Regions_of_Interest_SName/{ref}'
    util.produce_fasta_file(temp, f'{region_prefix}.faa')
    cline = MuscleCommandline(muscle_exe, input=f'{region_prefix}.faa', out=f'{region_prefix}_aligned.fasta')
    stdout, stderr = cline()

In [None]:
# Report, per cluster representative, the percentage of hits whose BLAST
# alignment is shorter than 200, together with the window coordinates of the
# hit in strain 'MTB13'. Clusters with at least one short alignment are
# written to insertion_alignments.csv.
temp = []
for ref in distinct_clusters:
    region_of_interest = blast_results_df.query('query_ref == @ref')
    n_hits = 0
    n_short = 0
    st, stop = 0, 0
    for _, hit in region_of_interest.iterrows():
        n_hits += 1
        if hit['alignment_length'] < 200:
            n_short += 1
        if hit['target_ref'] == 'MTB13':
            st, stop = hit['full_target_alignment_start'], hit['full_target_alignment_end']
    pct_incomplete = n_short/n_hits * 100
    print(ref, pct_incomplete, len(region_of_interest), st, stop)
    if pct_incomplete != 0:
        temp.append([ref, pct_incomplete, len(region_of_interest), st, stop])

insertion_df = pd.DataFrame(temp, columns = ['reference', 'pct_incomplete', 'alignment_length', 'H37Rv_start', 'H37Rv_stop'])
insertion_df.to_csv(f'{project_dir}/insertion_alignments.csv')

In [5]:
# Read the first record of the GenBank file and keep its sequence as a plain string.
genbank_records = SeqIO.parse('F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data/GCF_000195955.2/genomic.gbff', "genbank")
genome_record = next(genbank_records)
H37Rv_sequence = str(genome_record.seq)

In [6]:
# Display the reverse complement of the 100 bases in the 0-based slice [1468100, 1468200).
reverse_complement(H37Rv_sequence[1468100:1468200])

'CGACACAACCACCCACAGATCAGTCAGTAGAGCCCGAAATGGGGGCTTTTGCGTCTGCTGACCAGTGCGTCACATACTACGGCGGCGCGCGCGCGGCGAC'

In [None]:
# Display the reverse complement of the 100 bases in the 0-based slice [1469594, 1469694).
reverse_complement(H37Rv_sequence[1469594:1469694])