#### Import packages, set directories and parameters

In [9]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import shutil
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
import collections
from scipy.stats import chi2, binom
from Comparative_Analysis import Alignment as alignfn
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [24]:
# Root directory under which this notebook writes its FASTA, BLAST, FIMO and alignment files.
project_dir = 'F:/Project_Data/mabR_Project/Orthologues_Mycobacteria'
# Directory scanned for per-species sub-directories; the first file in each is parsed as GenBank.
seq_dir = 'F:/Datasets/Other_Genomes/4_Close_Mycobacteria'
# MEME output file handed to blastfn.run_fimo as the motif definition.
motif_loc = 'F:/Project_Data/mabR_Project/MTBC_Strains/MEME_Strains_Long_Motif2/meme.txt'
# NOTE(review): not read by any cell visible here - presumably a switch for the slow steps; confirm.
full_run = True
# Executable path passed to MuscleCommandline for the multiple alignments.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

In [11]:
def reverse_complement(seq_string):
    """Return the reverse complement of a DNA string.

    Upper-case A, C, G and T are replaced by their complementary base while
    the string is read back to front. Any other character (lower-case
    letters, N, IUPAC ambiguity codes, gaps, ...) is emitted as 'A'.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(pairing.get(base, 'A') for base in reversed(seq_string))

In [12]:
# Parse the first GenBank record of each species directory and collect, keyed by
# 'accession.version': the organism name, the full nucleotide sequence, and
# (per CDS with both a translation and a locus_tag) its coordinates and protein.
tbc_seq_dict = {}
tbc_species_dirs = util.list_dirs(seq_dir)
species_name_dict = {}
all_location_dict = {}
all_translation_dict = {}
for n, sdir in tqdm(enumerate(tbc_species_dirs)):
    species_path = seq_dir + '/' + sdir
    genome_files = util.list_files(species_path)
    if len(genome_files) < 1:
        # Nothing to parse in this directory.
        continue
    fname = genome_files[0]
    genome_record = next(SeqIO.parse(species_path + '/' + fname, "genbank"))
    annotations = genome_record.annotations
    accession_ver = annotations['accessions'][0] + '.' + str(annotations['sequence_version'])
    species = annotations['organism']
    full_sequence = str(genome_record.seq)
    species_name_dict[accession_ver] = species
    tbc_seq_dict[accession_ver] = full_sequence

    translation_dict = {}
    location_dict = {}
    for feature in genome_record.features:
        a = feature.qualifiers
        if feature.type != 'CDS':
            continue
        if a.get("translation") is None or a.get("locus_tag") is None:
            continue
        # Identifier format used throughout the notebook: '<accession.version>@<locus_tag>'.
        accession_locus = accession_ver + '@' + a.get("locus_tag")[0]
        location = feature.location
        location_dict[accession_locus] = (int(location.start), int(location.end), int(location.strand))
        translation_dict[accession_locus] = a.get("translation")[0]
    all_location_dict[accession_ver] = location_dict
    all_translation_dict[accession_ver] = translation_dict

4it [00:02,  1.92it/s]


In [13]:
# Flatten the per-genome translation tables into [identifier, protein] pairs
# and hand them to util.produce_fasta_file as a single protein FASTA.
temp = [
    [accession_locus, protein]
    for prot_dict in all_translation_dict.values()
    for accession_locus, protein in prot_dict.items()
]
util.produce_fasta_file(temp, project_dir + '/protein_sequences.faa')

100%|██████████| 16792/16792 [00:01<00:00, 15327.78it/s]


In [14]:
# Build the protein BLAST database, copy the query FASTA next to it and run the
# all-vs-all BLASTP. The guard is always true; it only exists so the slow step
# can be switched off by hand when re-running the notebook.
if True:
    myco_blast_dir = project_dir + '/BLAST/Myco'
    blastfn.build_blast_db(project_dir, 'protein_sequences.faa', 'Myco', myco_blast_dir, 'prot')
    shutil.copy(project_dir + '/protein_sequences.faa', myco_blast_dir + '/protein_sequences.faa')
    blastfn.run_blastp(myco_blast_dir, 'protein_sequences.faa', 'Myco', 'myco_blast_hits.csv', e_value = 1e-6)

In [15]:
# Load the BLASTP table (called with top_hit_only=True), then reduce it to
# reciprocal best hits by comparing the table against itself.
myco_hits_csv = project_dir + '/BLAST/Myco/myco_blast_hits.csv'
blast_output_df = blastfn.process_blast_output(
    myco_hits_csv,
    species_name_dict,
    top_hit_only = True,
    outfile_loc = 'None',
)
rbh_df = blastfn.keep_reciprocal_best_hits(
    blast_output_df,
    blast_output_df,
    outfile_loc = 'None',
)

In [17]:
# Select reference-genome (AL123456.3) proteins that have exactly 4 reciprocal
# best hits, all of them above 70 percent identity.
temp = (
    rbh_df
    .groupby('query_ref')
    .agg({'target_ref': "count", 'percent_identical_matches': "min"})
    .reset_index()
)
temp_2 = temp[(temp['target_ref'] == 4) & (temp['percent_identical_matches'] > 70)]
full_orthologue_refs = [
    candidate
    for candidate in temp_2['query_ref']
    if candidate.split('@')[0] == 'AL123456.3'
]

In [18]:
# Map every selected reference protein to the list of its reciprocal-best-hit
# targets, in the row order of rbh_df. A reference with no rows gets no entry.
full_orthologue_dict = {}
for ref in full_orthologue_refs:
    matching_hits = rbh_df.query('query_ref == @ref')
    for target in matching_hits['target_ref']:
        full_orthologue_dict.setdefault(ref, []).append(target)
len(full_orthologue_dict)

2480

In [25]:
# Write the reference genome (AL123456.3) as a one-record FASTA and scan it
# with FIMO using the motif file configured in motif_loc.
tb_fasta_path = project_dir + '/tb_sequence.faa'
util.produce_fasta_file([['AL123456.3', tbc_seq_dict['AL123456.3']]], tb_fasta_path)
blastfn.run_fimo(motif_loc, tb_fasta_path, project_dir + '/tb_FIMO_Hits')

100%|██████████| 1/1 [00:00<00:00, 11.73it/s]


In [26]:
# Load the tab-separated FIMO result table written by the previous step.
# Rows whose motif_id contains '#' are filtered out later, when the hits are collected.
tb_fimo_hits_df = pd.read_csv(project_dir + '/tb_FIMO_Hits/fimo.tsv', sep='\t')

In [27]:
# Collect (start, stop, strand) for every MEME-1 hit with q-value <= 1e-3.
# Rows whose motif_id contains '#' are skipped; strand is encoded as +1 / -1.
tb_fimo_hits = []
for i, r in tb_fimo_hits_df.iterrows():
    is_comment_row = '#' in r['motif_id']
    if is_comment_row or r['motif_alt_id'] != 'MEME-1' or r['q-value'] > 1e-3:
        continue
    strand = 1 if r['strand'] == '+' else -1
    tb_fimo_hits.append((int(r['start']), int(r['stop']), strand))

In [28]:
def find_location(x):
    """Return the (start, stop, strand) tuple stored for identifier *x*.

    *x* has the form '<accession.version>@<locus_tag>'; the part before the
    first '@' selects the genome in the module-level all_location_dict.
    Raises KeyError when the genome or the identifier is unknown.
    """
    genome_key = x.partition('@')[0]
    genome_locations = all_location_dict[genome_key]
    return genome_locations[x]

In [29]:
# For every FIMO hit on the reference genome (AL123456.3) that overlaps no
# annotated CDS, extract the matching intergenic region (extended 50 bp into
# each flanking gene) from every genome and align the set with MUSCLE.
#
# A hit is processed only when both flanking reference CDSs (closest stop
# before the hit, closest start after it) are keys of full_orthologue_dict.
reference_locations = all_location_dict['AL123456.3']
regions_dir = project_dir + '/Regions_of_Interest'
# Make sure the output directory exists before any file is written into it.
os.makedirs(regions_dir, exist_ok=True)
for hit in tb_fimo_hits:
    (hit_start, hit_stop, hit_strand) = hit
    max_stop = 0
    min_start = 999999999
    max_stop_loc = ''
    min_start_loc = ''
    overlap = False
    for k, v in reference_locations.items():
        (start, stop, strand) = v
        if stop < hit_stop and stop > max_stop:
            max_stop = stop
            max_stop_loc = k
        if start > hit_start and start < min_start:
            min_start = start
            min_start_loc = k
        if start < hit_stop and stop > hit_start:
            overlap = True
    # Decide once, after the scan has settled on the two flanking genes, so the
    # flag is always defined and never carried over from a previous hit.
    full_orths = max_stop_loc in full_orthologue_dict and min_start_loc in full_orthologue_dict
    if overlap or not full_orths:
        continue

    # Pair the orthologues of the two flanking genes by genome accession rather
    # than by list position: the sequence is sliced from x's genome using y's
    # coordinates, so both must belong to the same genome.
    downstream_by_genome = {y.split('@')[0]: y for y in full_orthologue_dict[min_start_loc]}
    temp = []
    for x in full_orthologue_dict[max_stop_loc]:
        genome = x.split('@')[0]
        y = downstream_by_genome.get(genome)
        if y is None:
            continue
        (x_start, x_stop, _) = find_location(x)
        (y_start, y_stop, _) = find_location(y)
        if x_stop < y_start:
            # Same gene order as in the reference.
            sequence_info = (x_stop - 50, y_start + 50, 1)
        else:
            # Gene order is inverted relative to the reference.
            sequence_info = (y_stop - 50, x_start + 50, -1)
        # Clamp at 0: a negative start would make the slice wrap around the sequence end.
        region = tbc_seq_dict[genome][max(sequence_info[0], 0):sequence_info[1]]
        if sequence_info[2] != hit_strand:
            region = reverse_complement(region)
        temp.append([species_name_dict[genome].replace(' ','_'), region])
    if not temp:
        # No genome carries both flanking orthologues; nothing to align.
        continue

    region_base = regions_dir + '/' + str(hit_start) + '_' + str(hit_stop)
    util.produce_fasta_file(temp, region_base + '.faa')
    cline = MuscleCommandline(muscle_exe, input = region_base + '.faa', out = region_base + '_aligned.fasta')
    stdout, stderr = cline()

100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<00:00, 3952.23it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<00:00, 4169.29it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<00:00, 1996.10it/s]


In [None]:
# Display the accession.version -> organism name mapping built while parsing the genomes.
species_name_dict

In [None]:
# Build a nucleotide BLAST database and search it with the motif-hit regions.
# NOTE(review): 'tbc_sequences.faa' and 'mtbc_fimo_hits.faa' are not written by
# any cell visible here - presumably they come from another notebook; confirm.
# The guard is always true; it only exists so the step can be switched off by hand.
if True:
    mtbc_blast_dir = project_dir + '/BLAST/MTBC'
    blastfn.build_blast_db(project_dir, 'tbc_sequences.faa', 'MTBC', mtbc_blast_dir, 'nucl')
    shutil.copy(project_dir + '/mtbc_fimo_hits.faa', mtbc_blast_dir + '/mtbc_fimo_hits.faa')
    blastfn.run_blastn(mtbc_blast_dir, 'mtbc_fimo_hits.faa', 'MTBC', 'motif_mtbc_blast_hits.csv', e_value = 1e-1)

In [None]:
# Load the raw BLASTN table, keep the best-scoring hit per (query, target
# organism) and project every hit onto a target window that has the full
# length of the query region.
blast_results_df = pd.read_csv(project_dir + '/BLAST/MTBC/motif_mtbc_blast_hits.csv', header = None)

blast_results_df.columns = ['query_ref', 'target_ref', 'query_length', 'subject_length', 'percent_identical_matches','alignment_length', 'number_mismatches', 'number_of_gap_openings', 
                         'query_start_alignment', 'query_end_alignment', 'target_start_alignment', 'target_end_alignment', 'e_value', 'bit_score']
# Query ids look like '<organism>@<start>_<end>_<strand>'; the target id is used
# as the organism as-is. Vectorised instead of a row-by-row .at loop.
blast_results_df['query_organism'] = blast_results_df['query_ref'].str.split('@').str[0]
blast_results_df['target_organism'] = blast_results_df['target_ref']

blast_results_df = blast_results_df.loc[blast_results_df.groupby(['query_ref','target_organism'])['bit_score'].idxmax()]
for i, r in blast_results_df.iterrows():
    target_ref = r['target_ref'].split('@')[0]
    query_ref = r['query_ref'].split('@')[0]
    temp = r['query_ref'].split('@')[1].split('_')
    query_start = int(temp[0])
    query_end = int(temp[1])
    # Length of the query region as encoded in its name (221 for the regions
    # this notebook was written for); replaces the former hard-coded 221.
    query_region_length = query_end - query_start
    target_start_alignment = int(r['target_start_alignment'])
    target_end_alignment = int(r['target_end_alignment'])
    query_start_alignment = int(r['query_start_alignment'])
    query_end_alignment = int(r['query_end_alignment'])

    # BLAST coordinates are 1-based and inclusive; the windows computed below
    # are 0-based, end-exclusive slices of the target sequence.
    if target_start_alignment < target_end_alignment:
        target_sense = 1
        # (sstart - 1) - (qstart - 1): 0-based index aligned with query position 1.
        full_target_alignment_start = target_start_alignment - query_start_alignment
        full_target_alignment_end = full_target_alignment_start + query_region_length
    else:
        target_sense = -1
        # Minus-strand hit: query position 1 aligns with 1-based target position
        # sstart + (qstart - 1), which is also the exclusive 0-based end.
        # (The previous 'sstart + qstart' shifted the window by one base.)
        full_target_alignment_end = target_start_alignment + query_start_alignment - 1
        full_target_alignment_start = full_target_alignment_end - query_region_length

    blast_results_df.at[i, 'full_target_alignment_start'] =  full_target_alignment_start
    blast_results_df.at[i, 'full_target_alignment_end'] =  full_target_alignment_end
    blast_results_df.at[i, 'full_query_sequence'] =  tbc_seq_dict[query_ref][query_start:query_end]
    if target_sense == 1:
        blast_results_df.at[i, 'full_target_sequence'] =  tbc_seq_dict[target_ref][full_target_alignment_start: full_target_alignment_end]
        blast_results_df.at[i, 'full_target_sequence_ref'] = target_ref + '_'+ str(full_target_alignment_start) + '_' + str(full_target_alignment_end)
    else:
        # Report the minus-strand window in query orientation; its reference
        # label lists the coordinates high-to-low to signal the strand.
        blast_results_df.at[i, 'full_target_sequence'] =  reverse_complement(tbc_seq_dict[target_ref][full_target_alignment_start: full_target_alignment_end])
        blast_results_df.at[i, 'full_target_sequence_ref'] = target_ref + '_'+ str(full_target_alignment_end) + '_' + str(full_target_alignment_start)


In [None]:
# Persist the annotated best-hit table (including the extracted sequences) for later inspection.
blast_results_df.to_csv(project_dir + '/processed_blast_results.csv')

In [None]:
# Greedy selection of non-redundant query regions. A query is kept when the
# target window of its first hit overlaps no window already registered for the
# same target; the windows of all hits of a kept query are then registered.
query_refs = list(blast_results_df['query_ref'].unique())
temp_df = blast_results_df
distinct_clusters = []
regions_considered = []
for query_ref in tqdm(query_refs):
    temp_df_2 = temp_df[temp_df['query_ref'] == query_ref]
    temp_df_3 = temp_df_2.head(1)
    # 1 when the representative (first) hit overlaps a registered window, else 0.
    matched = int(any(
        s['target_ref'] == species
        and s['full_target_alignment_start'] < stop
        and s['full_target_alignment_end'] > start
        for j, s in temp_df_3.iterrows()
        for (species, start, stop) in regions_considered
    ))
    if matched == 0:
        distinct_clusters.append(query_ref)
        regions_considered.extend(
            (s['target_ref'], s['full_target_alignment_start'], s['full_target_alignment_end'])
            for j, s in temp_df_2.iterrows()
        )

In [None]:
# Save the list of retained query references so later sessions can reuse it without recomputing.
with open(project_dir + '/distinct_clusters.pkl', 'wb') as f:
    pickle.dump(distinct_clusters, f)    

In [None]:
# For every retained query region, write the target sequences of all its hits
# (labelled with the species name, spaces replaced by '_') to a FASTA file and
# align them with MUSCLE.
for ref in distinct_clusters:
    region_of_interest = blast_results_df.query('query_ref == @ref')
    temp = [
        [species_name_dict[r['target_ref']].replace(' ','_'), str(r['full_target_sequence'])]
        for i, r in region_of_interest.iterrows()
    ]
    cluster_base = project_dir + '/Regions_of_Interest_SName/' + ref
    util.produce_fasta_file(temp, cluster_base + '.faa')
    cline = MuscleCommandline(muscle_exe, input = cluster_base + '.faa', out = cluster_base + '_aligned.fasta')
    stdout, stderr = cline()

In [None]:
# Per retained query region: percentage of hits whose alignment is shorter than
# 200, plus the target window recorded for target 'MTB13' (written out under
# the H37Rv_* column names). Regions with at least one short hit are exported.
# Note: the exported 'alignment_length' column holds the number of hits.
temp = []
for ref in distinct_clusters:
    region_of_interest = blast_results_df.query('query_ref == @ref')
    hit_rows = [r for i, r in region_of_interest.iterrows()]
    ct = len(hit_rows)
    ct_incomplete = sum(1 for r in hit_rows if r['alignment_length'] < 200)
    st = 0
    stop = 0
    for r in hit_rows:
        if r['target_ref'] == 'MTB13':
            st = r['full_target_alignment_start']
            stop = r['full_target_alignment_end']
    pct_incomplete = ct_incomplete/ct * 100
    print(ref, pct_incomplete, len(region_of_interest), st, stop)
    if pct_incomplete != 0:
        temp.append([ref, pct_incomplete, len(region_of_interest), st, stop])

summary_columns = ['reference', 'pct_incomplete', 'alignment_length', 'H37Rv_start', 'H37Rv_stop']
pd.DataFrame(temp, columns = summary_columns).to_csv(project_dir + '/insertion_alignments.csv')

In [None]:
# For one specific query region, flag each target strain with 'Y' when its
# alignment is longer than 130 and 'N' otherwise, then export the table.
insertion_hits = blast_results_df.query('query_ref == "MTB0@3246402_3246623_-"')
temp = [
    [species_name_dict[r['target_organism']], 'Y' if r['alignment_length'] > 130 else 'N']
    for i, r in insertion_hits.iterrows()
]
pd.DataFrame(temp, columns = ['Strain','insertion']).to_csv(project_dir + '/insertions_MTB0@3246402_3246623_-.csv')

In [None]:
# Read the first record of the GCF_000195955.2 GenBank file and keep its sequence as a plain string.
# Note: this rebinds genome_record, which the genome-loading loop also uses.
genome_record = next(SeqIO.parse('F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data/GCF_000195955.2/genomic.gbff', "genbank"))
H37Rv_sequence = str(genome_record.seq)

In [None]:
# Display the reverse complement of the 100-base slice [1468100:1468200] for manual inspection.
reverse_complement(H37Rv_sequence[1468100:1468200])

In [None]:
# Display the reverse complement of the 100-base slice [1469594:1469694] for manual inspection.
reverse_complement(H37Rv_sequence[1469594:1469694])

In [None]:
# List the MTB13-vs-MTB13 hits as (first two '_'-separated name fields, query_ref),
# ordered by the integer value of the second field, and display the result.
mtb13_self_hits = blast_results_df.query('query_organism =="MTB13" and target_organism == "MTB13"')
temp = sorted(
    ((name.split('_')[0:2], name) for name in mtb13_self_hits['query_ref']),
    key = lambda x:int(x[0][1]),
)
temp

In [None]:
# Display the hit of one specific MTB13 query region against target organism MTB0.
blast_results_df.query('query_ref =="MTB13@1468039_1468260_-" and target_organism == "MTB0"')