#### Import packages, set project directories and parameters

In [6]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2
from Comparative_Analysis import Alignment as alignfn
from Bio.Align.Applications import MuscleCommandline
import subprocess
import re 
from itertools import product
import statistics
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# --- File locations ---------------------------------------------------------
# Root folder for files this notebook writes (FASTA exports, MEME/FIMO output).
project_dir = "F:/Project_Data/MITE_Project"
# Genome collection; cells below read <seq_dir>/<dirname>/genomic.gbff.
seq_dir = "F:/Datasets/Other_Genomes/data"
# MUSCLE executable (not referenced by the cells visible in this notebook).
muscle_exe = "C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe"

# --- Target genome ----------------------------------------------------------
target_species = "NZ_CP054795.1"
target_annotation_dirname = "M_smegmatis"

# --- Run parameters ---------------------------------------------------------
min_region_length = 7
full_build = False
full_run = False
# Work is split into `num_cores` chunks; chunk indices are 1-based.
num_cores = 16
core_numbers = [index + 1 for index in range(num_cores)]

In [3]:
# Read only the first record of the target genome's GenBank file and keep its
# sequence as a plain string so it can be scanned with regular expressions.
genome_record = next(
    SeqIO.parse(
        '/'.join((seq_dir, target_annotation_dirname, 'genomic.gbff')),
        "genbank",
    )
)
full_sequence = str(genome_record.seq)

In [4]:
def generate_putative_MITES(num_subsets, subset_num, target_sequence_list, max_gap=150):
    """Score k-mers by how often they (or their reverse complement) recur close together.

    For every sequence in this worker's share of ``target_sequence_list``
    (selected by ``util.chunk_list(target_sequence_list, num_subsets,
    subset_num)``), all start positions of the sequence and of its reverse
    complement are located in the module-level ``full_sequence``. Consecutive
    hits (in genome order, regardless of strand) that start fewer than
    ``max_gap`` bases apart are counted as a "close pair".

    Args:
        num_subsets: Total number of chunks the work is split into.
        subset_num: Index of the chunk this call should process.
        target_sequence_list: Candidate sequences. Each one is used directly
            as a regular-expression pattern.
        max_gap: Exclusive upper bound on the distance between the starts of
            two consecutive hits for them to count as a close pair.

    Returns:
        A list of ``[sequence, n_close_pairs, mean_gap, stdev_gap]`` entries,
        one per sequence that has more than one close pair.

    NOTE: ``re.finditer`` yields non-overlapping matches only, so occurrences
    of a sequence that overlap a previous occurrence of the same pattern are
    not reported.
    """
    target_sequences = util.chunk_list(target_sequence_list, num_subsets, subset_num)
    putative_MITEs = []
    for target_sequence in tqdm(target_sequences):
        rc_sequence = util.reverse_complement(target_sequence)
        # Strand labels are irrelevant for the statistics, so only the start
        # coordinates of forward and reverse-complement hits are collected.
        hit_starts = sorted(
            match.start()
            for pattern in (target_sequence, rc_sequence)
            for match in re.finditer(pattern, full_sequence)
        )
        gaps = [
            current - previous
            for previous, current in zip(hit_starts, hit_starts[1:])
            if current - previous < max_gap
        ]
        # statistics.stdev needs at least two data points.
        if len(gaps) > 1:
            putative_MITEs.append(
                [target_sequence, len(gaps), statistics.mean(gaps), statistics.stdev(gaps)]
            )
    return putative_MITEs

In [5]:
# Every possible 9-mer over the DNA alphabet, in lexicographic order (4**9 strings).
target_sequences = list(map(''.join, product('ACGT', repeat=9)))
len(target_sequences)

262144

In [6]:
# Scan the complete 9-mer list; slice it (e.g. [0:20000]) for a quick trial run.
test = target_sequences
# One task per chunk index; n_jobs=-1 lets joblib use every available core.
parallel_output1 = Parallel(n_jobs=-1)(
    delayed(generate_putative_MITES)(num_cores, core_number, test)
    for core_number in core_numbers
)

In [8]:
# Flatten the per-worker results and keep candidates whose close-pair gaps are
# tightly distributed (stdev < 15), frequent (> 30 pairs) and have a positive
# mean gap; list the most frequent candidates first.
master_list = sorted(
    (
        candidate
        for worker_results in parallel_output1
        for candidate in worker_results
        if candidate[3] < 15 and candidate[1] > 30 and candidate[2] > 0
    ),
    key=lambda candidate: candidate[1],
    reverse=True,
)
master_list

[['GGTGCGCAC', 163, 2.871165644171779, 12.986767833311845],
 ['GTGCGCACC', 163, 2.871165644171779, 12.986767833311845],
 ['GGTGATCAC', 138, 2.028985507246377, 10.952413345349036],
 ['GTGATCACC', 138, 2.028985507246377, 10.952413345349036],
 ['CGACGCGTC', 136, 1.4338235294117647, 5.05920826170401],
 ['GACGCGTCG', 136, 1.4338235294117647, 5.05920826170401],
 ['ATCGCGATC', 124, 2.5, 10.513251413551986],
 ['GATCGCGAT', 124, 2.5, 10.513251413551986],
 ['CCCGCGGGC', 121, 2.5702479338842976, 12.533838495768824],
 ['GCCCGCGGG', 121, 2.5702479338842976, 12.533838495768824],
 ['GGTGGCCAC', 112, 2.6875, 10.291281405780962],
 ['GTGGCCACC', 112, 2.6875, 10.291281405780962],
 ['CGCGCGCGC', 110, 3.8727272727272726, 14.992419546887406],
 ['GCGCGCGCG', 110, 3.8727272727272726, 14.992419546887406],
 ['CTGCGCAGC', 106, 2.2641509433962264, 13.015230555587339],
 ['GCTGCGCAG', 106, 2.2641509433962264, 13.015230555587339],
 ['ATCGCGATG', 103, 2.6601941747572817, 10.72796729650439],
 ['CATCGCGAT', 103, 2.6601

In [None]:
# Show the summary row(s) of one specific k-mer, if it passed the filter.
for candidate in master_list:
    if candidate[0] != 'CGAGCAGAC':
        continue
    print(candidate)

In [None]:
# For one chosen k-mer, print every close hit pair together with the genome
# sequence spanning both hits plus 10 bases of flank on each side, and collect
# those regions in `temp` as [label, sequence] entries.
temp = []
for x in master_list:
    target_sequence = x[0]
    if target_sequence != 'GCGACGATG':
        continue
    print(x)
    print("")
    print("")
    rc_sequence = util.reverse_complement(target_sequence)
    target_hits = [(m.start(), 'T') for m in re.finditer(target_sequence, full_sequence)]
    reverse_hits = [(m.start(), 'R') for m in re.finditer(rc_sequence, full_sequence)]
    all_hits = target_hits + reverse_hits
    all_hits.sort(key=lambda hit: hit[0])
    for (prev_start, prev_seq_type), (start, seq_type) in zip(all_hits, all_hits[1:]):
        if start - prev_start < 150:
            # Clamp to the sequence bounds: a negative slice start would wrap
            # around to the end of the genome and yield the wrong region.
            region_start = max(prev_start - 10, 0)
            # End of the second hit (start + k-mer length) plus the 10-base flank.
            region_end = min(start + len(target_sequence) + 10, len(full_sequence))
            region = full_sequence[region_start:region_end]
            # The label carries the coordinates that were actually extracted.
            temp.append([str(region_start) + '_' + str(region_end), region])
            print((prev_start, prev_seq_type, start, seq_type, start - prev_start), region)

In [None]:
# Write the [label, sequence] entries collected in `temp` to <project_dir>/temp.faa.
util.produce_fasta_file(temp, '/'.join((project_dir, 'temp.faa')))

In [None]:
# Export the first record of the target genome as a one-entry sequence file
# labelled with its versioned accession ("<accession>.<sequence_version>").
# The genome location comes from `seq_dir` / `target_annotation_dirname`, the
# directory settings defined in this notebook's configuration cell.
# NOTE(review): confirm this is the genome FIMO is meant to search.
genome_record = next(SeqIO.parse(seq_dir + '/' + target_annotation_dirname + '/genomic.gbff', "genbank"))
target_sequence = str(genome_record.seq)
accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
util.produce_fasta_file([[accession_ver, target_sequence]], project_dir + '/target_seq')

In [None]:
def run_meme(search_regions_loc, output_dir, min_width, min_sites):
    """Run MEME inside WSL on a sequence file.

    Both paths are converted with ``util.wslname`` before being placed on the
    command line. ``min_width`` is passed as ``-minw`` and ``min_sites`` as
    ``-minsites``; all other MEME options are fixed in the command below.
    The result of ``subprocess.run`` is discarded, so the function returns
    ``None`` and does not check MEME's exit status.
    """
    # NOTE(review): the PATH entry 'usr/bin' has no leading slash - confirm intended.
    wsl_prefix = 'wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; '
    meme_call = ''.join([
        'meme ', util.wslname(search_regions_loc),
        ' -oc ', util.wslname(output_dir),
        ' -dna -evt 0.001 -p 8 -revcomp -mod anr -brief 4000',
        ' -minw ', str(min_width),
        ' -maxw 200',
        ' -minsites ', str(min_sites),
    ])
    subprocess.run(wsl_prefix + meme_call, shell=True)

In [None]:
def run_fimo(motif_file, sequence_to_search_file, output_dir):
    """Run FIMO inside WSL with a motif file against a sequence file.

    All three paths are converted with ``util.wslname``. FIMO is invoked as
    ``fimo -oc <output_dir> <motif_file> <sequence_to_search_file>``. The
    result of ``subprocess.run`` is discarded, so the function returns
    ``None`` and does not check FIMO's exit status.
    """
    # NOTE(review): the PATH entry 'usr/bin' has no leading slash - confirm intended.
    wsl_prefix = 'wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; '
    fimo_call = ''.join([
        'fimo -oc ', util.wslname(output_dir),
        ' ', util.wslname(motif_file),
        ' ', util.wslname(sequence_to_search_file),
    ])
    subprocess.run(wsl_prefix + fimo_call, shell=True)

In [None]:
def produce_matched_sequence_file(hit_df, output_file, q_value):
    """Write the matched sequences of significant hits to ``output_file``.

    Rows of ``hit_df`` whose ``motif_id`` contains ``'#'`` are skipped. Of the
    remaining rows, those with ``q-value`` strictly below ``q_value`` are
    written via ``util.produce_fasta_file`` as ``["<start>_<stop>",
    matched_sequence]`` entries, in row order.
    """
    significant_hits = [
        [str(row['start']) + '_' + str(row['stop']), row['matched_sequence']]
        for _, row in hit_df.iterrows()
        if '#' not in row['motif_id'] and float(row['q-value']) < q_value
    ]
    util.produce_fasta_file(significant_hits, output_file)

In [None]:
# 1. Scan the exported target sequence with the motifs in MEME_Output/meme.txt.
#    NOTE(review): no visible cell creates MEME_Output/meme.txt - it presumably
#    has to exist from an earlier MEME run.
run_fimo(
    '/'.join((project_dir, 'MEME_Output/meme.txt')),
    '/'.join((project_dir, 'target_seq')),
    '/'.join((project_dir, 'FIMO_Output')),
)
# 2. Load FIMO's tab-separated hit table.
fimo_upstream_hits = pd.read_csv('/'.join((project_dir, 'FIMO_Output/fimo.tsv')), sep='\t')
# 3. Keep the hits with q-value < 1e-3 and write their matched sequences out.
produce_matched_sequence_file(
    fimo_upstream_hits,
    '/'.join((project_dir, 'fimo_hit_sequences.faa')),
    q_value=1e-3,
)
# 4. Run MEME again on those matched sequences (min width 3, min sites 5).
run_meme(
    '/'.join((project_dir, 'fimo_hit_sequences.faa')),
    '/'.join((project_dir, 'Second_MEME_Output')),
    3,
    5,
)

In [None]:
def location(accession_ver, start1, end1, annotation_lookup_dictionary):
    """List the annotated regions that overlap the interval ``start1``..``end1``.

    ``annotation_lookup_dictionary[accession_ver]`` must be an iterable of
    ``(locus, product, feature, start, stop, strand)`` entries. An entry
    overlaps when ``start < end1`` and ``stop > start1``.

    Returns:
        One ``[locus, product, feature, overlap, strand]`` list per overlapping
        entry, in lookup order. ``overlap`` is the overlapping length as a
        truncated integer percentage of ``end1 - start1``, formatted like
        ``'50%'``.

    Raises:
        KeyError: if ``accession_ver`` is not in the dictionary.
        ZeroDivisionError: if ``end1 == start1`` and an entry overlaps.
    """
    hits = []
    for locus, product_name, feature, start, stop, strand in annotation_lookup_dictionary[accession_ver]:
        if not (start < end1 and stop > start1):
            continue
        shared_length = min(end1, stop) - max(start1, start)
        percent = int(100 * shared_length / (end1 - start1))
        hits.append([locus, product_name, feature, str(percent) + '%', strand])
    return hits

In [None]:
def annotated_regions_dataset(num_subsets, subset_num, dir_list, seqdir, cds_only = False):
    """Build per-record lists of annotated and inter-feature regions.

    Processes this worker's share of ``dir_list`` (selected by
    ``util.chunk_list(dir_list, num_subsets, subset_num)``). For every record
    in ``<seqdir>/<dirname>/genomic.gbff`` it collects the record's features
    and the gaps between them.

    Args:
        num_subsets: Total number of chunks the directory list is split into.
        subset_num: Index of the chunk this call should process.
        dir_list: Genome directory names under ``seqdir``.
        seqdir: Root directory containing the genome directories.
        cds_only: When true, only ``CDS`` features are kept.

    Returns:
        A list of ``[accession_ver, regions]`` pairs, one per GenBank record.
        ``regions`` is sorted by end coordinate and mixes
        ``(locus_tag, product, feature_type, start, end, strand)`` tuples for
        features with ``[loci, products, 'Inter-feature', start, end, strands]``
        lists for gaps, where loci/products/strands join the flanking
        features' values with ``':'``.

    Region lists are created fresh for every record, so the records of a
    multi-record file (e.g. chromosome plus plasmids) never share or inherit
    each other's regions.
    """
    output = []
    sequence_dirs = util.chunk_list(dir_list, num_subsets, subset_num)
    for dirname in sequence_dirs:
        for record in SeqIO.parse(seqdir + '/' + dirname + '/genomic.gbff', "genbank"):
            accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
            # Per-record state: coordinates are only meaningful within one record.
            annotated_regions = []
            intergenic_regions = []
            for feature in record.features:
                if feature.type in ('source', 'gene'):
                    continue
                feature_start = int(feature.location.start)
                feature_end = int(feature.location.end)
                # Skip empty/inverted spans and implausibly long features (>= 1 Mb).
                if not feature_start < feature_end or feature_end - feature_start >= 1000000:
                    continue
                if cds_only and feature.type != 'CDS':
                    continue
                qualifiers = feature.qualifiers
                product_values = qualifiers.get("product")
                product = product_values[0] if product_values is not None else ''
                locus_values = qualifiers.get("locus_tag")
                # Features without a locus tag are labelled with their type instead.
                locus_tag = locus_values[0] if locus_values is not None else feature.type
                annotated_regions.append(
                    (locus_tag, product, feature.type, feature_start, feature_end, str(feature.location.strand))
                )
            annotated_regions.sort(key=lambda region: region[4])

            # Walk the features in order of end coordinate and emit a gap
            # whenever a feature starts beyond the furthest end seen so far.
            prev_strand = 0
            prev_locus = ''
            prev_product = ''
            max_stop = 0
            for locus, product, feature_type, start, stop, strand in annotated_regions:
                if start > max_stop:
                    intergenic_regions.append(
                        [prev_locus + ':' + locus, prev_product + ':' + product, 'Inter-feature',
                         max_stop, start, str(prev_strand) + ':' + str(strand)]
                    )
                if stop > max_stop:
                    # This feature now defines the left flank of the next gap.
                    prev_locus = locus
                    prev_product = product
                    prev_strand = strand
                max_stop = max(max_stop, stop)

            annotated_regions.extend(intergenic_regions)
            annotated_regions.sort(key=lambda region: region[4])
            output.append([accession_ver, annotated_regions])
    return output

In [None]:
# Keep only the genome directories that actually contain a GenBank file.
# A comprehension is used so no loop variable (formerly `dir`, which shadowed
# the builtin) is left behind in the notebook namespace.
seq_dirs = [
    dirname
    for dirname in util.list_dirs(seq_dir)
    if os.path.exists(seq_dir + '/' + dirname + '/genomic.gbff')
]

In [None]:
# Annotate all genome directories in parallel (one task per chunk index), then
# merge the per-worker [accession, regions] pairs into a single lookup keyed by
# versioned accession. Later entries overwrite earlier ones with the same key.
parallel_output = Parallel(n_jobs=-1)(
    delayed(annotated_regions_dataset)(num_cores, core_number, seq_dirs, seq_dir)
    for core_number in core_numbers
)
annotated_regions_dict = {}
for worker_results in parallel_output:
    annotated_regions_dict.update(
        (entry[0], entry[1]) for entry in worker_results
    )

In [None]:
# Annotate every significant FIMO hit (q-value < 1e-3) with the genome regions
# it overlaps, then print the hits ordered by start coordinate.
temp_list = []
temp_fimo_hits = pd.read_csv(project_dir + '/FIMO_Output/fimo.tsv', sep='\t')
for i, r in temp_fimo_hits.iterrows():
    # Rows whose motif_id contains '#' are skipped.
    if '#' in r['motif_id']:
        continue
    if float(r['q-value']) < 1e-3:
        # Stored under its own name so the `temp` list of FASTA entries built
        # by an earlier cell is not overwritten.
        feature_matches = location(r['sequence_name'], int(r['start']), int(r['stop']), annotated_regions_dict)
        temp_list.append((int(r['start']), r['motif_alt_id'], float(r['q-value']), feature_matches))
temp_list.sort(key=lambda hit: hit[0])
for x in temp_list:
    print(x)