#### Import packages, set directories and parameters

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2
from Comparative_Analysis import Alignment as alignfn
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Notebook-wide configuration.
# Root directory that receives every pickle, FASTA, BLAST and alignment output below.
project_dir = 'F:/Project_Data/mabR_Project'
# Reference-set roots; each sub-directory is expected to hold a 'genomic.gbff' file.
mycobacteria_seq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_Complete_Annot_20230511/data'
actinomycetes_seq_dir = 'F:/Datasets/NCBI_Refseq_Actinomycetes_Complete_Annot_20230511/data'
# accession.version prefix used to build TB locus keys of the form '<accession.version>@<locus_tag>'.
tb_species = 'AL123456.3' 
# Sub-directory whose CDS translations are collected as the TB protein set.
tb_annotation_dirname = 'GCA_000195955.2'
# Number of chunks the directory lists are split into for the parallel helpers.
# The joblib calls below use n_jobs=-1, so this is a chunk count, not a worker count.
num_cores = 16
# 1-based chunk indices passed to the helpers as subset_num.
core_numbers = list(range(1, num_cores+1))
# MUSCLE executable handed to MuscleCommandline.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'
# True: regenerate datasets and BLAST results; False: load previously saved results.
full_build = False

In [3]:
def reverse_complement(seq_string):
    """Return the reverse complement of an upper-case DNA string.

    All IUPAC nucleotide codes are supported. Ambiguity codes are mapped to
    their true complements (e.g. R <-> Y, K <-> M, S -> S, N -> N) rather than
    being collapsed to a concrete base, so no artificial bases are introduced
    into minus-strand sequences.

    Raises:
        KeyError: if ``seq_string`` contains a character that is not an
            upper-case IUPAC nucleotide code (e.g. lower case or gaps).
    """
    complement_dict = {
        'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
        'R': 'Y', 'Y': 'R', 'S': 'S', 'W': 'W', 'K': 'M', 'M': 'K',
        'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D', 'N': 'N',
    }
    return ''.join(complement_dict[base] for base in reversed(seq_string))

#### Identify directories for which no genomic file was downloaded from NCBI

In [4]:
def non_empty_directories(num_subsets, subset_num, dir_list, seqdir):
    """Return the directories of one chunk that contain a 'genomic.gbff' file.

    The chunk examined is ``util.chunk_list(dir_list, num_subsets, subset_num)``;
    each name in it is kept only if ``seqdir/<name>/genomic.gbff`` exists.
    The order of the chunk is preserved.
    """
    chunk = util.chunk_list(dir_list, num_subsets, subset_num)
    return [
        candidate
        for candidate in chunk
        if os.path.exists(seqdir + '/' + candidate + '/genomic.gbff')
    ]

In [5]:
# Keep only the assembly directories that actually contain a GenBank file.
# Each parallel job checks one chunk; the per-chunk results are flattened in order.
sequence_dirs = util.list_dirs(mycobacteria_seq_dir)
parallel_output = Parallel(n_jobs=-1)(
    delayed(non_empty_directories)(num_cores, core_number, sequence_dirs, mycobacteria_seq_dir)
    for core_number in core_numbers
)
mycobacteria_dirs = [dirname for chunk_result in parallel_output for dirname in chunk_result]

sequence_dirs = util.list_dirs(actinomycetes_seq_dir)
parallel_output = Parallel(n_jobs=-1)(
    delayed(non_empty_directories)(num_cores, core_number, sequence_dirs, actinomycetes_seq_dir)
    for core_number in core_numbers
)
actinomycetes_dirs = [dirname for chunk_result in parallel_output for dirname in chunk_result]

In [None]:
# Persist both directory lists so later sessions can skip the file-system scan.
for pickle_name, dir_names in (
    ('/mycobacteria_dirs.pkl', mycobacteria_dirs),
    ('/actinomycetes_dirs.pkl', actinomycetes_dirs),
):
    with open(project_dir + pickle_name, 'wb') as f:
        pickle.dump(dir_names, f)

In [None]:
# Report how many usable directories each reference set contains.
print(len(mycobacteria_dirs),len(actinomycetes_dirs))

#### Create files with all CDS for both mycobacteria and actinobacteria reference sets and create BLAST databases for TB CDS and both reference sets (to do reciprocal best hits)

In [None]:
def generate_protein_dataset(num_subsets, subset_num, dir_list, seqdir): 
    """Extract protein, name, location and sequence data for one chunk of assemblies.

    Every GenBank record in ``seqdir/<dirname>/genomic.gbff`` is parsed for each
    directory of the chunk ``util.chunk_list(dir_list, num_subsets, subset_num)``.
    Only CDS features carrying both a ``translation`` and a ``locus_tag``
    qualifier are used; they are keyed as ``'<accession.version>@<locus_tag>'``.

    Returns:
        A 5-tuple ``(all_cds, all_tb_cds, names, locations, sequences)`` of lists:
        ``[key, translation]`` for every CDS, the same for CDS found in the
        directory named ``tb_annotation_dirname``, ``[accession.version, organism]``
        per record, ``[key, (start, stop, strand)]`` per CDS and
        ``[accession.version, full nucleotide sequence]`` per record.
    """
    all_cds, all_tb_cds = [], []
    names, sequences, locations = [], [], []
    for dirname in util.chunk_list(dir_list, num_subsets, subset_num):
        is_tb_annotation = dirname == tb_annotation_dirname
        gbff_path = seqdir + '/' + dirname + '/genomic.gbff'
        for genome_record in SeqIO.parse(gbff_path, "genbank"):
            annotations = genome_record.annotations
            accession_ver = annotations['accessions'][0] + '.' + str(annotations['sequence_version'])
            names.append([accession_ver, annotations['organism']])
            sequences.append([accession_ver, str(genome_record.seq)])
            for feature in genome_record.features:
                if feature.type != 'CDS':
                    continue
                qualifiers = feature.qualifiers
                translations = qualifiers.get("translation")
                locus_tags = qualifiers.get("locus_tag")
                if translations is None or locus_tags is None:
                    continue
                accession_locus = accession_ver + '@' + locus_tags[0]
                protein = translations[0]
                location = feature.location
                coordinates = (int(location.start), int(location.end), int(location.strand))
                locations.append([accession_locus, coordinates])
                all_cds.append([accession_locus, protein])
                if is_tb_annotation:
                    all_tb_cds.append([accession_locus, protein])
    return (all_cds, all_tb_cds, names, locations, sequences)

In [None]:
def generate_upstream_sequence_dataset(num_subsets, subset_num, dir_list, seqdir): 
    """Collect the region upstream of every CDS for one chunk of assemblies.

    For each GenBank record in ``seqdir/<dirname>/genomic.gbff`` (directories taken
    from ``util.chunk_list(dir_list, num_subsets, subset_num)``), all features other
    than 'gene' and 'source' are scanned in coordinate order. The upstream region of
    a CDS is the gap between it and the nearest preceding feature, in the CDS's own
    orientation. Gaps of 100000 bases or more are skipped.

    Returns:
        A list of ``[accession_locus, region_start, region_end, strand, sequence]``.
        ``accession_locus`` is ``'<accession.version>@<locus_tag>'`` (or ``''`` when the
        feature has no locus_tag). ``sequence`` is the upstream region followed by the
        first three bases of the CDS, read 5'->3' on the CDS strand (minus-strand
        regions are reverse complemented).
    """
    sequence_dirs = util.chunk_list(dir_list, num_subsets, subset_num)
    upstream_cds_regions = []
    for dirname in sequence_dirs:
        for genome_record in SeqIO.parse(seqdir + '/' + dirname + '/genomic.gbff', "genbank"):
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
            full_sequence = str(genome_record.seq)
            # Features must be collected per record: their coordinates are only
            # meaningful against the sequence of the record they came from.
            # Carrying them over to the next record of a multi-record assembly
            # would emit wrong regions under the earlier record's locus keys.
            all_features = []
            for feature in genome_record.features:
                if feature.type in ['gene', 'source']:
                    continue
                locus_tags = feature.qualifiers.get("locus_tag")
                if locus_tags is not None:
                    accession_locus = accession_ver + '@' + locus_tags[0]
                else:
                    accession_locus = ''
                (start, stop, strand) = (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
                all_features.append([accession_locus, feature.type, start, stop, strand])
            # Positive strand upstream: walk left-to-right, tracking the furthest
            # end seen so far so that overlapping features leave no gap.
            all_features.sort(key = lambda x: x[2])
            max_stop = 0
            for (accession_locus, feature_type, start, stop, strand) in all_features:
                if max_stop < start and feature_type == 'CDS' and strand == 1 and start - max_stop < 100000:    #Avoid joins where biopython interprets inconsistently 
                    upstream_cds_regions.append([accession_locus, max_stop, start, strand, full_sequence[max_stop: start+3]])
                max_stop = max(max_stop, stop)
            # Negative strand upstream: walk right-to-left, tracking the smallest
            # start seen so far. The initial bound is the exclusive end of the
            # sequence so that the final base is not dropped.
            all_features.sort(key = lambda x: x[3], reverse = True)
            min_start = len(full_sequence)
            for (accession_locus, feature_type, start, stop, strand) in all_features:
                if stop < min_start and feature_type == 'CDS' and strand == -1 and min_start - stop < 100000:
                    upstream_cds_regions.append([accession_locus, stop, min_start, strand, reverse_complement(full_sequence[stop-3: min_start])])
                min_start = min(min_start, start)
    return (upstream_cds_regions)

In [None]:
if full_build == True:
    # Extract the mycobacterial protein dataset in parallel (one job per chunk)
    # and merge the per-chunk results into flat lists / lookup dictionaries.
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_protein_dataset)(num_cores, core_number, mycobacteria_dirs, mycobacteria_seq_dir)
        for core_number in core_numbers
    )
    all_cds_1, all_tb_cds_1 = [], []
    names_dict_1, locations_dict_1, sequence_dict_1 = {}, {}, {}
    for chunk_cds, chunk_tb_cds, chunk_names, chunk_locations, chunk_sequences in parallel_output:
        all_cds_1.extend(chunk_cds)
        all_tb_cds_1.extend(chunk_tb_cds)
        # Each chunk list holds [key, value] pairs, which dict.update accepts directly.
        names_dict_1.update(chunk_names)
        locations_dict_1.update(chunk_locations)
        sequence_dict_1.update(chunk_sequences)
    # Only the accession -> organism map is persisted.
    with open(project_dir + '/names_dict_1.pkl', 'wb') as f:
        pickle.dump(names_dict_1, f)

In [None]:
if full_build == True:
    # Extract the actinomycete protein dataset in parallel (one job per chunk)
    # and merge the per-chunk results into flat lists / lookup dictionaries.
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_protein_dataset)(num_cores, core_number, actinomycetes_dirs, actinomycetes_seq_dir)
        for core_number in core_numbers
    )
    all_cds_2, all_tb_cds_2 = [], []
    names_dict_2, locations_dict_2, sequence_dict_2 = {}, {}, {}
    for chunk_cds, chunk_tb_cds, chunk_names, chunk_locations, chunk_sequences in parallel_output:
        all_cds_2.extend(chunk_cds)
        all_tb_cds_2.extend(chunk_tb_cds)
        # Each chunk list holds [key, value] pairs, which dict.update accepts directly.
        names_dict_2.update(chunk_names)
        locations_dict_2.update(chunk_locations)
        sequence_dict_2.update(chunk_sequences)
    # Only the accession -> organism map is persisted.
    with open(project_dir + '/names_dict_2.pkl', 'wb') as f:
        pickle.dump(names_dict_2, f)

In [None]:
if full_build == True:
    # Build locus -> [region_start, region_end, strand, sequence] lookups for both
    # reference sets. If a key occurs more than once, the last occurrence wins.
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_upstream_sequence_dataset)(num_cores, core_number, mycobacteria_dirs, mycobacteria_seq_dir)
        for core_number in core_numbers
    )
    mycobacteria_upstream_dict = {
        accession_locus: [region_start, region_end, strand, sequence]
        for chunk_result in parallel_output
        for accession_locus, region_start, region_end, strand, sequence in chunk_result
    }
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_upstream_sequence_dataset)(num_cores, core_number, actinomycetes_dirs, actinomycetes_seq_dir)
        for core_number in core_numbers
    )
    actinomycetes_upstream_dict = {
        accession_locus: [region_start, region_end, strand, sequence]
        for chunk_result in parallel_output
        for accession_locus, region_start, region_end, strand, sequence in chunk_result
    }
    for pickle_name, upstream_dict in (
        ('/mycobacteria_upstream_dict.pkl', mycobacteria_upstream_dict),
        ('/actinomycetes_upstream_dict.pkl', actinomycetes_upstream_dict),
    ):
        with open(project_dir + pickle_name, 'wb') as f:
            pickle.dump(upstream_dict, f)

In [None]:
# Full rebuild only: write the protein FASTA files and build one BLAST database
# per protein set (all mycobacteria, TB only, all actinomycetes).
if full_build == True:
    # FASTA inputs are the [key, translation] lists collected by the dataset cells.
    util.produce_fasta_file(all_cds_1, project_dir + '/mycobacteria_cds.fasta')
    util.produce_fasta_file(all_tb_cds_1, project_dir + '/tb_cds.fasta')
    util.produce_fasta_file(all_cds_2, project_dir + '/actinomycetes_cds.fasta')
    # NOTE(review): arguments look like (source dir, FASTA file, database name, output dir);
    # confirm against Blast_Functions.build_blast_db.
    blastfn.build_blast_db(project_dir, 'mycobacteria_cds.fasta', 'Mycobacteria', project_dir + '/BLAST/Mycobacteria')
    blastfn.build_blast_db(project_dir, 'tb_cds.fasta', 'all_tb_cds', project_dir + '/BLAST/Tb')
    blastfn.build_blast_db(project_dir, 'actinomycetes_cds.fasta', 'Actinomycetes', project_dir + '/BLAST/Actinomycetes')

In [None]:
# Full rebuild only: reciprocal best hits between TB proteins and the mycobacterial set.
if full_build == True:
    # Forward search (TB queries vs. the Mycobacteria database) and reverse search
    # (mycobacterial queries vs. the TB database), both with an e-value cut-off of 1e-10.
    blastfn.run_blastp(project_dir + '/BLAST/Mycobacteria', 'tb_cds.fasta', 'Mycobacteria', 'tb_mycobacteria_hits.csv', e_value = 1e-10)
    blastfn.run_blastp(project_dir + '/BLAST/Tb', 'mycobacteria_cds.fasta', 'all_tb_cds', 'mycobacteria_tb_hits.csv', e_value = 1e-10)
    a = blastfn.process_blast_output(project_dir + '/BLAST/Mycobacteria/tb_mycobacteria_hits.csv', names_dict_1, top_hit_only = False)
    # Keep, per (query, target species name), the single hit with the highest bit score.
    a = a.loc[a.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()]     # Utility top hit method uses accession ver which can differ if multiple sets exist per species in fragmented annotations
    b = blastfn.process_blast_output(project_dir + '/BLAST/Tb/mycobacteria_tb_hits.csv', names_dict_1, top_hit_only = False)
    b = b.loc[b.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()] 
    # Combine both directions and save the result for later sessions.
    rbh =  blastfn.keep_reciprocal_best_hits(a, b)
    rbh.to_csv(project_dir + '/tb_mycobacteria_reciprocal_best_hits.csv')

In [None]:
# Full rebuild only: reciprocal best hits between TB proteins and the actinomycete set.
if full_build == True:
    # Forward search (TB queries vs. the Actinomycetes database) and reverse search
    # (actinomycete queries vs. the TB database), both with an e-value cut-off of 1e-10.
    blastfn.run_blastp(project_dir + '/BLAST/Actinomycetes', 'tb_cds.fasta', 'Actinomycetes', 'tb_actinomycetes_hits.csv', e_value = 1e-10)
    blastfn.run_blastp(project_dir + '/BLAST/Tb', 'actinomycetes_cds.fasta', 'all_tb_cds', 'actinomycetes_tb_hits.csv', e_value = 1e-10)
    a = blastfn.process_blast_output(project_dir + '/BLAST/Actinomycetes/tb_actinomycetes_hits.csv', names_dict_2, top_hit_only = False)
    # Keep, per (query, target species name), the single hit with the highest bit score.
    a = a.loc[a.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()]     # Utility top hit method uses accession ver which can differ if multiple sets exist per species in fragmented annotations
    b = blastfn.process_blast_output(project_dir + '/BLAST/Tb/actinomycetes_tb_hits.csv', names_dict_2, top_hit_only = False)
    b = b.loc[b.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()] 
    # Combine both directions and save the result for later sessions.
    rbh_2 =  blastfn.keep_reciprocal_best_hits(a, b)
    rbh_2.to_csv(project_dir + '/tb_actinomycetes_reciprocal_best_hits.csv')

#### Function to generate FASTA file containing intergenic regions in orthologous species and run Muscle / R-scape

In [61]:
# Restore previously saved state when the datasets were not rebuilt in this session.
# NOTE: pickle.load executes arbitrary code; only load pickles written by this notebook.
if not(full_build == True):
    with open(project_dir + '/names_dict_1.pkl', 'rb') as f:
        names_dict_1 = pickle.load(f)
    with open(project_dir + '/names_dict_2.pkl', 'rb') as f:
        names_dict_2 = pickle.load(f)
    with open(project_dir + '/mycobacteria_dirs.pkl', 'rb') as f:
        mycobacteria_dirs = pickle.load(f)
    with open(project_dir + '/actinomycetes_dirs.pkl', 'rb') as f:
        actinomycetes_dirs = pickle.load(f)
    with open(project_dir + '/mycobacteria_upstream_dict.pkl', 'rb') as f:
        mycobacteria_upstream_dict = pickle.load(f)
    with open(project_dir + '/actinomycetes_upstream_dict.pkl', 'rb') as f:
        actinomycetes_upstream_dict = pickle.load(f)

# The hit tables are always read back from disk. A full build writes these files
# but never binds the names used by the analysis cells below, so loading them
# only when full_build is False would leave those names undefined after a rebuild.
tb_mycobacteria_hits = blastfn.process_blast_output(project_dir + '/BLAST/Mycobacteria/tb_mycobacteria_hits.csv', names_dict_1, top_hit_only = False)
tb_mycobacteria_rbh = pd.read_csv(project_dir + '/tb_mycobacteria_reciprocal_best_hits.csv')
tb_actinomycetes_rbh = pd.read_csv(project_dir + '/tb_actinomycetes_reciprocal_best_hits.csv')

In [101]:
def align_upstream_regions(locus, comparison_type): # 1 = mycobacteria, #2 = actinomycetes
    """Write and align the upstream regions of a TB locus and its orthologues.

    Orthologues are the reciprocal best hits of ``tb_species@locus`` with more than
    40 percent identical matches. For every hit with a stored upstream region longer
    than 8 bases, the region is collected under the organism name of the hit
    (spaces replaced by underscores).

    If more than 10 regions were collected and one of them comes from TB itself,
    the regions are written to
    ``project_dir/Intergenic_Regions/<locus>/<prefix>_intergenic_regions.fasta`` and
    aligned with MUSCLE into ``<prefix>_upstream_region_<locus>_alignment.fasta``.
    Otherwise nothing is written.

    Args:
        locus: TB locus tag as a string (e.g. 'Rv2242').
        comparison_type: 1 to use the mycobacterial reference set; any other
            value uses the actinomycete set.

    Returns:
        None. A MUSCLE failure is reported on stdout and does not raise, so a
        batch run over many loci continues.
    """
    if comparison_type == 1:
        comparison_df = tb_mycobacteria_rbh
        comparison_dict = mycobacteria_upstream_dict
        name_dict = names_dict_1
        prefix = 'mycobacteria'
    else:
        comparison_df = tb_actinomycetes_rbh
        comparison_dict = actinomycetes_upstream_dict
        prefix = 'actinomycetes'
        name_dict = names_dict_2
    intergenic_regions = []
    target_locus = tb_species + '@' + locus
    hits = comparison_df[comparison_df['query_ref'] == target_locus]
    hits = hits[hits['percent_identical_matches'] > 40]
    tb_hit = 0
    for i, r in hits.iterrows():
        target_ref = r['target_ref']
        if target_ref in comparison_dict:
            upstream_region = comparison_dict[target_ref][3]
            if len(upstream_region) > 8:
                if tb_species in target_ref:
                    tb_hit = 1
                intergenic_regions.append([name_dict[target_ref.split('@')[0]].replace(' ', '_'), upstream_region])
    if len(intergenic_regions) > 10 and tb_hit == 1:    
        results_dir = project_dir + '/Intergenic_Regions/' + locus
        os.makedirs(results_dir, exist_ok=True)
        fasta_path = results_dir +'/'+prefix+'_intergenic_regions.fasta'
        alignment_path = results_dir + '/' + prefix + '_upstream_region_'+locus + '_alignment.fasta'
        util.produce_fasta_file(intergenic_regions, fasta_path)
        cline = MuscleCommandline(muscle_exe, input= fasta_path, out = alignment_path)
        exception = 0
        try:
            cline()
        except Exception as e:
            # Best effort: keep processing other loci, but record and report the failure.
            exception = 1
            print('MUSCLE alignment failed for ' + locus + ' (' + prefix + '): ' + str(e))
        # R-scape covariation analysis is currently disabled:
        #if exception == 0 and comparison_type == 2:
        #    blastfn.convert_fasta_to_stockholm(results_dir, prefix + '_upstream_region_'+locus + '_alignment.fasta', prefix + '_upstream_region_'+locus + '_alignment.sto')
        #    blastfn.run_rscape(results_dir, prefix + '_upstream_region_'+locus + '_alignment.sto', 'rscape_')


In [102]:
# Read the significantly regulated genes and record, for every locus, whether it
# was down- or upregulated. gene_list keeps all loci, downregulated ones first.
regulation_type_dict = {}
gene_list = []
downreg_genes = pd.read_excel(project_dir + '/2022-10-02_data.xlsx', sheet_name = 'significnat genes downregulated')
upreg_genes = pd.read_excel(project_dir + '/2022-10-02_data.xlsx', sheet_name = 'significant genes upregulated')
for regulated_genes, regulation_label in ((downreg_genes, 'Downregulated'), (upreg_genes, 'Upregulated')):
    for i, r in regulated_genes.iterrows():
        regulated_locus = r['Locus']
        gene_list.append(regulated_locus)
        regulation_type_dict[regulated_locus] = regulation_label

In [None]:
# Align upstream regions for every regulated locus against the mycobacterial set.
# The actinomycete comparison (type 2) is currently switched off.
for locus in gene_list:
    align_upstream_regions(str(locus), 1)
    #align_upstream_regions(str(locus), 2)

In [104]:
# Summarise every mycobacterial upstream-region alignment with at least 20 sequences:
# an 80%-majority consensus, the window of mave_len columns with the highest mean
# relative entropy, and the aligned TB (H37Rv) row.
mave_len = 10    # moving-average window length, in alignment columns
id_re = []
seq_ids = util.list_dirs(project_dir + '/Intergenic_Regions')
out_list = []
consensus_dict = {}
for seq_id in tqdm(seq_ids):
    alignment_path = (project_dir + '/Intergenic_Regions/' + str(seq_id)
                      + '/mycobacteria_upstream_region_' + str(seq_id) + '_alignment.fasta')
    if not os.path.exists(alignment_path):
        continue
    alignment = util.read_fasta_to_array(alignment_path)
    names, aligned_seqs = alignment[0], alignment[1]
    sequences_in_alignment = len(names)
    if sequences_in_alignment < 20:
        continue
    alignment_length = len(aligned_seqs[0])

    # Locate the TB row afresh for every alignment. Reusing the index found in a
    # previous alignment would silently report the wrong sequence.
    tb_index = None
    for j, name in enumerate(names):
        if 'H37Rv' in name:
            tb_index = j
            break
    tb_sequence = aligned_seqs[tb_index] if tb_index is not None else ''

    # Consensus: the base present in at least 80% of sequences, otherwise '*'.
    consensus = []
    for i in range(alignment_length):
        column = [aligned_seqs[j][i] for j in range(sequences_in_alignment)]
        for letter in ['A','C','G','T']:
            if column.count(letter) / sequences_in_alignment >= 0.8:
                consensus.append(letter)
                break
        else:
            consensus.append('*')
    consensus_dict[seq_id] = consensus

    rel_entropy = alignfn.relative_entropy(aligned_seqs)
    per_position_re = rel_entropy[0]
    # Mean relative entropy of every full window, including the last one.
    mave_re = []
    for i in range(len(per_position_re) - mave_len + 1):
        window = per_position_re[i:i+mave_len]
        mave_re.append(sum(window)/len(window))
    if len(mave_re) == 0:
        continue
    # First window with the highest mean relative entropy.
    max_re = -999
    max_re_pos = 0
    for i, window_mean in enumerate(mave_re):
        if window_mean > max_re:
            max_re = window_mean
            max_re_pos = i
    id_re.append([seq_id, max_re, ''.join(consensus[max_re_pos: max_re_pos + mave_len]), ''.join(consensus), len(consensus), len([x for x in per_position_re if x > 1.9]), 
                  sequences_in_alignment, tb_sequence])

100%|██████████| 148/148 [00:02<00:00, 63.29it/s]


In [105]:
# Tabulate the per-alignment summaries; 'Downstream CDS' is the join key used
# when merging with hit_info_df.
a = pd.DataFrame(id_re, columns = ['Downstream CDS', 'Maximum_Moving_Ave_RE', 'Max_Region_Consensus_Sequence','Full_Consensus','Intergenic_Region_TB_Length','Num_positions_gt_90_ID', 'Num_Seq_in_alignement','TB_Upstream_Sequence'])


In [108]:
# Per regulated gene: number of reciprocal best hits above 40% identity, length of
# the TB upstream region (start codon removed), whether TB is among the usable
# hits, and how many hits have a usable (> 8 bases) upstream region.
hit_info = []
for gene_name in gene_list:
    locus = tb_species + '@'+ str(gene_name)
    hits = tb_mycobacteria_rbh.query('query_ref == @locus and percent_identical_matches > 40')
    num_hits_gt_40 = len(hits)
    tb_hit = 0
    intergenic_regions = []
    for i, r in hits.iterrows():
        target_ref = r['target_ref']
        if target_ref not in mycobacteria_upstream_dict:
            continue
        # Drop the last three bases (start codon of the downstream gene).
        upstream_region = mycobacteria_upstream_dict[target_ref][3][:-3]
        if len(upstream_region) <= 8:
            continue
        if tb_species in target_ref:
            tb_hit = 1
        intergenic_regions.append([names_dict_1[target_ref.split('@')[0]].replace(' ', '_'), upstream_region])
    if locus in mycobacteria_upstream_dict:
        tb_upstream_length = len(mycobacteria_upstream_dict[locus][3][:-3])
    else:
        tb_upstream_length = 0
    hit_info.append([gene_name, regulation_type_dict[gene_name], num_hits_gt_40, tb_upstream_length, tb_hit, len(intergenic_regions)])
hit_info_df = pd.DataFrame(hit_info, columns = ['Downstream CDS', 'Regulation type','num_hits', 'tb_upstream_region_length', 'tb_hit', 'num_upstream_regions'])

In [109]:
# Left-join the alignment summaries onto the per-gene hit table (genes without an
# alignment keep empty strings in the summary columns) and export the result.
gene_info_df = hit_info_df.merge(a, on='Downstream CDS', how='left').fillna('')
gene_info_df.to_csv(project_dir +'/significant_gene_info.csv')

In [92]:
# Display the per-gene hit table.
hit_info_df

Unnamed: 0,Downstream CDS,num_hits,upstream_region_length,tb_hit,num_upstream_regions
0,AL123456.3@Rv2242,99,59,1,81
1,AL123456.3@Rv0096,35,108,1,27
2,Rv2245,100,0,0,8
3,AL123456.3@Rv0097,34,18,1,26
4,AL123456.3@Rv0360c,100,82,1,79
...,...,...,...,...,...
93,AL123456.3@Rv0926c,92,53,1,62
94,AL123456.3@Rv0154c,96,423,1,77
95,AL123456.3@Rv0552,70,77,1,58
96,AL123456.3@Rv3772,98,77,1,81


##### Motif analysis

In [None]:
# Count the non-comment lines of each locus's R-scape output file.
# Locus directories live under project_dir/Intergenic_Regions, so that is the
# directory that has to be listed. Listing project_dir itself never finds a file.
seq_ids = util.list_dirs(project_dir + '/Intergenic_Regions')
out_list = []
for seq_id in seq_ids:
    cov_path = project_dir + '/Intergenic_Regions/' + str(seq_id) + '/rscape_.cov'
    if os.path.exists(cov_path):
        with open(cov_path, 'r') as f:
            # Every line without a '#' is counted as one pair.
            # NOTE(review): presumably one covarying pair per data line; confirm
            # against the R-scape .cov format.
            num_pairs = sum(1 for line in f if '#' not in line)
        out_list.append((seq_id, num_pairs))
out_list.sort(key = lambda x: x[0])
out_list

In [29]:
# Loci of the significantly downregulated genes, in spreadsheet order.
downreg_genes = pd.read_excel(project_dir + '/2022-10-02_data.xlsx', sheet_name = 'significnat genes downregulated')
downreg_gene_list = [row['Locus'] for _, row in downreg_genes.iterrows()]

In [30]:
# Collect the TB upstream region of every downregulated gene (regions of 5 bases
# or fewer are skipped) and write them to a FASTA file for motif discovery.
downreg_gene_upstream_tb_regions = []
for gene in downreg_gene_list:
    locus = tb_species + '@' + str(gene)
    upstream_entry = mycobacteria_upstream_dict.get(locus)
    if upstream_entry is None:
        continue
    # The start codon of the downstream gene (last three bases) is kept here.
    upstream_region = upstream_entry[3]
    if len(upstream_region) <= 5:
        continue
    downreg_gene_upstream_tb_regions.append([locus, upstream_region])
util.produce_fasta_file(downreg_gene_upstream_tb_regions, project_dir + '/downreg_gene_upstream_tb_regions.faa')

100%|██████████| 72/72 [00:00<?, ?it/s]


In [31]:
# Run MEME (inside WSL) on the downregulated-gene upstream regions.
# min_width / min_sites are passed as -minw / -minsites.
# NOTE(review): the input and output paths are hard-coded to /mnt/f/Project_Data/mabR_Project
# rather than derived from project_dir; keep them in sync if project_dir changes.
min_width = 3
min_sites = 5
subprocess.run('wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; meme \"/mnt/f/Project_Data/mabR_Project/downreg_gene_upstream_tb_regions.faa" -oc \"/mnt/f/Project_Data/mabR_Project/downreg_meme_result" -dna -evt 0.01 -mod anr -brief 4000 -minw ' + str(min_width) +' -maxw 200 -minsites ' + str(min_sites)
               , shell=True)
      

CompletedProcess(args='wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; meme "/mnt/f/Project_Data/mabR_Project/downreg_gene_upstream_tb_regions.faa" -oc "/mnt/f/Project_Data/mabR_Project/downreg_meme_result" -dna -evt 0.01 -mod anr -brief 4000 -minw 3 -maxw 200 -minsites 5', returncode=0)

In [None]:
# Print the MEME command line for the 'zoops' model, built from the current
# min_width / min_sites values, e.g. to run it by hand in a shell.
# NOTE(review): the output path is on /mnt/d while the input is on /mnt/f; confirm this is intended.
print('meme "/mnt/f/Project_Data/mabR_Project/downreg_gene_upstream_tb_regions.faa" -oc "/mnt/d/Project_Data/mabR_Project/downreg_meme_result" -dna -evt 0.01 -mod zoops -brief 4000 -minw ' + str(min_width) +' -maxw 200 -minsites ' + str(min_sites))

In [26]:
# Load the TB genome sequence as a plain string. If the file holds several
# records, full_sequence ends up as the sequence of the last one.
for record in SeqIO.parse(mycobacteria_seq_dir + '/' + tb_annotation_dirname +'/genomic.gbff', "genbank"):
    full_sequence = str(record.seq)
# Display the genome length.
len(full_sequence)

4411532

In [27]:
# Display TB genome bases at 0-based positions 2515303 to 2516547 (end-exclusive slice).
full_sequence[2515303:2516548]

'GTGAACGACAATCAGTTGGCTCCAGTTGCCCGCCCGAGGTCGCCGCTCGAACTGCTGGACACTGTGCCCGATTCGCTGCTGCGGCGGTTGAAGCAGTACTCGGGCCGGCTGGCCACCGAGGCAGTTTCGGCCATGCAAGAACGGTTGCCGTTCTTCGCCGACCTAGAAGCGTCCCAGCGCGCCAGCGTGGCGCTGGTGGTGCAGACGGCCGTGGTCAACTTCGTCGAATGGATGCACGACCCGCACAGTGACGTCGGCTATACCGCGCAGGCATTCGAGCTGGTGCCCCAGGATCTGACGCGACGGATCGCGCTGCGCCAGACCGTGGACATGGTGCGGGTCACCATGGAGTTCTTCGAAGAAGTCGTGCCCCTGCTCGCCCGTTCCGAAGAGCAGTTGACCGCCCTCACGGTGGGCATTTTGAAATACAGCCGCGACCTGGCATTCACCGCCGCCACGGCCTACGCCGATGCGGCCGAGGCACGAGGCACCTGGGACAGCCGGATGGAGGCCAGCGTGGTGGACGCGGTGGTACGCGGCGACACCGGTCCCGAGCTGCTGTCCCGGGCGGCCGCGCTGAATTGGGACACCACCGCGCCGGCGACCGTACTGGTGGGAACTCCGGCGCCCGGTCCAAATGGCTCCAACAGCGACGGCGACAGCGAGCGGGCCAGCCAGGATGTCCGCGACACCGCGGCTCGCCACGGCCGCGCTGCGCTGACCGACGTGCACGGCACCTGGCTGGTGGCGATCGTCTCCGGCCAGCTGTCGCCAACCGAGAAGTTCCTCAAAGACCTGCTGGCAGCATTCGCCGACGCCCCGGTGGTCATCGGCCCCACGGCGCCCATGCTGACCGCGGCGCACCGCAGCGCTAGCGAGGCGATCTCCGGGATGAACGCCGTCGCCGGCTGGCGCGGAGCGCCGCGGCCCGTGCTGGCTAGGGAACTTTTGCCCGAACGCGCCCTGATGGGCGACGCCTCGGCGATCGTGGCCCTGCAT

In [35]:
# Print every 0-based position at which the reverse complement of the motif
# occurs in the TB genome string (overlapping matches included).
motif = util.reverse_complement('GGGAAAGCTTA')
n = len(motif)    # window length follows the motif instead of being hard-coded
# str.find scans to the very end of the sequence, so a match in the final
# window is not missed.
i = full_sequence.find(motif)
while i != -1:
    print(i)
    i = full_sequence.find(motif, i + 1)

3691127
