#### Import packages, set directories and parameters

In [4]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2
from Comparative_Analysis import Alignment as alignfn
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
project_dir = 'F:/Project_Data/mabR_Project'
mycobacteria_seq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_Complete_Annot_20230511/data'
tb_species = 'AL123456.3' 
tb_annotation_dirname = 'GCA_000195955.2'
min_region_length = 7 
full_build = False
num_cores = 16
core_numbers = list(range(1, num_cores+1))
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'
full_run = False

In [8]:
cds_features = []
non_cds_features = []
genome_record = next(SeqIO.parse(mycobacteria_seq_dir + '/' + tb_annotation_dirname + '/genomic.gbff', "genbank"))
full_sequence = str(genome_record.seq)
len_full_sequence = len(full_sequence)
for feature in genome_record.features:
    if feature.type in ['gene', 'source']:
        pass
    else:
        a = feature.qualifiers
        feature_type = feature.type
        if feature_type == 'CDS':
            locus_tag = a.get("locus_tag")[0]
            (start, stop, strand) = (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
            if strand == 1:
                nt_sequence = full_sequence[start: stop]
            else:
                nt_sequence = util.reverse_complement(full_sequence[start: stop])
            cds_features.append([a.get("locus_tag")[0], start, stop, strand, nt_sequence])

cds_features.sort(key = lambda x: x[1])
max_stop = 0
for (locus, start, stop, strand, nt_sequence) in cds_features:
    if max_stop < start:
        non_cds_features.append([locus, max_stop, start, strand, full_sequence[max_stop: start]])
    max_stop = max(max_stop, stop)
   
 
     

In [16]:
cds_hexamer_counts = {}
for rf in [0,1,2]:
    for strand in [1, -1]:
        for (locus, start, stop, strand, nt_sequence) in cds_features:
            if strand == 1:
                temp_sequence = nt_sequence
            else:
                temp_sequence = util.reverse_complement(nt_sequence)
            sequence_length = len(temp_sequence)
            for i in range(sequence_length - (7 + rf)):
                hexamer = temp_sequence[i + rf : i + rf + 6]
                if hexamer in cds_hexamer_counts:
                    cds_hexamer_counts[hexamer] +=1
                else:
                    cds_hexamer_counts[hexamer] = 1

In [20]:
len(cds_hexamer_counts)

4096

In [27]:
temp = []
for k, v in cds_hexamer_counts.items():
    temp.append((k, v))
temp.sort(key = lambda x: x[1], reverse = True)
temp

[('CGGCGG', 12379),
 ('GGCGGC', 11023),
 ('CCGGCG', 9729),
 ('GCCGGC', 9600),
 ('GCGGCG', 9473),
 ('CCGCCG', 8758),
 ('CGCCGG', 8582),
 ('CGCCGA', 8547),
 ('CGGCGC', 8500),
 ('CGCCGC', 8385),
 ('GCCGCC', 8173),
 ('GCGCCG', 7730),
 ('GCTGGC', 7637),
 ('GGCCGC', 7501),
 ('TCGCCG', 7438),
 ('TGGCCG', 7400),
 ('CGGCCG', 7389),
 ('CGCGGC', 7330),
 ('GCCGCG', 7215),
 ('TCGGCG', 7088),
 ('GGCCGG', 7068),
 ('GCGGCC', 6926),
 ('GCGCTG', 6801),
 ('CGGCGA', 6779),
 ('GCGGTG', 6632),
 ('GCCGAC', 6630),
 ('CGCTGG', 6387),
 ('CCGACG', 6315),
 ('CGGTGG', 6224),
 ('CGACGA', 6223),
 ('CCGCGG', 6185),
 ('CTGGCC', 6118),
 ('TCGACG', 6077),
 ('GGCGCC', 5960),
 ('CACCGG', 5751),
 ('GGTGGC', 5719),
 ('ACCGGC', 5698),
 ('CGGTGC', 5662),
 ('CCGGTG', 5631),
 ('CGACGG', 5498),
 ('TGCTGG', 5486),
 ('ACGGCG', 5467),
 ('GCGCGG', 5266),
 ('GGCGGT', 5240),
 ('GGCCGA', 5202),
 ('CGACGC', 5145),
 ('CCGGCC', 5142),
 ('GTCGGC', 5141),
 ('CCGCGC', 5132),
 ('TGGCGG', 5122),
 ('CGCTGC', 5121),
 ('ACGCCG', 5115),
 ('GGCGCG'