#### Import packages, set directories and parameters

In [3]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2
from Comparative_Analysis import Alignment as alignfn
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [4]:
# --- Locations of project outputs and of the downloaded RefSeq annotation set ---
project_dir = 'F:/Project_Data1/Project'
mycobacteria_seq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_Complete_Annot_20230511/data'

# --- Reference genome identifiers ---
# NOTE(review): 'AL123456.3' looks like a sequence accession; it is not used in
# the cells visible here - confirm where it is consumed.
tb_species = 'AL123456.3'
# Sub-directory of mycobacteria_seq_dir that holds the reference 'genomic.gbff'.
tb_annotation_dirname = 'GCA_000195955.2'

# --- Parallelism settings ---
# core_numbers is the 1-based list of worker indices: [1, 2, ..., num_cores].
num_cores = 16
core_numbers = [*range(1, num_cores + 1)]

In [8]:
# Collect, for every CDS feature that carries a locus tag, a short sequence
# window ending at the last base of its annotated start codon.
#
# Each entry of upstream_regions is [locus_tag, window], where window is read in
# the coding orientation:
#   * forward strand: up to 7 bases before feature.location.start followed by
#     the 3 bases starting at it;
#   * otherwise: the 3 bases ending at feature.location.end plus up to 7 bases
#     after it, reverse-complemented.
# Windows are shorter than 10 bases when the CDS lies within 7 bases of either
# end of the record (no wrap-around is attempted).
upstream_regions = []
genbank_path = mycobacteria_seq_dir + '/' + tb_annotation_dirname + '/genomic.gbff'
for genome_record in SeqIO.parse(genbank_path, "genbank"):
    full_sequence = str(genome_record.seq)
    len_full_sequence = len(full_sequence)
    for feature in genome_record.features:
        # Only CDS features contribute; skipping everything else here (rather
        # than appending after the CDS branch) prevents non-CDS features from
        # re-appending the previous CDS's values or hitting undefined names.
        if feature.type != 'CDS':
            continue
        locus_tags = feature.qualifiers.get("locus_tag")
        if locus_tags is None:
            continue
        locus_tag = locus_tags[0]
        (start, stop, strand) = (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
        if strand == 1:
            # Clamp at 0: a negative slice start would wrap to the end of the
            # string and silently produce an empty window.
            upstream_region = full_sequence[max(start - 7, 0): start + 3]
        else:
            # Slicing past the end of the string truncates automatically.
            upstream_region = util.reverse_complement(full_sequence[stop - 3: stop + 7])
        upstream_regions.append([locus_tag, upstream_region])

In [20]:
# Map each locus tag to the final five bases of its stored window. If a locus
# tag occurs more than once in upstream_regions, the last occurrence wins.
start_dict = {locus_id: window[-5:] for (locus_id, window) in upstream_regions}

In [21]:
# Load the significantly down- and up-regulated gene tables and build:
#   gene_list            - every value of the 'Locus' column, downregulated
#                          rows first, then upregulated rows (duplicates kept);
#   regulation_type_dict - Locus -> 'Downregulated' / 'Upregulated' (a locus
#                          present in both sheets ends up as 'Upregulated').
mab_dir = 'F:/Project_Data/mabR_Project'
regulation_type_dict = {}
gene_list = []
# NOTE(review): the two sheets are read from differently named workbooks
# ('..._data_NU_NAs_renamed.xlsx' vs '..._data.xlsx') - confirm this is intended.
downreg_genes = pd.read_excel(mab_dir + '/2022-10-02_data_NU_NAs_renamed.xlsx', sheet_name = 'significant genes downregulated')
upreg_genes = pd.read_excel(mab_dir + '/2022-10-02_data.xlsx', sheet_name = 'significant genes upregulated')
for gene_table, regulation_label in ((downreg_genes, 'Downregulated'), (upreg_genes, 'Upregulated')):
    for _, row in gene_table.iterrows():
        locus_id = row['Locus']
        gene_list.append(locus_id)
        regulation_type_dict[locus_id] = regulation_label

In [26]:
# Ad-hoc inspection: reverse complement of the 60-base slice [806172:806232] of
# full_sequence (the sequence string of the last record parsed by the
# SeqIO.parse loop above). NOTE(review): the reason this region was singled out
# is not recorded in the notebook - add a note if it matters.
util.reverse_complement(full_sequence[806172:806232])

'GGCGAGCAGTCGCAAAAGCCCCCGCACGCCGCGCGTGTCGGGGGCTTTTGCGACTGCTCG'

In [22]:
# Print one line per gene of interest: locus, the five-base start window from
# start_dict, and its regulation label. Missing entries print as ''.
for locus_id in gene_list:
    start_window = start_dict.get(locus_id, '')
    regulation_label = regulation_type_dict.get(locus_id, '')
    print(locus_id, start_window, regulation_label)

Rv2242 GAGTG Downregulated
Rv0096 GGATG Downregulated
Rv2245 AAGTG Downregulated
Rv0097 CGATG Downregulated
Rv0360c ACGTG Downregulated
Rv3149 CGGTG Downregulated
Rv2244 CTGTG Downregulated
Rv3145 AGTTG Downregulated
Rv1158c ACATG Downregulated
Rv3139 GTATG Downregulated
Rv3775 CCATG Downregulated
Rv1754c GAATG Downregulated
Rv2246 AGGTG Downregulated
Rv3147 CGATG Downregulated
Rv3150 GAATG Downregulated
Rv2243 ACGTG Downregulated
Rv1687c ACATG Downregulated
Rv0098 GCATG Downregulated
Rv1686c CGATG Downregulated
Rv0753c CCATG Downregulated
Rv0171 CAATG Downregulated
Rv0173 CAATG Downregulated
Rv2247 CGATG Downregulated
Rv0860 GCATG Downregulated
Rv1815 AGGTG Downregulated
Rv0242c AAGTG Downregulated
Rv0654 GCATG Downregulated
Rv1400c GTATG Downregulated
Rv1884c TTGTG Downregulated
Rv3148 TGATG Downregulated
Rv0172 AATTG Downregulated
Rv3760 CGGTG Downregulated
Rv3028c CCATG Downregulated
Rv0859 ACATG Downregulated
Rv3151 GCGTG Downregulated
Rv0287 TTATG Downregulated
Rv0106 CGATG Downr