##### Set up packages and directories

In [1]:
# Master switch: the BLAST database build and BLAST search cells further down
# only execute when this is True.
full_run = True

In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import ORF_Functions as orffn
import random
import copy
from joblib import Parallel, delayed
import os
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
# Absolute path to the MUSCLE executable (machine-specific Windows path).
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

In [5]:
# Machine-specific locations of the project workspace and the input datasets.
project_dir = 'F:/Project_Data/Project_10'
literature_datasets_dir = 'F:/Datasets/Data_From_Publications'
refseq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
output_dir = f'{project_dir}/Output'

# Number of cores and the matching 1-based list of core ids.
num_cores = 8
core_numbers = [core_index + 1 for core_index in range(num_cores)]

In [6]:
# Identifier of the reference genome; it is used as the sub-directory name
# under refseq_dir from which genomic.gbff is read.
reference_species = 'GCF_000195955.2'

In [19]:
# Length threshold (nucleotides): a gap between annotated CDS is only kept as
# an intergenic region when it is longer than this.
min_nts = 30

In [11]:
# Read the reference GenBank file and keep the versioned accession and the
# nucleotide sequence. If the file holds several records, the values from the
# last record are the ones that remain after the loop.
reference_gbff = f"{refseq_dir}/{reference_species}/genomic.gbff"
for record in SeqIO.parse(reference_gbff, "genbank"):
    annotations = record.annotations
    accession_ver = f"{annotations['accessions'][0]}.{annotations['sequence_version']}"
    full_sequence = str(record.seq)

# Write the sequence out as a FASTA file in the project directory.
reference_entry = [accession_ver, full_sequence]
util.produce_fasta_file([reference_entry], project_dir + '/full_tb_sequence.fasta')

100%|██████████| 1/1 [00:00<00:00, 14.82it/s]


In [17]:
# Collect (locus_tag, start, end, strand) for every CDS feature of the
# reference annotation, then order the loci by start coordinate.
tb_loci = []
for record in SeqIO.parse(f"{refseq_dir}/{reference_species}/genomic.gbff", "genbank"):
    for feature in record.features:
        # Only coding sequences are of interest here.
        if feature.type != 'CDS':
            continue
        locus_tag = feature.qualifiers.get("locus_tag")[0]
        location = feature.location
        tb_loci.append((locus_tag, int(location.start), int(location.end), int(location.strand)))
tb_loci.sort(key=lambda locus: locus[1])

In [20]:
# Walk the CDS in start order and record every gap longer than min_nts between
# the furthest CDS end seen so far and the next CDS start. Each entry is
# ['<gap start>_<gap end>', gap sequence].
intergenic_regions = []
max_stop = 0
for (locus, start, stop, strand) in tb_loci:
    gap_length = start - max_stop
    if gap_length > min_nts:
        region_name = f"{max_stop}_{start}"
        intergenic_regions.append([region_name, full_sequence[max_stop:start]])
    # Overlapping or nested CDS must not move the frontier backwards.
    if stop > max_stop:
        max_stop = stop

In [22]:
# Write the collected intergenic regions to a FASTA file in the project
# directory (presumably one record per [name, sequence] pair; see
# util.produce_fasta_file to confirm).
util.produce_fasta_file(intergenic_regions, project_dir + '/tb_intergenic_regions.faa')

100%|██████████| 2190/2190 [00:00<00:00, 71435.44it/s]


In [9]:
# Repeats the min_nts assignment from an earlier cell with the same value
# (minimum gap length, in nucleotides, for an intergenic region).
min_nts = 30

##### Extract full sequences from each organism and create a directory of start and stop positions for each annotated CDS (use Mycobrowser for MTb)

##### Create blast databases

In [12]:
if full_run:
    # Build a nucleotide BLAST database from the reference genome FASTA.
    # The command runs with project_dir as its working directory, so the input
    # FASTA is looked up there and the database files (prefix
    # full_tb_sequence_nt) are written there too. Passing cwd= avoids changing
    # the notebook's own working directory, which previously was not restored
    # if anything raised in between.
    makeblastdb_run = subprocess.run(
        [
            'makeblastdb',
            '-in', 'full_tb_sequence.fasta',
            '-dbtype', 'nucl',
            '-out', 'full_tb_sequence_nt',
        ],
        cwd=project_dir,
        capture_output=True,
        text=True,
    )
    # The captured output used to be thrown away, so a failed build went
    # unnoticed; report it instead.
    if makeblastdb_run.returncode != 0:
        raise RuntimeError(
            f'makeblastdb failed with exit code {makeblastdb_run.returncode}:\n'
            f'{makeblastdb_run.stderr or makeblastdb_run.stdout}'
        )

In [None]:
if full_run:
    # Search the intergenic regions against the reference genome database and
    # write the hits as comma-separated values (outfmt 10) with the listed
    # columns.
    #
    # NOTE(review): earlier cells write tb_intergenic_regions.faa and the
    # full_tb_sequence_nt database into project_dir, but this search runs in a
    # separate BLAST folder. Confirm those files are present in blast_dir (or
    # point cwd at project_dir) before relying on the output.
    blast_dir = 'f:\\Datasets\\BLAST\\full_tb_sequence_nt'
    blastn_run = subprocess.run(
        [
            'blastn',
            '-query', 'tb_intergenic_regions.faa',
            '-db', 'full_tb_sequence_nt',
            '-out', 'intergenic_mtb_mtb_hits.csv',
            '-evalue', '1e-7',
            # blastn has no -seg option (that is the protein filter); its
            # low-complexity filter is DUST, so disable that instead.
            '-dust', 'no',
            '-outfmt', '10 qaccver saccver qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore',
            '-num_threads', '16',
        ],
        cwd=blast_dir,
        capture_output=True,
        text=True,
    )
    # The captured output used to be thrown away, so a failed search went
    # unnoticed; report it instead.
    if blastn_run.returncode != 0:
        raise RuntimeError(
            f'blastn failed with exit code {blastn_run.returncode}:\n'
            f'{blastn_run.stderr or blastn_run.stdout}'
        )