##### Set up packages and directories

In [1]:
# Master switch: when True, the cells guarded by `if full_run == True:` rebuild the
# BLAST database and rerun the BLAST searches instead of reusing files already on disk.
full_run = True

In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import ORF_Functions as orffn
import random
import copy
from joblib import Parallel, delayed
import os
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
# Absolute path to the local MUSCLE 3.8.31 Windows executable.
# NOTE(review): machine-specific path; presumably passed to MuscleCommandline elsewhere — confirm.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

In [5]:
# --- Input datasets (machine-specific absolute paths) ---
refseq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
literature_datasets_dir = 'F:/Datasets/Data_From_Publications'

# --- Project working directory and its output sub-directory ---
project_dir = 'F:/Project_Data/Project_10'
output_dir = project_dir + '/Output'

# --- Parallelism: number of workers and their 1-based identifiers ---
num_cores = 8
core_numbers = [core_id for core_id in range(1, num_cores + 1)]

In [6]:
# Assembly accession of the reference genome; used as the sub-directory name under
# refseq_dir from which genomic.gbff is read.
reference_species = 'GCF_000195955.2'

In [19]:
# Minimum gap (in nucleotides) between the furthest annotated end seen so far and the
# next CDS start for the gap to be written out as an intergenic region.
min_nts = 30

In [11]:
# Read the reference GenBank file and keep the versioned accession and the sequence.
# If the file holds several records, the values from the last record are the ones kept.
reference_gbff_path = refseq_dir + '/' + reference_species + '/genomic.gbff'
for record in SeqIO.parse(reference_gbff_path, "genbank"):
    record_annotations = record.annotations
    accession_ver = '{}.{}'.format(record_annotations['accessions'][0], record_annotations['sequence_version'])
    full_sequence = str(record.seq)

# Write the sequence as a single-entry FASTA file in the project directory.
reference_fasta_entries = [[accession_ver, full_sequence]]
util.produce_fasta_file(reference_fasta_entries, project_dir + '/full_tb_sequence.fasta')

100%|██████████| 1/1 [00:00<00:00, 14.82it/s]


In [33]:
# Collect (feature type, start, end) for every feature in the reference GenBank
# annotation, skipping the 'source' and 'gene' feature types.
# Coordinates are taken directly from Biopython's feature.location.start / .end.
tb_loci = []
for record in SeqIO.parse(refseq_dir + '/' + reference_species + '/genomic.gbff', "genbank"):
    for feature in record.features:
        if feature.type not in ('source', 'gene'):
            tb_loci.append((feature.type, int(feature.location.start), int(feature.location.end)))

# Order loci by start coordinate so they can be scanned left to right.
tb_loci.sort(key=lambda locus: locus[1])

In [35]:
# Scan the start-sorted RefSeq loci and record the unannotated stretch that precedes
# each CDS, provided it is longer than min_nts. Regions are named '<gap start>_<gap end>'.
std_annotation_intergenic_regions = []
max_stop = 0  # furthest end coordinate of any locus visited so far
for feature_type, start, stop in tb_loci:
    gap_exceeds_minimum = (start - max_stop) > min_nts
    if feature_type == 'CDS' and gap_exceeds_minimum:
        region_name = f'{max_stop}_{start}'
        region_sequence = full_sequence[max_stop:start]
        std_annotation_intergenic_regions.append([region_name, region_sequence])
    # Every locus type (not only CDS) advances the covered boundary.
    if stop > max_stop:
        max_stop = stop

std_intergenic_fasta_path = project_dir + '/std_annot_tb_intergenic_regions.faa'
util.produce_fasta_file(std_annotation_intergenic_regions, std_intergenic_fasta_path)

In [69]:
# Load the Mycobrowser annotation table and build a start-sorted list of
# [locus, feature type, start, stop] entries. One is subtracted from 'Start' only;
# 'Stop' is used as given.
mycobrowser_df = pd.read_excel(literature_datasets_dir + '/Mycobrowser_Release_4.xlsx')
mycobrowser_loci = sorted(
    (
        [row['Locus'], row['Feature'], int(row['Start']) - 1, int(row['Stop'])]
        for _, row in mycobrowser_df.iterrows()
    ),
    key=lambda locus_entry: locus_entry[2],
)

In [50]:
# Scan the start-sorted Mycobrowser loci and record the unannotated stretch that
# precedes each CDS, provided it is longer than min_nts.
# Regions are named '<gap start>_<gap end>' using the same coordinates that slice
# full_sequence.
mycobrowser_annotation_intergenic_regions = []
max_stop = 0  # furthest stop coordinate of any locus visited so far
for locus, feature_type, start, stop in mycobrowser_loci:
    if start > max_stop + min_nts and feature_type == 'CDS':
        mycobrowser_annotation_intergenic_regions.append(
            [str(max_stop) + '_' + str(start), full_sequence[max_stop:start]]
        )
    # Every locus type (not only CDS) advances the covered boundary.
    max_stop = max(max_stop, stop)
util.produce_fasta_file(mycobrowser_annotation_intergenic_regions, project_dir + '/mycobrowser_annot_tb_intergenic_regions.faa')

100%|██████████| 2177/2177 [00:00<00:00, 83771.41it/s]


In [9]:
# NOTE(review): min_nts is already assigned the same value (30) in an earlier cell;
# this repeat assignment looks redundant — confirm and keep a single definition.
min_nts = 30

##### Extract full sequences from each organism and create a directory of starts and stops for each annotated CDS (use Mycobrowser for MTb)

##### Create BLAST databases

In [12]:
if full_run:
    # Build a nucleotide BLAST database from the reference FASTA inside project_dir.
    # The command runs with cwd=project_dir rather than changing the notebook's own
    # working directory, so nothing needs restoring if the call fails.
    makeblastdb_result = subprocess.run(
        ['makeblastdb',
         '-in', 'full_tb_sequence.fasta',
         '-dbtype', 'nucl',
         '-out', 'full_tb_sequence_nt'],
        cwd=project_dir,
        capture_output=True,
        text=True,
    )
    # Surface failures instead of silently discarding the captured output;
    # later cells depend on this database existing.
    if makeblastdb_result.returncode != 0:
        raise RuntimeError(
            'makeblastdb failed with exit code ' + str(makeblastdb_result.returncode) + ':\n'
            + makeblastdb_result.stdout + makeblastdb_result.stderr
        )

In [39]:
if full_run:
    # Directory in which the hit tables are written (and from which they are read back).
    blast_work_dir = 'F:/Datasets/BLAST/full_tb_sequence_nt'
    os.makedirs(blast_work_dir, exist_ok=True)

    # Comma-separated output (format 10) with an explicit field list.
    blast_outfmt = '10 qaccver saccver qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore'

    # (query FASTA written to project_dir by earlier cells, hit table written to blast_work_dir)
    blast_jobs = [
        ('std_annot_tb_intergenic_regions.faa', 'std_annot_intergenic_mtb_mtb_hits.csv'),
        ('mycobrowser_annot_tb_intergenic_regions.faa', 'mycobrowser_annot_intergenic_mtb_mtb_hits.csv'),
    ]

    for query_file, hits_file in blast_jobs:
        # The query files and the BLAST database are created under project_dir, so
        # they are addressed by full path; only the output is relative to the working
        # directory.
        # NOTE(review): previously both were resolved relative to blast_work_dir —
        # confirm no separately maintained copies there were intended instead.
        blastn_result = subprocess.run(
            ['blastn',
             '-query', project_dir + '/' + query_file,
             '-db', project_dir + '/full_tb_sequence_nt',
             '-out', hits_file,
             '-evalue', '10',
             '-outfmt', blast_outfmt,
             '-num_threads', '16'],
            cwd=blast_work_dir,
            capture_output=True,
            text=True,
        )
        # Fail loudly: a silent failure would leave a stale or missing hit table
        # for the cells that read it.
        if blastn_result.returncode != 0:
            raise RuntimeError(
                'blastn failed for ' + query_file + ' with exit code '
                + str(blastn_result.returncode) + ':\n'
                + blastn_result.stdout + blastn_result.stderr
            )

In [84]:
# Load the header-less BLAST hit table for the RefSeq-annotation intergenic regions
# and label its 14 columns in the order requested by the blastn -outfmt field list.
blast_hits_csv_path = 'F:/Datasets/BLAST/full_tb_sequence_nt/std_annot_intergenic_mtb_mtb_hits.csv'
blast_column_names = [
    'query_ref',
    'target_ref',
    'query_length',
    'subject_length',
    'percent_identical_matches',
    'alignment_length',
    'number_mismatches',
    'number_of_gap_openings',
    'query_start_alignment',
    'query_end_alignment',
    'target_start_alignment',
    'target_end_alignment',
    'e_value',
    'bit_score',
]
blast_results = pd.read_csv(blast_hits_csv_path, header=None)
blast_results.columns = blast_column_names

In [85]:
# Attach to every hit row the number of hits its query produced
# (count of non-null target_ref values within the same query_ref group).
hits_per_query = blast_results.groupby('query_ref')['target_ref'].transform('count')
blast_results['hit_count'] = hits_per_query

In [86]:
# Keep only rows whose query produced more than one hit.
# .copy() makes the filtered frame independent of the unfiltered one, so the later
# cell-wise writes (blast_results.at[...]) do not depend on the chained-assignment
# warning being switched off.
blast_results = blast_results[blast_results['hit_count'] > 1].copy()

In [87]:
# For every BLAST hit, find Mycobrowser loci lying entirely inside the subject
# interval and store the locus name in 'target_feature'.
#
# Coordinate systems:
#   * BLAST sstart/send are 1-based and inclusive, and are reversed for minus-strand
#     hits, hence the min/max below.
#   * mycobrowser_loci holds 0-based starts (built as Start - 1) and unmodified stops.
# The hit start is therefore shifted to 0-based before comparing; without the shift a
# locus beginning on the first base of the hit is wrongly excluded.
#
# NOTE(review): when several loci fall inside one hit, each overwrites the previous
# one, so only the last locus (highest start) is kept — confirm this is intended.
for i, r in blast_results.iterrows():
    start1 = min(r['target_start_alignment'], r['target_end_alignment'])
    end1 = max(r['target_start_alignment'], r['target_end_alignment'])
    hit_start_zero_based = start1 - 1
    for (locus, feature_type, start, stop) in mycobrowser_loci:
        if start >= hit_start_zero_based and stop <= end1:
            print(start, start1, stop, end1, locus)
            blast_results.at[i, 'target_feature'] = locus

177235 176953 177285 177542 MTB000104
958458 958458 958509 958517 MTB000120
786020 786012 786074 786148 MTB000116
958458 958456 958509 958522 MTB000120
958458 958458 958509 958517 MTB000120
1893576 1893563 1894230 1895724 Rv1667c
1894223 1893563 1895342 1895724 Rv1668c
2030346 2030204 2030643 2030693 Rv1792
2626222 2626221 2626519 2626534 Rv2347c
2577107 2577099 2577701 2577850 Rv2306A
1340577 1340525 1340625 1340658 MTB000075
2637687 2636932 2639535 2639672 Rv2356c
3189582 3189398 3190152 3190700 Rv2879c
3189850 3189398 3190678 3190700 Rv2880c
177235 177201 177285 177300 MTB000104
3378328 3378244 3378415 3378710 Rv3018A
3690940 3690939 3691059 3691140 MTB000153
3723655 3723632 3724042 3724614 Rv3337
3723903 3723632 3724548 3724614 Rv3338
958458 958456 958509 958514 MTB000120
3841713 3841533 3842076 3842238 Rv3424c
177235 177230 177285 177296 MTB000104
4215880 4215776 4216063 4216403 Rv3770A
4216077 4215776 4216269 4216403 Rv3770B
177235 177204 177285 177304 MTB000104


In [88]:
# Save the annotated multi-hit BLAST table to the project directory for inspection.
annotated_hits_csv_path = project_dir + '/test.csv'
blast_results.to_csv(annotated_hits_csv_path)

In [65]:
# Inspect the last five entries of the Mycobrowser loci list.
mycobrowser_loci[-5:]

[['MTB000046', 'tRNA', 4168344, 4168430],
 ['MTB000047', 'tRNA', 4199130, 4199217],
 ['MTB000048', 'tRNA', 4216864, 4216937],
 ['MTB000049', 'tRNA', 4216967, 4217056],
 ['MTB000050', 'tRNA', 4222580, 4222667]]

In [70]:
# Display the complete Mycobrowser loci list.
# NOTE(review): this prints every locus; consider showing a slice (e.g. the first few
# entries) to keep the saved notebook output small.
mycobrowser_loci

[['Rv0001', 'CDS', 0, 1524],
 ['Rv0002', 'CDS', 2051, 3260],
 ['Rv0003', 'CDS', 3279, 4437],
 ['Rv0004', 'CDS', 4433, 4997],
 ['Rv0005', 'CDS', 5239, 7267],
 ['Rv0006', 'CDS', 7301, 9818],
 ['Rv0007', 'CDS', 9913, 10828],
 ['MTB000001', 'tRNA', 10886, 10960],
 ['MTB000002', 'tRNA', 11111, 11184],
 ['Rv0008c', 'CDS', 11873, 12311],
 ['Rv0009', 'CDS', 12467, 13016],
 ['Rv0010c', 'CDS', 13132, 13558],
 ['Rv0011c', 'CDS', 13713, 13995],
 ['Rv0012', 'CDS', 14088, 14877],
 ['Rv0013', 'CDS', 14913, 15612],
 ['Rv0014c', 'CDS', 15589, 17470],
 ['Rv0015c', 'CDS', 17466, 18762],
 ['Rv0016c', 'CDS', 18758, 20234],
 ['Rv0017c', 'CDS', 20230, 21640],
 ['Rv0018c', 'CDS', 21636, 23181],
 ['Rv0019c', 'CDS', 23269, 23737],
 ['Rv0020c', 'CDS', 23860, 25444],
 ['MTB000003', 'tRNA', 25643, 25726],
 ['Rv0021c', 'CDS', 25912, 26881],
 ['Rv0022c', 'CDS', 27022, 27442],
 ['Rv0023', 'CDS', 27594, 28365],
 ['Rv0024', 'CDS', 28361, 29207],
 ['Rv0025', 'CDS', 29244, 29607],
 ['Rv0026', 'CDS', 29721, 31068],
 ['Rv0