##### Set up packages and directories

In [1]:
# Switch for the expensive external steps: the `if full_run == True:` cells further
# down only shell out to makeblastdb / blastn when this is set.
full_run = True

In [196]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import ORF_Functions as orffn
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

In [5]:
# Root folder for this project's generated files (the FASTA outputs are written directly under it).
project_dir = 'F:/Project_Data/Project_10'
# Folder of datasets taken from publications; the Mycobrowser spreadsheet is read from here.
literature_datasets_dir = 'F:/Datasets/Data_From_Publications'
output_dir = project_dir + '/Output'
# RefSeq download root: the code below reads <refseq_dir>/<assembly accession>/genomic.gbff.
refseq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
# NOTE(review): num_cores / core_numbers are not used in the cells visible here —
# presumably intended for joblib parallel runs; confirm before removing.
num_cores = 8
core_numbers = list(range(1, num_cores+1))  # 1, 2, ..., num_cores

In [164]:
# Assembly accessions; each one names a sub-folder of refseq_dir.
# NOTE(review): presumably the M. tuberculosis H37Rv reference assembly — confirm.
reference_species = 'GCF_000195955.2'
# Reported as "Mycobacterium marinum" (NZ_CP058277.1) by the lookup loop at the end of this notebook.
marinum_species = 'GCF_016745295.1'

In [165]:
# Length threshold in nucleotides: a gap before a CDS is only exported as an
# intergenic region when the CDS starts more than min_nts past the furthest
# stop seen so far (`start > max_stop + min_nts`).
min_nts = 30

In [189]:
def produce_full_sequence_and_intergenic_region_fasta(species):
    """Write the genome FASTA and the intergenic-region FASTA for one assembly.

    Reads ``<refseq_dir>/<species>/genomic.gbff``, collects every feature that is
    not of type 'source' or 'gene', has start < end and spans fewer than
    1,000,000 positions, and then walks those features in start order.  Each time
    a CDS starts more than ``min_nts`` past the furthest stop seen so far, the
    sequence between that stop and the CDS start is kept, keyed as
    ``"<gap start>_<gap end>"``.

    Two files are produced through ``util.produce_fasta_file``:
      * ``<project_dir>/full_<species>_sequence.fasta`` (one entry, keyed by
        accession.version)
      * ``<project_dir>/<species>_intergenic_regions.faa``

    Parameters
    ----------
    species : str
        Name of the assembly sub-folder under ``refseq_dir``.

    NOTE(review): when the GenBank file holds several records, features from all
    of them are pooled, while the sequence and accession come from the last
    record only — confirm that the inputs are single-record genomes.
    """
    genbank_path = refseq_dir + '/'+species+'/genomic.gbff'
    annotated_spans = []
    for genome_record in SeqIO.parse(genbank_path, "genbank"):
        record_info = genome_record.annotations
        accession_ver = record_info['accessions'][0] + '.' + str(record_info['sequence_version'])
        full_sequence = str(genome_record.seq)
        for feat in genome_record.features:
            if feat.type in ['source','gene']:
                continue
            feat_start = int(feat.location.start)
            feat_end = int(feat.location.end)
            # Skip zero/negative-length features and implausibly long spans.
            if feat_start < feat_end and (feat_end - feat_start) < 1000000:
                annotated_spans.append((feat.type, feat_start, feat_end))

    annotated_spans.sort(key = lambda span: span[1])

    intergenic_entries = []
    covered_until = 0  # furthest stop coordinate reached by any feature so far
    for (span_type, span_start, span_end) in annotated_spans:
        gap_long_enough = span_start > covered_until + min_nts
        if gap_long_enough and span_type == 'CDS':
            gap_id = str(covered_until)+'_'+str(span_start)
            intergenic_entries.append([gap_id, full_sequence[covered_until: span_start]])
        covered_until = max(covered_until, span_end)

    util.produce_fasta_file([[accession_ver, full_sequence]], project_dir + '/full_'+species+'_sequence.fasta')
    util.produce_fasta_file(intergenic_entries, project_dir + '/'+species+'_intergenic_regions.faa')

In [203]:
def make_blast_sequence_db(species):
    """Build a nucleotide BLAST database for one genome and file it away.

    Runs ``makeblastdb`` on ``<project_dir>/full_<species>_sequence.fasta`` from
    inside ``project_dir``, then moves every file listed by
    ``util.list_files(project_dir)`` whose name contains ``species`` into
    ``F:/Datasets/BLAST/Self_Blast/<species>`` (created if necessary).

    Parameters
    ----------
    species : str
        Assembly identifier used in the FASTA / database file names.

    Raises
    ------
    RuntimeError
        If the shell command exits with a non-zero status; previously the
        failure was discarded and later steps could pick up a stale database.
    """
    command = 'cd '+ project_dir + ' &  makeblastdb -in ' + project_dir + '/full_'+species+'_sequence.fasta' +' -dbtype nucl -out full_'+species+'_sequence_nt'
    w_d = os.getcwd()
    # Switch to the F: drive first so that the `cd` inside the shell command takes effect.
    os.chdir("F:/")
    try:
        completed = subprocess.run(command, shell=True, capture_output = True)
    finally:
        # Always hand the caller back its original working directory.
        os.chdir(w_d)
    if completed.returncode != 0:
        raise RuntimeError('makeblastdb failed for ' + species + ': ' + completed.stderr.decode(errors='replace'))

    destination_dir = 'F:/Datasets/BLAST/Self_Blast/' + species
    os.makedirs(destination_dir, exist_ok=True)
    for file_name in util.list_files(project_dir):
        if species in file_name:
            shutil.move(project_dir+'/'+file_name, destination_dir +'/' + file_name)

In [231]:
def _overlapping_regions(hit_start, hit_end, regions):
    """Return [locus, feature, fraction] for every region that overlaps a hit.

    All coordinates are 0-based with an exclusive end.  ``fraction`` is the
    share of the hit's length that lies inside the region.
    """
    hit_length = hit_end - hit_start
    return [
        [locus, feature, (min(hit_end, stop) - max(hit_start, start)) / hit_length]
        for (locus, feature, start, stop) in regions
        if start < hit_end and stop > hit_start
    ]


def run_self_blast(species):
    """BLAST a genome's intergenic regions against the genome and annotate the hits.

    Steps:
      1. Run ``blastn`` inside ``F:/Datasets/BLAST/Self_BLAST/<species>`` with
         ``<species>_intergenic_regions.faa`` as query and
         ``full_<species>_sequence_nt`` as database, writing ``hits.csv``.
      2. Load the hits and keep only queries that have more than one hit.
      3. Read the annotated features from ``<refseq_dir>/<species>/genomic.gbff``
         (same filter as the FASTA export: not 'source'/'gene', start < end,
         span < 1,000,000) and add an 'Intergenic' pseudo-feature for every gap
         between them, labelled ``"<previous locus>:<next locus>"``.
      4. For every hit, store in ``annot_features`` the list of
         ``[locus, feature type, fraction of the hit covered]`` for each
         overlapping region, and write ``processed_hits.csv``.

    Parameters
    ----------
    species : str
        Assembly identifier used in folder and file names.

    Raises
    ------
    RuntimeError
        If the blastn shell command exits with a non-zero status (otherwise a
        stale ``hits.csv`` from an earlier run could be processed silently).
    """
    blast_command = 'cd f:\\Datasets\\BLAST\\Self_BLAST\\' + species + ' & blastn -query ' + species + '_intergenic_regions.faa -db full_'+species+'_sequence_nt -out hits.csv -evalue 10 -outfmt "10 qaccver saccver qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore" -num_threads 16'
    w_d = os.getcwd()
    # Switch to the F: drive first so that the `cd` inside the shell command takes effect.
    os.chdir("F:/")
    try:
        completed = subprocess.run(blast_command, shell=True, capture_output = True)
    finally:
        # Always hand the caller back its original working directory.
        os.chdir(w_d)
    if completed.returncode != 0:
        raise RuntimeError('blastn failed for ' + species + ': ' + completed.stderr.decode(errors='replace'))

    blast_results = pd.read_csv('F:/Datasets/BLAST/Self_BLAST/' + species + '/hits.csv', header = None)
    blast_results.columns = ['query_ref', 'target_ref', 'query_length', 'subject_length', 'percent_identical_matches','alignment_length', 'number_mismatches', 'number_of_gap_openings', 
                             'query_start_alignment', 'query_end_alignment', 'target_start_alignment', 'target_end_alignment', 'e_value', 'bit_score']
    blast_results['hit_count'] = blast_results.groupby('query_ref')['target_ref'].transform('count')
    # Keep only queries with more than one hit; copy so the column
    # assignments below act on an independent frame.
    blast_results = blast_results[blast_results['hit_count'] > 1].copy()

    annotated_regions = []
    for record in SeqIO.parse(refseq_dir + '/'+species+'/genomic.gbff', "genbank"):
        for feature in record.features:
            if feature.type in ['source','gene']:
                continue
            start, stop = int(feature.location.start), int(feature.location.end)
            if start < stop and (stop - start) < 1000000:
                # Features without a locus_tag qualifier get an empty label.
                locus_tag = feature.qualifiers.get("locus_tag", [''])[0]
                annotated_regions.append((locus_tag, feature.type, start, stop))
    annotated_regions.sort(key = lambda x: x[2])

    # Walk the features in start order and record each uncovered gap between them.
    intergenic_regions = []
    prev_locus = ''
    max_stop = 0
    for (locus, feature_type, start, stop) in annotated_regions:
        if start > max_stop:
            intergenic_regions.append([prev_locus+':'+locus,'Intergenic',max_stop, start])
        if stop > max_stop:
            prev_locus = locus
        max_stop = max(max_stop, stop)
    annotated_regions.extend(intergenic_regions)
    annotated_regions.sort(key = lambda x : x[2])

    blast_results['annot_features']=''
    for i, r in blast_results.iterrows():
        # BLAST reports sstart/send 1-based and inclusive (and reversed for
        # minus-strand hits); the feature coordinates above are 0-based with an
        # exclusive end, so shift the lower bound down by one before comparing.
        hit_start = min(r['target_start_alignment'],r['target_end_alignment']) - 1
        hit_end = max(r['target_start_alignment'],r['target_end_alignment'])
        blast_results.at[i,'annot_features'] = _overlapping_regions(hit_start, hit_end, annotated_regions)
    blast_results.to_csv('F:/Datasets/BLAST/Self_BLAST/' + species + '/processed_hits.csv')

In [235]:
# Run the three pipeline stages, in order, for the reference genome and then for
# the marinum genome: export FASTA files, build the BLAST database, self-BLAST.
for species_accession in (reference_species, marinum_species):
    produce_full_sequence_and_intergenic_region_fasta(species_accession)
    make_blast_sequence_db(species_accession)
    run_self_blast(species_accession)

100%|██████████| 1/1 [00:00<00:00, 10.87it/s]
100%|██████████| 2108/2108 [00:00<00:00, 79015.46it/s]
100%|██████████| 1/1 [00:00<00:00,  7.98it/s]
100%|██████████| 3159/3159 [00:00<00:00, 66698.24it/s]


In [69]:
# Load the Mycobrowser annotation table and turn it into
# [locus, feature type, start - 1, stop] entries ordered by start coordinate.
mycobrowser_df = pd.read_excel(literature_datasets_dir+'/Mycobrowser_Release_4.xlsx')
mycobrowser_loci = [
    [locus_name, feature_name, int(first_pos)-1, int(last_pos)]
    for locus_name, feature_name, first_pos, last_pos in zip(
        mycobrowser_df['Locus'], mycobrowser_df['Feature'], mycobrowser_df['Start'], mycobrowser_df['Stop']
    )
]
mycobrowser_loci.sort(key = lambda entry: entry[2])

In [114]:
# Walk the Mycobrowser loci in start order and build two lists:
#   * mycobrowser_annotation_intergenic_regions: [id, sequence] pairs for gaps longer
#     than min_nts that precede a CDS (written to FASTA below);
#   * all_mycobrowser_regions: [label, start, stop] for every locus plus every gap
#     between loci (gaps are labelled "<previous locus>_<next locus>").
# NOTE(review): full_sequence is only assigned inside
# produce_full_sequence_and_intergenic_region_fasta in the code shown here, so this
# cell presumably relies on a module-level variable from earlier notebook state — confirm.
mycobrowser_annotation_intergenic_regions = []
all_mycobrowser_regions = []
max_stop = 0  # furthest stop coordinate reached by any locus so far
prev_locus = ''  # locus that set the current max_stop
for n, (locus, feature_type, start, stop) in enumerate(mycobrowser_loci):
    all_mycobrowser_regions.append([locus, start, stop])
    if start > max_stop + min_nts and feature_type == 'CDS':
        mycobrowser_annotation_intergenic_regions.append([str(max_stop)+'_'+str(start), full_sequence[max_stop: start]])
    if start > max_stop:
        all_mycobrowser_regions.append([prev_locus+'_'+locus,max_stop, start])
    if stop > max_stop:
        prev_locus = locus 
    max_stop = max(max_stop, stop)
util.produce_fasta_file(mycobrowser_annotation_intergenic_regions, project_dir + '/mycobrowser_annot_tb_intergenic_regions.faa')

100%|██████████| 2177/2177 [00:00<00:00, 84545.51it/s]


In [96]:
# Order loci and gap entries together by their start coordinate (element 1), in place.
all_mycobrowser_regions.sort(key = lambda x: x[1])

In [9]:
# Re-assigns the same threshold value as the earlier `min_nts = 30` cell.
min_nts = 30

##### Extract the full sequence of each organism and create a directory of start and stop positions for each annotated CDS (use Mycobrowser for MTb)

##### Create blast databases

In [12]:
# On a full run, build the nucleotide BLAST database from full_tb_sequence.fasta
# inside project_dir.
if full_run:
    makeblastdb_command = 'cd '+ project_dir + ' &  makeblastdb -in full_tb_sequence.fasta -dbtype nucl -out full_tb_sequence_nt'
    w_d = os.getcwd()
    os.chdir("F:/")  # get onto the F: drive before the shell-level `cd`
    subprocess.run(makeblastdb_command, shell=True, capture_output = True)
    os.chdir(w_d)

In [39]:
# On a full run, BLAST both intergenic-region sets (standard annotation, then
# Mycobrowser annotation) against the full_tb_sequence_nt database.
if full_run:
    blast_commands = (
        'cd f:\\Datasets\\BLAST\\full_tb_sequence_nt & blastn -query std_annot_tb_intergenic_regions.faa -db full_tb_sequence_nt -out std_annot_intergenic_mtb_mtb_hits.csv -evalue 10 -outfmt "10 qaccver saccver qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore" -num_threads 16',
        'cd f:\\Datasets\\BLAST\\full_tb_sequence_nt & blastn -query mycobrowser_annot_tb_intergenic_regions.faa -db full_tb_sequence_nt -out mycobrowser_annot_intergenic_mtb_mtb_hits.csv -evalue 10 -outfmt "10 qaccver saccver qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore" -num_threads 16',
    )
    w_d = os.getcwd()
    os.chdir("F:/")  # get onto the F: drive before the shell-level `cd`
    for blast_command in blast_commands:
        subprocess.run(blast_command, shell=True, capture_output = True)
    os.chdir(w_d)

In [125]:
# Column labels for the header-less BLAST CSV, in the order of the fields
# requested via -outfmt in the blastn call.
blast_hit_columns = [
    'query_ref', 'target_ref', 'query_length', 'subject_length',
    'percent_identical_matches', 'alignment_length', 'number_mismatches',
    'number_of_gap_openings', 'query_start_alignment', 'query_end_alignment',
    'target_start_alignment', 'target_end_alignment', 'e_value', 'bit_score',
]
blast_results = pd.read_csv('F:/Datasets/BLAST/full_tb_sequence_nt/mycobrowser_annot_intergenic_mtb_mtb_hits.csv', header = None)
blast_results.columns = blast_hit_columns

In [126]:
# Attach to every hit the number of hits recorded for its query region.
blast_results['hit_count'] = blast_results.groupby('query_ref')['target_ref'].transform('count')

In [127]:
# Keep only queries with more than one hit.
blast_results = blast_results[blast_results['hit_count'] > 1]

In [132]:
# Annotate every BLAST hit with the regions its target interval overlaps, and drop
# queries that hit anything labelled as a repeat or mobile feature.
#   target_locations: [label, overlap fraction] pairs taken from all_mycobrowser_regions
#   annot_features:   [feature, overlap fraction] pairs taken from tb_loci
# NOTE(review): tb_loci is not defined in the cells shown here — presumably a list of
# (feature, start, stop) tuples left over from earlier notebook state; confirm.
# NOTE(review): the Mycobrowser starts were shifted by -1 when loaded, whereas the
# BLAST target coordinates are used unshifted here — check the two coordinate
# conventions agree.
repeat_regions = []
blast_results['target_locations']=''
blast_results['annot_features']=''
for i, r in blast_results.iterrows():
    # Order the target coordinates so that start1 <= end1.
    start1 = min(r['target_start_alignment'],r['target_end_alignment'])
    end1 = max(r['target_start_alignment'],r['target_end_alignment'])
    matches = []
    for (locus, start, stop) in all_mycobrowser_regions:
        if start< end1 and stop > start1:
            # Overlap length divided by (end1 - start1).
            overlap = (min(end1, stop) - max(start1, start))/ (end1-start1)
            matches.append([locus, overlap])
    blast_results.at[i,'target_locations'] = matches
    matches = []
    for (feature, start, stop) in tb_loci:
        if start< end1 and stop > start1:
            overlap = (min(end1, stop) - max(start1, start))/ (end1-start1)
            matches.append([feature, overlap])
            # Remember the query so that all of its hits can be excluded below.
            if ('repeat' in feature) or ('mobile' in feature):
                repeat_regions.append(r['query_ref'])
    blast_results.at[i,'annot_features'] = matches
repeat_regions = list(set(repeat_regions))  # de-duplicate
blast_results_2 = blast_results.query("not(query_ref.isin(@repeat_regions))")

In [133]:
# Save the repeat-filtered hits for inspection.
blast_results_2.to_csv(project_dir + '/test.csv')

In [134]:
# Load the reciprocal-best-hits table from the Intergenic_Region_Comparative_Analysis project folder.
rbh = pd.read_csv('F:/Project_Data/Intergenic_Region_Comparative_Analysis' + '/reciprocal_best_hits.csv')

In [161]:
# Show the reciprocal-best-hit rows for query NC_000962.3@Rv3491 whose target
# species name contains 'arinum'.
#rbh.query("query_ref == 'NC_000962.3@Rv3490' and target_species_name.str.contains('egmatis')")
rbh.query("query_ref == 'NC_000962.3@Rv3491' and target_species_name.str.contains('arinum')")

Unnamed: 0.1,Unnamed: 0,query_ref,target_ref,query_length,subject_length,percent_identical_matches,alignment_length,number_mismatches,number_of_gap_openings,query_start_alignment,...,target_start_alignment,target_end_alignment,e_value,bit_score,query_species,target_species,query_species_name,target_species_name,species_count,reciprocal_best_hit
479926,1007753,NC_000962.3@Rv3491,NZ_CP058277.1@HXW97_RS03760,192,193,75.13,193,47,1,1,...,1,193,7.92e-110,315.0,NC_000962.3,NZ_CP058277.1,Mycobacterium tuberculosis H37Rv,Mycobacterium marinum,190,Y


In [154]:
# Entries of the RefSeq data folder as returned by util.list_dirs; the loop below
# treats each one as an assembly sub-folder containing genomic.gbff.
a = util.list_dirs('F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data')

In [162]:
# Scan the assembly folders and report the first one whose leading GenBank record
# has an organism name containing 'arinum', then stop.
for assembly_id in a:
    gbff_path = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data' + '/'+str(assembly_id)+'/genomic.gbff'
    record = next(SeqIO.parse(gbff_path, "genbank"))
    name = record.annotations['organism']
    accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
    if 'arinum' not in name:
        continue
    print(assembly_id, accession_ver, name)
    break

GCF_016745295.1 NZ_CP058277.1 Mycobacterium marinum
