In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil

In [2]:
# ---- Run switch -------------------------------------------------------------
full_run = True

# ---- Project folders (Windows-style paths) ----------------------------------
project_dir = 'D:/Project_Data/Project_8'
datasets_dir = f'{project_dir}/Datasets'
output_dir = f'{project_dir}/Output'
# util.wslname presumably maps a Windows path to its WSL equivalent -- TODO confirm.
wsl_output_loc = util.wslname(output_dir)

# ---- Genome collection and BLAST database -----------------------------------
seq_dir = 'D:/Actinobacteria_Ref_Rep_Lev_Complete'
blast_dir = 'D:/BLAST/actinobacteria_ref_rep_comp'
blast_db_name = 'actinobacteria_ref_rep_comp'

# ---- Parallelism ------------------------------------------------------------
num_cores = 16
core_numbers = [*range(1, num_cores + 1)]   # 1 .. num_cores inclusive

# ---- Reference genome and the files found in seq_dir ------------------------
reference_species_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
species_list = util.list_files(seq_dir)

# ---- Thresholds -------------------------------------------------------------
# hmmer_evalue is handed to the blastfn HMMER helpers further down;
# infernal_evalue is not referenced by the cells visible here.
hmmer_evalue = 1e-20
infernal_evalue = 1e-10

##### Generate a table of all intergenic regions (longer than 50 bp) in the reference organism

In [3]:
# Build `intergenic_df`: one row per gap between annotated features of the
# reference genome, keeping only gaps longer than `min_intergenic_length`.
min_intergenic_length = 50

# Reference genome: first record of the GenBank file, flattened to a plain string.
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
full_sequence = str(genome_record.seq)

# Annotation table; the columns read below are Locus, Start, Stop and Strand.
mycobrowser_df = pd.read_excel(datasets_dir+'/Mycobrowser_Release_4.xlsx')

# One [locus, start, stop, strand] entry per annotated feature, ordered by start.
# Start is shifted by -1, presumably to turn 1-based inclusive coordinates into
# Python slice coordinates -- TODO confirm against the Mycobrowser release.
features = []
for i, r in mycobrowser_df.iterrows():
    strand = 1 if r['Strand'] == '+' else -1
    features.append([r['Locus'], r['Start']-1, r['Stop'], strand])
features.sort(key=lambda x: x[1])

# Walk the features in start order, remembering the furthest end coordinate seen
# so far (`covered_end`) and the locus it belongs to (`upstream_locus`).
# Comparing the next start with this running maximum, rather than with the
# current feature's own end, stops a gap that follows a gene nested inside a
# longer gene from being reported as intergenic while the longer gene still
# covers it.  Without nesting the result matches a plain neighbour comparison.
feature_info = []
upstream_locus = None
covered_end = None
for i, feature in enumerate(features):
    # Entries whose start is not before their stop are skipped as "current"
    # features; they still bound the gap of the preceding feature.
    if feature[1] >= feature[2]:
        continue
    if covered_end is None or feature[2] > covered_end:
        upstream_locus, covered_end = feature[0], feature[2]
    if i + 1 >= len(features):
        continue   # last feature: nothing downstream to measure against
    next_start = features[i+1][1]
    if covered_end >= next_start:
        continue   # next feature abuts or overlaps the covered stretch
    intergenic_sequence = full_sequence[covered_end: next_start]
    intergenic_length = len(intergenic_sequence)
    if intergenic_length > min_intergenic_length:
        # Region is named after the feature that ends immediately upstream of it.
        feature_info.append([upstream_locus + '_IG', covered_end, next_start, intergenic_sequence, intergenic_length])
intergenic_df = pd.DataFrame(feature_info, columns = ['Locus', 'Start' , 'End', 'Sequence', 'Length'])

##### Run HMMER iteratively (initially against the sequence, then four times against the fitted HMM, keeping only the top hit per species). For the last iteration keep all hits, then run R-scape

In [None]:
# For every intergenic region: search it with nhmmer, then repeatedly
# (filter to top hit per species -> build HMM -> search with the HMM), and
# finally run R-scape on the alignment left by the last search.
query_file = 'intergenic_region.faa'
model_file = 'hmm.hmm'
num_refinement_rounds = 4   # HMM build/search rounds after the initial sequence search

for i, r in intergenic_df.iterrows():
    sequence_list = [[r['Locus'], r['Sequence']]]
    locus_id = r['Locus']
    print(locus_id)

    # One output directory per locus; exist_ok makes re-runs safe.
    results_dir = output_dir + '/' + locus_id
    os.makedirs(results_dir, exist_ok=True)

    util.produce_fasta_file(sequence_list, results_dir + '/' + query_file)
    blastfn.nhmmer_search_sequence(results_dir, query_file, datasets_dir, 'merged_file.txt', 'align.sto', 'output.txt', 'summary.txt', hmmer_evalue)

    # A missing summary.txt is treated as "nothing to refine" for this locus.
    # NOTE(review): a summary.txt left over from an earlier run would also pass
    # this check -- confirm whether the blastfn helpers clear it first.
    if not os.path.exists(results_dir +'/summary.txt'):
        continue

    # The round counter gets its own name so it no longer overwrites the
    # DataFrame index `i` of the outer loop.
    for refinement_round in range(num_refinement_rounds):
        blastfn.align_keep_top_hit_per_species(results_dir, 'summary.txt', 'align.sto', 'align.sto', 'HMMER', hmmer_evalue)
        blastfn.hmmer_build(results_dir, 'align.sto' ,model_file)
        # Alternative search target noted by the author: seq_dir, 'full_sequences.faa'
        blastfn.nhmmer_search_model(results_dir, model_file, datasets_dir, 'merged_file.txt', 'align.sto', 'output.txt', 'summary.txt', hmmer_evalue)
        if not os.path.exists(results_dir +'/summary.txt'):
            break   # abandon this locus; the else-branch below is skipped
    else:
        # Reached only when every round completed without a break.
        blastfn.run_rscape(results_dir, 'align.sto', locus_id)

In [4]:
# Single-locus subset of the intergenic table used for trial runs.
test = intergenic_df.loc[intergenic_df['Locus'] == 'Rv1173_IG']

In [5]:
# Locus whose output directory the manual WSL cells further down operate on.
# It is read from the `test` selection instead of from `r`: `r` is only a loop
# variable left behind by whichever loop ran last, so its value depended on
# the order in which cells had been executed.
locus_id = test['Locus'].iloc[0]

In [None]:
# Trial pipeline on the `test` selection: one nhmmer search seeds an alignment,
# then several rounds of (build + calibrate CM -> search -> keep top hit per
# species -> R-scape), each later round starting from the previous round's
# R-scape CaCoFold alignment.
hmmer_eval =  1e-20
infernal_eval = 0.001
query_file = 'intergenic_region.faa'
hm_model_file = 'hmm.hmm'
cm_model_file = 'cm.cm'
num_infernal_rounds = 3

for i, r in test.iterrows():
    sequence_list = [[r['Locus'], r['Sequence']]]
    locus_id = r['Locus']
    print(locus_id)
    results_dir = output_dir + '/' + locus_id
    # Create the per-locus directory here so this cell does not rely on an
    # earlier cell having made it.
    os.makedirs(results_dir, exist_ok=True)

    util.produce_fasta_file(sequence_list, results_dir + '/' + query_file)

    # Seed alignment for round 1.
    blastfn.nhmmer_search_sequence(results_dir, query_file, datasets_dir, 'merged_file.txt', 'align_1.sto', 'output.txt', 'summary.txt', hmmer_eval)

    for round_num in range(1, num_infernal_rounds + 1):
        cm_file = f'cm_{round_num}.cm'
        search_alignment = f'search_{round_num}.sto'
        best_hit_alignment = f'search_bh_{round_num}.sto'

        if round_num == 1:
            # NOTE(review): the trailing False is passed only for the build
            # from the nhmmer alignment; its meaning is defined in
            # Blast_Functions -- confirm there.
            blastfn.infernal_build_and_calib(results_dir, 'align_1.sto', cm_file, False)
        else:
            blastfn.infernal_build_and_calib(results_dir, f'rscape_{round_num - 1}.cacofold.R2R.sto', cm_file)

        blastfn.infernal_search(results_dir, cm_file, datasets_dir, 'merged_file.txt', search_alignment, f'search_hits_{round_num}.txt', 'summary.txt', infernal_eval)
        blastfn.align_keep_top_hit_per_species(results_dir, 'summary.txt', search_alignment, best_hit_alignment, 'INFERNAL', infernal_eval)
        blastfn.run_rscape(results_dir, best_hit_alignment, f'rscape_{round_num}')

    # TODO: stop early for a locus when a search leaves no summary.txt, as the
    # HMMER refinement loop does.

Rv1173_IG


100%|██████████| 1/1 [00:00<?, ?it/s]


In [None]:
# Working directory of the locus under analysis, as a Windows-style path ...
w_analysis_directory = f'{output_dir}/{locus_id}'
# ... and the same directory plus the merged sequence file passed through
# util.wslname for use inside `wsl` command lines.
analysis_directory = util.wslname(w_analysis_directory)
merged_file_loc = util.wslname(f'{datasets_dir}/merged_file.txt')

In [None]:
# File names used inside each locus directory: query FASTA, HMM and CM.
query_file, hm_model_file, cm_model_file = (
    'intergenic_region.faa',
    'hmm.hmm',
    'cm.cm',
)

In [None]:
# Initial run with HMM: nhmmer searches the merged sequence file with the query
# and saves the alignment of hits (-A) as align_1.sto in the analysis directory.
nhmmer_command = (
    f'wsl cd {analysis_directory} ; '
    f'nhmmer --cpu 32 -A align_1.sto --incE 1e-20 {query_file} {merged_file_loc}'
)
subprocess.run(nhmmer_command, shell=True)

In [None]:
infernal_bin = '~/infernal-1.1.4/src'
rscape_bin = '~/rscape_v2.0.0.g/bin'


def run_in_analysis_dir(tool_command):
    """Run one command line through `wsl` after cd-ing into the analysis directory."""
    return subprocess.run('wsl cd ' + analysis_directory + ' ; ' + tool_command, shell=True)


# Round 1: CM built from the nhmmer alignment (cmbuild --noss), calibrated and
# searched against the merged sequence file; best hits then go to R-scape.
# NOTE(review): best_hit_per_species is not defined or imported in the visible
# part of this notebook -- confirm it exists before this cell runs.
run_in_analysis_dir(f'{infernal_bin}/cmbuild --noss -F cm_1.cm align_1.sto')
run_in_analysis_dir(f'{infernal_bin}/cmcalibrate cm_1.cm')
run_in_analysis_dir(f'{infernal_bin}/cmsearch  --tblout search_hits_1.txt -A search_1.sto cm_1.cm {merged_file_loc}')
best_hit_per_species(w_analysis_directory, 1)
run_in_analysis_dir(f'{rscape_bin}/R-scape --cacofold --outname rscape_1 search_bh_1.sto')

# Subsequent runs with Cacofold CM: each round builds its CM from the previous
# round's R-scape output.  Only round 2 runs at present; the author's note
# mentions (2,4) as the alternative range.
for iter_num in range(2, 3):
    cm_name = f'cm_{iter_num}.cm'
    previous_fold = f'rscape_{iter_num - 1}.cacofold.R2R.sto'
    run_in_analysis_dir(f'{infernal_bin}/cmbuild -F {cm_name} {previous_fold}')
    run_in_analysis_dir(f'{infernal_bin}/cmcalibrate {cm_name}')
    run_in_analysis_dir(f'{infernal_bin}/cmsearch --tblout search_hits_{iter_num}.txt -A search_{iter_num}.sto {cm_name} {merged_file_loc}')
    best_hit_per_species(w_analysis_directory, iter_num)
    run_in_analysis_dir(f'{rscape_bin}/R-scape --cacofold --outname rscape_{iter_num} search_bh_{iter_num}.sto')

In [None]:
# Stand-alone re-run of the round-1 CM build from align_1.sto.
cmbuild_command = f'wsl cd {analysis_directory} ; ~/infernal-1.1.4/src/cmbuild --noss -F cm_1.cm align_1.sto'
subprocess.run(cmbuild_command, shell=True)

In [None]:
# Stand-alone re-run of the round-1 cmsearch against the merged sequence file.
cmsearch_command = (
    f'wsl cd {analysis_directory} ; '
    f'~/infernal-1.1.4/src/cmsearch  --tblout search_hits_1.txt -A search_1.sto cm_1.cm {merged_file_loc}'
)
subprocess.run(cmsearch_command, shell=True)