In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil

In [10]:
# ---------------------------------------------------------------------------
# Notebook configuration: every path / tunable used by the cells below.
# ---------------------------------------------------------------------------

# Run switch (not referenced by the cells visible in this section).
full_run = True

# Parallelism: worker count and the 1-based list of worker ids.
num_cores = 16
core_numbers = [core for core in range(1, num_cores + 1)]

# Project tree: inputs live under Datasets, results under RScape_Local_Run_2.
project_dir = 'D:/Project_Data/Project_8'
datasets_dir = '/'.join((project_dir, 'Datasets'))
output_dir = '/'.join((project_dir, 'RScape_Local_Run_2'))
# Output directory as returned by util.wslname
# (presumably the WSL-style path for the same folder - confirm in Utilities).
wsl_output_loc = util.wslname(output_dir)

# Genome collection and BLAST database locations.
seq_dir = 'D:/Actinobacteria_Ref_Rep_Lev_Complete'
blast_dir = 'D:/BLAST/actinobacteria_ref_rep_comp'
blast_db_name = 'actinobacteria_ref_rep_comp'

# GenBank file of the reference organism, expected inside seq_dir.
reference_species_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
# Whatever util.list_files returns for the genome directory.
species_list = util.list_files(seq_dir)

##### Generate table of all intergenic regions in the reference organism

In [3]:
# Build `intergenic_df`: one row per annotated feature, holding the stretch of
# genome between that feature's Stop and the Start of the next feature (in
# start-coordinate order). Features whose successor begins at or before their
# Stop, and the last feature, get an empty region recorded as (0, 0, '', 0).

# Only the first record of the reference GenBank file is used.
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
full_sequence = str(genome_record.seq)
mycobrowser_df = pd.read_excel(datasets_dir+'/Mycobrowser_Release_4.xlsx')

# features: [locus, start, stop, strand] with strand encoded as +1 / -1.
# Start is shifted down by one (presumably converting 1-based inclusive
# annotation coordinates into a 0-based half-open Python slice - confirm).
features = []
for locus, start, stop, strand_symbol in zip(mycobrowser_df['Locus'],
                                             mycobrowser_df['Start'],
                                             mycobrowser_df['Stop'],
                                             mycobrowser_df['Strand']):
    strand = 1 if strand_symbol == '+' else -1
    features.append([locus, start - 1, stop, strand])
features.sort(key=lambda x: x[1])

# NOTE(review): only the immediately following feature is inspected, so a
# feature nested inside a longer one could yield a "gap" that still overlaps
# the longer feature. Confirm whether the annotation contains such cases.
feature_info = []
for position, (locus, start, stop, _strand) in enumerate(features):
    # Skip features whose coordinates are not strictly increasing.
    if not start < stop:
        continue
    has_successor = (position + 1) < len(features)
    if has_successor and stop < features[position + 1][1]:
        next_start = features[position + 1][1]
        gap_start, gap_end = stop, next_start
        gap_sequence = full_sequence[stop:next_start]
    else:
        # Overlapping/adjacent successor, or no successor at all.
        gap_start, gap_end = 0, 0
        gap_sequence = ''
    feature_info.append([locus, gap_start, gap_end, gap_sequence, len(gap_sequence)])

intergenic_df = pd.DataFrame(feature_info, columns = ['Locus', 'Start' , 'End', 'Sequence', 'Length'])

##### Run HMMER and INFERNAL iteratively

In [14]:
# Restrict the pipeline below to a single locus of interest.
selected_locus_mask = intergenic_df['Locus'].isin(['Rv0439c'])
test = intergenic_df[selected_locus_mask]

In [15]:
# Iterative homology search for each selected intergenic region:
#   1. nhmmer search with the raw sequence, then `num_hmmer_rounds` rounds of
#      (build HMM from best-hit alignment -> search -> keep top hit per species);
#   2. `num_infernal_rounds` rounds of (build covariance model -> search ->
#      keep top hit per species -> R-scape), each round seeded with the
#      previous round's best-hit alignment.
# All intermediate files are written to <output_dir>/<locus>/.
hmmer_eval =  1e-5
infernal_eval = 1e-5
query_file = 'intergenic_region.faa'
hm_model_file = 'hmm.hmm'
cm_model_file = 'cm.cm'

target_fasta = 'full_sequences.faa'   # search database, looked up in datasets_dir
min_region_length = 60                # shorter regions are not searched
num_hmmer_rounds = 4                  # HMM refinement rounds after the initial search
num_infernal_rounds = 3               # covariance-model rounds
max_search_hits_bytes = 1e7           # give up on a locus if the hit file exceeds this size

for _, r in test.iterrows():
    if r['Length'] < min_region_length:
        continue
    locus_id = r['Locus']
    sequence_list = [[locus_id, r['Sequence']]]
    print(locus_id)
    results_dir = output_dir + '/' + locus_id
    os.makedirs(results_dir, exist_ok=True)
    util.produce_fasta_file(sequence_list, results_dir + '/' + query_file)

    # --- HMMER: initial sequence search ---
    blastfn.nhmmer_search_sequence(results_dir, query_file, datasets_dir, target_fasta, 'align_0.sto', 'output.txt', 'hsummary.txt', hmmer_eval)
    # No summary file means there is nothing to build a model from.
    if not os.path.exists(results_dir + '/hsummary.txt'):
        continue
    blastfn.align_keep_top_hit_per_species(results_dir, 'hsummary.txt', 'align_0.sto', 'align_bh_0.sto', 'HMMER', hmmer_eval)

    # --- HMMER: iterative model refinement ---
    # A dedicated loop variable is used so the outer row loop is not shadowed.
    for hmmer_round in range(1, num_hmmer_rounds + 1):
        previous_best_hits = 'align_bh_' + str(hmmer_round - 1) + '.sto'
        round_alignment = 'align_' + str(hmmer_round) + '.sto'
        round_summary = 'hsummary_' + str(hmmer_round) + '.txt'
        round_best_hits = 'align_bh_' + str(hmmer_round) + '.sto'
        blastfn.hmmer_build(results_dir, previous_best_hits, hm_model_file)
        blastfn.nhmmer_search_model(results_dir, hm_model_file, datasets_dir, target_fasta, round_alignment, 'output.txt', round_summary, hmmer_eval)
        blastfn.align_keep_top_hit_per_species(results_dir, round_summary, round_alignment, round_best_hits, 'HMMER', hmmer_eval)

    # --- INFERNAL + R-scape rounds ---
    # Round 1 is seeded with the final HMMER best-hit alignment.
    seed_alignment = 'align_bh_' + str(num_hmmer_rounds) + '.sto'
    for infernal_round in range(1, num_infernal_rounds + 1):
        suffix = str(infernal_round)
        cm_file = 'cm_' + suffix + '.cm'
        search_alignment = 'search_' + suffix + '.sto'
        search_hits = 'search_hits_' + suffix + '.txt'
        search_summary = 'summary_' + suffix + '.txt'
        search_best_hits = 'search_bh_' + suffix + '.sto'

        if infernal_round == 1:
            # Only the first round passes an explicit False as the fourth
            # argument; its meaning is defined in Blast_Functions.
            blastfn.infernal_build_and_calib(results_dir, seed_alignment, cm_file, False)
        else:
            blastfn.infernal_build_and_calib(results_dir, seed_alignment, cm_file)
        blastfn.infernal_search(results_dir, cm_file, datasets_dir, target_fasta, search_alignment, search_hits, search_summary, infernal_eval)
        # Oversized hit file: abandon this locus. Nothing follows this loop in
        # the outer body, so `break` moves straight on to the next locus.
        if os.path.getsize(results_dir + '/' + search_hits) > max_search_hits_bytes:
            break
        blastfn.align_keep_top_hit_per_species(results_dir, search_summary, search_alignment, search_best_hits, 'INFERNAL', infernal_eval)
        # R-scape always runs on the per-species best-hit alignment. The
        # original third round passed 'search_3.sto' here, leaving the freshly
        # built 'search_bh_3.sto' unused and disagreeing with rounds 1 and 2.
        blastfn.run_rscape(results_dir, search_best_hits, 'rscape_' + suffix)
        seed_alignment = search_best_hits



Rv0439c


100%|██████████| 1/1 [00:00<?, ?it/s]
