In [1]:
# Master switch: the cells guarded by `if full_run == True:` only execute when this is True.
full_run = True

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
# Location of the MUSCLE executable; passed to MuscleCommandline in run_alignments.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

In [3]:
# Project root and the sub-folders derived from it.
project_dir = 'D:/Project_Data/Project_7'
literature_datasets_dir = f'{project_dir}/Data_From_Publications'
output_dir = f'{project_dir}/Output'
refseq_dir = f'{project_dir}/NCBI_Dataset_Mycobacteria'
# Work is split into num_cores chunks; core_numbers holds the 1-based chunk ids 1..num_cores.
num_cores = 8
core_numbers = [chunk_index + 1 for chunk_index in range(num_cores)]

In [4]:
# Each sub-directory name under refseq_dir is used as one species identifier.
species_list = util.list_dirs(refseq_dir)
reference_species = 'GCF_000195955.2'
outgroup_species = 'GCF_000696675.2'
#species_list = species_list[4:6]    #For testing
# Same list with the reference identifier removed.
species_list_excl_ref = [accession for accession in species_list if not accession == reference_species]
num_species = len(species_list)

In [5]:
# Only the first record of the reference GenBank file is used.
ref_genome_record = next(SeqIO.parse(f'{refseq_dir}/{reference_species}/genomic.gbff', "genbank"))
reference_sequence_length = len(str(ref_genome_record.seq))
# Shared translator instance used by generate_myco_info.
translator = util.Translator()

In [6]:
# Write the whole reference genome sequence as a single [name, sequence] FASTA entry.
# NOTE(review): the content is the nucleotide genome but the file uses the '.faa' extension
# and a hard-coded drive path — confirm this is what downstream tools expect.
reference_list = [[reference_species, str(ref_genome_record.seq)]]
util.produce_fasta_file(reference_list, 'D:/H37Rv.faa')

100%|██████████| 1/1 [00:00<00:00, 26.32it/s]


In [7]:
def process_blast_output(infile_loc, outfile_loc, top_hit_only = False):
    """Load a headerless BLAST CSV, add species annotations and pickle the result.

    Parameters
    ----------
    infile_loc : str
        Path of a comma-separated BLAST output file without a header row whose
        14 fields are, in order, the columns named below.
    outfile_loc : str
        Path the resulting DataFrame is pickled to.
    top_hit_only : bool
        When True, keep only the highest ``bit_score`` row for every
        (query_ref, target_species) pair.

    Returns
    -------
    pandas.DataFrame
        The BLAST rows plus ``query_species``, ``target_species``,
        ``query_species_name``, ``target_species_name`` and ``species_count``.

    Notes
    -----
    Reads the module-level ``names_dict`` (species identifier -> organism name).
    """
    def species_prefix(ref):
        # The species identifier is the first two '_'-separated fields of a reference id.
        return '_'.join(ref.split('_')[0:2])

    blast_results = pd.read_csv(infile_loc, header = None)
    blast_results.columns = ['query_ref', 'target_ref', 'query_length', 'subject_length', 'percent_identical_matches','alignment_length', 'number_mismatches', 'number_of_gap_openings', 'query_start_alignment', 'query_end_alignment', 'target_start_alignment', 'target_end_alignment', 'e_value', 'bit_score']
    # Column-wise mapping instead of a per-row iterrows/.at loop: the columns are
    # created in one step with the right dtype, and it is much faster on large hit tables.
    blast_results['query_species'] = blast_results['query_ref'].map(species_prefix)
    blast_results['target_species'] = blast_results['target_ref'].map(species_prefix)
    blast_results['query_species_name'] = blast_results['query_species'].map(names_dict)
    blast_results['target_species_name'] = blast_results['target_species'].map(names_dict)
    if top_hit_only == True:
        best_rows = blast_results.groupby(['query_ref','target_species'])['bit_score'].idxmax()
        blast_results = blast_results.loc[best_rows]
    # Number of (remaining) rows that share the same query_ref.
    blast_results['species_count'] = blast_results.groupby('query_ref')['query_ref'].transform('size')
    with open(outfile_loc, 'wb') as f:
        pickle.dump(blast_results, f)
    return blast_results

In [8]:
def keep_reciprocal_best_hits(query_df, reverse_query_df, outfile_loc):
    """Keep the rows of ``query_df`` whose hit is confirmed by the reverse search.

    A row (query_ref, target_ref) is a reciprocal best hit when ``reverse_query_df``
    contains a row whose query_ref is this target_ref and whose target_ref is this
    query_ref.

    Parameters
    ----------
    query_df : pandas.DataFrame
        Forward hits with ``query_ref`` and ``target_ref`` columns. A
        ``reciprocal_best_hit`` column ('Y'/'N') is added to it in place.
    reverse_query_df : pandas.DataFrame
        Reverse hits with the same two columns; if a query_ref occurs more than
        once, its last row is the one used.
    outfile_loc : str
        Path the filtered DataFrame is pickled to.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows of ``query_df`` flagged 'Y'.
    """
    reverse_best_hit = dict(zip(reverse_query_df['query_ref'], reverse_query_df['target_ref']))
    # Judge every row on its OWN target. Looking the target up through a
    # query_ref -> target_ref dictionary would keep only the last target of each
    # query and apply that single verdict to all of the query's rows.
    query_df['reciprocal_best_hit'] = [
        'Y' if reverse_best_hit.get(target_ref) == query_ref else 'N'
        for query_ref, target_ref in zip(query_df['query_ref'], query_df['target_ref'])
    ]
    # Copy so that columns added to the result later do not write into a slice of query_df.
    output = query_df[query_df['reciprocal_best_hit'] == 'Y'].copy()
    with open(outfile_loc, 'wb') as f:
        pickle.dump(output, f)
    return output

In [9]:
def generate_myco_info(num_subsets, subset_num, species_master_list):
    """Collect CDS features and their flanking intergenic sequences for one chunk of species.

    The chunk handled by this call is ``util.chunk_list(species_master_list, num_subsets, subset_num)``.
    For every species in it, the first record of ``<refseq_dir>/<species>/genomic.gbff`` is read.
    Species whose sequence contains any character other than A, C, G or T are skipped and
    therefore do not appear in the returned list.

    Returns a list of ``(species, organism, feature_info)`` tuples. Each ``feature_info`` entry is
    ``[species, locus, cds_aa_sequence, utr_sequence, feature, cds_nt_sequence, utr_coords,
    utr_5_coords, utr_5_sequence]`` where ``feature`` is ``[locus, start, end, strand]``.

    Reads the module-level ``refseq_dir``, ``reference_species``, ``literature_datasets_dir``
    and ``translator``.
    """
    output = []
    species_list = util.chunk_list(species_master_list, num_subsets, subset_num)
    for species in species_list:
        features = []
        genome_record = next(SeqIO.parse(refseq_dir + '/'+species+'/genomic.gbff', "genbank"))
        full_sequence = str(genome_record.seq)
        # Skip genomes containing anything other than the four unambiguous bases.
        if full_sequence.count('A') + full_sequence.count('C') + full_sequence.count('G') + full_sequence.count('T') < len(full_sequence):
            continue
        organism = genome_record.annotations['organism']
        
        #  Read feature information
        if species == reference_species:
            # For the reference, CDS coordinates come from the Mycobrowser spreadsheet rather than the GenBank file.
            mycobrowser_df = pd.read_excel(literature_datasets_dir+'/Mycobrowser_Release_4.xlsx')
            for i, r in mycobrowser_df.iterrows():
                if r['Feature'] == 'CDS':
                    if r['Strand'] == '+':
                        strand = 1
                    else:
                        strand = -1
                    # Start is shifted by -1 — presumably converting 1-based inclusive to 0-based slice coordinates; TODO confirm.
                    features.append([r['Locus'],r['Start']-1, r['Stop'], strand])
            
        else:
            
            for feature in genome_record.features:
                    a = feature.qualifiers
                    if feature.type == 'CDS' and a.get("locus_tag")!= None and int(feature.location.end) - int(feature.location.start) < 100000:  #  Exclude strange Biopython parsing where starts with complement join and looks like a CDS is full length of genome!   
                        locus_tag = a.get("locus_tag")[0]
                        features.append([locus_tag, int(feature.location.start), int(feature.location.end), int(feature.location.strand)])
        
        # Order features by start coordinate so neighbours in the list are neighbours on the genome.
        features.sort(key=lambda x: x[1])
        
        feature_info = []
        for i, feature in enumerate(features):
            # Features whose start is not below their end are ignored.
            if feature[1] < feature[2]:  
                if feature[3] == 1:
                    cds_nt_sequence = full_sequence[feature[1]:feature[2]]
                else:
                    cds_nt_sequence = util.reverse_complement(full_sequence[feature[1]:feature[2]])
                cds_aa_sequence = translator.translate_sequence(full_sequence[feature[1]:feature[2]],feature[3], 0)
                # utr_sequence: the gap that follows the feature in its own reading direction —
                # after the end for strand 1, before the start (reverse-complemented) for strand -1.
                # Left empty with coords (0,0) when there is no neighbour or the neighbour overlaps/abuts.
                if (i + 1)< len(features) and feature[3] == 1 and feature[2] < features[i+1][1]:
                    utr_coords = (feature[2], features[i+1][1])
                    utr_sequence = full_sequence[feature[2]: features[i+1][1]]
                elif (i > 0) and feature[3] == -1 and features[i-1][2] < feature[1]:
                    utr_coords = (features[i-1][2], feature[1])
                    utr_sequence = util.reverse_complement(full_sequence[features[i-1][2]: feature[1]])
                else:
                    utr_coords = (0,0)
                    utr_sequence = ''
                # utr_5_sequence: the gap on the opposite side — before the start for strand 1,
                # after the end (reverse-complemented) for strand -1.
                if i > 0 and feature[3] == 1 and features[i-1][2] < feature[1]:
                    utr_5_coords = (features[i-1][2], feature[1])
                    utr_5_sequence = full_sequence[features[i-1][2]: feature[1]]
                elif (i + 1) < len(features) and feature[3] == -1 and features[i+1][1] > feature[2]:
                    utr_5_coords = (feature[2], features[i+1][1])
                    utr_5_sequence = util.reverse_complement(full_sequence[feature[2]: features[i+1][1]])
                else:
                    utr_5_coords = (0,0)
                    utr_5_sequence = ''
                
                
                feature_info.append([species, feature[0], cds_aa_sequence, utr_sequence, feature, cds_nt_sequence, utr_coords, utr_5_coords, utr_5_sequence])

        output.append((species, organism, feature_info))
    return output

In [10]:
if full_run == True:
    # species -> (organism name, feature_info list)
    myco_info_dict = {}
    # '<species>_<locus>' -> feature_info entry
    protein_info_dict = {}
    # species -> organism name
    names_dict = {}
    parallel_output = Parallel(n_jobs=-1)(delayed(generate_myco_info)(num_cores, core_number, species_list) for core_number in core_numbers)
    # Flatten the per-chunk result lists into the three lookup dictionaries.
    for core_output in parallel_output:
        for results in core_output:
            species_id, organism_name, species_features = results[0], results[1], results[2]
            myco_info_dict[species_id] = (organism_name, species_features)
            for feature in species_features:
                protein_info_dict[feature[0] + '_' + feature[1]] = feature
            names_dict[species_id] = organism_name

In [None]:
if full_run == True:
    # Each list holds [sequence_id, sequence] pairs, with sequence_id = '<species>_<locus>'.
    comparison_protein_list = []
    reference_nt_list = []
    reference_protein_list = []
    reference_utr_list = []
    comparison_utr_list = []
    comparison_nt_list = []
    for species in species_list:
        # generate_myco_info skips genomes containing non-ACGT characters, so those
        # species have no entry in myco_info_dict; indexing it directly would raise KeyError.
        if species not in myco_info_dict:
            continue
        for feature_info in myco_info_dict[species][1]:
            sequence_id = feature_info[0]+'_'+feature_info[1]
            # [:-1] drops the last character of the translated sequence.
            comparison_protein_list.append([sequence_id,feature_info[2][:-1]])
            comparison_nt_list.append([sequence_id,feature_info[5]])
            # Intergenic sequences of 8 characters or fewer are not exported.
            if len(feature_info[3]) > 8:
                comparison_utr_list.append([sequence_id,feature_info[3]])
    # The reference species is required; a missing entry here is a genuine error.
    for feature_info in myco_info_dict[reference_species][1]:
        sequence_id = feature_info[0]+'_'+feature_info[1]
        reference_protein_list.append([sequence_id,feature_info[2][:-1]])
        reference_nt_list.append([sequence_id,feature_info[5]])
        if len(feature_info[3]) > 8:
            reference_utr_list.append([sequence_id,feature_info[3]])
    util.produce_fasta_file(comparison_protein_list, 'D:/BLAST/comp_prot/comparison_proteins.faa')
    util.produce_fasta_file(reference_protein_list, 'D:/BLAST/ref_prot/reference_proteins.faa')
    util.produce_fasta_file(reference_nt_list, 'D:/BLAST/ref_nt/reference_nt.faa')
    util.produce_fasta_file(comparison_nt_list, 'D:/BLAST/comp_nt/comparison_nt.faa')
    util.produce_fasta_file(reference_utr_list, 'D:/BLAST/ref_utr/reference_utr.faa')
    util.produce_fasta_file(comparison_utr_list, 'D:/BLAST/comp_utr/comparison_utr.faa')

In [None]:
if full_run == True:
    w_d = os.getcwd()
    os.chdir("D:/")
    # (database folder / database name, input FASTA, makeblastdb -dbtype)
    blast_databases = [
        ('comp_prot', 'comparison_proteins.faa', 'prot'),
        ('ref_prot', 'reference_proteins.faa', 'prot'),
        ('ref_nt', 'reference_nt.faa', 'nucl'),
        ('comp_nt', 'comparison_nt.faa', 'nucl'),
        ('comp_utr', 'comparison_utr.faa', 'nucl'),
    ]
    try:
        for db_name, fasta_name, db_type in blast_databases:
            completed = subprocess.run('cd D:/BLAST/' + db_name + ' &  makeblastdb -in ' + fasta_name + ' -dbtype ' + db_type + ' -out d://BLAST//' + db_name + '//' + db_name, shell=True, capture_output = True)
            # Output is captured, so a failure would otherwise pass unnoticed.
            if completed.returncode != 0:
                print('makeblastdb failed for ' + db_name + ': ' + completed.stderr.decode(errors='replace'))
    finally:
        # Restore the working directory even if one of the calls raises.
        os.chdir(w_d)

In [None]:
if full_run == True:
    w_d = os.getcwd()
    os.chdir("D:/")
    # Field order of the CSV output; process_blast_output names its columns in this order.
    blast_outfmt = '10 qaccver saccver qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore'
    # (database folder / database name, program, query FASTA, output file, extra options)
    blast_jobs = [
        ('comp_prot', 'blastp', 'D:/BLAST/ref_prot/reference_proteins.faa', 'ref_comp_hits.csv', '-seg no '),
        ('ref_prot', 'blastp', 'D:/BLAST/comp_prot/comparison_proteins.faa', 'comp_ref_hits.csv', '-seg no '),
        ('ref_nt', 'blastn', 'D:/BLAST/ref_utr/reference_utr.faa', 'utr_ref_hits.csv', ''),
        ('comp_nt', 'blastn', 'D:/BLAST/comp_utr/comparison_utr.faa', 'utr_comp_hits.csv', ''),
        ('comp_utr', 'blastn', 'D:/BLAST/comp_utr/comparison_utr.faa', 'utr_all_comp_hits.csv', ''),
    ]
    try:
        for db_name, program, query_fasta, hits_file, extra_options in blast_jobs:
            command = 'cd d:\\BLAST\\' + db_name + ' & ' + program + ' -query ' + query_fasta + ' -db ' + db_name + ' -out ' + hits_file + ' -evalue 1e-10 ' + extra_options + '-outfmt  "' + blast_outfmt + '" -num_threads 16'
            completed = subprocess.run(command, shell=True, capture_output = True)
            # Output is captured, so a failure would otherwise pass unnoticed.
            if completed.returncode != 0:
                print(program + ' against ' + db_name + ' failed: ' + completed.stderr.decode(errors='replace'))
    finally:
        # Restore the working directory even if one of the calls raises.
        os.chdir(w_d)

In [None]:
if full_run == True:
    # Load the five BLAST outputs; only the two protein searches are reduced to the top hit per target species.
    blast_results_rc = process_blast_output('D:\\BLAST\\comp_prot\\ref_comp_hits.csv', project_dir + '/blast_results_rc.pkl', True)
    blast_results_cr = process_blast_output('D:\\BLAST\\ref_prot\\comp_ref_hits.csv', project_dir + '/blast_results_cr.pkl', True)
    blast_results_utr_ref = process_blast_output('D:\\BLAST\\ref_nt\\utr_ref_hits.csv', project_dir + '/utr_results_ref.pkl', False)
    blast_results_comp_ref = process_blast_output('D:\\BLAST\\comp_nt\\utr_comp_hits.csv', project_dir + '/utr_results_comp.pkl', False)
    blast_results_utr_utr = process_blast_output('D:\\BLAST\\comp_utr\\utr_all_comp_hits.csv', project_dir + '/comp_results_utr_utr.pkl', False)
    # Keep reference->comparison hits confirmed by the comparison->reference search.
    # This call also adds a 'reciprocal_best_hit' column to blast_results_rc itself.
    rbh_results = keep_reciprocal_best_hits(blast_results_rc, blast_results_cr, project_dir + '/rbh_results_temp.pkl')
    # Attach the full feature_info entry for both sides of every hit.
    rbh_results['query_info'] = rbh_results['query_ref'].map(protein_info_dict)
    rbh_results['target_info'] = rbh_results['target_ref'].map(protein_info_dict)
    # Build a short display name per target species.
    organism_names = rbh_results[['target_species','target_species_name']].drop_duplicates().reset_index(drop=True)
    organism_dict = {}
    for i, r in organism_names.iterrows():
        if 'BCG' in r['target_species_name']:
            organism_names.at[i,'sname'] = 'M.bovis_BCG'
        elif 'AF2122' in r['target_species_name']:
            organism_names.at[i,'sname'] = 'M.bovis_AF2122/97'
        else:
            # First letter of the organism name + '.' + its second space-separated word.
            # NOTE(review): raises IndexError if an organism name contains no space.
            organism_names.at[i,'sname'] = r['target_species_name'][0] + '.' + r['target_species_name'].split(' ')[1]
    for i, r in organism_names.iterrows():
        organism_dict[r['target_species']] = r['sname']
    rbh_results['target_species_sname'] = rbh_results['target_species'].map(organism_dict)
    # Unpack the feature_info entries (layout as built in generate_myco_info) into flat columns.
    for i, r in rbh_results.iterrows():
        rbh_results.at[i,'query_start'] = r['query_info'][4][1]
        rbh_results.at[i,'target_start'] = r['target_info'][4][1]
        rbh_results.at[i,'target_utr'] = r['target_info'][3]
        rbh_results.at[i,'target_cds'] = r['target_info'][2][:-1]
        rbh_results.at[i,'target_utr_start'] = r['target_info'][6][0]
        rbh_results.at[i,'target_utr_end'] = r['target_info'][6][1]
        rbh_results.at[i,'target_utr_5'] = r['target_info'][8]
        rbh_results.at[i,'target_utr_5_start'] = r['target_info'][7][0]
        rbh_results.at[i,'target_utr_5_end'] = r['target_info'][7][1]
    # Export CSV copies of the BLAST tables and pickle the final reciprocal-best-hit table.
    blast_results_rc.to_csv(project_dir + '/blast_results_rc.csv')
    blast_results_cr.to_csv(project_dir + '/blast_results_cr.csv')   
    blast_results_utr_ref.to_csv(project_dir + '/blast_results_utr_ref.csv') 
    blast_results_comp_ref.to_csv(project_dir + '/blast_results_comp_ref.csv') 
    blast_results_utr_utr.to_csv(project_dir + '/blast_results_utr_utr.csv') 
    with open(project_dir + '/rbh_results.pkl', 'wb') as f:
        pickle.dump(rbh_results, f)

In [12]:
# Always reload the reciprocal-best-hit table from the pickle written by the full run
# (the `full_run == False` condition is currently switched off).
with open(project_dir + '/rbh_results.pkl', 'rb') as f:
    rbh_results = pickle.load(f)

In [8]:
# Export the identifying columns and both sets of intergenic coordinates to CSV.
rbh_results.loc[:, [
    'query_ref',
    'target_species_sname',
    'target_utr_start',
    'target_utr_end',
    'target_utr_5_start',
    'target_utr_5_end',
]].to_csv(project_dir + '/rbh_csv.csv')

In [None]:
# Re-process and re-export the UTR-vs-UTR BLAST hits on their own, outside the full_run guard.
# NOTE(review): process_blast_output reads the module-level names_dict, which is only created
# inside an `if full_run == True:` cell — confirm this cell is meant to run when full_run is False.
blast_results_utr_utr = process_blast_output('D:\\BLAST\\comp_utr\\utr_all_comp_hits.csv', project_dir + '/comp_results_utr_utr.pkl', False)
blast_results_utr_utr.to_csv(project_dir + '/blast_results_utr_utr.csv') 

In [13]:
# Distinct query identifiers, in order of first appearance.
query_refs = rbh_results['query_ref'].drop_duplicates().to_list()

In [18]:
def run_alignments(num_subsets, subset_num, field_name, directory, ref_master_list):
    """Align, with MUSCLE, the ``field_name`` sequences of every query in one chunk.

    The chunk is ``util.chunk_list(ref_master_list, num_subsets, subset_num)``. For each
    query reference, the non-empty ``field_name`` values of its rows in the module-level
    ``rbh_results`` are written to a scratch FASTA (named by ``target_species_sname``) and
    aligned into ``<directory>/All/align_<ref>.fasta``. When the alignment contains exactly
    ``num_species`` sequences it is also copied to ``<directory>/Full_Ortholog/``.

    Queries whose alignment fails are reported and skipped. Returns None.
    """
    ref_list = util.chunk_list(ref_master_list, num_subsets, subset_num)
    # One scratch file per chunk number, so concurrently running chunks do not share an input file.
    scratch_fasta = project_dir+'/testseq_'+str(subset_num)+'.fasta'
    for ref in ref_list:
        alignment_file = directory + '/All/align_'+ ref +'.fasta'
        temp_df = rbh_results[rbh_results.query_ref == ref]
        seq_list = []
        for i, r in temp_df.iterrows():
            if len(r[field_name]) > 0:
                seq_list.append([r['target_species_sname'], r[field_name]])
        util.produce_fasta_file(seq_list, scratch_fasta)
        cline = MuscleCommandline(muscle_exe, input=scratch_fasta, out=alignment_file)
        try:
            cline()
        except Exception as e:
            # Best effort: keep going with the other queries, but say which one failed and why.
            print('MUSCLE alignment failed for ' + ref + ': ' + str(e))
            continue
        temp = util.read_fasta_to_array(alignment_file, species_order = [])
        num_sequences = len(temp[0])
        if num_sequences == num_species:
            shutil.copyfile(alignment_file, directory +'/Full_Ortholog/align_'+ ref +'.fasta')

In [19]:
if full_run == True:
    # Only the 'target_utr_5' alignments are currently enabled; the CDS and UTR variants are kept below for reference.
    #Parallel(n_jobs=-1)(delayed(run_alignments)(num_cores, core_number, 'target_cds', project_dir + '/CDS_Alignments', query_refs) for core_number in core_numbers)
    #Parallel(n_jobs=-1)(delayed(run_alignments)(num_cores, core_number, 'target_utr', project_dir + '/UTR_Alignments', query_refs) for core_number in core_numbers)
    utr_5_alignment_dir = project_dir + '/UTR_5_Alignments'
    Parallel(n_jobs=-1)(
        delayed(run_alignments)(num_cores, core_number, 'target_utr_5', utr_5_alignment_dir, query_refs)
        for core_number in core_numbers
    )

##### Build tree from full orthologs

In [None]:
# Run IQ-TREE on the Full_Ortholog CDS alignments, using the short name of outgroup_species
# as the outgroup (looked up via the shared constant instead of repeating the accession literal).
# NOTE(review): organism_dict is only created when full_run is True.
subprocess.run('cd \\users\\nicho\\IQTree & bin\\iqtree2 -q ' + project_dir + '/CDS_Alignments/Full_Ortholog/' + ' --prefix '+ project_dir + '/CDS_Alignments/Full_Ortholog_Tree/CDS_Full_Ortholog_Tree -m LG -B 1000 -T AUTO -o ' + organism_dict[outgroup_species], shell=True)

###### Insert blank sequences for display and delete gaps

In [None]:
# Load one alignment: temp[0] holds the sequence names, temp[1] the aligned sequences.
temp = util.read_fasta_to_array(project_dir+'/UTR_Alignments/All/align_GCF_000195955.2_Rv0044c.fasta', species_order = [])
sequence_length = len(temp[1][0])
blank_seq = '-' * sequence_length
sequence_names = temp[0]
sequences = temp[1]
# Add an all-gap row for every organism that is missing from the alignment.
for k, v in organism_dict.items():
    if not(v in sequence_names):
        sequence_names.append(v)
        sequences.append(blank_seq)
# Locate the reference row (the last match wins if the name occurs more than once).
for i, name in enumerate(sequence_names):
    if name == organism_dict[reference_species]:
        ref_species_index = i
# Columns where the reference has a gap, i.e. insertions relative to the reference.
ref_insert_positions = [i for i, letter in enumerate(sequences[ref_species_index]) if letter == '-']
# Set lookup keeps the column filter linear in alignment length; testing membership
# against the list made it quadratic.
ref_insert_position_set = set(ref_insert_positions)
insert_deleted_sequences = [
    ''.join(letter for i, letter in enumerate(sequence) if i not in ref_insert_position_set)
    for sequence in sequences
]
sequence_info = [[sequence_name, sequence] for (sequence_name, sequence) in zip(sequence_names, insert_deleted_sequences)]
util.produce_fasta_file(sequence_info,project_dir+ '/UTR_Alignments/All_Deleted_Gaps/align_GCF_000195955.2_Rv0044c.fasta')

In [None]:
# Run the Motif_Plots.R script through Rscript.
subprocess.call(
    'C:/"Program Files"/R/R-4.1.2/bin/x64/Rscript --vanilla D:/Project_Data/Project_7/R_Scripts/Motif_Plots.R',
    shell=True,
)

In [None]:
# One scatter panel per target species: target CDS start against reference (query) CDS start.
# FacetGrid creates its own figure, so a preceding plt.figure() call only produced an
# extra empty figure and has been removed.
g = sns.FacetGrid(rbh_results, col='target_species_name', height=5, col_wrap=3)
g.map(sns.scatterplot, 'target_start', 'query_start', s=2)
g.set_titles(row_template = '{row_name}', col_template = '{col_name}');

#####  Run Infernal/r-scape process

In [14]:
# Locations used by the Infernal / R-scape cells.
project_dir = 'D:/Project_Data/Project_7'
dataset_loc = f'{project_dir}/NCBI_Dataset_Corynebacteriales'
r_scape_output_loc = f'{project_dir}/R_Scape_Results_3'
merged_file_loc = f'{dataset_loc}/merged_file.txt'
intergenic_alignment_loc = f'{project_dir}/UTR_Alignments/All'
wsl_merged_file_loc = util.wslname(merged_file_loc)
# Disabled: concatenates every '*genomic.fna' file under dataset_loc into merged_file_loc.
if 1==0:
#if full_run == True:
    with open(merged_file_loc, 'w') as outfile:
        for assembly_dir in util.list_dirs(dataset_loc):
            directory = f'{dataset_loc}/{assembly_dir}'
            for file_name in util.list_files(directory):
                if not file_name.endswith("genomic.fna"):
                    continue
                with open(directory + '/' + file_name, encoding="utf-8", errors='ignore') as infile:
                    outfile.write(infile.read())

In [17]:
def match_utr(utr_s, utr_e, searchline):
    """Tell whether a line names an NC_000962.3 hit that overlaps the region (utr_s, utr_e).

    The line must contain ``NC_000962.3/<start>-<end>`` followed by whitespace. Hits are
    accepted in either orientation (start < end or start > end).

    Returns True on overlap, False when there is no overlap or the line does not contain
    a well-formed ``NC_000962.3/<start>-<end>`` token.
    """
    # Raw string so \d and \s reach the regex engine unchanged, '.' escaped so it only
    # matches a literal dot, and explicit digit groups so malformed coordinates simply
    # fail to match instead of raising in int().
    result = re.search(r'NC_000962\.3/(\d+)-(\d+)\s', searchline)
    if result is None:
        return False
    start = int(result.group(1))
    end = int(result.group(2))
    forward_overlap = (start < utr_e) and (end > utr_s)
    # Second form covers hits reported with start > end.
    reverse_overlap = (end < utr_e) and (start > utr_s)
    return forward_overlap or reverse_overlap
    
def utr_in_file(filename, utr_s, utr_e):
    """Return True if any line of ``filename`` satisfies ``match_utr`` for (utr_s, utr_e).

    Returns False when the file does not exist or no line matches.
    """
    if not os.path.exists(filename):
        return False
    with open(filename, 'r') as handle:
        # any() stops at the first matching line, like an early return from a loop.
        return any(match_utr(utr_s, utr_e, line) == True for line in handle)

In [None]:
# Alignment ids are the file names in intergenic_alignment_loc with the final extension removed.
alignment_ids = ['.'.join(x.split('.')[:-1]) for x in util.list_files(intergenic_alignment_loc)]
# NOTE(review): the first 31 alignments are skipped — looks like a manual resume offset; confirm before a fresh run.
alignment_ids = alignment_ids[31:]
for alignment_id in tqdm(alignment_ids):
    # Drop the first 6 characters (the 'align_' prefix used when the alignment files are written).
    ref = alignment_id[6:]
    temp_df = rbh_results[rbh_results.query_ref == ref]
    temp_df_2 = temp_df[temp_df.target_species_sname == 'M.tuberculosis']
    # NOTE(review): .iloc[0] raises IndexError if there is no 'M.tuberculosis' row for this ref.
    utr_start = temp_df_2.iloc[0]['target_utr_start']
    utr_end = temp_df_2.iloc[0]['target_utr_end']
    
    intergenic_file = alignment_id + '.fasta'
    analysis_directory = r_scape_output_loc + '/' + ref
    wsl_analysis_directory = util.wslname(analysis_directory)
    if not os.path.exists(analysis_directory):
        os.makedirs(analysis_directory)
    # Convert the FASTA alignment to Stockholm format for cmbuild.
    intergenic_region_alignment = intergenic_alignment_loc + '/' + intergenic_file
    alignment = AlignIO.read(intergenic_region_alignment, "fasta")
    AlignIO.write(alignment, analysis_directory + '/initial_align.sto', "stockholm");

    # Initial run with HMM: build a model from the initial alignment (cmbuild --noss) and search the merged genomes.
    # Stop with this alignment unless the search output contains a NC_000962.3 hit overlapping the M.tuberculosis region.

    subprocess.run('wsl cd ' + wsl_analysis_directory + ' ; cmbuild --noss -F initial_cm.cm initial_align.sto'  , shell=True)
    subprocess.run('wsl cd ' + wsl_analysis_directory + ' ; cmsearch -A initial_search.sto initial_cm.cm ' + wsl_merged_file_loc  , shell=True) 
    if utr_in_file(analysis_directory + '/initial_search.sto', utr_start, utr_end) == False:
        continue
    # Second run with CM: rebuild from the first search's alignment, calibrate, search again, then run R-scape with --cacofold.
    # Here R-scape runs before the overlap check.

    subprocess.run('wsl cd ' + wsl_analysis_directory + ' ; cmbuild -F interim_cm.cm initial_search.sto'  , shell=True)
    subprocess.run('wsl cd ' + wsl_analysis_directory + ' ; cmcalibrate interim_cm.cm', shell= True)
    subprocess.run('wsl cd ' + wsl_analysis_directory + ' ; cmsearch -A interim_search.sto interim_cm.cm ' + wsl_merged_file_loc  , shell=True)  
    subprocess.run('wsl cd ' + wsl_analysis_directory + ' ; ~/rscape_v2.0.0.g/bin/R-scape --cacofold --outname rscape_interim interim_search.sto'  , shell=True)
    if utr_in_file(analysis_directory + '/interim_search.sto', utr_start, utr_end) == False:
        continue
   
    # Final run with CaCofold CM: build from rscape_interim.cacofold.sto, calibrate and search;
    # R-scape is only run on the final search if the overlap check passes.

    subprocess.run('wsl cd ' + wsl_analysis_directory + ' ; cmbuild -F final_cm.cm rscape_interim.cacofold.sto'  , shell=True)
    subprocess.run('wsl cd ' + wsl_analysis_directory + ' ; cmcalibrate final_cm.cm', shell= True)
    subprocess.run('wsl cd ' + wsl_analysis_directory + ' ; cmsearch -A final_search.sto final_cm.cm ' + wsl_merged_file_loc  , shell=True)  
    if utr_in_file(analysis_directory + '/final_search.sto', utr_start, utr_end) == False:
        continue
    subprocess.run('wsl cd ' + wsl_analysis_directory + ' ; ~/rscape_v2.0.0.g/bin/R-scape --cacofold --outname rscape_final final_search.sto'  , shell=True)

  0%|          | 0/3408 [00:00<?, ?it/s]

In [None]:
# Debug output: coordinates left over from the last iteration of the Infernal/R-scape loop.
print(utr_start, utr_end)

In [None]:
# Debug output: initial search file of the last alignment handled by the Infernal/R-scape loop.
print(analysis_directory + '/initial_search.sto')