In [1]:
import pandas as pd
import os
import subprocess
from Comparative_Analysis import Utilities as util
from Bio import AlignIO, SeqIO
from tqdm.auto import tqdm
import pickle
import re
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Alignment as align
import random
import copy

In [2]:
# --- Project layout -------------------------------------------------------
# Everything lives under a single project root; the other locations are
# derived from it.
project_dir = 'D:/Project_Data/Project_7'
literature_datasets_dir = f'{project_dir}/Data_From_Publications'
r_scape_output_loc = f'{project_dir}/R_Scape_Results_Test'
dataset_loc = f'{project_dir}/NCBI_Dataset_Actinobacteria'

# Single file holding the concatenated genome FASTA files of the dataset.
merged_file_loc = f'{dataset_loc}/merged_file.txt'
# Same file as passed to the tools run through `wsl`
# (presumably a Windows -> WSL path conversion; confirm in Utilities.wslname).
wsl_merged_file_loc = util.wslname(merged_file_loc)

# --- Reference genome -----------------------------------------------------
# Folder name inside dataset_loc, and file-name stem of its '_genomic.fna'.
reference_species_folder = 'GCF_000195955.2'
reference_species = 'GCF_000195955.2_ASM19595v2'

In [3]:
# Concatenate every '*genomic.fna' file found in the sub-directories of
# dataset_loc into merged_file_loc, which the search tools use as their
# sequence database.
# Set the flag to False to skip this (slow) rebuild when the merged file
# already exists.
REBUILD_MERGED_FILE = True

if REBUILD_MERGED_FILE:
    # Write with the same encoding used for reading, so the platform default
    # code page cannot raise UnicodeEncodeError on unusual header characters.
    with open(merged_file_loc, 'w', encoding="utf-8") as outfile:
        for genome_dir in util.list_dirs(dataset_loc):
            directory = dataset_loc + '/' + genome_dir
            for file_name in util.list_files(directory):
                if not file_name.endswith("genomic.fna"):
                    continue
                with open(directory + '/' + file_name, encoding="utf-8", errors='ignore') as infile:
                    contents = infile.read()
                outfile.write(contents)
                # A file without a final newline would otherwise glue the next
                # file's '>' header onto its last sequence line.
                if contents and not contents.endswith('\n'):
                    outfile.write('\n')

In [3]:
# Build a table with, for every annotated feature, the gap between its end and
# the start of the next feature (ordered by start), and save it as CSV.
output = []

# First record of the reference assembly's FASTA file, as a plain string.
reference_fasta = dataset_loc + '/'+reference_species_folder +'/' + reference_species + '_genomic.fna'
genome_record = next(SeqIO.parse(reference_fasta, "fasta"))
full_sequence = str(genome_record.seq)

# One [locus, start, stop, strand] entry per annotation row; every feature
# type is kept (no filter on the 'Feature' column).
mycobrowser_df = pd.read_excel(literature_datasets_dir+'/Mycobrowser_Release_4.xlsx')
features = []
for i, r in mycobrowser_df.iterrows():
    strand = 1 if r['Strand'] == '+' else -1
    # Start is shifted down by one (presumably 1-based -> 0-based; confirm
    # against the annotation file).
    features.append([r['Locus'], r['Start']-1, r['Stop'], strand])
features.sort(key=lambda entry: entry[1])

feature_info = []
last_index = len(features) - 1
for i, feature in enumerate(features):
    locus, feature_start, feature_stop = feature[0], feature[1], feature[2]
    # Features whose start is not below their stop get no row at all.
    if not feature_start < feature_stop:
        continue
    # A gap exists only when there is a following feature that starts
    # strictly after this one ends.
    if i < last_index and feature_stop < features[i+1][1]:
        next_start = features[i+1][1]
        utr_coords = (feature_stop, next_start)
        utr_sequence = full_sequence[feature_stop: next_start]
    else:
        utr_coords = (0,0)
        utr_sequence = ''
    utr_length = len(utr_sequence)
    feature_info.append([locus, utr_coords[0], utr_coords[1], utr_sequence, utr_length])

intergenic_df = pd.DataFrame(feature_info, columns = ['Locus', 'Start' , 'End', 'Sequence', 'Length'])
intergenic_df.to_csv(project_dir + '/intergenic_df.csv')

In [3]:
# Reload the saved intergenic table and take its loci, in file order, as the
# list of alignments to process.
intergenic_df = pd.read_csv(project_dir + '/intergenic_df.csv')
alignment_ids = intergenic_df['Locus'].tolist()

In [4]:
# Narrow the list to its first entry only (raises IndexError if the list is empty).
# NOTE(review): looks like a debugging shortcut to process one locus — confirm before a full run.
alignment_ids = [alignment_ids[0]]

In [10]:
# Replace the list with one hard-coded locus; this discards whatever
# alignment_ids held before this cell ran.
# NOTE(review): looks like a test selection — confirm before a full run.
alignment_ids = ['Rv0052']

In [11]:
def best_hit_per_species(analysis_dir, search_num, e_value_threshold=0.001):
    """Reduce a search alignment to the best hit of each target sequence.

    Reads ``search_hits_<search_num>.txt`` (whitespace-delimited hit table) and
    ``search_<search_num>.sto`` (Stockholm alignment) from ``analysis_dir`` and
    writes ``search_bh_<search_num>.sto`` containing only:

    * blank lines,
    * lines containing ``//``, ``# STOCKHOLM`` or ``Infernal``,
    * sequence rows and ``#=GS`` / ``#=GR`` rows whose name is exactly
      ``<target>/<from>-<to>`` for the lowest-E-value hit of that target.

    Parameters
    ----------
    analysis_dir : str
        Directory holding the input files; the output is written there too.
    search_num : int or str
        Iteration number used in the file names.
    e_value_threshold : float, optional
        Hits are considered only when their E-value is strictly below this
        value (default 0.001).

    Returns
    -------
    None
    """
    # target name -> (seq from, seq to, E-value) of the best hit seen so far.
    best_hit_dict = {}
    with open(analysis_dir + '/search_hits_'+str(search_num)+'.txt','r') as infile:
        for l in infile:
            if l.startswith('#'):
                continue
            space_delims = l.split()
            # Blank or truncated rows carry no usable hit; skip them instead
            # of failing with IndexError.
            if len(space_delims) < 16:
                continue
            seq_id = space_delims[0]
            seq_from = space_delims[7]
            seq_to = space_delims[8]
            e_value = float(space_delims[15])
            if not e_value < e_value_threshold:
                continue
            # Strict '<' keeps the first hit when E-values tie.
            if seq_id not in best_hit_dict or e_value < best_hit_dict[seq_id][2]:
                best_hit_dict[seq_id] = (seq_from, seq_to, e_value)

    # Alignment row names of the retained hits. Names are compared as whole
    # tokens: a substring test would also keep e.g. 'x/100-2000' for 'x/100-200'.
    kept_names = {k + '/' + v[0] + '-' + v[1] for k, v in best_hit_dict.items()}
    structural_markers = ('//', '# STOCKHOLM', 'Infernal')
    per_sequence_tags = ('#=GS', '#=GR')

    with open(analysis_dir + '/search_'+str(search_num)+'.sto', 'r') as infile:
        with open(analysis_dir + '/search_bh_'+str(search_num)+'.sto', 'w') as outfile:
            for l in infile:
                if len(l) < 2:
                    # Blank separator lines are copied through unchanged.
                    outfile.write(l)
                    continue
                if any(marker in l for marker in structural_markers):
                    outfile.write(l)
                    continue
                tokens = l.split()
                if not tokens:
                    continue
                if tokens[0] in kept_names:
                    # Aligned sequence row of a retained hit.
                    outfile.write(l)
                elif tokens[0] in per_sequence_tags and len(tokens) > 1 and tokens[1] in kept_names:
                    # Per-sequence annotation row of a retained hit.
                    outfile.write(l)

In [12]:
def match_utr(utr_s, utr_e, searchline, reference_id='NC_000962.3'):
    """Tell whether a line names a reference-sequence hit overlapping a region.

    The line is searched for a token of the form ``<reference_id>/<a>-<b>``
    followed by whitespace. ``a`` and ``b`` may come in either order (a hit
    reported with ``a > b`` is treated as the interval ``b..a``).

    Parameters
    ----------
    utr_s, utr_e : int
        Start and end of the region of interest.
    searchline : str
        One line of text, e.g. a row of a Stockholm alignment.
    reference_id : str, optional
        Sequence name to look for; matched literally (default 'NC_000962.3').

    Returns
    -------
    bool
        True when such a token is present and its interval overlaps the
        region (strict comparisons, so merely touching ends do not count);
        False otherwise, including when no well-formed token is found.
    """
    # re.escape keeps '.' in the accession literal; the look-behind stops the
    # name from matching in the middle of a longer identifier.
    pattern = r'(?<!\S)' + re.escape(reference_id) + r'/(\d+)-(\d+)\s'
    result = re.search(pattern, searchline)
    if result is None:
        return False
    first = int(result.group(1))
    second = int(result.group(2))
    # Normalise orientation, then apply the usual interval-overlap test.
    low, high = min(first, second), max(first, second)
    return low < utr_e and high > utr_s
    
def utr_in_file(filename, utr_s, utr_e):
    """Report whether any line of a file satisfies ``match_utr`` for the region.

    Returns False when ``filename`` does not exist; otherwise scans the file
    line by line and stops at the first line for which
    ``match_utr(utr_s, utr_e, line)`` is true.
    """
    # A missing file simply means "not found".
    if not os.path.exists(filename):
        return False
    with open(filename, 'r') as handle:
        return any(match_utr(utr_s, utr_e, line) for line in handle)

In [13]:
def _run_in_wsl(wsl_directory, command):
    """Run ``command`` through ``wsl`` after changing into ``wsl_directory``.

    Returns the ``subprocess.CompletedProcess``; the exit status is not checked.
    """
    return subprocess.run('wsl cd ' + wsl_directory + ' ; ' + command, shell=True)


# Locations of the external tools inside the WSL home directory.
INFERNAL_BIN_DIR = '~/infernal-1.1.4/src/'
RSCAPE_EXECUTABLE = '~/rscape_v2.0.0.g/bin/R-scape'
# Only regions strictly longer than this are analysed.
MIN_UTR_LENGTH = 150
# Model-refinement rounds run after the initial sequence-only round.
REFINEMENT_ROUNDS = range(2, 4)

for alignment_id in tqdm(alignment_ids):
    temp_df = intergenic_df[intergenic_df.Locus == alignment_id]
    if temp_df.empty:
        # Unknown locus: nothing to analyse.
        continue
    utr_row = temp_df.iloc[0]
    utr_seq = str(utr_row['Sequence'])
    utr_start = int(utr_row['Start'])
    utr_end = int(utr_row['End'])
    analysis_directory = r_scape_output_loc + '/' + alignment_id
    wsl_analysis_directory = util.wslname(analysis_directory)
    if len(utr_seq) <= MIN_UTR_LENGTH:
        continue

    os.makedirs(analysis_directory, exist_ok=True)
    util.produce_fasta_file([[alignment_id, utr_seq]], analysis_directory + '/utr_'+ alignment_id + '.fasta')

    # Seed alignment: search the region against the merged genome file.
    _run_in_wsl(wsl_analysis_directory,
                'nhmmer -A align_1.sto --incE 1e-20 ' + 'utr_' + alignment_id + '.fasta ' + wsl_merged_file_loc)

    # Round 1: model built without secondary structure (--noss).
    _run_in_wsl(wsl_analysis_directory,
                INFERNAL_BIN_DIR + 'cmbuild --noss -F cm_1.cm align_1.sto')
    _run_in_wsl(wsl_analysis_directory,
                INFERNAL_BIN_DIR + 'cmsearch  --tblout search_hits_1.txt -A search_1.sto cm_1.cm ' + wsl_merged_file_loc)
    # Give up on this locus when the search alignment does not contain a
    # reference hit overlapping the original region (or was not produced).
    if not utr_in_file(analysis_directory + '/search_1.sto', utr_start, utr_end):
        continue
    best_hit_per_species(analysis_directory, 1)
    _run_in_wsl(wsl_analysis_directory,
                RSCAPE_EXECUTABLE + ' --cacofold --outname rscape_1 search_bh_1.sto')

    # Later rounds: rebuild the model from the previous round's CaCoFold
    # alignment, calibrate it, search again and re-run R-scape.
    for iter_num in REFINEMENT_ROUNDS:
        round_tag = str(iter_num)
        _run_in_wsl(wsl_analysis_directory,
                    INFERNAL_BIN_DIR + 'cmbuild -F cm_' + round_tag + '.cm rscape_' + str(iter_num - 1) + '.cacofold.R2R.sto')
        _run_in_wsl(wsl_analysis_directory,
                    INFERNAL_BIN_DIR + 'cmcalibrate cm_' + round_tag + '.cm')
        _run_in_wsl(wsl_analysis_directory,
                    INFERNAL_BIN_DIR + 'cmsearch --tblout search_hits_' + round_tag + '.txt -A search_' + round_tag + '.sto cm_' + round_tag + '.cm ' + wsl_merged_file_loc)
        if not utr_in_file(analysis_directory + '/search_' + round_tag + '.sto', utr_start, utr_end):
            # Stop refining this locus: the next round would build its model
            # from this round's R-scape output, which is not produced here.
            break
        best_hit_per_species(analysis_directory, iter_num)
        _run_in_wsl(wsl_analysis_directory,
                    RSCAPE_EXECUTABLE + ' --cacofold --outname rscape_' + round_tag + ' search_bh_' + round_tag + '.sto')

  0%|          | 0/1 [00:00<?, ?it/s]


100%|██████████| 1/1 [00:00<?, ?it/s][A
