In [17]:
import pandas as pd
pd.options.mode.chained_assignment = None  
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import time
from tqdm.auto import tqdm
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
from scipy.stats import chi2
from IPython import display

# Parallelism settings: core_numbers runs 1..num_cores and each value is handed
# to generate_rprotein_list as the subset number of the file list to process.
num_cores = 16
core_numbers = [core_index + 1 for core_index in range(num_cores)]

# Directory layout for this project's outputs and for the genome files to parse.
project_dir = 'D:/Project_Data/Project_9'
output_dir = f'{project_dir}/Output'
results_dir = output_dir
seq_dir = 'D:/Prokaryotic_Ref_Rep_Sequences/ncbi_genomes_20220815'
#seq_dir = 'D:/Project_Data/Project_8/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'

tb_species = 'NC_000962.3'

# When True the genome files are re-parsed and the pickled dictionaries are
# rewritten; when False only the previously saved pickles are loaded.
produce_data = False

In [18]:
def reverse_complement(seq_string):
    """Return the reverse complement of an upper-case DNA string.

    Only the characters A, C, G and T are recognised. If any other character
    (ambiguity codes, lower-case letters, gaps, ...) occurs anywhere in
    ``seq_string``, the empty string is returned instead of a partial result.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    try:
        # Walk the input back-to-front, swapping each base for its partner.
        return ''.join(pairing[base] for base in reversed(seq_string))
    except KeyError:
        # An unrecognised character invalidates the whole sequence.
        return ''

In [19]:
# Names of the genome files under seq_dir; each entry is later joined to seq_dir
# and parsed as GenBank (presumably one file per genome - confirm against util.list_files).
file_list = util.list_files(seq_dir)

In [20]:
def generate_rprotein_list(num_subsets, subset_num, id_list, upstream_length=300):
    """Collect ribosomal-protein features and their upstream sequence for one chunk of genome files.

    The files handled by this call are selected with
    ``util.chunk_list(id_list, num_subsets, subset_num)``, so the function can be
    dispatched once per worker over the same ``id_list``.

    Parameters
    ----------
    num_subsets : int
        Number of chunks ``id_list`` is divided into.
    subset_num : int
        Which chunk this call processes.
    id_list : list of str
        GenBank file names, relative to the module-level ``seq_dir``.
    upstream_length : int, optional
        Maximum number of bases taken immediately upstream of each feature
        (default 300). The window is truncated at the ends of the record.

    Returns
    -------
    list of list
        One row per non-``gene`` feature whose ``product`` qualifier contains
        ``'ibosomal protein'``:
        ``[organism_name, accession_locus, product, start, end, strand, sequence, taxonomy]``.
        ``accession_locus`` is ``'<accession>.<version>@<locus_tag>'``. ``sequence``
        is the upstream window read in the feature's own orientation; for
        non-forward features it is produced by ``reverse_complement`` and is
        therefore ``''`` when the window contains a character other than A/C/G/T.

    Notes
    -----
    Features lacking a ``locus_tag`` or a defined strand are skipped: without
    them neither the row key nor the upstream direction can be determined, and
    previously such a feature aborted the whole chunk with a ``TypeError``.
    """
    ids = util.chunk_list(id_list, num_subsets, subset_num)
    temp = []
    for filename in ids:
        for record in SeqIO.parse(seq_dir + '/' + filename, "genbank"):
            organism_name = record.annotations['organism']
            taxonomy = record.annotations['taxonomy']
            accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
            full_sequence = str(record.seq)
            for feature in record.features:
                # 'gene' features duplicate the CDS coordinates and carry no product.
                if feature.type == 'gene':
                    continue
                qualifiers = feature.qualifiers
                products = qualifiers.get("product")
                if not products or 'ibosomal protein' not in products[0]:
                    continue
                locus_tags = qualifiers.get("locus_tag")
                raw_strand = feature.location.strand
                if not locus_tags or raw_strand is None:
                    # Cannot build the key or orient the upstream window.
                    continue
                product = products[0]
                accession_locus = accession_ver + '@' + locus_tags[0]
                start = int(feature.location.start)
                end = int(feature.location.end)
                strand = int(raw_strand)
                if strand == 1:
                    # Forward strand: upstream lies before the start coordinate.
                    sequence = full_sequence[max(start - upstream_length, 0): start]
                else:
                    # Otherwise upstream lies after the end coordinate on the
                    # forward strand and must be reverse-complemented.
                    sequence = reverse_complement(full_sequence[end: min(end + upstream_length, len(full_sequence))])

                temp.append([organism_name, accession_locus, product, start, end, strand, sequence, taxonomy])
    return temp

In [21]:
# Regenerate the cached dictionaries from the genome files (only when requested).
if produce_data:
    # One task per chunk of the file list; each task returns a list of feature rows.
    chunk_results = Parallel(n_jobs=-1)(
        delayed(generate_rprotein_list)(num_cores, core_number, file_list)
        for core_number in core_numbers
    )
    rprotein_dict = {}
    name_dict = {}
    for chunk_rows in chunk_results:
        for row in chunk_rows:
            accession = row[1].split('@')[0]
            # First row seen for an accession supplies its (organism, taxonomy) pair.
            name_dict.setdefault(accession, (row[0], row[7]))
            # Keyed by accession@locus_tag: [product, start, end, strand, sequence].
            rprotein_dict[row[1]] = row[2:7]
    for pickle_name, payload in (('/name_dict.pkl', name_dict), ('/rprotein_dict.pkl', rprotein_dict)):
        with open(output_dir + pickle_name, 'wb') as f:
            pickle.dump(payload, f)

In [22]:
# Load the cached dictionaries written by the data-production cell.
name_dict_path = output_dir + '/name_dict.pkl'
with open(name_dict_path, 'rb') as name_handle:
    name_dict = pickle.load(name_handle)

rprotein_dict_path = output_dir + '/rprotein_dict.pkl'
with open(rprotein_dict_path, 'rb') as rprotein_handle:
    rprotein_dict = pickle.load(rprotein_handle)

In [24]:
# Gather the upstream regions of features whose product mentions 'L10', tallying
# how many such features each phylum contributes. Only Firmicutes entries are
# written to the FASTA file; the tally covers every phylum.
sequence_list = []
phyla_count_dict = {}
for accession_locus, details in rprotein_dict.items():
    if 'L10' not in details[0]:
        continue
    taxonomy_info = name_dict[accession_locus.split('@')[0]][1]
    # Lineages with fewer than three ranks are neither counted nor exported.
    if len(taxonomy_info) <= 2:
        continue
    phylum = taxonomy_info[1]
    if phylum == 'Firmicutes':
        # details[4] is the stored upstream sequence for this feature.
        sequence_list.append([accession_locus, details[4]])
    phyla_count_dict[phylum] = phyla_count_dict.get(phylum, 0) + 1
print(phyla_count_dict)
region_file_name = 'all_upstream_regions.faa'
util.produce_fasta_file(sequence_list, results_dir + '/' + region_file_name)

{'Actinobacteria': 3254, 'Chlamydiae': 31, 'Proteobacteria': 6481, 'Chlorobi': 16, 'Euryarchaeota': 434, 'Firmicutes': 2851, 'Crenarchaeota': 58, 'Cyanobacteria': 175, 'Spirochaetes': 158, 'Candidatus Thermoplasmatota': 14, 'Tenericutes': 192, 'Aquificae': 22, 'Gemmatimonadetes': 5, 'Bacteroidetes': 1821, 'Deferribacteres': 9, 'Acidobacteria': 38, 'Thermotogae': 43, 'Chloroflexi': 55, 'Thaumarchaeota': 25, 'Verrucomicrobia': 53, 'Elusimicrobia': 3, 'Deinococcus-Thermus': 101, 'Coprothermobacterota': 2, 'Dictyoglomi': 2, 'Nitrospirae': 16, 'Fusobacteria': 39, 'Synergistetes': 23, 'Planctomycetes': 117, 'Candidatus Cloacimonetes': 2, 'Fibrobacteres': 5, 'Lentisphaerae': 3, 'Chrysiogenetes': 3, 'Thermodesulfobacteria': 12, 'Ignavibacteriae': 2, 'Caldiserica/Cryosericota group': 2, 'Nitrospinae/Tectomicrobia group': 1, 'Balneolaeota': 15, 'Candidatus Saccharibacteria': 5, 'Rhodothermaeota': 5, 'Armatimonadetes': 4, 'Candidatus Dependentiae': 1, 'Kiritimatiellaeota': 4, 'Candidatus Kryptoni

100%|██████████| 2851/2851 [00:00<00:00, 114813.69it/s]


In [25]:
# E-value threshold passed to the nhmmer and Infernal search calls in the later cells.
e_val = 1e-10

In [26]:
# Align the exported upstream regions with MAFFT, run through WSL; the alignment
# is redirected into mafft_output_filename inside the output directory.
wsl_output_dir = util.wslname(output_dir)
mafft_output_filename = 'mafft.faa'
# NOTE(review): the sudo password is hard-coded in this command line. It should
# come from the environment or a prompt, or sudo should be dropped if mafft does
# not need it - confirm on the target machine before changing.
result = subprocess.run('wsl echo magpie | sudo -S mafft ' + wsl_output_dir + '/' + region_file_name + ' > ' + wsl_output_dir + '/' + mafft_output_filename, capture_output=True, text=True)
# The alignment goes to a file, so a failure would otherwise pass unnoticed and
# leave an empty or partial alignment for the HMM build in the next cell.
if result.returncode != 0:
    raise RuntimeError('mafft alignment failed (exit code ' + str(result.returncode) + '):\n' + result.stderr)

In [27]:
# Build the starting profile 'hmm.hmm' from the MAFFT alignment file
# (presumably arguments are directory, input alignment, output HMM - confirm in Blast_Functions).
blastfn.hmmer_build(output_dir, mafft_output_filename, 'hmm.hmm')

In [28]:
# Iterative refinement: search the upstream regions with the current HMM, then
# rebuild the HMM from the alignment that search wrote. After three rounds a
# final search writes 'align.sto', which the Infernal cell consumes.
# NOTE(review): 'hsummary,txt' has a comma where a dot was probably intended;
# kept as-is because other code may read that exact file name.
hmm_name = 'hmm.hmm'
working_alignment = 'align_sto'
refinement_rounds = 3
for refinement_round in range(refinement_rounds):
    blastfn.nhmmer_search_model(output_dir, hmm_name, output_dir, region_file_name, working_alignment, 'output.txt', 'hsummary,txt', e_val)
    blastfn.hmmer_build(output_dir, working_alignment, hmm_name)
blastfn.nhmmer_search_model(output_dir, hmm_name, output_dir, region_file_name, 'align.sto', 'output.txt', 'hsummary,txt', e_val)

In [29]:
# Three rounds of covariance-model refinement. Each round searches the upstream
# regions with the current model, runs R-scape on the hits, and builds the next
# model from R-scape's CaCoFold alignment; the last round therefore yields cm_4.cm.
# The pipeline stops early if a round's hits file grows beyond 1e8 bytes.

# The first model is built from the final nhmmer alignment (note the extra False argument).
blastfn.infernal_build_and_calib(output_dir, 'align.sto', 'cm_1.cm', False)

for round_number in (1, 2, 3):
    tag = str(round_number)
    next_tag = str(round_number + 1)
    hits_file = 'search_hits_' + tag + '.txt'

    blastfn.infernal_search(output_dir, 'cm_' + tag + '.cm', output_dir, region_file_name, 'search_' + tag + '.sto', hits_file, 'summary_' + tag + '.txt', e_val)
    if os.path.getsize(output_dir + '/' + hits_file) > 1e8:
        # Too many hits to continue: abandon the remaining rounds.
        break
    blastfn.run_rscape(output_dir, 'search_' + tag + '.sto', 'rscape_' + tag)

    # Model for the next round (or the final model after round 3).
    blastfn.infernal_build_and_calib(output_dir, 'rscape_' + tag + '.cacofold.R2R.sto', 'cm_' + next_tag + '.cm')

In [9]:
# tblastx of 'all_regions.faa' against the ref_prok_rep_genomes database at an e-value of 1e-10.
# NOTE(review): this cell's execution count (9) predates the cells above, and it
# reads 'all_regions.faa' rather than the 'all_upstream_regions.faa' written
# earlier in this notebook - confirm the intended input before a Run All.
blastfn.run_tblastx('D:/BLAST/ref_prok_rep_genomes', 'all_regions.faa', 'ref_prok_rep_genomes', e_value = 1e-10)