In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import time
from tqdm.auto import tqdm
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
from scipy.stats import chi2
from IPython import display

# --- Notebook configuration -------------------------------------------------
# num_cores is passed to generate_rprotein_list as the number of subsets the
# genome list is chunked into; core_numbers are the 1-based subset indices.
num_cores = 16
core_numbers = list(range(1, num_cores+1))
# Machine-specific absolute locations: project root and the folder that receives
# the pickles and the per-protein result directories.
project_dir = 'D:/Project_Data/Project_9/SD_Region_Actinobacteria_All_Levels'
output_dir = project_dir + '/Output'
# Alternative (Mycobacteriaceae-only) dataset settings, kept for reference.
#seq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
#tb_species = 'NC_000962.3' 
#tb_genome_filename = 'GCF_000195955.2/genomic.gbff'

# seq_dir holds one sub-directory per genome; each is expected to contain 'genomic.gbff'.
seq_dir = 'F:/Datasets/NCBI_Refseq_Actinobacteria_All_Levels/data'
# Accession string of the reference genome (not referenced by the cells visible here).
tb_species = 'NC_000962.3' 
# NOTE(review): this value starts with '/', and it is joined as seq_dir + '/' + tb_genome_filename,
# so the resulting path contains '//'. Confirm that is intended before changing it.
tb_genome_filename = '/GCF_000195955.2/genomic.gbff'

# Passed as the final argument to blastfn.nhmmer_search_model / blastfn.infernal_search;
# presumably the E-value cut-off for reported hits -- confirm against Blast_Functions.
e_val = 5e-2
# When True, the genome-scanning cell re-parses every genome and rewrites the pickles.
produce_data = True

In [2]:
def reverse_complement(seq_string):
    """Return the reverse complement of an upper-case DNA string.

    Only the characters 'A', 'C', 'G' and 'T' are recognised. If the input
    contains anything else (for example 'N' or lower-case bases) the empty
    string is returned instead of a partial result.
    """
    pairing = {'A':'T','C':'G','G':'C','T':'A'}
    # Reject the whole sequence as soon as one unsupported symbol is present.
    if any(base not in pairing for base in seq_string):
        return ''
    return ''.join(pairing[base] for base in seq_string[::-1])

In [3]:
# Every sub-directory of seq_dir is a candidate genome; keep only those that
# actually contain a GenBank flat file.
file_list_temp = util.list_dirs(seq_dir)
file_list = [
    dir_name
    for dir_name in file_list_temp
    if os.path.exists(seq_dir + '/' + dir_name + '/genomic.gbff')
]
# Report how many directories were found and how many are usable.
print(len(file_list_temp), len(file_list), sep='\n')

1757
1748


In [16]:
def generate_rprotein_list(num_subsets, subset_num, id_list, upstream_length=100):
    """Collect the upstream region of every ribosomal-protein gene in one chunk of genomes.

    The genome directory names in ``id_list`` are split with ``util.chunk_list`` and
    only chunk ``subset_num`` (of ``num_subsets``) is processed, so the function can
    be dispatched once per worker.

    Parameters
    ----------
    num_subsets : int
        Number of chunks ``id_list`` is divided into.
    subset_num : int
        Which chunk this call processes (forwarded to ``util.chunk_list``).
    id_list : list of str
        Genome directory names under ``seq_dir``; each must contain 'genomic.gbff'.
    upstream_length : int, optional
        Number of bases taken immediately upstream of the gene start (default 100).

    Returns
    -------
    list
        One entry per ribosomal-protein feature:
        ``[organism_name, (accession@locus_tag, product), taxonomy,
        (start, end, strand), (utr_coords, upstream_sequence)]``.
        ``utr_coords`` is the nominal window ``(-upstream_length, 0)``. The sequence is
        shorter when the gene lies closer than ``upstream_length`` to the end of the
        record, and is '' when ``reverse_complement`` rejects it.
    """
    ids = util.chunk_list(id_list, num_subsets, subset_num)
    utr_coords = (-upstream_length, 0)
    results = []
    for filename in ids:
        for record in SeqIO.parse(seq_dir + '/' + filename + '/genomic.gbff', "genbank"):
            organism_name = record.annotations['organism']
            taxonomy = record.annotations['taxonomy']
            accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
            full_sequence = str(record.seq)
            features = []
            for feature in record.features:
                qualifiers = feature.qualifiers
                # Only features carrying both a product and a locus tag can be identified later.
                if qualifiers.get("product") is not None and qualifiers.get("locus_tag") is not None:
                    locus_tag = qualifiers.get("locus_tag")[0]
                    accession_locus = accession_ver + '@' + locus_tag
                    product = qualifiers.get("product")[0]
                    features.append([(accession_locus, product), int(feature.location.start), int(feature.location.end), int(feature.location.strand)])
            features.sort(key=lambda x: x[1])
            for identifier, start, end, strand in features:
                product = identifier[1]
                # 'ibosomal' matches both 'ribosomal' and 'Ribosomal'.
                if 'ibosomal' in product and 'protein' in product:
                    if strand == 1:
                        # Clamp at 0: a negative slice start would index from the end of the
                        # record and silently produce an empty upstream sequence.
                        utr_sequence = full_sequence[max(0, start - upstream_length): start]
                    else:
                        # Reverse-strand gene: its upstream region follows the feature end.
                        utr_sequence = reverse_complement(full_sequence[end: end + upstream_length])
                    results.append([organism_name, identifier, taxonomy, (start, end, strand), (utr_coords, utr_sequence)])
    return results

In [17]:
# Re-scan every genome (slow) and cache the results as pickles; skipped when
# produce_data is False so the cached files can simply be loaded instead.
if produce_data:
    # One task per chunk of the genome list; each task returns a list of gene records.
    parallel_output = Parallel(n_jobs=-1)(delayed(generate_rprotein_list)(num_cores, core_number, file_list) for core_number in core_numbers)
    temp = [item for sublist in parallel_output for item in sublist]
    rprotein_dict = {}   # 'accession@locus_tag' -> [product, (start, end, strand), (utr_coords, utr_sequence)]
    name_dict = {}       # accession.version     -> (organism name, taxonomy)
    for organism_name, (accession_locus, product), taxonomy, gene_coords, upstream in temp:
        accession_ver = accession_locus.split('@')[0]
        if accession_ver not in name_dict:
            name_dict[accession_ver] = (organism_name, taxonomy)
        rprotein_dict[accession_locus] = [product, gene_coords, upstream]
    # The output folder may not exist on a fresh checkout; open(..., 'wb') would fail without it.
    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir + '/name_dict.pkl', 'wb') as f:
        pickle.dump(name_dict, f)
    with open(output_dir + '/rprotein_dict.pkl', 'wb') as f:
        pickle.dump(rprotein_dict, f)

In [5]:
# Reload the cached dictionaries written by the data-production cell.
# These pickles are produced by this notebook; do not point this at untrusted files.
with open(output_dir + '/name_dict.pkl', 'rb') as name_handle:
    name_dict = pickle.load(name_handle)
with open(output_dir + '/rprotein_dict.pkl', 'rb') as rprotein_handle:
    rprotein_dict = pickle.load(rprotein_handle)

In [9]:
# Scan the reference genome and record, for every ribosomal-protein feature,
# its short name (last word of the product before any '/') and the 20 bases
# immediately upstream of the gene.
ribosomal_protein_ids = []
ribosomal_protein_info = []
for record in SeqIO.parse(seq_dir + '/' + tb_genome_filename, "genbank"):
    full_sequence = str(record.seq)
    for feature in record.features:
        product_values = feature.qualifiers.get("product")
        if product_values is None:
            continue
        product = product_values[0]
        # 'ibosomal' matches both 'ribosomal' and 'Ribosomal'.
        if not ('ibosomal' in product and 'protein' in product):
            continue
        # Coordinates are only resolved for the features of interest, so an
        # unrelated feature without a defined strand cannot abort the scan.
        start = int(feature.location.start)
        stop = int(feature.location.end)
        strand = int(feature.location.strand)
        if strand == 1:
            # Clamp at 0: a negative slice start would index from the end of the
            # record and silently produce an empty upstream sequence.
            utr_sequence = full_sequence[max(0, start - 20):start]
        else:
            # Reverse-strand gene: its upstream region follows the feature end.
            utr_sequence = reverse_complement(full_sequence[stop: stop + 20])
        protein_label = product.split('/')[0].split(' ')[-1]
        ribosomal_protein_info.append([protein_label, utr_sequence])
        ribosomal_protein_ids.append(protein_label)

In [18]:
# Manual override: replace the list of ribosomal protein names with a single
# entry so that the pipeline loop only processes 'L10'.
ribosomal_protein_ids = ['L10']
# Alternative selections kept for reference (drop 'L10', or skip the first three ids):
#ribosomal_protein_ids.remove('L10')
print(ribosomal_protein_ids)
#ribosomal_protein_ids = ribosomal_protein_ids[3:]
#print(ribosomal_protein_ids)

['L10']


In [21]:
# For each selected ribosomal protein: gather its upstream regions from all
# genomes, align them with mafft, then run the blastfn HMMER / Infernal / R-scape
# wrappers on the result. (What each wrapper does is inferred from its name --
# see Comparative_Analysis.Blast_Functions for the details.)
for rprotein_id in ribosomal_protein_ids:
    temp_results_dir = output_dir + '/' + rprotein_id
    results_dir = temp_results_dir.replace(' ','_')
    wsl_results_dir = util.wslname(results_dir)
    malign_output_filename = 'malign.faa'
    malign_output_sto_filename = 'malign.sto'
    region_file_name = 'all_upstream_regions.faa'

    os.makedirs(results_dir, exist_ok=True)

    # Select products whose first '/'-separated part contains the protein name,
    # but not when the name is immediately followed by another digit: a plain
    # substring test would make 'L1' also collect 'L10', 'L11', ... regions.
    id_pattern = re.compile(re.escape(rprotein_id) + r'(?!\d)')
    sequence_list = [
        [k, v[2][1]]
        for k, v in rprotein_dict.items()
        if id_pattern.search(v[0].split('/')[0])
    ]
    print(len(sequence_list))
    util.produce_fasta_file(sequence_list, results_dir + '/' + region_file_name)

    # mafft does not need root, so it is run directly rather than through sudo with
    # a password embedded in the notebook. The command is passed as one string so
    # that the '>' redirection is interpreted by the shell inside WSL.
    result = subprocess.run('wsl mafft ' + wsl_results_dir + '/' + region_file_name + ' > ' + wsl_results_dir + '/' + malign_output_filename, capture_output=True, text=True)
    if result.returncode != 0:
        # Stop here: continuing would build the HMM from a missing or stale alignment.
        raise RuntimeError('mafft failed for ' + rprotein_id + ' (exit code ' + str(result.returncode) + '): ' + (result.stderr or '')[-2000:])

    blastfn.hmmer_build(results_dir, malign_output_filename, 'hmm.hmm')

    # Three refinement rounds: search the regions with the current model, then
    # rebuild the model from the alignment that search produced.
    # NOTE(review): 'hsummary,txt' (comma) looks like a typo for 'hsummary.txt'; left
    # unchanged because other code may rely on the existing file name.
    for refinement_round in range(3):
        blastfn.nhmmer_search_model(results_dir, 'hmm.hmm', results_dir, region_file_name, 'align_sto', 'output.txt', 'hsummary,txt', e_val)
        blastfn.hmmer_build(results_dir, 'align_sto', 'hmm.hmm')

    # Final search with the refined model; its alignment seeds the covariance model.
    blastfn.nhmmer_search_model(results_dir, 'hmm.hmm', results_dir, region_file_name, 'align.sto', 'output.txt', 'hsummary,txt', e_val)

    blastfn.infernal_build_and_calib(results_dir, 'align.sto' ,'cm_1.cm', False)
    blastfn.infernal_search(results_dir, 'cm_1.cm', results_dir, region_file_name, 'search_1.sto', 'search_hits_1.txt', 'summary_1.txt', e_val)

    blastfn.run_rscape(results_dir, 'search_1.sto', 'rscape_1')

1751


100%|██████████| 1751/1751 [00:00<00:00, 441944.05it/s]


In [None]:
# Sanity check: display the genome directory currently configured.
seq_dir

In [None]:
# Exploratory check on a single genome: for every ribosomal protein L25 gene,
# find the intergenic region between it and its upstream neighbour and, for
# reverse-strand genes, the translation returned by util.Translator.
# NOTE(review): the GenBank path below is an absolute, machine-specific location.
translator = util.Translator()
temp = []
for record in SeqIO.parse('C:/Users/nicho/Downloads/ncbi_dataset/ncbi_dataset/data/GCF_013697105.1/genomic.gbff', "genbank"):
    organism_name = record.annotations['organism']
    taxonomy = record.annotations['taxonomy']
    accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
    organism_accession = organism_name.replace(' ', '_') + '_' + accession_ver
    full_sequence = str(record.seq)
    features = []
    for feature in record.features:
        a = feature.qualifiers
        if a.get("product") is not None and a.get("locus_tag") is not None:
            locus_tag = a.get("locus_tag")[0]
            accession_locus = accession_ver + '@' + locus_tag
            product = a.get("product")[0]
            features.append([(accession_locus, product), int(feature.location.start), int(feature.location.end), int(feature.location.strand)])
    features.sort(key=lambda x: x[1])
    for i, feature in enumerate(features):
        if 'ibosomal' in feature[0][1] and 'protein' in feature[0][1] and 'L25' in feature[0][1]:
            # Defaults used when no clean intergenic region exists (overlapping
            # neighbour, first/last feature, or start >= end).
            utr_coords = (0,0)
            utr_sequence = ''
            utr_length = 0
            # Reset per gene: seq is only computed for reverse-strand genes, and
            # without this reset a forward-strand gene would either raise NameError
            # or silently reuse the value from the previous gene.
            seq = ''
            if feature[1] < feature[2]:
                if feature[3] == 1:
                    # Forward strand: region between the previous feature's end and this start.
                    if i > 0 and feature[1] > features[i-1][2]:
                        utr_coords = (features[i-1][2], feature[1])
                        utr_sequence = full_sequence[features[i-1][2]: feature[1]]
                        utr_length = len(utr_sequence)
                else:
                    # Reverse strand: region between this feature's end and the next feature's start.
                    if i + 1 < len(features) and feature[2] < features[i+1][1]:
                        seq = translator.translate_sequence(full_sequence[feature[1]:feature[2]], -1, 0)
                        utr_coords = (feature[2], features[i+1][1])
                        utr_sequence =  reverse_complement(full_sequence[feature[2]: features[i+1][1]])
                        utr_length = len(utr_sequence)
            temp.append([organism_name, feature[0], taxonomy, (feature[1], feature[2], feature[3]), (utr_coords, seq)])
temp

In [None]:
# Display the current contents of `temp`.
temp