In [168]:
import pandas as pd
pd.options.mode.chained_assignment = None  
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import time
from tqdm.auto import tqdm
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
from scipy.stats import chi2
from IPython import display

# --- Data locations ---------------------------------------------------------
# Project outputs live under project_dir; seq_dir holds one folder per genome.
project_dir = 'D:/Project_Data/Project_9/Mycobacteriaceae'
output_dir = project_dir + '/Output'
seq_dir = 'F:/Datasets/ncbi_dataset/ncbi_dataset/data'
#seq_dir = 'D:/Project_Data/Project_8/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'

# --- Reference (TB) genome --------------------------------------------------
# tb_genome_filename is a path relative to seq_dir.
tb_species = 'NC_000962.3'
tb_genome_filename = 'GCA_000195955.2/genomic.gbff'

# --- Parallelism ------------------------------------------------------------
# The genome list is split into num_cores chunks; core_numbers holds the chunk
# numbers 1..num_cores that are handed to generate_rprotein_list.
num_cores = 16
core_numbers = [chunk_number for chunk_number in range(1, num_cores + 1)]

# --- Run switches -----------------------------------------------------------
# e_val is passed to the nhmmer / Infernal search calls.
# produce_data gates the cell that re-parses every genome and rewrites the pickles.
e_val = 1e-10
produce_data = True

In [169]:
def reverse_complement(seq_string):
    """Return the reverse complement of an upper-case DNA string.

    Only the characters 'A', 'C', 'G' and 'T' are recognised. If the input
    contains any other character (lower-case letters, 'N', gaps, ...) an
    empty string is returned instead of a partial result.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # Reject the whole sequence as soon as one unsupported symbol is present.
    if any(base not in pairing for base in seq_string):
        return ''
    return ''.join(pairing[base] for base in reversed(seq_string))

In [170]:
# List every genome folder under seq_dir, then keep only those that contain
# a 'genomic.gbff.temp' file. Both counts are printed for comparison.
file_list_temp = util.list_dirs(seq_dir)
file_list = []
for genome_dir in file_list_temp:
    if os.path.exists(seq_dir + '/' + genome_dir + '/genomic.gbff.temp'):
        file_list.append(genome_dir)
print(len(file_list_temp))
print(len(file_list))

10669
10669


In [173]:
def generate_rprotein_list(num_subsets, subset_num, id_list):
    """Collect ribosomal-protein features and the region upstream of each.

    Parses ``<seq_dir>/<genome>/genomic.gbff.temp`` for every genome in one
    chunk of ``id_list``. Within each record, all features carrying both a
    ``product`` and a ``locus_tag`` qualifier are sorted by start coordinate.
    For every feature whose product text contains both 'ibosomal' and
    'protein', the gap between it and the adjacent feature on its 5' side is
    extracted (reverse-complemented for features not on strand 1).

    Parameters
    ----------
    num_subsets
        Number of chunks ``id_list`` is split into (passed to ``util.chunk_list``).
    subset_num
        The chunk handled by this call (passed to ``util.chunk_list``).
    id_list
        Genome directory names found under ``seq_dir``.

    Returns
    -------
    list
        One entry per matching feature, shaped as
        ``[organism_name, (accession@locus_tag, product), taxonomy,
        (start, end, strand), (utr_coords, utr_sequence)]``.
        ``utr_coords`` is ``(0, 0)`` and ``utr_sequence`` is ``''`` when the
        feature has no neighbour on the relevant side or overlaps/abuts it.
        On the reverse strand ``utr_sequence`` may also be ``''`` with
        non-zero coordinates, because ``reverse_complement`` returns ``''``
        for sequences containing characters other than A/C/G/T.
    """
    ids = util.chunk_list(id_list, num_subsets, subset_num)
    results = []
    for filename in ids:
        for record in SeqIO.parse(seq_dir + '/' + filename + '/genomic.gbff.temp', "genbank"):
            organism_name = record.annotations['organism']
            taxonomy = record.annotations['taxonomy']
            accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
            full_sequence = str(record.seq)

            # Every annotated feature with a product and a locus tag, ordered
            # by start coordinate so that list neighbours are genomic neighbours.
            features = []
            for feature in record.features:
                product_qualifier = feature.qualifiers.get("product")
                locus_qualifier = feature.qualifiers.get("locus_tag")
                if product_qualifier is not None and locus_qualifier is not None:
                    accession_locus = accession_ver + '@' + locus_qualifier[0]
                    # NOTE(review): int() raises TypeError if the location has no
                    # strand (strand is None) - confirm the inputs never contain
                    # such features.
                    features.append([
                        (accession_locus, product_qualifier[0]),
                        int(feature.location.start),
                        int(feature.location.end),
                        int(feature.location.strand),
                    ])
            features.sort(key=lambda x: x[1])

            for i, feature in enumerate(features):
                identity, start, end, strand = feature
                product = identity[1]
                if 'ibosomal' not in product or 'protein' not in product:
                    continue

                # Defaults used when no upstream gap can be determined.
                utr_coords = (0, 0)
                utr_sequence = ''
                if start < end:
                    if strand == 1:
                        # Forward strand: upstream is the gap after the previous feature.
                        if i > 0 and start > features[i-1][2]:
                            previous_end = features[i-1][2]
                            utr_coords = (previous_end, start)
                            utr_sequence = full_sequence[previous_end:start]
                    else:
                        # Any other strand value is treated as reverse: upstream is
                        # the gap before the next feature, read on the opposite strand.
                        if i + 1 < len(features) and end < features[i+1][1]:
                            next_start = features[i+1][1]
                            utr_coords = (end, next_start)
                            utr_sequence = reverse_complement(full_sequence[end:next_start])
                results.append([organism_name, identity, taxonomy, (start, end, strand), (utr_coords, utr_sequence)])
    return results

In [None]:
if produce_data == True:
    # Parse the genomes in parallel: one generate_rprotein_list call per chunk
    # number, then flatten the per-chunk lists into a single list of records.
    chunk_jobs = (delayed(generate_rprotein_list)(num_cores, core_number, file_list) for core_number in core_numbers)
    parallel_output = Parallel(n_jobs=-1)(chunk_jobs)
    temp = [item for sublist in parallel_output for item in sublist]

    # name_dict:     accession        -> (organism name, taxonomy); first record seen wins.
    # rprotein_dict: accession@locus  -> [product, (start, end, strand), (utr_coords, utr_sequence)]
    rprotein_dict = {}
    name_dict = {}
    for organism, (accession_locus, product), taxonomy, coords, utr in temp:
        accession = accession_locus.split('@')[0]
        if accession not in name_dict:
            name_dict[accession] = (organism, taxonomy)
        rprotein_dict[accession_locus] = [product, coords, utr]

    # Cache both dictionaries so later runs can skip the parsing step.
    with open(output_dir + '/name_dict.pkl', 'wb') as name_file:
        pickle.dump(name_dict, name_file)
    with open(output_dir + '/rprotein_dict.pkl', 'wb') as rprotein_file:
        pickle.dump(rprotein_dict, rprotein_file)

In [133]:
# Reload the dictionaries cached in output_dir by the data-producing cell.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that this project wrote itself.
name_dict_path = output_dir + '/name_dict.pkl'
rprotein_dict_path = output_dir + '/rprotein_dict.pkl'
with open(name_dict_path, 'rb') as pickle_file:
    name_dict = pickle.load(pickle_file)
with open(rprotein_dict_path, 'rb') as pickle_file:
    rprotein_dict = pickle.load(pickle_file)

In [117]:
# Collect an identifier for every ribosomal protein annotated in the TB genome
# file, printing the locus_tag qualifier of each match along the way.
ribosomal_protein_ids = []
for record in SeqIO.parse(seq_dir + '/' + tb_genome_filename, "genbank"):
    for feature in record.features:
        product_qualifier = feature.qualifiers.get("product")
        if product_qualifier is None:
            continue
        product = product_qualifier[0]
        if 'ibosomal' in product and 'protein' in product:
            # Identifier = last space-separated word of the text before the first '/'.
            # NOTE(review): the saved output contains tokens such as 'NusE)' and 'S5)',
            # so this picks up trailing punctuation from some product strings.
            ribosomal_protein_ids.append(product.split('/')[0].split(' ')[-1])
            print(feature.qualifiers.get("locus_tag"))
print(ribosomal_protein_ids)

['Rv0053']
['Rv0055']
['Rv0056']
['Rv0105c']
['Rv0634B']
['Rv0640']
['Rv0641']
['Rv0651']
['Rv0652']
['Rv0682']
['Rv0683']
['Rv0700']
['Rv0701']
['Rv0702']
['Rv0703']
['Rv0704']
['Rv0705']
['Rv0706']
['Rv0707']
['Rv0708']
['Rv0709']
['Rv0710']
['Rv0714']
['Rv0715']
['Rv0716']
['Rv0717']
['Rv0718']
['Rv0719']
['Rv0720']
['Rv0721']
['Rv0722']
['Rv0723']
['Rv0979A']
['Rv0995']
['Rv1015c']
['Rv1298']
['Rv1630']
['Rv1642']
['Rv1643']
['Rv2055c']
['Rv2056c']
['Rv2057c']
['Rv2058c']
['Rv2412']
['Rv2441c']
['Rv2442c']
['Rv2785c']
['Rv2890c']
['Rv2904c']
['Rv2909c']
['Rv3420c']
['Rv3442c']
['Rv3443c']
['Rv3456c']
['Rv3458c']
['Rv3459c']
['Rv3460c']
['Rv3461c']
['Rv3924c']
['RpsF', 'RpsR1', 'RplI', 'RpmB1', 'RpmG2', 'RplK', 'RplA', 'RplJ', 'L7', 'RpsL', 'RpsG', 'NusE)', 'RplC', 'RplD', 'RplW', 'RplB', 'RpsS', 'RplV', 'RpsC', 'RplP', 'RpmC', 'RpsQ', 'RplN', 'RplX', 'RplE', 'RpsN1', 'RpsH', 'RplF', 'RplR', 'RpsE', 'RpmD', 'RplO', 'RpmF', 'S5)', 'RplY', 'RpmE', 'RpsA', 'RpmI', 'RplT', 'RpsR2', 'Rps

In [144]:
# Replace the identifier list collected from the TB genome so that the cell
# which loops over `ribosomal_protein_ids` processes only 'L10'.
ribosomal_protein_ids = ['L10']

In [150]:
# For each ribosomal protein id: write the upstream regions of matching
# Corynebacteriales features to FASTA, align them with MAFFT, build and iterate
# an HMMER model, then run up to three rounds of Infernal search + R-scape.
#
# NOTE(review): the MAFFT command below pipes a plaintext sudo password into
# `sudo -S`. It is left unchanged because it is not visible from here whether
# MAFFT can run without sudo on the target WSL install - please remove the
# credential from the notebook (or drop sudo) once that is confirmed.

max_hits_file_bytes = 1e8   # search-hit files larger than this stop the refinement
num_hmm_iterations = 3      # nhmmer search / hmmbuild refinement passes
num_cm_rounds = 3           # Infernal + R-scape refinement rounds

for rprotein_id in ribosomal_protein_ids:
    temp_results_dir = output_dir + '/' + rprotein_id
    results_dir = temp_results_dir.replace(' ','_')
    wsl_results_dir = util.wslname(results_dir)
    malign_output_filename = 'malign.faa'
    region_file_name = 'all_upstream_regions.faa'

    os.makedirs(results_dir, exist_ok=True)

    # Upstream sequence of every Corynebacteriales feature whose product text
    # (the part before the first '/') contains this protein id.
    sequence_list = [
        [accession_locus, info[2][1]]
        for accession_locus, info in rprotein_dict.items()
        if 'Corynebacteriales' in name_dict[accession_locus.split('@')[0]][1]
        and rprotein_id in info[0].split('/')[0]
    ]
    print (len(sequence_list))
    util.produce_fasta_file(sequence_list, results_dir + '/' + region_file_name)

    # Multiple alignment with MAFFT, run inside WSL.
    mafft_command = ('wsl echo magpie | sudo -S mafft ' + wsl_results_dir + '/' + region_file_name
                     + ' > ' + wsl_results_dir + '/' + malign_output_filename)
    result = subprocess.run(mafft_command, capture_output=True, text=True)
    if result.returncode != 0:
        # Without an alignment none of the later steps can produce anything useful,
        # so report the failure and move on to the next protein.
        print('MAFFT alignment failed for ' + rprotein_id + ' (exit code ' + str(result.returncode) + '):')
        print(result.stderr)
        continue
    #muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'
    #cline = MuscleCommandline(muscle_exe, input=results_dir+'/'+ region_file_name, out=results_dir + '/'+malign_output_filename)
    #result = cline();

    # Build an HMM from the alignment, then alternate search / rebuild a few
    # times before producing the final nhmmer alignment ('align.sto').
    blastfn.hmmer_build(results_dir, malign_output_filename, 'hmm.hmm')
    for hmm_iteration in range(num_hmm_iterations):
        blastfn.nhmmer_search_model(results_dir, 'hmm.hmm', results_dir, region_file_name, 'align_sto', 'output.txt', 'hsummary.txt', e_val)
        blastfn.hmmer_build(results_dir, 'align_sto', 'hmm.hmm')
    blastfn.nhmmer_search_model(results_dir, 'hmm.hmm', results_dir, region_file_name, 'align.sto', 'output.txt', 'hsummary.txt', e_val)

    # Covariance-model refinement. Round 1 is built from the nhmmer alignment;
    # each later round is built from the previous round's R-scape output.
    for cm_round in range(1, num_cm_rounds + 1):
        round_tag = str(cm_round)
        cm_filename = 'cm_' + round_tag + '.cm'
        search_alignment = 'search_' + round_tag + '.sto'
        hits_filename = 'search_hits_' + round_tag + '.txt'

        if cm_round == 1:
            # The first build passes an extra positional False, as in the original
            # code; its meaning is defined by blastfn.infernal_build_and_calib.
            blastfn.infernal_build_and_calib(results_dir, 'align.sto', cm_filename, False)
        else:
            blastfn.infernal_build_and_calib(results_dir, 'rscape_' + str(cm_round - 1) + '.cacofold.R2R.sto', cm_filename)
        blastfn.infernal_search(results_dir, cm_filename, results_dir, region_file_name, search_alignment, hits_filename, 'summary_' + round_tag + '.txt', e_val)

        # Stop refining this protein if the search produced no hit file or an
        # excessively large one.
        hits_path = results_dir + '/' + hits_filename
        if not os.path.exists(hits_path) or os.path.getsize(hits_path) > max_hits_file_bytes:
            break
        blastfn.run_rscape(results_dir, search_alignment, 'rscape_' + round_tag)

    #Final

    #blastfn.infernal_build_and_calib(output_dir, 'rscape_3.cacofold.R2R.sto' ,'cm_4.cm')

4892


100%|██████████| 4892/4892 [00:00<00:00, 206073.53it/s]


In [103]:
# Inspect the L25 ribosomal-protein feature(s) of a single genome: record the
# feature coordinates, the upstream gap coordinates and, for reverse-strand
# features with an upstream gap, the translated protein sequence.
translator = util.Translator()
temp = []
for record in SeqIO.parse('C:/Users/nicho/Downloads/ncbi_dataset/ncbi_dataset/data/GCF_013697105.1/genomic.gbff', "genbank"):
    organism_name = record.annotations['organism']
    taxonomy = record.annotations['taxonomy']
    accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
    full_sequence = str(record.seq)

    # Features with both a product and a locus tag, ordered by start coordinate.
    features = []
    for feature in record.features:
        a = feature.qualifiers
        if a.get("product") is not None and a.get("locus_tag") is not None:
            locus_tag = a.get("locus_tag")[0]
            accession_locus = accession_ver + '@' + locus_tag
            product = a.get("product")[0]
            features.append([(accession_locus, product), int(feature.location.start), int(feature.location.end), int(feature.location.strand)])
    features.sort(key=lambda x: x[1])

    for i, feature in enumerate(features):
        if 'ibosomal' in feature[0][1] and 'protein' in feature[0][1] and 'L25' in feature[0][1]:
            # Reset per feature. Previously `seq` was only assigned in the
            # reverse-strand branch, so other features raised NameError or
            # silently reused the translation of an earlier feature.
            utr_coords = (0, 0)
            utr_sequence = ''
            seq = ''
            if feature[1] < feature[2]:
                if feature[3] == 1:
                    if i > 0 and feature[1] > features[i-1][2]:
                        utr_coords = (features[i-1][2], feature[1])
                        utr_sequence = full_sequence[features[i-1][2]: feature[1]]
                else:
                    if i + 1 < len(features) and feature[2] < features[i+1][1]:
                        # NOTE(review): arguments are presumably (sequence, strand, frame);
                        # confirm against util.Translator.translate_sequence.
                        seq = translator.translate_sequence(full_sequence[feature[1]:feature[2]], -1, 0)
                        utr_coords = (feature[2], features[i+1][1])
                        utr_sequence = reverse_complement(full_sequence[feature[2]: features[i+1][1]])
            # The translated sequence (not utr_sequence) is stored alongside the coordinates.
            temp.append([organism_name, feature[0], taxonomy, (feature[1], feature[2], feature[3]), (utr_coords, seq)])
temp


[['Mycobacterium canettii',
  ('NZ_JACEGU010000017.1@H1J40_RS05720',
   '50S ribosomal protein L25/general stress protein Ctc'),
  ['Bacteria',
   'Actinobacteria',
   'Corynebacteriales',
   'Mycobacteriaceae',
   'Mycobacterium',
   'Mycobacterium tuberculosis complex'],
  (37447, 38095, -1),
  ((38095, 38311),
   'MAKSASNQLRVTVRTETGKGASRRARRAGKIPAVLYGHGAEPQHLELPGHDYAAVLRHSGTNAVLTLDIAGKEQLALTKALHIHPIRRTIQHADLLVVRRGEKVVVEVSVVVEGQAGPDTLVTQETNSIEIEAEALSIPEQLTVSIEGAEPGTQLTAGQIALPAGVSLISDPDLLVVNVVKAPTAEELEGEVAGAEEAEEAAVEAGEAEAAGESE*')]]

In [98]:
# Display the records currently held in `temp`.
# NOTE(review): the saved output of this cell lists S10/S19/S3 records with
# nucleotide sequences, which the L25-only cell above cannot produce, and this
# cell's execution count (98) precedes that cell's (103) - the output appears
# to come from an earlier state of the notebook and will differ on a fresh run.
temp

[['Mycobacterium canettii',
  ('NZ_JACEGU010000011.1@H1J40_RS04060', '30S ribosomal protein S10'),
  ['Bacteria',
   'Actinobacteria',
   'Corynebacteriales',
   'Mycobacteriaceae',
   'Mycobacterium',
   'Mycobacterium tuberculosis complex'],
  (740, 1046, 1),
  ((0, 0), '')],
 ['Mycobacterium canettii',
  ('NZ_JACEGU010000011.1@H1J40_RS04085', '30S ribosomal protein S19'),
  ['Bacteria',
   'Actinobacteria',
   'Corynebacteriales',
   'Mycobacteriaceae',
   'Mycobacterium',
   'Mycobacterium tuberculosis complex'],
  (3818, 4100, 1),
  ((3778, 3818), 'CCGCGCAATCAGATCTAGGGAGTTTCAGGAGTAGCCAACC')],
 ['Mycobacterium canettii',
  ('NZ_JACEGU010000011.1@H1J40_RS04095', '30S ribosomal protein S3'),
  ['Bacteria',
   'Actinobacteria',
   'Corynebacteriales',
   'Mycobacteriaceae',
   'Mycobacterium',
   'Mycobacterium tuberculosis complex'],
  (4722, 5514, 1),
  ((4690, 4722), 'TGGGCCAGAAGATCAATCCGCACGGCTTCCGG')],
 ['Mycobacterium canettii',
  ('NZ_JACEGU010000011.1@H1J40_RS04110', '30S ribo