In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil

In [2]:
full_run = True

In [3]:
project_dir = 'D:/Project_Data/Project_8'
datasets_dir = project_dir + '/Datasets'
output_dir = project_dir + '/Output'
seq_dir = 'D:/Actinobacteria_Ref_Rep_Lev_Complete'
blast_dir = 'D:/BLAST/actinobacteria_ref_rep_comp'
blast_db_name = 'actinobacteria_ref_rep_comp'
num_cores = 16
core_numbers = list(range(1, num_cores+1))
reference_species_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'

In [4]:
species_list = util.list_files(seq_dir)

In [None]:
if full_run == True:
    reference_list = []
    for species in tqdm(species_list):
        for genome_record in SeqIO.parse(seq_dir + '/' + species, "genbank"):
            feature_info = []
            organism_name = genome_record.annotations['organism']
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
            organism_accession = organism_name.replace(' ', '_') + '_' + accession_ver
            reference_list.append([organism_accession, str(genome_record.seq)])
    util.produce_fasta_file(reference_list, seq_dir + '/all_actinobacteria_ref_rep_comp.faa')    

In [None]:
if full_run == True:
    w_d = os.getcwd()
    os.chdir("D:/")
    subprocess.run('cd '+ seq_dir + ' &  makeblastdb -in all_actinobacteria_ref_rep_comp.faa -dbtype nucl -out ' + blast_db_name, shell=True, capture_output = True)
    os.chdir(w_d)
    files_to_move = [x for x in util.list_files(seq_dir) if x[:-4] == blast_db_name]
    for file in files_to_move:
        source = seq_dir + '/' + file
        destination = blast_dir + '/' + file
        shutil.move(source, destination)

In [None]:
def generate_sequence_info(num_subsets, subset_num, species_master_list):
    sequence_info = []
    sequence_list = util.chunk_list(species_master_list, num_subsets, subset_num)
    for sequence in sequence_list:
        for genome_record in SeqIO.parse(seq_dir + '/' + sequence, "genbank"):
            feature_info = []
            organism_name = genome_record.annotations['organism']
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
            organism_accession = organism_name.replace(' ', '_') + '_' + accession_ver
            locus_tag = ''
            for feature in genome_record.features:
                if locus_tag == '':
                    previous_locus_tag = ''
                    previous_end = 0
                else:
                    previous_locus_tag = locus_tag
                    previous_end = end
                a = feature.qualifiers
                
                if feature.type != 'gene' and not(feature.type == 'source'):
                    if a.get("locus_tag") != None:
                        locus_tag = a.get("locus_tag")[0]
                    else:
                        locus_tag = 'None'
                    feature_type = feature.type
                    start = int(feature.location.start)
                    end = int(feature.location.end)
                    strand = int(feature.location.strand)
                    if start > previous_end:
                        feature_info.append([(previous_locus_tag, locus_tag),'inter_feature', previous_end, start, 0])
                    feature_info.append([locus_tag, feature_type, start, end, strand])
            sequence_info.append((organism_accession, feature_info))
    return sequence_info

In [None]:
if full_run == True:
    sequence_info_dict = {}
    parallel_output = Parallel(n_jobs=-1)(delayed(generate_sequence_info)(num_cores, core_number, species_list) for core_number in core_numbers)
    for core_output in parallel_output:
        for (accession_ver, feature_info) in core_output:
            sequence_info_dict[accession_ver] = feature_info
    with open(output_dir + '/' + 'sequence_info_dict.pkl', 'wb') as f:
        pickle.dump(sequence_info_dict, f)

In [None]:
if not(full_run == True):
    with open(output_dir + '/' + 'sequence_info_dict.pkl', 'rb') as f:
        sequence_info_dict = pickle.load(f) 

In [81]:
output = []
features = []
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
full_sequence = str(genome_record.seq)
mycobrowser_df = pd.read_excel(datasets_dir+'/Mycobrowser_Release_4.xlsx')
for i, r in mycobrowser_df.iterrows():
    if r['Strand'] == '+':
        strand = 1
    else:
        strand = -1
    features.append([r['Locus'],r['Start']-1, r['Stop'], strand])
features.sort(key=lambda x: x[1])
feature_info = []
for i, feature in enumerate(features):
    feature_sequence = full_sequence[feature[1]: feature[2]]
    #feature_info.append([feature[0], feature[1], feature[2], feature_sequence, len(feature_sequence)])
    if feature[1] < feature[2]:  
        if (i + 1)< len(features) and feature[2] < features[i+1][1]:
            utr_coords = (feature[2], features[i+1][1])
            utr_sequence = full_sequence[feature[2]: features[i+1][1]]
            utr_length = len(utr_sequence)
        else:
            utr_coords = (0,0)
            utr_sequence = ''
            utr_length = 0
        if utr_length > 50:
            feature_info.append([feature[0] + '_IG', utr_coords[0], utr_coords[1], utr_sequence, utr_length])
intergenic_df = pd.DataFrame(feature_info, columns = ['Locus', 'Start' , 'End', 'Sequence', 'Length'])
intergenic_df.to_csv(output_dir + '/all_regions_df.csv')
sequence_list = []
records = 0
for i, r in intergenic_df.iterrows():
    if r['Locus'] == 'Rv0052_IG':
        sequence_list.append([r['Locus'], r['Sequence']])
util.produce_fasta_file(sequence_list, blast_dir + '/all_regions.faa')

100%|██████████| 1/1 [00:00<?, ?it/s]


In [None]:
if full_run == True:
    w_d = os.getcwd()
    os.chdir("D:/")
    subprocess.run('cd ' + blast_dir + ' & blastn -query all_regions.faa -db ' + blast_db_name + ' -out blast_hits.csv -evalue 1e-10 -outfmt  "10 qaccver saccver staxid staxids ssciname qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore" -num_threads 16', shell=True, capture_output = True)
    os.chdir(w_d)

In [None]:
blast_results = pd.read_csv(blast_dir + '/blast_hits.csv', header = None)
blast_results.columns = ['query_accession_ver', 'subject_accession_ver', 'subject_tax_id', 'subject_taxids', 'subject_name', 'query_length', 'subject_length', 'percent_identical_matches','alignment_length', 'number_mismatches', 'number_of_gap_openings', 'query_start_alignment', 'query_end_alignment', 'subject_start_alignment', 'subject_end_alignment', 'e_value', 'bit_score']

In [None]:
def find_annotations(accession_ver, start, stop, feature_dict):
    if start > stop:
        (start, stop) = (stop, start)
    annotations_found = []
    feature_list = feature_dict[accession_ver]
    max_pct_cover = -1
    max_feature = []
    for feature in feature_list:
        if feature[2] < (stop - 1) and feature[3] >= (start - 1):
            len_feature = feature[3] - feature[2]
            pct_cover = (min(feature[3], stop) - max(feature[2], start)) / (stop - start)
            if pct_cover > 0.05 and len_feature < 100000:
                if pct_cover > max_pct_cover:
                    max_feature = feature
                    max_pct_cover = pct_cover
                annotations_found.append([feature, pct_cover])
    return (annotations_found, [max_feature, max_pct_cover])

In [None]:
#blast_results = blast_results[0:30000]

In [None]:
ortholog_hits_dict = {}
for i, r in tqdm(blast_results.iterrows(), total=blast_results.shape[0]):
        annotations = find_annotations(r['subject_accession_ver'], r['subject_start_alignment'], r['subject_end_alignment'], sequence_info_dict)
        blast_results.at[i,'features'] = str(annotations[0])
        blast_results.at[i,'closest_cover_feature'] = str(annotations[1])
        if annotations[1] == [[],-1]:
            blast_results.at[i,'closest_locus'] = []
        elif annotations[1][0][1] == 'inter_feature':
            blast_results.at[i,'closest_locus'] = [annotations[1][0][0][0], annotations[1][0][0][1]]   
        else:
            blast_results.at[i,'closest_locus'] = [annotations[1][0][0]]
            if annotations[1][0][0] in ortholog_hits_dict:
                ortholog_hits_dict[annotations[1][0][0]].append(r['query_accession_ver'])
            else:
                ortholog_hits_dict[annotations[1][0][0]] = [r['query_accession_ver']]

In [None]:
for i, r in tqdm(blast_results.iterrows(), total=blast_results.shape[0]):
    closest_locus = r['closest_locus']
    if len(closest_locus) == 2:
        if closest_locus[0] in ortholog_hits_dict:
            tb_1 = ortholog_hits_dict[closest_locus[0]]
        else:
            tb_1 == ''
        if closest_locus[1] in ortholog_hits_dict:
            tb_2 = ortholog_hits_dict[closest_locus[1]]
        else:
            tb_2 == '' 
        blast_results.at[i,'tb_closest_locus'] = [tb_1, tb_2]

In [None]:
blast_results.to_csv(output_dir + '/blast_results.csv')

In [49]:
all_regions_file = blast_dir + '/all_regions.faa'
merged_file = seq_dir + '/all_actinobacteria_ref_rep_comp.faa'
wsl_merged_file = util.wslname(merged_file)
wsl_all_regions_file = util.wslname(all_regions_file)
wsl_output_loc = util.wslname(output_dir)
#subprocess.run('wsl cd ' + wsl_output_loc + ' ; nhmmer -A align.sto -o hmmer.txt --tblout summary.txt --notextw --cpu 16 --incE 1e-10 '+  wsl_all_regions_file + ' ' + wsl_merged_file, shell=True)

In [70]:
def align_keep_top_hit_per_species(output_dir, hit_file, alignment_file, output_alignment_file, package, evalue = 0.01): 
    wsl_output_loc = util.wslname(output_dir)
    dict = {}
    if package == 'INFERNAL':
        with open(output_dir + '/' + hit_file, 'r') as f:
            for l in f:
                if not(l[0] == '#'): 
                    a = l.split()
                    if a[16] == '!':
                        if a[0] in dict:
                            if float(a[15]) < dict[a[0]][1]:
                                dict[a[0]] = (a[7]+'-'+a[8],float(a[15]))
                        else:
                            dict[a[0]] = (a[7]+'-'+a[8],float(a[15]))
    elif package == 'HMMER':
        with open(output_dir + '/' + hit_file, 'r') as f:
            for l in f:
                if not(l[0] == '#'): 
                    a = l.split()
                    if float(a[12]) <= evalue:
                        if a[0] in dict:
                            if float(a[12]) < dict[a[0]][1]:
                                dict[a[0]] = (a[6]+'-'+a[7],float(a[12]))
                        else:
                            dict[a[0]] = (a[6]+'-'+a[7],float(a[12]))
    else:
        pass
    with open(output_dir + '/keep_list.txt', 'w') as f:
        lines = []
        for k, v in dict.items():
            lines.append(k + '/' + v[0] + "\n")
        f.write(''.join(lines))
    subprocess.run('wsl cd ' + wsl_output_loc + ' ; esl-alimanip -o '+output_alignment_file + ' --seq-r keep_list.txt '+ alignment_file, shell=True)

In [71]:
align_keep_top_hit_per_species(output_dir, 'rcmhits_4.txt', 'rcm_align_4.sto' ,'outtest.sto', 'INFERNAL')
align_keep_top_hit_per_species(output_dir, 'summary.txt', 'align_4.sto' ,'outtest2.sto', 'HMMER', 1e-10)

In [80]:
hmmer_evalue = 1e-10
infernal_evalue = 1e-10
subprocess.run('wsl cd ' + wsl_output_loc + ' ; nhmmer -A align_0.sto -o hmmer_0.txt --tblout summary.txt --notextw --cpu 16 --incE 1e-10 /mnt/d/BLAST/actinobacteria_ref_rep_comp/all_regions.faa /mnt/d/Actinobacteria_Ref_Rep_Lev_Complete/all_actinobacteria_ref_rep_comp.faa', shell=True)
print ('HMMER')

for i in range(1, 5):
    print(i)
    align_keep_top_hit_per_species(output_dir, 'summary.txt', 'align_'+str(i-1)+'.sto', 'top_hit_align_'+str(i-1)+'.sto', 'HMMER', hmmer_evalue)
    subprocess.run('wsl cd ' + wsl_output_loc + ' ; hmmbuild --cpu 16 hmm_'+str(i)+'.hmm top_hit_align_'+str(i-1)+'.sto', shell=True)
    subprocess.run('wsl cd ' + wsl_output_loc + ' ; nhmmer -A align_'+str(i)+'.sto -o hmmer.txt --tblout summary.txt --notextw --cpu 16 --incE ' + str(hmmer_evalue) +' hmm_'+str(i)+'.hmm /mnt/d/Actinobacteria_Ref_Rep_Lev_Complete/all_actinobacteria_ref_rep_comp.faa', shell=True)
print('INFERNAL')
for i in range(1, 5):
    print(i)
    if i == 1:
        align_keep_top_hit_per_species(output_dir, 'summary.txt', 'align_4.sto', 'top_hit_align_4.sto', 'HMMER', hmmer_evalue)
        subprocess.run('wsl cd ' + wsl_output_loc + ' ; ~/rscape_v2.0.0.g/bin/R-scape  --cacofold --outname struc_'+str(i)+' top_hit_align_4.sto ', shell=True)
  
    subprocess.run('wsl cd ' + wsl_output_loc + ' ; cmbuild -F rcm_'+str(i)+'.cm struc_'+str(i)+'.cacofold.sto', shell=True)
    subprocess.run('wsl cd ' + wsl_output_loc + ' ;  cmcalibrate rcm_'+str(i)+'.cm', shell=True)
    subprocess.run('wsl cd ' + wsl_output_loc + ' ;  cmsearch --tblout rcmhits_'+str(i)+'.txt -A rcm_align_'+str(i)+'.sto --cpu 16 --incE ' + str(infernal_evalue) + ' rcm_'+str(i)+'.cm /mnt/d/Actinobacteria_Ref_Rep_Lev_Complete/all_actinobacteria_ref_rep_comp.faa', shell=True)
    align_keep_top_hit_per_species(output_dir, 'rcmhits_'+str(i)+'.txt', 'rcm_align_'+str(i)+'.sto', 'top_hit_rcm_align_'+str(i)+'.sto', 'INFERNAL')
    subprocess.run('wsl cd ' + wsl_output_loc + ' ; ~/rscape_v2.0.0.g/bin/R-scape --cacofold --outname struc_'+str(i + 1)+' top_hit_rcm_align_'+str(i)+'.sto ', shell=True)

HMMER
1


FileNotFoundError: [Errno 2] No such file or directory: 'D:/Project_Data/Project_8/Output/summary.txt'

In [57]:
dict = {}
with open(output_dir + '/' + 'summary.txt', 'r') as f:
    for l in f:
        if not(l[0] == '#'): 
            a = l.split()
            print(a[0], a[12], a[6], a[7])
        #    if a[16] == '!':
        #        if a[0] in dict:
        #            if float(a[15]) < dict[a[0]][1]:
        #                dict[a[0]] = (a[7]+'-'+a[8],float(a[15]))
        #        else:
        #            dict[a[0]] = (a[7]+'-'+a[8],float(a[15]))
#with open(output_dir + '/keep_list.txt', 'w') as f:
#    lines = []
#    for k, v in dict.items():
#        lines.append(k + '/' + v[0] + "\n")
#    f.write(''.join(lines))

Mycobacterium_canettii_CIPT_140010059_NC_015848.1 3.7e-121 1279060 1279619
Mycobacterium_tuberculosis_H37Rv_NC_000962.3 4.3e-120 1261351 1261910
Mycobacterium_tuberculosis_variant_bovis_AF2122/97_LT708304.1 4.3e-120 1262750 1263309
Mycobacterium_tuberculosis_variant_bovis_BCG_str._Pasteur_1173P2_AM408590.1 4.3e-120 1292766 1293325
Mycobacterium_shinjukuense_NZ_AP022575.1 3.4e-108 373878 374464
Mycobacterium_shottsii_NZ_AP022572.1 2e-104 1641123 1641612
Mycobacterium_marinum_NZ_CP058277.1 5.9e-104 165016 164527
Mycobacterium_pseudoshottsii_JCM_15466_NZ_AP018410.1 2e-103 4885404 4884915
Mycobacterium_ulcerans_NZ_CP085200.1 9e-103 129360 128871
Mycobacterium_noviomagense_NZ_AP022583.1 7.7e-101 663992 663541
Mycobacterium_conspicuum_NZ_AP022613.1 2.2e-100 4529374 4528931
Mycobacterium_mantenii_NZ_AP022590.1 5.4e-100 3128711 3128251
Mycobacterium_malmoense_NZ_CP080999.1 1.2e-99 4111384 4110893
Mycobacterium_stomatepiae_NZ_AP022587.1 1.4e-99 5227870 5228342
Mycobacterium_simiae_NZ_AP022568.1

In [47]:
subprocess.run('wsl cd ' + wsl_output_loc + ' ; esl-alimanip -o unique_aligns.sto --seq-r keep_list.txt rcm_align_4.sto ', shell=True)

CompletedProcess(args='wsl cd /mnt/d/Project_Data/Project_8/Output ; esl-alimanip -o unique_aligns.sto --seq-r keep_list.txt rcm_align_4.sto ', returncode=0)

In [40]:
with open(output_dir + '/' + 'rcmhits_4.txt', 'r') as f:
    for l in f:
        if 'Mycobacterium_noviomagens' in l:
            print(l)

Mycobacterium_noviomagense_NZ_AP022583.1                                    -         struc_4.cacofold     -          cm        1      247   663916   663628      -    no    1 0.66   5.4  190.6   2.2e-31 !   -

Mycobacterium_noviomagense_NZ_AP022583.1                                    -         struc_4.cacofold     -          cm        1      247  1344376  1344211      -    no    1 0.63   0.0   36.0      0.48 ?   -



In [41]:
dict['Mycobacterium_noviomagense_NZ_AP022583.1']

('663916-663628', 2.2e-31)

In [None]:
cmbuild --noss -F cm1.cm align_mask5.sto
cmcalibrate cm1.cm
cmsearch --tblout cmhits1.txt -A cmalign.sto --incE 1e-10 cm1.cm /mnt/d/Actinobacteria_Ref_Rep_Lev_Complete/all_actinobacteria_ref_rep_comp.faa
esl-alimask -g -o cm_mask1.sto cmalign.sto


cmbuild -F cm2.cm cm_mask1.sto
cmcalibrate cm2.cm
cmsearch --tblout cmhits2.txt -A cmalign1.sto --cpu 16 --incE 1e-10 cm2.cm /mnt/d/Actinobacteria_Ref_Rep_Lev_Complete/all_actinobacteria_ref_rep_comp.faa
esl-alimask -g -o cm_mask2.sto cmalign1.sto

cmbuild --noss -F cm3.cm cm_mask2.sto
cmcalibrate cm3.cm
cmsearch --tblout cmhits3.txt -A cmalign2.sto --incE 1e-5 cm3.cm /mnt/d/Actinobacteria_Ref_Rep_Lev_Complete/all_actinobacteria_ref_rep_comp.faa
esl-alimask -g -o cm_mask3.sto cmalign2.sto
esl-alimask -g -o cm_mask3.sto cmalign2.sto

~/rscape_v2.0.0.g/bin/R-scape --cacofold --outname struc1 align_mask5.sto
cmbuild -F rcm2.cm struc1.cacofold.sto 
cmcalibrate rcm2.cm
cmsearch --tblout rcmhits2.txt -A rcmalign1.sto --cpu 16 --incE 1e-10 rcm2.cm /mnt/d/Actinobacteria_Ref_Rep_Lev_Complete/all_actinobacteria_ref_rep_comp.faa
esl-alimask -g -o rcm_mask2.sto rcmalign1.sto

~/rscape_v2.0.0.g/bin/R-scape --cacofold --outname struc2_t rcm_mask2.sto

In [None]:
i = 1
a= 'nhmmer -A align_'+str(i)+'.sto -o hmmer.txt --tblout summary.txt --notextw --cpu 16 --incE 1e-10 hmm_'+str(i)+'.hmm /mnt/d/BLAST/actinobacteria_ref_rep_comp/all_regions.faa /mnt/d/Actinobacteria_Ref_Rep_Lev_Complete/all_actinobacteria_ref_rep_comp.faa'
print(a)