In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
from Comparative_Analysis import ORF_Functions as orffn
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil
import logomaker as lm

In [2]:
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
full_sequence = str(genome_record.seq)
orf_finder = orffn.ORF_Finder()
a=orf_finder.max_orf(852200, 852700, 2, True, 100)
for orf in a:
    if orf[2] == -1:
        print(orf, 852900- orf[0], 852900 - orf[1], util.reverse_complement(full_sequence[orf[0]:orf[1]]))

NameError: name 'seq_dir' is not defined

In [2]:
full_run = True
project_dir = 'D:/Project_Data/Project_8'
datasets_dir = project_dir + '/Datasets'
output_dir = project_dir + '/RScape_Local_Run_2'
wsl_output_loc = util.wslname(output_dir)
seq_dir = 'D:/Actinobacteria_Ref_Rep_Lev_Complete'
blast_dir = 'D:/BLAST/actinobacteria_ref_rep_comp'
blast_db_name = 'actinobacteria_ref_rep_comp'
num_cores = 16
core_numbers = list(range(1, num_cores+1))
reference_species_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
species_list = util.list_files(seq_dir)
species_list = [x for x in species_list if '.gbff' in x]    # Exclude other files generated in directory

In [3]:
filename_dict = {}
def create_filename_dict(num_subsets, subset_num, id_list):
    ids = util.chunk_list(id_list, num_subsets, subset_num)
    temp = []
    for id in (ids):
        for genome_record in SeqIO.parse(seq_dir + '/' + id, "genbank"):
            organism_name = genome_record.annotations['organism']
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
            temp.append((accession_ver, id))
    return temp
parallel_output = Parallel(n_jobs=-1)(delayed(create_filename_dict)(num_cores, core_number, species_list) for core_number in core_numbers)
temp = [item for sublist in parallel_output for item in sublist]
for (accession_ver, filename) in temp:
    filename_dict[accession_ver] = filename

In [6]:
with open('D:/Project_Data/Project_8/Output/gene_info_dict.pkl', 'rb') as f:
    gene_info_dict = pickle.load(f) 
with open('D:/Project_Data/Project_8/Output//names_dict.pkl', 'rb') as f:
    names_dict = pickle.load(f) 

##### Generate files containing all genic and intergenic regions in reference organism

In [7]:
features = []
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
full_sequence = str(genome_record.seq)
mycobrowser_df = pd.read_excel(datasets_dir+'/Mycobrowser_Release_4.xlsx')
for i, r in mycobrowser_df.iterrows():
    if r['Strand'] == '+':
        strand = 1
    else:
        strand = -1
    features.append([r['Locus'],r['Start']-1, r['Stop'], strand])
features.sort(key=lambda x: x[1])
feature_info = []
non_feature_info = []
for i, feature in enumerate(features):
    if feature[3] == 1:
        feature_sequence = full_sequence[feature[1]: feature[2]]
    else:
        feature_sequence = util.reverse_complement(full_sequence[feature[1]: feature[2]])
    feature_info.append([feature[0], feature[1], feature[2], feature_sequence, len(feature_sequence)])
    if feature[1] < feature[2]:  
        if (i + 1)< len(features) and feature[2] < features[i+1][1]:
            utr_coords = (feature[2], features[i+1][1])
            utr_sequence = full_sequence[feature[2]: features[i+1][1]]
            utr_length = len(utr_sequence)
        else:
            utr_coords = (0,0)
            utr_sequence = ''
            utr_length = 0
        non_feature_info.append([feature[0], utr_coords[0], utr_coords[1], utr_sequence, utr_length])
genic_df = pd.DataFrame(feature_info, columns = ['Locus', 'Start' , 'End', 'Sequence', 'Length'])
intergenic_df = pd.DataFrame(non_feature_info, columns = ['Locus', 'Start' , 'End', 'Sequence', 'Length'])

##### Run HMMER and INFERNAL iteratively

In [90]:
genic_test = genic_df[genic_df.Locus.isin(['Rv0756c'])]    # True downstream gene
intergenic_test = intergenic_df[intergenic_df.Locus.isin(['Rv0757'])]   # Annotation order upstream gene

In [10]:
hmmer_eval =  1e-10
hmmer_eval2 = 1e-5
query_file = 'antisense_region.faa'
hm_model_file = 'hmm.hmm'
antisense_sequence = util.reverse_complement(full_sequence[852286:852900])

sequence_list = [['antisense_phoR', antisense_sequence]]
locus_id = 'antisense_phoR'
print(locus_id)
results_dir = output_dir + '/' + 'antisense_phoR'
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
util.produce_fasta_file(sequence_list, results_dir + '/' + query_file)

blastfn.nhmmer_search_sequence(results_dir, query_file, datasets_dir, 'full_sequences.faa', 'align_0.sto', 'output.txt', 'hsummary_0.txt', hmmer_eval) 
blastfn.align_keep_top_hit_per_species(results_dir, 'hsummary_0.txt', 'align_0.sto', 'align_bh_0.sto', 'HMMER', hmmer_eval)

antisense_phoR


100%|██████████| 1/1 [00:00<?, ?it/s]


In [64]:
def print_features(accession, stop, start):
    output_values = []
    for feature_info in gene_info_dict[accession]:
        start_f = int(feature_info[1])
        stop_f = int(feature_info[2])
        if stop_f - start_f  > 100000:   #Something wrong!
            continue
        start_1 = min(stop, start)
        stop_1 = max(stop, start)
        if stop_1 > start_f and start_1 < stop_f:
            overlap = (min(stop_f, stop_1) - max(start_f, start_1)) / (stop_1 - start_1)
            print(feature_info, overlap)


In [32]:
def find_features(accession, stop, start):
    output_values = []
    for feature_info in gene_info_dict[accession]:
        start_f = int(feature_info[1])
        stop_f = int(feature_info[2])
        if stop_f - start_f  > 100000:   #Something wrong!
            continue
        start_1 = min(stop, start)
        stop_1 = max(stop, start)
        if stop_1 > start_f and start_1 < stop_f:
            overlap = (min(stop_f, stop_1) - max(start_f, start_1)) / (stop_1 - start_1)
            output_values.append((feature_info, overlap))
            #print(feature_info, overlap)
    return output_values

In [82]:
with open(output_dir + '/' + locus_id + '/align_bh_0.sto', 'r') as f:  
    dict = {}
    for l in tqdm(f):
        if (not ('#' in l)):
            a = l.split()
            if len(a) > 0:
                accession = a[0].split('/')[0]
                if not(accession == ''):
                    species = names_dict[accession]
                    seq = a[1].replace('-','')
                    if species in dict:
                        dict[species] = dict[species] + (seq)
                    else:
                        dict[species] = seq
sequence_list = []
for k, v in dict.items():
    if 'Myco' in k and not('lepr' in k):
        sequence_list.append([k, v])
util.produce_fasta_file(sequence_list, results_dir + '/antisense_region_hits.faa')                  

0it [00:00, ?it/s]

100%|██████████| 94/94 [00:00<00:00, 95003.51it/s]


In [83]:
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'
cline = MuscleCommandline(muscle_exe, input = results_dir + '/antisense_region_hits.faa', out=results_dir + '/antisense_region_alignment.fasta')
try:
    stdout, stderr = cline()
except Exception as e:
    pass

In [63]:
util.reverse_complement(full_sequence[852286:852683])

'GAGTGCCGTATAGCTCTGGCCGTCGGGGCTGATCACCCGAACGTAGAACCTCGACGGCGGCCGATCGGGGTTATGACCAGGGTAGGGGTCCGGCGCCAAGGGCAGCGTGATCTGCGCCCAGATTTGGGCTTCCTCGAGCAACACCCGATCGATCCGGCTGGTCAGCCGGTGCTGCAACATCGAGGTGACCGCGATCCCCGAGGCCACAAGTCCAGTGGCCACCAGGATCAGCGTGGCTGCGACCAGGCGTACCCGTAGGGGCAGCCTTCCTCGAAGGTGTCTGGCCATTGCCGCGTTCTCCTCGGGCTGCCGATCCGATTAACTACCAAGACTCATCGAGGCTCCCGCAGTACGTAGCCCACCCCGCGCAGCGTGTGCAGCAGCCGCTTCTCCCCAG'

In [93]:
homolog_hits = []
with open(output_dir + '/' + locus_id + '/hsummary_0.txt', 'r') as f:  
    for l in f:
        if (not ('#' in l)):
            a = l.split()
            accession = a[0]
            species = names_dict[a[0]]
            seq_from = int(a[6])
            seq_to = int(a[7])
            if a[11] == '+':
                strand = 1
            else:
                strand = -1
            e_value = float(a[12])
            start = min(seq_from,seq_to) - 1
            stop = max(seq_from, seq_to)
            if e_value < 1e-10:
                for feature in (find_features(accession, start, stop)):
                    if feature[1] > 0.9 and not (feature[0][0][-2:] == 'IG'):
                        homolog_hits.append((accession, filename_dict[accession], start, stop, strand, e_value))
                        break

In [95]:
util.produce_fasta_file(temp, output_dir + '/upstream_regions.faa')  

100%|██████████| 189/189 [00:00<00:00, 47143.83it/s]


In [96]:
for (accession, filename, start, stop, strand, e_value) in homolog_hits:
        for genome_record in SeqIO.parse(seq_dir + '/' + filename, "genbank"):
            full_sequence = str(genome_record.seq)
            print(accession, filename, start, stop, strand, e_value)
            if strand == 1:
                upstream_sequence = full_sequence[max(0, start - 1000): start]
            else:
                upstream_sequence = util.reverse_complement(full_sequence[stop: (stop+1000)])
            print(upstream_sequence)
        break

NC_000962.3 GCF_000195955.2_ASM19595v2_genomic.gbff 850740 851466 -1 4.6e-210
GCCACCAGGATCAGCGTGGCTGCGACCAGGCGTACCCGTAGGGGCAGCCTTCCTCGAAGGTGTCTGGCCATTGCCGCGTTCTCCTCGGGCTGCCGATCCGATTAACTACCAAGACTCATCGAGGCTCCCGCAGTACGTAGCCCACCCCGCGCAGCGTGTGCAGCAGCCGCTTCTCCCCAGTGTCGATCTTGCGGCGCAGATACGACACGTAGGACTCGACGACGTTGACATCACCACCGAAGTCGTAGCGCCAAACGTGGTCGAGAATCTTAGGCTTGCTCAGCACGGTGCCCGCGTTGATCACGAAATAGCGCAGCAGGGTGAATTCGGTGGGCGACAGCGACACCGGTTGGCCCGCCTTCCACACTTCGTGGGTCTCCTCGTCGAGCTCGATATCGGCGAACGTCAGTCGAACATTACGTGGTTCCTTGTTGCCCTTGCCCGCGCGTCGCAGGATGACCCGCAGCCTGGCCACGACCTCCTCCAAACTGAAGGGCTTTGTCACATAGTCGTCACCACCCAGGGTCAGACCCGCGATCTTGTCCTGTAGCGAGTCACGGGCCGTCAGGAACAACGCCGGGGCATCGATGCCGTCGGCGCGCAGCCGGCGCAGCACCCCAAAGCCGTCCATCCCGGGCATCATCACATCGAGGATCACCGCGTCCGGCCGGGTTTCCCGGGCCCGATCCAGCGCCTGTGCCCCGTTGGTCGCGGTGTAGACTTCAAAGCCCTGGAACTTGAGGCTCACCGACAGCAGTTCAACGATGTTGGCCTCATCATCGACCACGAGGACACGAGCCTCCGGTGTGGTGTTTTCGCCTGGGGTTCCCGCCGTCACGAGATCAACCCCTTTCCGCATTGGTTGAACGTTACCTTCACAGTCATTGTGTAATTCCTGAAAGCTCGTTGCCAGTAGTCTGCT

In [80]:
hmmer_eval =  1e-10
hmmer_eval_2 = 1e-5
infernal_eval = 1e-5
query_file = 'intergenic_region.faa'
hm_model_file = 'hmm.hmm'
cm_model_file = 'cm.cm'


blastfn.infernal_build_and_calib(results_dir, 'align_bh_0.sto' ,'cm_1.cm', False)
blastfn.infernal_search(results_dir, 'cm_1.cm', output_dir, 'antisense_region_hits.faa', 'search_1.sto', 'search_hits_1.txt', 'summary_1.txt', infernal_eval) 
if os.path.getsize(results_dir + '/search_hits_1.txt') < 1e7:
    blastfn.align_keep_top_hit_per_species(results_dir, 'summary_1.txt', 'search_1.sto', 'search_bh_1.sto', 'INFERNAL', infernal_eval)
    blastfn.run_rscape(results_dir, 'search_bh_1.sto', 'rscape_1')

#2
blastfn.infernal_build_and_calib(results_dir, 'rscape_1.cacofold.R2R.sto' ,'cm_2.cm')
blastfn.infernal_search(results_dir, 'cm_2.cm', output_dir, 'antisense_region_hits.faa', 'search_2.sto', 'search_hits_2.txt', 'summary_2.txt', infernal_eval) 
if os.path.getsize(results_dir + '/search_hits_2.txt') < 1e7:
    blastfn.align_keep_top_hit_per_species(results_dir, 'summary_2.txt', 'search_2.sto', 'search_bh_2.sto', 'INFERNAL', infernal_eval)
    blastfn.run_rscape(results_dir, 'search_bh_2.sto', 'rscape_2')

#3
#'search_bh_2.sto'
blastfn.infernal_build_and_calib(results_dir, 'rscape_2.cacofold.R2R.sto' ,'cm_3.cm')
blastfn.infernal_search(results_dir, 'cm_3.cm', output_dir, 'antisense_region_hits.faa', 'search_3.sto', 'search_hits_3.txt', 'summary_3.txt', infernal_eval) 
if os.path.getsize(results_dir + '/search_hits_3.txt') < 1e7:
    blastfn.align_keep_top_hit_per_species(results_dir, 'summary_3.txt', 'search_3.sto', 'search_bh_3.sto', 'INFERNAL', infernal_eval)
    blastfn.run_rscape(results_dir, 'search_3.sto', 'rscape_3')

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'D:/Project_Data/Project_8/RScape_Local_Run_2/antisense_phoR/search_hits_2.txt'

In [57]:
locus_id = 'Rv0439c'

In [44]:
full_sequence[852351:852396]

'GTCTTGGTAGTTAATCGGATCGGCAGCCCGAGGAGAACGCGGCAA'

In [89]:
with open('D:/Project_Data/Project_8/RScape_Run_Thoth_3' + '/Rv0756c'  + '/summary.txt', 'r') as f:  
    for l in f:
        if (not ('#' in l)):
            a = l.split()
            print(names_dict[a[0]], a[0],a[7],a[8],a[9],a[15])
          

Mycobacterium ostraviense NZ_CP089224.1 87490 87622 + 2.3e-25
Mycobacterium heidelbergense NZ_AP022615.1 2722378 2722510 + 1.3e-24
Mycobacterium canettii CIPT 140010059 NC_015848.1 861827 861960 + 1.8e-24
Mycobacterium tuberculosis variant bovis BCG str. Pasteur 1173P2 AM408590.1 883254 883387 + 4.8e-24
Mycobacterium tuberculosis variant bovis AF2122/97 LT708304.1 853661 853794 + 4.8e-24
Mycobacterium tuberculosis H37Rv NC_000962.3 851471 851604 + 5.2e-24
Mycobacterium parmense NZ_AP022614.1 4906113 4906245 + 6.5e-24
Mycobacterium kansasii ATCC 12478 NC_022663.1 2523101 2522969 - 1.4e-23
Mycobacterium shottsii NZ_AP022572.1 372938 372806 - 4.3e-23
Mycobacterium marinum NZ_CP058277.1 801616 801484 - 4.3e-23
Mycobacterium malmoense NZ_CP080999.1 4788538 4788408 - 4.7e-23
Mycobacterium pseudoshottsii JCM 15466 NZ_AP018410.1 5539864 5539732 - 5.7e-23
Mycobacterium florentinum NZ_AP022576.1 4352964 4353097 + 6.3e-23
Mycobacterium ulcerans NZ_CP085200.1 335018 335150 + 9.6e-23
Mycobacterium 

In [71]:
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
for feature in genome_record.features:
                    a = feature.qualifiers
                    if feature.type == 'CDS':
                        if a.get("locus_tag")!= None and a.get("translation")!= None:
                            locus_tag = a.get("locus_tag")[0]
                            if a.get("product")!= None:
                                product = a.get("product")[0]
                                if 'sigma factor' in product:
                                    print(product)

ECF RNA polymerase sigma factor SigG
ECF RNA polymerase sigma factor SigK
anti-anti-sigma factor
ECF RNA polymerase sigma factor SigL
ECF RNA polymerase sigma factor SigI
ECF RNA polymerase sigma factor SigE
sigma factor regulatory protein
ECF RNA polymerase sigma factor SigC
RNA polymerase sigma factor SigA
RNA polymerase sigma factor SigB
anti-sigma factor RshA
ECF RNA polymerase sigma factor SigH
RNA polymerase sigma factor SigF
anti-sigma factor RsbW
ECF RNA polymerase sigma factor SigJ
ECF RNA polymerase sigma factor SigD
anti-anti-sigma factor RsfB
ECF RNA polymerase sigma factor SigM


In [74]:
for k, v in names_dict.items():
    if 'egmatis' in v:
        print(k, filename_dict[k])

NZ_CP102342.1 GCF_024600175.1_ASM2460017v1_genomic.gbff


In [75]:
genome_record = next(SeqIO.parse(seq_dir + '/' + 'GCF_024600175.1_ASM2460017v1_genomic.gbff', "genbank"))
for feature in genome_record.features:
                    a = feature.qualifiers
                    if feature.type == 'CDS':
                        if a.get("locus_tag")!= None and a.get("translation")!= None:
                            locus_tag = a.get("locus_tag")[0]
                            if a.get("product")!= None:
                                product = a.get("product")[0]
                                if 'sigma factor' in product:
                                    print(product)

sigma-70 family RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
anti-sigma factor antagonist
sigma-70 family RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
RNA polymerase sigma factor SigF
sigma-70 family RNA polymerase sigma factor SigH
sigma-70 family RNA polymerase sigma factor
RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
RNA polymerase sigma factor
sigma-70 family RNA polymerase sigma factor
RNA polymerase sigma factor SigE
sigma-70 family RNA polymerase sigma factor
anti-sigma factor
RNA polymerase sigma factor
RNA polymerase sigma factor SigJ
RNA polymerase si

In [86]:
util.reverse_complement(full_sequence[12311:12468])

'TTACGCCCGGGCGGGCCCCCACCCGGTGCCCGGAACACAAACCCGCACGCGGCGCACCCGCAACGAGCCCGCCGACGGGCCAGTGATGGCAGTCTGTGAAGGAACTTGACCGAGTTATTGAGCGCAAGCCTCGGCCCGGACACAGAAAGGCGGCAAA'

In [87]:
full_sequence[12311:12468]

'TTTGCCGCCTTTCTGTGTCCGGGCCGAGGCTTGCGCTCAATAACTCGGTCAAGTTCCTTCACAGACTGCCATCACTGGCCCGTCGGCGGGCTCGTTGCGGGTGCGCCGCGTGCGGGTTTGTGTTCCGGGCACCGGGTGGGGGCCCGCCCGGGCGTAA'