#### Imports, directories and settings

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from Comparative_Analysis import ORF_Functions as orffn
from random import sample
import ete3;



In [176]:
# Directory for files produced by / specific to this project.
project_dir = 'F:/Project_Data/Project_9'
# Directory holding the GenBank (.gbff) genome files that are parsed below.
seq_dir = 'F:/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'
# Accession string for the M. tuberculosis reference (not used in the cells shown here).
tb_species = 'NC_000962.3' 
# GenBank file names inside seq_dir: the M. tuberculosis H37Rv genome and the
# comparator genome (Mycobacterium marinum, per the filename_dict lookup below).
tb_genome_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
comparator_filename = 'GCF_016745295.1_ASM1674529v1_genomic.gbff'
# Re-annotation GenBank file, read from project_dir.
tb_reannotation_filename = 'annot.gbk'
# Number of chunks the genome file list is split into for the parallel scan;
# core_numbers holds the 1-based chunk indices 1..num_cores.
num_cores = 16
core_numbers = list(range(1, num_cores+1))

#### Find all (maximal nested) ORFs 

In [15]:
def create_filename_dict(num_subsets, subset_num, id_list):
    """Pair every GenBank file in one chunk of ``id_list`` with its organism name.

    ``util.chunk_list(id_list, num_subsets, subset_num)`` selects the file names
    this call is responsible for (the chunking rule itself lives in
    ``Comparative_Analysis.Utilities``).  For each of them the first record of
    ``seq_dir/<file name>`` is parsed and its ``organism`` annotation is read.

    Returns:
        A list of ``(organism_name, file_name)`` tuples -- a list, not a dict,
        despite the function name.
    """
    name_file_pairs = []
    for file_name in util.chunk_list(id_list, num_subsets, subset_num):
        # Only the first record of the file is inspected.
        first_record = next(SeqIO.parse(seq_dir + '/' + file_name, "genbank"))
        name_file_pairs.append((first_record.annotations['organism'], file_name))
    return name_file_pairs

# Build {organism name: GenBank file name} for every file listed in seq_dir.
# The file list is processed in num_cores chunks (one per entry of
# core_numbers), each chunk being a separate joblib task.
species_list = util.list_files(seq_dir)
parallel_output = Parallel(n_jobs=-1)(
    delayed(create_filename_dict)(num_cores, core_number, species_list)
    for core_number in core_numbers
)
# Flatten the per-chunk lists of (organism name, file name) pairs.
temp = [pair for chunk_pairs in parallel_output for pair in chunk_pairs]
# When two files report the same organism name, the later pair wins.
filename_dict = dict(temp)

In [132]:
# Print the file name of every organism whose name contains 'H37Rv' or 'arinum'.
for organism, gbff_name in filename_dict.items():
    if any(tag in organism for tag in ('H37Rv', 'arinum')):
        print(organism, gbff_name)

Mycobacterium tuberculosis H37Rv GCF_000195955.2_ASM19595v2_genomic.gbff
Corynebacterium marinum DSM 44953 GCF_000835165.1_ASM83516v1_genomic.gbff
Mycobacterium marinum GCF_016745295.1_ASM1674529v1_genomic.gbff


In [55]:
def produce_non_shadowed_orfs(orfs):
    """Greedily keep ORFs that are not overlapped by an already-kept ORF.

    ORF entries are indexed positionally: ``[0]`` start, ``[1]`` end and ``[3]``
    the ranking value (presumably the ORF length -- confirm against
    ``ORF_Functions``).  Candidates are visited from the highest ``[3]`` to the
    lowest; a candidate is dropped when, for any ORF kept so far,
    ``candidate[1] > kept[0] and candidate[0] <= kept[1]``.

    Unlike the previous version, the caller's list is NOT sorted in place, and
    an empty input yields an empty list instead of raising ``IndexError``.

    Args:
        orfs: sequence of ORF entries as described above.

    Returns:
        A new list with the surviving ORFs ordered by start coordinate.
    """
    if not orfs:
        return []

    # sorted() is stable like list.sort(), so the visiting order is unchanged,
    # but the caller's list keeps its original order.
    by_rank = sorted(orfs, key=lambda orf: orf[3], reverse=True)

    kept = [by_rank[0]]
    for candidate in tqdm(by_rank[1:]):
        shadowed = any(
            candidate[1] > winner[0] and candidate[0] <= winner[1]
            for winner in kept
        )
        if not shadowed:
            kept.append(candidate)

    kept.sort(key=lambda orf: orf[0])
    return kept

In [134]:
# Read both genomes as plain strings.  Every record of each file is visited, so
# for a multi-record GenBank file the sequence of the LAST record is kept.
for record in SeqIO.parse(seq_dir + '/' + tb_genome_filename, "genbank"):
    tb_sequence = str(record.seq)
for record in SeqIO.parse(seq_dir + '/' + comparator_filename, "genbank"):
    comparator_sequence = str(record.seq)

# Same ORF search settings for both genomes.
orf_search_options = {'output_orfs': 'Nested', 'min_orf_length': 50}

# M. tuberculosis: all nested ORFs over the whole sequence, then the
# non-shadowed subset.
ORFFinder = orffn.ORF_Finder(tb_sequence)
tb_orfs = ORFFinder.max_orf(0, len(tb_sequence), **orf_search_options)
tb_orfs_reduced = produce_non_shadowed_orfs(tb_orfs)

# Comparator genome: identical procedure.
ORFFinder = orffn.ORF_Finder(comparator_sequence)
comparator_orfs = ORFFinder.max_orf(0, len(comparator_sequence), **orf_search_options)
comparator_orfs_reduced = produce_non_shadowed_orfs(comparator_orfs)

100%|██████████| 63719/63719 [00:07<00:00, 8142.75it/s] 
100%|██████████| 88938/88938 [00:15<00:00, 5694.50it/s]


In [184]:
# Load the Mycobrowser annotation workbook (read_excel defaults to the first
# sheet).  The context manager closes the workbook handle once the sheet has
# been read, so the .xlsx file is not kept open for the rest of the session.
with pd.ExcelFile('F:/Datasets/Data_From_Publications/Mycobrowser_Release_4.xlsx') as xls:
    mycobrowser_df = pd.read_excel(xls)

In [185]:
# A gap between consecutive Mycobrowser features is scanned for ORFs only when
# it is longer than this many bases.
min_intergenic_length = 50

ORFFinder = orffn.ORF_Finder(tb_sequence)

# (start, end, strand) of every feature in the re-annotation file except the
# 'source' and 'gene' entries.
genbank_cds_boundaries = [
    (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
    for genome_record in SeqIO.parse(project_dir + '/' + tb_reannotation_filename, "genbank")
    for feature in genome_record.features
    if feature.type not in ('source', 'gene')
]

# (Locus, Start, Stop, Strand) per Mycobrowser row, ordered by Start.  Zipping
# the columns avoids building a Series per row as DataFrame.iterrows does.
mycobrowser_cds_boundaries = sorted(
    zip(
        mycobrowser_df['Locus'],
        mycobrowser_df['Start'],
        mycobrowser_df['Stop'],
        mycobrowser_df['Strand'],
    ),
    key=lambda boundary: boundary[1],
)

# Walk consecutive feature pairs.  max_stop is the furthest Stop seen so far, so
# a feature nested inside an earlier, longer one cannot open a false gap.
mycobrowser_inter_feature_orfs = []
max_stop = 0
for (_, _, stop, _), (_, next_start, _, _) in zip(
    mycobrowser_cds_boundaries, mycobrowser_cds_boundaries[1:]
):
    max_stop = max(stop, max_stop)
    if next_start > max_stop + min_intergenic_length:
        # The searched window is widened by 5 positions on each side of the gap.
        mycobrowser_inter_feature_orfs.extend(
            ORFFinder.max_orf(max_stop - 5, next_start + 5, output_orfs='Nested', min_orf_length=50)
        )

#### Produce FASTA file with CDS candidates

In [188]:
# Preview the first 100 (start, end, strand) feature boundaries.
genbank_cds_boundaries[:100]

[(0, 1524, 1),
 (2051, 3260, 1),
 (3279, 4437, 1),
 (4433, 4997, 1),
 (5239, 7267, 1),
 (7301, 9818, 1),
 (9913, 10828, 1),
 (10886, 10960, 1),
 (11111, 11184, 1),
 (11224, 11416, 1),
 (11554, 11692, -1),
 (11873, 12311, -1),
 (12467, 13016, 1),
 (13132, 13558, -1),
 (13713, 13995, -1),
 (14133, 14877, 1),
 (14913, 15612, 1),
 (15589, 17470, -1),
 (17466, 18762, -1),
 (18758, 20234, -1),
 (20230, 21640, -1),
 (21636, 23172, -1),
 (23269, 23737, -1),
 (23860, 25444, -1),
 (25643, 25726, 1),
 (25912, 26881, -1),
 (27022, 27442, -1),
 (27594, 28365, 1),
 (28361, 29207, 1),
 (29244, 29607, 1),
 (29739, 31151, 1),
 (31188, 31506, 1),
 (31513, 31819, 1),
 (31823, 31967, -1),
 (32056, 33154, 1),
 (33223, 33553, 1),
 (34294, 36610, 1),
 (36606, 36870, 1),
 (36866, 37262, 1),
 (37258, 38947, 1),
 (39055, 39829, -1),
 (39876, 41202, -1),
 (41303, 41912, 1),
 (42003, 42351, -1),
 (42432, 43365, -1),
 (43561, 46471, 1),
 (46580, 47084, -1),
 (47365, 48100, -1),
 (48232, 49027, -1),
 (49042, 49939,

In [163]:
def produce_orf_protein_fasta_file(species_name, orf_list, full_sequence, output_file):
    """Translate each ORF and pass the named proteins to ``util.produce_fasta_file``.

    ORF entries are indexed positionally: ``[0]`` start, ``[1]`` stop, ``[2]``
    strand.  ``full_sequence[start:stop]`` is translated as is when the strand
    equals 1 and after ``util.reverse_complement`` otherwise.  The last
    character of every translation is dropped (presumably the stop-codon
    symbol -- confirm against ``util.Translator``).

    Each record is named ``<species_name>_<start>_<stop>_<strand>``; the list of
    ``[name, protein]`` pairs is written to ``output_file``.
    """
    translator = util.Translator()
    records = []
    for orf in orf_list:
        start, stop, strand = orf[0], orf[1], orf[2]
        nucleotides = full_sequence[start:stop]
        if strand != 1:
            nucleotides = util.reverse_complement(nucleotides)
        protein = translator.translate_sequence(nucleotides, 1, 0)
        record_name = '_'.join([species_name, str(start), str(stop), str(strand)])
        records.append([record_name, protein[:-1]])
    util.produce_fasta_file(records, output_file)

In [171]:
# One protein FASTA per ORF set, written into project_dir in the listed order:
# (record-name prefix, ORF list, genome sequence, file name relative to project_dir).
fasta_jobs = (
    ('M.tb', tb_orfs, tb_sequence, '/tb_orfs.fasta'),
    ('M.tb', tb_orfs_reduced, tb_sequence, '/tb_orfs_reduced.fasta'),
    ('M.marinum', comparator_orfs, comparator_sequence, '/comparator_orfs.fasta'),
    ('M.marinum', comparator_orfs_reduced, comparator_sequence, '/comparator_orfs_reduced.fasta'),
    ('M.tb', mycobrowser_inter_feature_orfs, tb_sequence, '/mycobrowser_inter_feature_orfs.fasta'),
)
for species_label, orf_set, genome_sequence, fasta_suffix in fasta_jobs:
    produce_orf_protein_fasta_file(species_label, orf_set, genome_sequence, project_dir + fasta_suffix)

100%|██████████| 63720/63720 [00:01<00:00, 53007.66it/s]
100%|██████████| 6122/6122 [00:00<00:00, 69502.61it/s]
100%|██████████| 88939/88939 [00:02<00:00, 35440.39it/s]
100%|██████████| 8163/8163 [00:00<00:00, 24155.07it/s]
100%|██████████| 4832/4832 [00:00<00:00, 210043.39it/s]


In [136]:
# BLAST database from the full comparator ORF set.
# NOTE(review): argument meaning is defined in Blast_Functions -- presumably
# (FASTA directory, FASTA file, database name, database directory); confirm.
blastfn.build_blast_db(
    project_dir,
    'comparator_orfs.fasta',
    'comparator_orfs',
    'F:/Datasets/BLAST/comparator_orfs',
)

In [137]:
# BLAST database from the non-shadowed comparator ORF set.
# NOTE(review): argument meaning is defined in Blast_Functions -- presumably
# (FASTA directory, FASTA file, database name, database directory); confirm.
blastfn.build_blast_db(
    project_dir,
    'comparator_orfs_reduced.fasta',
    'comparator_orfs_reduced',
    'F:/Datasets/BLAST/comparator_orfs_reduced',
)

In [172]:
# blastp: Mycobrowser inter-feature ORFs against the full comparator ORF
# database, e-value cut-off 1e-5.
# NOTE(review): where the query FASTA is looked up and where hits are written
# is decided inside Blast_Functions.run_blastp -- confirm.
blastfn.run_blastp(
    'F:/Datasets/BLAST/comparator_orfs',
    'mycobrowser_inter_feature_orfs.fasta',
    'comparator_orfs',
    e_value=1e-5,
)

In [62]:
# blastp: non-shadowed M.tb ORFs against the non-shadowed comparator ORF
# database, e-value cut-off 1e-5.
# NOTE(review): where the query FASTA is looked up and where hits are written
# is decided inside Blast_Functions.run_blastp -- confirm.
blastfn.run_blastp(
    'F:/Datasets/BLAST/comparator_orfs_reduced',
    'tb_orfs_reduced.fasta',
    'comparator_orfs_reduced',
    e_value=1e-5,
)

In [151]:
# blastp: Mycobrowser inter-feature ORFs against the non-shadowed comparator
# ORF database, e-value cut-off 1e-5.
# NOTE(review): where the query FASTA is looked up and where hits are written
# is decided inside Blast_Functions.run_blastp -- confirm.
blastfn.run_blastp(
    'F:/Datasets/BLAST/comparator_orfs_reduced',
    'mycobrowser_inter_feature_orfs.fasta',
    'comparator_orfs_reduced',
    e_value=1e-5,
)

In [186]:
# Load the BLAST hit table; the file has no header row, so columns get integer
# labels (column 0 holds the query ORF name).
blast_results_df = pd.read_csv('F:/Datasets/BLAST/comparator_orfs/hits.csv', header=None)
# Keep, for each query, only the row with the largest value in column 13
# (presumably the bit score -- confirm against the BLAST output format used).
blast_results_df = blast_results_df.loc[blast_results_df.groupby([0])[13].idxmax()]
# Overlap later receives fractional values, so it is created as a float column;
# an integer 0 would force pandas to upcast on the first float assignment,
# which is deprecated behaviour.
blast_results_df['Overlap'] = 0.0
blast_results_df['Genbank_Coordinates'] = ''

In [187]:
# For every retained BLAST hit, find the annotated feature that covers the
# largest fraction of the query ORF.  Query names have the form
# <species>_<start>_<stop>_<strand>; strand is parsed but not used in the
# overlap test.
for row_label, hit_row in tqdm(blast_results_df.iterrows()):
    name_fields = hit_row[0].split('_')
    start = int(name_fields[1])
    stop = int(name_fields[2])
    strand = int(name_fields[3])
    orf_length = stop - start

    best_fraction = 0
    best_feature = None
    for feature in genbank_cds_boundaries:
        # Skip features lying entirely before or after the ORF.
        if start > feature[1] or stop < feature[0]:
            continue
        # Shared span expressed as a fraction of the ORF length.
        fraction = (min(stop, feature[1]) - max(start, feature[0])) / orf_length
        if fraction > best_fraction:
            best_fraction = fraction
            best_feature = feature

    # Rows without any positive overlap keep their default column values.
    if best_feature is not None:
        blast_results_df.at[row_label, 'Overlap'] = best_fraction
        blast_results_df.at[row_label, 'Genbank_Coordinates'] = best_feature

blast_results_df.to_csv(project_dir +'/blast_annotation_xref.csv')

265it [00:00, 2558.37it/s]


In [160]:
# Display the best-hit table together with its overlap annotation columns.
blast_results_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,Overlap,Mycobrowser_Locus
59,M.tb_1014393_1014555_1,M.marinum_460982_470066_-1,53,3027,47.059,51,27,0,3,53,2863,2913,5.110000e-06,40.4,0.000000,
60,M.tb_1016799_1016925_1,M.marinum_457189_457375_-1,41,61,58.537,41,17,0,1,41,20,60,9.540000e-08,42.0,0.000000,
61,M.tb_1016819_1017215_-1,M.marinum_456881_457427_1,131,181,62.222,90,27,2,48,130,48,137,1.860000e-32,112.0,0.000000,
62,M.tb_1056785_1057070_-1,M.marinum_398190_399204_1,94,337,56.790,81,19,2,11,75,257,337,2.050000e-21,85.1,0.000000,
66,M.tb_1057042_1057129_1,M.marinum_398864_398963_-1,28,32,84.000,25,4,0,1,25,3,27,9.780000e-08,40.8,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,M.tb_918010_918142_1,M.marinum_710211_710343_-1,43,43,81.395,43,8,0,1,43,1,43,4.960000e-20,72.8,1.000000,Rv0824c
54,M.tb_918148_918268_1,M.marinum_710085_710205_-1,39,39,89.744,39,4,0,1,39,1,39,1.490000e-19,71.2,1.000000,Rv0824c
56,M.tb_934087_934405_-1,M.marinum_6223244_6225035_-1,105,596,56.061,66,29,0,23,88,209,274,4.900000e-18,77.0,0.000000,
57,M.tb_934292_934481_-1,M.marinum_6223244_6225035_-1,62,596,50.980,51,22,1,1,48,155,205,1.760000e-08,47.8,0.000000,
