#### Imports, directories and settings

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from Comparative_Analysis import ORF_Functions as orffn
from random import sample
import ete3;



In [2]:
# --- Locations on disk ---
# Root folder for this project's outputs.
project_dir = 'F:/Project_Data/Project_9'
# Folder holding the GenBank files that are parsed below.
seq_dir = 'F:/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'

# --- Reference / comparator genomes ---
tb_species = 'NC_000962.3'
tb_genome_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
comparator_filename = 'GCF_016745295.1_ASM1674529v1_genomic.gbff'
tb_reannotation_filename = 'annot.gbk'

# --- Parallelism ---
# Work is split into `num_cores` chunks, numbered from 1 to num_cores inclusive.
num_cores = 16
core_numbers = [core_number for core_number in range(1, num_cores + 1)]

#### Sample of 15 mycobacteria

In [3]:
def create_filename_dict(num_subsets, subset_num, id_list):
    """Pair each GenBank file in one chunk of ``id_list`` with its organism name.

    The chunk is selected by ``util.chunk_list(id_list, num_subsets, subset_num)``
    (presumably the ``subset_num``-th of ``num_subsets`` slices -- confirm
    against ``Utilities.chunk_list``). Only the first record of each file is
    read, from the module-level ``seq_dir`` folder.

    Returns:
        A list of ``(organism_name, filename)`` tuples. Despite the function
        name, this is a list rather than a dict.
    """
    organism_file_pairs = []
    for genbank_filename in util.chunk_list(id_list, num_subsets, subset_num):
        # Only the first record in the file is inspected for the organism annotation.
        first_record = next(SeqIO.parse(seq_dir + '/' + genbank_filename, "genbank"))
        organism_file_pairs.append((first_record.annotations['organism'], genbank_filename))
    return organism_file_pairs

# Files found in seq_dir (the exact return shape of util.list_files is not visible here).
species_list = util.list_files(seq_dir)

# One job per chunk number; each job returns a list of (organism_name, filename) pairs.
parallel_output = Parallel(n_jobs=-1)(
    delayed(create_filename_dict)(num_cores, core_number, species_list)
    for core_number in core_numbers
)

# Flatten the per-job lists into a single list of pairs.
temp = []
for sublist in parallel_output:
    temp.extend(sublist)

# organism name -> filename; if two files share an organism name the later pair wins.
filename_dict = dict(temp)

In [10]:
# GenBank files making up the 15-genome sample used in the rest of the notebook.
sample_filenames = [
    'GCF_000195955.2_ASM19595v2_genomic.gbff',
    'GCF_024600175.1_ASM2460017v1_genomic.gbff',
    'GCF_010730055.1_ASM1073005v1_genomic.gbff',
    'GCF_020616615.1_ASM2061661v1_genomic.gbff',
    'GCF_010731535.1_ASM1073153v1_genomic.gbff',
    'GCF_022370755.1_ASM2237075v1_genomic.gbff',
    'GCF_016745295.1_ASM1674529v1_genomic.gbff',
    'GCF_000157895.3_ASM15789v2_genomic.gbff',
    'GCF_900603025.1_MHAS_genomic.gbff',
    'GCF_018363015.1_ASM1836301v1_genomic.gbff',
    'GCF_019645855.1_ASM1964585v1_genomic.gbff',
    'GCF_010727945.1_ASM1072794v1_genomic.gbff',
    'GCF_010731895.1_ASM1073189v1_genomic.gbff',
    'GCF_900637205.1_50279_G01_genomic.gbff',
    'GCF_000184435.1_ASM18443v1_genomic.gbff',
]

# Reverse lookup: print the organism name(s) recorded for each sample file,
# in the order the files are listed above.
for sample_filename in sample_filenames:
    matching_organisms = [
        organism
        for organism, genbank_file in filename_dict.items()
        if sample_filename == genbank_file
    ]
    for organism in matching_organisms:
        print(organism)

Mycobacterium tuberculosis H37Rv
Mycolicibacterium smegmatis
Mycobacterium shinjukuense
Mycobacterium ulcerans
Mycobacterium lacus
Mycobacterium goodii
Mycobacterium marinum
Mycobacterium kansasii ATCC 12478
Mycolicibacterium hassiacum DSM 44199
Mycolicibacterium neoaurum
Mycobacterium malmoense
Mycobacterium cookii
Mycolicibacterium helvum
Mycolicibacterium chitae
Mycolicibacterium gilvum Spyr1


In [17]:
# One row per annotated CDS across the sample genomes:
# [organism, start, end, strand, locus_tag, protein_id, translation]
sample_info = []
for filename in tqdm(sample_filenames):
    # Only the first record of each GenBank file is read.
    genome_record = next(SeqIO.parse(seq_dir + '/' + filename, "genbank"))
    organism_name = genome_record.annotations['organism']
    for feature in genome_record.features:
        if feature.type != 'CDS':
            continue
        qualifiers = feature.qualifiers
        protein_ids = qualifiers.get("protein_id")
        locus_tags = qualifiers.get("locus_tag")
        translations = qualifiers.get("translation")
        # Skip CDS features missing any of the three qualifiers.
        if protein_ids is None or locus_tags is None or translations is None:
            continue
        location = feature.location
        sample_info.append([
            organism_name,
            int(location.start),
            int(location.end),
            int(location.strand),
            locus_tags[0],
            protein_ids[0],
            translations[0],
        ])

100%|██████████| 15/15 [00:10<00:00,  1.40it/s]


In [21]:
# Each FASTA entry is [protein_id, translation], i.e. fields 5 and 6 of a sample_info row.

# Full sample: every CDS collected from the sample genomes.
full_sample_entries = [[cds_row[5], cds_row[6]] for cds_row in sample_info]
util.produce_fasta_file(full_sample_entries, project_dir + '/full_sample_proteins.fasta')

# Tb only: rows whose organism name (field 0) contains 'H37Rv'.
h37rv_entries = [[cds_row[5], cds_row[6]] for cds_row in sample_info if 'H37Rv' in cds_row[0]]
util.produce_fasta_file(h37rv_entries, project_dir + '/H37Rv_proteins.fasta')

100%|██████████| 75224/75224 [00:06<00:00, 10973.90it/s]
100%|██████████| 3906/3906 [00:00<00:00, 45987.75it/s]


#### Produce FASTA file with CDS candidates

In [22]:
# Build a BLAST database from full_sample_proteins.fasta in project_dir.
# NOTE(review): the argument roles (source dir, FASTA name, db name, db output path) are
# inferred from the values passed -- confirm against Blast_Functions.build_blast_db.
blastfn.build_blast_db(project_dir, 'full_sample_proteins.fasta', 'full_sample_proteins', 'F:/Datasets/BLAST/full_sample_proteins')

In [23]:
# Build a BLAST database from H37Rv_proteins.fasta in project_dir.
# NOTE(review): the argument roles (source dir, FASTA name, db name, db output path) are
# inferred from the values passed -- confirm against Blast_Functions.build_blast_db.
blastfn.build_blast_db(project_dir, 'H37Rv_proteins.fasta', 'H37Rv_proteins', 'F:/Datasets/BLAST/H37Rv_proteins')

In [172]:
# BLASTP with an E-value cutoff of 1e-5.
# NOTE(review): presumably H37Rv_proteins.fasta is the query and full_sample_proteins the
# database, run from the first-argument folder -- confirm against Blast_Functions.run_blastp.
blastfn.run_blastp('F:/Datasets/BLAST/full_sample_proteins', 'H37Rv_proteins.fasta', 'full_sample_proteins', e_value = 1e-5)

In [62]:
# BLASTP in the other direction, again with an E-value cutoff of 1e-5.
# NOTE(review): presumably full_sample_proteins.fasta is the query and H37Rv_proteins the
# database, run from the first-argument folder -- confirm against Blast_Functions.run_blastp.
blastfn.run_blastp('F:/Datasets/BLAST/H37Rv_proteins', 'full_sample_proteins.fasta', 'H37Rv_proteins', e_value = 1e-5)

In [186]:
# Load the BLAST hit table. The file has no header row, so columns are labelled 0, 1, 2, ...
# NOTE(review): this reads from the 'comparator_orfs' folder, which none of the visible
# cells writes to -- confirm which step produces hits.csv.
blast_results_df = pd.read_csv('F:/Datasets/BLAST/comparator_orfs/hits.csv', header=None)

# Keep a single row per query ID (column 0): the one with the largest value in column 13
# (looks like the bit score in the displayed table -- TODO confirm).
blast_results_df = blast_results_df.loc[blast_results_df.groupby([0])[13].idxmax()]

# Placeholder columns filled in by the overlap-annotation loop.
# 'Overlap' holds a fraction, so it is created as float: writing a fractional value into an
# integer column via .at relies on silent upcasting, which recent pandas versions deprecate.
blast_results_df['Overlap'] = 0.0
blast_results_df['Genbank_Coordinates'] = ''

In [187]:
# For each retained BLAST hit, find the entry of genbank_cds_boundaries whose interval
# covers the largest fraction of the query ORF, and store that fraction and entry on the row.
# NOTE(review): genbank_cds_boundaries is not defined in the visible cells; it must already
# exist in the kernel. Entries are indexed as entry[0] (lower bound) and entry[1] (upper bound).
for row_label, hit_row in tqdm(blast_results_df.iterrows()):
    # The query ID in column 0 is '_'-separated; fields 1-3 are read as start, stop, strand.
    id_fields = hit_row[0].split('_')
    orf_start = int(id_fields[1])
    orf_stop = int(id_fields[2])
    orf_strand = int(id_fields[3])  # parsed but not used by the overlap test
    best_fraction = 0
    best_entry = None
    for cds_entry in genbank_cds_boundaries:
        # Ignore entries lying entirely before or after the ORF.
        if orf_start > cds_entry[1] or orf_stop < cds_entry[0]:
            continue
        shared_span = min(orf_stop, cds_entry[1]) - max(orf_start, cds_entry[0])
        fraction = shared_span / (orf_stop - orf_start)
        if fraction > best_fraction:
            best_fraction = fraction
            best_entry = cds_entry
    # Rows with no positively-overlapping entry keep their placeholder values.
    if best_entry is not None:
        blast_results_df.at[row_label, 'Overlap'] = best_fraction
        blast_results_df.at[row_label, 'Genbank_Coordinates'] = best_entry

blast_results_df.to_csv(project_dir +'/blast_annotation_xref.csv')

265it [00:00, 2558.37it/s]


In [160]:
# Display the annotated hit table.
# NOTE(review): the output saved with this cell shows a 'Mycobrowser_Locus' column rather than
# 'Genbank_Coordinates', and its execution count (160) precedes the cells that build the frame
# (186/187), so the saved output appears to predate the current code -- re-run to refresh.
blast_results_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,Overlap,Mycobrowser_Locus
59,M.tb_1014393_1014555_1,M.marinum_460982_470066_-1,53,3027,47.059,51,27,0,3,53,2863,2913,5.110000e-06,40.4,0.000000,
60,M.tb_1016799_1016925_1,M.marinum_457189_457375_-1,41,61,58.537,41,17,0,1,41,20,60,9.540000e-08,42.0,0.000000,
61,M.tb_1016819_1017215_-1,M.marinum_456881_457427_1,131,181,62.222,90,27,2,48,130,48,137,1.860000e-32,112.0,0.000000,
62,M.tb_1056785_1057070_-1,M.marinum_398190_399204_1,94,337,56.790,81,19,2,11,75,257,337,2.050000e-21,85.1,0.000000,
66,M.tb_1057042_1057129_1,M.marinum_398864_398963_-1,28,32,84.000,25,4,0,1,25,3,27,9.780000e-08,40.8,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,M.tb_918010_918142_1,M.marinum_710211_710343_-1,43,43,81.395,43,8,0,1,43,1,43,4.960000e-20,72.8,1.000000,Rv0824c
54,M.tb_918148_918268_1,M.marinum_710085_710205_-1,39,39,89.744,39,4,0,1,39,1,39,1.490000e-19,71.2,1.000000,Rv0824c
56,M.tb_934087_934405_-1,M.marinum_6223244_6225035_-1,105,596,56.061,66,29,0,23,88,209,274,4.900000e-18,77.0,0.000000,
57,M.tb_934292_934481_-1,M.marinum_6223244_6225035_-1,62,596,50.980,51,22,1,1,48,155,205,1.760000e-08,47.8,0.000000,
