##### Files and imports

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil

In [2]:
# --- Project locations (absolute, machine-specific paths) ---
project_dir = 'D:/Project_Data/Project_8'
datasets_dir = f'{project_dir}/Datasets'
output_dir = f'{project_dir}/Output'
seq_dir = 'D:/Actinobacteria_Ref_Rep_Lev_Complete'

# --- BLAST databases: each database lives in a directory named after it ---
all_protein_blast_db_name = 'actinobacteria_ref_rep_comp_prot'
all_protein_blast_dir = 'D:/BLAST/' + all_protein_blast_db_name
ref_protein_blast_db_name = 'actinobacteria_ref_rep_comp_prot_ref'
ref_protein_blast_dir = 'D:/BLAST/' + ref_protein_blast_db_name

# --- Parallelism: the work is split into num_cores chunks numbered from 1 ---
num_cores = 16
core_numbers = [chunk_index + 1 for chunk_index in range(num_cores)]

# --- Reference genome: GenBank file name and its versioned accession ---
reference_species_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
reference_species = 'NC_000962.3'

In [3]:
# Keep only the GenBank flat files found in seq_dir; other files generated in
# that directory are skipped.
species_list = [
    filename
    for filename in util.list_files(seq_dir)
    if '.gbff' in filename    # substring match, not a strict suffix check
]
print(len(species_list))

681


##### Produce FASTA files of all protein sequences (comparison species and reference species), used to build the BLAST databases and run the searches

In [4]:
# Switch read by the guarded cell that writes full_sequences.faa: that cell only
# runs when this is True.
full_run = True

In [18]:
def parse_genome(num_subsets, subset_num, id_list):
    """Extract protein-coding gene rows from one chunk of the GenBank files.

    The file names in ``id_list`` are split with ``util.chunk_list`` and only the
    chunk selected by ``subset_num`` is processed, so the function can be run as
    one of ``num_subsets`` parallel workers.

    For the reference genome file (``reference_species_filename``) CDS
    coordinates are taken from the Mycobrowser spreadsheet and the protein is
    translated from the genome sequence. For every other file the CDS features
    and their ``translation`` qualifiers from the GenBank annotation are used;
    CDS features lacking a ``locus_tag`` or ``translation`` are skipped.

    Parameters
    ----------
    num_subsets : number of chunks the file list is divided into.
    subset_num : which chunk this call handles (passed to ``util.chunk_list``).
    id_list : list of GenBank file names located in ``seq_dir``.

    Returns
    -------
    list of lists
        One inner list per GenBank record. Each row is
        ``[organism_name, accession_ver, accession_locus, start, stop, strand,
        translation, locus_tag]``.
    """
    translator = util.Translator()
    filenames = util.chunk_list(id_list, num_subsets, subset_num)
    rows_by_record = []
    for filename in filenames:
        for genome_record in SeqIO.parse(seq_dir + '/' + filename, "genbank"):
            organism_name = genome_record.annotations['organism']
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])

            # A fresh list per record: sharing one list across all records of a
            # file and appending it once per record would repeat every gene of
            # a multi-record file (e.g. chromosome + plasmids) in the flattened
            # result.
            record_rows = []

            if filename == reference_species_filename:
                # Mycobrowser coordinates are applied to this record's sequence;
                # assumes the reference file contains a single record - TODO confirm.
                full_sequence = str(genome_record.seq)
                mycobrowser_df = pd.read_excel(datasets_dir+'/Mycobrowser_Release_4.xlsx')
                for _, r in mycobrowser_df.iterrows():
                    if r['Feature'] != 'CDS':
                        continue
                    locus_tag = r['Locus']
                    accession_locus = accession_ver + '@' + locus_tag
                    start = int(r['Start'])-1    # Mycobrowser is 1-indexed
                    stop = int(r['Stop'])
                    strand = 1 if r['Strand'] == '+' else -1
                    # Remove stop codon from translation
                    translation = translator.translate_sequence(full_sequence[start:stop], strand, 0)[:-1]
                    record_rows.append([organism_name, accession_ver, accession_locus, start, stop, strand, translation, locus_tag])
            else:
                for feature in genome_record.features:
                    if feature.type != 'CDS':
                        continue
                    qualifiers = feature.qualifiers
                    locus_tags = qualifiers.get("locus_tag")
                    translations = qualifiers.get("translation")
                    if locus_tags is None or translations is None:
                        continue
                    locus_tag = locus_tags[0]
                    accession_locus = accession_ver + '@' + locus_tag
                    record_rows.append([organism_name, accession_ver, accession_locus, int(feature.location.start), int(feature.location.end), int(feature.location.strand), translations[0], locus_tag])
            rows_by_record.append(record_rows)
    return rows_by_record

In [None]:
# Map each versioned accession (e.g. 'NC_000962.3') to its organism name.
# The loop variable is deliberately not called `id`: at notebook scope that would
# shadow the builtin for every later cell.
names_dict = {}
for species_file in tqdm(species_list):
    for genome_record in SeqIO.parse(seq_dir + '/' + species_file, "genbank"):
        organism_name = genome_record.annotations['organism']
        accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
        # The first organism name seen for an accession is kept.
        names_dict.setdefault(accession_ver, organism_name)

In [6]:
# Run one parse_genome task per chunk number; each task returns a list of row-lists.
parallel_output = Parallel(n_jobs=-1)(
    delayed(parse_genome)(num_cores, core_number, species_list)
    for core_number in core_numbers
)
# Two flattening passes: workers -> row-lists, then row-lists -> individual gene rows.
temp = [row_list for worker_result in parallel_output for row_list in worker_result]
gene_records = [gene_row for row_list in temp for gene_row in row_list]
# Column 1 of a gene row is the versioned accession.
reference_gene_records = [gene_row for gene_row in gene_records if gene_row[1] == reference_species]

##### Produce and pickle dictionary of all genes and locations

In [14]:
# In-place ordering by accession (column 1) and then start coordinate (column 3).
# The cell that builds gene_info_dict treats gene_records[i+1] as the following
# feature, so it must run after this sort.
gene_records.sort(key=lambda gene_row: (gene_row[1], gene_row[3]))

In [8]:
# Build {accession: [(name, start, end, (left locus, left strand), (right locus, right strand)), ...]}.
# Every gene contributes one entry; when the next row belongs to the same accession
# and starts after this gene ends, an extra '<locus>_IG' entry spanning the gap is added.
gene_info_dict = {}
last_index = len(gene_records) - 1
for idx, rec in enumerate(gene_records):
    accession, locus, strand = rec[1], rec[7], rec[5]
    entries = gene_info_dict.setdefault(accession, [])
    entries.append((locus, rec[3], rec[4], (locus, strand), (locus, strand)))
    if idx == last_index:
        continue    # no following row to form an intergenic region with
    following = gene_records[idx + 1]
    if following[1] == accession and following[3] > rec[4]:
        entries.append((locus + '_IG', rec[4], following[3], (locus, strand), (following[7], following[5])))
with open(output_dir + '/gene_info_dict.pkl', 'wb') as pickle_file:
    pickle.dump(gene_info_dict, pickle_file)

In [None]:
# Each row of gene_records carries the accession@locus identifier in column 2 and
# the protein translation in column 6. The list is rebuilt from scratch on every
# run so the cell works on a fresh kernel and never appends to a stale list.
protein_records = [[record[2], record[6]] for record in gene_records]
# all_proteins.faa: written to seq_dir (source for the BLAST database build) and to
# the reference database directory (where it is the query file).
util.produce_fasta_file(protein_records, seq_dir + '/all_proteins.faa')
util.produce_fasta_file(protein_records, ref_protein_blast_dir + '/all_proteins.faa')

protein_records = [[record[2], record[6]] for record in reference_gene_records]
# reference_proteins.faa: written to seq_dir and to the all-protein database
# directory (where it is the query file).
util.produce_fasta_file(protein_records, seq_dir + '/reference_proteins.faa')
util.produce_fasta_file(protein_records, all_protein_blast_dir + '/reference_proteins.faa')

##### Produce FASTA containing all full sequences (used for HMMER/INFERNAL searches of intergenic regions)

In [None]:
def generate_full_sequence_file(num_subsets, subset_num, id_list):
    """Collect the full nucleotide sequence of every record in one chunk of files.

    ``id_list`` is split with ``util.chunk_list`` and only the chunk selected by
    ``subset_num`` (out of ``num_subsets``) is read from ``seq_dir``.

    Returns a list of ``[accession_ver, sequence]`` pairs, one per GenBank record,
    where ``accession_ver`` is the first accession joined to the sequence version
    with a dot and ``sequence`` is the record sequence as a string.
    """
    assigned_files = util.chunk_list(id_list, num_subsets, subset_num)
    return [
        [
            rec.annotations['accessions'][0] + '.' + str(rec.annotations['sequence_version']),
            str(rec.seq),
        ]
        for filename in assigned_files
        for rec in SeqIO.parse(seq_dir + '/' + filename, "genbank")
    ]

In [None]:
if full_run == True:
    # One generate_full_sequence_file task per chunk number.
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_full_sequence_file)(num_cores, core_number, species_list)
        for core_number in core_numbers
    )
    # Flatten the per-worker lists into a single list of [accession, sequence] pairs.
    temp = [pair for worker_pairs in parallel_output for pair in worker_pairs]
    # NOTE(review): this rebinds gene_records, which earlier cells use for per-gene
    # rows; re-running those cells after this one will see sequence pairs instead.
    gene_records = [[accession, sequence] for accession, sequence in temp]
    util.produce_fasta_file(gene_records, seq_dir + '/full_sequences.faa')

##### Build BLAST databases from the protein sequences and run BLAST searches in both directions (reference set against all species, and all species against the reference set)

In [None]:
if True:    # always-on guard; presumably kept as a manual switch for this expensive cell
    # Build both protein databases from the FASTA files in seq_dir.
    for fasta_name, db_name, db_dir in (
        ('all_proteins.faa', all_protein_blast_db_name, all_protein_blast_dir),
        ('reference_proteins.faa', ref_protein_blast_db_name, ref_protein_blast_dir),
    ):
        blastfn.build_blast_db(seq_dir, fasta_name, db_name, db_dir, 'prot')
    # Search in both directions: reference proteins against the all-protein
    # database, then all proteins against the reference database.
    for search_dir, query_name, target_db in (
        (all_protein_blast_dir, 'reference_proteins.faa', all_protein_blast_db_name),
        (ref_protein_blast_dir, 'all_proteins.faa', ref_protein_blast_db_name),
    ):
        blastfn.run_blastp(search_dir, query_name, target_db, 1e-10)

In [None]:
# Process the hits.csv found in each BLAST directory, then combine the two
# directions with keep_reciprocal_best_hits. Each step is also given a .pkl path
# under Blast_Output.
blast_output_dir = output_dir + '/Blast_Output'
ref_to_all_best_hits = blastfn.process_blast_output(
    all_protein_blast_dir + '/hits.csv',
    blast_output_dir + '/ref_to_all_best_hits.pkl',
    names_dict,
    True,
)
all_to_ref_best_hits = blastfn.process_blast_output(
    ref_protein_blast_dir + '/hits.csv',
    blast_output_dir + '/all_to_ref_best_hits.pkl',
    names_dict,
    True,
)
reciprocal_best_hits = blastfn.keep_reciprocal_best_hits(
    ref_to_all_best_hits,
    all_to_ref_best_hits,
    blast_output_dir + '/reciprocal_best_hits.pkl',
)

In [None]:
if True:    # always-on guard (an alternative `full_run == False` condition was left disabled)
    # Reload the pickled reciprocal best hits and export them as CSV next to the pickle.
    rbh_pickle_path = output_dir + '/Blast_Output/reciprocal_best_hits.pkl'
    with open(rbh_pickle_path, 'rb') as f:
        rbh_results = pickle.load(f)
    rbh_results.to_csv(output_dir + '/Blast_Output/reciprocal_best_hits.csv')