#### Imports, directories and run configuration

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from Comparative_Analysis import ORF_Functions as orffn
from Comparative_Analysis import Alignment as alignfn
from random import sample
from Bio.Align.Applications import MuscleCommandline
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# Cells below assign columns onto filtered DataFrames, which would otherwise warn.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Project working directory: reciprocal_best_hits.csv is read from here and the
# FASTA / alignment outputs are written here.
project_dir = 'F:/Project_Data/E_Pump_Project'
# Root of the downloaded genome dataset; each sub-directory is expected to hold a
# 'genomic.gbff' GenBank file (see generate_protein_dataset).
seq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
# Presumably the names of the sub-directories under seq_dir -- TODO confirm against util.list_dirs.
sequence_dirs = util.list_dirs(seq_dir)
# Versioned accession used as the prefix of the query keys ('<accession>@<locus_tag>').
tb_species = 'NC_000962.3' 
# Directory whose CDS are additionally collected into the TB-only list.
tb_annotation_dirname = 'GCF_000195955.2'
# Number of chunks the directory list is split into for parallel parsing.
num_cores = 16
# 1-based chunk numbers, one per parallel task: 1 .. num_cores.
core_numbers = list(range(1, num_cores+1))
# Path to the MUSCLE executable handed to MuscleCommandline.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'
# NOTE(review): not referenced by any cell visible in this section.
full_build = False

#### Create file with all CDS for species and create BLAST databases for TB CDS and All CDS (to do reciprocal best hits)

#### Subset based on loci of interest, and filter orthologues based on percent identity, synteny

In [5]:
def generate_protein_dataset(num_subsets, subset_num, dir_list):
    """Parse one chunk of genome directories and collect CDS data from their GenBank files.

    The chunk to process is chosen with ``util.chunk_list(dir_list, num_subsets,
    subset_num)`` (presumably the ``subset_num``-th of ``num_subsets`` slices --
    confirm against ``util.chunk_list``). For every record in
    ``<seq_dir>/<dirname>/genomic.gbff`` the versioned accession, organism name and
    full sequence are recorded, and every CDS feature that carries a
    ``translation`` qualifier contributes its protein sequence and coordinates.

    Relies on the module-level ``seq_dir`` and ``tb_annotation_dirname``.

    Args:
        num_subsets: Total number of chunks ``dir_list`` is divided into.
        subset_num: Which chunk this call should process.
        dir_list: Directory names (relative to ``seq_dir``) of all genomes.

    Returns:
        A 5-tuple of lists of two-element lists:
        ``all_cds``     -- [accession@locus_tag, translation] for every CDS;
        ``all_tb_cds``  -- the same, restricted to ``tb_annotation_dirname``;
        ``names``       -- [accession.version, organism];
        ``locations``   -- [accession@locus_tag, (start, end, strand)];
        ``sequences``   -- [accession.version, full sequence as str].

    Raises:
        TypeError: If a translated CDS has no ``locus_tag`` qualifier, or its
            location has no strand (unchanged from the original behaviour).
    """
    sequence_dirs = util.chunk_list(dir_list, num_subsets, subset_num)
    all_cds = []
    all_tb_cds = []
    names = []
    sequences = []
    locations = []
    for dirname in sequence_dirs:
        # Loop-invariant for every record/feature of this directory.
        is_tb_annotation = dirname == tb_annotation_dirname
        for genome_record in SeqIO.parse(seq_dir + '/' + dirname + '/genomic.gbff', "genbank"):
            annotations = genome_record.annotations
            accession_ver = annotations['accessions'][0] + '.' + str(annotations['sequence_version'])
            names.append([accession_ver, annotations['organism']])
            sequences.append([accession_ver, str(genome_record.seq)])
            for feature in genome_record.features:
                if feature.type != 'CDS':
                    continue
                qualifiers = feature.qualifiers
                translations = qualifiers.get("translation")
                # CDS features without a translation qualifier are skipped.
                if translations is None:
                    continue
                accession_locus = accession_ver + '@' + qualifiers.get("locus_tag")[0]
                translation = translations[0]
                location = feature.location
                coords = (int(location.start), int(location.end), int(location.strand))
                locations.append([accession_locus, coords])
                all_cds.append([accession_locus, translation])
                if is_tb_annotation:
                    all_tb_cds.append([accession_locus, translation])
    return (all_cds, all_tb_cds, names, locations, sequences)

In [6]:
if True:  # always-on guard kept so the cell body stays easy to toggle
    # Parse the genome directories in num_cores chunks, one task per chunk number.
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_protein_dataset)(num_cores, core_number, sequence_dirs)
        for core_number in core_numbers
    )
    # Merge the per-chunk results: CDS lists are concatenated, the
    # [key, value] pair lists are folded into lookup dictionaries.
    names_dict, locations_dict, sequence_dict = {}, {}, {}
    all_cds, all_tb_cds = [], []
    for cds_chunk, tb_cds_chunk, name_pairs, location_pairs, sequence_pairs in parallel_output:
        all_cds.extend(cds_chunk)
        all_tb_cds.extend(tb_cds_chunk)
        names_dict.update(name_pairs)          # accession.version -> organism
        locations_dict.update(location_pairs)  # accession@locus_tag -> (start, stop, strand)
        sequence_dict.update(sequence_pairs)   # accession.version -> genome sequence

In [20]:
# Locus tags of interest. The original note described this list as
# [TetR, divergent_gene]; only a single locus is listed at present.
loci = ['Rv0532']
# Qualify each locus tag with the reference accession so it matches the
# '<accession>@<locus_tag>' keys used for CDS elsewhere in this notebook.
target_loci = [f'{tb_species}@{locus_tag}' for locus_tag in loci]

In [21]:
# Load the reciprocal-best-hit table and keep only hits whose query is a target locus.
rbh = pd.read_csv(project_dir + '/reciprocal_best_hits.csv')
# Take an explicit copy: columns are added below, and assigning onto a
# boolean-filtered slice is only silent because the chained-assignment
# warning has been disabled globally.
rbh = rbh[rbh['query_ref'].isin(target_loci)].copy()
# (start, stop, strand) of each hit's CDS; NaN where target_ref is not in locations_dict.
rbh['target_loc'] = rbh['target_ref'].map(locations_dict)
# Per target species: number of hit rows and the weakest percent identity among them.
rbh['cds_count'] = rbh.groupby('target_species_name')['query_ref'].transform('size')
rbh['min_pct_id'] = rbh.groupby('target_species_name')['percent_identical_matches'].transform('min')

In [22]:
# Keep only species whose weakest orthologue hit is at least 80% identical.
# Filtering first means coordinates are not unpacked for rows that are about
# to be discarded (and an unmapped target_loc on such a row cannot abort the cell).
rbh = rbh[rbh['min_pct_id'] >= 80].copy()
# Split the (start, stop, strand) tuples into their own columns. Whole-column
# assignment keeps the integer coordinates as integers; the previous
# cell-by-cell `.at` writes created the new columns NaN-filled first, which
# forces a float dtype.
target_locs = rbh['target_loc'].tolist()
rbh['start'] = [start for start, _, _ in target_locs]
rbh['stop'] = [stop for _, stop, _ in target_locs]
rbh['strand'] = [strand for _, _, strand in target_locs]

#### Generate FASTA file containing intergenic regions in orthologous species and run Muscle / R-scape

In [23]:
# Collect one sequence per target species for the orthologue of loci[0],
# write them to FASTA and align them with MUSCLE.
# NOTE(review): despite the "intergenic" naming, the slice taken below is the
# orthologue's own (start, stop) span from target_loc -- confirm this is intended.
intergenic_regions = []
tetR_regions = []  # only referenced by the commented-out tetR export below
for target_species in rbh['target_species_name'].unique():
    temp = rbh[rbh['target_species_name'] == target_species]
    # Rows of this species whose query is loci[0]. Accession and coordinates
    # are taken from the SAME row so they can never refer to different genomes.
    matching_rows = [r for _, r in temp.iterrows() if r['query_ref'].split('@')[1] == loci[0]]
    if not matching_rows:
        # Previously this reused the coordinates of the preceding species
        # (or raised NameError on the first one); skip instead.
        continue
    ortholog_row = matching_rows[-1]  # last match wins, as before
    target_species_accession = ortholog_row['target_species']
    divgene_coords = ortholog_row['target_loc']
    start, stop, strand = divgene_coords
    intergenic_sequence = sequence_dict[target_species_accession][start:stop]
    # Anything other than strand == 1 is treated as the reverse strand.
    if strand != 1:
        intergenic_sequence = util.reverse_complement(intergenic_sequence)
    intergenic_regions.append([target_species.replace(" ", "_"), intergenic_sequence])

fasta_path = project_dir +'/intergenic_regions.fasta'
util.produce_fasta_file(intergenic_regions, fasta_path)
# NOTE(review): the alignment file name is hard-coded and does not follow `loci`.
cline = MuscleCommandline(muscle_exe, input=fasta_path, out=project_dir + '/RV0282_alignment.fasta')
result = cline()
#blastfn.convert_fasta_to_stockholm(project_dir, '/intergenic_region_'+loci[0]+'_'+loci[1] + '_alignment.fasta', '/intergenic_region_'+loci[0]+'_'+loci[1] + '_alignment.sto)

#util.produce_fasta_file(tetR_regions, project_dir +'/tetR_regions.fasta')
#cline = MuscleCommandline(muscle_exe, input= project_dir +'/'+'tetR_regions.fasta', out=project_dir + '/tetR_region_alignment3.fasta')
#result = cline();
#blastfn.convert_fasta_to_stockholm(project_dir, 'tetR_region_alignment3.fasta', 'tetR_region_alignment3.sto')

100%|██████████| 1/1 [00:00<?, ?it/s]


In [121]:
# Run R-scape via the project helper, passing the project directory, the
# Stockholm alignment name and the output name.
# NOTE(review): no cell visible in this section writes
# 'intergenic_region_alignment3.sto' (the FASTA -> Stockholm conversion above is
# commented out) -- confirm the file exists before running.
blastfn.run_rscape(project_dir, 'intergenic_region_alignment3.sto', 'rscape_output')

#### Analyse interfeature orthologs in target species