#### Directories etc

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from Comparative_Analysis import ORF_Functions as orffn
from Comparative_Analysis import Alignment as alignfn
from random import sample
from Bio.Align.Applications import MuscleCommandline
# Turn off pandas' chained-assignment check: later cells add columns to
# frames that were produced by boolean filtering of another frame.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Machine-specific locations: project output, GenBank input tree, MUSCLE binary.
project_dir = 'F:/Project_Data/E_Pump_Project'
seq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

# Accession used to build the query references, and the directory name that
# CDS records are compared against when the query loci are collected.
tb_species = 'NC_000962.3'
tb_annotation_dirname = 'GCF_000195955.2'

# The genome list is split into num_cores chunks, numbered 1..num_cores.
num_cores = 16
core_numbers = [*range(1, num_cores + 1)]

# Cells guarded by this flag are skipped while it is False.
full_build = False

#### Create file with all CDS for species and create BLAST database

In [3]:
# Directory names found under seq_dir; later cells read
# seq_dir/<name>/genomic.gbff for each of them.
sequence_dirs = util.list_dirs(seq_dir)
len(sequence_dirs)

215

In [4]:
# Locus tags of the query genes: CDS with these tags (from the
# tb_annotation_dirname genome) become tb_cds and the BLAST queries.
loci = ['Rv0451c', 'Rv0450c', 'Rv0452']

In [7]:
def generate_protein_dataset(num_subsets, subset_num, dir_list):
    """Parse one chunk of genome directories and collect their CDS data.

    ``dir_list`` is split with ``util.chunk_list`` so several workers can
    each process a different share of the GenBank files.

    Args:
        num_subsets: Number of chunks ``dir_list`` is divided into.
        subset_num: Which chunk this call handles (forwarded unchanged to
            ``util.chunk_list``).
        dir_list: Directory names under ``seq_dir``; each is expected to
            contain a ``genomic.gbff`` file.

    Returns:
        Tuple ``(all_cds, tb_cds, names, locations, sequences)``:
        ``all_cds`` holds ``[accession@locus_tag, translation]`` for every
        CDS that has a translation; ``tb_cds`` is the subset from
        ``tb_annotation_dirname`` whose locus tag is in ``loci``; ``names``
        holds ``[accession_ver, organism]`` per record; ``locations`` holds
        ``[accession@locus_tag, (start, stop, strand)]``; ``sequences``
        holds ``[accession_ver, nucleotide sequence]`` per record.
    """
    tb_cds = []
    all_cds = []
    names = []
    sequences = []
    locations = []
    for dirname in util.chunk_list(dir_list, num_subsets, subset_num):
        for genome_record in SeqIO.parse(seq_dir + '/' + dirname + '/genomic.gbff', "genbank"):
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
            names.append([accession_ver, genome_record.annotations['organism']])
            sequences.append([accession_ver, str(genome_record.seq)])
            for feature in genome_record.features:
                if feature.type != 'CDS':
                    continue
                qualifiers = feature.qualifiers
                translations = qualifiers.get("translation")
                locus_tags = qualifiers.get("locus_tag")
                # A CDS without a translation or without a locus tag cannot be
                # named or used as a protein record, so skip it instead of
                # failing on the [0] lookup.
                if not translations or not locus_tags:
                    continue
                locus_tag = locus_tags[0]
                translation = translations[0]
                accession_locus = accession_ver + '@' + locus_tag
                location = (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
                locations.append([accession_locus, location])
                all_cds.append([accession_locus, translation])
                if dirname == tb_annotation_dirname and locus_tag in loci:
                    tb_cds.append([accession_locus, translation])
    return (all_cds, tb_cds, names, locations, sequences)

In [None]:
if full_build:
    # One task per chunk number; every task returns the 5-tuple
    # (all_cds, tb_cds, names, locations, sequences).
    tasks = (
        delayed(generate_protein_dataset)(num_cores, core_number, sequence_dirs)
        for core_number in core_numbers
    )
    parallel_output = Parallel(n_jobs=-1)(tasks)

    # Concatenate the CDS lists and turn the [key, value] pair lists into
    # lookup dictionaries (a later pair overwrites an earlier one).
    all_cds, tb_cds = [], []
    names_dict, locations_dict, sequence_dict = {}, {}, {}
    for chunk in parallel_output:
        all_cds.extend(chunk[0])
        tb_cds.extend(chunk[1])
        names_dict.update(chunk[2])
        locations_dict.update(chunk[3])
        sequence_dict.update(chunk[4])

In [None]:
if full_build:
    # Write both protein FASTA files first, then build one BLAST database
    # from each of them.
    for records, fasta_name in ((all_cds, 'all_cds.fasta'), (tb_cds, 'tb_cds.fasta')):
        util.produce_fasta_file(records, project_dir + '/' + fasta_name)
    for fasta_name, db_name in (('all_cds.fasta', 'Mycobacteriaceae_Refseq'), ('tb_cds.fasta', 'E_Pump_CDS')):
        blastfn.build_blast_db(project_dir, fasta_name, db_name, 'F:/Datasets/BLAST/' + db_name)

In [None]:
if full_build:
    # Forward search (query loci vs. all CDS) followed by the reverse search
    # (all CDS vs. query loci); both use the same e-value cut-off.
    blast_runs = (
        ('Mycobacteriaceae_Refseq', 'tb_cds.fasta', 'tb_cds_hits.csv'),
        ('E_Pump_CDS', 'all_cds.fasta', 'reverse_hits.csv'),
    )
    for db_name, query_fasta, hits_csv in blast_runs:
        blastfn.run_blastp('F:/Datasets/BLAST/' + db_name, query_fasta, db_name, hits_csv, e_value=1e-10)

In [None]:
def _top_hit_per_species(hits):
    """Keep the highest bit-score row for each (query_ref, target_species_name) pair."""
    # Grouping is by species *name* rather than accession: per the original
    # note, the utility's own top-hit option works per accession version,
    # which can differ within one species when an annotation is fragmented.
    best_rows = hits.groupby(['query_ref', 'target_species_name'])['bit_score'].idxmax()
    return hits.loc[best_rows]


a = _top_hit_per_species(
    blastfn.process_blast_output('F:/Datasets/BLAST/Mycobacteriaceae_Refseq/tb_cds_hits.csv', names_dict, top_hit_only=False)
)
b = _top_hit_per_species(
    blastfn.process_blast_output('F:/Datasets/BLAST/E_Pump_CDS/reverse_hits.csv', names_dict, top_hit_only=False)
)
rbh = blastfn.keep_reciprocal_best_hits(a, b)

In [None]:
def _per_species(column, how):
    """Broadcast a per-species aggregate of ``column`` onto every row of the current ``rbh``."""
    return rbh.groupby('target_species_name')[column].transform(how)


rbh['target_loc'] = rbh['target_ref'].map(locations_dict)
rbh['cds_count'] = _per_species('query_ref', 'size')
rbh['min_pct_id'] = _per_species('percent_identical_matches', 'min')

# Keep only species that have exactly three reciprocal hits.
rbh = rbh[rbh['cds_count'] == 3]

# Spread the (start, stop, strand) tuple over three separate columns.
for row_label, hit in rbh.iterrows():
    rbh.at[row_label, 'start'], rbh.at[row_label, 'stop'], rbh.at[row_label, 'strand'] = hit['target_loc']

# Span covered by the hits of each species.
rbh['min_start'] = _per_species('start', 'min')
rbh['max_stop'] = _per_species('stop', 'max')
rbh['region_size'] = rbh['max_stop'] - rbh['min_start']

# Require a compact locus (< 5000 nt) and > 80% identity for the weakest hit.
rbh = rbh[(rbh['region_size'] < 5000) & (rbh['min_pct_id'] > 80.0)]
rbh.to_csv(project_dir + '/reciprocal_best_hits.csv')

In [None]:
# For every species in rbh, cut out the sequence lying between the hits for
# Rv0452 and Rv0451c and collect it as [species_name, sequence].
intergenic_regions = []
for target_species in rbh['target_species_name'].unique():
    species_hits = rbh[rbh['target_species_name'] == target_species]
    # Reset for each species so that coordinates found for a previous species
    # can never be reused when a hit is missing here.
    Rv_0451c_coords = None
    Rv_0452_coords = None
    target_species_accession = None
    for _, hit in species_hits.iterrows():
        target_species_accession = hit['target_species']
        locus = hit['query_ref'].split('@')[1]
        if locus == 'Rv0451c':
            Rv_0451c_coords = hit['target_loc']
        elif locus == 'Rv0452':
            Rv_0452_coords = hit['target_loc']
    if Rv_0451c_coords is None or Rv_0452_coords is None:
        print(target_species, 'skipped: no Rv0451c and/or Rv0452 hit')
        continue
    # The region is oriented like the Rv0451c hit and reaches 3 nt into it.
    if Rv_0451c_coords[2] == 1:
        intergenic_region = (Rv_0452_coords[1], Rv_0451c_coords[0] + 3, 1)
    else:
        intergenic_region = (Rv_0451c_coords[1] - 3, Rv_0452_coords[0], -1)
    intergenic_sequence = sequence_dict[target_species_accession][intergenic_region[0]: intergenic_region[1]]
    if intergenic_region[2] != 1:
        intergenic_sequence = util.reverse_complement(intergenic_sequence)
    print(target_species, intergenic_region)
    intergenic_regions.append([target_species.replace(" ", "_"), intergenic_sequence])

In [None]:
# Write the collected regions to FASTA and align them with MUSCLE.
intergenic_fasta = project_dir + '/intergenic_regions.fasta'
intergenic_alignment = project_dir + '/intergenic_region_alignment.fasta'
util.produce_fasta_file(intergenic_regions, intergenic_fasta)
cline = MuscleCommandline(muscle_exe, input=intergenic_fasta, out=intergenic_alignment)
result = cline()

In [5]:
# Convert the MUSCLE alignment written to project_dir into a .sto file
# (per the helper's name; the next cell passes that file to run_rscape).
blastfn.convert_fasta_to_stockholm(project_dir, 'intergenic_region_alignment.fasta', 'intergenic_region_alignment.sto')

In [6]:
# Run the project's R-scape wrapper on the Stockholm alignment.
# NOTE(review): 'rscape_output' is presumably an output name/prefix - confirm in Blast_Functions.
blastfn.run_rscape(project_dir, 'intergenic_region_alignment.sto', 'rscape_output')

In [None]:
# cds_info_dict[accession_ver][locus_tag] -> [(start, stop, strand)] for every
# CDS feature (with or without a translation) of every genome record.
cds_info_dict = {}
for dirname in sequence_dirs:
    gbff_path = seq_dir + '/' + dirname + '/genomic.gbff'
    for genome_record in SeqIO.parse(gbff_path, "genbank"):
        annotations = genome_record.annotations
        accession_ver = annotations['accessions'][0] + '.' + str(annotations['sequence_version'])
        # The coordinates sit in a one-element list so a later cell can
        # append further entries to it.
        cds_info_dict[accession_ver] = {
            feature.qualifiers.get("locus_tag")[0]: [
                (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
            ]
            for feature in genome_record.features
            if feature.type == 'CDS'
        }

In [None]:
# For every CDS, store the gap between it and the nearest preceding CDS (in
# the CDS's own reading direction) as entry[1] = (start, stop, strand).
# If another CDS reaches into this one, the region is empty and is anchored
# at the CDS's 5' end: cds_start on the plus strand, cds_stop on the minus.
for species, cds_by_locus in cds_info_dict.items():
    # Snapshot of all CDS coordinates of this record.
    cds_coords = [entry[0] for entry in cds_by_locus.values()]
    for locus_tag, entry in cds_by_locus.items():
        (cds_start, cds_stop, cds_strand) = entry[0]
        if cds_strand == 1:
            # Furthest end of any CDS that starts before this one (0 if none).
            max_stop = max((stop for (start, stop, strand) in cds_coords if start < cds_start), default=0)
            if max_stop >= cds_start:
                upstream_region = (cds_start, cds_start, 1)
            else:
                upstream_region = (max_stop, cds_start, 1)
        else:
            # Nearest start of any CDS that ends after this one.
            min_start = min((start for (start, stop, strand) in cds_coords if stop > cds_stop), default=999999999)
            if min_start <= cds_stop:
                # The 5' end of a minus-strand CDS is cds_stop, not cds_start.
                upstream_region = (cds_stop, cds_stop, -1)
            else:
                upstream_region = (cds_stop, min_start, -1)
        # Replace rather than append so re-running the cell does not stack
        # duplicate regions behind the coordinates.
        entry[1:] = [upstream_region]
                                   

In [None]:
# Per annotated CDS (keyed by accession@locus_tag) collect:
#   upstream_seq_dict               upstream region plus 3 nt of the CDS, read 5'->3'
#   annotated_orf_translation_dict  translation from the GenBank record
#   maximal_orf_translation_dict    translation of the ORF found by ORF_Finder that
#                                   shares the CDS's 3' boundary (None if none does)
tb_cds = []
all_cds = []
orf_dict = {}
names_dict = {}
upstream_seq_dict = {}
annotated_orf_translation_dict = {}
maximal_orf_translation_dict = {}
trans = util.Translator()

# Original note for the excluded assembly: "Has an S in it".
# NOTE(review): presumably a non-ACGT symbol in its sequence - confirm.
sequence_dirs = [x for x in sequence_dirs if x != 'GCA_000340435.3']
for dirname in sequence_dirs:
    print(dirname)
    for genome_record in SeqIO.parse(seq_dir + '/' + dirname + '/genomic.gbff', "genbank"):
        accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
        names_dict[accession_ver] = genome_record.annotations['organism']
        full_sequence = str(genome_record.seq)
        ORFFinder = orffn.ORF_Finder(full_sequence)
        orfs = ORFFinder.max_orf(0, len(full_sequence), output_orfs='Nested', min_orf_length=50)
        orf_dict[accession_ver] = (full_sequence, orfs)

        # Index the ORFs by each boundary once per record instead of scanning
        # the whole list for every CDS. When several ORFs share a boundary the
        # last one listed wins, exactly as with a full scan without break.
        orf_by_stop = {orf[1]: orf for orf in orfs}
        orf_by_start = {orf[0]: orf for orf in orfs}

        for feature in genome_record.features:
            a = feature.qualifiers
            if feature.type != 'CDS' or a.get("translation") is None:
                continue
            locus_tag = a.get("locus_tag")[0]
            accession_locus = accession_ver + '@' + locus_tag
            translation = a.get("translation")[0]
            all_cds.append([accession_locus, translation])
            if dirname == tb_annotation_dirname and locus_tag in loci:
                tb_cds.append([accession_locus, translation])

            (start, stop, strand) = (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
            (upstream_start, upstream_stop, _) = cds_info_dict[accession_ver][locus_tag][1]
            if strand == 1:
                upstream_seq = full_sequence[upstream_start: upstream_stop + 3]
                orf = orf_by_stop.get(stop)
                orf_nt = None if orf is None else full_sequence[orf[0]:orf[1]]
            else:
                upstream_seq = util.reverse_complement(full_sequence[upstream_start - 3: upstream_stop])
                orf = orf_by_start.get(start)
                orf_nt = None if orf is None else util.reverse_complement(full_sequence[orf[0]:orf[1]])

            # No matching ORF -> None, never the value left over from the
            # previously processed CDS. [:-1] drops the final translated symbol.
            if orf_nt is None:
                maximal_aa_seq = None
            else:
                maximal_aa_seq = trans.translate_sequence(orf_nt, 1, 0)[:-1]

            upstream_seq_dict[accession_locus] = upstream_seq
            maximal_orf_translation_dict[accession_locus] = maximal_aa_seq
            annotated_orf_translation_dict[accession_locus] = translation
               

#### Set up dataframe with annotated (based on PGP run) M.tb feature boundaries and Mycobrowser M.tb feature boundaries

#### Produce FASTA file with CDS candidates

In [None]:
# Write the protein FASTA files for all CDS and for the query loci.
for records, fasta_name in ((all_cds, 'all_cds.fasta'), (tb_cds, 'tb_cds.fasta')):
    util.produce_fasta_file(records, project_dir + '/' + fasta_name)

In [None]:
# One BLAST database per FASTA file, each stored under F:/Datasets/BLAST/<name>.
for db_name in ('all_cds', 'tb_cds'):
    blastfn.build_blast_db(project_dir, db_name + '.fasta', db_name, 'F:/Datasets/BLAST/' + db_name)

In [None]:
# Search tb_cds.fasta against the all_cds database; the next cell reads the
# result from F:/Datasets/BLAST/all_cds/tb_cds_hits.csv.
blastfn.run_blastp('F:/Datasets/BLAST/all_cds', 'tb_cds.fasta', 'all_cds', 'tb_cds_hits.csv', e_value = 1e-10)

#### Analyse CDS orthologs in target species

In [None]:
# Load every hit (not only the top hit per query) from the blastp run.
blast_output = blastfn.process_blast_output('F:/Datasets/BLAST/all_cds/tb_cds_hits.csv', names_dict, top_hit_only = False)

In [None]:
# `test` starts as an alias of blast_output, so the three match-count columns
# below are added to blast_output itself.
test = blast_output
test['num_identical_matches'] = test['alignment_length'] * test['percent_identical_matches'] / 100
test['max_identical_query_matches'] = test.groupby(['query_ref', 'target_species'])['num_identical_matches'].transform('max')
test['max_identical_target_matches'] = test.groupby('target_ref')['num_identical_matches'].transform('max')

# Keep a hit only if it is (within 0.01%) the best one both for its
# query/target-species pair and for its target sequence.
best_for_query = test['num_identical_matches'] > 0.9999 * test['max_identical_query_matches']
best_for_target = test['num_identical_matches'] > 0.9999 * test['max_identical_target_matches']
test = test[best_for_query & best_for_target]

test['count_matches'] = test.groupby('query_ref')['target_ref'].transform('count')

# Attach the per-CDS sequences collected earlier, keyed by target_ref.
for column, lookup in (
    ('upstream_sequence', upstream_seq_dict),
    ('annotated_translation', annotated_orf_translation_dict),
    ('maximal_translation', maximal_orf_translation_dict),
):
    test[column] = test['target_ref'].map(lookup)
test.to_csv(project_dir + '/test.csv')

In [None]:
# Query identifiers in the accession@locus_tag form used for the CDS records.
query_refs = [f"{tb_species}@{locus}" for locus in loci]

In [None]:
# For each query locus, align the upstream sequences of its selected hits with
# MUSCLE. The input FASTA is overwritten on every iteration; the alignment is
# written to project_dir/align_output_<query_ref>.
alignment_input = project_dir + '/sequences_to_align.faa'
for ref in query_refs:
    print(ref)
    temp = test[test['query_ref'] == ref]
    sequences = [
        [target_ref, upstream_sequence]
        for target_ref, upstream_sequence in zip(temp['target_ref'], temp['upstream_sequence'])
    ]
    util.produce_fasta_file(sequences, alignment_input)
    # Pass MUSCLE exactly the path that was just written (no doubled separator).
    cline = MuscleCommandline(muscle_exe, input=alignment_input, out=project_dir + '/align_output_' + ref)
    result = cline()

In [None]:
# Load one alignment file as a nucleotide ('NT') alignment.
# NOTE(review): 'Rv1410c' is not in `loci`, so the alignment loop in this
# notebook does not write this file - confirm which locus is intended.
query_ref = 'NC_000962.3@Rv1410c'
align = alignfn.Alignment(project_dir + '/align_output_'+query_ref, query_ref, 'NT', insert_symbol = '-', species_order = [])

In [None]:
# Compute entropies for the loaded alignment with the helper's unmodified variant.
align.calculate_entropies(modified=False)

In [None]:
# Display the alignment's sequence_list attribute for inspection.
align.sequence_list

In [None]:
# NOTE(review): temp_df is not defined anywhere in the visible notebook -
# this cell looks like a leftover and will raise NameError on a fresh run.
sns.scatterplot(data = temp_df, x = 'query_1_start', y = 'target_1_start', s=1)

In [None]:
# Write the `test` frame to project_dir/test.csv (same target as the earlier export).
test.to_csv(project_dir + '/test.csv')

In [None]:
# NOTE(review): no visible cell adds 'tb_start' or 'target_start' columns to
# `test` - confirm they come from process_blast_output.
sns.scatterplot(data = test, x  = 'tb_start', y = 'target_start', s=1 )

In [None]:
# Distribution of percent identity over the selected hits.
sns.histplot(data = test, x = 'percent_identical_matches')

#### Analyse interfeature orthologs in target species

In [None]:
# Look-up from (start, stop, strand) to the probability stored with each
# inter-feature ORF; the length field is not used.
prob_dict = {
    (start, stop, strand): prob
    for (start, stop, strand, length, prob) in mycobrowser_inter_feature_orf_probabilities
}

In [None]:
# Hits of the inter-feature ORFs, restricted to Mycobacterium marinum targets.
if_blast_output = blastfn.process_blast_output('F:/Datasets/BLAST/comparator_orfs/if_hits.csv', organism_name_dict, top_hit_only=False)
if_test = if_blast_output[if_blast_output['target_species_name'] == 'Mycobacterium marinum']
if_test['Overlap'] = 0
if_test['Genbank_Coordinates'] = ''

for i, r in tqdm(if_test.iterrows()):
    # query_ref has the form <accession>@<start>_<stop>_<strand>.
    coord_fields = r['query_ref'].split('@')[1].split('_')
    start, stop, strand = int(coord_fields[0]), int(coord_fields[1]), int(coord_fields[2])
    if_test.at[i, 'probability'] = prob_dict[(start, stop, strand)]

    # Find the GenBank CDS covering the largest fraction of this ORF.
    highest_overlap = 0
    highest_entry = None
    for feature in genbank_cds_boundaries:
        if not (start <= feature[1] and stop >= feature[0]):
            continue
        overlap = (min(stop, feature[1]) - max(start, feature[0])) / (stop - start)
        if overlap > highest_overlap:
            highest_overlap, highest_entry = overlap, feature
    if highest_entry is not None:
        if_test.at[i, 'Overlap'] = highest_overlap
        if_test.at[i, 'Genbank_Coordinates'] = highest_entry

# Keep hits whose alignment ends within 3 positions of the end of both the
# query and the subject.
query_end_ok = abs(if_test['query_length'] - if_test['query_end_alignment']) < 3
subject_end_ok = abs(if_test['subject_length'] - if_test['target_end_alignment']) < 3
if_test = if_test[query_end_ok & subject_end_ok]
if_test.to_csv(project_dir + '/if_test_2.csv')

In [None]:
# Print the per-position mutation counts for one fixed coordinate window and
# the finder's bin probability for that window.
of = orffn.H37Rv_ORF_Finder()
window_counts = of.mutation_count_list[4056047:4056344]
print(window_counts)
print(of.mutation_bin_probability(window_counts))

In [None]:
# Total mutation count per consecutive 500-position window, shown as a
# histogram. Despite its name, `average` holds sums. The range stops one
# window early, so the last complete window is not included.
window_size = 500
totlen = len(of.mutation_count_list)
window_offsets = range(0, (int(totlen / 500) - 1) * window_size, window_size)
average = [sum(of.mutation_count_list[offset: offset + window_size]) for offset in window_offsets]
sns.histplot(average, bins=300)

In [None]:
def bin_formula(max_bin_counts, tot_bin_counts):
    """Return P(X >= max_bin_counts) for X ~ Binomial(tot_bin_counts, 1/3).

    Args:
        max_bin_counts: Observed count in the bin of interest.
        tot_bin_counts: Total count over all three bins.

    Returns:
        The upper-tail probability of seeing at least ``max_bin_counts``
        successes in ``tot_bin_counts`` trials with success probability 1/3.
    """
    # The survival function sf(k - 1) equals 1 - cdf(k - 1) mathematically,
    # but it does not round tiny tail probabilities down to exactly 0.0.
    return binom.sf(max_bin_counts - 1, tot_bin_counts, 1/3)

def mutation_bin_probability(mutation_counts):
    """Sum counts by position modulo 3 and score the third bin.

    Args:
        mutation_counts: Per-position counts; position i contributes to
            bin ``i % 3``.

    Returns:
        The integer 2 when every bin total is zero; otherwise a tuple
        ``(bin_counts, probability)`` where ``probability`` is
        ``bin_formula`` applied to the third bin and the grand total.
    """
    totals = [0, 0, 0]
    frame = 0
    for count in mutation_counts:
        totals[frame] += count
        frame = (frame + 1) % 3
    grand_total = sum(totals)
    if grand_total == 0:
        # Nothing to test; 2 serves as the "no data" marker.
        return 2
    return (totals, bin_formula(totals[2], grand_total))

In [None]:
# Apply the notebook-level mutation_bin_probability to a fixed coordinate
# window of the finder's mutation counts.
mutation_bin_probability(of.mutation_count_list[4056047:4056344])

In [None]:
# NOTE(review): organism_name_dict is not defined in the visible notebook
# (the cells here build names_dict) - confirm where it comes from.
organism_name_dict

In [None]:
# Show a slice of the stored sequence (tuple element 0) for NC_000962.3.
orf_dict['NC_000962.3'][0][2760753:2760856]

In [None]:
# Show a slice of the stored sequence (tuple element 0) for NZ_CP058277.1.
orf_dict['NZ_CP058277.1'][0][5934538:5934641]

In [None]:
# Show a slice of the stored sequence (tuple element 0) for NZ_AP022581.1.
orf_dict['NZ_AP022581.1'][0][692416:692519]

In [None]:
trans = util.Translator()
# orf_dict is keyed by accession and holds (full_sequence, orfs) tuples, so
# the malformed trailing string index from the original line is dropped.
# NOTE(review): the original line did not parse; confirm that the
# (sequence, ORF list) pair for tb_species is what was wanted here.
v = orf_dict[tb_species]
   

In [None]:
 for k, v in orf_dict.items():
        full_sequence = v[0]
        orf_list = v[1]
        for x in orf_list:
            if x[2] == 1:
                prot = trans.translate_sequence(full_sequence[x[0]:x[1]], 1, 0)

In [None]:
# Translate every ORF (reverse-complementing those not on strand 1) and
# print the protein, or a fixed slice of it, for two hand-picked ORFs.
trans = util.Translator()
temp = []
for k, (full_sequence, orf_list) in orf_dict.items():
    for x in orf_list:
        orf_nt = full_sequence[x[0]:x[1]]
        if x[2] != 1:
            orf_nt = util.reverse_complement(orf_nt)
        prot = trans.translate_sequence(orf_nt, 1, 0)
        # Same accession@start_stop_strand naming as the query references.
        name = f"{k}@{x[0]}_{x[1]}_{x[2]}"
        if name == 'NC_000962.3@2760763_2762380_1':
            print(prot[:-1])
        if name == 'NZ_CP058277.1@5934638_5936567_1':
            print(prot[89:622])

In [None]:
# Show the stored entry (coordinates, plus the upstream region once computed)
# for one locus of NZ_CP080999.1.
cds_info_dict['NZ_CP080999.1']['K3U93_RS21900']