#### Imports, directories and run settings

In [65]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from Comparative_Analysis import ORF_Functions as orffn
from Comparative_Analysis import Alignment as alignfn
from random import sample
from Bio.Align.Applications import MuscleCommandline
pd.options.mode.chained_assignment = None  # default='warn'
import ete3;

In [97]:
# Locations of project outputs and of the input GenBank files.
project_dir = 'F:/Project_Data/E_Pump_Project'
seq_dir = 'F:/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'

# Accession.version used as the prefix of the M.tb query references.
tb_species = 'NC_000962.3'

# GenBank files (relative to seq_dir) that are parsed for CDS features.
sample_filenames = [
    'GCF_000195955.2_ASM19595v2_genomic.gbff',
    'GCF_024600175.1_ASM2460017v1_genomic.gbff',
    'GCF_010730055.1_ASM1073005v1_genomic.gbff',
    'GCF_010731535.1_ASM1073153v1_genomic.gbff',
    'GCF_022370755.1_ASM2237075v1_genomic.gbff',
    'GCF_016745295.1_ASM1674529v1_genomic.gbff',
    'GCF_000157895.3_ASM15789v2_genomic.gbff',
    'GCF_900603025.1_MHAS_genomic.gbff',
    'GCF_018363015.1_ASM1836301v1_genomic.gbff',
    'GCF_019645855.1_ASM1964585v1_genomic.gbff',
    'GCF_010727945.1_ASM1072794v1_genomic.gbff',
    'GCF_010731895.1_ASM1073189v1_genomic.gbff',
    'GCF_900637205.1_50279_G01_genomic.gbff',
    'GCF_000184435.1_ASM18443v1_genomic.gbff',
]

# The file among sample_filenames whose CDS supply the query proteins.
tb_annotation_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
tb_reannotation_filename = 'annot.gbk'

# Worker count and the matching 1-based list of core numbers.
num_cores = 16
core_numbers = [core + 1 for core in range(num_cores)]

# MUSCLE executable handed to MuscleCommandline.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

#### Collect all CDS translations and upstream regions for the sample species

In [98]:
# Locus tags of the M.tb CDS that are used as BLAST queries.
loci = [
    'Rv3239c',
    'Rv3728',
    'Rv1250',
    'Rv0783c',
    'Rv2846c',
    'Rv1063c',
    'Rv1410c',
    'Rv1877',
    'Rv2333c',
    'Rv2459',
    'Rv2565',
]

In [99]:
# Parse every sample GenBank file and collect, for each CDS that carries a translation:
#   all_cds           - [accession.version@locus_tag, protein translation] pairs
#   tb_cds            - the subset from tb_annotation_filename whose locus tag is in `loci`
#   names_dict        - accession.version -> organism name
#   upstream_seq_dict - accession.version@locus_tag -> up to 300 nt preceding the CDS plus
#                       its first 3 nt (reverse-strand features go through
#                       util.reverse_complement)
tb_cds = []
all_cds = []
names_dict = {}
upstream_seq_dict = {}
for filename in tqdm(sample_filenames):
    for genome_record in SeqIO.parse(seq_dir + '/' + filename, "genbank"):
        full_sequence = str(genome_record.seq)
        # These depend only on the record, so compute them once per record
        # instead of once per feature.
        accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
        names_dict[accession_ver] = genome_record.annotations['organism']
        for feature in genome_record.features:
            if feature.type != 'CDS':
                continue
            a = feature.qualifiers
            if a.get("translation") is None:
                # e.g. CDS features without a protein translation are skipped
                continue
            locus_tag = a.get("locus_tag")[0]
            accession_locus = accession_ver + '@' + locus_tag
            translation = a.get("translation")[0]
            all_cds.append([accession_locus, translation])
            if filename == tb_annotation_filename and locus_tag in loci:
                tb_cds.append([accession_locus, translation])
            (start, stop, strand) = (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
            if strand == 1:
                # Clamp at 0: for a CDS closer than 300 nt to the record start a negative
                # slice start would be interpreted relative to the END of the sequence and
                # give an empty or unrelated string.
                upstream_seq = full_sequence[max(0, start - 300): start + 3]
            else:
                # Slicing past the end of the string is already truncated by Python.
                upstream_seq = util.reverse_complement(full_sequence[max(0, stop - 3): stop + 300])
            upstream_seq_dict[accession_locus] = upstream_seq

100%|██████████| 14/14 [00:09<00:00,  1.46it/s]


#### Set up dataframe with annotated (based on PGP run) M.tb feature boundaries and Mycobrowser M.tb feature boundaries

#### Produce FASTA file with CDS candidates

In [100]:
# Write both protein collections to the project directory via util.produce_fasta_file:
# every CDS from all samples, and the selected M.tb query CDS.
for records, relative_path in ((all_cds, '/all_cds.fasta'), (tb_cds, '/tb_cds.fasta')):
    util.produce_fasta_file(records, project_dir + relative_path)

100%|██████████| 71376/71376 [00:05<00:00, 12588.03it/s]
100%|██████████| 11/11 [00:00<?, ?it/s]


In [101]:
# Build one BLAST database per FASTA file written above.
# Each tuple holds (fasta file name, database name, database directory).
for fasta_name, db_name, db_dir in (
    ('all_cds.fasta', 'all_cds', 'F:/Datasets/BLAST/all_cds'),
    ('tb_cds.fasta', 'tb_cds', 'F:/Datasets/BLAST/tb_cds'),
):
    blastfn.build_blast_db(project_dir, fasta_name, db_name, db_dir)

In [103]:
# Run blastp through blastfn.run_blastp with an e-value cut-off of 1e-10.
# NOTE(review): presumably 'tb_cds.fasta' is the query and 'all_cds' the database, with hits
# written to 'tb_cds_hits.csv' in the first directory - confirm against Blast_Functions.
blastfn.run_blastp('F:/Datasets/BLAST/all_cds', 'tb_cds.fasta', 'all_cds', 'tb_cds_hits.csv', e_value = 1e-10)

#### Analyse CDS orthologs in target species

In [104]:
# Load the blastp hits; names_dict (accession.version -> organism) is passed along and
# top_hit_only = False asks for all hits rather than only the top one.
blast_output = blastfn.process_blast_output('F:/Datasets/BLAST/all_cds/tb_cds_hits.csv', names_dict, top_hit_only = False)

In [105]:
# Keep, for each query/target-species pair, the hit(s) with the most identical residues,
# and require that the same hit is also the best one recorded for its target.
# NOTE: `test` starts out as the same object as blast_output, so the three helper columns
# added before the filter are visible on blast_output as well.
test = blast_output
test['num_identical_matches'] = test['alignment_length'] * test['percent_identical_matches'] / 100
test['max_identical_query_matches'] = (
    test.groupby(['query_ref', 'target_species'])['num_identical_matches'].transform('max')
)
test['max_identical_target_matches'] = (
    test.groupby('target_ref')['num_identical_matches'].transform('max')
)
# Both maxima were computed on the unfiltered table, so the two conditions can be applied
# together; the 0.9999 factor tolerates floating-point noise in the comparison.
test = test[
    (test['num_identical_matches'] > 0.9999 * test['max_identical_query_matches'])
    & (test['num_identical_matches'] > 0.9999 * test['max_identical_target_matches'])
]
# Number of retained hits per query, and the upstream region of each target CDS.
test['count_matches'] = test.groupby('query_ref')['target_ref'].transform('count')
test['upstream_sequence'] = test['target_ref'].map(upstream_seq_dict)
test.to_csv(project_dir + '/test.csv')

In [106]:
# Query identifiers in the same '<accession.version>@<locus_tag>' form used for the CDS names.
query_refs = [f'{tb_species}@{locus}' for locus in loci]

In [107]:
# For every query locus: write the upstream sequences of its retained hits to a scratch
# FASTA file and align them with MUSCLE. The alignment for a query is written to
# <project_dir>/align_output_<query_ref>.
# A single path is used for both writing and reading the scratch file (the MUSCLE input
# was previously built with a doubled '/' separator).
alignment_input = project_dir + '/sequences_to_align.faa'
for ref in query_refs:
    print(ref)
    temp = test[test['query_ref'] == ref]
    sequences = [
        [target_ref, upstream_sequence]
        for target_ref, upstream_sequence in zip(temp['target_ref'], temp['upstream_sequence'])
    ]
    if not sequences:
        # Nothing to align for this query; running MUSCLE on an empty file would abort the loop.
        print('No hits retained for ' + ref + ' - alignment skipped')
        continue
    util.produce_fasta_file(sequences, alignment_input)
    cline = MuscleCommandline(muscle_exe, input=alignment_input, out=project_dir + '/align_output_' + ref)
    result = cline()

NC_000962.3@Rv3239c


100%|██████████| 6/6 [00:00<?, ?it/s]

NC_000962.3@Rv3728



100%|██████████| 9/9 [00:00<00:00, 9002.80it/s]


NC_000962.3@Rv1250


100%|██████████| 14/14 [00:00<?, ?it/s]


NC_000962.3@Rv0783c


100%|██████████| 7/7 [00:00<?, ?it/s]


NC_000962.3@Rv2846c


100%|██████████| 14/14 [00:00<?, ?it/s]


NC_000962.3@Rv1063c


100%|██████████| 14/14 [00:00<00:00, 14125.63it/s]


NC_000962.3@Rv1410c


100%|██████████| 14/14 [00:00<?, ?it/s]

NC_000962.3@Rv1877



100%|██████████| 11/11 [00:00<?, ?it/s]


NC_000962.3@Rv2333c


100%|██████████| 11/11 [00:00<?, ?it/s]


NC_000962.3@Rv2459


100%|██████████| 6/6 [00:00<?, ?it/s]


NC_000962.3@Rv2565


100%|██████████| 1/1 [00:00<?, ?it/s]


In [114]:
# Load the MUSCLE output written for one query locus into an alignfn.Alignment object.
query_ref = 'NC_000962.3@Rv1410c'
align = alignfn.Alignment(
    f'{project_dir}/align_output_{query_ref}',
    query_ref,
    'NT',
    insert_symbol='-',
    species_order=[],
)

In [115]:
# Compute entropies on the loaded alignment with modified=False
# (the meaning of `modified` is defined in Comparative_Analysis.Alignment, not shown here).
align.calculate_entropies(modified=False)

In [116]:
# Display the aligned sequences held by the Alignment object (gap symbol '-').
align.sequence_list

['------------CAACGGTCTGGGTAACTGGCTGGCCAGCATCACCGACGCCAAGTCGGAGGCCAGCGAGACCATCAATGGTGTTGACACGGTGCGGATCTCCGGCAAGGTCAGCGCCGACGCCATGAACAAACTGATCCCGC---TCAAGGCGACCACCCCGCTGCCCGCCACCGTCTGGATCCAGAAGGCCGACCCCCATCAGCTGGTGCAGGCCAAGGCCGACACAGGCAACGGCGGCAGCATTCAGATCACGCTGTCGGAGTGGGACAAGCCCGTCACCGTCACCAAGCCCGCCGT---CTGACG---------------------GCCTGAGTCATG',
 '---GAACCCCGAGGCCGGGGTGGCCAACATCCTGGCCAACTTCTCCGACGCCAAGTCCGAGGGCACCGAGCAGATCGACGGCATCGACACCGTCCGGGTCACCGGTGAGGTCAGCGCCGACGCGGTCAACCAGCTGATCCCCTCGCTGAAGGCCACCTCGCCCGTGCCGGGCACCGCGTGGATCGAGAACGGCGGCGACCACAACCTGGTGCGCGCTCAGATCGAGCCGACCGGTGACAGCTCCATCGAGCTGACCCTGTCGAAGTGGAACGAGCCCGTCACCGTCACCAAGCCGCAGGT---GTAATG---------------------------------',
 '------CCCGAACACCGGTCTGGCCAACATCTTGACCAACATCAGCAACCCGAAGTCCCAGTCGCGCGAAACGATCAACGGGCAAAGCACCGTCAAGATCACCGGGACGGCCGCGGCGGACGCCGTGAATGGCCTCGCGCCCCAGCTGAAGGCCACCCAGCCGGCGGCCACCACCGTGTGGATCGAGGAGAACGGTGATCACCAACTGGTGCAGATTCAGCTCGAGCAGAGCCCGGGCAACTCGGTGCAGATGACGCTGTCCAACTGGAACGCGCCGGTCCAGGTCACCAAGCCCCCGGTGGCC

In [None]:
# Scatter plot of 'target_1_start' against 'query_1_start'.
# NOTE(review): temp_df is not defined in the visible part of this notebook - confirm its source.
sns.scatterplot(data = temp_df, x = 'query_1_start', y = 'target_1_start', s=1)

In [None]:
# Re-save the filtered hit table (same target file as written after filtering).
test.to_csv(project_dir + '/test.csv')

In [None]:
# Scatter plot of 'target_start' against 'tb_start' for the filtered hits.
# NOTE(review): these columns are not created in the visible cells - presumably supplied by
# process_blast_output; confirm.
sns.scatterplot(data = test, x  = 'tb_start', y = 'target_start', s=1 )

In [None]:
# Distribution of percent identity over the filtered hits.
sns.histplot(data = test, x = 'percent_identical_matches')

#### Analyse interfeature orthologs in target species

In [None]:
# Index the ORF probabilities by their (start, stop, strand) coordinates; the length field
# of each record is not needed for the lookup.
prob_dict = {
    (start, stop, strand): prob
    for (start, stop, strand, _length, prob) in mycobrowser_inter_feature_orf_probabilities
}

In [None]:
# Annotate inter-feature ORF hits against Mycobacterium marinum with
#   probability         - looked up in prob_dict by the ORF's (start, stop, strand)
#   Overlap             - largest fraction of the ORF covered by any entry of genbank_cds_boundaries
#   Genbank_Coordinates - the entry giving that largest overlap ('' when nothing overlaps)
# and then keep only hits whose alignment ends within 3 positions of the end of both the
# query and the subject.
if_blast_output = blastfn.process_blast_output('F:/Datasets/BLAST/comparator_orfs/if_hits.csv', organism_name_dict, top_hit_only = False)
if_test = if_blast_output[if_blast_output['target_species_name'] == 'Mycobacterium marinum']
# Float default: fractional overlaps are written into this column below, and assigning a
# float into an integer column is an incompatible-dtype assignment in recent pandas.
if_test['Overlap'] = 0.0
if_test['Genbank_Coordinates'] = ''

for i, r in tqdm(if_test.iterrows(), total=len(if_test)):
    # query_ref has the form '<accession>@<start>_<stop>_<strand>'; parse it once.
    coordinate_fields = r['query_ref'].split('@')[1].split('_')
    start = int(coordinate_fields[0])
    stop = int(coordinate_fields[1])
    strand = int(coordinate_fields[2])
    if_test.at[i,'probability'] = prob_dict[(start, stop, strand)]
    highest_overlap = 0
    highest_entry = None
    for feature in genbank_cds_boundaries:
        # feature[0] / feature[1] are used as the lower / upper boundary of the feature
        if start <= feature[1] and stop >= feature[0]:
            overlap = (min(stop, feature[1]) - max(start, feature[0]))/(stop - start)
            if overlap > highest_overlap:
                highest_overlap = overlap
                highest_entry = feature
    if highest_entry is not None:
        if_test.at[i,'Overlap'] = highest_overlap
        if_test.at[i,'Genbank_Coordinates'] = highest_entry
if_test = if_test[abs(if_test['query_length'] - if_test['query_end_alignment']) < 3]
if_test = if_test[abs(if_test['subject_length'] - if_test['target_end_alignment']) < 3]
if_test.to_csv(project_dir + '/if_test_2.csv')

In [None]:
# Inspect the per-position mutation counts for one genomic window and the bin probability
# the ORF finder reports for that same window.
of = orffn.H37Rv_ORF_Finder()
region_counts = of.mutation_count_list[4056047:4056344]
print(region_counts)
print(of.mutation_bin_probability(region_counts))

In [None]:
# Histogram of mutation counts summed over consecutive 500-position windows.
# `average` holds per-window totals (sums, not means). The window range stops one window
# short of int(totlen/500), exactly as before.
totlen = len(of.mutation_count_list)
average = [
    sum(of.mutation_count_list[offset: offset + 500])
    for offset in range(0, 500 * (int(totlen/500) - 1), 500)
]
sns.histplot(average, bins=300)

In [None]:
def bin_formula(max_bin_counts, tot_bin_counts):
    """Return P(X >= max_bin_counts) for X ~ Binomial(tot_bin_counts, 1/3).

    The upper tail is taken from the survival function rather than from
    ``1 - cdf``: for very small tail probabilities the subtraction rounds
    to exactly 0.0, whereas ``sf`` keeps the small value.
    """
    # sf(k) is P(X > k), so P(X >= max_bin_counts) is sf(max_bin_counts - 1).
    return binom.sf(max_bin_counts - 1, tot_bin_counts, 1/3)

def mutation_bin_probability(mutation_counts):
    """Bin mutation counts by position modulo 3 and score the third bin.

    Returns the integer 2 when the counts sum to zero; otherwise a tuple
    ``(bin_counts, probability)`` where ``bin_counts`` is the list of the
    three per-bin totals and ``probability`` is ``bin_formula`` applied to
    the third bin's total and the overall total.
    """
    totals = [0, 0, 0]
    for position, count in enumerate(mutation_counts):
        totals[position % 3] += count
    overall = sum(totals)
    if overall == 0:
        # No mutations at all: return the sentinel value instead of a probability.
        return 2
    return (totals, bin_formula(totals[2], overall))

In [None]:
# Apply the notebook-level mutation_bin_probability to the same window probed above.
mutation_bin_probability(of.mutation_count_list[4056047:4056344])

In [None]:
# Display organism_name_dict (defined outside the visible part of this notebook).
organism_name_dict

In [None]:
# Display positions 2760753-2760856 of the first element stored for NC_000962.3
# (used as the full sequence in the loops further down).
orf_dict['NC_000962.3'][0][2760753:2760856]

In [None]:
# Display positions 5934538-5934641 of the sequence stored for NZ_CP058277.1.
orf_dict['NZ_CP058277.1'][0][5934538:5934641]

In [None]:
# Display positions 692416-692519 of the sequence stored for NZ_AP022581.1.
orf_dict['NZ_AP022581.1'][0][692416:692519]

In [None]:
# Translator used by the ORF translation loops.
trans = util.Translator()
# The original statement had a stray string fragment after the subscript and did not parse.
# NOTE(review): reduced to the entry for tb_species; if a single ORF
# ('NC_000962.3@2760763_2762380_1') was meant, select it from v[1] - confirm intent.
v = orf_dict[tb_species]
   

In [None]:
 for k, v in orf_dict.items():
        full_sequence = v[0]
        orf_list = v[1]
        for x in orf_list:
            if x[2] == 1:
                prot = trans.translate_sequence(full_sequence[x[0]:x[1]], 1, 0)

In [None]:
# Translate every ORF in orf_dict (reverse-strand ORFs are reverse-complemented first) and
# print selected stretches of the protein for two specific ORFs.
trans = util.Translator()
temp = []
for accession, genome_entry in orf_dict.items():
    full_sequence, orf_list = genome_entry[0], genome_entry[1]
    for orf in orf_list:
        orf_start, orf_stop, orf_strand = orf[0], orf[1], orf[2]
        coding_sequence = full_sequence[orf_start:orf_stop]
        if orf_strand != 1:
            coding_sequence = util.reverse_complement(coding_sequence)
        prot = trans.translate_sequence(coding_sequence, 1, 0)
        # ORF identifier: '<accession>@<start>_<stop>_<strand>'
        name = accession + '@' + '_'.join(str(part) for part in (orf_start, orf_stop, orf_strand))
        if name == 'NC_000962.3@2760763_2762380_1':
            # everything except the final character of the translation
            print(prot[:-1])
        if name == 'NZ_CP058277.1@5934638_5936567_1':
            print(prot[89:622])