#### Directories etc

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from collections import defaultdict
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from Comparative_Analysis import ORF_Functions as orffn
from random import sample
pd.options.mode.chained_assignment = None  # default='warn'
import ete3;



In [2]:
# Filesystem locations and run-wide settings used by the cells below.
project_dir = 'F:/Project_Data/Project_11'            # pickles / CSVs produced by this notebook
seq_dir = 'F:/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'   # holds the reference GenBank file
dictionary_dir = 'F:/Datasets/CRYPTIC_DATA/Cryptic_Dictionaries'          # per-chunk variant dictionaries
mutation_count_dir = 'F:/Datasets/CRYPTIC_DATA/Cryptic_Mutation_Counts'   # per-position mutation counts
tb_species = 'NC_000962.3'
tb_genome_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
mycobrowser_dir = 'F:/Datasets/Data_From_Publications'
num_cores = 16
core_numbers = list(range(1, num_cores + 1))          # 1-based core indices: 1..num_cores
cryptic_input_path = 'F:/Datasets/CRYPTIC_DATA'       # VARIANTS.csv, GENOMES.csv, distance .npy files

In [3]:
# Parse the reference GenBank file and keep its nucleotide sequence as a plain
# string.  If the file contains several records, only the last one is retained.
for record in SeqIO.parse(seq_dir + '/' + tb_genome_filename, "genbank"):
    full_sequence = str(record.seq)

In [4]:
full_run = False

#### Functions

In [None]:
def produce_sequences(position_list, variant_dict, position_dict):
    """Build one nucleotide string per sample, restricted to `position_list`.

    Every sample (key of `variant_dict`) starts from the reference bases read
    from the module-level `full_sequence`; each (sample, alt) pair recorded in
    `position_dict` for a listed position then overwrites that column with the
    upper-cased alternative base.

    Returns a list of ['seq_<sample>', sequence_string] pairs.
    """
    # CRyPTIC positions are 1-based, the genome string is 0-based.
    reference_row = [full_sequence[pos - 1] for pos in position_list]
    rows_by_sample = {sample: list(reference_row) for sample in variant_dict}

    for column, pos in enumerate(position_list):
        if pos not in position_dict:
            continue
        # Element 0 is the reference base; the rest are (sample, alt) pairs.
        for sample, alt in position_dict[pos][1:]:
            rows_by_sample[sample][column] = alt.upper()

    return [['seq_' + str(sample), ''.join(row)] for sample, row in rows_by_sample.items()]

In [None]:
def produce_sequences_to_score(position_list, variant_dict, position_dict, distinct_sequence_names):
    """Build per-sample lists of single-element state sets for Fitch scoring.

    Only samples that are keys of `variant_dict` AND members of
    `distinct_sequence_names` are included.  Each column starts as a set
    holding the reference base (from the module-level `full_sequence`) and is
    replaced by a set holding the upper-cased alternative base wherever
    `position_dict` records a variant for that sample.

    Returns a dict mapping 'seq_<sample>' to its list of sets.
    """
    # CRyPTIC positions are 1-based, the genome string is 0-based.
    reference_states = [{full_sequence[pos - 1]} for pos in position_list]
    states_by_sample = {
        sample: copy.deepcopy(reference_states)
        for sample in variant_dict
        if sample in distinct_sequence_names
    }

    for column, pos in enumerate(position_list):
        if pos not in position_dict:
            continue
        # Element 0 is the reference base; the rest are (sample, alt) pairs.
        for sample, alt in position_dict[pos][1:]:
            if sample in states_by_sample:
                states_by_sample[sample][column] = {alt.upper()}

    return {'seq_' + str(sample): states for sample, states in states_by_sample.items()}

In [None]:
def fitch_1(list_1, list_2):
    """Combine two children's state sets column by column (bottom-up step).

    For each pair of sets the result is their intersection, or their union
    when the intersection is empty.  Returns the list of combined sets.
    """
    return [
        left.intersection(right) or left.union(right)
        for left, right in zip(list_1, list_2)
    ]

def fitch_2(parent_list, child_list):
    """Resolve a child's state sets against its parent (top-down step).

    For each column, the child keeps the states it shares with the parent.
    When nothing is shared, one state is taken from the child's own set and
    the column is flagged as a mutation.

    Returns (resolved_sets, mutation_flags) where mutation_flags holds 1 for
    columns that needed a change and 0 otherwise.
    """
    res = []
    mutations = []
    for parent_states, child_states in zip(parent_list, child_list):
        shared = parent_states.intersection(child_states)
        if shared:
            res.append(shared)
            mutations.append(0)
        else:
            # Wrap the chosen state itself in a set; `set(state)` would split a
            # multi-character state into its individual characters.
            res.append({list(child_states)[0]})
            mutations.append(1)
    return (res, mutations)

In [None]:
def generate_mutation_counts(filename, core_number):
    """Count mutations per column on the project tree for one slice of a chunk file.

    `filename` is a pickled dict mapping leaf names to lists of state sets; its
    name is expected to end in `_<start>_<stop>_.pkl`, from which `start` and
    `stop` are taken (as strings).  The slice scored is columns
    [core_number * 100, (core_number + 1) * 100).

    The tree is read from `project_dir + '/my_tree.nwk'`.  Only the first two
    children of each internal node are used, i.e. the tree is treated as binary.

    Returns (start, stop, mutation_counts) with one count per column.
    """
    seq_length = 100
    name_parts = filename.split('_')
    start = name_parts[-3]
    stop = name_parts[-2]
    with open(filename, 'rb') as f:
        sequence_to_score_dict = pickle.load(f)
    master_tree2 = ete3.Tree(project_dir + '/' + 'my_tree.nwk')

    # Bottom-up pass: leaves get their observed slice, internal nodes get the
    # combined state sets of their two children.
    for node in master_tree2.traverse("postorder"):
        if node.is_leaf():
            node.add_features(seq=sequence_to_score_dict[node.name][core_number * seq_length: (core_number + 1) * seq_length])
        else:
            children = node.children
            node.add_features(seq=fitch_1(children[0].seq, children[1].seq))

    # Top-down pass: resolve each child against its parent and accumulate counts.
    mutation_counts = [0 for _ in range(seq_length)]
    for node in master_tree2.traverse("preorder"):
        if node.is_leaf():
            continue
        if node.is_root():
            # Pick a single state per column at the root.
            node.seq = [{list(x)[0]} for x in node.seq]
        mutations = []
        child_sequences = []
        for child in node.children:
            (resolved, flags) = fitch_2(node.seq, child.seq)
            child_sequences.append(resolved)
            child.seq = resolved
            mutations.append(flags)
        updated_counts = []
        for n, (h, i, j) in enumerate(zip(mutation_counts, mutations[0], mutations[1])):
            if i + j == 0:
                updated_counts.append(h)
            elif i + j == 1:
                updated_counts.append(h + 1)
            else:
                # Both children changed: compare their states at THIS column
                # (index n, not the 0/1 mutation flag i).  Identical states
                # count as one mutation, different states as two.
                if child_sequences[0][n] == child_sequences[1][n]:
                    updated_counts.append(h + 1)
                else:
                    updated_counts.append(h + 2)
        mutation_counts = updated_counts
    return (start, stop, mutation_counts)

#### Create variant dictionaries 

In [None]:
# Full run only: read the raw CRyPTIC variant table and cache the columns used
# by this notebook as a pickle in the project directory.
if full_run == True:
    variant_df = pd.read_csv(cryptic_input_path + "/VARIANTS.csv") 
    with open(project_dir + '/variant_df.pkl', 'wb') as f:
        pickle.dump(variant_df[['UNIQUEID', 'VARIANT', 'MUTATION_TYPE', 'IS_NULL', 'IS_HET', 'IS_FILTER_PASS', 'IS_SNP', 'REF', 'ALT', 'GENOME_INDEX']], f)    

In [None]:
# Build and cache three lookups from the pickled variant table:
#   id_dict       UNIQUEID -> integer sample index
#   variant_dict  sample index -> [(GENOME_INDEX, ALT), ...]
#   position_dict GENOME_INDEX -> [REF, (sample index, ALT), ...]
# Only rows that are non-null, filter-passing, non-heterozygous SNPs are used.
#if full_run == True:
if 1==1:
    with open(project_dir + '/variant_df.pkl', 'rb') as f:
        variant_df = pickle.load(f)

    unique_ids = variant_df.UNIQUEID.unique()
    id_dict = {unique_id: idx for idx, unique_id in enumerate(unique_ids)}

    variant_dict = {}
    position_dict = {}
    for i, r in variant_df.iterrows():
        keep_row = r['IS_NULL'] == False and r['IS_FILTER_PASS'] == True and r['IS_HET'] == False and r['IS_SNP'] == True
        if not keep_row:
            continue
        sample_idx = id_dict[r['UNIQUEID']]
        genome_pos = r['GENOME_INDEX']
        alt = r['ALT']
        variant_dict.setdefault(sample_idx, []).append((genome_pos, alt))
        # The first entry for a position also carries the reference value for info.
        position_dict.setdefault(genome_pos, [r['REF']]).append((sample_idx, alt))

    for stem, payload in (('id_dict', id_dict), ('variant_dict', variant_dict), ('position_dict', position_dict)):
        with open(project_dir + '/' + stem + '.pkl', 'wb') as f:
            pickle.dump(payload, f)
        

In [5]:
# Quick run: load the previously cached lookups and variant table from the
# project directory instead of rebuilding them.
if full_run == False:
    with open(project_dir + '/id_dict.pkl', 'rb') as f:
        id_dict = pickle.load(f)  
    with open(project_dir + '/variant_dict.pkl', 'rb') as f:
        variant_dict = pickle.load(f)  
    with open(project_dir + '/position_dict.pkl', 'rb') as f:
        position_dict = pickle.load(f)  
    with open(project_dir + '/variant_df.pkl', 'rb') as f:
        variant_df = pickle.load(f)      

In [5]:
with open(project_dir + '/variant_df.pkl', 'rb') as f:
        variant_df = pickle.load(f)      
genomes_df = pd.read_csv(cryptic_input_path + '/GENOMES.csv')

  genomes_df = pd.read_csv(cryptic_input_path + '/GENOMES.csv')


In [29]:
temp = genomes_df#[genomes_df['UNIQUEID'] == 'site.02.subj.0005.lab.2014222011.iso.1']
temp2 = temp[temp['SNP_DISTANCE_TO_H37rV'] <100]
len(temp2)
for i, r in temp2.iterrows():
    print(r['UNIQUEID'], r['SNP_DISTANCE_TO_H37rV'] , len(variant_dict[id_dict[r['UNIQUEID']]]))

site.05.subj.LR-2285.lab.FN-00492-18.iso.1 95.0 95
site.04.subj.03671.lab.JJH10007.iso.1 54.0 54
site.13.subj.070208197.lab.070208197.iso.1 45.0 45
site.13.subj.070207152.lab.070207152.iso.1 38.0 38
site.13.subj.070208207.lab.070208207.iso.1 41.0 41
site.13.subj.080200043.lab.080200043.iso.1 37.0 37
site.03.subj.GB-91540144.lab.IML-01052.iso.1 45.0 45
site.03.subj.GB-91540133.lab.IML-01053.iso.1 45.0 45
site.03.subj.GB-91540141.lab.IML-01056.iso.1 44.0 44
site.03.subj.GB-91540140.lab.IML-01055.iso.1 45.0 45
site.03.subj.GB-91540138.lab.IML-01054.iso.1 44.0 44
site.06.subj.MHL_0208-14.lab.06MIL0327.iso.1 40.0 40
site.10.subj.YA00038861.lab.YA00038861.iso.1 47.0 47
site.10.subj.YA00099963.lab.YA00099963.iso.1 39.0 39


In [49]:
len(variant_dict[id_dict['site.05.subj.LR-2285.lab.FN-00492-18.iso.1']])

95

In [64]:
df_cd = pd.merge(variant_df, , how='outer', on = 'GENOME_INDEX')
for i, r in df_cd.iterrows():

97961825

In [6]:
gpi_genomes_df = genomes_df[genomes_df['BELONGS_GPI']==True]

In [7]:
gpi_variants_df = pd.merge(variant_df, gpi_genomes_df, how='inner', on = 'UNIQUEID')

In [12]:
gpi_variants_df

Unnamed: 0,UNIQUEID,VARIANT,MUTATION_TYPE,IS_NULL,IS_HET,IS_FILTER_PASS,IS_SNP,REF,ALT,GENOME_INDEX,...,IMAGE_MD5SUM,FTP_PATH,FTP_FILENAME_VCF,TREE_PATH,TREE_FILENAME_VCF,FASTQ_MD5SUMS,SEQTREAT_SAMPLE,MYKROBE_LINEAGE_NAME_1,MYKROBE_LINEAGE_NAME_2,ENA
0,site.02.subj.0958.lab.22A197.iso.1,1849c>a,SNP,False,False,True,True,c,a,1849,...,{'02-0958-22A197-1-14': 'a587bac9ad2a0ebd36274...,/well/bag/jeffk/release_staging/,00/01/41/00/14100/site.02.iso.1.subject.0958.l...,dat/CRyPTIC2/V2/02/0958/22A197/1/regenotyped/,site.02.subj.0958.lab.22A197.iso.1.v0.8.3.rege...,,False,Lineage 2,lineage2.2.1,ERS5301054
1,site.02.subj.0958.lab.22A197.iso.1,1977a>g,SNP,False,False,True,True,a,g,1977,...,{'02-0958-22A197-1-14': 'a587bac9ad2a0ebd36274...,/well/bag/jeffk/release_staging/,00/01/41/00/14100/site.02.iso.1.subject.0958.l...,dat/CRyPTIC2/V2/02/0958/22A197/1/regenotyped/,site.02.subj.0958.lab.22A197.iso.1.v0.8.3.rege...,,False,Lineage 2,lineage2.2.1,ERS5301054
2,site.02.subj.0958.lab.22A197.iso.1,4013t>c,SNP,False,False,True,True,t,c,4013,...,{'02-0958-22A197-1-14': 'a587bac9ad2a0ebd36274...,/well/bag/jeffk/release_staging/,00/01/41/00/14100/site.02.iso.1.subject.0958.l...,dat/CRyPTIC2/V2/02/0958/22A197/1/regenotyped/,site.02.subj.0958.lab.22A197.iso.1.v0.8.3.rege...,,False,Lineage 2,lineage2.2.1,ERS5301054
3,site.02.subj.0958.lab.22A197.iso.1,7362g>c,SNP,False,False,True,True,g,c,7362,...,{'02-0958-22A197-1-14': 'a587bac9ad2a0ebd36274...,/well/bag/jeffk/release_staging/,00/01/41/00/14100/site.02.iso.1.subject.0958.l...,dat/CRyPTIC2/V2/02/0958/22A197/1/regenotyped/,site.02.subj.0958.lab.22A197.iso.1.v0.8.3.rege...,,False,Lineage 2,lineage2.2.1,ERS5301054
4,site.02.subj.0958.lab.22A197.iso.1,7585g>c,SNP,False,False,True,True,g,c,7585,...,{'02-0958-22A197-1-14': 'a587bac9ad2a0ebd36274...,/well/bag/jeffk/release_staging/,00/01/41/00/14100/site.02.iso.1.subject.0958.l...,dat/CRyPTIC2/V2/02/0958/22A197/1/regenotyped/,site.02.subj.0958.lab.22A197.iso.1.v0.8.3.rege...,,False,Lineage 2,lineage2.2.1,ERS5301054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18159372,site.10.subj.YA00166043.lab.YA00166043.iso.1,4338587_indel,INDEL,False,False,True,False,agctccgagctctagt,agctccgagtctagt,4338587,...,{'10-YA00166043-YA00166043-1-14': 'e151041dd9b...,/well/bag/jeffk/release_staging/,00/02/95/61/29561/site.10.iso.1.subject.YA0016...,dat/CRyPTIC2/V2/10/YA00166043/YA00166043/1/reg...,site.10.subj.YA00166043.lab.YA00166043.iso.1.v...,,False,Lineage 2,lineage2.2.1,NO_ENA
18159373,site.10.subj.YA00166043.lab.YA00166043.iso.1,4379044_indel,INDEL,False,False,True,False,cg,c,4379044,...,{'10-YA00166043-YA00166043-1-14': 'e151041dd9b...,/well/bag/jeffk/release_staging/,00/02/95/61/29561/site.10.iso.1.subject.YA0016...,dat/CRyPTIC2/V2/10/YA00166043/YA00166043/1/reg...,site.10.subj.YA00166043.lab.YA00166043.iso.1.v...,,False,Lineage 2,lineage2.2.1,NO_ENA
18159374,site.10.subj.YA00166043.lab.YA00166043.iso.1,4383144_indel,INDEL,False,False,True,False,c,ccgggg,4383144,...,{'10-YA00166043-YA00166043-1-14': 'e151041dd9b...,/well/bag/jeffk/release_staging/,00/02/95/61/29561/site.10.iso.1.subject.YA0016...,dat/CRyPTIC2/V2/10/YA00166043/YA00166043/1/reg...,site.10.subj.YA00166043.lab.YA00166043.iso.1.v...,,False,Lineage 2,lineage2.2.1,NO_ENA
18159375,site.10.subj.YA00166043.lab.YA00166043.iso.1,4400660_indel,INDEL,False,False,True,False,ac,a,4400660,...,{'10-YA00166043-YA00166043-1-14': 'e151041dd9b...,/well/bag/jeffk/release_staging/,00/02/95/61/29561/site.10.iso.1.subject.YA0016...,dat/CRyPTIC2/V2/10/YA00166043/YA00166043/1/reg...,site.10.subj.YA00166043.lab.YA00166043.iso.1.v...,,False,Lineage 2,lineage2.2.1,NO_ENA


In [11]:
for i, r in gpi_variants_df.iterrows():
    print(i, r['GENOME_INDEX'], r['REF'], full_sequence[ r['GENOME_INDEX'] -1])
    if i>10:
        break

0 1849 c C
1 1977 a A
2 4013 t T
3 7362 g G
4 7585 g G
5 9304 g G
6 11312 g G
7 11820 c C
8 11879 t A
9 14785 t T
10 14861 g G
11 15117 c C


In [65]:
for k in range(len(cryptic_labels)):
    if cryptic_labels[k] == 'site.05.subj.LR-2285.lab.FN-00492-18.iso.1':
        print(k)

7119


In [63]:

len(genomes_df[genomes_df['BELONGS_GPI']==True])

15228

In [6]:
temp = variant_df[variant_df['UNIQUEID'] == 'site.00.subj.LE10KTB_23.lab.7627572.iso.1']

In [8]:
temp.to_csv(project_dir + '/nick2.csv')

In [None]:
df_1 = variant_df[variant_df['UNIQUEID'] == 'site.02.subj.0005.lab.2014222011.iso.1']
df_2 = variant_df[variant_df['UNIQUEID'] == 'site.02.subj.0007.lab.2014222016.iso.1']

In [None]:
df_cd = pd.merge(df_1, df_2, how='outer', on = 'GENOME_INDEX')
for i, r in df_cd.iterrows():
    df_cd.at[i,'refseq'] = full_sequence[r['GENOME_INDEX']-1]

In [None]:
distance = 0
for i, r in df_cd.iterrows():
    if (not(r['VARIANT_x'] == r['VARIANT_y'])):
        distance +=1
distance

In [None]:
df_cd.to_csv(project_dir  +'/nick.csv')

In [None]:
nick = df_cd.query('ALT_x == ALT_y')

In [None]:
df_cd

In [31]:
cryptic_labels = np.load(cryptic_input_path + '/GPI_SNP_DISTANCES_LABELS.npy')
cryptic_distances=np.load(cryptic_input_path + '/GPI_SNP_DISTANCES_VALUES.npy')

In [None]:
len(cryptic_distances)

In [34]:
def generate_distances(snp_pos_dict_1, snp_pos_dict_2):
    """Pairwise SNP distances between every sample in the two dictionaries.

    Each dictionary maps a sample name to a set of '<position><alt>' strings.
    The distance is the number of distinct positions in the symmetric
    difference: the final character of each entry is stripped first, so two
    samples carrying different alternatives at one position count once.

    Returns a dict keyed by (name_1, name_2) and (name_2, name_1).
    """
    distance_dict = {}
    for name_a, snps_a in snp_pos_dict_1.items():
        for name_b, snps_b in snp_pos_dict_2.items():
            differing_positions = {snp[:-1] for snp in snps_a.symmetric_difference(snps_b)}
            distance_dict[(name_a, name_b)] = distance_dict[(name_b, name_a)] = len(differing_positions)
    return distance_dict

In [35]:
def nick_distances(id_1, id_2):
    """Recompute the SNP distance between two samples given their label indices.

    `id_1` and `id_2` index into the module-level `cryptic_labels`; the labels
    are mapped through `id_dict` to the samples' variant lists in
    `variant_dict`, and the distance is taken from `generate_distances`.
    """
    samp_1 = cryptic_labels[id_1]
    samp_2 = cryptic_labels[id_2]
    snp_pos_dict = {
        samp: {str(pos) + snp for (pos, snp) in variant_dict[id_dict[samp]]}
        for samp in (samp_1, samp_2)
    }
    return generate_distances(snp_pos_dict, snp_pos_dict)[(samp_1, samp_2)]

In [54]:
# Compare the published pairwise distances for sample index 7119 against the
# distances recomputed from the variant dictionaries, and report every pair
# whose two values differ by more than 400 together with the other sample's
# SNP distance to H37Rv.
i = 7119
for j in range(len(cryptic_labels)):
    if j < 20 or i == j:
        continue

    reported = cryptic_distances[i, j]
    recomputed = nick_distances(i, j)      # computed once and reused below
    if abs(reported - recomputed) > 400:
        temp = genomes_df[genomes_df['UNIQUEID'] == cryptic_labels[j]]
        print (i, j, reported, recomputed)
        # Use a throwaway index so the outer loop variable `j` is not rebound.
        for _, r in temp.iterrows():
            print(r['SNP_DISTANCE_TO_H37rV'])

7119 6689 27 543
624.0
7119 6956 20 539
608.0
7119 6978 28 548
611.0
7119 7106 19 557
642.0
7119 7324 27 561
642.0
7119 7401 22 558
623.0
7119 7408 21 538
609.0
7119 7722 9 552
645.0
7119 7858 16 535
600.0
7119 7889 9 551
642.0
7119 7908 14 534
599.0
7119 7910 18 533
616.0
7119 7915 4 528
615.0
7119 7948 3 532
605.0
7119 7965 18 551
636.0
7119 7979 16 550
631.0
7119 7988 23 553
634.0
7119 8033 18 550
635.0
7119 8160 15 552
635.0
7119 8236 24 556
639.0
7119 8318 5 548
637.0
7119 8321 6 554
645.0
7119 8325 27 561
644.0
7119 8333 9 554
647.0
7119 8350 16 531
608.0
7119 8394 23 557
640.0
7119 8415 21 556
639.0
7119 8440 15 537
600.0
7119 8942 20 553
638.0


In [None]:
snp_pos_dict = {}
snp_1 = set([str(pos) + snp for (pos, snp) in variant_dict[id_dict[samp_1]]]) 
snp_2 = set([str(pos) + snp for (pos, snp) in variant_dict[id_dict[samp_2]]]) 
snp_pos_dict[samp_1] = snp_1
snp_pos_dict[samp_2] = snp_2

In [None]:
def generate_distances(snp_pos_dict_1, snp_pos_dict_2):
    """Pairwise SNP distances between every sample in the two dictionaries.

    Each dictionary maps a sample name to a set of '<position><alt>' strings.
    The distance is the size of the raw symmetric difference, so two samples
    carrying different alternatives at the same position contribute two.

    Returns a dict keyed by (name_1, name_2) and (name_2, name_1).
    """
    distance_dict = {}
    for name_a, snps_a in snp_pos_dict_1.items():
        for name_b, snps_b in snp_pos_dict_2.items():
            differing_snps = snps_a.symmetric_difference(snps_b)
            distance_dict[(name_a, name_b)] = distance_dict[(name_b, name_a)] = len(differing_snps)
    return distance_dict

In [None]:
generate_distances(snp_pos_dict, snp_pos_dict)

In [None]:
master_dict = {}
for comparators in tqdm(pairwise_list):
    parallel_output = Parallel(n_jobs=-1)(delayed(generate_distances)(snp_pos_dict[pos1], snp_pos_dict[pos2]) for (pos1, pos2) in comparators)
    for output_dict in parallel_output:
        for (k, v) in output_dict.items():
            master_dict[k] = v
with open(project_dir + '/master_distance_dict.pkl', 'wb') as f:
        pickle.dump(master_dict, f) 

In [None]:
# Write the pairwise distances as a square, tab-separated matrix: the first
# line holds the number of sequences, then one 'seq_<id>' row per sequence.
ids = [k[0] for k in master_dict]
ids = list(set(ids))
with open(project_dir+'/tb_seq_distances.phy', 'w') as f:
    f.write('%d\n' % len(ids))
    for idref in ids:
        row_cells = ''.join('\t%s' % str(abs(master_dict[idref, opref])) for opref in ids)
        f.write('seq_' + str(idref) + row_cells + '\n')

#### Produce and save mutations per position (first produce files containing blocks of 10,000 nt values for all sequences in tree - will take about 36 hours)

In [None]:
# Collect every (0-based) genome position that carries a variant in at least
# one retained sequence, and build both directions of the mapping between
# genome position and column index in the condensed variant alignment.
variant_positions = [
    pos - 1     # CRyPTIC is 1-indexed
    for k, v in variant_dict.items()
    if k in distinct_sequence_names
    for (pos, snp) in v
]
sorted_variant_position_list = sorted(set(variant_positions))
pos_id_dict = {pos: column for column, pos in enumerate(sorted_variant_position_list)}
id_pos_dict = dict(enumerate(sorted_variant_position_list))

In [None]:
# Group every retained sample's variants by chunk of the condensed alignment:
# chunk_variant_dict[chunk][sample] -> [(column within chunk, ALT upper-cased), ...]
tb_variants_sequence_length = len(pos_id_dict)
chunk_size = 1000
num_chunks = math.ceil(tb_variants_sequence_length/chunk_size)
chunk_variant_dict = defaultdict(lambda: defaultdict(list))
for k, v in variant_dict.items():
    if k not in distinct_sequence_names:
        continue
    for (pos, snp) in v:
        # pos is 1-based; pos_id_dict is keyed by 0-based genome position.
        chunk, position_in_chunk = divmod(pos_id_dict[pos-1], chunk_size)
        chunk_variant_dict[chunk][k].append((position_in_chunk, snp.upper()))
print("Built dictionary")

In [None]:
# Reference bases at the variant positions only (condensed alignment columns).
full_tb_variant_sequence = ''.join([full_sequence[pos] for pos in sorted_variant_position_list])
# Full run only: for each chunk, build one list of single-state sets per retained
# sequence (reference states overwritten by that sequence's variants) and pickle
# the resulting dict as variant_dictionary_<start>_<end>_.pkl.
if full_run == True:
    for chunk in tqdm(range(num_chunks)):
            start_pos = chunk * chunk_size
            end_pos = min(tb_variants_sequence_length, start_pos + chunk_size)
            variant_sequence_dict = {}
            seq_chunk = [{x} for x in full_tb_variant_sequence[start_pos:end_pos]]
            for seq_id in distinct_sequence_names:
                # Shallow copy: unmodified columns share the same set objects
                # across sequences; variant columns are replaced, not mutated.
                temp = copy.copy(seq_chunk)
                if seq_id in chunk_variant_dict[chunk]:
                    for (pos, snp) in chunk_variant_dict[chunk][seq_id]:
                        temp[pos] = {snp}
                variant_sequence_dict['seq_'+str(seq_id)] = temp
            with open(dictionary_dir + '/variant_dictionary_'+str(start_pos)+'_'+str(end_pos)+'_' + '.pkl', 'wb') as f:
                   pickle.dump(variant_sequence_dict, f) 

In [None]:
filename_list = util.list_files(dictionary_dir)
filename_list[0]

In [None]:
# Score chunk files in parallel and pickle the per-column mutation counts.
# As written the guard is forced on and only the FIRST chunk file is processed.
# NOTE(review): `test_dir` is not defined in the visible part of this file.
#if full_run==True:   #mutation_count_dir
if 1==1:
    res = []
    filename_list = util.list_files(dictionary_dir)
    for filename in tqdm([filename_list[0]]):
        # Chunk bounds are read from the file name (variant_dictionary_<start>_<end>_.pkl).
        temp_2 = filename.split('_')
        start_pos = int(temp_2[2])
        end_pos = int(temp_2[3])
        # Ten jobs, each handling one 100-column slice selected by core_number.
        parallel_output = Parallel(n_jobs=-1)(delayed(generate_mutation_counts)(dictionary_dir+'/' + filename, core_number) for core_number in range(10))
        # Concatenate the slices' counts in core_number order.
        temp = []
        for x in parallel_output:
            temp+=x[2]
        res.append((int(parallel_output[0][0]), int(parallel_output[0][1]), temp))
        with open(test_dir + '/mutation_counts_'+str(start_pos)+'_'+str(end_pos)+'_' + '.pkl', 'wb') as f:
                   pickle.dump((int(parallel_output[0][0]), int(parallel_output[0][1]), temp), f) 
    with open(test_dir + '/all_mutation_counts.pkl', 'wb') as f:
        pickle.dump(res, f) 

In [None]:
# Full run only: expand the per-variant-column mutation counts to one count
# per genome position (0 where the position carries no variant) and cache the
# result.  The write is inside the guard so that a quick run neither fails on
# an undefined list nor overwrites the cached file with stale data.
if full_run == True:
    non_zero_mutation_counts = []
    for x in res:
        non_zero_mutation_counts += x[2]
    zero_and_non_zero_mutation_counts = [
        non_zero_mutation_counts[pos_id_dict[i]] if i in pos_id_dict else 0
        for i in range(len(full_sequence))
    ]
    with open(mutation_count_dir + '/zero_and_non_zero_mutation_counts.pkl', 'wb') as f:
        pickle.dump(zero_and_non_zero_mutation_counts, f)

In [None]:
if full_run == False:
    with open(mutation_count_dir + '/zero_and_non_zero_mutation_counts.pkl', 'rb') as f:
        zero_and_non_zero_mutation_counts = pickle.load(f)  
    

#### Calculate probabilities for annotated (and reannotated PGAP) CDS regions

In [None]:
def _read_cds_boundaries(genbank_path):
    """Return (locus_tag, pseudo, product, start, end, strand) for every CDS
    feature in the GenBank file, sorted by start coordinate."""
    boundaries = []
    for genome_record in SeqIO.parse(genbank_path, "genbank"):
        for feature in genome_record.features:
            if feature.type != 'CDS':
                continue
            qualifiers = feature.qualifiers
            # A CDS is flagged as pseudo when it carries a "pseudo" qualifier.
            pseudo = qualifiers.get("pseudo") is not None
            boundaries.append((qualifiers.get("locus_tag")[0], pseudo, qualifiers.get("product")[0], int(feature.location.start), int(feature.location.end), int(feature.location.strand)))
    boundaries.sort(key = lambda x: x[3])
    return boundaries


# Standard annotation and the re-annotated genome stored in the project directory.
cds_boundaries = _read_cds_boundaries(seq_dir + '/' + tb_genome_filename)
reannotated_cds_boundaries = _read_cds_boundaries(project_dir + '/annot.gbk')

In [None]:
mutation_df = pd.read_csv(project_dir+'/mutation_df.csv')

In [None]:
zero_and_non_zero_mutation_counts = []
for i, r in mutation_df.iterrows():
    zero_and_non_zero_mutation_counts.append(r['Num_Mutations'])

In [None]:
# Score every non-pseudo CDS: the per-position mutation counts of the region
# are passed in reading direction (reversed for the other strand).
temp = []
for (locus, pseudo, product, start, stop, strand) in cds_boundaries:
    if pseudo == False:
        region_counts = zero_and_non_zero_mutation_counts[start:stop]
        if strand != 1:
            region_counts = reversed(region_counts)
        temp.append(mutation_bin_probability(region_counts))
# A result equal to 2 is kept as is; otherwise element 1 of the result is the score.
scores = [x if x == 2 else x[1] for x in temp]

In [None]:
sns.histplot(scores, bins =100)

In [None]:
sns.histplot([x for x in zero_and_non_zero_mutation_counts if x < 200], bins = 100)

In [None]:
for n, i in enumerate(zero_and_non_zero_mutation_counts):
    if i > 10000:
        print(n, i)

#### Identify potential ORFS (min length 200) in inter-CDS regions of standard annotation and output to Dataframe

In [None]:
# For every gap of more than min_intergenic_length between consecutive
# annotated CDSs, ask the ORF finder for its best ORF in the gap (window
# extended by 40 nt on each side), annotate it with any overlapping PGAP
# re-annotation CDS and Mycobrowser feature, and write the candidates to CSV.
# NOTE(review): ORF_Finder is constructed without arguments here but with
# full_sequence elsewhere in this file - confirm which is intended.
# NOTE(review): `mycobrowser_features` is not defined in the visible part of this file.
ORFFinder = orffn.ORF_Finder()
trans = util.Translator()
min_intergenic_length = 100
details =  []
results =[]
for i, (locus, pseudo, product, start, stop, strand) in enumerate(cds_boundaries):
    if i < len(cds_boundaries) - 1:
        if cds_boundaries[i+1][3] > stop + min_intergenic_length:
            a =ORFFinder.max_orf(stop-40, cds_boundaries[i+1][3]+40, 1e-20, output_all_orfs = False, min_orf_length = 200)
            # (0,0,0) is treated as "no ORF found"; otherwise a[0..3] are used
            # as start, end, strand and score.
            if not(a==(0,0,0)):
                ov = 0
                info = ('','','','','','','')
                # NOTE(review): the inner loops reuse `i`; harmless here because the
                # outer `i` is only read before them, but easy to break.
                # Overlap is measured as a fraction of the annotated feature's length;
                # the last feature exceeding 0.3 wins.
                for i, (loc, pse, pro, sta, sto, stra) in enumerate(reannotated_cds_boundaries):
                    if a[1] > sta and a[0] < sto:
                        ov = (min(a[1], sto) - max(a[0], sta)) / (sto - sta)
                        if ov > 0.3:
                            info =  (loc, pse, pro, sta, sto, stra, ov)
                        
                ov = 0
                myco_info = ('','','','','')
                for i, (loc, sta, sto, stra) in enumerate(mycobrowser_features):
                    if a[1] > sta and a[0] < sto:
                        ov = (min(a[1], sto) - max(a[0], sta)) / (sto - sta)
                        if ov > 0.3:
                            myco_info =  (loc, sta, sto, stra, ov)
                        
                sequ = trans.translate_sequence(full_sequence[a[0]:a[1]], a[2], 0)
                details.append([a, sequ])
                results.append([a[0],a[1],a[2],a[3],info[0],info[1],info[2],info[3],info[4],info[5],info[6],myco_info[0],myco_info[1],myco_info[2],myco_info[3],myco_info[4]])
results_df = pd.DataFrame(results, columns = ['start_pos','end_pos','strand','score','PGAP_ref','PGAP_pseudogene','PGAP_product', 'PGAP_start', 'PGAP_end','PGAP_strand', 'PGAP_overlap', 'Mycob_ref','Mycob_start', 'Mycob_end','Mycob_strand', 'Mycob_overlap'])
results_df.to_csv(project_dir + '/cds_candidates.csv')

#### Calculate probabilities for regions in Smith et al 2021 and plot histogram

In [None]:
xls = pd.ExcelFile('F:/Datasets/Data_From_Publications/Smith_2021.xlsx')
df1 = pd.read_excel(xls, 'Table S3', header=3)
co_ords = []
for i, r in df1.iterrows():
    if r['Classification'] == 'Novel':
        if r['Strand'] == '+':
            co_ords.append([int(r['Start Coordinate']-1), int(r['Stop Coordinate']), 1])
        else:
            co_ords.append([int(r['Stop Coordinate'] - 1), int(r['Start Coordinate']),-1])

In [None]:
# Score each region in co_ords ([start, stop, strand]); counts are passed in
# reading direction.  A result equal to 2 is stored as 2, otherwise element 1
# of the result is stored.
# NOTE(review): `mutation_count_list` is not defined in the visible part of this
# file (other cells use `zero_and_non_zero_mutation_counts`) - confirm the name.
probs = []
for x in co_ords:
    if x[2] == 1:
        a = (mutation_bin_probability(mutation_count_list[x[0]:x[1]]))
    else:
        a = (mutation_bin_probability(reversed(mutation_count_list[x[0]:x[1]])))
    if a == 2:
        probs.append(2)
    else:
        probs.append(a[1])
    print(x, abs(x[1]-x[0]), a)

In [None]:
sns.histplot(probs, bins=100)

#### Find all (maximal nested) ORFs and filter out ORFS on opposite strand which would have same non-synonymous positions with larger ORF on other strand

In [None]:
# Collect nested ORFs over the coordinate range 0..4411532, visit them in
# descending order of element 3, and keep an ORF unless it lies entirely
# inside an already-kept ORF with a matching start offset modulo 3
# (offset 0 on the same strand, offset 1 on the opposite strand).
ORFFinder = orffn.ORF_Finder(full_sequence)
a = ORFFinder.max_orf(0, 4411532, output_orfs = 'Nested', min_orf_length = 50)
a.sort(key = lambda x: x[3], reverse = True)
orf_list = [a[0]]
for x in tqdm(a[1:]):
    matched = 0
    for v in orf_list:
        # v contains x (inclusive on both ends)
        if v[0]<=x[0] and v[1]>=x[1]:
            if x[2] == v[2]:
                if (v[0] - x[0])%3 == 0:
                    matched = 1
                    break
            else:
                if (v[0] - x[0])%3 == 1:
                    matched = 1
                    break
    if matched == 0:
        orf_list.append(x)
# Restore genome order (by start coordinate).
orf_list.sort(key = lambda x: x[0])

In [None]:
# Score every ORF in orf_list: the per-position mutation counts of the region
# are passed in reading direction (reversed for the other strand).
temp = []
for (start, stop, strand, length) in orf_list:
    region_counts = zero_and_non_zero_mutation_counts[start:stop]
    if strand != 1:
        region_counts = reversed(region_counts)
    temp.append(mutation_bin_probability(region_counts))
# A result equal to 2 is kept as is; otherwise element 1 of the result is the score.
scores = [x if x == 2 else x[1] for x in temp]

In [None]:
sns.histplot(scores, bins =100)

In [None]:
prob = []
for x in orf_list:
    prob.append(x[4])
sns.histplot(prob, bins=100)

In [None]:
# Gather (start, end, strand) for every non-'source' feature of the
# re-annotated genome and of the reference annotation, sorted by end coordinate.
annotated_features = []
for genbank_path in (project_dir + '/annot.gbk', seq_dir + '/' + tb_genome_filename):
    for genome_record in SeqIO.parse(genbank_path, "genbank"):
        annotated_features.extend(
            (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
            for feature in genome_record.features
            if feature.type != 'source'
        )
annotated_features.sort(key = lambda x: x[1])

In [None]:
# Keep the ORFs for which no annotated feature covers 10% or more of the ORF's length.
non_overlapping_orfs = []
for orf in orf_list:
    orf_start, orf_stop = orf[0], orf[1]
    overlap_fractions = [
        (min(orf_stop, sto) - max(orf_start, sta)) / (orf_stop - orf_start)
        for (sta, sto, stra) in annotated_features
        if orf_start < sto and orf_stop > sta
    ]
    max_ov = max(overlap_fractions + [0])
    if max_ov < 0.1:
        non_overlapping_orfs.append(orf)

#### Produce FASTA file with CDS candidates

In [None]:
non_overlapping_orfs

In [None]:
# Translate the non-overlapping ORFs whose element 4 is below 1e-5 (or equal
# to 2) and write them to a FASTA file in the project directory.
trans = util.Translator()
temp = []
for x in non_overlapping_orfs:
    if x[4] < 1e-5 or x[4]==2:
        if x[2] == 1:
            prot = trans.translate_sequence(full_sequence[x[0]:x[1]], 1, 0)
        else:
            # Other strand: translate the reverse complement of the region.
            prot = trans.translate_sequence(util.reverse_complement(full_sequence[x[0]:x[1]]), 1, 0)
        name = 'Start_'+str(x[0])+'_Stop_'+str(x[1])+'_Strand_'+str(x[2])
        # The last character of the translation is dropped (presumably the stop symbol - confirm).
        temp.append([name, prot[:-1]])
util.produce_fasta_file(temp, project_dir + '/' + 'tb_orf_candidates.faa')

In [None]:
blastfn.run_tblastn('F:/Datasets/BLAST/actinobacteria_ref_genomes', 'tb_orf_candidates.faa', 'blastdb_sourceseq_actinobacteria', e_value = 1e-5)

In [None]:
prob = []
for x in orf_list:
    prob.append(x[4])
sns.histplot(prob, bins =100)

In [None]:
candidates = [x for x in orf_list if x[4] <1e-3]
len(candidates)

#### Legacy code (might be useful - this is when tree was defined based on "optimal splits")

In [None]:
def split_tree(species_to_split, position):
    """Split a set of samples by whether they carry a variant at `position`.

    Carriers are the sample ids listed in the module-level
    `reduced_position_dict[position]` (after its leading reference entry).
    Returns two (position, samples, size) tuples when both sides contain more
    than one sample; otherwise a single (-1, species_to_split, size) tuple.
    """
    carriers = {entry[0] for entry in reduced_position_dict[position][1:]}.intersection(species_to_split)
    non_carriers = species_to_split - carriers
    if len(non_carriers) <= 1 or len(carriers) <= 1:
        return [(-1, species_to_split, len(species_to_split))]
    return [(position, carriers, len(carriers)), (position, non_carriers, len(non_carriers))]

In [None]:
def optimal_split_position(species_to_split_list):
    """Find the position whose variant carriers split every group most evenly.

    Reads the module-level ``reduced_position_dict``. For each position the
    score is the sum, over all groups, of the absolute difference between the
    number of carriers inside the group and half the group size (rounded
    down). Lower is better; ties keep the first position encountered.

    Args:
        species_to_split_list: iterable of sets of species identifiers.

    Returns:
        ``((best_position, best_score), (worst_count, scores_by_position))``
        where ``worst_count`` is the sum of the half-sizes of all groups.
        If ``reduced_position_dict`` is empty the result is
        ``((0, 1e20), (worst_count, {}))``.
    """
    groups = list(species_to_split_list)
    # The per-group targets do not depend on the position, so compute them
    # once. This also keeps worst_count defined when there are no positions.
    targets = [len(group) // 2 for group in groups]
    worst_count = sum(targets)

    best_split_num = 1e20
    best_position = 0
    splits_achieved_dict = {}
    for position, records in reduced_position_dict.items():
        # Skip the first record; build the carrier set once per position.
        carriers = {record[0] for record in records[1:]}
        score = sum(
            abs(len(carriers.intersection(group)) - target)
            for group, target in zip(groups, targets)
        )
        splits_achieved_dict[position] = score
        if score < best_split_num:
            best_position = position
            best_split_num = score
    return ((best_position, best_split_num), (worst_count, splits_achieved_dict))

In [None]:
# Greedy construction of successive group splits.
# Only positions whose record list has at least 100 entries are considered.
snps_to_use = []
reduced_position_dict =  {}
for k, v in position_dict.items():
    if len(v) >=100:
        reduced_position_dict[k] = v
print(len(position_dict), len(reduced_position_dict))


# Start with a single group holding every species index 0..len(id_dict)-1 and
# split it at the best-scoring position.
all_species = [[-1,set(x for x in range(len(id_dict))),99,True]]
pos = optimal_split_position([x[1] for x in all_species])[0][0]
snps_to_use.append(pos)
split_results = [split_tree(all_species[0][1], pos)]
print( [(x[0], x[2]) for x in split_results[0]] )
for i in range(1,50):
    # Hard cap: at most 9 further rounds are run.
    if i ==10:
        break
    start = time.process_time()
    split_results.append([])
    optimal_split_output = optimal_split_position([x[1] for x in split_results[i-1]])
    splits_achieved_dict = optimal_split_output[1][1]
    optimal_split_score = optimal_split_output[1][0]
    posn = optimal_split_output[0]
    # Median of the per-position scores from this round.
    score_list = sorted(splits_achieved_dict.values())
    score_p50 = score_list[int(len(score_list)/2)]
    # Prune positions scoring at or above the median. The position chosen for
    # this round is never pruned: when its score equals the median it would
    # otherwise be removed and the split_tree lookup below would raise KeyError.
    for k, v in splits_achieved_dict.items():
        if v >= score_p50 and k != posn[0]:#optimal_split_score - 1000:
            reduced_position_dict.pop(k)
    print(len(reduced_position_dict))
    pos= posn[0]
    snps_to_use.append(pos)
    split_score = posn[1]
    successful_splits = 0
    # Apply this round's position to every group from the previous round;
    # groups that cannot be split are carried forward tagged with -1.
    for x in split_results[i-1]:
        temp = split_tree(x[1], pos)
        if len(temp) == 1:
            split_results[i].append(temp[0])
        else:
            successful_splits +=1
            split_results[i].append(temp[0])
            split_results[i].append(temp[1])
    print(max([x[2] for x in split_results[i]]))
    print (time.process_time() - start)
    print([(x[0], x[2]) for x in split_results[i]])
    # Stop as soon as a round fails to split any group.
    if successful_splits == 0:
        break

In [None]:
print(optimal_split_score)
splits_achieved_dict

In [None]:
len(test)

In [None]:
[(x[0],x[2]) for x in split_results[2] if x[2] > 300]

In [None]:
# Collect every position that was used to split a group in any round.
temp = [info[0] for res in split_results for info in res]
snps = set(temp)
# -1 marks groups that were not split. It is absent when every group split
# successfully, so discard() is used rather than remove() to avoid a KeyError.
snps.discard(-1)

In [None]:
len(mutation_counts)

In [None]:
len(mutation_count_list)

In [None]:
sns.histplot(mutation_count_list, bins = 100)

In [None]:
a = [x[1] for x in split_results[1]]

In [None]:
len(a)

In [None]:
# Keep the first id seen for each distinct set of variants.
# `variants` holds the distinct variant sets in first-seen order and
# `distinct_ids` the matching ids.
distinct_ids = []
variants = []
# Hashed lookup of variant sets already seen; avoids comparing every new set
# against the whole `variants` list.
seen_variant_sets = set()
for k, v in tqdm(variant_dict.items()):
    variant_key = frozenset(v)
    if variant_key in seen_variant_sets:
        continue
    seen_variant_sets.add(variant_key)
    variants.append(set(variant_key))
    distinct_ids.append(k)

In [None]:
len(variant_dict)

In [None]:
len(distinct_ids)

In [None]:
pow(2,10)

In [None]:
len(position_dict)

In [None]:

# Toy check of the two-pass mutation count on four hard-coded 7-base sequences
# and the tree stored in testtree.nwk (fitch_1 / fitch_2 are defined elsewhere
# in the notebook; presumably the bottom-up and top-down Fitch passes).
# NOTE(review): seq_length is 100 although the toy sequences have 7 bases; the
# zip() below truncates mutation_counts to the shorter length after the first
# internal node is processed.
seq_length = 100

# NOTE(review): the opened file is never read - the sequences are hard-coded.
with open(project_dir +'/test.faa', 'rb') as f:
    sequence_to_score_dict = {}
    sequence_to_score_dict['seq1'] = [{x} for x in 'AAATTTT']
    sequence_to_score_dict['seq2'] = [{x} for x in 'AAACTTT']
    sequence_to_score_dict['seq3'] = [{x} for x in 'AAAGTTT']
    sequence_to_score_dict['seq4'] = [{x} for x in 'AAAATTT']
master_tree2= ete3.Tree(project_dir + '/' + 'testtree.nwk')
# First pass (leaves to root): leaves get their observed per-position state
# sets, internal nodes get fitch_1 of their first two children.
for node in master_tree2.traverse("postorder"):
    if node.is_leaf():
        node.add_features(seq = sequence_to_score_dict[node.name])
    else:
        children = node.children
        node.add_features(seq = fitch_1(children[0].seq, children[1].seq))

#for k, v in sequence_to_score_dict.items():
    #seq_length = len(v)
    #break
mutation_counts = [0 for i in range(seq_length)]
# Second pass (root to leaves): fix one state per position at the root, then
# let fitch_2 resolve each child against its parent and report mutations.
for node in master_tree2.traverse("preorder"):
    if node.is_leaf():
        continue
    if node.is_root():
        node.seq = [{list(x)[0]} for x in node.seq]
    children = node.children
    mutations = []
    child_sequences = []
    for child in children:
        (temp_1, temp_2) = fitch_2(node.seq ,child.seq)
        child_sequences.append(temp_1)
        child.seq = temp_1
        mutations.append(temp_2)

    temp = []
    for n, (h, i, j) in enumerate(zip(mutation_counts, mutations[0], mutations[1])):
        if i + j == 0:
            temp.append(h)
        elif i + j == 1:
            temp.append(h+1)
        else:
            # Both children are flagged at this position: count one event if
            # they agree, two otherwise. The comparison must use the position
            # index n; the original indexed with the flag value i.
            if child_sequences[0][n] == child_sequences[1][n]:
                temp.append(h+1)
            else:
                temp.append(h+2)
    mutation_counts = temp
print(mutation_counts)

In [None]:
mutation_counts

In [None]:
# Scratch cell: fills a dictionary with one entry per (i, j) pair of a
# 10000 x 10000 grid, i.e. 10**8 tuple keys all mapped to 10.
# NOTE(review): the name `dict` rebinds the builtin type in the notebook
# namespace, so any later dict(...) call would fail; consider renaming, and
# expect heavy memory use when this cell is run.
dict = {}
for i in range(10000):
    for j in range(10000):
        dict[(i,j)] = 10

In [None]:
full_sequence[11874]

In [None]:
# Reset the working dictionaries, load the pickled variant table and print the
# rows whose GENOME_INDEX is 25 or 11875.
position_dict = {}
variant_dict = {}
id_dict = {}
with open(project_dir + '/variant_df.pkl', 'rb') as f:
    variant_df = pickle.load(f)
# The table is fully loaded, so the rows are scanned after the file is closed.
for i, r in tqdm(variant_df.iterrows()):
    if r['GENOME_INDEX'] in (25, 11875):
        print(r)

In [None]:
type(r['GENOME_INDEX'])