In [3]:
import subprocess
import os
import re
import pickle
import pandas as pd
import math
from tqdm import tqdm
import copy
from collections import defaultdict
import ete3
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from joblib import Parallel, delayed

In [4]:
# ---------------------------------------------------------------------------
# Project configuration. Every name defined here is a notebook-global that the
# cells below read directly.
# ---------------------------------------------------------------------------
# NOTE(review): absolute, machine-specific root; all other directories are derived from it.
project_dir =  'F:/Project_Data/Project_11'
datasets_dir = project_dir + '/Datasets'
# GenBank file parsed below to obtain the reference sequence.
tb_genome_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
dictionary_dir = project_dir + '/Dictionaries'
mutation_count_dir = project_dir + '/Mutation_Counts'
chunk_variant_dict_dir = project_dir + '/Chunk_Variant_Dictionaries'
# Newick tree file name (relative to project_dir) loaded by generate_mutation_counts.
tb_tree_filename = 'tb_gpi.nwk'
# Gate for the `if full_run == True:` stages further down.
full_run = True

# Number of partitions the de-duplicated genomes are split into; partition
# labels are 1-based here (1 .. num_cores).
num_cores = 16
core_numbers = list(range(1, num_cores+1))
# Timeout value handed to joblib.Parallel.
timeout=99999

# Keep the sequence of the record parsed last (the loop overwrites
# full_sequence on every record).
for record in SeqIO.parse(datasets_dir + '/' + tb_genome_filename, "genbank"):
    full_sequence = str(record.seq)
tb_length = len(full_sequence)

In [None]:

#  CREATE VARIANT DATASETS FROM CRYPTIC RAW DATA

# Stage 1: read VARIANTS.csv and cache only the columns listed below as
# variant_df.pkl, so that later stages load the pickle instead of the CSV.
if full_run == True:
#if 1==0:
    variant_df = pd.read_csv(datasets_dir + "/VARIANTS.csv") 
    with open(project_dir + '/variant_df.pkl', 'wb') as f:
        pickle.dump(variant_df[['UNIQUEID', 'VARIANT', 'MUTATION_TYPE', 'IS_NULL', 'IS_HET', 'IS_FILTER_PASS', 'IS_SNP', 'REF', 'ALT', 'GENOME_INDEX']], f)    

# Printed whether or not the stage above actually ran.
print("Stage_1_Complete")
# Stage 2: restrict the variant table to genomes flagged BELONGS_GPI in
# GENOMES.csv and cache the result as gpi_variant_df.pkl.
if full_run == True:
#if 1==0:
    with open(project_dir + '/variant_df.pkl', 'rb') as f:
        full_variant_df = pickle.load(f) 
    print(len(full_variant_df))
    genomes_df = pd.read_csv(datasets_dir + '/GENOMES.csv')
    # Only the UNIQUEID column is kept, so the merge below acts as a pure filter.
    gpi_genomes_df = genomes_df[genomes_df['BELONGS_GPI']==True][['UNIQUEID']] 
    print(len(gpi_genomes_df))
    # Inner join: variant rows whose UNIQUEID is absent from the GPI set are dropped.
    gpi_variant_df = pd.merge(full_variant_df, gpi_genomes_df, how='inner', on = 'UNIQUEID')
    print(len(gpi_variant_df))
    with open(project_dir + '/gpi_variant_df.pkl', 'wb') as f:
        pickle.dump(gpi_variant_df, f)    
# Printed whether or not the stage above actually ran.
print("Stage_2_Complete")
if full_run == True:
#if 1==1:
    # Stage 3: turn the filtered variant table into three lookup dictionaries
    #   id_dict       : UNIQUEID -> integer id (order of first appearance)
    #   variant_dict  : integer id -> [(genome_index, alt), ...]
    #   position_dict : genome_index -> [reference base, (integer id, alt), ...]
    complement_dict = {'a':'t', 'c':'g', 'g':'c', 't':'a'}
    position_dict = {}
    variant_dict = {}

    with open(project_dir + '/gpi_variant_df.pkl', 'rb') as f:
        variant_df = pickle.load(f)

    unique_ids = variant_df.UNIQUEID.unique()
    id_dict = {unique_id: i for i, unique_id in enumerate(unique_ids)}

    for _, r in variant_df.iterrows():
        # Only non-null, filter-passing, homozygous SNP calls are used.
        usable_snp = r['IS_NULL'] == False and r['IS_FILTER_PASS'] == True and r['IS_HET'] == False and r['IS_SNP'] == True
        if not usable_snp:
            continue

        genome_index = r['GENOME_INDEX']
        # Cryptic is 1 indexed and lower case
        ref_sequence_entry = full_sequence[genome_index - 1].lower()
        cryptic_ref = r['REF']
        cryptic_alt = r['ALT']

        if cryptic_ref == ref_sequence_entry:
            alt = cryptic_alt
        elif complement_dict[cryptic_ref] == ref_sequence_entry:
            # Call matches the complement of the reference base: complement the alt as well.
            alt = complement_dict[cryptic_alt]
        else:
            print("Something strange at position ", genome_index, cryptic_ref, ref_sequence_entry)
            alt = cryptic_alt

        id_ref = id_dict[r['UNIQUEID']]
        variant_dict.setdefault(id_ref, []).append((genome_index, alt))
        # The first element stored for a position is the reference base, for information.
        position_dict.setdefault(genome_index, [ref_sequence_entry]).append((id_ref, alt))

    for pickle_name, payload in (('/id_dict.pkl', id_dict),
                                 ('/variant_dict.pkl', variant_dict),
                                 ('/position_dict.pkl', position_dict)):
        with open(project_dir + pickle_name, 'wb') as f:
            pickle.dump(payload, f)

print("Stage_3_Complete")

# CONSTRUCT DISTANCE MATRIX

def generate_distances(ind_1, ind_2):
    """Count differing SNP positions between the genomes of two partitions.

    Loads ``snp_pos_dict_<ind_1>.pkl`` and ``snp_pos_dict_<ind_2>.pkl`` from
    ``project_dir``. Each maps a genome id to a set of strings built as
    position followed by the SNP nucleotide.

    Returns a dict keyed by ``(id_1, id_2)``, where ``id_1`` comes from the
    first partition, ``id_2`` from the second and ``id_2 < id_1``. The value
    is the number of distinct positions in the symmetric difference of the
    two genomes' SNP sets.
    """
    def _load_partition(index):
        with open(project_dir + '/snp_pos_dict_'+str(index)+'.pkl', 'rb') as f:
            return pickle.load(f)

    snp_pos_dict_1 = _load_partition(ind_1)
    snp_pos_dict_2 = _load_partition(ind_2)

    distance_dict = {}
    for id_1, snps_1 in snp_pos_dict_1.items():
        for id_2, snps_2 in snp_pos_dict_2.items():
            if id_2 < id_1:
                differing = snps_1.symmetric_difference(snps_2)
                # Dropping the trailing nucleotide leaves the position, so two genomes
                # carrying different nt at the same SNP count as one mutation.
                # (Unused alternative kept from the original:
                #   f = len(temp)/tb_length ; d = -3/4 * math.log(1 - 4*f /3))
                distance_dict[(id_1, id_2)] = len({entry[:-1] for entry in differing})
    return distance_dict

print("Stage_4_Complete")

if full_run == True:
#if 1==1:
    # Stage 5: collapse genomes with identical SNP profiles, then write one
    # snp_pos_dict_<core>.pkl per partition for generate_distances to read.
    with open(project_dir + '/variant_dict.pkl', 'rb') as f:
        temp_variant_dict = pickle.load(f)
    print(len(temp_variant_dict))

    # Profile string -> genome id. A later genome with the same profile
    # overwrites an earlier one, so the last id seen is the one retained.
    temp_dict = {}
    for genome_id, snps in temp_variant_dict.items():
        snps.sort(key=lambda entry: entry[0])    # in-place: the retained lists stay position-sorted
        temp_dict[''.join(str(pos) + snp for (pos, snp) in snps)] = genome_id

    variant_dict = {genome_id: temp_variant_dict[genome_id] for genome_id in temp_dict.values()}
    print(len(variant_dict))
    temp_variant_dict = {}

    for core in tqdm(core_numbers):
        # A genome belongs to partition (id % num_cores) + 1.
        snp_pos_dict = {
            genome_id: {str(pos) + snp for (pos, snp) in snps}
            for genome_id, snps in variant_dict.items()
            if genome_id % num_cores + 1 == core
        }
        with open(project_dir + '/snp_pos_dict_'+str(core)+'.pkl', 'wb') as f:
            pickle.dump(snp_pos_dict, f)
    variant_dict = {}

print("Stage_5_Complete")
if full_run == True:
#if 1==1:
    # For every partition, compute distances against all partitions in
    # parallel and store the merged result as master_distance_dict_<core_1>.pkl.
    for core_1 in tqdm(core_numbers):
        parallel_output = Parallel(n_jobs=-1, timeout = timeout)(delayed(generate_distances)(core_1, core_2) for (core_2) in core_numbers)
        master_dict = {}
        for output_dict in parallel_output:
            master_dict.update(output_dict)
        with open(project_dir + '/master_distance_dict_'+str(core_1)+'.pkl', 'wb') as f:
            pickle.dump(master_dict, f)

# COMBINE DICTIONARIES GENERATED INTO SINGLE DICTIONARY

if full_run == True:
#if 1==1:
    # master_dict maps a genome id to the list of its distances (as strings)
    # to every smaller genome id, in ascending order of the smaller id, i.e.
    # one row of a lower-triangular distance matrix.
    master_dict = {}
    # Every genome id that takes part in at least one pair. The smallest id
    # never appears as a row key (it has no smaller partner), so ids must be
    # collected from BOTH members of each pair rather than assuming that the
    # missing one is id 0: the de-duplication step keeps the last genome of
    # each identical profile, so id 0 is not guaranteed to be present.
    all_ids = set()
    for core_1 in tqdm(core_numbers):
        with open(project_dir + '/master_distance_dict_'+str(core_1)+'.pkl', 'rb') as f:
            temp_master_dict = pickle.load(f)
        k1_vals = sorted({k1 for (k1, k2) in temp_master_dict})
        k2_vals = sorted({k2 for (k1, k2) in temp_master_dict})
        all_ids.update(k1_vals)
        all_ids.update(k2_vals)

        for k1 in k1_vals:
            for k2 in k2_vals:
                if k2 >= k1:
                    break    # k2_vals is sorted, so no later k2 can be smaller than k1
                master_dict.setdefault(k1, []).append(str(abs(temp_master_dict[(k1, k2)])))

    if not all_ids:
        # No pair at all (fewer than two genomes): keep the historical output,
        # which listed a single sequence with id 0.
        all_ids.add(0)
    ids = sorted(all_ids)


    #FINAL OUTPUT
    print("Writing data")
    #with open(project_dir + '/master_distance_dict.pkl', 'wb') as f:
    #    pickle.dump(master_dict, f) 
    with open(project_dir + '/ids.pkl', 'wb') as f:
        pickle.dump(ids, f)
    print("Writing Phylip file") 
    # Header line with the number of sequences, then one tab-separated row per
    # id; the smallest id has a name but no distances.
    with open(project_dir+'/tb_seq_distances.phy', 'w') as f:
        f.write('%d\n' % len(ids))
        for idref in ids:
            f.write('seq_'+str(idref))
            if idref in master_dict:
                for location in master_dict[idref]:
                    f.write('\t%s' % location)
            f.write('\n')
           

In [5]:
# Settings for the mutation-counting cells. These REPLACE the num_cores /
# core_numbers / timeout values assigned in the earlier configuration cell.
# num_cores * num_iterations is used further down as the number of chunks.
num_iterations = 30
num_cores = 32
# 0-based here (0 .. num_cores-1), unlike the 1-based list defined earlier.
core_numbers = list(range(0, num_cores))
# Timeout value handed to joblib.Parallel.
timeout=99999


def list_files(dir):
    """Return the names (not paths) of all files found under ``dir``.

    The directory tree is walked recursively. Anything inside a
    ``.ipynb_checkpoints`` directory is ignored: the directory is pruned from
    the walk so that Jupyter's checkpoint copies are not reported as data
    files. A file that is itself named ``.ipynb_checkpoints`` is skipped too.
    """
    checkpoint_name = '.ipynb_checkpoints'
    s = []
    for root, dirs, files in os.walk(dir):
        # In-place assignment is required: os.walk only honours pruning done
        # on the very list object it handed out.
        dirs[:] = [d for d in dirs if d != checkpoint_name]
        s.extend(name for name in files if name != checkpoint_name)
    return s

def generate_variant_sequence_dict(chunk):
    """Write the per-sequence variant-site states for one chunk of variant positions.

    Reads ``chunk_variant_dictionary_<chunk>.pkl`` (sequence id -> list of
    ``(position_in_chunk, snp)``) and ``ids.pkl``. For every id in ``ids.pkl``
    it starts from the reference bases at the chunk's variant positions, one
    single-element set per site, and replaces the sites at which that sequence
    carries a SNP. The result, keyed by ``'seq_<id>'``, is pickled to
    ``variant_dictionary_<start>_<end>_.pkl`` in ``dictionary_dir``.

    Relies on the notebook globals ``full_sequence``,
    ``sorted_variant_position_list``, ``chunk_size`` and
    ``tb_variants_sequence_length``. Does nothing when the chunk's input file
    does not exist. Returns None.
    """
    chunk_path = chunk_variant_dict_dir + '/chunk_variant_dictionary_'+str(chunk) + '.pkl'
    # Number of chunk dictionaries generated can be lower given rounding of chunk size
    if not os.path.exists(chunk_path):
        return

    with open(chunk_path, 'rb') as f:
        chunk_variant_dict = pickle.load(f)
    with open(project_dir + '/ids.pkl', 'rb') as f:
        distinct_sequence_names = pickle.load(f)

    start_pos = chunk * chunk_size
    end_pos = min(tb_variants_sequence_length, start_pos + chunk_size)
    # Slice the position list first and look up only this chunk's reference
    # bases, instead of materialising the whole variant-site reference string
    # on every call and then slicing it.
    seq_chunk = [{full_sequence[pos]} for pos in sorted_variant_position_list[start_pos:end_pos]]

    variant_sequence_dict = {}
    for seq_id in distinct_sequence_names:
        # A shallow copy is enough: sites are replaced, never mutated in place.
        temp = copy.copy(seq_chunk)
        if seq_id in chunk_variant_dict:
            for (pos, snp) in chunk_variant_dict[seq_id]:
                temp[pos] = {snp}
        variant_sequence_dict['seq_'+str(seq_id)] = temp

    with open(dictionary_dir + '/variant_dictionary_'+str(start_pos)+'_'+str(end_pos)+'_' + '.pkl', 'wb') as f:
        pickle.dump(variant_sequence_dict, f)

def fitch_1(list_1, list_2):
    """Combine two lists of state sets site by site.

    For each pair of sets taken from ``list_1`` and ``list_2`` the result is
    their intersection when it is non-empty, otherwise their union. Pairing
    stops at the end of the shorter list.
    """
    return [
        left.intersection(right) or left.union(right)
        for left, right in zip(list_1, list_2)
    ]

def fitch_2(parent_list, child_list):
    """Resolve a child's state sets against its parent's, flagging changes.

    For each site, the child keeps the intersection of the parent's and its
    own set when that is non-empty (flag 0). Otherwise it takes the first
    element of its own set and the site is flagged 1.

    Returns ``(resolved_sets, flags)``, two lists of equal length. Pairing
    stops at the end of the shorter input list.
    """
    resolved_sets = []
    flags = []
    for parent_states, child_states in zip(parent_list, child_list):
        shared = parent_states.intersection(child_states)
        if shared:
            resolved_sets.append(shared)
            flags.append(0)
        else:
            resolved_sets.append(set(list(child_states)[0]))
            flags.append(1)
    return (resolved_sets, flags)

def generate_mutation_counts(filename, core_number, seq_length=22):
    """Count state changes per site on the tree for one window of a chunk file.

    Parameters
    ----------
    filename : str
        Path of a pickled dict mapping leaf name -> list of state sets. The
        third- and second-from-last ``'_'``-separated fields of the path are
        returned unchanged as ``start`` and ``stop``.
    core_number : int
        Index of the window: sites
        ``[core_number * seq_length, (core_number + 1) * seq_length)`` of every
        leaf's list are processed.
    seq_length : int, optional
        Window width. The default of 22 is the value that used to be
        hard-coded (22 * 24 is length of chunks output in previous step).

    Returns
    -------
    tuple
        ``(start, stop, mutation_counts)`` with one count per site of the
        window.

    Notes
    -----
    The tree is read from ``project_dir + '/' + tb_tree_filename``. Only the
    first two children of each internal node enter the bottom-up pass and the
    counts, so the tree is expected to be bifurcating.
    """
    a = filename.split('_')
    start = a[-3]
    stop = a[-2]
    with open(filename, 'rb') as f:
        sequence_to_score_dict = pickle.load(f)
    master_tree2 = ete3.Tree(project_dir + '/' + tb_tree_filename)

    # Bottom-up pass: leaves take their observed window, internal nodes the
    # fitch_1 combination of their two children.
    window = slice(core_number * seq_length, (core_number + 1) * seq_length)
    for node in master_tree2.traverse("postorder"):
        if node.is_leaf():
            node.add_features(seq=sequence_to_score_dict[node.name][window])
        else:
            children = node.children
            node.add_features(seq=fitch_1(children[0].seq, children[1].seq))

    # Top-down pass: fix one state per site at the root, resolve every child
    # against its parent and accumulate the changes.
    mutation_counts = [0 for i in range(seq_length)]
    for node in master_tree2.traverse("preorder"):
        if node.is_leaf():
            continue
        if node.is_root():
            node.seq = [{list(x)[0]} for x in node.seq]
        children = node.children
        mutations = []
        child_sequences = []
        for child in children:
            (temp_1, temp_2) = fitch_2(node.seq, child.seq)
            child_sequences.append(temp_1)
            child.seq = temp_1
            mutations.append(temp_2)
        temp = []
        for n, (h, i, j) in enumerate(zip(mutation_counts, mutations[0], mutations[1])):
            if i + j == 0:
                temp.append(h)
            elif i + j == 1:
                temp.append(h + 1)
            else:
                # Both children changed at this site. Compare their states at
                # site n; i is only the 0/1 flag of the first child and must
                # not be used as the site index.
                if child_sequences[0][n] == child_sequences[1][n]:
                    temp.append(h + 1)
                else:
                    temp.append(h + 2)

        mutation_counts = temp
    return (start, stop, mutation_counts)

#  IMPORT REFERENCE SEQUENCE

for record in SeqIO.parse(datasets_dir + '/' + tb_genome_filename, "genbank"):
    full_sequence = str(record.seq)
tb_length = len(full_sequence)

#  LOAD VARIANT DICTIONARY AND IDS IN TREE

#if full_run == True:
if 1==1:
    with open(project_dir + '/variant_dict.pkl', 'rb') as f:
        variant_dict = pickle.load(f)
    with open(project_dir + '/ids.pkl', 'rb') as f:
        distinct_sequence_names = pickle.load(f)

    # 0-based genome positions of every SNP carried by a retained sequence
    # (Cryptic is 1 indexed, hence pos - 1). Duplicates are kept in this list.
    variant_positions = [
        pos - 1
        for k, v in variant_dict.items()
        if k in distinct_sequence_names
        for (pos, snp) in v
    ]
    sorted_variant_position_list = sorted(set(variant_positions))

    # Two-way mapping between genome position and its rank among variant positions.
    pos_id_dict = {pos: rank for rank, pos in enumerate(sorted_variant_position_list)}
    id_pos_dict = dict(enumerate(sorted_variant_position_list))
    for pickle_name, mapping in (('/pos_id_dict.pkl', pos_id_dict), ('/id_pos_dict.pkl', id_pos_dict)):
        with open(project_dir + pickle_name, 'wb') as f:
            pickle.dump(mapping, f)

    tb_variants_sequence_length = len(pos_id_dict)
    num_chunks = num_cores * num_iterations
    chunk_size = math.ceil(tb_variants_sequence_length/num_chunks)
    print(tb_variants_sequence_length)
    print(num_chunks, chunk_size, len(sorted_variant_position_list))

print("Stage_6 Complete")
#if full_run == True:
if 1==1:
    # Stage 7: regroup each retained sequence's SNPs by chunk of variant
    # positions: chunk -> sequence id -> [(position within chunk, upper-case SNP), ...]
    chunk_variant_dict = defaultdict(lambda: defaultdict(list))
    for k, v in variant_dict.items():
        if k in distinct_sequence_names:
            for (pos, snp) in v:
                # Integer divmod gives the chunk number and the offset inside it.
                chunk, position_in_chunk = divmod(pos_id_dict[pos-1], chunk_size)
                chunk_variant_dict[chunk][k].append((position_in_chunk, snp.upper()))
    a = sorted(chunk_variant_dict.keys())
    print(a)
    print(len(list(chunk_variant_dict.keys())))
    # Report the largest variant-position rank. The original printed the
    # leftover loop variable instead of the maximum it had just computed.
    max_id = max(pos_id_dict.values(), default=0)
    print("max ", max_id)
    for chunk in tqdm(list(chunk_variant_dict.keys())):
        with open(chunk_variant_dict_dir + '/chunk_variant_dictionary_'+str(chunk) + '.pkl', 'wb') as f:
            pickle.dump(chunk_variant_dict[chunk], f)
    print("Built dictionaries")
print("Stage_7 Complete")
#if full_run == True:
if 1==1:
    # Stage 8: call generate_variant_sequence_dict once per chunk number,
    # num_cores chunks per joblib batch (chunk = core_1 * num_cores + core_2).
    # The function returns nothing, so parallel_output only holds None values.
    for core_1 in tqdm(range(num_iterations)):
        parallel_output = Parallel(n_jobs=-1, timeout = timeout)(delayed(generate_variant_sequence_dict)(core_1 * num_cores + core_2) for (core_2) in core_numbers)

print("Stage_8 Complete")
#if full_run==True:  
if 1==1:
    # Stage 9: for every chunk file, count mutations in 24 parallel windows,
    # stitch the windows back together and store the result per chunk and overall.
    res = []
    filename_list = list_files(dictionary_dir)
    for filename in tqdm(filename_list):
        temp_2 = filename.split('_')
        start_pos, end_pos = int(temp_2[2]), int(temp_2[3])
        parallel_output = Parallel(n_jobs=-1)(delayed(generate_mutation_counts)(dictionary_dir+'/' + filename, core_number) for core_number in range(24))
        # Window results come back in core_number order; concatenate their counts.
        temp = [count for x in parallel_output for count in x[2]]
        chunk_result = (int(parallel_output[0][0]), int(parallel_output[0][1]), temp)
        res.append(chunk_result)
        with open(mutation_count_dir + '/mutation_counts_'+str(start_pos)+'_'+str(end_pos)+'_' + '.pkl', 'wb') as f:
            pickle.dump(chunk_result, f)
    # Order the chunks by their start position before saving the combined list.
    res.sort(key = lambda x: x[0])
    with open(mutation_count_dir + '/all_mutation_counts.pkl', 'wb') as f:
        pickle.dump(res, f)
print("Stage_9 Complete")
#if full_run==True:  
if 1==1:
    # Expand the counts at variant positions to one value per reference
    # position (0 wherever no variant was indexed), then save as pickle and CSV.
    with open(mutation_count_dir + '/all_mutation_counts.pkl', 'rb') as f:
        res = pickle.load(f)
    # Chunks are stored in start-position order, so concatenating them gives a
    # list addressed by the variant-position rank held in pos_id_dict.
    non_zero_mutation_counts = [count for chunk_result in res for count in chunk_result[2]]
    zero_and_non_zero_mutation_counts = [
        non_zero_mutation_counts[pos_id_dict[i]] if i in pos_id_dict else 0
        for i in range(len(full_sequence))
    ]
    with open(mutation_count_dir + '/zero_and_non_zero_mutation_counts.pkl', 'wb') as f:
        pickle.dump(zero_and_non_zero_mutation_counts, f)
    mutation_df = pd.DataFrame(zero_and_non_zero_mutation_counts, columns = ['Num_Mutations'])
    mutation_df.to_csv(project_dir + '/mutation_df.csv')

506022
960 528 506022
Stage_6 Complete
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 

100%|██████████| 959/959 [00:11<00:00, 86.85it/s] 


Built dictionaries
Stage_7 Complete


100%|██████████| 30/30 [27:27<00:00, 54.90s/it]


Stage_8 Complete


100%|██████████| 959/959 [4:16:32<00:00, 16.05s/it]  


Stage_9 Complete


In [7]:
# Sanity check: number of entries in the per-reference-position count list.
len(zero_and_non_zero_mutation_counts)

4411532

In [19]:
# Scratch: work on an independent copy so that `res` itself is left untouched.
nick = copy.deepcopy(res)

In [21]:
# Scratch: order the copied chunk results by their first field (start position).
nick.sort(key = lambda x: x[0])

In [24]:
# Scratch: start position of the first chunk after sorting.
nick[0][0]

0

In [25]:
# Re-run of the tail of stage 9 and of the genome-wide expansion, using the
# `res` list that is already in the kernel: sort it by start position,
# overwrite all_mutation_counts.pkl and regenerate the derived outputs.
# NOTE(review): this duplicates code from the cell above and only works while
# `res`, `pos_id_dict` and `full_sequence` are still defined in the kernel.
res.sort(key = lambda x: x[0])
with open(mutation_count_dir + '/all_mutation_counts.pkl', 'wb') as f:
    pickle.dump(res, f) 
print("Stage_9 Complete")
#if full_run==True:  
if 1==1:
    with open(mutation_count_dir + '/all_mutation_counts.pkl', 'rb') as f:
        res = pickle.load(f) 
    # Concatenate the per-chunk counts in chunk order.
    non_zero_mutation_counts = []
    for x in res:
        non_zero_mutation_counts += x[2]
    # One entry per reference position; 0 where the position is not in pos_id_dict.
    zero_and_non_zero_mutation_counts = []
    for i in range(len(full_sequence)):
        if i in pos_id_dict:
            zero_and_non_zero_mutation_counts.append(non_zero_mutation_counts[pos_id_dict[i]])
        else:
            zero_and_non_zero_mutation_counts.append(0)
    with open(mutation_count_dir + '/zero_and_non_zero_mutation_counts.pkl', 'wb') as f:
        pickle.dump(zero_and_non_zero_mutation_counts, f)  
    mutation_df = pd.DataFrame(zero_and_non_zero_mutation_counts, columns = ['Num_Mutations'])
    mutation_df.to_csv(project_dir + '/mutation_df.csv')   

Stage_9 Complete
