#### Directories etc

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
import ete3;



In [2]:
project_dir = 'D:/Project_Data/Project_9'
cryptic_input_path = "F:/Datasets/CRYPTIC_DATA/"
seq_dir = 'D:/Project_Data/Project_8/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'
tb_species = 'NC_000962.3' 
tb_genome_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
num_cores = 16
core_numbers = list(range(1, num_cores+1))

In [3]:
for record in SeqIO.parse(seq_dir + '/' + tb_genome_filename, "genbank"):
    full_sequence = str(record.seq)

In [4]:
barcode_df = pd.read_csv(project_dir +'/Barcode.csv')

In [5]:
full_run = False

#### Functions

In [6]:
def produce_sequences(position_list, variant_dict, position_dict):
    base_sequence = []
    for i in position_list:
        base_sequence.append(full_sequence[i-1])     # Cryptic are 1 based 
    output_sequence_dict ={}
    for k, v in variant_dict.items():
        output_sequence_dict[k] = copy.deepcopy(base_sequence)
    for i, pos in enumerate(position_list):
        if pos in position_dict:
            variant_info = position_dict[pos][1:]  # Miss out initlal "ref" record
            for (name, alt) in variant_info:
                output_sequence_dict[name][i] = alt.upper()
    output_sequences = []
    for k, v in output_sequence_dict.items():
        output_sequences.append(['seq_'+str(k), ''.join(v)])
    return output_sequences

In [7]:
def produce_sequences_to_score(position_list, variant_dict, position_dict, distinct_sequence_names):
    base_sequence = []
    for i in position_list:
        base_sequence.append({full_sequence[i-1]})     # Cryptic are 1 based 
    output_sequence_dict ={}
    for k, v in variant_dict.items():
        if k in distinct_sequence_names:
            output_sequence_dict[k] = copy.deepcopy(base_sequence)
    for i, pos in enumerate(position_list):
        if pos in position_dict:
            variant_info = position_dict[pos][1:]  # Miss out initlal "ref" record
            for (name, alt) in variant_info:
                if name in output_sequence_dict:
                    output_sequence_dict[name][i] = {alt.upper()}
    temp_dict = {}
    for k, v in output_sequence_dict.items():
        temp_dict['seq_'+str(k)] = v
    return temp_dict

In [8]:
def fitch_1(list_1, list_2):
    res =[]
    for i, j in zip(list_1, list_2):
        a = i.intersection(j)
        if len(a) == 0:
            a = i.union(j)
        res.append(a)
    return res

def fitch_2(parent_list, child_list):
    res = []
    mutations = []
    for i, j in zip(parent_list, child_list):
        mutation = 0
        a = i.intersection(j)
        if len(a) == 0:
            a = set(list(j)[0])
            mutation = 1
        res.append(a)
        if mutation == 1:
            mutations.append(1)
        else:
            mutations.append(0)
    return (res, mutations)

In [9]:
def bin_formula(position_3_counts, tot_bin_counts):
    return 1- binom.cdf(position_3_counts-1, tot_bin_counts,1/3)

In [10]:
def mutation_bin_probability(mutation_counts):
    bin_counts = [0,0,0]
    for i, c in enumerate(mutation_counts):
        bin_counts[i % 3] += c
    if sum(bin_counts) == 0:
        return (2)
    else:
        return (bin_counts, bin_formula(bin_counts[2], sum(bin_counts)))  

#### Create variant dictionaries 

In [11]:
if full_run == True:
    variant_df = pd.read_csv(cryptic_input_path + "VARIANTS.csv") 
    with open(project_dir + '/variant_df.pkl', 'wb') as f:
        pickle.dump(variant_df[['UNIQUEID', 'VARIANT', 'MUTATION_TYPE', 'IS_NULL', 'IS_HET', 'IS_FILTER_PASS', 'IS_SNP', 'REF', 'ALT', 'GENOME_INDEX']], f)    

In [12]:
if full_run == True:
    position_dict = {}
    variant_dict = {}
    id_dict = {}
    with open(project_dir + '/variant_df.pkl', 'rb') as f:
        variant_df = pickle.load(f) 
        unique_ids = variant_df.UNIQUEID.unique()
        for i, unique_id in enumerate(unique_ids):
            id_dict[unique_id] = i

        for i, r in variant_df.iterrows():
            if r['IS_NULL'] == False and r['IS_FILTER_PASS'] == True and r['IS_HET'] == False and r['IS_SNP'] == True :
                
                if id_dict[r['UNIQUEID']] in variant_dict:
                    variant_dict[id_dict[r['UNIQUEID']]].append((r['GENOME_INDEX'], r['ALT']))
                else:
                    variant_dict[id_dict[r['UNIQUEID']]] = [(r['GENOME_INDEX'], r['ALT'])]

                if r['GENOME_INDEX'] in position_dict:
                    position_dict[r['GENOME_INDEX']].append((id_dict[r['UNIQUEID']], r['ALT']))
                else:
                    position_dict[r['GENOME_INDEX']] = [r['REF'], (id_dict[r['UNIQUEID']], r['ALT'])]    # If first entry also include reference value for info

    with open(project_dir + '/id_dict.pkl', 'wb') as f:
        pickle.dump(id_dict, f)
    with open(project_dir + '/variant_dict.pkl', 'wb') as f:
        pickle.dump(variant_dict, f) 
    with open(project_dir + '/position_dict.pkl', 'wb') as f:
        pickle.dump(position_dict, f) 
        

In [13]:
if full_run == False:
    with open(project_dir + '/id_dict.pkl', 'rb') as f:
        id_dict = pickle.load(f)  
    with open(project_dir + '/variant_dict.pkl', 'rb') as f:
        variant_dict = pickle.load(f)  
    with open(project_dir + '/position_dict.pkl', 'rb') as f:
        position_dict = pickle.load(f)  

#### Output full sequences and distinct sequences based on barcode positions for construction of tree

In [14]:
barcode_snps = []
for i, r in barcode_df.iterrows():
    start = r['start'] + 1            # Barcode file is 1 indexed
    lineage_name = r['lineage']
    if start in position_dict and 'lineage' in lineage_name:   # Just TB lineages
        barcode_snps.append(start)

In [15]:
sequences = produce_sequences(barcode_snps, variant_dict, position_dict)
util.produce_fasta_file(sequences, project_dir + '/' + 'tb_variants.faa')

100%|██████████| 77580/77580 [00:00<00:00, 112085.47it/s]


In [16]:
distinct_sequences = []
distinct_sequence_names = []
temp_dict = {}
for (ref, seq) in sequences:
    temp_dict[seq] = ref
for k, v in temp_dict.items():    
    distinct_sequences.append([v, k])
    distinct_sequence_names.append(int(v.split('_')[1]))
util.produce_fasta_file(distinct_sequences, project_dir + '/' + 'distinct_tb_variants.faa')

100%|██████████| 3045/3045 [00:00<00:00, 106608.14it/s]


In [17]:
master_tree = ete3.Tree(project_dir + '/' + 'distinct_tb_variants.nwk')

#### Build sequences of interest and then run Fitch algorithm to assess mutations

In [18]:
def calculation_mutation_significance(start, stop):    # 1 based
    positions = list(range(start,stop))
    sequence_to_score_dict = produce_sequences_to_score(positions, variant_dict, position_dict, distinct_sequence_names)

    for node in master_tree.traverse("postorder"):
        if node.is_leaf():
            node.add_features(seq = sequence_to_score_dict[node.name])
        else:
            children = node.children
            node.add_features(seq = fitch_1(children[0].seq, children[1].seq))
    for k, v in sequence_to_score_dict.items():              
        seq_length = len(v)
        break

    mutation_counts = [0 for i in range(seq_length)]
    for node in master_tree.traverse("preorder"):
        if node.is_leaf():
            continue
        if node.is_root():
            node.seq = [{list(x)[0]} for x in node.seq]
        children = node.children
        mutations = []
        for child in children:
            (temp_1, temp_2) = fitch_2(node.seq ,child.seq)
            child.seq = temp_1
            mutations.append(temp_2)
        temp = []
        for h, i, j in zip(mutation_counts, mutations[0], mutations[1]):
            temp.append(h+max(i, j))
        mutation_counts = temp        
    return mutation_bin_probability(mutation_counts)

In [19]:
cds_boundaries = []
for genome_record in SeqIO.parse(seq_dir + '/' + tb_genome_filename, "genbank"):
    for feature in genome_record.features:
        if feature.type == 'CDS':
            a = feature.qualifiers  
            cds_boundaries.append((a.get("locus_tag")[0], int(feature.location.start), int(feature.location.end), int(feature.location.strand)))   

In [20]:
res = []
for (locus, start, stop, strand) in tqdm(cds_boundaries):
    if strand == 1:
        temp = calculation_mutation_significance(start+1, stop+1)
        res.append((locus, temp))
    else:
        continue

  0%|          | 1/3906 [00:53<58:09:30, 53.62s/it]

Rv0001 ([546, 152, 5048], 0.0)


  0%|          | 2/3906 [01:26<44:57:13, 41.45s/it]

Rv0002 ([274, 153, 879], 0.0)


  0%|          | 3/3906 [02:00<40:59:51, 37.82s/it]

Rv0003 ([65, 148, 175], 9.768844403401644e-07)


  0%|          | 4/3906 [02:20<33:26:18, 30.85s/it]

Rv0004 ([63, 44, 195], 0.0)


  0%|          | 5/3906 [03:19<44:21:54, 40.94s/it]

Rv0005 ([1368, 487, 6976], 0.0)


  0%|          | 6/3906 [04:29<55:12:55, 50.97s/it]

Rv0006 ([1544, 1036, 8073], 0.0)


  0%|          | 7/3906 [04:57<46:58:30, 43.37s/it]

Rv0007 ([342, 149, 736], 0.0)


  0%|          | 9/3906 [05:15<29:02:33, 26.83s/it]

Rv0009 ([265, 170, 1618], 0.0)


  0%|          | 12/3906 [05:44<19:37:41, 18.15s/it]

Rv0012 ([303, 129, 680], 0.0)


  0%|          | 13/3906 [06:07<20:30:15, 18.96s/it]

Rv0013 ([206, 121, 1062], 0.0)


  1%|          | 23/3906 [06:32<7:27:25,  6.91s/it] 

Rv0023 ([185, 134, 1222], 0.0)


  1%|          | 24/3906 [07:00<9:37:00,  8.92s/it]

Rv0024 ([141, 58, 463], 0.0)


  1%|          | 25/3906 [07:15<10:20:55,  9.60s/it]

Rv0025 ([16, 38, 16], 0.9793306035604931)


  1%|          | 26/3906 [08:03<16:23:52, 15.21s/it]

Rv0026 ([156, 80, 101], 0.9152039548444021)


  1%|          | 27/3906 [08:11<15:06:02, 14.01s/it]

Rv0027 ([27, 16, 32], 0.057615076935598974)


  1%|          | 28/3906 [08:25<15:00:49, 13.94s/it]

Rv0028 ([69, 36, 436], 0.0)


  1%|          | 29/3906 [09:04<21:00:38, 19.51s/it]

Rv0029 ([107, 75, 159], 2.492539175058539e-07)


  1%|          | 30/3906 [09:13<18:09:01, 16.86s/it]

Rv0030 ([69, 82, 153], 7.392921830273735e-10)


  1%|          | 31/3906 [10:23<32:53:01, 30.55s/it]

Rv0032 ([204, 109, 222], 4.8911940802476295e-05)


  1%|          | 32/3906 [10:31<26:17:58, 24.44s/it]

Rv0033 ([14, 7, 18], 0.06559989996044124)


  1%|          | 33/3906 [10:47<23:48:31, 22.13s/it]

Rv0034 ([15, 18, 129], 0.0)


  1%|          | 34/3906 [11:41<33:21:13, 31.01s/it]

Rv0035 ([196, 133, 110], 0.9999323877978199)


  1%|          | 37/3906 [11:59<18:37:27, 17.33s/it]

Rv0038 ([392, 163, 1750], 0.0)


  1%|          | 40/3906 [13:22<23:33:33, 21.94s/it]

Rv0041 ([1874, 1000, 6910], 0.0)


  1%|          | 48/3906 [13:37<10:02:16,  9.37s/it]

Rv0049 ([154, 47, 950], 0.0)


  1%|▏         | 49/3906 [14:37<15:33:01, 14.51s/it]

Rv0050 ([1363, 486, 4269], 0.0)


  1%|▏         | 50/3906 [15:24<19:53:18, 18.57s/it]

Rv0051 ([902, 290, 2179], 0.0)


  1%|▏         | 51/3906 [15:42<19:48:08, 18.49s/it]

Rv0052 ([37, 23, 73], 2.6718790036017737e-07)


  1%|▏         | 52/3906 [15:56<18:52:51, 17.64s/it]

Rv0053 ([137, 79, 675], 0.0)


  1%|▏         | 53/3906 [16:18<19:43:45, 18.43s/it]

Rv0054 ([129, 80, 1320], 0.0)


  1%|▏         | 54/3906 [16:27<17:39:11, 16.50s/it]

Rv0055 ([122, 78, 694], 0.0)


  1%|▏         | 55/3906 [16:45<18:00:50, 16.84s/it]

Rv0056 ([1801, 1861, 2410], 0.0)


  1%|▏         | 56/3906 [17:04<18:38:13, 17.43s/it]

Rv0057 ([632, 702, 756], 0.003320677234471381)


  1%|▏         | 57/3906 [18:28<37:43:38, 35.29s/it]

Rv0058 ([861, 370, 4835], 0.0)


  1%|▏         | 58/3906 [18:48<33:05:25, 30.96s/it]

Rv0059 ([33, 62, 90], 1.1756294457820893e-05)


  2%|▏         | 59/3906 [19:22<34:00:12, 31.82s/it]

Rv0060 ([26, 19, 96], 0.0)


  2%|▏         | 61/3906 [19:54<26:28:36, 24.79s/it]

Rv0062 ([414, 267, 1127], 0.0)


  2%|▏         | 61/3906 [20:11<21:12:38, 19.86s/it]


KeyboardInterrupt: 

In [None]:
out_list = []
for i, (loc_name, x,y) in enumerate((cds_boundaries)):
    if i < len(cds_boundaries) - 1:
        temp = cds_boundaries[i+1][1]
        if temp - y > 100:
            res = max_orf(y, temp, in_frame=True, genbank_format = True)
            if res[3][0] < 0.05 and res[3][1] > 0 and res[1] - res[0] > 100:
                print(loc_name, res)
                out_list.append([loc_name, res[0], res[1], res[2], res[3][0]])
df=pd.DataFrame(out_list,columns=['locus','start','end','strand','p_value'])
df.to_csv(cryptic_output_path + '/significant_degenerate_patterns_intergenic.csv')

#### Legacy code (might be useful)

In [None]:
def split_tree(species_to_split, position):
    temp = fast_position_dict[position].intersection(species_to_split)
    temp_2 = species_to_split - temp
    if len(temp_2) > 1 and len(temp) > 1:
        return ([(position, temp, len(temp)), (position, temp_2, len(temp_2))])
    else:
        return([(-1, species_to_split, len(species_to_split))])

In [None]:
def optimal_split_position(species_to_split_list):
    best_split_num = 1e20
    best_position = 0
    #for k in barcode_snps:
    for k, v in fast_position_dict.items():
        #v = fast_position_dict[k]
        mutation_count = 0
        for species_to_split in species_to_split_list:
            num_species_to_split = len(species_to_split)
            optimal_split = int(num_species_to_split/2)
            mutation_count += abs(len(v.intersection(species_to_split)) - optimal_split)
       
        if abs(mutation_count) < abs(best_split_num):
            best_position = k
            best_split_num = abs(mutation_count)
          
    return(best_position, best_split_num)

In [None]:
all_species = [[-1,set(x for x in range(len(id_dict))),99,True]]
pos = optimal_split_position([x[1] for x in all_species])[0]
split_results = [split_tree(all_species[0][1], pos)]
print( [(x[0], x[2]) for x in split_results[0]] )
for i in range(1,10):
    start = time.process_time()
    split_results.append([])
    posn = optimal_split_position([x[1] for x in split_results[i-1]])
    pos= posn[0]
    split_score = posn[1]
    successful_splits = 0
    for x in split_results[i-1]:
        temp = split_tree(x[1], pos) 
        if len(temp) == 1:
            split_results[i].append(temp[0])
        else:
            successful_splits +=1
            split_results[i].append(temp[0])
            split_results[i].append(temp[1])
    print(max([x[2] for x in split_results[i]]), [(x[0], x[2]) for x in split_results[i]] )
    if successful_splits == 0:
        break

In [None]:
temp = []
for res in split_results:
    for info in res:
        temp.append(info[0])
snps = set(temp)
snps.remove(-1)

In [None]:
def consensus(seq_list):
    consensus = []
    seq_len = len(seq_list[0])
    for i in range(seq_len):
        temp = [x[i] for x in seq_list]
        max_count = 0
        consensus_letter = 'X'
        for letter in ['A', 'C' ,'G', 'T']:
            tempct = temp.count(letter)
            if tempct > max_count:
                max_count = tempct
                consensus_letter = letter
        consensus.append(consensus_letter)
    return ''.join(consensus)