In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import ete3;



In [2]:
project_dir = 'D:/Project_Data/Project_9'
cryptic_input_path = "F:/Datasets/CRYPTIC_DATA/"

In [3]:
seq_dir = 'D:/Project_Data/Project_8/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'
tb_species = 'NC_000962.3' 
tb_genome_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
for record in SeqIO.parse(seq_dir + '/' + tb_genome_filename, "genbank"):
           full_sequence = str(record.seq)

In [4]:
full_run = False

In [5]:
if full_run == True:
    variant_df = pd.read_csv(cryptic_input_path + "VARIANTS.csv") 
    with open(project_dir + '/variant_df.pkl', 'wb') as f:
        pickle.dump(variant_df[['UNIQUEID', 'VARIANT', 'MUTATION_TYPE', 'IS_NULL', 'IS_HET', 'IS_FILTER_PASS', 'IS_SNP', 'REF', 'ALT', 'GENOME_INDEX']], f)    

In [6]:
if full_run == True:
    variant_dict = {}
    for i, r in variant_df.iterrows():
         if r['IS_NULL'] == False and r['IS_FILTER_PASS'] == True and r['IS_HET'] == False and r['IS_SNP'] == True :
            if r['UNIQUEID'] in variant_dict:
                temp = (variant_dict[r['UNIQUEID']])
                variant_dict[r['UNIQUEID']].append(r['VARIANT'])
            else:
                variant_dict[r['UNIQUEID']] = [r['VARIANT']]
    with open(project_dir + '/cryptic_variant_dict.pkl', 'wb') as f:
        pickle.dump(variant_dict, f) 
else:
    with open(project_dir + '/cryptic_variant_dict.pkl', 'rb') as f:
        variant_dict = pickle.load(f) 

In [7]:
if full_run == True:
    with open(project_dir + '/variant_df.pkl', 'rb') as f:
        variant_df = pickle.load(f) 
    position_dict = {}
    for i, r in variant_df.iterrows():
         if r['IS_NULL'] == False and r['IS_FILTER_PASS'] == True and r['IS_HET'] == False and r['IS_SNP'] == True :
            if r['GENOME_INDEX'] in position_dict:
                position_dict[r['GENOME_INDEX']].append((r['UNIQUEID'], r['ALT']))
            else:
                position_dict[r['GENOME_INDEX']] = [r['REF'], (r['UNIQUEID'], r['ALT'])]
    with open(project_dir + '/cryptic_position_dict.pkl', 'wb') as f:
        pickle.dump(position_dict, f) 
else:
    with open(project_dir + '/cryptic_position_dict.pkl', 'rb') as f:
        position_dict = pickle.load(f) 

In [8]:
id_dict = {}
for i, (k, v) in enumerate(variant_dict.items()):
    id_dict[k] = i
fast_position_dict = {}
for i, (k, v) in enumerate(position_dict.items()):
    temp = set([id_dict[x[0]] for x in v[1:]])
    if len(temp) >= 50:     # Exclude small SNPs
        fast_position_dict[k] = temp

In [9]:
def split_tree(species_to_split, position):
    temp = fast_position_dict[position].intersection(species_to_split)
    temp_2 = species_to_split - temp
    if len(temp_2) > 1 and len(temp) > 1:
        return ([(position, temp, len(temp)), (position, temp_2, len(temp_2))])
    else:
        return([(-1, species_to_split, len(species_to_split))])

In [10]:
def optimal_split_position(species_to_split_list):
    best_split_num = 1e20
    best_position = 0

    for k, v in fast_position_dict.items():
        mutation_count = 0
        for species_to_split in species_to_split_list:
            num_species_to_split = len(species_to_split)
            optimal_split = int(num_species_to_split/2)
            mutation_count += abs(len(v.intersection(species_to_split)) - optimal_split)
       
        if abs(mutation_count) < abs(best_split_num):
            best_position = k
            best_split_num = abs(mutation_count)
          
    return(best_position, best_split_num)

In [176]:
all_species = [[-1,set(x for x in range(len(id_dict))),99,True]]
pos = optimal_split_position([x[1] for x in all_species])[0]
split_results = [split_tree(all_species[0][1], pos)]
print( [(x[0], x[2]) for x in split_results[0]] )
for i in range(1,12):
    start = time.process_time()
    split_results.append([])
    posn = optimal_split_position([x[1] for x in split_results[i-1]])
    pos= posn[0]
    split_score = posn[1]
    successful_splits = 0
    for x in split_results[i-1]:
        temp = split_tree(x[1], pos) 
        if len(temp) == 1:
            split_results[i].append(temp[0])
        else:
            successful_splits +=1
            split_results[i].append(temp[0])
            split_results[i].append(temp[1])
    print(i, time.process_time() - start, split_score)
    if successful_splits == 0:
        break

[(836658, 38166), (836658, 39414)]
1 4.15625 1327
2 5.015625 7045
3 7.15625 16920
4 10.75 18956
5 15.09375 19515
6 22.640625 21145
7 31.890625 21594
8 44.984375 23025
9 64.21875 24206
10 84.953125 25675
11 110.125 26218


In [177]:
temp = []
for res in split_results:
    for info in res:
        temp.append(info[0])
snps = set(temp)
snps.remove(-1)
len(snps)

In [224]:
sequence_dict = {}
sequence_dict_2 = {}
for i in snps: 
    if i in fast_position_dict:
        for (name, alt) in position_dict[i][1:]:
            name2 = 'seq_'+ str(id_dict[name])
            if name2 in sequence_dict:
                sequence_dict[name2].append((i, alt.upper()))
            else:
                sequence_dict[name2] = [(i, alt.upper())]

temp = []
for key, val in sequence_dict.items():
    if val not in temp:
        temp.append(val)
        sequence_dict_2[key] = val           
                
sequences = []
for k, v in sequence_dict.items():
    sequence = []
    for i in snps: 
        sequence.append(full_sequence[i-1])
        for (pos, alt) in v:
            if pos == i:
                sequence[-1:] = alt
    sequences.append([k, ''.join(sequence)])
    
util.produce_fasta_file(sequences, project_dir + '/' + 'tb_variants.faa')

sequences = []
for k, v in sequence_dict_2.items():
    sequence = []
    for i in snps: 
        sequence.append(full_sequence[i-1])
        for (pos, alt) in v:
            if pos == i:
                sequence[-1:] = alt
    sequences.append([k, ''.join(sequence)])
util.produce_fasta_file(sequences, project_dir + '/' + 'distinct_tb_variants.faa')
 

100%|██████████| 76952/76952 [00:00<00:00, 742682.32it/s]
100%|██████████| 2295/2295 [00:00<00:00, 756755.32it/s]


In [220]:
master_tree = ete3.Tree(project_dir + '/' + 'distinct_tb_variants.faa.treefile')

In [239]:
sequence_dict = {}

for i in range(6000,7000): 
    if i in fast_position_dict:
        for (name, alt) in position_dict[i][1:]:
            name2 = 'seq_'+ str(id_dict[name])
            if name2 in sequence_dict:
                sequence_dict[name2].append((i, alt.upper()))
            else:
                sequence_dict[name2] = [(i, alt.upper())]

sequence_to_score_dict = {}
for k, v in sequence_dict.items():
    if k in sequence_dict_2:
        sequence = []
        for i in range(6000,7000): 
            sequence.append(full_sequence[i-1])
            for (pos, alt) in v:
                if pos == i:
                    sequence[-1:] = alt
        sequence_to_score_dict[k] = [{i} for i in ''.join(sequence)]  

for k, v in sequence_dict_2.items():
    if not(k in sequence_to_score_dict):
        sequence_to_score_dict[k] = [{i} for i in full_sequence[5999:6999]]

In [240]:
def fitch_1(list_1, list_2):
    res =[]
    for i, j in zip(list_1, list_2):
        a = i.intersection(j)
        if len(a) == 0:
            a = i.union(j)
        res.append(a)
    return res

def fitch_2(parent_list, child_list):
    res = []
    mutations = []
    for i, j in zip(parent_list, child_list):
        mutation = 0
        a = i.intersection(j)
        if len(a) == 0:
            a = set(list(j)[0])
            mutation = 1
        res.append(a)
        if mutation == 1:
            mutations.append(1)
        else:
            mutations.append(0)
    return (res, mutations)

In [241]:
for node in master_tree.traverse("postorder"):
    if node.is_leaf():
        node.add_features(seq = sequence_to_score_dict[node.name])
    else:
        children = node.children
        node.add_features(seq = fitch_1(children[0].seq, children[1].seq))
for k, v in sequence_to_score_dict.items():              
    seq_length = len(v)
    break

mutation_counts = [0 for i in range(seq_length)]
for node in master_tree.traverse("preorder"):
    if node.is_leaf():
        continue
    if node.is_root():
        node.seq = [{list(x)[0]} for x in node.seq]
    children = node.children
    mutations = []
    for child in children:
        (temp_1, temp_2) = fitch_2(node.seq ,child.seq)
        child.seq = temp_1
        mutations.append(temp_2)
    temp = []
    for h, i, j in zip(mutation_counts, mutations[0], mutations[1]):
        temp.append(h+max(i, j))
    mutation_counts = temp        

In [242]:
mutation_counts

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 13,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 125,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 37,
 0,
 0,
 4,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 33,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 5,
 5,
 0,
 0,
 0,
 0,

In [None]:
blastfn.run_rscape(project_dir, 'Rvnt30.sto', 'rscape_3')

In [None]:
tot = 0
start = 0
best = 0
for i in range(1,4000000):
    if not (i in position_dict):
        tot+=1
        if start == 0:
            start = i
    else:
        if tot > best:
            best = tot
            print("new best", best, start, i-1)
        start = 0
        tot = 0

In [None]:
print(len(a), len(b), len(a.symmetric_difference(b)))

In [None]:
len(a.intersection(b))

In [None]:
distances = np.zeros((len(variant_dict), 1))
for i, (k, v) in enumerate(variant_dict.items()):
    if i == 2:
        a = set(v)
    b = set(v)
    distances[i, 0] = len(a.symmetric_difference(b))
        

In [None]:
with open(project_dir + '/variant_df.pkl', 'rb') as f:
    variant_df = pickle.load(f) 

In [None]:
variant_df.dtypes

In [None]:
for i, r in variant_df.iterrows():
    print(r)
    if i > 10:
        break

In [None]:
for i, r in variant_df.iterrows():
    if '1416523g>x' == r['VARIANT']:
        print(r)
        if not(math.isnan(r['MUTATION_TYPE'])):
            print("Hello")
        break
    

In [None]:
def bin_formula(max_bin_counts, tot_bin_counts, in_frame = False):
    successes = 0
    if in_frame == False:
        for i in range(10000):
            a = np.random.default_rng().multinomial(tot_bin_counts, np.array([1/3, 1/3, 1/3]), size=None)
            if max(a) >= max_bin_counts:
                successes +=1
    else:
        for i in range(10000):
            a = np.random.default_rng().multinomial(tot_bin_counts, np.array([1/3, 1/3, 1/3]), size=None)
            if a[0] >= max_bin_counts:
                successes +=1
    return successes / 10000

In [None]:
cds_boundaries = []
for feature in genome_record.features:

        if feature.type == 'CDS':
            a = feature.qualifiers  
            cds_boundaries.append((a.get("locus_tag")[0], int(feature.location.start), int(feature.location.end)))   

In [None]:
def mutation_bin_probability(start, end, strand, in_frame = False):
    mutations = []
    for i in range(start,end):
        for letter in ['a','c','g','t']:
            if (i+1, 'SNP', letter) in variant_count_dict:        #Variant count dictionary positions are one indexed
                mutations.append(i)
    bin_counts =[0,0,0]
    for m in mutations:
        if strand == 1:
            bin_counts[(m-(start))%3] +=1
        else:
            bin_counts[((end-1)-m)%3] +=1
    #return (((1-binom.cdf(max(bin_counts),sum(bin_counts),1/3))*3)
    if in_frame == False:
        return (bin_formula(max(bin_counts), sum(bin_counts), in_frame), sum(bin_counts), bin_counts)  
    else:
        return (bin_formula(bin_counts[2], sum(bin_counts), in_frame), sum(bin_counts), bin_counts)  

In [None]:
def big_changes(start, end, strand):
    important_mutations = 0
    all_mutations = 0
    if strand == 1:
        for letter in ['a','c','g','t']:
            if (start + 2, 'SNP', letter) in variant_count_dict:    #one indexed dictionary
                important_mutations+=(variant_count_dict[(start + 2, 'SNP', letter)])
            if (start + 3, 'SNP', letter) in variant_count_dict:
                important_mutations+=(variant_count_dict[(start + 3, 'SNP', letter)]) 
    else:
        for letter in ['a','c','g','t']:
            if (end - 1, 'SNP', letter) in variant_count_dict:
                important_mutations+=(variant_count_dict[(end-1, 'SNP', letter)])
            if (end - 2, 'SNP', letter) in variant_count_dict:
                important_mutations+=(variant_count_dict[(end-2, 'SNP', letter)]) 
    for i in range(start, end):
        for letter in ['a','c','g','t']:
            if (i+1, 'SNP', letter) in variant_count_dict:        #Variant count dictionary positions are one indexed
                all_mutations+=variant_count_dict[(i+1, 'SNP', letter)]
    return (important_mutations, all_mutations)


In [None]:
def generate_cds_probs(num_subsets, subset_num, boundaries):
    cds_boundaries = util.chunk_list(boundaries, num_subsets, subset_num)
    probs = []
    lens = []
    no_mutations = []
    for (loc_name,x,y) in cds_boundaries:
        (temp, num_mutations) = mutation_bin_probability(x, y)
        if num_mutations > 0:
            probs.append(temp)
            lens.append(y-x)
        else:
            no_mutations.append(loc_name)
    return (probs, lens, no_mutations)

In [None]:
parallel_output = Parallel(n_jobs=-1)(delayed(generate_cds_probs)(num_cores, core_number, cds_boundaries) for core_number in core_numbers)
temp = [item for sublist in parallel_output for item in sublist]

In [None]:
t1 =[x[0] for x in parallel_output]
t2 =[x[1] for x in parallel_output]
temp1 = [item for sublist in t1 for item in sublist]
temp2 = [item for sublist in t2 for item in sublist]
sns.scatterplot(y=temp1, x=temp2)

#sns.ecdfplot(temp1, ax = ax)


In [None]:
print(len(temp))
sns.histplot(probs, bins = 100)

In [None]:
sns.ecdfplot(probs)

In [None]:
print(no_mutations)

In [None]:
low_prob_regions = []
non_cds_probs = []
non_cds_no_mutations = []
for i, (loc_name, x,y) in enumerate(tqdm(cds_boundaries)):
    if i < len(cds_boundaries) - 1:
        temp = cds_boundaries[i+1][1]
        if temp - y > 100:
            (temp2, num_mutations) = mutation_bin_probability(y, temp)
            if num_mutations > 0:
                non_cds_probs.append(temp2)
                if temp2 < 0.01:
                    low_prob_regions.append(loc_name)
            else:
                non_cds_no_mutations.append(loc_name)

In [None]:
def max_orf(seq_start, seq_stop, in_frame, output_all_orfs = False):
    orfs_found = []
    max_len = 0
    orf_length = 0
    start_pos = -999
    end_pos = -999
    for frame in ['Forward', 'Reverse']:
        if frame == 'Forward':
            temp = (full_sequence[seq_start: seq_stop])
        else:
            temp = align.reverse_complement(full_sequence[seq_start: seq_stop])
        seq_len = len(temp)
        for i in range(seq_len - 2):
            test_codon = temp[i: i+3] 
            if test_codon in ['ATG','GTG','TTG']:  #Missed out CTG as doesn't seem to be used very much at all
                for j in range(i + 3, seq_len - 2, 3):
                    test_codon_2 = temp[j: j+3] 
                    if test_codon_2 in ['TAG','TGA','TAA']:
                        orf_length = j - i
                        break
                if orf_length > 0:
                    if frame == 'Forward':
                        orf_start =  seq_start + i
                        orf_end = seq_start + j+3
                        orf_strand = 1
                    else:
                        orf_start =  seq_start + seq_len-(j+3)
                        orf_end = seq_start + seq_len-i
                        orf_strand = -1
                    orfs_found.append((orf_start, orf_end, orf_strand, orf_length, mutation_bin_probability(orf_start, orf_end, orf_strand, in_frame=True)))
                   
                if orf_length > max_len:
                    max_len = orf_length
                    start_pos = orf_start
                    end_pos = orf_end
                    strand = orf_strand 
               
    if output_all_orfs == True:
        sorted_orfs = sorted(orfs_found, key=lambda x: x[3], reverse=True)
        return sorted_orfs
    elif start_pos == -999:
        return(0,0,0,(0,0,[0,0,0]))
    else: 
        return(start_pos, end_pos, strand, mutation_bin_probability(start_pos, end_pos, strand, in_frame=True))

In [None]:
print(cds_boundaries[200:250])

In [None]:
max_orf(577338,578426, in_frame=True, output_all_orfs = True)    #(11500,13000)    (2272700,2275000) 

In [None]:
cds_boundaries = []
for feature in genome_record.features:
    if feature.type == 'CDS':
            a = feature.qualifiers  
            cds_boundaries.append((a.get("locus_tag")[0], int(feature.location.start), int(feature.location.end)))   

In [None]:
out_list = []
for i, (loc_name, x,y) in enumerate((cds_boundaries)):
    if i < len(cds_boundaries) - 1:
        temp = cds_boundaries[i+1][1]
        if temp - y > 100:
            res = max_orf(y, temp, in_frame=True, genbank_format = True)
            if res[3][0] < 0.05 and res[3][1] > 0 and res[1] - res[0] > 100:
                print(loc_name, res)
                out_list.append([loc_name, res[0], res[1], res[2], res[3][0]])
df=pd.DataFrame(out_list,columns=['locus','start','end','strand','p_value'])
df.to_csv(cryptic_output_path + '/significant_degenerate_patterns_intergenic.csv')

In [None]:
df=pd.DataFrame(out_list,columns=['locus','start','end','strand','p_value'])
df.to_csv(cryptic_output_path + '/significant_degenerate_patterns_intergenic.csv')

In [None]:
print(align.reverse_complement(full_sequence[578027:578192]))

In [None]:
print((full_sequence[578027:578192]))

In [None]:
num_cores = 16
core_numbers = list(range(1, num_cores+1))
tb_species = 'GCF_000195955.2'
outgroup_species = 'GCF_000696675.2'
non_cds_offset = 50
genome_ids_with_outgroup = util.list_dirs(genome_datasets_dir)
genome_ids = util.list_dirs(genome_datasets_dir)
genome_ids.remove(outgroup_species)
non_target_genome_ids = util.list_dirs(genome_datasets_dir)
non_target_genome_ids.remove(outgroup_species)
non_target_genome_ids.remove(tb_species)
num_ids = len(genome_ids)
num_ids_with_outgroup = len(genome_ids_with_outgroup)
orthologs = sar.Ortholog_Grouping(ortholog_dir)
outgroup_orthologs = sar.Ortholog_Grouping(outgroup_ortholog_dir)
seq_data = sar.Ortholog_Sequence_Dataset(orthologs, genome_datasets_dir, genome_ids, non_cds_offset, tb_species) 
outgroup_seq_data = sar.Ortholog_Sequence_Dataset(outgroup_orthologs, genome_datasets_dir, genome_ids_with_outgroup, non_cds_offset, tb_species) 
all_copy_seq_data = sar.Ortholog_Sequence_Dataset(orthologs, genome_datasets_dir, genome_ids, non_cds_offset, tb_species, single_copy = False) 
#print(outgroup_seq_data.species_info())

In [None]:
print(len(non_cds_probs))
sns.histplot(non_cds_probs, bins = 100)

In [None]:
sns.ecdfplot(non_cds_probs)

In [None]:
temp = all_copy_seq_data.sequence_data
for locus_id in no_mutations:
    print(locus_id)
    if len(temp[temp['locus_tag'] == locus_id]) > 0:
        group_id = temp[temp['locus_tag'] == locus_id].iloc[0]['group_id']
        temp[temp['group_id'] == group_id]
    else:
        print ("No record of "+ locus_id)

In [None]:
print(low_prob_regions)

In [None]:
ryptic_variant_count_dict = {}
for i in range(len(full_sequence)):
    if i+1 in variant_count_dict:
        cryptic_variant_count_dict[i] = variant_count_dict[i+1]
    else:
        cryptic_variant_count_dict[i] = 0
with open(literature_datasets_dir + '/' + 'cryptic_variant_count_dictionary.pkl','wb') as f:
    pickle.dump(cryptic_variant_count_dict, f)

In [None]:
for i in range(4099696, 4099098, -1):
    if i in nick_dict:
        print(i, nick_dict[i])
    else:
        print(i, 0)

In [None]:
print(variant_count_dict[4099403])

In [None]:
max_key = 0
for key, value in variant_count_dict.items():
    if key > max_key:
        max_key = key
print(max_key)

In [None]:
nick_dict = []
for i in range(max_key + 1):
    if i in variant_count_dict:
        nick_dict.append(variant_count_dict[i])
    else:
        nick_dict.append(0)


In [None]:
start = 4099375
end = 4099521
x_ax = list(range(start,end))
y_ax = nick_dict[start:end]
y_alt_ax = nick_dict[start:end]
for i, val in enumerate(y_ax):
    if val > 0:
        y_alt_ax[i] = val
        y_ax[i] = 1
        
plt.plot(x_ax, y_alt_ax)
plt.show()

plt.plot(x_ax, y_ax)
plt.show()

# Number of samples in normalized_tone
N = len(x_ax)

yf = fft(y_ax)
xf = fftfreq(N, 1)

plt.plot(xf[1:], np.abs(yf)[1:])
plt.show()


In [None]:
temp = []
chunksize = 0.5 * (10 ** 7) 
chunknum = 0
with pd.read_csv(cryptic_input_path + "VARIANTS.csv", chunksize=chunksize) as reader:
    for chunk in reader:
        chunknum += 1
        print(chunknum)
        for i, r in chunk.iterrows():
            if r['GENOME_INDEX'] in range(start,end):
                temp.append([r['UNIQUEID'], r['GENOME_INDEX'], r['REF'],r['ALT'],r['MUTATION_TYPE']])
df = pd.DataFrame(temp, columns = ['UNIQUEID', 'GENOME_INDEX','REF','ALT','MUTATION_TYPE'])
df.to_csv(cryptic_output_path + '/' + 'upstream_3660.csv')

In [None]:
print(df)

In [None]:
nick= df[['GENOME_INDEX', 'REF','ALT']]
a = nick.drop_duplicates()

In [None]:
full_sequence[4099380]

In [None]:
df.groupby(['GENOME_INDEX', 'REF', 'ALT']).size()

In [None]:
df.to_csv(cryptic_output_path + '/' + 'nick.csv')

In [None]:
full_sequence[4099376:4099400]

In [None]:
variant_count_dict[4099406]

In [None]:
def max_orf2(seq_start, seq_stop, in_frame):
    max_len = 0
    orf_length = 0
    start_pos = -999
    end_pos = -999
    for frame in ['Forward', 'Reverse']:
        if frame == 'Forward':
            temp = (full_sequence[seq_start: seq_stop])
        else:
            temp = align.reverse_complement(full_sequence[seq_start: seq_stop])
        seq_len = len(temp)
        for i in range(seq_len - 2):
            test_codon = temp[i: i+3] 
            if test_codon in ['ATG','GTG','TTG']:  #Missed out CTG as doesn't seem to be used very much at all
                for j in range(i + 3, seq_len - 2, 3):
                    test_codon_2 = temp[j: j+3] 
                    if test_codon_2 in ['TAG','TGA','TAA']:
                        orf_length = j - i
                        break
                if orf_length > max_len:
                    max_len = orf_length
                    if frame == 'Forward':
                        start_pos = i
                        end_pos = j+3
                        strand = 1
                    else:
                        start_pos = seq_len-(j+3)
                        end_pos = seq_len-i
                        strand = -1
    #print(seq_start+start_pos, seq_start+end_pos, strand)
    #if strand == 1:
    #    print(full_sequence[seq_start+start_pos:seq_start+start_pos+3], full_sequence[seq_start+end_pos-3: seq_start+end_pos])
    #else:
    #    print(align.reverse_complement(full_sequence[seq_start+end_pos-3: seq_start+end_pos]), align.reverse_complement(full_sequence[seq_start+start_pos:seq_start+start_pos+3]))
    
    if start_pos == -999:
        return(0,0,0,(0,0,[0,0,0]))
    else:
        return(seq_start+start_pos, seq_start+end_pos, strand, mutation_bin_probability(seq_start+start_pos, seq_start+end_pos, strand, in_frame=True), big_changes(seq_start+start_pos, seq_start+end_pos, strand))

In [None]:
num_iterations = 0
cds_boundaries = [[0, 0], [4411530, 4411530]]
while num_iterations <= 10:
    temp_boundaries = []
    for i, (x,y) in enumerate((cds_boundaries)):
        if i < len(cds_boundaries) - 1:
            temp = cds_boundaries[i+1][0]
            if temp - y > 100:
                res = max_orf(y, temp, in_frame=True, genbank_format = False)
                #if res[3][0] < 0.005 and res[3][1] > 0 and res[1] - res[0] > 100:
                if res[0] == res[1]:
                    pass
                else:
                    #print(res)
                    temp_boundaries.append([res[0], res[1]])
    cds_boundaries = sorted(cds_boundaries + temp_boundaries, key=lambda x: x[0])
    print("iteration " + str(num_iterations))
    print (cds_boundaries)
    num_iterations += 1

In [None]:
len(cds_boundaries)

In [None]:
print(2**10-1)

In [None]:
[[1,1],[3,4]]+[[1,5],[3,9]]

In [None]:
tot = 0
vsmall = 0
small = 0
for k, v in fast_position_dict.items():
    tot +=1
    temp = len(v)
    if temp < 4:
        vsmall+=1
    if temp < 50:
        small +=1
print(tot, vsmall, small)

In [167]:
max([0,2,1],[2,0,7])

[2, 0, 7]

In [169]:
counts = []
for h,i, j in zip([1,1,1],[0,2,1],[2,0,7]):
    counts.append(h+max(i, j))
print(counts)

[3, 3, 8]
