In [3]:
import timeit
import time
from pathlib import Path
import random
import sys
sys.path.append(str(Path.cwd().parent / "classes"))
sys.path.append(str(Path.cwd().parent))

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats

from sim_config import SimConfiguration
from simulation import Simulation
from msa import Msa


# Directory (relative to the current working directory) that the benchmark loop scans.
# Each tree file is expected to sit next to a FASTA file sharing its stem ("<stem>.fasta").
TREES_PATH = Path.cwd() / "other_mammals_data"

In [None]:
def _ungapped_sequence_lengths(fasta_text):
    """Return the gap-free length of every non-empty record in FASTA text.

    Lines starting with ">" open a new record, and the residue lines that
    follow are summed into that record, so wrapped sequences are measured as
    one sequence. Gap characters ("-") are not counted. Text without any ">"
    header is treated as one sequence per non-empty line.
    """
    lines = [line.strip() for line in fasta_text.splitlines()]
    lines = [line for line in lines if line]

    if not any(line.startswith(">") for line in lines):
        lengths = [len(line.replace("-", "")) for line in lines]
    else:
        lengths = []
        for line in lines:
            if line.startswith(">"):
                lengths.append(0)
            elif lengths:
                # Residue lines before the first header belong to no record.
                lengths[-1] += len(line.replace("-", ""))

    # Records that hold no residues carry no length information.
    return [length for length in lengths if length > 0]


def refresh_sim(tree_file, seed=None):
    """Build a fresh ``Simulation`` for ``tree_file`` with random parameters.

    The root sequence length is drawn uniformly between 80% of the shortest
    and 110% of the longest ungapped sequence found in the FASTA file that
    sits next to the tree ("<stem>.fasta"). The indel rate and the indel
    length alpha are drawn uniformly as well.

    Args:
        tree_file: Path (str or ``Path``) of the tree file.
        seed: Optional integer. When given, it is passed to the simulation
            and also drives the parameter draws, so the call is reproducible.
            When omitted, a time-derived seed is used and the draws come from
            the module-level ``random`` generator.

    Returns:
        A ``Simulation`` built from the tree and the drawn configuration.

    Raises:
        ValueError: if the FASTA file contains no sequence residues.
        OSError: if the sibling FASTA file cannot be read.
    """
    tree_path = Path(tree_file)
    print(tree_file)
    fasta_path = (tree_path.parent / (tree_path.stem + ".fasta")).resolve()

    sequences_length = _ungapped_sequence_lengths(fasta_path.read_text())
    if not sequences_length:
        raise ValueError(f"No sequences found in {fasta_path}")

    # Never allow an empty root sequence, and keep the range well-formed.
    min_seq_len = max(1, int(min(sequences_length) * 0.8))
    max_seq_len = max(min_seq_len, int(max(sequences_length) * 1.1))

    if seed is None:
        rng = random
    else:
        rng = random.Random(seed)

    root_sequence_length = rng.randint(min_seq_len, max_seq_len)
    indel_rate = rng.uniform(0.0001, 0.05)
    length_param_alpha = rng.uniform(1.01, 2.0)
    if seed is None:
        seed = time.time_ns() % 1000000
    print("seed=", seed, "indel_rate=", indel_rate, "Root length=", root_sequence_length)

    sim_config = SimConfiguration(
        original_sequence_length=root_sequence_length,
        indel_length_alpha=length_param_alpha,
        indel_truncated_length=50,
        rate_ins=indel_rate,
        rate_del=indel_rate,
        deletion_extra_edge_length=50,
        switch_factor=200,
        seed=seed,
    )
    return Simulation(input_tree=str(tree_file), config=sim_config)

In [16]:
# Benchmark: for every tree in TREES_PATH, build fresh simulations and time
# both MSA construction methods. One mean (seconds) per tree goes into time_log.
N_REPEATS = 10  # simulations timed per tree; each one draws new random parameters

time_log = {"blocklist": [],
            "naive": []
            }

# sorted() keeps the row order of time_log reproducible; iterdir() yields
# entries in arbitrary order.
for tree in sorted(TREES_PATH.iterdir()):
    if tree.suffix == ".fasta":
        continue
    # refresh_sim reads "<stem>.fasta" next to the tree, so anything without
    # one (sub-directories, checkpoint folders, stray files) cannot be simulated.
    if not tree.is_file() or not (tree.parent / (tree.stem + ".fasta")).is_file():
        print("Skipping", tree.name, "(not a tree file with a matching .fasta)")
        continue
    print(tree.stem, tree.suffix)
    times_of_list, times_of_naive = [], []
    for _ in range(N_REPEATS):
        current_sim = refresh_sim(str(tree))

        # NOTE(review): blocklist is always timed first on the fresh simulation;
        # if Simulation does lazy work on its first call, that cost lands here — confirm.
        blocklist_time = timeit.timeit(current_sim.msa_from_blocklist, number=1)
        times_of_list.append(blocklist_time)
        print("Blocklist time","is", blocklist_time, "sec")

        naive_time = timeit.timeit(current_sim.msa_from_naive, number=1)
        print("Naive_time","is", naive_time, "sec")
        times_of_naive.append(naive_time)
    time_log["blocklist"].append(np.mean(times_of_list))
    time_log["naive"].append(np.mean(times_of_naive))



7288_NT_AL_AA .tree
seed= 700234 indel_rate= 0.022328786513222287 Root length= 44
Blocklist time is 0.007348815001023468 sec
Naive_time is 0.003764872999454383 sec
seed= 429643 indel_rate= 0.01580200808458792 Root length= 3
Blocklist time is 0.001623991000087699 sec
Naive_time is 0.0005837019998580217 sec
seed= 378316 indel_rate= 0.001983561370102874 Root length= 1
Blocklist time is 0.000930523998249555 sec
Naive_time is 0.0004488689992285799 sec
seed= 83122 indel_rate= 0.00560763243256791 Root length= 66
Blocklist time is 0.008428118999290746 sec
Naive_time is 0.007459608001227025 sec
seed= 577500 indel_rate= 0.009286173058818232 Root length= 60
Blocklist time is 0.007760905002214713 sec
Naive_time is 0.006154295002488652 sec
seed= 385215 indel_rate= 0.03811995879755317 Root length= 17
Blocklist time is 0.002810394998959964 sec
Naive_time is 0.0030009570000402164 sec
seed= 456032 indel_rate= 0.020059722418349708 Root length= 14
Blocklist time is 0.0023893079996923916 sec


ValueError: max() arg is an empty sequence

In [24]:
# Mean timings in seconds: one row per processed tree, columns "blocklist" and "naive".
# NOTE(review): the saved output of the benchmark cell ends in a ValueError, while this
# table has ten complete rows — presumably from a different run; confirm with Restart & Run All.
pd.DataFrame(time_log)

Unnamed: 0,blocklist,naive
0,0.025547,0.281615
1,0.025084,0.268677
2,0.025119,0.280378
3,0.025639,0.269029
4,0.025642,0.295416
5,0.0248,0.266108
6,0.024613,0.260383
7,0.025138,0.27059
8,0.024862,0.241323
9,0.024395,0.226481
