# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import shutil
from gzip import open as gopen
import subprocess
from concurrent.futures import ProcessPoolExecutor
# seed the rng
np.random.seed(42)

In [2]:
# Go through each simulation directory (simulations/0001 ... simulations/1100) and, for each one:
#   1. shift every transmission time and sample time by one randomly drawn latent period,
#   2. write the shifted ("corrected") transmission network and sample times as uncompressed text,
#   3. run coatran_constant on the corrected files and save the tree it prints on stdout.
# NOTE: the global NumPy RNG is consumed in a fixed order here (one exponential draw, then one
# randint draw, per simulation); reordering these statements changes every downstream result.
for i in range(1,1101):
    # draw one latent period per simulation; the same shift is applied to every event in it
    latent_period = np.random.exponential(2.9/365) # mean is 2.9/365 -- presumably 2.9 days expressed in years; TODO confirm the time unit
    # set up file paths for transmission network
    old_tn_path = f'simulations/{i:04d}/transmission_network.subsampled.txt.gz'
    new_tn_path = f'simulations/{i:04d}/transmission_network.subsampled.corrected.txt'
    # stream the gzipped network (tab-separated: u, v, t) and write it back with t shifted
    with gopen(old_tn_path, 'rt') as old_tn, open(new_tn_path, 'w') as new_tn:
        for line in old_tn:
            u, v, t = line.strip().split('\t')
            # shifted time is written with exactly 6 decimal places
            new_tn.write(f'{u}\t{v}\t{float(t) + latent_period:.6f}\n')
    # set up file paths for sample times
    old_samples_path = f'simulations/{i:04d}/subsample_times.txt.gz'
    new_samples_path = f'simulations/{i:04d}/subsample_times.corrected.txt'
    # stream the gzipped sample times (tab-separated: u, t) and write them back with t shifted
    with gopen(old_samples_path, 'rt') as old_samples, open(new_samples_path, 'w') as new_samples:
        for line in old_samples:
            u, t = line.strip().split('\t')
            # unlike the network file, the shifted time is written with Python's default float formatting
            new_samples.write(f'{u}\t{float(t) + latent_period}\n')
    # setup and run CoaTRan to generate corrected time trees
    # NOTE(review): the trailing '1' is passed straight to coatran_constant; its meaning is not visible here -- confirm against the CoaTran documentation
    command = ['coatran_constant', new_tn_path, new_samples_path, '1']
    # run the tool with a copy of the current environment plus an explicit RNG seed
    env = os.environ.copy()
    coatran_rng_seed = np.random.randint(low=0, high=2**31) # drawn from the seeded NumPy RNG so the run is reproducible
    env["COATRAN_RNG_SEED"] = str(coatran_rng_seed)
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
    if result.returncode != 0:
        # report the failure and carry on with the next simulation; no tree file is written for this one
        print(f"Error executing CoaTran for simulation {i}: {result.stderr.decode()}")
        print(command)
    else:
        # the captured stdout of the tool is saved verbatim as the corrected time tree
        tree_path = f'simulations/{i:04d}/tree.time.subsampled.corrected.nwk'
        with open(tree_path, 'w') as file_handle:
            file_handle.write(result.stdout.decode())

In [3]:
# Build a clean output tree: one folder per resample (resamples/0 ... resamples/999),
# each holding one subfolder per kind of clade analysis result
# (CC, AB, and their "exact" counterparts).
resamples_root = 'resamples'
if os.path.exists(resamples_root):
    # start from scratch: discard the output of any earlier run
    shutil.rmtree(resamples_root)
os.makedirs(resamples_root)
analysis_subdirs = (
    'clade_analyses_CC',
    'clade_analyses_AB',
    'clade_analyses_CC_exact',
    'clade_analyses_AB_exact',
)
for i in range(1000):
    # folder for this resample
    resample_dir = f'resamples/{i}'
    os.makedirs(resample_dir)
    # one results subfolder per analysis type
    for subdir in analysis_subdirs:
        os.makedirs(f'{resample_dir}/{subdir}')

In [4]:
# function to call stableCoalescence_cladeAnalysis.py
# to resample one simulation 1000 times
def resample_simulation(i, seed):
    """Run stableCoalescence_cladeAnalysis.py on the corrected files of one simulation.

    Parameters
    ----------
    i : int
        Simulation number. It is zero-padded to four digits to locate
        ``simulations/<id>/`` and is also passed to the script as ``-id``.
    seed : int
        Value forwarded to the script's ``-seed`` option (converted with ``str``).

    Returns
    -------
    int
        Exit status of the child process (0 means success). Earlier versions
        returned ``None`` and silently ignored failures; a non-zero status is
        now also reported on stdout, mirroring the CoaTran error handling.
    """
    sim_id = f'{i:04d}'
    sim_dir = f'simulations/{sim_id}'
    command = [
        "python3",
        "stableCoalescence_cladeAnalysis.py",
        "-tn", f'{sim_dir}/transmission_network.subsampled.corrected.txt',
        "-tt", f'{sim_dir}/tree.time.subsampled.corrected.nwk',
        "-g", f'{sim_dir}/GEMF_files/output.txt.gz',
        "-s", "0.00092",
        "-m", "1",
        "-id", sim_id,
        "-seed", str(seed)
    ]
    # stdout/stderr of the child are deliberately not captured, as before
    result = subprocess.run(command)
    if result.returncode != 0:
        # do not raise: one failed simulation should not abort the whole batch, but it must be visible
        print(f"Error executing stableCoalescence_cladeAnalysis.py for simulation {i} (exit code {result.returncode})", flush=True)
        print(command, flush=True)
    return result.returncode

In [5]:
# resample each of the 1100 simulations with a deterministic seed
# (the seeds are drawn from the global NumPy RNG, so they depend on the RNG state left by earlier cells)
seeds = np.random.randint(0, 2**31, size=1100)
with ProcessPoolExecutor(max_workers=24) as executor:
    # executor.map only re-raises a worker's exception when its result is retrieved,
    # so the iterator is consumed here; otherwise failures in workers would pass unnoticed
    resample_outcomes = list(executor.map(resample_simulation, range(1, 1101), seeds))

# Primary analysis (3.47-day doubling time, 15% ascertainment rate) tree sizes

In [6]:
def read_clade_results(dir):
    """Read every clade analysis file in a directory.

    Each file name must start with an integer followed by ``_`` (e.g. ``0001_...``);
    that integer becomes the key of the file's entry. Each line of a file has the
    form ``<clade_size> [<subclade_size>, <subclade_size>, ...]``.

    Parameters
    ----------
    dir : str
        Directory holding the clade analysis files. A trailing path separator
        is accepted but no longer required.

    Returns
    -------
    dict
        ``{key: {'clade_sizes': [int, ...], 'subclade_sizes': [[int, ...], ...]}}``
        with one list element per line of the corresponding file.

    Raises
    ------
    ValueError
        If a file name does not start with an integer, or a line cannot be parsed.
    """
    clade_analyses_d = dict()
    for path in sorted(os.listdir(dir)): # pool all clade analyses together
        # os.path.join works with or without a trailing separator on `dir`
        clade_analysis_path = os.path.join(dir, path)
        key = int(path.split('_')[0])
        clade_sizes = []
        subclade_sizes_per_clade = []
        # context manager so the handle is closed promptly (there may be thousands of files)
        with open(clade_analysis_path) as clade_file:
            for line in clade_file:
                l = line.strip().strip(']').split('[')
                clade_sizes.append(int(l[0].strip())) # each clade size (including single leaves) as an integer
                # each subclade size (including single leaves) goes into a list;
                # the `if x` filter makes an empty "[]" yield an empty list instead of failing on int('')
                subclade_sizes_per_clade.append([int(x) for x in l[1].strip().replace(' ', '').split(',') if x])
        clade_analyses_d[key] = {'clade_sizes': clade_sizes, 'subclade_sizes': subclade_sizes_per_clade}
    return clade_analyses_d

In [7]:
def read_root_mutations(dir):
    """Read the integer on the first line of every regular file in a directory.

    Subdirectories are skipped. Each file name must start with an integer
    followed by ``_`` (e.g. ``0001_root_mutations.txt``); that integer becomes
    the key of the file's entry.

    Parameters
    ----------
    dir : str
        Directory to scan. A trailing path separator is accepted but no longer
        required (the same joined path is now used both for the file check and
        for opening the file).

    Returns
    -------
    dict
        ``{key: int}`` mapping the leading integer of each file name to the
        integer stored on the file's first line.

    Raises
    ------
    ValueError
        If a file name does not start with an integer or the first line is not an integer.
    """
    root_mutations_d = dict()
    for path in sorted(os.listdir(dir)):
        root_mutations_path = os.path.join(dir, path)
        if not os.path.isfile(root_mutations_path):
            # e.g. the clade analysis subfolders living next to the root mutation files
            continue
        key = int(path.split('_')[0])
        with open(root_mutations_path) as file:
            root_mutations_d[key] = int(file.readline().strip())
    return root_mutations_d

In [8]:
# Clade analysis result folders of the first resample
clade_analyses_CC_dir = './resamples/0/clade_analyses_CC/'
clade_analyses_AB_dir = './resamples/0/clade_analyses_AB/'

clade_analyses_CC_d = read_clade_results(clade_analyses_CC_dir)

# size of each tree = sum of the sizes of all of its clades listed in the CC analysis
tree_sizes = [sum(entry['clade_sizes']) for entry in clade_analyses_CC_d.values()]
n_below_787 = sum(size < 787 for size in tree_sizes)
n_above_1000 = sum(size > 1000 for size in tree_sizes)
n_above_5000 = sum(size > 5000 for size in tree_sizes)

# fractions are taken over the 1100 simulations
print('Less than 787 taxa: %f' % (n_below_787/1100))
print('More than 1000 taxa: %f' % (n_above_1000/1100))
print('More than 5000 taxa: %f' % (n_above_5000/1100))

Less than 787 taxa: 0.017273
More than 1000 taxa: 0.980909
More than 5000 taxa: 0.953636


# Tree shapes and Bayes factors

In [9]:
# Probabilities of the four candidate MRCA haplotypes under two rooting analyses; the literal
# values are divided by 100, so they are presumably entered as percentages.
# The first two entries are the lineage A/B haplotypes and the last two are C/C and T/T. The cells
# further down only use sum(x[:2]) and sum(x[2:]), so the relative order of the first two entries
# does not affect those results.
# NOTE(review): calculate_bf documents its input order as S_A, S_B, S_{C/C}, S_{T/T}, whereas the
# order stated on the lines below is linB, linA, ... -- confirm which one is correct.
# The same two arrays are assigned again, with identical values, in the Bayes factor cell below.
unconstrained_results = np.array([1.68,80.85, 10.32, 0.92])/100 # stated order: linB, linA, C/C, T/T
recCA_results = np.array([77.28, 8.18, 10.49, 3.71])/100 # stated order: linB, linA, C/C, T/T

In [10]:
def calculate_bf(asr_results, simulation_results):
    """Bayes factor of two introductions versus one introduction.

    Parameters
    ----------
    asr_results : sequence of 4 numbers
        Weights of the MRCA haplotypes, in the order S_A, S_B, S_{C/C}, S_{T/T}.
        They are normalised to sum to 1 inside the function.
    simulation_results : sequence of 3 numbers
        Probabilities of the three simulated topologies, in the order
        (t_p, t_1C, t_2C). t_p is a polytomy, t_1C one clade and t_2C two clades.

    Returns
    -------
    numpy.float64
        Posterior odds of two introductions over one, divided by the prior
        odds (which are 1, as both hypotheses get prior probability 0.5).
    """
    # Haplotype/tree compatibility: one row per haplotype (S_A, S_B, S_CC, S_TT) and
    # one column per tree topology (12 columns; only columns 1, 2 and 3 are ever non-zero).
    lineage_row = [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    two_clade_row = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
    compatibility_matrix = np.array([lineage_row, lineage_row, two_clade_row, two_clade_row])

    # Pr(S_MRCA | tree): one row per tree, normalised over the compatible haplotypes.
    # A tree that is compatible with no haplotype keeps its all-zero row.
    per_tree_rows = []
    for tree_row in compatibility_matrix.T:
        row_total = sum(tree_row)
        if row_total > 0:
            per_tree_rows.append(tree_row/row_total)
        else:
            per_tree_rows.append(tree_row)
    pr_s_mrca_given_tree = np.array(per_tree_rows)

    # Haplotype weights normalised to sum to 1 (order: S_A, S_B, S_{C/C}, S_{T/T})
    pr_s_mrca_given_data = np.array(asr_results)/sum(asr_results)
    haplotype_weights = pr_s_mrca_given_data.copy()

    # Tree probabilities under each hypothesis, padded with zeros to the 12 tree columns:
    # one introduction uses the three simulated topologies in columns 0-2,
    # two introductions put the first simulated probability in column 3.
    pr_3_topos = np.array(simulation_results)
    pr_trees_given_I1 = np.concatenate([pr_3_topos, np.array([0]*9)])
    pr_trees_given_I2 = np.concatenate([np.array([0]*3), np.array([simulation_results[0]]), np.array([0]*8)])

    # Equal prior probability of 1 or 2 introductions
    pr_I1 = 0.5
    pr_I2 = 0.5

    # Joint probability of each haplotype and each hypothesis:
    # dot product of P(haplotype | tree) with P(tree | I_n), scaled by P(I_n)
    haplotype_columns = pr_s_mrca_given_tree.T
    pr_s_mrca_and_I1 = np.array([np.dot(haplotype_columns[h], pr_trees_given_I1) for h in range(4)]) * pr_I1
    pr_s_mrca_and_I2 = np.array([np.dot(haplotype_columns[h], pr_trees_given_I2) for h in range(4)]) * pr_I2

    support_two_intros = np.dot(haplotype_weights, pr_s_mrca_and_I2)
    support_one_intro = np.dot(haplotype_weights, pr_s_mrca_and_I1)
    posterior_odds = support_two_intros / support_one_intro
    prior_odds = pr_I2/pr_I1

    return posterior_odds/prior_odds



In [11]:
def test_sizes(run_1,run_2,interval):
    # read samples into seperate sets for each run
    sample_dict = {run_1: set(), run_2: set()}
    # read transmissions into one list for both runs
    transmission_list = []
    for run in [run_1, run_2]:
        with open(f'simulations/{run:04d}/subsample_times.corrected.txt', 'r') as file:
            for line in file:
                u, t = line.strip().split('\t')
                sample_dict[run].add(u)
        with open(f'simulations/{run:04d}/transmission_network.subsampled.corrected.txt', 'r') as file:
            for line in file:
                u,v,t = line.strip().split()
                if u != v:
                    if run == run_2:
                        t = float(t) + interval
                    else:
                        t = float(t)
                    transmission_list.append((v,t,run))
    # sort the transmissions
    transmission_list = sorted(transmission_list, key=lambda x: x[1])
    # start sample counts from -1 to compensate for the primary sample
    n_valid_samples = {run_1: -1, run_2: -1}
    # got through the transmissions from earliest to latest
    for event_no, transmission in enumerate(transmission_list):
        # check if the transmission is amongst the samples of its run
        if transmission[0] in sample_dict[transmission[2]]:
            # increment the count for that run
            n_valid_samples[transmission[2]] += 1
        # stop after the first 50,000
        if event_no == 50000:
            break
    # compare the numbers of sampled transmissions to the relative size condition 
    if 0.3 <= n_valid_samples[run_1]/(n_valid_samples[run_1] + n_valid_samples[run_2]) <= 0.7:
        return True
    else:
        return False

In [12]:
def clade_analysis_updated(clade_analyses_CC_dir, clade_analyses_AB_dir, clade_analyses_CC_exact_dir, clade_analyses_AB_exact_dir, root_mutations_dir, label, expected_coalescence_range, expected_introduction_range, seed, min_polytomy_size=100, _print_=False):
    """Frequencies of the C/C and A/B tree shapes under one and under two introductions.

    Parameters
    ----------
    clade_analyses_CC_dir, clade_analyses_AB_dir : str
        Directories read with ``read_clade_results`` (relaxed constraint).
    clade_analyses_CC_exact_dir, clade_analyses_AB_exact_dir : str
        Directories read with ``read_clade_results`` (exact constraint).
    root_mutations_dir : str
        Directory read with ``read_root_mutations``.
    label : str
        Not used by the function; kept for backward compatibility.
    expected_coalescence_range : sequence of float
        Values of the time between the MRCA and the first introduction (outer grid axis).
    expected_introduction_range : sequence of float
        Values of the time between the two introductions (inner grid axis).
    seed : int
        Seed of the private ``numpy.random.Generator`` used for the two-introduction draws.
    min_polytomy_size : int, default 100
        Minimum number of lineages descending from a node for it to count as a
        polytomy. It is now also applied to the two-introduction draws, which
        previously hard-coded 100 (identical for the default value).
    _print_ : bool, default False
        Not used by the function; kept for backward compatibility.

    Returns
    -------
    list
        ``[CC_two_intros, AB_two_intros, cc_result, ab_result,
        CC_exact_two_intros, AB_exact_two_intros, cc_exact_result, ab_exact_result]``.
        The ``*_two_intros`` entries are nested lists indexed
        ``[coalescence][introduction]``; the ``*_result`` entries are floats.
        Every value is a count divided by 1100.
    """
    rng = np.random.default_rng(seed)
    n_simulations = 1100 # number of simulations; also the number of draws per grid cell and the divisor of every count

    clade_analyses_CC_d = read_clade_results(clade_analyses_CC_dir)
    clade_analyses_AB_d = read_clade_results(clade_analyses_AB_dir)
    clade_analyses_CC_exact_d = read_clade_results(clade_analyses_CC_exact_dir)
    clade_analyses_AB_exact_d = read_clade_results(clade_analyses_AB_exact_dir)
    root_mutations_d = read_root_mutations(root_mutations_dir)

    # C/C analysis
    cc_count = 0 # how often are there only two 1-mutation clades, each constituting more than 30% of taxa AND with a polytomy at the base of each clade, and no other descendants from the root?
    cc_exact_count = 0
    for run in clade_analyses_CC_d:
        clade_sizes = clade_analyses_CC_d[run]['clade_sizes']
        if len(clade_sizes) != 2: # make sure there are only two clades
            continue
        if not min(clade_sizes) > (sum(clade_sizes)*0.30): # make sure each clade is >30%
            continue
        subclade_sizes = clade_analyses_CC_d[run]['subclade_sizes']
        if len(subclade_sizes[0]) >= min_polytomy_size and len(subclade_sizes[1]) >= min_polytomy_size: # each clade must have a polytomy at the base
            cc_count += 1
            if len(clade_analyses_CC_exact_d[run]['clade_sizes']) == 2:
                cc_exact_count += 1

    # A/B analysis
    ab_count = 0 # interested in 2 mutations clade that are at least 30% of all taxa + has a basal polytomy + polytomy at 2 mutation clade
    ab_exact_count = 0
    lower_constraint = 0.3 # the 2-mutation clade must be at least 30% of all taxa
    upper_constraint = 0.7 # the 2-mutation clade must be at most 70% of all taxa

    for run in clade_analyses_AB_d:
        num_leaves = sum(clade_analyses_CC_d[run]['clade_sizes'])
        base_polytomy_size = len(clade_analyses_CC_d[run]['clade_sizes']) # check how many lineages descend from the root
        clade_sizes = clade_analyses_AB_d[run]['clade_sizes']
        subclade_sizes = clade_analyses_AB_d[run]['subclade_sizes']
        clade_sizes_exact = clade_analyses_AB_exact_d[run]['clade_sizes']
        subclade_sizes_exact = clade_analyses_AB_exact_d[run]['subclade_sizes']
        if not clade_sizes: # no 2 mutation clades
            continue
        if base_polytomy_size >= min_polytomy_size: # basal polytomy
            for index, clade_size in enumerate(clade_sizes): # loop through all 2 mutation clades
                if lower_constraint*num_leaves <= clade_size <= upper_constraint*num_leaves: # clade match size restrictions
                    if len(subclade_sizes[index]) >= min_polytomy_size: # polytomy at 2 mutation clade
                        ab_count += 1 # if all conditions are met, add 1 to the count
                        break # one qualifying 2 mutation clade is enough for this run
            for index, clade_size in enumerate(clade_sizes_exact): # same test on the exact-constraint clades
                if lower_constraint*num_leaves <= clade_size <= upper_constraint*num_leaves: # clade match size restrictions
                    if len(subclade_sizes_exact[index]) >= min_polytomy_size: # polytomy at 2 mutation clade
                        ab_exact_count += 1 # if all conditions are met, add 1 to the count
                        break # one qualifying 2 mutation clade is enough for this run

    # Two-introduction analysis over the (coalescence, introduction) grid
    CC_two_intros = []
    AB_two_intros = []
    CC_exact_two_intros = []
    AB_exact_two_intros = []
    for expected_coalescence in expected_coalescence_range:
        CC_two_intros_v = []
        AB_two_intros_v = []
        CC_exact_two_intros_v = []
        AB_exact_two_intros_v = []
        for expected_introduction in expected_introduction_range:
            AB = 0
            CC = 0
            AB_exact = 0
            CC_exact = 0
            # the times are used as fixed values (not drawn from an exponential distribution)
            coalescence_time = expected_coalescence
            introduction_interval = expected_introduction
            # Poisson means of the mutation counts on the two stems; loop-invariant, so computed once.
            # The expressions are kept in their original form so the floating-point values are unchanged.
            # presumably 0.00092 is the substitution rate and 29903 the genome length -- TODO confirm
            mean_mut1 = coalescence_time*0.00092*29903
            mean_mut2 = (coalescence_time+introduction_interval)*0.00092*29903
            for sample in range(n_simulations):
                # the order of these three RNG calls must not change (reproducibility)
                mut1 = rng.poisson(mean_mut1)
                mut2 = rng.poisson(mean_mut2)
                run_1, run_2 = rng.choice(range(1, 1101), 2, replace=True)
                clade_sizes_1 = clade_analyses_CC_d[run_1]['clade_sizes'] # get the clade sizes
                clade_sizes_2 = clade_analyses_CC_d[run_2]['clade_sizes'] # get the clade sizes
                # both trees need a basal polytomy: at least min_polytomy_size descendants of the root, including individual leaves
                if len(clade_sizes_1) >= min_polytomy_size and len(clade_sizes_2) >= min_polytomy_size:
                    if test_sizes(run_1, run_2, introduction_interval):
                        mut1 += root_mutations_d[run_1]
                        mut2 += root_mutations_d[run_2]
                        if 0 in [mut1,mut2] and (mut1>1 or mut2>1):
                            # one introduction sits at the MRCA, the other is two or more mutations away
                            AB += 1
                            if (mut1 == 2 or mut2 == 2):
                                AB_exact += 1
                        elif mut1 > 0 and mut2 > 0:
                            # both introductions are at least one mutation away from the MRCA
                            CC += 1
                            if (mut1 == 1 and mut2 == 1):
                                CC_exact += 1

            CC_two_intros_v.append(CC/n_simulations)
            AB_two_intros_v.append(AB/n_simulations)
            CC_exact_two_intros_v.append(CC_exact/n_simulations)
            AB_exact_two_intros_v.append(AB_exact/n_simulations)
        CC_two_intros.append(CC_two_intros_v)
        AB_two_intros.append(AB_two_intros_v)
        CC_exact_two_intros.append(CC_exact_two_intros_v)
        AB_exact_two_intros.append(AB_exact_two_intros_v)

    cc_result = cc_count/n_simulations
    ab_result = ab_count/n_simulations
    cc_exact_result = cc_exact_count/n_simulations
    ab_exact_result = ab_exact_count/n_simulations

    return [CC_two_intros, AB_two_intros, cc_result, ab_result, CC_exact_two_intros, AB_exact_two_intros, cc_exact_result, ab_exact_result]


### Main result

In [13]:
# Grid of time values explored on both axes; each non-zero entry is a whole number of
# days divided by 365 (the result tables print int(x*365) as days).
delay_days = (1, 2, 4, 7, 14, 28)
expected_coalescence_range = [0] + [days/365 for days in delay_days]
expected_introduction_range = list(expected_coalescence_range)
# collect clade analysis results
def process_clade_analysis(i, seed):
    """Run clade_analysis_updated on the output folders of resample ``i``.

    ``seed`` is forwarded unchanged; the result of clade_analysis_updated is returned as is.
    """
    resample_dir = f'./resamples/{i}/'
    # relaxed and exact result folders, in the positional order clade_analysis_updated expects
    analysis_dirs = [f'{resample_dir}clade_analyses_{kind}/' for kind in ('CC', 'AB', 'CC_exact', 'AB_exact')]
    return clade_analysis_updated(
        *analysis_dirs,
        resample_dir, # the root mutation files sit directly in the resample folder
        '3.5 DT',
        expected_coalescence_range,
        expected_introduction_range,
        seed,
        min_polytomy_size=100,
    )

# Run the clade analysis for each of the 1000 resamples in parallel, then average over resamples.
with ProcessPoolExecutor(max_workers=24) as executor:
    # one seed per resample, drawn from the global NumPy RNG
    seeds = np.random.randint(0, 2**31, size=1000)
    # Map process_clade_analysis function across the range of values;
    # list() retrieves every result, so an exception in a worker is raised here
    results = list(executor.map(process_clade_analysis, range(1000), seeds))

# Regroup the per-resample 8-item results into one sequence per quantity.
# (The former bare `np.array(...)` statements discarded their results and have been removed;
# np.mean accepts the nested sequences directly.)
cc2_all, ab2_all, cc_all, ab_all, cc2_exact_all, ab2_exact_all, cc_exact_all, ab_exact_all = zip(*results)

# Two-introduction frequencies: element-wise mean over resamples, giving one
# (coalescence x introduction) grid each. Single-introduction frequencies: scalar means.
cc2 = np.mean(cc2_all, axis = 0)
ab2 = np.mean(ab2_all, axis = 0)
cc = np.mean(cc_all)
ab = np.mean(ab_all)
cc2_exact = np.mean(cc2_exact_all, axis = 0)
ab2_exact = np.mean(ab2_exact_all, axis = 0)
cc_exact = np.mean(cc_exact_all)
ab_exact = np.mean(ab_exact_all)

In [14]:
# Setup the MRCA haplotype posterior probabilities for the Bayes factor calculation
unconstrained_results = np.array([1.68, 80.85, 10.32, 0.92])/100
recCA_results = np.array([77.28, 8.18, 10.49, 3.71])/100

def _print_bf_table(asr_results, ab_two_intros, cc_two_intros, ab_one_intro, cc_one_intro):
    """Print one tab-separated table of Bayes factors (rows: t1, columns: t2).

    asr_results   -- four haplotype probabilities; the first two weight the A/B
                     frequencies and the last two weight the C/C frequencies
    ab_two_intros -- grid of A/B frequencies under two introductions, indexed [t1][t2]
    cc_two_intros -- grid of C/C frequencies under two introductions, indexed [t1][t2]
    ab_one_intro  -- A/B frequency under a single introduction
    cc_one_intro  -- C/C frequency under a single introduction
    """
    weight_ab = sum(asr_results[:2])
    weight_cc = sum(asr_results[2:])
    # the single-introduction term is the same for every cell of the table
    one_intro_support = weight_ab*ab_one_intro + weight_cc*cc_one_intro
    print('\tt2')
    print('t1\t' + '\t'.join([str(int(x*365)) for x in expected_introduction_range]))
    for i in range(len(expected_coalescence_range)):
        bf = []
        for j in range(len(expected_introduction_range)):
            bf.append((weight_ab*ab_two_intros[i][j] + weight_cc*cc_two_intros[i][j]) / one_intro_support)
        print(f'{int(expected_coalescence_range[i]*365):d}\t' + '\t'.join([f'{x:.2f}' for x in bf]))

print('Bayes factors for combinations of')
print(' - expected time between MRCA and first introduction (t1 days); and')
print(' - expected time between introductions (t2 days).\n')
print('With the recCA rooting, and')
print('- the relaxed separation constraint (two or more mutations)')
_print_bf_table(recCA_results, ab2, cc2, ab, cc)
print('\n- the strict separation constraint (exactly two mutations)')
_print_bf_table(recCA_results, ab2_exact, cc2_exact, ab_exact, cc_exact)
print('\nWith the unconstrained rooting, and')
print('- the relaxed separation constraint (two or more mutations)')
_print_bf_table(unconstrained_results, ab2, cc2, ab, cc)
print('\n- the strict separation constraint (exactly two mutations)')
_print_bf_table(unconstrained_results, ab2_exact, cc2_exact, ab_exact, cc_exact)

Bayes factors for combinations of
 - expected time between MRCA and first introduction (t1 days); and
 - expected time between introductions (t2 days).

With the recCA rooting, and
- the relaxed separation constraint (two or more mutations)
	t2
t1	0	1	2	4	7	14	28
0	0.21	0.22	0.23	0.24	0.26	0.25	0.14
1	0.22	0.23	0.23	0.25	0.27	0.26	0.14
2	0.24	0.25	0.25	0.26	0.27	0.25	0.13
4	0.26	0.27	0.27	0.28	0.28	0.26	0.13
7	0.28	0.29	0.29	0.29	0.29	0.25	0.12
14	0.30	0.29	0.29	0.29	0.27	0.22	0.10
28	0.25	0.24	0.24	0.23	0.21	0.17	0.08

- the strict separation constraint (exactly two mutations)
	t2
t1	0	1	2	4	7	14	28
0	0.16	0.17	0.17	0.18	0.18	0.15	0.05
1	0.17	0.17	0.17	0.18	0.18	0.15	0.05
2	0.18	0.18	0.18	0.18	0.18	0.14	0.04
4	0.19	0.18	0.18	0.18	0.17	0.13	0.04
7	0.18	0.18	0.18	0.17	0.16	0.11	0.03
14	0.14	0.14	0.13	0.12	0.10	0.07	0.02
28	0.05	0.04	0.04	0.04	0.03	0.02	0.00

With the unconstrained rooting, and
- the relaxed separation constraint (two or more mutations)
	t2
t1	0	1	2	4	7	14	28
0	0.21	0.21

In [15]:
def _print_bf_markdown_table(title, asr_results, ab_two_intros, cc_two_intros, ab_one_intro, cc_one_intro):
    """Print one Bayes factor table as markdown (rows: t1, columns: t2).

    title         -- heading line printed above the table
    asr_results   -- four haplotype probabilities; the first two weight the A/B
                     frequencies and the last two weight the C/C frequencies
    ab_two_intros -- grid of A/B frequencies under two introductions, indexed [t1][t2]
    cc_two_intros -- grid of C/C frequencies under two introductions, indexed [t1][t2]
    ab_one_intro  -- A/B frequency under a single introduction
    cc_one_intro  -- C/C frequency under a single introduction
    """
    print(title)
    # Header
    print('|  |', end='')
    print(' | '.join(f'$t_2$ = {int(x*365)} days' for x in expected_introduction_range), '|')
    print('|---|' + '|'.join([':---:' for _ in expected_introduction_range]) + '|')

    # Rows
    weight_ab = sum(asr_results[:2])
    weight_cc = sum(asr_results[2:])
    # the single-introduction term is the same for every cell of the table
    denominator = weight_ab * ab_one_intro + weight_cc * cc_one_intro
    for i in range(len(expected_coalescence_range)):
        bf = []
        for j in range(len(expected_introduction_range)):
            numerator = weight_ab * ab_two_intros[i][j] + weight_cc * cc_two_intros[i][j]
            bf.append(numerator / denominator)
        # Row print with formatting
        print(f'| $t_1$ **= {int(expected_coalescence_range[i]*365)} days** |', end='')
        print(' | '.join(f'{x:.2f}' for x in bf), '|')

# relaxed constraint first, then the strict one, each for both rootings
_print_bf_markdown_table('### recCA - relaxed', recCA_results, ab2, cc2, ab, cc)
_print_bf_markdown_table('### unconstrained - relaxed', unconstrained_results, ab2, cc2, ab, cc)
_print_bf_markdown_table('### recCA - strict', recCA_results, ab2_exact, cc2_exact, ab_exact, cc_exact)
_print_bf_markdown_table('### unconstrained - strict', unconstrained_results, ab2_exact, cc2_exact, ab_exact, cc_exact)

### recCA - relaxed
|  |$t_2$ = 0 days | $t_2$ = 1 days | $t_2$ = 2 days | $t_2$ = 4 days | $t_2$ = 7 days | $t_2$ = 14 days | $t_2$ = 28 days |
|---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| $t_1$ **= 0 days** |0.21 | 0.22 | 0.23 | 0.24 | 0.26 | 0.25 | 0.14 |
| $t_1$ **= 1 days** |0.22 | 0.23 | 0.23 | 0.25 | 0.27 | 0.26 | 0.14 |
| $t_1$ **= 2 days** |0.24 | 0.25 | 0.25 | 0.26 | 0.27 | 0.25 | 0.13 |
| $t_1$ **= 4 days** |0.26 | 0.27 | 0.27 | 0.28 | 0.28 | 0.26 | 0.13 |
| $t_1$ **= 7 days** |0.28 | 0.29 | 0.29 | 0.29 | 0.29 | 0.25 | 0.12 |
| $t_1$ **= 14 days** |0.30 | 0.29 | 0.29 | 0.29 | 0.27 | 0.22 | 0.10 |
| $t_1$ **= 28 days** |0.25 | 0.24 | 0.24 | 0.23 | 0.21 | 0.17 | 0.08 |
### unconstrained - relaxed
|  |$t_2$ = 0 days | $t_2$ = 1 days | $t_2$ = 2 days | $t_2$ = 4 days | $t_2$ = 7 days | $t_2$ = 14 days | $t_2$ = 28 days |
|---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| $t_1$ **= 0 days** |0.21 | 0.21 | 0.22 | 0.23 | 0.25 | 0.24 | 0.13 |
| $t_1$ **= 1 days** |0.22 | 0.22 | 

In [16]:
# Tally the integer stored in every root mutation file (1000 resamples x 1100 simulations).
# Bins 0, 1 and 2 hold exact values; the last bin collects every value of 3 or more.
mutation_counts = [0,0,0,0]
for i in range(1000):
    for j in range(1,1101):
        path = f'resamples/{i}/{j:04d}_root_mutations.txt'
        with open(path) as file:
            n = int(file.readline().strip())
        # values above 3 share the last bin
        mutation_counts[min(n, 3)] += 1
# n/1000/11 turns a count out of 1000*1100 files into a percentage
for muts, n in enumerate(mutation_counts):
    share = n/1000/11
    print(f'{muts} mutation(s): {share:.1f}%')
    

0 mutation(s): 50.5%
1 mutation(s): 26.0%
2 mutation(s): 12.4%
3 mutation(s): 11.0%


In [17]:
# Report how often the AB topology occurs under one and under two introductions,
# first with the relaxed and then with the strict separation constraint.
print('AB topology frequencies')
print('')
for heading, single_intro_freq, two_intro_freqs in (
    ('With the relaxed separation constraint (two or more mutations)', ab, ab2),
    ('\nWith the strict separation constraint (exactly two mutations)', ab_exact, ab2_exact),
):
    print(heading)
    print(f'Single introduction frequency: {single_intro_freq*100:.1f}%')
    print('Two introduction frequencies:')
    # column header: t2 in days
    print('\tt2')
    print('t1\t' + '\t'.join([str(int(x*365)) for x in expected_introduction_range]))
    # one row per t1 value (in days), frequencies shown as percentages
    for i in range(len(expected_coalescence_range)):
        t1_days = int(expected_coalescence_range[i]*365)
        print(f'{t1_days:d}\t' + '\t'.join([f'{x*100:.1f}%' for x in two_intro_freqs[i]]))

AB topology frequencies

With the relaxed separation constraint (two or more mutations)
Single introduction frequency: 3.3%
Two introduction frequencies:
	t2
t1	0	1	2	4	7	14	28
0	0.6%	0.6%	0.6%	0.6%	0.7%	0.7%	0.3%
1	0.6%	0.6%	0.6%	0.7%	0.7%	0.6%	0.3%
2	0.6%	0.7%	0.7%	0.7%	0.7%	0.6%	0.3%
4	0.7%	0.7%	0.7%	0.7%	0.7%	0.6%	0.3%
7	0.7%	0.7%	0.7%	0.7%	0.7%	0.5%	0.2%
14	0.6%	0.6%	0.6%	0.6%	0.5%	0.4%	0.1%
28	0.3%	0.3%	0.3%	0.3%	0.2%	0.2%	0.1%

With the strict separation constraint (exactly two mutations)
Single introduction frequency: 2.5%
Two introduction frequencies:
	t2
t1	0	1	2	4	7	14	28
0	0.4%	0.4%	0.4%	0.4%	0.4%	0.4%	0.1%
1	0.4%	0.4%	0.4%	0.4%	0.4%	0.3%	0.1%
2	0.4%	0.4%	0.4%	0.4%	0.4%	0.3%	0.1%
4	0.4%	0.4%	0.4%	0.4%	0.4%	0.3%	0.1%
7	0.4%	0.4%	0.4%	0.4%	0.3%	0.2%	0.1%
14	0.3%	0.3%	0.3%	0.3%	0.2%	0.1%	0.0%
28	0.1%	0.1%	0.1%	0.1%	0.1%	0.0%	0.0%


In [18]:
# Report how often the CC topology occurs under one and under two introductions,
# first with the relaxed and then with the strict separation constraint.
print('CC topology frequencies')
print('')
for heading, single_intro_freq, two_intro_freqs in (
    ('With the relaxed separation constraint (two or more mutations)', cc, cc2),
    ('\nWith the strict separation constraint (exactly two mutations)', cc_exact, cc2_exact),
):
    print(heading)
    print(f'Single introduction frequency: {single_intro_freq*100:.1f}%')
    print('Two introduction frequencies:')
    # column header: t2 in days
    print('\tt2')
    print('t1\t' + '\t'.join([str(int(x*365)) for x in expected_introduction_range]))
    # one row per t1 value (in days), frequencies shown as percentages
    for i in range(len(expected_coalescence_range)):
        t1_days = int(expected_coalescence_range[i]*365)
        print(f'{t1_days:d}\t' + '\t'.join([f'{x*100:.1f}%' for x in two_intro_freqs[i]]))

CC topology frequencies

With the relaxed separation constraint (two or more mutations)
Single introduction frequency: 0.2%
Two introduction frequencies:
	t2
t1	0	1	2	4	7	14	28
0	0.7%	0.8%	0.9%	1.0%	1.1%	1.2%	0.9%
1	0.9%	1.0%	1.0%	1.1%	1.2%	1.3%	0.9%
2	1.0%	1.0%	1.1%	1.2%	1.3%	1.3%	0.9%
4	1.2%	1.3%	1.3%	1.4%	1.5%	1.5%	1.0%
7	1.6%	1.6%	1.6%	1.7%	1.8%	1.7%	1.0%
14	2.2%	2.2%	2.3%	2.3%	2.3%	2.1%	1.1%
28	2.9%	2.9%	2.9%	2.9%	2.9%	2.4%	1.3%

With the strict separation constraint (exactly two mutations)
Single introduction frequency: 0.1%
Two introduction frequencies:
	t2
t1	0	1	2	4	7	14	28
0	0.2%	0.3%	0.3%	0.3%	0.3%	0.2%	0.1%
1	0.3%	0.3%	0.3%	0.3%	0.3%	0.2%	0.1%
2	0.3%	0.3%	0.3%	0.3%	0.3%	0.2%	0.1%
4	0.3%	0.3%	0.4%	0.3%	0.3%	0.2%	0.1%
7	0.4%	0.4%	0.4%	0.3%	0.3%	0.2%	0.0%
14	0.3%	0.3%	0.3%	0.3%	0.2%	0.1%	0.0%
28	0.1%	0.1%	0.1%	0.1%	0.1%	0.0%	0.0%
