<a href="https://colab.research.google.com/gist/mmore500/a2e88e7c239935c362ec59c6b5a3f7b5/reconstruction-quality-experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Procedure:

For each experimental replicate per treatment,
- Navigate to <https://colab.research.google.com/gist/mmore500/a2e88e7c239935c362ec59c6b5a3f7b5> to open a fresh copy of the experiment notebook. **Open a fresh notebook copy for each treatment.**
- Click on the filename at the top left of the Colab page (`a2e88e7c239935c362ec59c6b5a3f7b5`) and rename it according to the template
  - `evo=island{num_islands}-niche{num_niches}-ngen{num_generations}-popsize{population_size}-tournsize{tournament_size}+instrument={"steady"|"tilted"}-{"old"|"new"}-bits{annotation_size_bits}-diff{differentia_width}+replicate={replicate}+ext=.ipynb`.
  - For example, `evo=island1-niche1-ngen10000-popsize1024-tournsize2+instrument=steady-old-bits64-diff1+replicate=0+ext=.ipynb`.
- Configure variables in the "Configure Experiment" section.
- On the top menu, click `Runtime > Restart session and run all` if available, otherwise `Runtime > Run all`.
- Wait for final cell's execution to complete.
- Record configured variables and results from "Evaluate Reconstruction" section in [results spreadsheet](https://docs.google.com/spreadsheets/d/1ZhS4NDTDyBiwmwtWrZO5L06MGB3lhmp2-5ZzClhEwPU/edit?usp=sharing).
- On the top menu, click `File > Download > Download .ipynb`.
- Upload ipynb file to treatment directory at <https://osf.io/n4b2g/>, named same as notebook, except excluding `+replicate={replicate}+ext=.ipynb`.
  - The treatment directory should contain one notebook for each replicate of that treatment.


## Set Up Environment

In [None]:
import os
import typing
import uuid

import alifedata_phyloinformatics_convert as apc
from Bio import Phylo
from hstrat import hstrat
from hstrat import _auxiliary_lib as hstrat_aux
from hsurf import hsurf
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import tqdist
from tqdm.notebook import tqdm

## Configure Experiment

In [None]:
# os.environ["annotation_size_bits"] = "64"
# os.environ["differentia_width_bits"] = "1"
# os.environ["downsample"] = "500"
# os.environ["stratum_retention_algo"] = "surf-steady"
# os.environ["population_size"] = "1024"
# os.environ["num_generations"] = "10000"
# os.environ["num_islands"] = "4"
# os.environ["num_niches"] = "2"
# os.environ["tournament_size"] = "2"
# os.environ["replicate"] = "0"

Configure instrumentation. **Edit me**

In [None]:
# TODO Uncomment one...
# Total annotation size, in bits; supplied through the environment.
annotation_size_bits = int(os.environ["annotation_size_bits"])
print(f"{annotation_size_bits=}")
assert (
    annotation_size_bits.bit_count() == 1
), "must be power of 2 (1, 2, 4, 8, etc.)"

# Width of each differentia, in bits; supplied through the environment.
differentia_width_bits = int(os.environ["differentia_width_bits"])
print(f"{differentia_width_bits=}")
assert (
    differentia_width_bits.bit_count() == 1
), "must be power of 2 (1, 2, 4, 8, etc.)"

# Name of the stratum retention algorithm; supplied through the environment.
# Index os.environ directly (like the settings above) so that an unset
# variable fails with a KeyError naming the variable, instead of an opaque
# `KeyError: None` from the lookup table below.
stratum_retention_algo = os.environ["stratum_retention_algo"]
print(f"{stratum_retention_algo=}")
# Rebind the name from the algorithm's string key to the algorithm module
# itself; an unrecognized key raises KeyError.
stratum_retention_algo = {
    "col-steady": hstrat.depth_proportional_resolution_tapered_algo,
    "col-tilted": hstrat.recency_proportional_resolution_curbed_algo,
    "surf-hybrid": hsurf.stratum_retention_interop_hybrid_algo,
    "surf-steady": hsurf.stratum_retention_interop_steady_algo,
    "surf-tilted": hsurf.stratum_retention_interop_tilted_sticky_algo,
}[stratum_retention_algo]

Configure evolutionary scale. **Edit me**

In [None]:
# TODO Uncomment one...
# Population size for the evolutionary simulation; supplied through the
# environment and required to be a power of two.
population_size = int(os.environ["population_size"])
print(f"{population_size=}")
assert population_size.bit_count() == 1, "must be power of 2 (1, 2, 4, 8, etc.)"

# Number of generations to simulate; supplied through the environment.
num_generations = int(os.environ["num_generations"])
print(f"{num_generations=}")

Configure evolutionary conditions.  **Edit me**

In [None]:
# TODO Uncomment one...
# Island count for the evolutionary simulation; supplied through the
# environment and required to be a power of two.
num_islands = int(os.environ["num_islands"])
print(f"{num_islands=}")
assert num_islands.bit_count() == 1, "must be power of 2 (1, 2, 4, 8, etc.)"

# Niche count for the evolutionary simulation; supplied through the
# environment and required to be a power of two.
num_niches = int(os.environ["num_niches"])
print(f"{num_niches=}")
assert num_niches.bit_count() == 1, "must be power of 2 (1, 2, 4, 8, etc.)"

# Tournament size for selection; supplied through the environment.
tournament_size = int(os.environ["tournament_size"])
print(f"{tournament_size=}")

Configure experimental replicate. **Edit me**

In [None]:
# Replicate index; supplied through the environment. It feeds the RNG seed
# and is embedded in the output file names.
replicate = int(os.environ["replicate"])
print(f"{replicate=}")

Set up random number generator. (Do not edit.)

In [None]:
# Derive a 32-bit seed from the replicate index and the evolutionary
# parameters only. Instrumentation settings are not part of the tuple, so
# treatments differing only in instrumentation get the same seed.
# NOTE(review): this relies on the builtin hash() of a tuple of ints; the
# tuple hashing algorithm is an interpreter implementation detail, so the
# seed may differ across Python versions/platforms -- confirm whether
# cross-version reproducibility matters.
seed = (
    hash(
        (
            replicate,
            population_size,
            num_generations,
            num_islands,
            num_niches,
            tournament_size,
        )
    )
    % 2**32
)

# Display the seed as the cell output.
seed

In [None]:
from hstrat._auxiliary_lib import seed_random

# Seed global random state through hstrat's helper.
# NOTE(review): presumably seeds both Python's `random` and numpy's global
# RNG -- confirm against the hstrat implementation.
seed_random(seed)

Parametrize instrumentation. (Do not edit.)

In [None]:
# Capacity in strata: the annotation's bit budget divided by the width of
# one differentia.
annotation_capacity_strata = annotation_size_bits // differentia_width_bits
assert (
    annotation_capacity_strata.bit_count() == 1
), "must be power of 2 (1, 2, 4, 8, etc.)"
print(f"{annotation_capacity_strata=}")

# Build a retention policy for the chosen algorithm.
# NOTE(review): presumably the parameterizer searches the policy parameter
# within [2, 1024] so that the upper bound on strata retained after
# `num_generations` deposits does not exceed `annotation_capacity_strata`
# -- confirm against hstrat's PropertyAtMostParameterizer documentation.
parametrized_policy = stratum_retention_algo.Policy(
    parameterizer=hstrat.PropertyAtMostParameterizer(
        target_value=annotation_capacity_strata,
        policy_evaluator=hstrat.NumStrataRetainedUpperBoundEvaluator(
            at_num_strata_deposited=num_generations,
        ),
        param_lower_bound=2,
        param_upper_bound=1024,
    ),
)

# Report the resulting policy and its retained-strata upper bound at the
# configured number of generations.
print(f"{parametrized_policy=}")
print(
    f"num strata retained upper bound {parametrized_policy.CalcNumStrataRetainedUpperBound(num_generations)}"
)

## Setup

Helper functions.

In [None]:
def calc_tqdist_distance(
    x: pd.DataFrame,
    y: pd.DataFrame,
    progress_wrap: typing.Callable = lambda x: x,
) -> typing.Dict[str, float]:
    """Calculate dissimilarity between two trees.

    Used to measure how accurate tree reconstructions are.

    Parameters
    ----------
    x, y : pd.DataFrame
        Phylogenies to compare. Each must have a "taxon_label" column, and
        both must have the same set of leaf taxon labels.
    progress_wrap : Callable, default identity
        Wrapper applied to iterables, e.g. `tqdm`, to report progress.

    Returns
    -------
    dict
        The tqdist quartet and triplet distances, each in normalized and raw
        form, keyed "quartet_distance", "quartet_distanc_raw",
        "triplet_distance", and "triplet_distance_raw".

    Raises
    ------
    AssertionError
        If leaf taxon labels differ between the trees or the source
        dataframes, or if any leaf taxon label is empty or blank.
    ValueError
        If the two trees have different numbers of leaves.
    """
    tree_a = apc.RosettaTree(x, validate="error").as_dendropy
    tree_b = apc.RosettaTree(y, validate="error").as_dendropy

    # must suppress root unifurcations or tqdist barfs
    # see https://github.com/uym2/tripVote/issues/15
    # NOTE(review): `exclude_leaves=True` presumably strips taxa from
    # internal nodes only, leaving leaf taxa intact -- confirm with dendropy.
    tree_a.unassign_taxa(exclude_leaves=True)
    tree_a.suppress_unifurcations()
    tree_b.unassign_taxa(exclude_leaves=True)
    tree_b.suppress_unifurcations()

    tree_a_taxon_labels = [
        leaf.taxon.label for leaf in progress_wrap(tree_a.leaf_node_iter())
    ]
    tree_b_taxon_labels = [
        leaf.taxon.label for leaf in progress_wrap(tree_b.leaf_node_iter())
    ]
    # Exhaust the strict zip purely for its side effect: it raises ValueError
    # if the trees have different leaf counts. The result of all() is
    # discarded (every zipped pair is a non-empty, hence truthy, tuple).
    all(
        progress_wrap(
            zip(tree_a.leaf_node_iter(), tree_b.leaf_node_iter(), strict=True),
        ),
    )
    # Leaf labels must agree between the trees and with both source frames.
    assert sorted(tree_a_taxon_labels) == sorted(tree_b_taxon_labels)
    assert sorted(tree_a_taxon_labels) == sorted(
        x.loc[hstrat_aux.alifestd_find_leaf_ids(x), "taxon_label"],
    )
    assert sorted(tree_a_taxon_labels) == sorted(
        y.loc[hstrat_aux.alifestd_find_leaf_ids(y), "taxon_label"],
    )
    # Every leaf needs a non-empty, non-whitespace label.
    for taxon_label in progress_wrap(tree_a_taxon_labels):
        assert taxon_label
        assert taxon_label.strip()

    newick_a = tree_a.as_string(schema="newick").strip()
    newick_b = tree_b.as_string(schema="newick").strip()

    # NOTE(review): "quartet_distanc_raw" is misspelled (missing "e"), but it
    # becomes an output column name downstream, so renaming it would change
    # the emitted data schema -- confirm before fixing.
    return {
        "quartet_distance": tqdist.quartet_distance(newick_a, newick_b),
        "quartet_distanc_raw": tqdist.quartet_distance_raw(newick_a, newick_b),
        "triplet_distance": tqdist.triplet_distance(newick_a, newick_b),
        "triplet_distance_raw": tqdist.triplet_distance_raw(newick_a, newick_b),
    }

## Generate Phylogeny

Use simple evolutionary simulation to generate a phylogenetic history to test reconstruction process on.

In [None]:
# Run hstrat's evolutionary simulation with the configured scale and
# conditions to produce the ground-truth phylogeny, showing progress via tqdm.
true_phylogeny_df = hstrat.evolve_fitness_trait_population(
    num_islands=num_islands,
    num_niches=num_niches,
    num_generations=num_generations,
    population_size=population_size,
    tournament_size=tournament_size,
    progress_wrap=tqdm,
)

In [None]:
# Label every row with its "loc" value as a string, then blank the label on
# non-leaf rows so that only leaves carry a taxon label.
true_phylogeny_df["taxon_label"] = true_phylogeny_df["loc"].astype(str)
true_phylogeny_df = hstrat_aux.alifestd_mark_leaves(
    true_phylogeny_df, mutate=True
)
true_phylogeny_df.loc[~true_phylogeny_df["is_leaf"], "taxon_label"] = ""
# Display the dataframe as the cell output.
true_phylogeny_df

In [None]:
# Collapse unifurcations, convert to hstrat's "working format", and reset the
# dataframe index to a fresh 0..n-1 range.
true_phylogeny_df = hstrat_aux.alifestd_to_working_format(
    hstrat_aux.alifestd_collapse_unifurcations(true_phylogeny_df, mutate=True),
    mutate=True,
).reset_index(drop=True)
# Display the dataframe as the cell output.
true_phylogeny_df

In [None]:
# Keep an independent copy of the phylogeny as it stands before downsampling;
# it is written out separately as the "true-full" tree.
full_true_phylogeny_df = true_phylogeny_df.copy()

## Downsample Phylogeny


In [None]:
# Number of leaves to keep, supplied through the environment; a value of 0
# skips downsampling entirely.
downsample = int(os.environ["downsample"])
print(f"{downsample=}")
if downsample:
    print(f"downsampling to {downsample=}")
    # Draw `downsample` distinct leaves using a generator built from `seed`.
    leaf_ids = hstrat_aux.alifestd_find_leaf_ids(true_phylogeny_df)
    downsample_ids = np.random.default_rng(seed).choice(
        leaf_ids, downsample, replace=False
    )
    # Flag exactly the sampled rows as extant, then prune everything that
    # does not lead to an extant row.
    true_phylogeny_df["extant"] = true_phylogeny_df["id"].isin(downsample_ids)
    true_phylogeny_df = hstrat_aux.alifestd_prune_extinct_lineages_asexual(
        true_phylogeny_df, mutate=True
    )
    # Sanity check: pruning must leave exactly the requested number of leaves.
    assert downsample == len(
        hstrat_aux.alifestd_find_leaf_ids(true_phylogeny_df)
    )

    true_phylogeny_df = hstrat_aux.alifestd_to_working_format(
        true_phylogeny_df, mutate=True
    )

## Generate Reconstruction

Generate genome annotations as if tracking phylogeny in distributed environment.
Then run reconstruction process to estimate true phylogeny from generated annotations.

In [None]:
# Generate annotations for the leaves of the true phylogeny by descending a
# seed column, built with the parametrized policy and configured differentia
# width, along the phylogeny.
extant_annotations = hstrat.descend_template_phylogeny_alifestd(
    true_phylogeny_df,
    seed_column=hstrat.HereditaryStratigraphicColumn(
        parametrized_policy,
        stratum_differentia_bit_width=differentia_width_bits,
    ),
    extant_ids=hstrat_aux.alifestd_find_leaf_ids(true_phylogeny_df),
    progress_wrap=tqdm,
)

# Display how many annotations were produced.
len(extant_annotations)

In [None]:
# Reconstruct a phylogeny from the annotations. Taxon labels come from the
# true phylogeny's leaves, queried with the same leaf-id call that supplied
# `extant_ids` when the annotations were generated.
# NOTE(review): this assumes annotations and labels line up positionally --
# confirm that both calls yield leaves in the same order.
reconstructed_phylogeny_df = hstrat.build_tree_trie(
    extant_annotations,
    bias_adjustment=hstrat.CompoundTriePostprocessor(
        [
            hstrat.PeelBackConjoinedLeavesTriePostprocessor(),
            hstrat.AssignOriginTimeSampleNaiveTriePostprocessor(),
        ],
    ),
    progress_wrap=tqdm,
    taxon_labels=true_phylogeny_df.loc[
        hstrat_aux.alifestd_find_leaf_ids(true_phylogeny_df),
        "taxon_label",
    ],
)
# Display the dataframe as the cell output.
reconstructed_phylogeny_df

In [None]:
# Collapse unifurcations in the reconstruction, as was done for the true
# phylogeny.
reconstructed_phylogeny_df = hstrat_aux.alifestd_collapse_unifurcations(
    reconstructed_phylogeny_df, mutate=True
)
# Display the dataframe as the cell output.
reconstructed_phylogeny_df

In [None]:
# Sanity check: the reconstruction must be chronologically ordered.
assert hstrat_aux.alifestd_is_chronologically_ordered(
    reconstructed_phylogeny_df
)

## Evaluate Reconstruction

Reconstruction quality data --- collect into spreadsheet.

In [None]:
# For 200 randomly drawn pairs of distinct annotations, compute bounds on the
# ranks since their most recent common ancestor. Sampling uses numpy's global
# random state.
estimation_intervals = [
    hstrat.calc_ranks_since_mrca_bounds_with(
        *np.random.choice(extant_annotations, size=2, replace=False),
        prior="arbitrary",
    )
    for __ in tqdm(range(200))
]

In [None]:
# handle the case where common ancestry cannot be definitively established
# Absolute uncertainty is the width (peak-to-peak) of each estimation
# interval. When common ancestry cannot be definitively established for some
# pair, computing the width raises TypeError and the statistic is NaN.
try:
    median_abs_uncertainty = np.median(
        [np.ptp(interval) for interval in estimation_intervals]
    )
except TypeError:
    median_abs_uncertainty = np.nan

try:
    mean_abs_uncertainty = np.mean(
        [np.ptp(interval) for interval in estimation_intervals]
    )
except TypeError:
    mean_abs_uncertainty = np.nan

f"{median_abs_uncertainty=} {mean_abs_uncertainty=}"

In [None]:
# handle the case where common ancestry cannot be definitively established
# Relative uncertainty is each interval's width divided by (interval mean
# plus one). When common ancestry cannot be definitively established for some
# pair, the computation raises TypeError and both statistics are NaN.
try:
    rel_uncertainties = np.array(
        [np.ptp(interval) for interval in estimation_intervals]
    ) / (
        np.array([np.mean(interval) for interval in estimation_intervals]) + 1
    )
    median_rel_uncertainty = np.median(rel_uncertainties)
    mean_rel_uncertainty = np.mean(rel_uncertainties)
except TypeError:
    median_rel_uncertainty = mean_rel_uncertainty = np.nan

f"{median_rel_uncertainty=} {mean_rel_uncertainty=}"

In [None]:
# Count inner nodes in the true and reconstructed phylogenies.
num_true_inner_nodes = hstrat_aux.alifestd_count_inner_nodes(true_phylogeny_df)
num_reconstructed_inner_nodes = hstrat_aux.alifestd_count_inner_nodes(
    reconstructed_phylogeny_df
)
f"{num_true_inner_nodes=} {num_reconstructed_inner_nodes=}"

In [None]:
# Count polytomies in the true and reconstructed phylogenies.
num_true_polytomies = hstrat_aux.alifestd_count_polytomies(true_phylogeny_df)
num_reconstructed_polytomies = hstrat_aux.alifestd_count_polytomies(
    reconstructed_phylogeny_df
)
f"{num_true_polytomies=} {num_reconstructed_polytomies=}"

In [None]:
# Compute the polytomic index of the true and reconstructed phylogenies.
true_polytomic_index = hstrat_aux.alifestd_calc_polytomic_index(
    true_phylogeny_df
)
reconstructed_polytomic_index = hstrat_aux.alifestd_calc_polytomic_index(
    reconstructed_phylogeny_df
)
f"{true_polytomic_index=} {reconstructed_polytomic_index=}"

In [None]:
# Exact quartet/triplet distances between true and reconstructed trees, via
# the helper defined in the "Setup" section; returns a dict of metrics.
distances = calc_tqdist_distance(
    true_phylogeny_df,
    reconstructed_phylogeny_df,
    progress_wrap=tqdm,
)
f"{distances=}"

In [None]:
# Sampling-based estimate of triplet distance with strict=True.
# `mutate=True` permits the helper to modify both dataframes.
# NOTE(review): the exact meaning of `strict`, `confidence`, and `precision`
# is defined by hstrat -- confirm against its documentation.
sampled_triplet_distance_strict = (
    hstrat_aux.alifestd_estimate_triplet_distance_asexual(
        true_phylogeny_df,
        reconstructed_phylogeny_df,
        taxon_label_key="taxon_label",
        confidence=0.9,
        precision=0.05,
        strict=True,
        progress_wrap=tqdm,
        mutate=True,
    )
)
f"{sampled_triplet_distance_strict=}"

In [None]:
# Sampling-based estimate of triplet distance with strict=False.
# `mutate=True` permits the helper to modify both dataframes.
sampled_triplet_distance_lax = (
    hstrat_aux.alifestd_estimate_triplet_distance_asexual(
        true_phylogeny_df,
        reconstructed_phylogeny_df,
        taxon_label_key="taxon_label",
        confidence=0.9,
        precision=0.05,
        strict=False,
        progress_wrap=tqdm,
        mutate=True,
    )
)
f"{sampled_triplet_distance_lax=}"

In [None]:
# Sampling-based estimate of triplet distance with strict=(True, False).
# NOTE(review): presumably the tuple applies strictness per tree, here to the
# ground-truth tree only -- confirm against hstrat's documentation.
sampled_triplet_distance_strict_ground = (
    hstrat_aux.alifestd_estimate_triplet_distance_asexual(
        true_phylogeny_df,
        reconstructed_phylogeny_df,
        taxon_label_key="taxon_label",
        confidence=0.9,
        precision=0.05,
        strict=(True, False),
        progress_wrap=tqdm,
        mutate=True,
    )
)
f"{sampled_triplet_distance_strict_ground=}"

In [None]:
# Sampling-based estimate of triplet distance with strict=(False, True).
# NOTE(review): presumably the tuple applies strictness per tree, here to the
# reconstructed tree only -- confirm against hstrat's documentation.
sampled_triplet_distance_strict_reconst = (
    hstrat_aux.alifestd_estimate_triplet_distance_asexual(
        true_phylogeny_df,
        reconstructed_phylogeny_df,
        taxon_label_key="taxon_label",
        confidence=0.9,
        precision=0.05,
        strict=(False, True),
        progress_wrap=tqdm,
        mutate=True,
    )
)
f"{sampled_triplet_distance_strict_reconst=}"

In [None]:
# Sample triplet comparisons (n=1000) between the true and reconstructed
# trees; the resulting dataframe is written to parquet in "Data Output".
sampled_triplet_comparisons_df = (
    hstrat_aux.alifestd_sample_triplet_comparisons_asexual(
        true_phylogeny_df,
        reconstructed_phylogeny_df,
        n=1000,
        taxon_label_key="taxon_label",
        progress_wrap=tqdm,
        mutate=True,
    )
)

## Visualize Phylogeny & Reconstruction

For validating results.

Topology only (no time).

In [None]:
# Draw the trees side by side only when downsampling is enabled with at most
# 10000 leaves, or when the population size is at most 10000.
if (downsample and downsample <= 10000) or population_size <= 10000:
    # Branch lengths are not set up, so the drawing shows topology only.
    true_phylogeny_tree = apc.alife_dataframe_to_biopython_tree(
        hstrat_aux.alifestd_collapse_unifurcations(true_phylogeny_df),
        setup_branch_lengths=False,
    )
    reconstructed_phylogeny_tree = apc.alife_dataframe_to_biopython_tree(
        hstrat_aux.alifestd_collapse_unifurcations(reconstructed_phylogeny_df),
        setup_branch_lengths=False,
    )

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

    # do_show=False defers rendering so both panels appear in one figure.
    ax1.set_title("True Tree")
    Phylo.draw(true_phylogeny_tree, do_show=False, axes=ax1)

    ax2.set_title("Reconstructed Tree")
    Phylo.draw(reconstructed_phylogeny_tree, do_show=False, axes=ax2)

    plt.tight_layout()
    plt.show()

Scaled by time.

In [None]:
# Draw the trees side by side only when downsampling is enabled with at most
# 10000 leaves, or when the population size is at most 10000.
if (downsample and downsample <= 10000) or population_size <= 10000:
    # Branch lengths are set up this time, so the drawing is scaled by time.
    true_phylogeny_tree = apc.alife_dataframe_to_biopython_tree(
        hstrat_aux.alifestd_collapse_unifurcations(true_phylogeny_df),
        setup_branch_lengths=True,
    )
    reconstructed_phylogeny_tree = apc.alife_dataframe_to_biopython_tree(
        hstrat_aux.alifestd_collapse_unifurcations(reconstructed_phylogeny_df),
        setup_branch_lengths=True,
    )

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

    # Both panels use a logarithmic x axis.
    ax1.set_title("True Tree")
    ax1.set_xscale("log")
    Phylo.draw(true_phylogeny_tree, do_show=False, axes=ax1)

    ax2.set_title("Reconstructed Tree")
    ax2.set_xscale("log")
    Phylo.draw(reconstructed_phylogeny_tree, do_show=False, axes=ax2)

    plt.tight_layout()
    plt.show()

## Data Output

In [None]:
# Treatment identifier embedded in every output file name. The algorithm name
# is re-read from the environment because `stratum_retention_algo` was
# rebound to the algorithm module during configuration.
treatment = f"evo=island{num_islands}-niche{num_niches}-ngen{num_generations}-popsize{population_size}-tournsize{tournament_size}+instrument={os.environ.get('stratum_retention_algo')}-bits{annotation_size_bits}-diff{differentia_width_bits}-dsamp{downsample}+replicate={replicate}"
# Display the identifier as the cell output.
treatment

In [None]:
# Configuration metadata attached to every output table.
traits = {
    "treatment": treatment,
    "algorithm": os.environ.get("stratum_retention_algo"),
    "replicate": replicate,
    "annotation_size_bits": annotation_size_bits,
    "differentia_width_bits": differentia_width_bits,
    "downsample": downsample,
    "num_islands": num_islands,
    "num_niches": num_niches,
    "tournament_size": tournament_size,
    "num_generations": num_generations,
    "population_size": population_size,
    "policy algorithm": stratum_retention_algo.PolicySpec.GetAlgoTitle(),
}
# Single-row summary table: reconstruction quality metrics, followed by the
# configuration metadata and the tqdist distances.
df_out = pd.DataFrame.from_records(
    [
        {
            "sampled_triplet_distance_strict": sampled_triplet_distance_strict,
            "sampled_triplet_distance_strict_ground": sampled_triplet_distance_strict_ground,
            "sampled_triplet_distance_strict_reconst": sampled_triplet_distance_strict_reconst,
            "sampled_triplet_distance_lax": sampled_triplet_distance_lax,
            "num_true_inner_nodes": num_true_inner_nodes,
            "num_reconstructed_inner_nodes": num_reconstructed_inner_nodes,
            "num_true_polytomies": num_true_polytomies,
            "num_reconstructed_polytomies": num_reconstructed_polytomies,
            "reconstructed_polytomic_index": reconstructed_polytomic_index,
            "true_polytomic_index": true_polytomic_index,
            **traits,
            **distances,
        },
    ]
)

# Write the summary as CSV in the working directory and display the filename.
filename = f"a=stats+{treatment}+ext=.csv"
df_out.to_csv(filename, index=False)
filename

In [None]:
# Tag the sampled comparisons with a fresh random identifier and the
# configuration metadata, then write them to parquet.
sampled_triplet_comparisons_df["tree_uid"] = str(uuid.uuid4())
for k, v in traits.items():
    sampled_triplet_comparisons_df[k] = v

filename = f"a=sampled-triplet-comparisons+{treatment}+ext=.pqt"
sampled_triplet_comparisons_df.to_parquet(filename)
# Display the filename as the cell output.
filename

In [None]:
# Tag the pre-downsampling true phylogeny with a fresh random identifier, its
# kind, and the configuration metadata, then write it to parquet.
full_true_phylogeny_df["tree_uid"] = str(uuid.uuid4())
full_true_phylogeny_df["kind"] = "true-full"
for k, v in traits.items():
    full_true_phylogeny_df[k] = v

filename = f"a=phylogeny-true-full+{treatment}+ext=.pqt"
full_true_phylogeny_df.to_parquet(filename)
# Display the filename as the cell output.
filename

In [None]:
# Tag each tree with a fresh random identifier and its kind, then annotate
# every node with its number of children. Later cells filter on the
# "num_children" column.
#
# The helper's return value is assigned back, rather than relying on
# `mutate=True` to update the dataframe in place, so the "num_children"
# column is present even if the helper returns a new object.
true_phylogeny_df["tree_uid"] = str(uuid.uuid4())
true_phylogeny_df["kind"] = "true"
true_phylogeny_df = hstrat_aux.alifestd_mark_num_children_asexual(
    true_phylogeny_df, mutate=True
)

reconstructed_phylogeny_df["tree_uid"] = str(uuid.uuid4())
reconstructed_phylogeny_df["kind"] = "reconstructed"
reconstructed_phylogeny_df = hstrat_aux.alifestd_mark_num_children_asexual(
    reconstructed_phylogeny_df, mutate=True
)
# Display the annotated reconstruction as the cell output, as before.
reconstructed_phylogeny_df

In [None]:
# Write the true phylogeny, as it stands after the downsampling step, to
# parquet and display the filename.
filename = f"a=phylogeny-true-dsamp+{treatment}+ext=.pqt"
true_phylogeny_df.to_parquet(filename)
filename

In [None]:
# Write the reconstructed phylogeny to parquet and display the filename.
filename = f"a=phylogeny-reconst-dsamp+{treatment}+ext=.pqt"
reconstructed_phylogeny_df.to_parquet(filename)
filename

In [None]:
# Stack both trees and keep only nodes with more than two children
# (polytomies); the "kind" column tells the trees apart.
polytomies_df = pd.concat([true_phylogeny_df, reconstructed_phylogeny_df])
polytomies_df = polytomies_df[polytomies_df["num_children"] > 2].copy()
# Attach the configuration metadata as constant columns.
for k, v in traits.items():
    polytomies_df[k] = v

filename = f"a=polytomies+{treatment}+ext=.pqt"
polytomies_df.to_parquet(filename)
# Display the filename as the cell output.
filename

In [None]:
# Stack both trees and keep only nodes with more than one child; the "kind"
# column tells the trees apart.
inner_nodes_df = pd.concat([true_phylogeny_df, reconstructed_phylogeny_df])
inner_nodes_df = inner_nodes_df[inner_nodes_df["num_children"] > 1].copy()
# Attach the configuration metadata as constant columns.
for k, v in traits.items():
    inner_nodes_df[k] = v

filename = f"a=inner_nodes+{treatment}+ext=.pqt"
inner_nodes_df.to_parquet(filename)
# Display the filename as the cell output.
filename

## Reproducibility Information

For future reference if reproducing experiments.

In [None]:
# Echo the experiment's configuration in one place for the record.
print(
    f"""# instrumentation
{annotation_size_bits=}
{differentia_width_bits=}
{stratum_retention_algo.PolicySpec.GetAlgoTitle()=}
{downsample=}

# evolutionary scale
{population_size=}
{num_generations=}

# evolutionary conditions
{num_islands=}
{num_niches=}
{tournament_size=}
"""
)

In [None]:
import datetime

# Display the local wall-clock time at which this cell ran, in ISO 8601
# format (naive, with no timezone attached).
datetime.datetime.now().isoformat()

In [None]:
# Load the `watermark` IPython extension and run its magic to report
# environment details for the record.
%load_ext watermark
%watermark

In [None]:
# Display every name currently defined in the notebook namespace, as a full
# record of the session's state.
locals()