In [1]:
import os

from iterpop import iterpop as ip
from keyname import keyname as kn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats as scipy_stats
import seaborn as sns
from teeplot import teeplot


In [2]:
# Load the dataset from its OSF download link.
dataset_url = "https://osf.io/ck47r/download"
df = pd.read_csv(dataset_url)


In [3]:
# Names of `df` columns describing the evolutionary configuration of a run.
# NOTE(review): this list is not referenced by any cell visible here;
# presumably it is kept as a catalogue of configuration columns -- confirm.
# NOTE(review): the inline tags below disagree with the `fixed_variables`
# cell, which treats num_generations and p_island_migration as homogeneous,
# and with `sensitivity_analysis_variables`, which lists only epoch and
# mut_distn -- confirm which description is current.
evolutionary_variables = [
    "mut_distn",  # sensitivity analysis
    "num_generations",  # sensitivity analysis
    "num_islands",
    "num_niches",
    "p_island_migration",  # sensitivity analysis
    "p_niche_invasion",  # sensitivity analysis
    "population_size",  # doesn't change
    "tournament_size",
]


In [4]:
# Columns used, together with "regime" or "quality", as grouping keys for the
# Kruskal-Wallis comparisons in later cells, and as sort keys (after
# "phylometric") for the resulting summary tables.
sensitivity_analysis_variables = [
    "epoch",
    "mut_distn",
]


In [5]:
# Columns that must hold exactly one distinct non-null value once the frame
# has been filtered; the filtering cell asserts this for each entry.
fixed_variables = [
    "num_generations",  # homogeneous (num generations per epoch)
    "p_island_migration",  # homogeneous
    "population_size",  # homogeneous
    "subsampling-fraction",  # only use 1.0
    "trie-postprocess",  # just use naive postprocessing for now
]


In [6]:
# Restrict the frame to the fixed configuration. For the subsampling and
# postprocessing columns, rows with a missing value are kept as well.
_migration_ok = df["p_island_migration"].eq(0.01)
_subsampling_ok = (
    df["subsampling-fraction"].eq(1.0) | df["subsampling-fraction"].isna()
)
_postprocess_ok = (
    df["trie-postprocess"].eq("naive") | df["trie-postprocess"].isna()
)
df = df.loc[_migration_ok & _subsampling_ok & _postprocess_ok].copy()

# Every fixed variable must now take a single distinct non-null value.
for fixed_variable in fixed_variables:
    assert df[fixed_variable].nunique() == 1

df


Unnamed: 0,a,epoch,mut_distn,num_generations,num_islands,num_niches,p_island_migration,p_niche_invasion,population_size,replicate,...,subsampling-fraction,tournament_size,treatment,trie-postprocess,_generation,_index,ext,_,triplet_distance,quartet_distance
2,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,22,...,1.0,2,20,naive,32768,636,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,9.568345e-02,1.006780e-01
3,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,36,...,1.0,2,20,naive,32768,1028,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,6.547799e-04,2.322835e-03
8,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,7,...,1.0,2,20,naive,32768,216,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.120701e-05,4.261369e-05
10,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,38,...,1.0,2,20,naive,32768,1084,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.723737e-04,6.271271e-04
12,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,30,...,1.0,2,20,naive,32768,860,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.189480e-01,8.837823e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50391,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,8,...,1.0,2,25,naive,262144,249,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,2.817044e-03,8.520046e-03
50394,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,11,...,1.0,2,25,naive,262144,333,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.390141e-04,4.943354e-04
50395,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,17,...,1.0,2,25,naive,262144,501,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,9.652896e-11,3.860717e-10
50396,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,44,...,1.0,2,25,naive,262144,1257,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,6.865545e-04,2.452341e-03


In [7]:
# Keep only the rows whose "a" column marks a reconstructed tree.
_is_reconstruction = df["a"].eq("reconstructed-tree")
df = df.loc[_is_reconstruction].copy()

df


Unnamed: 0,a,epoch,mut_distn,num_generations,num_islands,num_niches,p_island_migration,p_niche_invasion,population_size,replicate,...,subsampling-fraction,tournament_size,treatment,trie-postprocess,_generation,_index,ext,_,triplet_distance,quartet_distance
2,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,22,...,1.0,2,20,naive,32768,636,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,9.568345e-02,1.006780e-01
3,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,36,...,1.0,2,20,naive,32768,1028,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,6.547799e-04,2.322835e-03
8,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,7,...,1.0,2,20,naive,32768,216,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.120701e-05,4.261369e-05
10,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,38,...,1.0,2,20,naive,32768,1084,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.723737e-04,6.271271e-04
12,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,30,...,1.0,2,20,naive,32768,860,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.189480e-01,8.837823e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50391,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,8,...,1.0,2,25,naive,262144,249,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,2.817044e-03,8.520046e-03
50394,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,11,...,1.0,2,25,naive,262144,333,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.390141e-04,4.943354e-04
50395,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,17,...,1.0,2,25,naive,262144,501,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,9.652896e-11,3.860717e-10
50396,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,44,...,1.0,2,25,naive,262144,1257,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,6.865545e-04,2.452341e-03


In [8]:
# Label every row with a named evolutionary regime, looked up from its
# (num_islands, num_niches, tournament_size, p_niche_invasion) combination.
# Keys must match the stored values exactly, including the float
# p_niche_invasion. Rows with an unlisted combination get NaN and are dropped.
_regime_columns = [
    "num_islands",
    "num_niches",
    "tournament_size",
    "p_niche_invasion",
]
_regime_by_config = {
    (1, 1, 2, 3.0517578125e-08): "plain",
    (1, 1, 1, 3.0517578125e-08): "weak selection",
    (1, 1, 4, 3.0517578125e-08): "strong selection",
    # Tournament size 8 exceeds the size-4 "strong selection" treatment, so it
    # cannot be "weak selection" (the label reserved for tournament size 1).
    (1, 1, 8, 3.0517578125e-08): "strong selection",
    (1, 4, 2, 3.0517578125e-06): "weak ecology",
    (1, 4, 2, 3.0517578125e-08): "ecology",
    (1, 8, 2, 3.0517578125e-08): "rich ecology",
    (1024, 1, 2, 3.0517578125e-08): "spatial structure",
}
# Build the lookup table once and walk plain tuples instead of constructing a
# dict and a Series slice per row; this also behaves sensibly on an empty frame.
df["regime"] = [
    _regime_by_config.get(config, np.nan)  # NaN marks "no named regime"
    for config in df[_regime_columns].itertuples(index=False, name=None)
]
df = df.dropna(axis="index", subset=["regime"]).copy()
df["regime"].unique()


array(['rich ecology', 'weak ecology', 'spatial structure',
       'strong selection', 'weak selection', 'plain', 'ecology'],
      dtype=object)

In [9]:
# Translate each row's ("a", "resolution") pair into a human-readable quality
# label. Pairs that are not listed map to NaN and those rows are discarded.
_quality_by_key = {
    ("reconstructed-tree", 3.0): "33% resolution",
    ("reconstructed-tree", 10.0): "10% resolution",
    ("reconstructed-tree", 30.0): "3% resolution",
    ("reconstructed-tree", 100.0): "1% resolution",
}


def _label_quality(row):
    """Return the quality label for one row, or NaN when none applies."""
    # Missing entries are replaced by 0, which never matches a listed key.
    key = tuple(row[["a", "resolution"]].fillna(0))
    return _quality_by_key.get(key, np.nan)


df["quality"] = df.apply(_label_quality, axis=1)
df = df.dropna(axis=0, subset=["quality"]).copy()
df["quality"].unique()


array(['33% resolution', '10% resolution', '1% resolution',
       '3% resolution'], dtype=object)

In [10]:
# For each phylometric and each (regime, epoch, mut_distn) stratum, run a
# Kruskal-Wallis test comparing the metric across reconstruction qualities.
_stratum_keys = ["regime", *sensitivity_analysis_variables]

records = []
for phylometric in ["quartet_distance", "triplet_distance"]:
    for group, group_df in df.groupby(_stratum_keys, as_index=False):
        by_quality = group_df.groupby(["quality"])
        samples = [series for _, series in by_quality[phylometric]]
        statistic, p = scipy_stats.kruskal(*samples)
        # pophomogeneous insists that all quality groups share one row count
        # and returns that shared count.
        n = ip.pophomogeneous(by_quality.count()["a"])

        record = {
            "n": n,
            "N": by_quality.ngroups,  # number of quality groups compared
            "phylometric": phylometric,
            "statistic": statistic,
            "p": p,
        }
        record.update(zip(_stratum_keys, group))
        records.append(record)

out_df = (
    pd.DataFrame.from_records(records)
    .sort_values(["phylometric", *sensitivity_analysis_variables])
    .reset_index(drop=True)
)

# Lift pandas' display truncation only while rendering the full table.
with pd.option_context("display.max_colwidth", None, "display.max_rows", None):
    display(out_df)

os.makedirs("outdata", exist_ok=True)
out_df.to_csv(
    "outdata/a=reconstruction-error-comparisons-between-resolutions+ext=.csv",
    index=False,
)


Unnamed: 0,n,N,phylometric,statistic,p,regime,epoch,mut_distn
0,50,4,quartet_distance,157.594209,6.057488e-34,ecology,0,np.random.exponential
1,50,4,quartet_distance,148.945755,4.4481690000000003e-32,plain,0,np.random.exponential
2,50,4,quartet_distance,159.370245,2.506309e-34,rich ecology,0,np.random.exponential
3,50,4,quartet_distance,119.443284,1.017028e-25,spatial structure,0,np.random.exponential
4,50,4,quartet_distance,128.516346,1.12925e-27,strong selection,0,np.random.exponential
5,50,4,quartet_distance,141.116119,2.17169e-30,weak ecology,0,np.random.exponential
6,50,4,quartet_distance,131.510364,2.55607e-28,weak selection,0,np.random.exponential
7,50,4,quartet_distance,164.675976,1.794465e-35,ecology,0,np.random.standard_normal
8,50,4,quartet_distance,158.355188,4.150315e-34,plain,0,np.random.standard_normal
9,50,4,quartet_distance,127.673636,1.71544e-27,rich ecology,0,np.random.standard_normal


In [11]:
# For each phylometric and each (quality, epoch, mut_distn) stratum, run a
# Kruskal-Wallis test comparing the metric across evolutionary regimes.
_stratum_keys = ["quality", *sensitivity_analysis_variables]

records = []
for phylometric in ["quartet_distance", "triplet_distance"]:
    for group, group_df in df.groupby(_stratum_keys, as_index=False):
        by_regime = group_df.groupby(["regime"])
        samples = [regime_series for _, regime_series in by_regime[phylometric]]
        statistic, p = scipy_stats.kruskal(*samples)
        # pophomogeneous insists that all regime groups share one row count
        # and returns that shared count.
        n = ip.pophomogeneous(by_regime.count()["a"])

        record = {
            "n": n,
            "N": by_regime.ngroups,  # number of regime groups compared
            "phylometric": phylometric,
            "statistic": statistic,
            "p": p,
        }
        record.update(zip(_stratum_keys, group))
        records.append(record)

out_df = (
    pd.DataFrame.from_records(records)
    .sort_values(["phylometric", *sensitivity_analysis_variables])
    .reset_index(drop=True)
)

# Lift pandas' display truncation only while rendering the full table.
with pd.option_context("display.max_colwidth", None, "display.max_rows", None):
    display(out_df)

os.makedirs("outdata", exist_ok=True)
out_df.to_csv(
    "outdata/a=reconstruction-error-comparisons-between-regimes+ext=.csv",
    index=False,
)


Unnamed: 0,n,N,phylometric,statistic,p,quality,epoch,mut_distn
0,50,7,quartet_distance,65.935566,2.778141e-12,1% resolution,0,np.random.exponential
1,50,7,quartet_distance,117.932925,4.4268860000000006e-23,10% resolution,0,np.random.exponential
2,50,7,quartet_distance,52.914004,1.221017e-09,3% resolution,0,np.random.exponential
3,50,7,quartet_distance,115.721248,1.2888290000000001e-22,33% resolution,0,np.random.exponential
4,50,7,quartet_distance,38.789497,7.87077e-07,1% resolution,0,np.random.standard_normal
5,50,7,quartet_distance,88.340497,6.698731000000001e-17,10% resolution,0,np.random.standard_normal
6,50,7,quartet_distance,38.149798,1.050097e-06,3% resolution,0,np.random.standard_normal
7,50,7,quartet_distance,120.202764,1.477372e-23,33% resolution,0,np.random.standard_normal
8,50,7,quartet_distance,107.742711,6.050249e-21,1% resolution,2,np.random.exponential
9,50,7,quartet_distance,147.547976,2.55219e-29,10% resolution,2,np.random.exponential
