In [1]:
import os

from iterpop import iterpop as ip
from keyname import keyname as kn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats as scipy_stats
import seaborn as sns
from teeplot import teeplot


In [2]:
# Load the dataset directly from its OSF download link (requires network).
df = pd.read_csv(filepath_or_buffer="https://osf.io/ck47r/download")


In [3]:
# Phylometrics kept for analysis: a descriptive, non-redundant subset.
#
# Candidate metrics deliberately left out of this set:
#   diversity, average_depth, average_origin_time,
#   average_origin_time_normalized, max_depth,
#   mean_pairwise_distance_branch_only, num_active, num_outside, num_roots,
#   num_taxa, phylogenetic_diversity, sum_pairwise_distance,
#   sum_pairwise_distance_branch_only, total_orgs, tree_size,
#   variance_pairwise_distance, variance_pairwise_distance_branch_only,
#   mrca_depth, sackin_index, sum_evolutionary_distinctiveness,
#   variance_evolutionary_distinctiveness
phylometrics = [
    "colless_like_index",
    "mean_pairwise_distance",
    "num_ancestors",
    "mean_evolutionary_distinctiveness",
]


In [4]:
# Column names treated as evolutionary variables.
# Annotations carried over from the original listing:
#   - "sensitivity analysis": mut_distn, num_generations,
#     p_island_migration, p_niche_invasion
#   - "doesn't change": population_size
evolutionary_variables = [
    "mut_distn",
    "num_generations",
    "num_islands",
    "num_niches",
    "p_island_migration",
    "p_niche_invasion",
    "population_size",
    "tournament_size",
]


In [5]:
# Columns that every statistical comparison below is additionally grouped by,
# so that each (epoch, mut_distn) condition is tested separately.
sensitivity_analysis_variables = ["epoch", "mut_distn"]


In [6]:
# Columns expected to hold a single (non-missing) value in the analysed data.
#   num_generations      -- homogeneous (num generations per epoch)
#   p_island_migration   -- homogeneous
#   population_size      -- homogeneous
#   subsampling-fraction -- only use 1.0
#   trie-postprocess     -- just use naive postprocessing for now
fixed_variables = [
    "num_generations",
    "p_island_migration",
    "population_size",
    "subsampling-fraction",
    "trie-postprocess",
]


In [7]:
# Pin the fixed variables: keep rows with p_island_migration of 0.01 whose
# subsampling-fraction is 1.0 (or missing) and whose trie-postprocess is
# "naive" (or missing).
migration_ok = df["p_island_migration"].eq(0.01)

subsampling = df["subsampling-fraction"]
subsampling_ok = subsampling.eq(1.0) | subsampling.isna()

postprocess = df["trie-postprocess"]
postprocess_ok = postprocess.eq("naive") | postprocess.isna()

df = df.loc[migration_ok & subsampling_ok & postprocess_ok].copy()

# Sanity check: after filtering, each fixed variable must take exactly one
# distinct non-missing value.
for fixed_variable in fixed_variables:
    assert df[fixed_variable].nunique(dropna=True) == 1

df


Unnamed: 0,a,epoch,mut_distn,num_generations,num_islands,num_niches,p_island_migration,p_niche_invasion,population_size,replicate,...,subsampling-fraction,tournament_size,treatment,trie-postprocess,_generation,_index,ext,_,triplet_distance,quartet_distance
7,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,22,...,1.0,2,20,naive,32768,636,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,9.568345e-02,1.006780e-01
9,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,36,...,1.0,2,20,naive,32768,1028,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,6.547799e-04,2.322835e-03
19,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,7,...,1.0,2,20,naive,32768,216,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.120701e-05,4.261369e-05
21,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,38,...,1.0,2,20,naive,32768,1084,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.723737e-04,6.271271e-04
23,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,30,...,1.0,2,20,naive,32768,860,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.189480e-01,8.837823e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100783,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,8,...,1.0,2,25,naive,262144,249,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,2.817044e-03,8.520046e-03
100790,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,11,...,1.0,2,25,naive,262144,333,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.390141e-04,4.943354e-04
100792,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,17,...,1.0,2,25,naive,262144,501,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,9.652896e-11,3.860717e-10
100795,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,44,...,1.0,2,25,naive,262144,1257,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,6.865545e-04,2.452341e-03


In [8]:
# Keep only the rows describing reconstructed trees; drop every other
# value of the "a" column.
is_reconstructed = df["a"].eq("reconstructed-tree")
df = df.loc[is_reconstructed].copy()

df


Unnamed: 0,a,epoch,mut_distn,num_generations,num_islands,num_niches,p_island_migration,p_niche_invasion,population_size,replicate,...,subsampling-fraction,tournament_size,treatment,trie-postprocess,_generation,_index,ext,_,triplet_distance,quartet_distance
7,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,22,...,1.0,2,20,naive,32768,636,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,9.568345e-02,1.006780e-01
9,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,36,...,1.0,2,20,naive,32768,1028,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,6.547799e-04,2.322835e-03
19,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,7,...,1.0,2,20,naive,32768,216,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.120701e-05,4.261369e-05
21,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,38,...,1.0,2,20,naive,32768,1084,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.723737e-04,6.271271e-04
23,reconstructed-tree,0,np.random.standard_normal,32768,1,8,0.01,3.051758e-08,32768,30,...,1.0,2,20,naive,32768,860,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.189480e-01,8.837823e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100783,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,8,...,1.0,2,25,naive,262144,249,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,2.817044e-03,8.520046e-03
100790,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,11,...,1.0,2,25,naive,262144,333,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,1.390141e-04,4.943354e-04
100792,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,17,...,1.0,2,25,naive,262144,501,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,9.652896e-11,3.860717e-10
100795,reconstructed-tree,7,np.random.exponential,32768,1024,4,0.01,3.051758e-06,32768,44,...,1.0,2,25,naive,262144,1257,.csv.gz,/mnt/home/mmore500/scratch/data/hstrat-evoluti...,6.865545e-04,2.452341e-03


In [9]:
# Label every row with a named evolutionary regime, looked up from its
# (num_islands, num_niches, tournament_size, p_niche_invasion) configuration.
# The table is built once here instead of being re-created for every row
# inside the apply callback.
regime_by_config = {
    (1, 1, 2, 3.0517578125e-08): "plain",
    (1, 1, 1, 3.0517578125e-08): "weak selection",
    (1, 1, 4, 3.0517578125e-08): "strong selection",
    # A tournament of 8 is more stringent than the plain tournament of 2, so
    # this is a strong-selection configuration (it was previously mislabelled
    # "weak selection", colliding with tournament size 1).
    # NOTE(review): confirm the dataset holds only one of tournament sizes
    # 4 and 8, since later cells require equal group sizes per regime.
    (1, 1, 8, 3.0517578125e-08): "strong selection",
    (1, 4, 2, 3.0517578125e-06): "weak 4 niche ecology",
    (1, 4, 2, 3.0517578125e-08): "4 niche ecology",
    (1, 8, 2, 3.0517578125e-08): "8 niche ecology",
    (1024, 1, 2, 3.0517578125e-08): "spatial structure",
}
regime_key_columns = [
    "num_islands",
    "num_niches",
    "tournament_size",
    "p_niche_invasion",
]

df["regime"] = df.apply(
    lambda row: regime_by_config.get(
        tuple(row[regime_key_columns]),
        np.nan,  # default: configuration is not one of the named regimes
    ),
    axis="columns",
)
# Rows whose configuration is not listed above received NaN; discard them.
df = df.dropna(axis="index", subset=["regime"]).copy()
df["regime"].unique()


array(['8 niche ecology', 'weak 4 niche ecology', 'spatial structure',
       'strong selection', 'weak selection', 'plain', '4 niche ecology'],
      dtype=object)

In [10]:
# Assign each row a "quality" label from its ("a", "resolution") pair.
quality_labels = {
    ("reconstructed-tree", 3.0): "33% resolution",
    ("reconstructed-tree", 10.0): "10% resolution",
    ("reconstructed-tree", 30.0): "3% resolution",
    ("reconstructed-tree", 100.0): "1% resolution",
}


def _quality_label(row):
    """Return the quality label for one row, or NaN if its pair is unlisted."""
    # missing values are replaced by 0 before the lookup, so they never match
    key = tuple(row[["a", "resolution"]].fillna(0))
    return quality_labels.get(key, np.nan)


df["quality"] = df.apply(_quality_label, axis="columns")
# Rows without a recognised label are dropped.
df = df.dropna(axis="index", subset=["quality"]).copy()
df["quality"].unique()


array(['33% resolution', '10% resolution', '1% resolution',
       '3% resolution'], dtype=object)

In [11]:
# For every (regime, epoch, mut_distn) condition, run a Kruskal-Wallis test
# comparing reconstruction error across the quality (resolution) levels.
condition_columns = ["regime", *sensitivity_analysis_variables]

records = []
for phylometric in ["quartet_distance", "triplet_distance"]:
    for group, group_df in df.groupby(condition_columns, as_index=False):
        by_quality = group_df.groupby(["quality"])
        # one sample of error values per quality level
        samples = [series for _, series in by_quality[phylometric]]
        kw_result = scipy_stats.kruskal(*samples)
        # per-level row count; pophomogeneous is given the counts of all
        # levels and its single result is recorded as n
        n = ip.pophomogeneous(by_quality.count()["a"])

        record = {"n": n, "N": len(samples), "phylometric": phylometric}
        record.update(zip(["statistic", "p"], kw_result))
        record.update(zip(condition_columns, group))
        records.append(record)

out_df = (
    pd.DataFrame.from_records(records)
    .sort_values(["phylometric", *sensitivity_analysis_variables])
    .reset_index(drop=True)
)

# show the full table without truncating rows or cell contents
with pd.option_context("display.max_colwidth", None, "display.max_rows", None):
    display(out_df)

os.makedirs("outdata", exist_ok=True)
out_df.to_csv(
    "outdata/a=reconstruction-error-comparisons-between-resolutions+ext=.csv",
    index=False,
)


Unnamed: 0,n,N,phylometric,statistic,p,regime,epoch,mut_distn
0,50,4,quartet_distance,1.804036,0.6140571,4 niche ecology,0,np.random.exponential
1,50,4,quartet_distance,147.13883,1.091277e-31,8 niche ecology,0,np.random.exponential
2,50,4,quartet_distance,3.808119,0.2829431,plain,0,np.random.exponential
3,50,4,quartet_distance,119.443284,1.017028e-25,spatial structure,0,np.random.exponential
4,50,4,quartet_distance,3.447104,0.3276876,strong selection,0,np.random.exponential
5,50,4,quartet_distance,1.756621,0.6244199,weak 4 niche ecology,0,np.random.exponential
6,50,4,quartet_distance,89.6632,2.5875919999999997e-19,weak selection,0,np.random.exponential
7,50,4,quartet_distance,3.085409,0.3786433,4 niche ecology,0,np.random.standard_normal
8,50,4,quartet_distance,112.909099,2.595386e-24,8 niche ecology,0,np.random.standard_normal
9,50,4,quartet_distance,1.194113,0.7544165,plain,0,np.random.standard_normal


In [12]:
# For every (quality, epoch, mut_distn) condition, run a Kruskal-Wallis test
# comparing reconstruction error across the evolutionary regimes.
condition_columns = ["quality", *sensitivity_analysis_variables]

records = []
for phylometric in ["quartet_distance", "triplet_distance"]:
    for group, group_df in df.groupby(condition_columns, as_index=False):
        by_regime = group_df.groupby(["regime"])
        # one sample of error values per regime
        samples = [series for _, series in by_regime[phylometric]]
        kw_result = scipy_stats.kruskal(*samples)
        # per-regime row count; pophomogeneous is given the counts of all
        # regimes and its single result is recorded as n
        n = ip.pophomogeneous(by_regime.count()["a"])

        record = {"n": n, "N": len(samples), "phylometric": phylometric}
        record.update(zip(["statistic", "p"], kw_result))
        record.update(zip(condition_columns, group))
        records.append(record)

out_df = (
    pd.DataFrame.from_records(records)
    .sort_values(["phylometric", *sensitivity_analysis_variables])
    .reset_index(drop=True)
)

# show the full table without truncating rows or cell contents
with pd.option_context("display.max_colwidth", None, "display.max_rows", None):
    display(out_df)

os.makedirs("outdata", exist_ok=True)
out_df.to_csv(
    "outdata/a=reconstruction-error-comparisons-between-regimes+ext=.csv",
    index=False,
)


Unnamed: 0,n,N,phylometric,statistic,p,quality,epoch,mut_distn
0,50,7,quartet_distance,219.559076,1.292043e-44,1% resolution,0,np.random.exponential
1,50,7,quartet_distance,203.950785,2.7360729999999997e-41,10% resolution,0,np.random.exponential
2,50,7,quartet_distance,212.829484,3.513988e-43,3% resolution,0,np.random.exponential
3,50,7,quartet_distance,212.394385,4.350306e-43,33% resolution,0,np.random.exponential
4,50,7,quartet_distance,205.812665,1.0980999999999999e-41,1% resolution,0,np.random.standard_normal
5,50,7,quartet_distance,194.000965,3.586719e-39,10% resolution,0,np.random.standard_normal
6,50,7,quartet_distance,206.176684,9.185783999999999e-42,3% resolution,0,np.random.standard_normal
7,50,7,quartet_distance,210.970925,8.746509e-43,33% resolution,0,np.random.standard_normal
8,50,7,quartet_distance,124.534939,1.815603e-24,1% resolution,2,np.random.exponential
9,50,7,quartet_distance,116.089061,1.0790330000000001e-22,10% resolution,2,np.random.exponential
