In [1]:
# Parameters
# OSF page URL of the source phylogeny; this default matches one of the URLs
# listed in the endomill.instantiate_over cell of this notebook.
data_url = "https://osf.io/8ycq7/"


In [2]:
import endomill
from nbmetalog import nbmetalog as nbm


In [3]:
# log execution context (host, interpreter, timestamp) and package versions
nbm.print_metadata()


context: ci
hostname: cc611e23e2ba
interpreter: 3.8.12 (default, Jan 15 2022, 18:39:47)  [GCC 7.5.0]
nbcellexec: 3
nbname: phylogeny_simulation_mill
nbpath: /opt/hereditary-stratigraph-concept/binder/phylogenetic-inference/phylogeny_simulation_mill.ipynb
revision: null
session: 3124c2da-a99f-4dcf-9fea-a02618228e94
timestamp: 2022-08-07T03:12:50Z00:00




IPython==7.16.1
keyname==0.4.1
yaml==5.3.1
endomill==0.1.3
nbmetalog==0.2.6
re==2.2.1
ipython_genutils==0.2.0
logging==0.5.1.2
zmq==22.3.0
json==2.0.9
ipykernel==5.5.3


In [4]:
# one parameter pack per source phylogeny hosted on OSF
# NOTE(review): presumably endomill runs one instance of this notebook per
# pack and skips this call inside an instance -- confirm against endomill docs
endomill.instantiate_over(
    parameter_packs=[
        dict(data_url=osf_url)
        for osf_url in (
            # nk_ecoeaselection_seed110_pop100_mut.01_snapshot_3000.csv
            'https://osf.io/5d3be/',
            # nk_lexicaseselection_seed110_pop165_mut.01_snapshot_500.csv
            'https://osf.io/8ycq7/',
            # nk_randomselection_seed7_pop100_mut.01_snapshot_5000.csv
            'https://osf.io/ydxt7/',
            # nk_sharingselection_seed10_pop100_mut.01_snapshot_5000.csv
            'https://osf.io/cz9fk/',
            # nk_tournamentselection_seed140_pop100_mut.01_snapshot_5000.csv
            'https://osf.io/5ubn8/',
        )
    ],
)


detected executing.endomill.ipynb file
skipping instantiate_over


In [5]:
# define papermill parameters
# bare annotation only: it declares the expected type without assigning, so
# the data_url value set in the parameters cell is left untouched
data_url: str


In [6]:
import ast
from collections import defaultdict
import itertools as it
import random
import sys
from urllib import request

import anytree
from bs4 import BeautifulSoup
from hstrat import hstrat
from interval_search import doubling_search
from iterpop import iterpop as ip
from keyname import keyname as kn
import numpy as np
import opytional as opyt
import pandas as pd
from tqdm import tqdm


In [7]:
# anytree exceeds default recursion limit on our data
sys.setrecursionlimit(100_000)
# ensure reproducibility
random.seed(1)


In [8]:
# log metadata again so the versions of the analysis packages imported above
# are recorded as well
nbm.print_metadata()


context: ci
hostname: cc611e23e2ba
interpreter: 3.8.12 (default, Jan 15 2022, 18:39:47)  [GCC 7.5.0]
nbcellexec: 8
nbname: phylogeny_simulation_mill
nbpath: /opt/hereditary-stratigraph-concept/binder/phylogenetic-inference/phylogeny_simulation_mill.ipynb
revision: null
session: 3124c2da-a99f-4dcf-9fea-a02618228e94
timestamp: 2022-08-07T03:12:51Z00:00




IPython==7.16.1
endomill==0.1.3
ipykernel==5.5.3
ipython_genutils==0.2.0
json==2.0.9
keyname==0.4.1
logging==0.5.1.2
nbmetalog==0.2.6
re==2.2.1
yaml==5.3.1
zmq==22.3.0
anytree==2.8.0
hstrat==0.3.2
iterpop==0.4.0
numpy==1.21.5
opytional==0.1.0
pandas==1.1.2


In [9]:
# fetch the OSF landing page for the dataset; the context manager makes sure
# the HTTP response is closed once it has been read
with request.urlopen(data_url) as response:
    html = response.read().decode('utf8')

soup = BeautifulSoup(html, 'html.parser')
title = soup.find('title')

# use the last whitespace-separated token of the page title as the filename
# NOTE(review): the recorded run printed "OSF", i.e. the title carried no
# filename -- confirm the page title still includes the source filename
data_filename = title.string.split()[-1]
print(data_filename)


OSF


In [10]:
# name this notebook instance after the source data filename
# NOTE(review): presumably this tells endomill where to write the executed
# instance of the notebook -- confirm against endomill docs
endomill.add_instance_outpath(
    f'a=phylogeny_simulation+source={data_filename}.endomill.ipynb',
)


# Retrieve Target Phylogeny from OSF


In [11]:
# read the CSV straight from OSF's download endpoint; strip any trailing
# slash from data_url first so the path is ".../download" rather than
# "...//download"
target_phylogeny_df = pd.read_csv(
    f"{data_url.rstrip('/')}/download",
)

nbm.print_dataframe_synopsis(target_phylogeny_df)


digest: 7bffa7d6c47b38c6b9dc45d5093196088d0d82dd210a18e110bc231e2aab7746
manifest:
  ancestor_list: '    210#  ex., [4727]'
  depth: '            59#   ex., 37'
  destruction_time: ' 159#  ex., inf'
  id: '               233#  ex., 4737'
  num_offspring: '    4#    ex., 0'
  num_orgs: '         8#    ex., 1'
  origin_time: '      176#  ex., 500'
  phenotype: '        187#  ex., [ 0.78551 0.507546 0.298434 0.703895 0.70148 0.897296
    0.852754 0.533489 0.975623 0.814646 0.0274892 0.45607 0.221636 0.767401 0.74891
    0.860365 0.319087 0.293359 0.997853 0.770533 ]'
  tot_orgs: '         111#  ex., 1'
  total_offspring: '  15#   ex., 0'
num cols: 10
num cols all na: 0
num cols any na: 0
num na: 0
num rows: 233
num rows all na: 0
num rows any na: 0
size: 83K



# Create a Tree with Target Phylogeny Structure


In [12]:
# map id to anytree node
nodes = defaultdict(anytree.AnyNode)
for __, row in target_phylogeny_df.iterrows():
    node = nodes[row['id']]
    node.id = row['id']
    node.origin_time = row['origin_time']
    if 'NONE' not in row['ancestor_list']:
        ancestor_id = ip.popsingleton(
            eval(row['ancestor_list'])
        )
        node.parent = nodes[ancestor_id]

roots = {node.root for node in nodes.values()}


In [13]:
# extract the sole root of the phylogeny and display the tree's height
# NOTE(review): ip.popsingleton presumably fails unless `roots` holds exactly
# one element -- confirm against iterpop docs
root = ip.popsingleton(roots)
root.height


58

In [14]:
# latest origin time among all rows of the target phylogeny
max_origin_time = target_phylogeny_df['origin_time'].max()
max_origin_time


500

In [15]:
# average origin time over the leaves of the tree
mean_leaf_origin_time = np.array(
    [extant.origin_time for extant in root.leaves],
).mean()
mean_leaf_origin_time


496.2916666666667

# Pick Parameters for Hereditary Stratigraphic Columns


In [16]:
def make_conditions(num_generations: int) -> pd.DataFrame:
    """Tabulate the stratum retention configurations to compare.

    Every combination of retention policy (tapered depth-proportional or
    recency-proportional resolution), target column size (64, 512 or 4096
    bits) and differentia width (1, 8 or 64 bits) is considered. For each
    combination a policy resolution parameter is searched for such that
    stepping the parameter up by one would make the column, after
    `num_generations` generations, retain more bits than the target.

    Parameters
    ----------
    num_generations : int
        Generation count at which retained strata are counted. It also caps
        the resolution parameter search.

    Returns
    -------
    pd.DataFrame
        One row per combination, holding the policy name, differentia width,
        chosen resolution parameter, target and actual retained bits, their
        difference, the retained strata count, and the configured condemner
        object in the 'condemner' column.
    """
    # class name prefix dropped to get a short policy label
    name_prefix = 'StratumRetentionCondemner'
    # retention policies mapped to the resolution parameter their search
    # starts from; dict order fixes the row order of the result
    search_start = {
        hstrat.StratumRetentionCondemnerTaperedDepthProportionalResolution: 1,
        hstrat.StratumRetentionCondemnerRecencyProportionalResolution: 0,
    }
    target_bit_budgets = [64, 64 * 8, 64 * 64]
    differentia_bit_widths = [1, 8, 64]

    res = []
    for condemner_factory, target_column_bits, differentia_bit_width in it.product(
        search_start, target_bit_budgets, differentia_bit_widths,
    ):

        def next_param_overshoots(x: int) -> bool:
            """Test whether parameter x + 1 exceeds the bit budget, or x hits the cap."""
            next_bits = (
                condemner_factory(x + 1).CalcNumStrataRetainedExact(num_generations)
                * differentia_bit_width
            )
            return next_bits > target_column_bits or x >= num_generations

        # NOTE(review): doubling_search presumably returns the smallest
        # integer at or above the starting guess that satisfies the
        # predicate -- confirm against interval_search docs
        policy_param = doubling_search(
            next_param_overshoots,
            search_start[condemner_factory],
        )

        # build the condemner once; it is used for the strata count here and
        # stored in the returned table
        condemner = condemner_factory(policy_param)
        actual_column_strata = condemner.CalcNumStrataRetainedExact(num_generations)
        actual_column_bits = actual_column_strata * differentia_bit_width

        policy_name = condemner_factory.__name__
        if policy_name.startswith(name_prefix):
            policy_name = policy_name[len(name_prefix):]

        res.append({
            'Retention Policy' : policy_name,
            'Differentia Bit Width' : differentia_bit_width,
            'Retention Policy Resolution Parameter' : policy_param,
            'Target Retained Bits' : target_column_bits,
            'Actual Retained Bits' : actual_column_bits,
            'Retained Bits Error' : actual_column_bits - target_column_bits,
            'Actual Retained Strata' : actual_column_strata,
            'condemner' : condemner,
        })
    return pd.DataFrame.from_records(res)


In [17]:
# the mean leaf origin time is truncated to a whole number of generations
conditions_df = make_conditions(num_generations=int(mean_leaf_origin_time))
# preview the table without the column of condemner objects
conditions_df.drop(columns=['condemner'])


Unnamed: 0,Retention Policy,Differentia Bit Width,Retention Policy Resolution Parameter,Target Retained Bits,Actual Retained Bits,Retained Bits Error,Actual Retained Strata
0,TaperedDepthProportionalResolution,1,31,64,63,-1,63
1,TaperedDepthProportionalResolution,8,3,64,56,-8,7
2,TaperedDepthProportionalResolution,64,1,64,192,128,3
3,TaperedDepthProportionalResolution,1,496,512,496,-16,496
4,TaperedDepthProportionalResolution,8,31,512,504,-8,63
5,TaperedDepthProportionalResolution,64,3,512,448,-64,7
6,TaperedDepthProportionalResolution,1,496,4096,496,-3600,496
7,TaperedDepthProportionalResolution,8,496,4096,3968,-128,496
8,TaperedDepthProportionalResolution,64,31,4096,4032,-64,63
9,RecencyProportionalResolution,1,8,64,60,-4,60


# Set Up Ancestor Column


In [18]:
# one column per configuration row, keyed by a keyname-packed string that
# records the configuration's attributes
bundle = hstrat.HereditaryStratigraphicColumnBundle(dict(
    (
        kn.pack({
            'differentia' : condition['Differentia Bit Width'],
            'policy' : condition['Retention Policy'],
            'resolution' : condition['Retention Policy Resolution Parameter'],
            'target_bits' : condition['Target Retained Bits'],
            'actual_bits' : condition['Actual Retained Bits'],
            'bits_error' : condition['Retained Bits Error'],
            'actual_strata' : condition['Actual Retained Strata'],
        }),
        hstrat.HereditaryStratigraphicColumn(
            stratum_differentia_bit_width=condition['Differentia Bit Width'],
            stratum_retention_condemner=condition['condemner'],
        ),
    )
    for __, condition in conditions_df.iterrows()
))


# Simulate Inheritance of Ancestor Column Down Phylogenetic Tree


In [19]:
# the root carries the founding column bundle
root = ip.popsingleton(roots)
root.hstrat_column = bundle

# level-order traversal reaches every parent before its children, so the
# parent's columns are ready when a child clones them
for node in anytree.LevelOrderIter(root):
    parent = node.parent
    if parent is None:
        # the root keeps the bundle assigned above
        continue
    node.hstrat_column = parent.hstrat_column.Clone()
    # deposit one stratum per unit of origin_time elapsed since the parent
    elapsed_generations = node.origin_time - parent.origin_time
    for __ in range(elapsed_generations):
        node.hstrat_column.DepositStratum()


# Extract Pairwise MRCA Estimates for Extant Organisms


In [20]:
# impl -> mean retained bits
mean_retained_bits = {
    impl: np.mean([
        node.hstrat_column.GetNumStrataRetained()[impl]
        * int(kn.unpack(impl)['differentia'])
        for node in root.leaves
    ])
    for impl in bundle
}

records = [
    {
        **{
            'Mean Actual Retained Bits': v,
        },
        **kn.unpack(k)
    }
    for k, v in mean_retained_bits.items()
]
actual_retained_bits_df = pd.DataFrame.from_records(records)
actual_retained_bits_df.to_csv(
    f'a=actual_retained_bits+source={data_filename}',
)

actual_retained_bits_df


Unnamed: 0,Mean Actual Retained Bits,actual_bits,actual_strata,bits_error,differentia,policy,resolution,target_bits,_
0,62.833333,63,63,-1,1,TaperedDepthProportionalResolution,31,64,actual_bits=63+actual_strata=63+bits_error=-1+...
1,56.0,56,7,-8,8,TaperedDepthProportionalResolution,3,64,actual_bits=56+actual_strata=7+bits_error=-8+d...
2,192.0,192,3,128,64,TaperedDepthProportionalResolution,1,64,actual_bits=192+actual_strata=3+bits_error=128...
3,497.291667,496,496,-16,1,TaperedDepthProportionalResolution,496,512,actual_bits=496+actual_strata=496+bits_error=-...
4,502.666667,504,63,-8,8,TaperedDepthProportionalResolution,31,512,actual_bits=504+actual_strata=63+bits_error=-8...
5,448.0,448,7,-64,64,TaperedDepthProportionalResolution,3,512,actual_bits=448+actual_strata=7+bits_error=-64...
6,497.291667,496,496,-3600,1,TaperedDepthProportionalResolution,496,4096,actual_bits=496+actual_strata=496+bits_error=-...
7,3978.333333,3968,496,-128,8,TaperedDepthProportionalResolution,496,4096,actual_bits=3968+actual_strata=496+bits_error=...
8,4021.333333,4032,63,-64,64,TaperedDepthProportionalResolution,31,4096,actual_bits=4032+actual_strata=63+bits_error=-...
9,57.958333,60,60,-4,1,RecencyProportionalResolution,8,64,actual_bits=60+actual_strata=60+bits_error=-4+...


In [21]:
# compare every ordered pair of distinct leaves under every column
# configuration, collecting one record per (pair, configuration)
res = []
for extant1, extant2 in tqdm([*it.product(root.leaves, root.leaves)]):
    if extant1 != extant2:
        bounds = extant1.hstrat_column.CalcRankOfMrcaBoundsWith(extant2.hstrat_column)
        for impl in extant1.hstrat_column:
            # decode the packed configuration key once per record instead of
            # re-parsing it for each of the seven fields read from it
            impl_attrs = kn.unpack(impl)
            res.append({
                'Column Configuration' \
                    : impl,
                'Differentia Bit Width' \
                    : impl_attrs['differentia'],
                'Stratum Retention Policy' \
                    : impl_attrs['policy'],
                'Stratum Retention Policy Resolution Parameter' \
                    : impl_attrs['resolution'],
                'Stratigraphic Column Expected Retained Bits' \
                    : impl_attrs['actual_bits'],
                'Stratigraphic Column Mean Actual Retained Bits' \
                    : mean_retained_bits[impl],
                'Stratigraphic Column Target Retained Bits' \
                    : impl_attrs['target_bits'],
                'Stratigraphic Column Expected Retained Bits Error' \
                    : impl_attrs['bits_error'],
                'Stratigraphic Column Actual Num Retained Strata' \
                    : impl_attrs['actual_strata'],
                'Taxon Compared From' \
                    : extant1.id,
                'Taxon Compared To' \
                    : extant2.id,
                'Generation of Taxon Compared From' \
                    : extant1.hstrat_column.GetNumStrataDeposited(),
                'Generation of Taxon Compared To' \
                    : extant2.hstrat_column.GetNumStrataDeposited(),
                # NOTE(review): bounds[impl] is presumably None when no bound
                # is available (the recorded run has NA entries in these two
                # columns); opyt.apply_if then presumably yields None
                'Generation Of MRCA Lower Bound (inclusive)' \
                    : opyt.apply_if(
                        bounds[impl],
                        lambda x: x[0],
                    ),
                'Generation Of MRCA Upper Bound (exclusive)' \
                    : opyt.apply_if(
                        bounds[impl],
                        lambda x: x[1],
                    ),
                'MRCA Bound Confidence' \
                    : extant1.hstrat_column[impl].CalcRankOfMrcaBoundsWithProvidedConfidenceLevel(),
                'Rank of Earliest Detectable Mrca With' \
                    : extant1.hstrat_column[impl].CalcRankOfEarliestDetectableMrcaWith(extant2.hstrat_column[impl]),
            })

res_df = pd.DataFrame.from_records(res)


  0%|          | 0/576 [00:00<?, ?it/s]

  4%|▍         | 25/576 [00:00<00:02, 197.75it/s]

  9%|▉         | 52/576 [00:00<00:02, 208.11it/s]

 11%|█         | 64/576 [00:00<00:03, 146.26it/s]

 14%|█▎        | 78/576 [00:00<00:03, 142.66it/s]

 16%|█▋        | 95/576 [00:00<00:03, 149.22it/s]

 19%|█▉        | 108/576 [00:00<00:04, 115.34it/s]

 22%|██▏       | 127/576 [00:00<00:03, 125.48it/s]

 25%|██▌       | 146/576 [00:00<00:03, 137.95it/s]

 28%|██▊       | 161/576 [00:01<00:03, 119.14it/s]

 30%|███       | 174/576 [00:01<00:03, 118.10it/s]

 34%|███▎      | 194/576 [00:01<00:02, 134.05it/s]

 36%|███▋      | 209/576 [00:01<00:03, 119.74it/s]

 39%|███▊      | 223/576 [00:01<00:02, 118.62it/s]

 44%|████▍     | 252/576 [00:01<00:02, 141.75it/s]

 47%|████▋     | 269/576 [00:01<00:02, 123.76it/s]

 49%|████▉     | 284/576 [00:02<00:02, 108.51it/s]

 52%|█████▏    | 302/576 [00:02<00:02, 121.85it/s]

 55%|█████▌    | 317/576 [00:02<00:02, 106.10it/s]

 57%|█████▋    | 330/576 [00:02<00:02, 93.85it/s] 

 60%|██████    | 347/576 [00:02<00:02, 105.11it/s]

 62%|██████▏   | 359/576 [00:02<00:02, 81.59it/s] 

 65%|██████▍   | 374/576 [00:03<00:02, 93.80it/s]

 67%|██████▋   | 386/576 [00:03<00:02, 88.02it/s]

 69%|██████▉   | 398/576 [00:03<00:01, 95.23it/s]

 71%|███████   | 409/576 [00:03<00:01, 85.53it/s]

 73%|███████▎  | 422/576 [00:03<00:01, 93.64it/s]

 75%|███████▌  | 433/576 [00:03<00:01, 83.56it/s]

 77%|███████▋  | 446/576 [00:03<00:01, 92.32it/s]

 79%|███████▉  | 457/576 [00:03<00:01, 82.76it/s]

 82%|████████▏ | 470/576 [00:04<00:01, 92.05it/s]

 84%|████████▎ | 481/576 [00:04<00:01, 83.78it/s]

 86%|████████▌ | 495/576 [00:04<00:00, 92.55it/s]

 88%|████████▊ | 506/576 [00:04<00:00, 92.58it/s]

 93%|█████████▎| 537/576 [00:04<00:00, 117.06it/s]

 98%|█████████▊| 564/576 [00:04<00:00, 140.75it/s]

100%|██████████| 576/576 [00:04<00:00, 121.23it/s]




In [22]:
# display the pairwise MRCA estimate records
res_df


Unnamed: 0,Column Configuration,Differentia Bit Width,Stratum Retention Policy,Stratum Retention Policy Resolution Parameter,Stratigraphic Column Expected Retained Bits,Stratigraphic Column Mean Actual Retained Bits,Stratigraphic Column Target Retained Bits,Stratigraphic Column Expected Retained Bits Error,Stratigraphic Column Actual Num Retained Strata,Taxon Compared From,Taxon Compared To,Generation of Taxon Compared From,Generation of Taxon Compared To,Generation Of MRCA Lower Bound (inclusive),Generation Of MRCA Upper Bound (exclusive),MRCA Bound Confidence,Rank of Earliest Detectable Mrca With
0,actual_bits=63+actual_strata=63+bits_error=-1+...,1,TaperedDepthProportionalResolution,31,63,62.833333,64,-1,63,4737,4733,501,501,440.0,480.0,0.968750,40
1,actual_bits=56+actual_strata=7+bits_error=-8+d...,8,TaperedDepthProportionalResolution,3,56,56.000000,64,-8,7,4737,4733,501,501,448.0,500.0,0.996094,0
2,actual_bits=192+actual_strata=3+bits_error=128...,64,TaperedDepthProportionalResolution,1,192,192.000000,64,128,3,4737,4733,501,501,256.0,500.0,1.000000,0
3,actual_bits=496+actual_strata=496+bits_error=-...,1,TaperedDepthProportionalResolution,496,496,497.291667,512,-16,496,4737,4733,501,501,474.0,479.0,0.968750,4
4,actual_bits=504+actual_strata=63+bits_error=-8...,8,TaperedDepthProportionalResolution,31,504,502.666667,512,-8,63,4737,4733,501,501,472.0,480.0,0.996094,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9931,actual_bits=480+actual_strata=60+bits_error=-3...,8,RecencyProportionalResolution,8,480,463.666667,512,-32,60,4595,4738,486,501,480.0,482.0,0.996094,0
9932,actual_bits=576+actual_strata=9+bits_error=64+...,64,RecencyProportionalResolution,0,576,445.333333,512,64,9,4595,4738,486,501,480.0,486.0,1.000000,0
9933,actual_bits=496+actual_strata=496+bits_error=-...,1,RecencyProportionalResolution,496,496,497.291667,4096,-3600,496,4595,4738,486,501,476.0,481.0,0.968750,4
9934,actual_bits=3968+actual_strata=496+bits_error=...,8,RecencyProportionalResolution,496,3968,3978.333333,4096,-128,496,4595,4738,486,501,480.0,481.0,0.996094,0


In [23]:
# log a digest and per-column synopsis of the pairwise estimates
nbm.print_dataframe_synopsis(res_df)


digest: 3d024b557a67b8e2a5572033518c8564b219fdbde3c226f09b29a38aebe1b15f
manifest:
  Column Configuration: '                              18#         ex., actual_bits=63+actual_strata=63+bits_error=-1+differentia=1+policy=TaperedDepthProportionalResolution+resolution=31+target_bits=64'
  Differentia Bit Width: '                             3#          ex., 1'
  Generation Of MRCA Lower Bound (inclusive): '        76#,1668na  ex., 440.0'
  Generation Of MRCA Upper Bound (exclusive): '        70#,1668na  ex., 480.0'
  Generation of Taxon Compared From: '                 10#         ex., 501'
  Generation of Taxon Compared To: '                   10#         ex., 501'
  MRCA Bound Confidence: '                             3#          ex., 0.96875'
  Rank of Earliest Detectable Mrca With: '             5#          ex., 40'
  Stratigraphic Column Actual Num Retained Strata: '   6#          ex., 63'
  Stratigraphic Column Expected Retained Bits: '       13#         ex., 63'
  Stratigraphic C

# Save Pairwise MRCA Estimates to File


In [24]:
# plain CSV copy
res_df.to_csv(
    path_or_buf=f'a=pairwise_mrca_estimates+source={data_filename}',
)
# gzip-compressed copy
res_df.to_csv(
    path_or_buf=f'a=pairwise_mrca_estimates+source={data_filename}.gz',
    compression='gzip',
)
