In [1]:
# Parameters
# Landing page of the source phylogeny snapshot: the notebook later reads the
# data file's name from this page's <title> and downloads the CSV from the
# page's `/download` path.
data_url = "https://osf.io/8ycq7/"


In [2]:
import endomill
from nbmetalog import nbmetalog as nbm


In [3]:
# Log execution context (host, interpreter, session, timestamp, ...) and the
# versions of the packages imported so far, for provenance.
nbm.print_metadata()


context: ci
hostname: 774a00d2c919
interpreter: 3.8.12 (default, Jan 15 2022, 18:39:47)  [GCC 7.5.0]
nbcellexec: 3
nbname: phylogeny_simulation_mill
nbpath: /opt/hereditary-stratigraph-concept/binder/phylogenetic-inference/phylogeny_simulation_mill.ipynb
revision: null
session: 7c737d4e-c6b7-4ff6-b973-7f27662c1a45
timestamp: 2022-03-06T20:14:25Z00:00




IPython==7.16.1
keyname==0.4.1
yaml==5.3.1
endomill==0.1.2
nbmetalog==0.2.6
re==2.2.1
ipython_genutils==0.2.0
logging==0.5.1.2
zmq==22.3.0
json==2.0.9
ipykernel==5.5.3


In [4]:
# Hand endomill one parameter pack per source phylogeny; each pack supplies
# the `data_url` value for one run of this notebook.
# When this cell executes inside an already-instantiated copy, endomill
# reports that it skips the call (see the cell output).
endomill.instantiate_over(
    parameter_packs=[
        {'data_url': source_url}
        for source_url in (
            'https://osf.io/cz9fk/',
            'https://osf.io/ydxt7/',
            'https://osf.io/8ycq7/',
            'https://osf.io/5ubn8/',
        )
    ],
)


detected executing.endomill.ipynb file
skipping instantiate_over


In [5]:
# define papermill parameters
# (a bare annotation declares the expected type without assigning anything,
# so the value set in the parameters cell is left untouched)
data_url: str


In [6]:
import anytree
import ast
from bs4 import BeautifulSoup
from collections import defaultdict
from hstrat import hstrat
from interval_search import doubling_search
from iterpop import iterpop as ip
import itertools as it
from keyname import keyname as kn
import opytional as opyt
import pandas as pd
import random
import sys
from tqdm import tqdm
from urllib import request


In [7]:
# anytree exceeds default recursion limit on our data
sys.setrecursionlimit(100_000)
# ensure reproducibility
random.seed(1)


In [8]:
# Log execution metadata again now that the analysis packages are imported,
# so their versions appear in the recorded output as well.
nbm.print_metadata()


context: ci
hostname: 774a00d2c919
interpreter: 3.8.12 (default, Jan 15 2022, 18:39:47)  [GCC 7.5.0]
nbcellexec: 8
nbname: phylogeny_simulation_mill
nbpath: /opt/hereditary-stratigraph-concept/binder/phylogenetic-inference/phylogeny_simulation_mill.ipynb
revision: null
session: 7c737d4e-c6b7-4ff6-b973-7f27662c1a45
timestamp: 2022-03-06T20:14:26Z00:00




IPython==7.16.1
endomill==0.1.2
ipykernel==5.5.3
ipython_genutils==0.2.0
json==2.0.9
keyname==0.4.1
logging==0.5.1.2
nbmetalog==0.2.6
re==2.2.1
yaml==5.3.1
zmq==22.3.0
anytree==2.8.0
hstrat==0.2.0
iterpop==0.3.4
opytional==0.1.0
pandas==1.1.2


In [9]:
# Fetch the landing page for the source data.  The context manager closes
# the HTTP response as soon as the body has been read, rather than leaving
# the connection open until garbage collection.
with request.urlopen(data_url) as response:
    html = response.read().decode('utf8')

soup = BeautifulSoup(html, 'html.parser')
title = soup.find('title')

# The last whitespace-delimited token of the page title is used as the name
# of the data file.
data_filename = title.string.split()[-1]
print(data_filename)


nk_lexicaseselection_seed110_pop165_mut.01_snapshot_500.csv


In [10]:
# Register an output path for this instance that embeds the source data
# file's name.
# NOTE(review): presumably endomill writes the executed instance notebook to
# this path -- confirm against the endomill documentation.
endomill.add_instance_outpath(
    f'a=phylogeny_simulation+source={data_filename}.endomill.ipynb'
)


# Retrieve Target Phylogeny from OSF


In [11]:
# Download the phylogeny CSV from the page's `/download` path.  `data_url`
# is given with a trailing slash, so strip it first to avoid requesting a
# path containing a doubled `//`.
target_phylogeny_df = pd.read_csv(
    f"{data_url.rstrip('/')}/download",
)

# Print a digest and per-column summary of the loaded table for provenance.
nbm.print_dataframe_synopsis(target_phylogeny_df)


digest: 7bffa7d6c47b38c6b9dc45d5093196088d0d82dd210a18e110bc231e2aab7746
manifest:
  ancestor_list: '    210#  ex., [4727]'
  depth: '            59#   ex., 37'
  destruction_time: ' 159#  ex., inf'
  id: '               233#  ex., 4737'
  num_offspring: '    4#    ex., 0'
  num_orgs: '         8#    ex., 1'
  origin_time: '      176#  ex., 500'
  phenotype: '        187#  ex., [ 0.78551 0.507546 0.298434 0.703895 0.70148 0.897296
    0.852754 0.533489 0.975623 0.814646 0.0274892 0.45607 0.221636 0.767401 0.74891
    0.860365 0.319087 0.293359 0.997853 0.770533 ]'
  tot_orgs: '         111#  ex., 1'
  total_offspring: '  15#   ex., 0'
num cols: 10
num cols all na: 0
num cols any na: 0
num na: 0
num rows: 233
num rows all na: 0
num rows any na: 0
size: 83K



# Create a Tree with Target Phylogeny Structure


In [12]:
# map id to anytree node
# (the defaultdict creates a node on first lookup, so a parent can be linked
# before the row describing it has been visited)
nodes = defaultdict(anytree.AnyNode)
for __, row in target_phylogeny_df.iterrows():
    node = nodes[row['id']]
    node.id = row['id']
    node.origin_time = row['origin_time']
    # rows whose ancestor_list mentions NONE get no parent assigned
    if 'NONE' not in row['ancestor_list']:
        # ancestor_list is text read from a downloaded file: parse it as a
        # Python literal instead of eval()-ing it, which would execute
        # whatever code the file happened to contain
        # NOTE(review): popsingleton presumably requires exactly one
        # ancestor id in the list -- confirm against iterpop.
        ancestor_id = ip.popsingleton(
            ast.literal_eval(row['ancestor_list'])
        )
        node.parent = nodes[ancestor_id]

# distinct roots reachable from the constructed nodes
roots = {node.root for node in nodes.values()}


In [13]:
# Extract the tree's root from the set of roots found above.
# NOTE(review): popsingleton presumably fails if `roots` does not hold
# exactly one element -- confirm against iterpop.
root = ip.popsingleton(roots)
# display the tree height as the cell's output
root.height


58

# Pick Parameters for Hereditary Stratigraphic Columns


In [14]:
def make_conditions(num_generations: int) -> pd.DataFrame:
    """Tabulate the stratigraphic column configurations to simulate.

    Every combination of two stratum retention policies, three target column
    sizes (in bits), and three differentia bit widths is considered.  For
    each combination a policy resolution parameter is picked with
    `doubling_search`, and the number of strata (and bits) retained after
    `num_generations` under that parameter is recorded.

    Parameters
    ----------
    num_generations : int
        Generation count passed to `CalcNumStrataRetainedExact`; also caps
        the searched resolution parameter.

    Returns
    -------
    pd.DataFrame
        One row per combination, with descriptive columns plus a 'condemner'
        column holding the instantiated retention condemner.
    """
    tapered_depth = hstrat.StratumRetentionCondemnerTaperedDepthProportionalResolution
    recency = hstrat.StratumRetentionCondemnerRecencyProportionalResolution

    # second argument handed to doubling_search for each policy
    # NOTE(review): presumably the value at which the search starts --
    # confirm against interval_search.
    search_start = {
        tapered_depth: 1,
        recency: 0,
    }
    # number of leading characters dropped from the condemner class name
    # (the length of 'StratumRetentionCondemner')
    name_prefix_length = 25

    records = []
    for factory, target_bits, bit_width in it.product(
        [tapered_depth, recency],
        [64, 64 * 8, 64 * 16],
        [1, 8, 64],
    ):

        def next_param_overshoots(param: int) -> bool:
            # true once the *next* parameter value would retain more bits
            # than targeted, or once the parameter reaches num_generations
            next_strata = factory(param + 1).CalcNumStrataRetainedExact(
                num_generations,
            )
            if next_strata * bit_width > target_bits:
                return True
            return param >= num_generations

        policy_param = doubling_search(
            next_param_overshoots,
            search_start[factory],
        )

        retained_strata = factory(policy_param).CalcNumStrataRetainedExact(
            num_generations,
        )
        retained_bits = retained_strata * bit_width

        records.append({
            'Retention Policy' : factory.__name__[name_prefix_length:],
            'Differentia Bit Width' : bit_width,
            'Retention Policy Resolution Parameter' : policy_param,
            'Target Retained Bits' : target_bits,
            'Actual Retained Bits' : retained_bits,
            'Retained Bits Error' : retained_bits - target_bits,
            'Actual Retained Strata' : retained_strata,
            'condemner' : factory(policy_param),
        })

    return pd.DataFrame.from_records(records)


In [15]:
# NOTE(review): root.height + 1 evaluates to 59 for this tree (height 58),
# but the simulation later deposits one stratum per unit of origin_time and
# the results report columns with 501 strata deposited.  The retained
# strata/bits tabulated here are therefore computed for 59 generations --
# confirm that this is the intended generation count.
conditions_df = make_conditions(root.height + 1)
# display the conditions without the column of condemner objects
conditions_df.drop('condemner', axis=1)


Unnamed: 0,Retention Policy,Differentia Bit Width,Retention Policy Resolution Parameter,Target Retained Bits,Actual Retained Bits,Retained Bits Error,Actual Retained Strata
0,TaperedDepthProportionalResolution,1,59,64,59,-5,59
1,TaperedDepthProportionalResolution,8,3,64,56,-8,7
2,TaperedDepthProportionalResolution,64,1,64,192,128,3
3,TaperedDepthProportionalResolution,1,59,512,59,-453,59
4,TaperedDepthProportionalResolution,8,59,512,472,-40,59
5,TaperedDepthProportionalResolution,64,3,512,448,-64,7
6,TaperedDepthProportionalResolution,1,59,1024,59,-965,59
7,TaperedDepthProportionalResolution,8,59,1024,472,-552,59
8,TaperedDepthProportionalResolution,64,8,1024,1024,0,16
9,RecencyProportionalResolution,1,59,64,59,-5,59


# Set Up Ancestor Column


In [16]:
# One column per row of `conditions_df`, keyed by a keyname-packed string
# that records the row's configuration.
bundle = hstrat.HereditaryStratigraphicColumnBundle({
    kn.pack({
        'differentia' : condition['Differentia Bit Width'],
        'policy' : condition['Retention Policy'],
        'resolution' : condition['Retention Policy Resolution Parameter'],
        'target_bits' : condition['Target Retained Bits'],
        'actual_bits' : condition['Actual Retained Bits'],
        'bits_error' : condition['Retained Bits Error'],
        'actual_strata' : condition['Actual Retained Strata'],
    }): hstrat.HereditaryStratigraphicColumn(
        stratum_retention_condemner=condition['condemner'],
        stratum_differentia_bit_width=condition['Differentia Bit Width'],
    )
    for __, condition in conditions_df.iterrows()
})


# Simulate Inheritance of Ancestor Column Down Phylogenetic Tree


In [17]:
# The root carries the ancestral bundle of columns.
root = ip.popsingleton(roots)
root.hstrat_column = bundle

# Walk the tree level by level, giving each non-root node a clone of its
# parent's columns advanced by the origin_time elapsed between them.
for node in anytree.LevelOrderIter(root):
    parent = node.parent
    if parent is None:
        # the root already holds the ancestral bundle
        continue

    node.hstrat_column = parent.hstrat_column.Clone()
    elapsed = node.origin_time - parent.origin_time
    for __ in range(elapsed):
        node.hstrat_column.DepositStratum()


# Extract Pairwise MRCA Estimates for Extant Organisms


In [18]:
# Collect one record per ordered pair of distinct leaves and per column
# configuration in the bundle.
res = []
for extant1, extant2 in tqdm([*it.product(root.leaves, root.leaves)]):
    if extant1 != extant2:
        # MRCA rank bounds for every configuration in the bundle, indexed by
        # configuration key
        bounds = extant1.hstrat_column.CalcRankOfMrcaBoundsWith(extant2.hstrat_column)
        for impl in extant1.hstrat_column:
            # parse the configuration key and look up both columns once per
            # record instead of repeating the work for every field
            config = kn.unpack(impl)
            column1 = extant1.hstrat_column[impl]
            column2 = extant2.hstrat_column[impl]
            res.append({
                'Column Configuration' \
                    : impl,
                'Differentia Bit Width' \
                    : config['differentia'],
                'Stratum Retention Policy' \
                    : config['policy'],
                'Stratum Retention Policy Resolution Parameter' \
                    : config['resolution'],
                'Stratigraphic Column Actual Retained Bits' \
                    : config['actual_bits'],
                'Stratigraphic Column Target Retained Bits' \
                    : config['target_bits'],
                'Stratigraphic Column Retained Bits Error' \
                    : config['bits_error'],
                'Stratigraphic Column Actual Num Retained Strata' \
                    : config['actual_strata'],
                'Taxon Compared From' \
                    : extant1.id,
                'Taxon Compared To' \
                    : extant2.id,
                'Generation of Taxon Compared From' \
                    : extant1.hstrat_column.GetNumStrataDeposited(),
                'Generation of Taxon Compared To' \
                    : extant2.hstrat_column.GetNumStrataDeposited(),
                # NOTE(review): apply_if presumably passes None through
                # unchanged, which would account for the NA bounds in the
                # results -- confirm against opytional.
                'Generation Of MRCA Lower Bound (inclusive)' \
                    : opyt.apply_if(
                        bounds[impl],
                        lambda x: x[0],
                    ),
                'Generation Of MRCA Upper Bound (exclusive)' \
                    : opyt.apply_if(
                        bounds[impl],
                        lambda x: x[1],
                    ),
                'MRCA Bound Confidence' \
                    : column1.CalcRankOfMrcaBoundsWithProvidedConfidenceLevel(),
                'Rank of Earliest Detectable Mrca With' \
                    : column1.CalcRankOfEarliestDetectableMrcaWith(column2),
            })

res_df = pd.DataFrame.from_records(res)


  0%|          | 0/576 [00:00<?, ?it/s]

  4%|▍         | 25/576 [00:00<00:02, 226.60it/s]

  9%|▉         | 53/576 [00:00<00:02, 234.24it/s]

 13%|█▎        | 74/576 [00:00<00:02, 224.18it/s]

 15%|█▌        | 89/576 [00:00<00:02, 183.95it/s]

 18%|█▊        | 105/576 [00:00<00:02, 170.48it/s]

 22%|██▏       | 127/576 [00:00<00:02, 173.55it/s]

 26%|██▌       | 147/576 [00:00<00:02, 179.84it/s]

 28%|██▊       | 164/576 [00:00<00:02, 172.25it/s]

 31%|███▏      | 181/576 [00:01<00:02, 157.47it/s]

 35%|███▍      | 200/576 [00:01<00:02, 163.03it/s]

 39%|███▊      | 222/576 [00:01<00:02, 174.82it/s]

 44%|████▍     | 252/576 [00:01<00:01, 195.08it/s]

 47%|████▋     | 273/576 [00:01<00:01, 184.86it/s]

 51%|█████     | 293/576 [00:01<00:01, 169.71it/s]

 54%|█████▍    | 311/576 [00:01<00:01, 142.18it/s]

 57%|█████▋    | 329/576 [00:01<00:01, 145.57it/s]

 60%|██████    | 347/576 [00:01<00:01, 151.94it/s]

 63%|██████▎   | 363/576 [00:02<00:01, 133.06it/s]

 66%|██████▌   | 378/576 [00:02<00:01, 129.73it/s]

 69%|██████▉   | 397/576 [00:02<00:01, 140.50it/s]

 72%|███████▏  | 412/576 [00:02<00:01, 132.05it/s]

 74%|███████▍  | 426/576 [00:02<00:01, 127.19it/s]

 77%|███████▋  | 443/576 [00:02<00:00, 136.07it/s]

 80%|███████▉  | 458/576 [00:02<00:01, 114.15it/s]

 82%|████████▏ | 471/576 [00:03<00:00, 117.62it/s]

 84%|████████▍ | 484/576 [00:03<00:00, 118.24it/s]

 86%|████████▋ | 497/576 [00:03<00:00, 120.07it/s]

 91%|█████████▏| 527/576 [00:03<00:00, 145.78it/s]

 96%|█████████▋| 555/576 [00:03<00:00, 170.16it/s]

100%|██████████| 576/576 [00:03<00:00, 163.93it/s]




In [19]:
# Display the results table; the rendered output shows only the first and
# last rows.
res_df


Unnamed: 0,Column Configuration,Differentia Bit Width,Stratum Retention Policy,Stratum Retention Policy Resolution Parameter,Stratigraphic Column Actual Retained Bits,Stratigraphic Column Target Retained Bits,Stratigraphic Column Retained Bits Error,Stratigraphic Column Actual Num Retained Strata,Taxon Compared From,Taxon Compared To,Generation of Taxon Compared From,Generation of Taxon Compared To,Generation Of MRCA Lower Bound (inclusive),Generation Of MRCA Upper Bound (exclusive),MRCA Bound Confidence,Rank of Earliest Detectable Mrca With
0,actual_bits=59+actual_strata=59+bits_error=-5+...,1,TaperedDepthProportionalResolution,59,59,64,-5,59,4737,4733,501,501,460.0,480.0,0.968750,32
1,actual_bits=56+actual_strata=7+bits_error=-8+d...,8,TaperedDepthProportionalResolution,3,56,64,-8,7,4737,4733,501,501,448.0,500.0,0.996094,0
2,actual_bits=192+actual_strata=3+bits_error=128...,64,TaperedDepthProportionalResolution,1,192,64,128,3,4737,4733,501,501,256.0,500.0,1.000000,0
3,actual_bits=59+actual_strata=59+bits_error=-45...,1,TaperedDepthProportionalResolution,59,59,512,-453,59,4737,4733,501,501,460.0,480.0,0.968750,32
4,actual_bits=472+actual_strata=59+bits_error=-4...,8,TaperedDepthProportionalResolution,59,472,512,-40,59,4737,4733,501,501,476.0,480.0,0.996094,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9931,actual_bits=472+actual_strata=59+bits_error=-4...,8,RecencyProportionalResolution,59,472,512,-40,59,4595,4738,486,501,480.0,481.0,0.996094,0
9932,actual_bits=320+actual_strata=5+bits_error=-19...,64,RecencyProportionalResolution,0,320,512,-192,5,4595,4738,486,501,480.0,486.0,1.000000,0
9933,actual_bits=59+actual_strata=59+bits_error=-96...,1,RecencyProportionalResolution,59,59,1024,-965,59,4595,4738,486,501,476.0,481.0,0.968750,28
9934,actual_bits=472+actual_strata=59+bits_error=-5...,8,RecencyProportionalResolution,59,472,1024,-552,59,4595,4738,486,501,480.0,481.0,0.996094,0


In [20]:
# Print a digest and per-column summary of the results for provenance.
nbm.print_dataframe_synopsis(res_df)


digest: e6697cce570a5f2195fa0045077882142a8ec691f9c2dbaf10acc7411233b7c3
manifest:
  Column Configuration: '                            18#         ex., actual_bits=59+actual_strata=59+bits_error=-5+differentia=1+policy=TaperedDepthProportionalResolution+resolution=59+target_bits=64'
  Differentia Bit Width: '                           3#          ex., 1'
  Generation Of MRCA Lower Bound (inclusive): '      71#,1964na  ex., 460.0'
  Generation Of MRCA Upper Bound (exclusive): '      61#,1964na  ex., 480.0'
  Generation of Taxon Compared From: '               10#         ex., 501'
  Generation of Taxon Compared To: '                 10#         ex., 501'
  MRCA Bound Confidence: '                           3#          ex., 0.96875'
  Rank of Earliest Detectable Mrca With: '           5#          ex., 32'
  Stratigraphic Column Actual Num Retained Strata: ' 6#          ex., 59'
  Stratigraphic Column Actual Retained Bits: '       9#          ex., 59'
  Stratigraphic Column Retained Bits 

# Save Pairwise MRCA Estimates to File


In [21]:
# Write the estimates as gzip-compressed CSV, named after the source data
# file.  The printed `data_filename` for this run already ends in `.csv`, so
# the output name ends in `.csv.gz`.
res_df.to_csv(
    f'a=pairwise_mrca_estimates+source={data_filename}.gz',
    compression='gzip',
)
