In [None]:
import endomill
from nbmetalog import nbmetalog as nbm


In [None]:
# Log notebook metadata via nbmetalog (the fields printed are determined by
# that library).
nbm.print_metadata()


In [None]:
# GitHub blob URL prefix of the directory holding the reconstructed phylogeny
# archives; the path is pinned to a specific revision hash.
prefix = ''.join([
    'https://github.com/mmore500/hereditary-stratigraph-concept/',
    'blob/7f28f68696c67737f6f18054c121769ff616758e/',
    'binder/phylogenetic-inference/',
])

# One gzipped CSV of reconstructed phylogenies per source dataset.
reconstructed_phylogeny_urls = [
    prefix + filename
    for filename in (
        'a=reconstructed_phylogenies+source=nk_lexicaseselection_seed110_pop165_mut.01_snapshot_500.csv.gz',
        'a=reconstructed_phylogenies+source=nk_randomselection_seed7_pop100_mut.01_snapshot_5000.csv.gz',
        'a=reconstructed_phylogenies+source=nk_sharingselection_seed10_pop100_mut.01_snapshot_5000.csv.gz',
        'a=reconstructed_phylogenies+source=nk_tournamentselection_seed140_pop100_mut.01_snapshot_5000.csv.gz',
    )
]

# OSF links to the original phylogenies, listed in the same dataset order as
# reconstructed_phylogeny_urls because the two lists are paired positionally.
# Each trailing comment names the dataset file the link refers to.
original_phylogeny_urls = [
    # 'https://osf.io/5d3be/',  # nk_ecoeaselection_seed110_pop100_mut.01_snapshot_3000.csv (disabled)
    'https://osf.io/8ycq7/',  # nk_lexicaseselection_seed110_pop165_mut.01_snapshot_500.csv
    'https://osf.io/ydxt7/',  # nk_randomselection_seed7_pop100_mut.01_snapshot_5000.csv
    'https://osf.io/cz9fk/',  # nk_sharingselection_seed10_pop100_mut.01_snapshot_5000.csv
    'https://osf.io/5ubn8/',  # nk_tournamentselection_seed140_pop100_mut.01_snapshot_5000.csv
]

# The two URL lists are paired positionally. zip() silently drops trailing
# entries when its inputs differ in length, so a list edited on one side only
# (e.g., an entry commented out) would quietly skip or mispair datasets.
# Fail loudly instead.
if len(reconstructed_phylogeny_urls) != len(original_phylogeny_urls):
    raise ValueError(
        'reconstructed_phylogeny_urls and original_phylogeny_urls must have '
        f'the same length, got {len(reconstructed_phylogeny_urls)} and '
        f'{len(original_phylogeny_urls)}'
    )

# Hand endomill one parameter pack per (reconstructed, original) URL pair; the
# pack keys match the parameter names declared later in this notebook.
endomill.instantiate_over(
    parameter_packs=[
        {
            'reconstructed_phylogeny_url': reconstructed_phylogeny_url,
            'original_phylogeny_url': original_phylogeny_url,
        }
        for reconstructed_phylogeny_url, original_phylogeny_url
        in zip(reconstructed_phylogeny_urls, original_phylogeny_urls)
    ],
)


In [None]:
# Define papermill parameters.
# These are bare annotations: they declare each name and its type without
# binding a value. NOTE(review): the values are presumably injected per
# instance from the parameter packs passed to endomill.instantiate_over, whose
# keys match these names -- confirm.
reconstructed_phylogeny_url: str
original_phylogeny_url: str

# Commented-out manual assignment of the first dataset pair (presumably for
# running this notebook directly without instantiation -- confirm):
# reconstructed_phylogeny_url = reconstructed_phylogeny_urls[0]
# original_phylogeny_url = original_phylogeny_urls[0]


In [None]:
import alifedata_phyloinformatics_convert as apc
from Bio import Phylo
import dendropy
from iterpop import iterpop as ip
from keyname import keyname as kn
from os.path import basename
import pandas as pd
import sys


In [None]:
# The data exceeds the default recursion limit, so raise the limit up front.
sys.setrecursionlimit(100_000)


In [None]:
# Log notebook metadata via nbmetalog a second time.
# NOTE(review): presumably repeated here so that each instantiated notebook
# records its own metadata -- confirm.
nbm.print_metadata()


In [None]:
# Fetch the original phylogeny as CSV from OSF. The configured OSF links end
# with a trailing slash, so strip it before appending the download route;
# otherwise the request goes to ".../<id>//download".
original_df = pd.read_csv(
    f'{original_phylogeny_url.rstrip("/")}/download',
)

# Expose each row's id, as a string, under both label columns.
# NOTE(review): presumably these are the columns the tree converters and the
# taxon matching downstream read -- confirm.
original_df['name'] = original_df['id'].astype(str)
original_df['taxon_label'] = original_df['id'].astype(str)

nbm.print_dataframe_synopsis(original_df)


In [None]:
# Fetch the gzipped CSV of reconstructed phylogenies; "?raw=true" is appended
# to the GitHub blob URL to request the file contents.
reconstructed_df = pd.read_csv(
    f'{reconstructed_phylogeny_url}?raw=true',
    compression='gzip',
)
reconstructed_df['taxon_label'] = reconstructed_df['name']

# Key identifying one reconstruction: Treatment and Algorithm concatenated.
# Done as a vectorized column addition rather than a row-wise apply, which is
# faster and also well-defined for an empty frame.
# NOTE(review): assumes both columns hold strings with no missing values;
# a missing value here propagates as NaN instead of raising.
reconstructed_df['Instance'] = (
    reconstructed_df['Treatment'] + reconstructed_df['Algorithm']
)

nbm.print_dataframe_synopsis(reconstructed_df)

# File name component of the URL; later cells unpack its `source` field to
# name output files.
data_filename = basename(reconstructed_phylogeny_url)

print(data_filename)


In [None]:
# Pass endomill an output path for this notebook instance. The path embeds the
# `source` field that keyname unpacks from the data filename.
# NOTE(review): presumably this is where the executed instance notebook gets
# written -- confirm against endomill's documentation.
endomill.add_instance_outpath(
    f'a=reconstructed_phylogeny_evaluation+source={kn.unpack(data_filename)["source"]}.endomill.ipynb',
)


In [None]:
# TODO: teeplot
# Draw the original phylogeny with the names cleared from its internal clades.
actual_tree = apc.alife_dataframe_to_biopython_tree(original_df)
for clade in actual_tree.get_nonterminals():
    clade.name = None
Phylo.draw(actual_tree)

# Draw each reconstruction, rooted at its midpoint, labelling only the nodes
# whose string form does not contain 'Inner'.
for instance, group in reconstructed_df.groupby('Instance'):
    tree = apc.alife_dataframe_to_biopython_tree(group)
    tree.root_at_midpoint()
    Phylo.draw(
        tree,
        label_func=lambda node: str(node) if 'Inner' not in str(node) else None,
    )


In [None]:
# Build the original phylogeny as a dendropy tree and clear the taxon from
# every internal node.
# NOTE(review): presumably so that only leaf taxa take part in the comparison
# against the reconstructions -- confirm.
actual_tree = apc.alife_dataframe_to_dendropy_tree(original_df)
for inner_node in actual_tree.internal_nodes():
    inner_node.taxon = None

# One result record per reconstruction instance (Treatment + Algorithm).
records = []
for instance, group in reconstructed_df.groupby('Instance'):
    tree = apc.alife_dataframe_to_dendropy_tree(group)
    # Move the reconstruction onto the original tree's taxon namespace before
    # comparing the two trees.
    tree.migrate_taxon_namespace(actual_tree.taxon_namespace)
    unweighted_robinson_foulds = dendropy.calculate.treecompare.unweighted_robinson_foulds_distance(
        actual_tree,
        tree,
    )
    # Additional distance measures, currently disabled:
#     weighted_robinson_foulds = dendropy.calculate.treecompare.weighted_robinson_foulds_distance(
#         actual_tree,
#         tree,
#     )
#     euclidean = dendropy.calculate.treecompare.euclidean_distance(
#         actual_tree,
#         tree,
#     )

    # Every row of a group shares one Treatment; look it up once and reuse it
    # for both the record field and the unpacked keyname attributes.
    treatment = ip.popsingleton(group['Treatment'].unique())

    records.append({
        **{
            # Record the actual group key; previously the literal string
            # 'Instance' was stored, giving every row the same label.
            'Instance': instance,
            'Algorithm': ip.popsingleton(group['Algorithm'].unique()),
            'Treatment': treatment,
            'Unweighted Robinson Foulds Distance Error': unweighted_robinson_foulds,
#             'Weighted Robinson Foulds Distance Error': weighted_robinson_foulds,
#             'Euclidean Distance Error': euclidean,
        },
        # Attributes unpacked from the Treatment string are merged last, so
        # they take precedence over the fields above on a key collision.
        **kn.unpack(treatment),
    })


res_df = pd.DataFrame.from_records(records)


In [None]:
# Show the per-instance evaluation table as this cell's output.
res_df


In [None]:
# Print nbmetalog's synopsis of the results table.
nbm.print_dataframe_synopsis(res_df)


In [None]:
# Write the results table as a gzip-compressed CSV. The file name embeds the
# `source` field that keyname unpacks from the data filename.
# NOTE(review): the path template adds no extension of its own, so whether the
# file ends in ".csv.gz" depends on what kn.unpack returns for `source` --
# confirm. Compression is forced explicitly, so it does not rely on the suffix.
# The DataFrame index is written as well (to_csv default).
res_df.to_csv(
    f'a=reconstructed_phylogeny_evaluation+source={kn.unpack(data_filename)["source"]}',
    compression='gzip',
)
