In [None]:
import endomill
from nbmetalog import nbmetalog as nbm


In [None]:
# Emit nbmetalog's metadata printout for this notebook
# (presumably provenance details for the run — TODO confirm exact contents).
nbm.print_metadata()


In [None]:
# Pairwise MRCA estimate datasets; every URL points at the same pinned commit
# of the source repository.
data_urls = [
    'https://github.com/mmore500/hereditary-stratigraph-concept/blob/26405273f59455d45002ad0745a12689bb486b0e/binder/phylogenetic-inference/a=pairwise_mrca_estimates+source=nk_lexicaseselection_seed110_pop165_mut.01_snapshot_500.csv.gz',
    'https://github.com/mmore500/hereditary-stratigraph-concept/blob/26405273f59455d45002ad0745a12689bb486b0e/binder/phylogenetic-inference/a=pairwise_mrca_estimates+source=nk_randomselection_seed7_pop100_mut.01_snapshot_5000.csv.gz',
    'https://github.com/mmore500/hereditary-stratigraph-concept/blob/26405273f59455d45002ad0745a12689bb486b0e/binder/phylogenetic-inference/a=pairwise_mrca_estimates+source=nk_sharingselection_seed10_pop100_mut.01_snapshot_5000.csv.gz',
    'https://github.com/mmore500/hereditary-stratigraph-concept/blob/26405273f59455d45002ad0745a12689bb486b0e/binder/phylogenetic-inference/a=pairwise_mrca_estimates+source=nk_tournamentselection_seed140_pop100_mut.01_snapshot_5000.csv.gz',
]

# One parameter pack per dataset; each pack supplies only `data_url`.
parameter_packs = [{'data_url': url} for url in data_urls]

endomill.instantiate_over(parameter_packs=parameter_packs)


In [None]:
# Define papermill parameters.
# This is a bare annotation: it declares `data_url` without binding a value in
# this cell (presumably the value is injected per instance from the parameter
# packs passed to endomill — TODO confirm).
data_url: str


In [None]:
import alifedata_phyloinformatics_convert as apc
import itertools as it
from keyname import keyname as kn
from matplotlib import pyplot as plt
import numpy as np
from os.path import basename
import pandas as pd
import scipy
from scipy.cluster.hierarchy import dendrogram as scipy_dendrogram
from scipy.cluster.hierarchy import linkage as scipy_linkage
from scipy.spatial.distance import squareform as scipy_squareform
import sys
from teeplot import teeplot as tp
from tqdm import tqdm


In [None]:
# Emit the nbmetalog metadata printout again, after the parameter cell
# (presumably so each instantiated notebook records its own context — TODO confirm).
nbm.print_metadata()


In [None]:
# File name of the dataset: the final path component of the URL.
data_filename = basename(data_url)

# Read the gzip-compressed CSV from the URL with `?raw=true` appended.
raw_data_url = f'{data_url}?raw=true'
df = pd.read_csv(raw_data_url, compression='gzip')

nbm.print_dataframe_synopsis(df)

print(data_filename)


In [None]:
# Output notebook path for this instance, built from the `source` attribute
# unpacked from the dataset file name.
instance_outpath = (
    f'a=phylogeny_reconstruction+source={kn.unpack(data_filename)["source"]}.endomill.ipynb'
)
endomill.add_instance_outpath(instance_outpath)


In [None]:
def _pack_treatment(row):
    """Pack one row's column-configuration fields into a single treatment label."""
    return kn.pack({
        'target': row['Stratigraphic Column Target Retained Bits'],
        'differentia': row['Differentia Bit Width'],
        'policy': row['Stratum Retention Policy'],
    })


# Row-wise application: every row gets the label built from its own fields.
df['Treatment'] = df.apply(_pack_treatment, axis=1)


In [None]:
# Largest value in the 'Generation of Taxon Compared From' column. Read as a
# global by create_distance_matrix (distance reference point) and by
# construct_linkage (NaN fill value, max_gen + 1).
max_gen = df['Generation of Taxon Compared From'].max()


In [None]:
def create_distance_matrix(df):
    """Build a square taxon-by-taxon distance matrix for a single treatment.

    The distance stored for an ordered pair (from, to) is
    ``max_gen - row['Generation Of MRCA Upper Bound (exclusive)']``, where
    ``max_gen`` is the notebook-level global. Diagonal entries are 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Pairwise comparison rows for exactly one treatment. The columns read
        are 'Treatment', 'Taxon Compared From', 'Taxon Compared To' and
        'Generation Of MRCA Upper Bound (exclusive)'.

    Returns
    -------
    distance_matrix : numpy.ndarray
        Float array of shape (n, n), n being the number of distinct
        'Taxon Compared From' values.
    label_list : list
        Taxon labels in first-appearance order; position k labels row and
        column k of ``distance_matrix``.

    Raises
    ------
    AssertionError
        If ``df`` does not contain exactly one distinct treatment.
    KeyError
        If a 'Taxon Compared To' value never appears as a
        'Taxon Compared From' value, or an off-diagonal pair has no row.
    """
    # The unique values must be counted *before* comparing with 1. Comparing
    # first and then taking len() measures a boolean array, which is truthy
    # for any non-empty frame and therefore never rejects mixed treatments.
    assert len(df['Treatment'].unique()) == 1, (
        'create_distance_matrix expects rows from exactly one treatment'
    )

    # Index taxa by order of first appearance; label_list[k] names index k.
    label_list = [*df['Taxon Compared From'].unique()]
    taxon_to_index = {taxon: index for index, taxon in enumerate(label_list)}
    assert len(label_list) == len(taxon_to_index)

    # If an ordered pair occurs in several rows, the last row wins.
    distance_dict = {
        (taxon_to_index[taxon_from], taxon_to_index[taxon_to]):
            max_gen - mrca_upper_bound
        for taxon_from, taxon_to, mrca_upper_bound in zip(
            df['Taxon Compared From'],
            df['Taxon Compared To'],
            df['Generation Of MRCA Upper Bound (exclusive)'],
        )
    }

    num_taxa = len(label_list)
    distance_matrix = np.array([
        [
            0.0 if i == j else float(distance_dict[(i, j)])
            for j in range(num_taxa)
        ]
        for i in range(num_taxa)
    ])

    return distance_matrix, label_list


In [None]:
def construct_linkage(df):
    """Compute a complete-linkage hierarchical clustering for one treatment.

    Parameters
    ----------
    df : pandas.DataFrame
        Pairwise comparison rows for exactly one treatment; passed on to
        ``create_distance_matrix``.

    Returns
    -------
    linkage_matrix : numpy.ndarray
        Result of ``scipy.cluster.hierarchy.linkage`` with
        ``method='complete'`` on the condensed distance matrix.
    label_list : list
        Taxon labels matching the observation order of the linkage.

    Raises
    ------
    AssertionError
        If ``df`` does not contain exactly one distinct treatment.
    """
    # Count the unique treatments first, then compare with 1. Taking len() of
    # the elementwise comparison instead passes for any non-empty frame.
    assert len(df['Treatment'].unique()) == 1, (
        'construct_linkage expects rows from exactly one treatment'
    )

    distance_matrix, label_list = create_distance_matrix(df)

    # Replace NaN distances in place with max_gen + 1, which is one more than
    # the largest distance a row with MRCA bound 0 could produce.
    np.nan_to_num(distance_matrix, nan=max_gen+1, copy=False)

    # linkage takes the condensed (upper-triangle vector) form as input.
    condensed_distance_matrix = scipy_squareform(distance_matrix)
    linkage_matrix = scipy_linkage(
        condensed_distance_matrix,
        method='complete',
    )
    return linkage_matrix, label_list


In [None]:
# Per-treatment groups, wrapped in a progress bar.
treatment_groups = tqdm(df.groupby('Treatment'))

# Maps each treatment label to its (linkage matrix, label list) pair.
linkages = dict(
    (treatment, construct_linkage(group))
    for treatment, group in treatment_groups
)


In [None]:
# Draw one dendrogram per treatment; teeplot receives the treatment as an
# output attribute.
for treatment, linkage_and_labels in linkages.items():
    linkage, label_list = linkage_and_labels
    outattrs = {
        'treatment' : kn.demote(treatment),
    }
    tp.tee(
        scipy_dendrogram,
        linkage,
        labels=label_list,
        orientation='top',
        distance_sort='descending',
        teeplot_outattrs=outattrs,
    )
    plt.show()


In [None]:
# Convert each treatment's scipy linkage matrix into a dendropy tree, passing
# the taxon labels as leaf labels.
trees = {
    treatment: apc.scipy_linkage_matrix_to_dendropy_tree(
        linkage,
        leaf_taxon_labels=label_list,
    )
    for treatment, (linkage, label_list) in linkages.items()
}


In [None]:
# Convert each treatment's tree into an alife-standard dataframe.
alife_dataframes = {
    treatment: apc.dendropy_tree_to_alife_dataframe(tree)
    for treatment, tree in trees.items()
}

# Tag every frame with its treatment. The loop variable is deliberately not
# named `df`, so the notebook-level input frame `df` is not overwritten and
# earlier cells stay re-runnable.
for treatment, alife_df in alife_dataframes.items():
    alife_df['Treatment'] = treatment


In [None]:
# Stack the per-treatment frames, then move the existing index into a column
# and renumber the rows.
stacked_alife_df = pd.concat(alife_dataframes.values())
res_df = stacked_alife_df.reset_index()


In [None]:
# Show the combined frame as this cell's output.
res_df


In [None]:
# Print nbmetalog's synopsis of the combined results frame.
nbm.print_dataframe_synopsis(res_df)


In [None]:
# Output path is keyed by the `source` attribute unpacked from the dataset
# file name; the CSV is written gzip-compressed.
reconstructed_phylogenies_path = (
    f'a=reconstructed_phylogenies+source={kn.unpack(data_filename)["source"]}'
)
res_df.to_csv(reconstructed_phylogenies_path, compression='gzip')
