# Phylo experiments

## import necessary classes

In [None]:
import numpy as np
from barcode import Barcode
from cell_state import CellTypeTree, CellState
from cell_state_simulator import CellTypeSimulator
from clt_simulator import CLTSimulator
from barcode_simulator import BarcodeSimulator
from alignment import AlignerNW
from clt_observer import ObservedAlignedSeq, CLTObserver
from clt_estimator import CLTParsimonyEstimator
from collapsed_tree import CollapsedTree
from alignment import AlignerNW
from IPython.display import display
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from itertools import product
from random import shuffle

## define simulation parameters


In [None]:
# ---- lineage tree ----
# how long the tree is simulated for
time = 4
# birth and death rates of the cell branching process
birth_lambda = 2
death_lambda = 1

# ---- barcode editing (one entry per target, 10 targets) ----
# poisson rates for DSBs on targets
target_lambdas = np.full(10, 1)
# poisson rates for NHEJ on targets
repair_lambdas = np.full(10, 150)
# probability that a repair is imperfect
indel_probability = .1
# average deletion length to the left and to the right
left_deletion_mu = 5
right_deletion_mu = 5
# average insertion length, and its dispersion (large dispersion for insertions)
insertion_mu = 1
insertion_alpha = 10

# ---- observation ----
# base mismatch rate in the observed sequences, e.g. from sequencing error
error_rate = 0.005
# fraction of the simulated leaves that gets sampled
sampling_rate = 0.5
# Needleman-Wunsch aligner
aligner = AlignerNW()

## define a trivial cell-type tree with 1 type (we're not modeling cell types in this analysis)

In [None]:
# Trivial one-type tree: no cell-type label (cell_type=None), since cell types
# are not modelled in this analysis.
# NOTE(review): the meaning/units of `rate` are defined by CellTypeTree - confirm there.
cell_type_tree = CellTypeTree(cell_type=None, rate=1)

## instantiate barcode and tree simulators, leaf observer, and parsimony estimator

In [None]:
# barcode simulator, configured from the editing parameters defined above
# (the rate arrays are passed as fresh np.array copies)
bcode_simulator = BarcodeSimulator(
    target_lambdas=np.array(target_lambdas),
    repair_rates=np.array(repair_lambdas),
    indel_probability=indel_probability,
    left_del_mu=left_deletion_mu,
    right_del_mu=right_deletion_mu,
    insertion_mu=insertion_mu,
    insertion_alpha=insertion_alpha,
)

# cell state simulator driven by the cell-type tree
cell_state_simulator = CellTypeSimulator(cell_type_tree)

# the cell lineage tree (CLT) simulator ties together the branching parameters,
# the cell state simulator and the barcode simulator
clt_simulator = CLTSimulator(
    birth_lambda,
    death_lambda,
    cell_state_simulator,
    bcode_simulator,
)

# observer: samples the leaves of a simulated tree, with some error
observer = CLTObserver(
    sampling_rate=sampling_rate,
    error_rate=error_rate,
    aligner=aligner,
)

# tree estimator based on PHYLIP Mix
clt_estimator = CLTParsimonyEstimator()

## simulate a cell lineage tree (clt)

In [None]:
import warnings

# Keep simulating until a tree has at least n_leaves leaves, making at most
# max_trials attempts. If every attempt falls short we carry on with the last
# simulated tree, but say so explicitly instead of failing silently.
n_leaves = 500
max_trials = 1000
for trial in range(1, max_trials + 1):
    simulated_clt = clt_simulator.simulate(Barcode(),
                                           CellState(categorical=cell_type_tree),
                                           time)
    # '\r' keeps the progress report on a single, continuously rewritten line
    print('try {}, {} leaves'.format(trial, len(simulated_clt)), end='\r')
    if len(simulated_clt) >= n_leaves:
        break
else:
    # only reached when the loop ran out of trials without hitting `break`
    warnings.warn('no simulated tree reached {} leaves in {} trials; '
                  'continuing with the last tree ({} leaves)'
                  .format(n_leaves, max_trials, len(simulated_clt)))

## Sample the simulated tree

In [None]:
# observe_leaves returns the unique sampled sequences, and the tree with
# unobserved lineages pruned
obs_leaves, pruned_clt = observer.observe_leaves(simulated_clt)
# number of sampled cells carrying each unique barcode sequence
abundances = [leaf.abundance for leaf in obs_leaves]
print('cells sampled: {}'.format(sum(abundances)))
print('unique barcode sequences: {}'.format(len(obs_leaves)))
print('maximum abundance: {}'.format(max(abundances)))
print('abundance distribution:')
# Unit-width bins centred on the integers 1..max(abundances).
# np.arange excludes its stop value, so the stop must be max + 1.5 for the
# last edge to land at max + .5; stopping at max + .5 would leave the most
# abundant barcodes out of the histogram.
sns.distplot(abundances, color='grey', kde=False,
             bins=np.arange(.5, max(abundances) + 1.5),
             hist_kws=dict(edgecolor='k', lw=2))
plt.xlabel('cell abundance')
plt.ylabel('unique barcodes')
sns.despine()
plt.show()

## Collapsed tree (deduplicate repeated taxa)
- branch lengths in collapsed tree correspond to event set difference rather than time
- genotype abundance indicated by number and bars on right
- if we end up with homoplasy (repeated genotypes that aren't sisters) these will still be repeated in the tree

In [None]:
# operate on a copy rather than on pruned_clt itself
collapsed_clt = pruned_clt.copy()
# branch length := number of events found in exactly one of (node, parent)
for node in collapsed_clt.iter_descendants():
    own_events = set(node.barcode_events.events)
    parent_events = set(node.up.barcode_events.events)
    node.dist = len(own_events.symmetric_difference(parent_events))
# deduplicate repeated taxa, keeping the leaves
collapsed_clt = CollapsedTree.collapse(collapsed_clt, preserve_leaves=True)

# editing profile plot, as in Aaron et al.
# (pass a file name, e.g. 'profile.pdf', to editing_profile to save it)
collapsed_clt.editing_profile()
plt.show()

# ladderize, then render the tree together with the alignment inline
collapsed_clt.ladderize()
tree_image = collapsed_clt.savefig("%%inline")
display(tree_image)

# Now we estimate the tree and compare to the truth
## First define a custom tree distance function that can handle repeated genotypes in the true tree
The idea is to aggregate (e.g. average) over the possible choices of which copy of each repeated genotype to keep; to bound the run time, at most 20 randomly chosen combinations are evaluated.

In [None]:
def _sample_repeat_resolutions(leaf_groups, max_resolutions):
    """
    pick up to max_resolutions distinct resolutions of the repeated genotypes
    leaf_groups is a list of lists of leaf names, one list per genotype
    a resolution is a tuple holding one leaf name from every group
    returns a list of resolutions, in random order and without duplicates
    """
    # local import: the file-level imports only bring in shuffle from random
    from random import randrange

    n_resolutions = 1
    for group in leaf_groups:
        n_resolutions *= len(group)

    if n_resolutions <= max_resolutions:
        # few enough to enumerate exhaustively
        resolutions = list(product(*leaf_groups))
        shuffle(resolutions)
        return resolutions

    # The number of resolutions grows exponentially with the number of
    # repeated genotypes, so never materialize the full product here: draw
    # resolutions uniformly at random and reject duplicates instead.
    # Duplicates are tracked by index rather than by name, so that
    # n_resolutions > max_resolutions guarantees the loop terminates even if
    # leaf names are not unique.
    picked = set()
    resolutions = []
    while len(resolutions) < max_resolutions:
        indices = tuple(randrange(len(group)) for group in leaf_groups)
        if indices not in picked:
            picked.add(indices)
            resolutions.append(tuple(group[i]
                                     for group, i in zip(leaf_groups, indices)))
    return resolutions


def my_rf(tree1, tree2, max_resolutions=20):
    """
    custom Robinson-Foulds tree distance that aggregates over resolutions of repeated genotypes
    tree1 has no repeats
    tree2 may have repeats (several leaves sharing the same barcode_events)
    max_resolutions caps how many randomly chosen resolutions are evaluated
    returns list of RF values, one per evaluated resolution
    raises ValueError if max_resolutions < 1

    a resolution keeps exactly one leaf of tree2 per distinct barcode_events and
    prunes the others from a copy of tree2; tree2 itself is not pruned
    intended precondition (not enforced): both trees carry the same set of
    barcode_events on their leaves
    """
    if max_resolutions < 1:
        raise ValueError('max_resolutions must be at least 1')

    # group the leaf names of tree2 by genotype
    repeats = defaultdict(list)
    for leaf in tree2:
        repeats[leaf.barcode_events].append(leaf.name)

    rfs = []
    for resolution in _sample_repeat_resolutions(list(repeats.values()),
                                                 max_resolutions):
        kept_names = set(resolution)
        tree2_copy = tree2.copy()
        # snapshot the leaves to prune before deleting any of them, so the
        # tree is never modified while it is being iterated
        for leaf in [leaf for leaf in tree2_copy if leaf.name not in kept_names]:
            leaf.delete()
        rf = tree1.robinson_foulds(tree2_copy,
                                   unrooted_trees=True,
                                   attr_t1='barcode_events',
                                   attr_t2='barcode_events')
        # NOTE(review): rf[2] is presumably the set of leaves shared by both
        # trees (ete3-style return value) - confirm; an empty one would make
        # the distance meaningless
        assert len(rf[2]) > 0
        rfs.append(rf[0])
    return rfs

## Use the PHYLIP Mix estimator to get the set of most parsimonious trees
- this could take a while
- we collapse zero-length branches in the binary trees to generate unique multifurcating trees
- we run twice, encoding hidden states as absent (0), or as unknown (?)
- **hypothesis:** the latter will result in more accurate reconstructions

In [None]:
# Run the parsimony estimator twice on the same observed leaves; only the
# encode_hidden flag differs. Each result is a collection of trees.
# encode_hidden=False: hidden events encoded as absent (0)
parsimony_clts = clt_estimator.estimate(obs_leaves, encode_hidden=False)
# encode_hidden=True: hidden events encoded as unknown (?)
parsimony_clts_hidden = clt_estimator.estimate(obs_leaves, encode_hidden=True)

## Distribution of distance between true tree and each of the parsimony trees
### we use the mean of the results of different repeat resolutions from the function above

In [None]:
# Mean Robinson-Foulds distance (averaged over the repeat resolutions returned
# by my_rf) between the collapsed true tree and each parsimony tree, for both
# encodings of hidden events.
RF = [np.mean(my_rf(parsimony_clt, collapsed_clt))
      for parsimony_clt in parsimony_clts]
RF_hidden = [np.mean(my_rf(parsimony_clt, collapsed_clt))
             for parsimony_clt in parsimony_clts_hidden]
# to inspect an individual tree: parsimony_clt.ladderize(), then
# display(parsimony_clt.savefig("%%inline"))

# Unit-width bins centred on the integers, starting at 0: the first edge is
# -.5 so that a perfectly reconstructed tree (distance 0) is counted instead
# of falling outside the bins.
plt.hist([RF, RF_hidden],
         bins=np.arange(-.5, np.ceil(max(RF + RF_hidden)) + 1.5),
         color=('green', 'red'),
         edgecolor='k',
         label=('hidden events encoded as 0', 'hidden events encoded as ?'),
         stacked=True)
plt.legend()
# left limit matches the first bin edge so the distance-0 bar is fully visible
plt.xlim([-.5, None])
plt.xlabel('mean Robinson-Foulds distance to true tree')
plt.ylabel('number of parsimony trees')
plt.show()