Get strain count

In [8]:
import pandas as pd
import numpy as np

In [10]:
lineage_calls = pd.read_csv(
    '01_variant_calling/biosample_results.postfilter.csv', index_col=0).call
np.sum(
    np.isin(lineage_calls, ['lineage1', 'lineage2', 'lineage3', 'lineage4']))

np.int64(55879)

Check tree integrity
- Get set of strains in (DS or DR) subtree -> subtree_leaves
- For each node on the subtree:
    - Get the set of leaves on the subtree -> mrca_set
    - Get the MRCA node of these strains on the full tree.
    - Get the set of MRCA node leaves (supertree_set) that also exists in subtree set -> subtree_set
    - The MRCA set and subtree set must exactly match.

In [36]:
import mtbvartools as vt
from tqdm import tqdm
import pandas as pd

In [34]:
supertree_path = 'datasets/variants/global/241104_ancestor_calls.vcb/tree.nwk'
subtree_path = 'datasets/variants/global/250209_abxsen_ancestor_calls.vcb/tree.nwk'
prune_labels = ['canettii', 'SRR10522783']

# load trees
supertree = vt.loadTree(supertree_path)
supertree.prune_taxa_with_labels(prune_labels)

subtree = vt.loadTree(subtree_path)
subtree.prune_taxa_with_labels(prune_labels)

subtree_leaves = {ln.label for ln in subtree.leaf_nodes()}

# run comparison algorithm
output_dict = {}
for i, node in enumerate(tqdm(list(subtree.postorder_internal_node_iter()))):
    mrca_set = {ln.label for ln in node.leaf_nodes()}
    supertree_set = {ln.label for ln in supertree.mrca(taxon_labels=mrca_set).leaf_nodes()}
    output_dict[i] = {
        'len_mrca_set': len(mrca_set),
        'len_supertree_set': len(supertree_set),
        'is_match': supertree_set.intersection(subtree_leaves) == mrca_set}

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 30844/30844 [37:38<00:00, 13.65it/s]


In [43]:
dataset = pd.DataFrame(output_dict).T
dataset.index = [ln.label for ln in subtree.postorder_internal_node_iter()]
dataset

Unnamed: 0,len_mrca_set,len_supertree_set,is_match
internal1,2,2,True
internal2,3,3,True
internal3,2,2,True
internal4,2,2,True
internal5,3,3,True
...,...,...,...
internal30840,11134,17196,True
internal30841,12264,19718,True
internal30842,18886,28817,True
internal30843,27903,55259,False


Based on tree topology, above argues that the 30843 child leaf nodes includes a L1 strain - this does not appear to be the case....

In [52]:
subtree_30843 = {ln.label for ln in subtree.find_node_with_label('internal30843').leaf_nodes()}
supertree_L1 = {ln.label for ln in supertree.find_node_with_label('internal4119').leaf_nodes()}

In [56]:
supertree_L1.intersection(subtree_30843), len(subtree_30843), len(supertree_L1)

(set(), 27903, 4120)

In [57]:
mrca = {ln.label for ln in supertree.mrca(taxon_labels=subtree_30843).leaf_nodes()}

In [58]:
len(mrca)

55259

I think this is some artefact of rooting and since it occurs above all of my ABX sensitive/resistant analyses, I don't think it matters.

In [59]:
supertree_path = 'datasets/variants/global/241104_ancestor_calls.vcb/tree.nwk'
subtree_path = 'datasets/variants/global/250216_abxres_ancestor_calls.vcb/tree.nwk'
prune_labels = ['canettii', 'SRR10522783']

# load trees
supertree = vt.loadTree(supertree_path)
supertree.prune_taxa_with_labels(prune_labels)

subtree = vt.loadTree(subtree_path)
subtree.prune_taxa_with_labels(prune_labels)

subtree_leaves = {ln.label for ln in subtree.leaf_nodes()}

# run comparison algorithm
output_dict = {}
for i, node in enumerate(tqdm(list(subtree.postorder_internal_node_iter()))):
    mrca_set = {ln.label for ln in node.leaf_nodes()}
    supertree_set = {ln.label for ln in supertree.mrca(taxon_labels=mrca_set).leaf_nodes()}
    output_dict[i] = {
        'len_mrca_set': len(mrca_set),
        'len_supertree_set': len(supertree_set),
        'is_match': supertree_set.intersection(subtree_leaves) == mrca_set}

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 24327/24327 [38:28<00:00, 10.54it/s]


In [62]:
dataset = pd.DataFrame(output_dict).T
dataset.index = [ln.label for ln in subtree.postorder_internal_node_iter()]
dataset

Unnamed: 0,len_mrca_set,len_supertree_set,is_match
internal1,2,2,True
internal2,2,2,True
internal3,2,3,True
internal4,3,9,True
internal5,5,15,True
...,...,...,...
internal24323,6062,17196,True
internal24324,7454,19718,True
internal24325,9931,28817,True
internal24326,23236,55259,False
