### Packages

In [1]:
# Uncomment to install DendroPy; `%pip` (rather than `!pip`) targets the kernel's environment.
# %pip install dendropy

In [2]:
import sys  
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dendropy

### Paths

In [3]:
# Show the kernel's working directory; the relative paths configured below resolve against it.
# os.getcwd() is used instead of the `!pwd` shell escape so the cell also runs on Windows.
print(os.getcwd())

/Users/yinzheng/Documents/GitHub/gem-analysis/build_tree


In [4]:
# Directories used for reading the input tree (INPUT_DIR) and writing results (SAVE_DIR).
# With an empty string, os.path.join() returns the bare file name, so files are
# resolved relative to the kernel's current working directory.
# Alternative absolute locations (presumably the shared cluster project folder), kept for reference:
# INPUT_DIR = '/oscar/data/biol1525_s24/project_folders/phageassoc_group/gem-analysis/build_tree'
# SAVE_DIR = '/oscar/data/biol1525_s24/project_folders/phageassoc_group/gem-analysis/build_tree'
INPUT_DIR = ''
SAVE_DIR = ''
# Name of the Newick file to load; it is joined onto INPUT_DIR when the tree is read.
TREE_FILE = 'example_tree.newick'

### DendroPy references

- [Newick schema: reading and writing options](https://dendropy.readthedocs.io/en/main/schemas/newick.html)
- [Tree model API (`Tree`, `Node`, `Edge`)](https://dendropy.readthedocs.io/en/main/library/treemodel.html)


### Read tree from Newick file

In [5]:
# Parse the Newick file at INPUT_DIR/TREE_FILE into a DendroPy Tree. The result is bound
# to `tree`, which the cells below filter and save.
# Every reader option is spelled out explicitly so the parsing behaviour is visible in
# the notebook; see the DendroPy Newick schema reference for the meaning of each keyword.
# NOTE(review): these values look like the documented reader defaults — confirm before trimming any.
tree = dendropy.Tree.get(
    path=os.path.join(INPUT_DIR, TREE_FILE),
    schema="newick",
    # --- generic Tree.get() options ---
    label=None,
    taxon_namespace=None,
    collection_offset=None,
    tree_offset=None,
    # --- Newick parsing options ---
    rooting="default-unrooted",
    edge_length_type=float,
    suppress_edge_lengths=False,
    extract_comment_metadata=True,
    store_tree_weights=False,
    finish_node_fn=None,
    case_sensitive_taxon_labels=False,
    preserve_underscores=False,
    suppress_internal_node_taxa=True,
    suppress_leaf_node_taxa=False,
    terminating_semicolon_required=True,
    ignore_unrecognized_keyword_arguments=False,
)

### Functions

In [6]:
def get_subtree(tree, taxa_labels, is_keyword_search=True):
    """
    Extract the part of a tree that is restricted to a selection of taxa.

    Args:
        tree (Tree): The tree object to extract the subtree from. Only its
            `taxon_namespace` and `extract_tree_with_taxa_labels` are used.
        taxa_labels (str or list of str): Taxa labels to keep, or keywords to
            look for inside taxa labels. A single string is treated as one
            label/keyword, not as a sequence of characters.
        is_keyword_search (bool): If True, every taxon in the tree's namespace
            whose label contains at least one of the keywords is selected.
            If False, `taxa_labels` is used as the selection as given.

    Returns:
        subtree: Whatever `tree.extract_tree_with_taxa_labels` returns for the
            selected labels.

    """
    # A bare string would otherwise be iterated character by character below
    # (keyword mode) or handed on as a raw string (exact mode).
    if isinstance(taxa_labels, str):
        taxa_labels = [taxa_labels]

    filter_taxa_labels = taxa_labels

    if is_keyword_search:
        # Keep each namespace label that contains at least one keyword as a substring.
        filter_taxa_labels = [
            taxon.label
            for taxon in tree.taxon_namespace
            if any(keyword in taxon.label for keyword in taxa_labels)
        ]

    return tree.extract_tree_with_taxa_labels(filter_taxa_labels)

In [7]:
def save_tree_vis(tree, file_name, is_print=False):
    """
    Render a tree as an ASCII plot and write it to a text file in SAVE_DIR.

    Args:
        tree (Tree): The tree object to render; it must provide `as_ascii_plot()`.
        file_name (str): The name of the output file, joined onto SAVE_DIR.
        is_print (bool): If True, the rendering is also printed once the file
            has been written.

    """
    rendering = tree.as_ascii_plot()
    target_path = os.path.join(SAVE_DIR, file_name)

    with open(target_path, 'w') as out_file:
        out_file.write(rendering)

    if not is_print:
        return
    print(rendering)

In [8]:
def save_tree_newick(tree, file_name):
    """
    Serialize a tree to a Newick string and write it to a text file in SAVE_DIR.

    Args:
        tree (Tree): The tree object to serialize; it must provide `as_string(schema=...)`.
        file_name (str): The name of the output file, joined onto SAVE_DIR.

    """
    target_path = os.path.join(SAVE_DIR, file_name)

    with open(target_path, 'w') as out_file:
        # The tree is serialized only after the output file has been opened.
        newick_text = tree.as_string(schema='newick')
        out_file.write(newick_text)

### Save and visualize original and filtered trees

In [9]:
# original tree
# Write the ASCII rendering of the full tree to 'tree.txt' in SAVE_DIR
# (is_print is left at its default of False, so nothing is printed here).
save_tree_vis(tree, 'tree.txt')
# Peek at the first five labels in the tree's taxon namespace.
print([taxon.label for taxon in tree.taxon_namespace][:5])
# Number of leaves in the full tree; displayed as the cell's result.
len(tree.leaf_nodes())

['KM279529.1', 'KM279544.1', 'KM279551.1', 'KM279530.1', 'KX380812.1']


165

In [10]:
# filter on exact taxa labels (keyword matching switched off)
subtree_1 = get_subtree(
    tree,
    [
        'MK564482.1',
        'KU365902.1',
        'KU365903.1',
        'KX225485.1',
        'KT187555.1',
        'MK564487.1',
        'EU482640.1',
    ],
    is_keyword_search=False,
)
# save the ASCII rendering to SAVE_DIR and echo it below
save_tree_vis(subtree_1, 'subtree_1.txt', is_print=True)
# leaf count of the filtered tree, shown as the cell's result
len(subtree_1.leaf_nodes())

                      /--------------------------------------------- MK564482.1
/---------------------+                                                        
|                     |                      /---------------------- KU365902.1
|                     \----------------------+                                 
|                                            \---------------------- KU365903.1
+                                                                              
|                                            /---------------------- KX225485.1
|                     /----------------------+                                 
|                     |                      \---------------------- KT187555.1
\---------------------+                                                        
                      |                      /---------------------- MK564487.1
                      \----------------------+                                 
                                        

7

In [11]:
# Write subtree_1 as a Newick string to 'subtree_1.newick' in SAVE_DIR.
save_tree_newick(subtree_1, 'subtree_1.newick')

In [12]:
# filter on taxa label keywords
# is_keyword_search defaults to True, so every taxon whose label contains
# 'KY' or 'FJ' as a substring is kept.
subtree_2 = get_subtree(tree, ['KY', 'FJ'])
# Save the ASCII rendering to 'subtree_2.txt' in SAVE_DIR and also print it below.
save_tree_vis(subtree_2, 'subtree_2.txt', is_print=True)
# Number of leaves in the keyword-filtered tree; displayed as the cell's result.
len(subtree_2.leaf_nodes())

/------------------------------------------------------------------- KY794785.1
|                                                                              
|            /------------------------------------------------------ FJ467493.1
|            |                                                                 
|            |                                               /------ FJ850066.1
|            |                                        /------+                 
|            |                                        |      \------ FJ850115.1
|     /------+      /---------------------------------+                        
+     |      |      |                                 |      /------ FJ850106.1
|     |      |      |                                 \------+                 
|     |      |      |                                        \------ KY474310.1
|     |      |      |                                                          
|     |      |      |             /-----

15