# Definitions

In [1]:
from ete3 import Tree
from tqdm import tqdm
import json

# Clustering

In [2]:
number_of_divisions = 50
# For every threshold n, reload the original tree, collapse internal nodes
# according to the threshold, and save the result as tree_simplified_{n}.nw.
for n in tqdm(range(1, number_of_divisions+1)):
    tree = Tree('tree')  # start from the unmodified input tree for each n
    # NOTE(review): the tree is modified while it is being traversed; this is
    # kept exactly as before — confirm ete3's traversal handles it as intended.
    for node in tree.traverse('postorder'):
        if not node.is_leaf():
            left_child = node.children[0]
            right_child = node.children[1]
            # The threshold is compared against the number of *direct* children
            # of each of the first two children (not their descendant leaves).
            if len(left_child.children) <= n or len(right_child.children) <= n:
                # Only internal children are removed; leaves are always kept.
                # Per the ete3 docs, delete() re-attaches the removed node's
                # children to its parent (here: `node`).
                if not left_child.is_leaf():
                    left_child.delete()
                if not right_child.is_leaf():
                    right_child.delete()
    # Write once per n, after the traversal is complete. Writing inside the
    # traversal loop re-serialised the whole tree for every visited node and
    # only the last write survived, so the resulting file is the same.
    tree.write(format=1, outfile=f"tree_simplified_{n}.nw")
    # A cluster is identified by the distinct parent of one or more leaves.
    print(f' n={n}, number_of_clusters={len(set([leaf.up for leaf in tree]))}')
        

  2%|▏         | 1/50 [00:43<35:34, 43.55s/it] n=1, number_of_clusters=282
  4%|▍         | 2/50 [01:00<28:26, 35.56s/it] n=2, number_of_clusters=73
  6%|▌         | 3/50 [01:18<23:39, 30.21s/it] n=3, number_of_clusters=43
  8%|▊         | 4/50 [01:34<20:03, 26.16s/it] n=4, number_of_clusters=33
 10%|█         | 5/50 [01:51<17:28, 23.29s/it] n=5, number_of_clusters=27
 12%|█▏        | 6/50 [02:08<15:37, 21.32s/it] n=6, number_of_clusters=23
 14%|█▍        | 7/50 [02:25<14:27, 20.17s/it] n=7, number_of_clusters=19
 16%|█▌        | 8/50 [02:42<13:25, 19.19s/it] n=8, number_of_clusters=17
 18%|█▊        | 9/50 [03:33<19:33, 28.63s/it] n=9, number_of_clusters=17
 20%|██        | 10/50 [04:06<20:04, 30.12s/it] n=10, number_of_clusters=17
 22%|██▏       | 11/50 [04:24<17:04, 26.27s/it] n=11, number_of_clusters=11
 24%|██▍       | 12/50 [04:41<14:51, 23.46s/it] n=12, number_of_clusters=9
 26%|██▌       | 13/50 [04:58<13:17, 21.56s/it] n=13, number_of_clusters=9
 28%|██▊       | 14/50 [05:15<1

In [7]:
# Load the simplified tree written by the clustering cell for n=12.
simplified_tree = Tree('tree_simplified_12.nw')

# Every distinct parent of a leaf is one cluster. dict.fromkeys de-duplicates
# while keeping first-seen leaf order, so cluster ids are reproducible between
# runs (iterating a set of node objects gives an arbitrary order).
cluster_parents = list(dict.fromkeys(leaf.up for leaf in simplified_tree))

# Map cluster id -> names of the leaves directly under that parent, keeping
# only the part of each leaf name before the first '.'.
clusters = {
    cluster_id: [child.name.split('.')[0] for child in parent.children if child.is_leaf()]
    for cluster_id, parent in enumerate(cluster_parents)
}

# Persist the clusters; JSON turns the integer ids into string keys.
with open('clusters.json', 'w') as f:
    json.dump(clusters, f)