In [3]:
from collections import defaultdict

import pandas as pd

In [26]:
path_to_species = "../data/interim/species_birds_in_tree.txt"
path_to_orders_tree = "../data/interim/orders.tre"
path_to_taxa = "../data/interim/taxonomy-9.1.csv"

In [27]:
with open(path_to_orders_tree) as fin:
    tree_str = fin.read().strip()
with open(path_to_species) as fin:
    species = list(map(str.strip, fin.readlines()))
    genuses = [x.split('_')[0] for x in species]

taxa = pd.read_csv(path_to_taxa)
taxa["Species"] = taxa["Species"].str.replace(" ", "_")

In [6]:
unk_genuses = set(genuses).difference(taxa.Genus)
unk_genuses

{'Megalaima', 'Melophus', 'Padda', 'Uragus', 'Vestiaria'}

In [7]:
[sp for sp in species if sp.split("_")[0] in unk_genuses]

['Megalaima_virens',
 'Melophus_lathami',
 'Padda_oryzivora',
 'Uragus_sibiricus',
 'Vestiaria_coccinea']

In [8]:
genus_updater = dict(
    Melophus="Emberiza",
    Megalaima="Psilopogon",
    Uragus="Carpodacus",
    Vestiaria="Drepanis",
    Padda="Lonchura",
)

In [9]:
sp2order = dict(taxa[taxa.Species.isin(species)][["Species", "Order"]].values)


In [10]:
for sp in set(species).difference(taxa.Species):
    genus = sp.split("_")[0]
    genus = genus_updater.get(genus) or genus

    order = taxa[taxa.Genus == genus].Order.values[0]
    assert isinstance(order, str)
    sp2order[sp] = order

assert len(sp2order) == len(species)

In [11]:
order2sp = defaultdict(list)
for sp, order in sp2order.items():
    order2sp[order].append(sp)

In [74]:
for order in order2sp:
    if order not in tree_str:
        print(order)

### All orders in constyraint tree!

In [1]:
from ete3 import PhyloTree

In [2]:
PhyloTree("../data/interim/constraint.tre").show()

### But some orders in tree are redundant!!!

In [12]:
import re

In [28]:
orders_from_tree = re.findall("[^\(\),;]+", tree_str)

In [29]:
order2sp.keys()

dict_keys(['Struthioniformes', 'Rheiformes', 'Apterygiformes', 'Casuariiformes', 'Tinamiformes', 'Anseriformes', 'Galliformes', 'Gaviiformes', 'Sphenisciformes', 'Procellariiformes', 'Podicipediformes', 'Phoenicopteriformes', 'Phaethontiformes', 'Ciconiiformes', 'Pelecaniformes', 'Suliformes', 'Accipitriformes', 'Otidiformes', 'Eurypygiformes', 'Gruiformes', 'Charadriiformes', 'Pterocliformes', 'Columbiformes', 'Musophagiformes', 'Cuculiformes', 'Strigiformes', 'Caprimulgiformes', 'Apodiformes', 'Coliiformes', 'Trogoniformes', 'Coraciiformes', 'Bucerotiformes', 'Piciformes', 'Falconiformes', 'Psittaciformes', 'Passeriformes'])

In [30]:
set(orders_from_tree).difference(order2sp.keys())

{'\n'}