In [1]:
import os
import pandas as pd
import math
import pickle
import pprint
pp = pprint.PrettyPrinter(indent=4)

# For phylogeny parsing
# !pip install opentree
from opentree import OT
# !pip install ete3
from ete3 import Tree, PhyloTree
import pdb



In [2]:
class PhylogenyCUB:
    """Phylogeny helper for the CUB dataset, backed by an ete3 ``PhyloTree``.

    The tree is read from a fixed file name inside the directory given to the
    constructor. Pairwise distances are cached lazily in ``distance_matrix``
    (``-1`` marks "not computed yet") and species groupings are cached per
    relative distance in ``species_groups_within_relative_distance``.
    """

    def __init__(self, filePath, node_ids=None, verbose=False):
        """Load the tree and compute the root-to-leaf scale.

        Args:
            filePath: Directory that contains the phylogeny file.
            node_ids: Optional collection of species names. When ``None`` the
                sorted leaf names of the loaded tree are used instead.
            verbose: Accepted for interface compatibility; not used here.
        """
        # Other candidate tree files kept from the original source:
        #   "1_tree-consensus-Hacket-AllSpecies.phy"
        #   "1_tree-consensus-Hacket-27Species-cub-names.phy"
        cleaned_fine_tree_fileName = "1_tree-consensus-Hacket-AllSpecies-cub-names.phy"

        self.node_ids = node_ids
        self.treeFileNameAndPath = os.path.join(filePath, cleaned_fine_tree_fileName)
        self.total_distance = -1  # -1 means we never calculated it before.

        self.distance_matrix = {}
        self.species_groups_within_relative_distance = {}
        self.get_tree(self.treeFileNameAndPath)
        self.get_total_distance()

    def get_distance(self, species1, species2):
        """Return the phylogenetic distance between two species names.

        The value is looked up in ``distance_matrix`` first; on a miss it is
        computed with ``self.tree.get_distance`` and cached.

        Raises:
            KeyError: If either name is missing from ``distance_matrix``.
        """
        cached = self.distance_matrix[species1][species2]
        if cached != -1:
            return cached

        if species1 == species2:
            return 0

        d = self.tree.get_distance(species1, species2)
        self.distance_matrix[species1][species2] = d

        # The distance between two nodes is symmetric, so fill the mirrored
        # entry as well when it exists and has not been computed yet.
        reverse_row = self.distance_matrix.get(species2)
        if reverse_row is not None and reverse_row.get(species1) == -1:
            reverse_row[species1] = d

        return d

    def get_siblings_by_name(self, species, relative_distance, verbose=False):
        """Return the species group that contains ``species``.

        Args:
            species: Species name to look up.
            relative_distance: Relative distance that defines the grouping
                (see ``get_species_groups``).
            verbose: Forwarded to ``get_species_groups``.

        Raises:
            ValueError: If ``species`` is not part of any group.
        """
        species_groups = self.get_species_groups(relative_distance, verbose)
        for species_group in species_groups:
            if species in species_group:
                return species_group

        # The original code tried to `raise` a concatenated string, which can
        # never work; raise a real exception carrying the same message.
        raise ValueError(f"{species} was not found in {list(species_groups)}")

    def get_parent_by_name(self, species, relative_distance, verbose=False):
        """Return the ancestor of ``species`` reached by walking up the tree.

        The walk stops at the first ancestor whose distance to the species
        node is at least ``relative_distance * self.total_distance``, or at
        the root if that comes first.

        Raises:
            IndexError: If no node named ``species`` exists in the tree.
        """
        abs_distance = relative_distance*self.total_distance
        species_node = self.tree.search_nodes(name=species)[0]
        if verbose:
            print('distance to ancestor: ', abs_distance, ". relaive distance: ", relative_distance)

        # Keep going up until the distance reaches abs_distance.
        distance = 0
        parent = species_node
        while distance < abs_distance:
            if parent.up is None:
                break
            parent = parent.up
            distance = self.tree.get_distance(parent, species_node)

        return parent

    def get_distance_between_parents(self, species1, species2, relative_distance):
        """Return the tree distance between the ancestors of two species."""
        parent1 = self.get_parent_by_name(species1, relative_distance)
        parent2 = self.get_parent_by_name(species2, relative_distance)
        return self.tree.get_distance(parent1, parent2)

    def get_species_groups(self, relative_distance, verbose=False):
        """Group all species by their ancestor at ``relative_distance``.

        Species whose ancestor (as returned by ``get_parent_by_name``) has the
        same node name end up in the same group. The result is cached per
        ``relative_distance``.

        Returns:
            A list of groups, each a list of species names, in order of first
            appearance in ``getLabelList()``.
        """
        if relative_distance not in self.species_groups_within_relative_distance:
            groups = {}

            # NOTE(review): groups are keyed by the ancestor's *name*; this
            # relies on internal node names being unique - confirm for trees
            # whose file already names internal nodes.
            for species in self.getLabelList():
                parent_node = self.get_parent_by_name(species, relative_distance, verbose)
                groups.setdefault(parent_node.name, []).append(species)

            # Store a real list instead of a live dict view so the cached
            # value can be indexed, concatenated and serialised.
            self.species_groups_within_relative_distance[relative_distance] = list(groups.values())

            if verbose:
                print("At relative_distance", relative_distance, ", the groups are:", groups.values())

        return self.species_groups_within_relative_distance[relative_distance]

    def getLabelList(self):
        """Return the species names as a new list."""
        return list(self.node_ids)

    # ------- private functions

    def get_total_distance(self):
        """Compute and store the largest root-to-leaf distance.

        Also fills ``node_ids`` with the sorted leaf names when none were
        supplied, and (re)initialises the distance cache.
        """
        if self.node_ids is None:
            self.node_ids = sorted([leaf.name for leaf in self.tree.iter_leaves()])

        self.init_distance_matrix()

        # The maximum distance between the root and a leaf node is taken as
        # the total distance.
        leaf_to_root_distances = [self.tree.get_distance(leaf) for leaf in self.tree.iter_leaves()]
        self.total_distance = max(leaf_to_root_distances)

        return self.total_distance

    def init_distance_matrix(self):
        """Mark every (species, species) pair in the cache as not computed (-1)."""
        for i in self.node_ids:
            self.distance_matrix[i] = {j: -1 for j in self.node_ids}

    def get_tree(self, treeFileNameAndPath):
        """Load the tree file into ``self.tree`` and name unnamed nodes."""
        format_ = 1
        self.tree = PhyloTree(treeFileNameAndPath, format=format_)

        # Give every unnamed node a dummy name: its postorder index.
        for i, node in enumerate(self.tree.traverse("postorder")):
            if len(node.name) == 0:
                node.name = str(i)

In [3]:
# Load the CUB phylogeny from the directory holding the tree file.
# NOTE(review): absolute, user-specific path - consider a configurable data directory.
phylogeny = PhylogenyCUB('/home/mridul/data/cub_phylogeny') 

In [4]:
# Shorthand for the tree object that PhylogenyCUB loaded in get_tree().
tree=phylogeny.tree

In [5]:
# For every node of the tree, record its branch-length distance from the root
# and its hop count (topology-only distance) from the root.
distances_from_root = []
hops_fro_root = []
r = tree.get_tree_root()
for n in tree.traverse():
    distances_from_root.append(tree.get_distance(n, r))
    hops_fro_root.append(tree.get_distance(n, r, topology_only=True))

# Normalise by the largest distance. The maximum is computed once up front
# instead of once per element inside the comprehension.
max_distance_from_root = max(distances_from_root)
distances_from_root = [x / max_distance_from_root for x in distances_from_root]  # 1- ?

In [6]:
# Number of recorded distances (one per node visited by tree.traverse()).
len(distances_from_root)

374

In [7]:
# Same per-node measurements as the normalised version, but kept in raw
# (un-normalised) branch-length units.
r = tree.get_tree_root()
distances_from_root_, hops_fro_root_ = [], []
for n in tree.traverse():
    hops_fro_root_.append(tree.get_distance(n, r, topology_only=True))
    distances_from_root_.append(tree.get_distance(n, r))


In [8]:
# Largest raw (un-normalised) distance from the root over all nodes.
max(distances_from_root_)

99.38018

In [9]:
# Plot the cumulative distribution of root distances and pick x-axis ticks
# where the CDF first exceeds each threshold in `yticks`.
#
# NOTE(review): `np`, `plt`, `pdf`, `bins_count1` and `yticks` are not defined
# in any cell shown here (the recorded output of this cell is a NameError on
# `np`). Presumably `pdf`/`bins_count1` came from a histogram of
# `distances_from_root` in a since-removed cell - confirm and restore it.
cdf = np.cumsum(pdf)

# CDF as bars against `bins_count1[1:]`, plus the identity line y = x.
plt.bar(bins_count1[1:], cdf, color="green", label="CDF", width = 0.01)
plt.plot(bins_count1[1:], bins_count1[1:])

plt.yticks(yticks)

xticks = []
j = 0  # index of the next `yticks` threshold to be crossed
for indx, i in enumerate(bins_count1[1:]):
    if cdf[indx] > yticks[j]:
        # NOTE(review): `indx` indexes `bins_count1[1:]`, so `bins_count1[indx-1]`
        # is two entries before the current one and wraps around to the last
        # entry when indx == 0 - confirm this offset is intended.
        xtick = bins_count1[indx-1]
        xticks.append(xtick)
        j = j+1
        # Stop after three thresholds; assumes `yticks` has at least three
        # entries whenever the loop gets that far - TODO confirm.
        if j == 3: break
plt.xticks(xticks)        

plt.title("CDF of distances from root")

plt.show()

print('The phylo quantizations should be at', xticks)

NameError: name 'np' is not defined

In [10]:
class Species_sibling_finder():
    """Precomputed lookup of sibling species per species and per loss name."""

    def __init__(self, phylogeny, genetic_distances_from_root):
        """Fill ``self.map[species][loss_name]`` with the sibling group.

        Args:
            phylogeny: Object exposing ``node_ids``, ``get_siblings_by_name``
                and ``getLabelList``.
            genetic_distances_from_root: Sequence of distance thresholds, one
                per level.
        """
        self.map = {}
        self.phylogeny = phylogeny
        for species in phylogeny.node_ids:
            siblings_by_loss = {}
            self.map[species] = siblings_by_loss
            for level in range(len(genetic_distances_from_root)):
                relative = get_relative_distance_for_level(genetic_distances_from_root, level)
                siblings = phylogeny.get_siblings_by_name(species, relative)
                loss_name = get_loss_name(genetic_distances_from_root, level)
                siblings_by_loss[loss_name] = siblings

    def map_speciesId_siblingVector(self, speciesId, loss_name):
        """Return the label-list indices of the siblings of species ``speciesId``.

        Args:
            speciesId: Position of the species in ``phylogeny.getLabelList()``.
            loss_name: Key of the level, as produced by ``get_loss_name``.
        """
        label_list = self.phylogeny.getLabelList()
        sibling_names = self.map[label_list[speciesId]][loss_name]
        return [label_list.index(name) for name in sibling_names]

In [11]:
# Comma-separated distance thresholds, one per phylogenetic level.
# Presumably relative distances measured from the root (1.0 - value is later
# used as the relative distance walked up from a leaf) - TODO confirm.
phyloDistances_string = '0.93,0.83,0.63'

In [12]:
def parse_phyloDistances(phyloDistances_string):
    """Parse a comma-separated string of distances into an ascending list of floats.

    Whitespace around entries is ignored and empty entries (from an empty
    string or a stray/trailing comma) are skipped, so ``""`` yields ``[]``
    instead of failing inside ``float``.

    Args:
        phyloDistances_string: For example ``"0.93,0.83,0.63"``.

    Returns:
        The distances as floats, sorted in ascending order.

    Raises:
        ValueError: If a non-empty entry is not a valid float literal.
    """
    tokens = (token.strip() for token in phyloDistances_string.split(","))
    return sorted(float(token) for token in tokens if token)

In [13]:
# Ascending list of float thresholds parsed from the string above.
phylo_distances = parse_phyloDistances(phyloDistances_string)

In [14]:
# Display the parsed thresholds.
phylo_distances

[0.63, 0.83, 0.93]

In [15]:
def get_relative_distance_for_level(phylo_distances, level):
    """Return ``1.0 - phylo_distances[level]``, or ``0.0`` past the last level.

    Any ``level`` that is not smaller than ``len(phylo_distances)`` is treated
    as having a threshold of ``1.0``.
    """
    if level < len(phylo_distances):
        threshold = phylo_distances[level]
    else:
        threshold = 1.0
    return 1.0 - threshold

def get_loss_name(phylo_distances, level):
    """Build the loss key for a level: the threshold's text without dots, then ``distance``."""
    threshold_text = str(phylo_distances[level])
    digits_only = threshold_text.replace(".", "")
    return f"{digits_only}distance"

# def get_relative_distance_for_level_given(self, level):
#     return get_relative_distance_for_level(phylo_distances, level)

In [16]:
# Precompute, for every species and every threshold, its sibling group.
# Requires get_relative_distance_for_level and get_loss_name to be defined
# before this cell runs, since the constructor calls both.
siblingfinder = Species_sibling_finder(phylogeny, phylo_distances)

In [17]:
# Display the finder object (the class defines no custom repr).
siblingfinder

<__main__.Species_sibling_finder at 0x7fb8629a0df0>

In [18]:
# For every level, map a running group index to the list of species names
# that phylogeny.get_species_groups puts in the same group at that level.
cub_mapping_dict = {}
for level, i in enumerate(phylo_distances):
    relative_distance = get_relative_distance_for_level(phylo_distances, level)
    species_groups = phylogeny.get_species_groups(relative_distance)
    # Slice-copy each group so the exported mapping holds its own sequences.
    species_groups_list = [group[:] for group in species_groups]
    dict_from_list = dict(enumerate(species_groups_list))
    cub_mapping_dict[f'level_{level}'] = dict_from_list

In [19]:
# One key per level: 'level_<index>'.
cub_mapping_dict.keys()

dict_keys(['level_0', 'level_1', 'level_2'])

In [23]:
# Inspect the groups at the middle level (group index -> species names).
cub_mapping_dict['level_1']

{0: ['001.Black_footed_Albatross',
  '002.Laysan_Albatross',
  '003.Sooty_Albatross'],
 1: ['004.Groove_billed_Ani'],
 2: ['005.Crested_Auklet',
  '006.Least_Auklet',
  '007.Parakeet_Auklet',
  '008.Rhinoceros_Auklet',
  '058.Pigeon_Guillemot',
  '071.Long_tailed_Jaeger',
  '072.Pomarine_Jaeger',
  '106.Horned_Puffin'],
 3: ['009.Brewer_Blackbird',
  '010.Red_winged_Blackbird',
  '011.Rusty_Blackbird',
  '012.Yellow_headed_Blackbird',
  '013.Bobolink',
  '020.Yellow_breasted_Chat',
  '021.Eastern_Towhee',
  '026.Bronzed_Cowbird',
  '027.Shiny_Cowbird',
  '049.Boat_tailed_Grackle',
  '076.Dark_eyed_Junco',
  '088.Western_Meadowlark',
  '095.Baltimore_Oriole',
  '096.Hooded_Oriole',
  '097.Orchard_Oriole',
  '098.Scott_Oriole',
  '099.Ovenbird',
  '109.American_Redstart',
  '113.Baird_Sparrow',
  '114.Black_throated_Sparrow',
  '115.Brewer_Sparrow',
  '116.Chipping_Sparrow',
  '117.Clay_colored_Sparrow',
  '119.Field_Sparrow',
  '120.Fox_Sparrow',
  '121.Grasshopper_Sparrow',
  '122.Harr

In [25]:
# Write the per-level group mapping to disk as JSON indented by 4 spaces.
# NOTE(review): `json` is imported here rather than in the imports cell, and
# the target is an absolute, user-specific path.
import json
with open('/home/mridul/data/cub_phylogeny/cub_ancestral_mapping.json', 'w') as file:
    json.dump(cub_mapping_dict, file, indent=4)

In [28]:
# Flatten every group of 'level_0' into one list of species names, then sort.
# NOTE(review): the variable says "level2" but the data comes from 'level_0'.
list_ancestor_level2 = []
for value in cub_mapping_dict['level_0'].values():
    list_ancestor_level2 += value
list_ancestor_level2.sort()

In [29]:
# Count the groups in `species_groups_list` (whatever the level loop above
# left bound, i.e. the last level processed). The original passed the
# assignment as a keyword argument to len(), which raises TypeError because
# len() accepts no keyword arguments.
dict_from_list = {index: sublist for index, sublist in enumerate(species_groups_list)}
len(dict_from_list)

190

In [31]:
# Map each position in the sorted species list to its species name.
dict_from_list = dict(enumerate(list_ancestor_level2))
dict_from_list

{0: '001.Black_footed_Albatross',
 1: '002.Laysan_Albatross',
 2: '003.Sooty_Albatross',
 3: '004.Groove_billed_Ani',
 4: '005.Crested_Auklet',
 5: '006.Least_Auklet',
 6: '007.Parakeet_Auklet',
 7: '008.Rhinoceros_Auklet',
 8: '009.Brewer_Blackbird',
 9: '010.Red_winged_Blackbird',
 10: '011.Rusty_Blackbird',
 11: '012.Yellow_headed_Blackbird',
 12: '013.Bobolink',
 13: '014.Indigo_Bunting',
 14: '015.Lazuli_Bunting',
 15: '016.Painted_Bunting',
 16: '017.Cardinal',
 17: '018.Spotted_Catbird',
 18: '019.Gray_Catbird',
 19: '020.Yellow_breasted_Chat',
 20: '021.Eastern_Towhee',
 21: '023.Brandt_Cormorant',
 22: '024.Red_faced_Cormorant',
 23: '025.Pelagic_Cormorant',
 24: '026.Bronzed_Cowbird',
 25: '027.Shiny_Cowbird',
 26: '028.Brown_Creeper',
 27: '029.American_Crow',
 28: '030.Fish_Crow',
 29: '031.Black_billed_Cuckoo',
 30: '032.Mangrove_Cuckoo',
 31: '033.Yellow_billed_Cuckoo',
 32: '034.Gray_crowned_Rosy_Finch',
 33: '035.Purple_Finch',
 34: '036.Northern_Flicker',
 35: '037.Aca

In [32]:
# Write the index -> species-name mapping to disk as JSON indented by 4 spaces.
# Relies on `json` having been imported by an earlier cell.
# NOTE(review): absolute, user-specific path.
with open('/home/mridul/data/cub_phylogeny/cub_mapping_species_level.json', 'w') as file:
    json.dump(dict_from_list, file, indent=4)