Visualisation of species tree with interesting taxonomic groups

Manual steps: defining species for basal divergence in high-level groups

In [1]:
%run common.ipynb

# Basic tree manipulations

In [2]:
# load the species tree and remember which species it contains
t = ete3.Tree(SPECIES_TREE)
x = t.get_leaf_names()

# re-root on the clade spanned by the two Kerteszia species
kerteszia_clade = t.get_common_ancestor('Anopheles_bellator', 'Anopheles_cruzii')
t.set_outgroup(kerteszia_clade)

# sort child nodes so the tree is drawn ladder-like
t.ladderize(direction=1)

# hide node markers: one shared zero-size style applied to every node
ns = ete3.NodeStyle(size=0)
for node in t.traverse():
    node.set_style(ns)

## Sequencing data import

In [3]:
# per-sample table; its `species` and `s_Sample` columns are read by the
# tree layout function to count samples for each leaf
seq_data = pd.read_csv(CLUSTERING)

# Groups definition

In [4]:
# taxonomy table, restricted to the species that are leaves of the tree
c = pd.read_csv(TAXONOMY_FILE).loc[lambda taxonomy: taxonomy.species.isin(x)]

In [5]:
# spot-check: show the taxonomy row(s) recorded for a single species
c[c.species == 'Anopheles_carnevalei']

Unnamed: 0,species,subgenus,section,series,group,subgroup,complex,species_ncbi
11,Anopheles_carnevalei,Cellia,,Neomyzomyia,Ardensis,,Nili,Anopheles carnevalei


In [6]:
def get_spp(name, rank):
    """Return the node of tree `t` that is the common ancestor of a taxon.

    Parameters
    ----------
    name : str
        Value to look up in column `rank` of the taxonomy table `c`.
    rank : str
        Name of a column of `c` (the notebook uses 'subgenus', 'series',
        'group' and 'complex').

    Returns
    -------
    The ete3 node that is the common ancestor of every species in `c`
    whose `rank` column equals `name`.

    Raises
    ------
    ValueError
        If no species in `c` has `name` at `rank` (e.g. a misspelt taxon),
        instead of failing inside ete3 with an unrelated message.
    """
    species = c.loc[c[rank] == name, 'species'].to_list()
    if not species:
        raise ValueError(
            "no species in the taxonomy table with {} == {!r}".format(rank, name))
    if len(species) == 1:
        # a lone species is its own ancestor; return its leaf directly rather
        # than depending on how get_common_ancestor handles a single target
        return t.get_leaves_by_name(species[0])[0]
    return t.get_common_ancestor(*species)

# key groups
# `g` maps a display label to the tree node returned by get_spp().
# The spelling of each key matters downstream:
#   * the colouring and labelling cells test the key for the substrings
#     'group' / 'complex';
#   * the labelling cell splits the key on '_' and adds one text face per piece.
g = {
    # subgenera - use basal divergence only
    'Kerteszia_subgenus':get_spp('Kerteszia', 'subgenus'),
    'Anopheles_subgenus':get_spp('Anopheles', 'subgenus'),
    'Nyssorhynchus_subgenus':get_spp('Nyssorhynchus', 'subgenus'),
    'Cellia_subgenus':get_spp('Cellia', 'subgenus'),
    # series - use basal divergence only
    'Myzomyia_series':get_spp('Myzomyia', 'series'),
    'Neocellia_series':get_spp('Neocellia', 'series'),
    # the '-_' inside this key is deliberate: splitting on '_' yields
    # 'Pyreto-' / 'phorus' / 'series', i.e. a hyphenated multi-piece label
    'Pyreto-_phorus_series':get_spp('Pyretophorus', 'series'),
    # non-monophyletic series with low support
#     'Neomyzomyia_series':get_spp('Neomyzomyia', 'series'),
    # groups and complexes - all species in all cases except gambiae complex
    'Marshallii_group':get_spp('Marshallii','group'),
    'Maculatus_group':get_spp('Maculatus','group'),
    'Gambiae_complex':get_spp('Gambiae','complex'),
    'Sundaicus_complex':get_spp('Sundaicus','complex'),
    # note the disagreement on rank between NCBI and Harbach:
    # labelled as a group here but looked up in the 'complex' column
    'Punctulatus_group':get_spp('Punctulatus','complex'),
    'Leucosphyrus_group':get_spp('Leucosphyrus','group'),
    # loss of monophyly between nil/car and dur/vin
#     'Ardensis_group':get_spp('Ardensis','group'),
    'Nili_complex':get_spp('Nili','complex'),
    'Coustani_group':get_spp('Coustani','group')
}

# Tree drawing

In [7]:
def group_color(group, color):
    """Give `group` and every node below it a background of `color`.

    group : ete3 tree node at the root of the clade to paint
    color : colour name passed to ete3.NodeStyle as `bgcolor`

    Each node receives its own NodeStyle with size=0, replacing whatever
    style it had before.
    """
    def _paint(node):
        node.set_style(ete3.NodeStyle(bgcolor=color, size=0))

    _paint(group)
    for descendant in group.iter_descendants("postorder"):
        _paint(descendant)
# background colours
# color names from https://stackoverflow.com/questions/22408237/named-colors-in-matplotlib
# subgenera
# group_color('Celia_subgenus', "LightSteelBlue")
# group_color('Anopheles_subgenus', "DarkSeaGreen")
# group_color('Nyssorhynchus_subgenus', "Khaki")
# group_color('Kertezsia_subgenus', "PeachPuff")
# series
# group_color('Myzomiya_series', "Silver")
# group_color('Neocellia_series', "SkyBlue")
# group_color('Pyretophorus_series', "Gainsboro")
# group_color('Neomyzomia_series', "PowderBlue")
# group_color('Neomyzomia_2_series', "PowderBlue")
# groups and complexes: SkyBlue background for every entry of `g` whose
# label contains 'group' or 'complex'
for name, group in g.items():
    if any(rank in name for rank in ('group', 'complex')):
        group_color(group, "SkyBlue")

# then overpaint selected clades in MistyRose; this must run after the loop
# above because group_color replaces the style of every node it touches
# (presumably these are the unresolved clades of the original
#  "red for unresolved" note - confirm)
for clade_species in (
    ('Anopheles_gambiae', 'Anopheles_coluzzii'),
    ('Anopheles_coustani', 'Anopheles_tenebrosus',
     'Anopheles_paludis', 'Anopheles_ziemanni'),
    ('Anopheles_brohieri', 'Anopheles_demeilloni', 'Anopheles_hancocki'),
):
    group_color(t.get_common_ancestor(*clade_species), "MistyRose")

In [8]:
# add group names
# NOTE: add_face appends, so re-running this cell without rebuilding the tree
# adds the labels a second time.
for name, group in g.items():
    # groups/complexes get a smaller label than the higher-level taxa
    # (subgenera, series); previously this size was computed but a
    # hard-coded 12 was passed to TextFace, so it never took effect
    if 'group' in name or 'complex' in name:
        fsize = 12
    else:
        fsize = 14
    # one text face per '_'-separated piece of the label, all in column 0
    for s in name.split('_'):
        tf = ete3.TextFace(s, fsize=fsize)
        tf.hz_align = 1
        tf.opacity = 0.7
        group.add_face(tf, column=0, position='branch-right')

In [9]:
# add shapes to leaves
# samples per species
# sp_samples = pd.read_csv(CLUSTERING).groupby('species')['s_Sample'].unique().to_dict()
# for leaf in t.iter_leaves():
#     # reference genome samples present in species
#     ref_match = [x.startswith('anopheles') for x in sp_samples[leaf.name]]
#     # all-reference in species
#     if sum(ref_match) == len(ref_match):
#         nstyle = ete3.NodeStyle(shape='circle', size=5, fgcolor='DarkBlue')
#     # reference genomes present in species
#     elif sum(ref_match) > 0:
#         nstyle = ete3.NodeStyle(shape='square', size=5, fgcolor='DarkBlue')
#     # no reference genomes
#     else:
#         nstyle = ete3.NodeStyle(shape='circle', size=0, fgcolor='DarkRed')
#     leaf.set_style(nstyle)
#     # pretty species names
    
    

In [10]:
# make the tree ultrametric (all leaves at the same distance from the root),
# so the drawing no longer reflects the original branch lengths
t.convert_to_ultrametric()

from ete3 import Tree, faces, AttrFace, TreeStyle, NodeStyle, TextFace

# branches and support values
def layout(node):
    """Layout callback assigned to the tree style's `layout_fn`.

    Every node: horizontal and vertical branch lines are drawn 2 wide.
    Internal node: the `support` attribute is shown above the branch.
    Leaf: three aligned text columns are added -
        0: the species name with its first 10 characters replaced by 'An. '
        1: number of distinct `s_Sample` values for the species in `seq_data`
           that do not start with 'anoph', followed by '+'
        2: number of distinct `s_Sample` values that do start with 'anoph'
    """
    # branch line stroke sizes
    for line_width_key in ("hz_line_width", "vt_line_width"):
        node.img_style[line_width_key] = 2

    if not node.is_leaf():
        support_face = AttrFace("support", fsize=8, formatter="%g ", fgcolor='darkred')
        faces.add_face_to_node(support_face, node, position='branch-top', column=0)
        return

    sp = node.name
    sp_samples = seq_data.loc[seq_data.species == sp, 's_Sample'].drop_duplicates()
    is_ref = sp_samples.str.startswith('anoph')
    column_texts = (
        'An. ' + sp[10:],
        "{}+".format((~is_ref).sum()),
        "{}".format(is_ref.sum()),
    )
    for col, text in enumerate(column_texts):
        faces.add_face_to_node(TextFace(text, fsize=12),
                               node, column=col, position='aligned', aligned=True)

# tree appearance
ts = ete3.TreeStyle()
ts.layout_fn = layout

_ts_options = {
    # everything shown on the tree comes from `layout`, so the built-in
    # support / leaf-name / branch-length / scale decorations are turned off
    'show_branch_support': False,
    'show_leaf_name': False,
    'show_branch_length': False,
    'show_scale': False,
    # black guiding lines (line type 0) towards the aligned leaf columns
    'draw_guiding_lines': True,
    'guiding_lines_type': 0,
    'guiding_lines_color': "black",
    'allow_face_overlap': True,
    # black extra branch lines (line type 0)
    'extra_branch_line_type': 0,
    'extra_branch_line_color': "black",
}
for _option, _value in _ts_options.items():
    setattr(ts, _option, _value)
# t.show(tree_style=ts)


In [11]:
# write the styled species tree to SPECIES_TREE_FIG: height 13 in at 300 dpi;
# the trailing ';' suppresses the cell's output
t.render(SPECIES_TREE_FIG, tree_style=ts, units='in', h=13, dpi=300);

# Sample tree

In [23]:
# short names for non-standard assemblies
shortnames = {
    'anopheles-aquasalis-A_aquasalis_v1':'aaquv1',
    'anopheles-punctulatus-apwgs2contigsjxxa01':'apunj1',
    'anopheles-cracens-ASM209184v1-scaffolds':'acrav1',
    'anopheles-koliensis-akwgs3contigsjxxb01':'akolj1',
    'anopheles-arabiensis-sharakhov':'aaras1'
}

# NOTE: `t` and `ns` are rebound here; from this point on they refer to the
# sample tree, not the species tree.
t = ete3.Tree(SAMPLE_TREE)
# re-root on the clade spanned by these two samples
t.set_outgroup(t.get_common_ancestor('Abel-SP24','Acru-RS32'))
# (a stray `t.ladderize` without parentheses was removed here - it only
#  referenced the method and did nothing)
t.ladderize(direction=1)
ns = ete3.NodeStyle(size=0)
for n in t.traverse():
    # shorten names for assemblies
    if n.name in shortnames:
        # explicit mapping takes precedence
        n.name = shortnames[n.name]
    elif n.name.startswith('anopheles'):
        # otherwise keep only the text after 'chromosomes' / 'scaffolds',
        # unless nothing follows the delimiter
        for delim in ('chromosomes', 'scaffolds'):
            if delim in n.name:
                shortname = n.name.split(delim)[-1]
                if len(shortname) > 0:
                    n.name = shortname
    # hide node markers: same zero-size style on every node
    n.set_style(ns)
t.render(SAMPLE_TREE_FIG);