In [3]:
import jdc
import ete3
import os
import re
import linecache
import pandas as pd
import numpy as np
import random
import subprocess
import colorlover as cl
import plotly
import plotly.plotly as ptl
from plotly import graph_objs as go
import pyparsing as pp

# Plotly sign-in: the file is split on whitespace and its first two tokens are
# passed to `ptl.sign_in`, in that order.
# NOTE(review): the path is absolute and user-specific; consider reading the
# location (or the credentials themselves) from the environment instead.
with open('/Users/thiberio/plotly_accession') as plotly_accession_file:
    plotly_accession = plotly_accession_file.read().split()
ptl.sign_in(plotly_accession[0], plotly_accession[1])

In [None]:
class cd:
    """
    Temporarily switch the process working directory inside a ``with`` block.

    The target path goes through ``os.path.expanduser`` once, when the object
    is built. On entry the current directory is remembered and the process
    moves to the target; on exit the remembered directory is restored.
    ``__enter__`` returns nothing, so ``with cd(path) as x`` binds ``None``.
    """
    def __init__(self, newPath):
        expanded_target = os.path.expanduser(newPath)
        self.newPath = expanded_target

    def __enter__(self):
        # Record the starting point before moving so __exit__ can go back.
        origin = os.getcwd()
        self.savedPath = origin
        os.chdir(self.newPath)

    def __exit__(self, etype, value, traceback):
        # Implicit None return: an exception raised in the body propagates.
        os.chdir(self.savedPath)

In [2]:
class aggregate(object):
    """
    Shared state for the reconciliation post-processing steps of this notebook.

    Only the constructor lives here; the remaining methods are attached to
    the class from later cells through the ``%%add_to aggregate`` cell magic.
    """

    def __init__(self, reference_tree, gene_tree_folder, aggregate_folder, reconciliation_folder,
                 overall_tree_support_thresh=80,
                 branch_support_thresh=95,
                 ranger_confidence_threshold=0.9,
                 leaves_allowed=False):
        """
        Store the species tree, the input folders and the filtering thresholds.

        reference_tree: a ``str`` is handed to ``ete3.Tree(..., format=1)``;
            anything else must offer ``.copy()`` and is stored as that copy.
        gene_tree_folder, aggregate_folder, reconciliation_folder: stored
            verbatim for the methods that read from them.
        overall_tree_support_thresh, ranger_confidence_threshold,
        leaves_allowed: stored under the same names.
        branch_support_thresh: stored as ``self.support_threshold``.
        """
        # Exact type check on purpose: only a plain ``str`` is parsed.
        self.species_tree = (
            ete3.Tree(reference_tree, format=1)
            if type(reference_tree) is str
            else reference_tree.copy()
        )

        # Filtering thresholds.
        self.overall_tree_support_thresh = overall_tree_support_thresh
        self.support_threshold = branch_support_thresh
        self.ranger_confidence_threshold = ranger_confidence_threshold
        self.leaves_allowed = leaves_allowed

        # Input locations.
        self.gene_tree_folder = gene_tree_folder
        self.aggregate_folder = aggregate_folder
        self.reconciliation_folder = reconciliation_folder

In [5]:
%%add_to aggregate
def match_rooting(self, reference_root, tree_to_root):
    """
    Re-root a copy of ``tree_to_root`` using one child of ``reference_root``.

    Only the smallest child of ``reference_root`` (ordering by ``len``) is
    used as the outgroup template:

    * if that child is a leaf, the leaf with the same name becomes the
      outgroup;
    * otherwise the common ancestor of the child's leaf names becomes the
      outgroup. When those leaves are not monophyletic in the copy, the copy
      is first rooted on one of the nodes reported by ``check_monophyly`` as
      breaking the clade, and the common ancestor is looked up afterwards.

    ``tree_to_root`` itself is left untouched. Returns the re-rooted copy, or
    ``None`` when ``reference_root`` has no children.
    """
    rooted = tree_to_root.copy()

    children_by_size = sorted(reference_root.children, key=len)
    if not children_by_size:
        return None
    template = children_by_size[0]

    if template.is_leaf():
        matching_leaf = rooted.get_leaves_by_name(template.name)[0]
        rooted.set_outgroup(matching_leaf)
        return rooted

    monophyletic, _clade_type, intruders = rooted.check_monophyly(
        template.get_leaf_names(),
        'name',
        unrooted=False
    )
    if not monophyletic:
        # Move the root onto an intruding node before locating the clade.
        rooted.set_outgroup(intruders.pop())

    clade_ancestor = rooted.get_common_ancestor(template.get_leaf_names())
    rooted.set_outgroup(clade_ancestor)
    return rooted

In [6]:
%%add_to aggregate
def name_branches_as_reconciliation(self, reconciliation_file, tree):
    """
    Label internal nodes of ``tree`` with the names used in a reconciliation.

    Parameters
    ----------
    reconciliation_file : str
        Text (not a path) scanned for lines of the form
        ``m<digits> = LCA[<leaf1>, <leaf2>]:``.
    tree : tree object
        Must provide ``get_common_ancestor(leaf1, leaf2)``; it is modified in
        place.

    Returns
    -------
    tuple
        ``(tree, duplicated_names)``. ``tree`` is the same object that was
        passed in. ``duplicated_names`` maps every reconciliation name whose
        node already carried a name to that existing name; such nodes are
        left as they are (no rename, no ``ranger_name`` feature added here).
    """
    # Raw string: the pattern is unchanged, but no longer relies on Python
    # passing unknown escapes such as "\d" and "\[" through (which warns).
    lca_pattern = re.compile(r'^(m\d+) = LCA\[(\S+), (\S+)\]:', re.M)

    duplicated_names = {}
    for name, leaf1, leaf2 in lca_pattern.findall(reconciliation_file):
        node = tree.get_common_ancestor(leaf1, leaf2)
        if node.name:
            # Node was already labelled: remember the alias, keep the label.
            duplicated_names[name] = node.name
            continue
        node.name = name
        node.add_feature('ranger_name', name)
    return tree, duplicated_names

In [14]:
%%add_to aggregate
def parse_aggregated(self, group):
    """
    Collect the transfers reported in the aggregated output of one family.

    Parameters
    ----------
    group : str
        Family identifier. It is used as the file name inside
        ``self.aggregate_folder``, the directory name inside
        ``self.reconciliation_folder`` (which must hold ``<group>.output1``)
        and the stem of ``<group>.treefile`` inside ``self.gene_tree_folder``.

    Returns
    -------
    dict or list
        ``{group: None}`` when the reconciliation directory or the aggregate
        file is missing, or when the 25th percentile of internal-node support
        is below ``self.overall_tree_support_thresh``. Otherwise
        ``[transfers_df, gene_tree]``: a DataFrame with one row per retained
        transfer (plus a ``family`` column set to ``group``) and the re-rooted,
        branch-named gene tree.
    """
    reconciliation_dir = '%s/%s' % (self.reconciliation_folder, group)
    aggregate_path     = '%s/%s' % (self.aggregate_folder, group)
    if not os.path.isdir(reconciliation_dir) or not os.path.isfile(aggregate_path):
        return {group:None}

    with open(aggregate_path) as aggregate_handle:
        aggregated = aggregate_handle.read()

    # Line 8 of <group>.output1 is parsed as the named gene tree.
    with cd(reconciliation_dir):
        named_tree = ete3.Tree(linecache.getline('%s.output1' % group, 8), format=1)

    supported_tree = self.match_rooting(
        named_tree,
        ete3.Tree('%s/%s.treefile' % (self.gene_tree_folder, group))
    )
    gene_tree, duplicated_names = self.name_branches_as_reconciliation(aggregated, supported_tree)
    gene_tree.add_feature('group', group)

    ufboot_distribution = [node.support for node in gene_tree.traverse() if not node.is_leaf()]
    if np.percentile(ufboot_distribution, 25) < self.overall_tree_support_thresh:
        return {group:None}

    # Not used further in this method; kept so that an aggregate file that
    # does not start with "Processed <n> files" still fails here, as before.
    num_replicates = float(re.match(r'Processed (\d+) files', aggregated).group(1))

    # With leaves_allowed the mapping/recipient may be any non-blank token,
    # otherwise only n<digits> names are accepted. The patterns are the
    # original ones, written as raw strings.
    # NOTE(review): "([^0]\d+?)" needs at least two characters, so single-digit
    # transfer counts never match -- confirm this cut-off is intended.
    node_pattern = r'(\S+)' if self.leaves_allowed else r'(n\d+)'
    transfer_pattern = (
        r'^(m\d+) = .*, Transfers = ([^0]\d+?)\], '
        r'\[Most Frequent mapping --> ' + node_pattern + r', (\d+) times\], '
        r'\[Most Frequent recipient --> ' + node_pattern + r', (\d+) times\].'
    )
    transfers = re.findall(transfer_pattern, aggregated, re.M)

    selected_transfers = []
    for donor_map_name, ranger_confidence, donor_name, ranger_confidence_donor,\
    recipient_name, ranger_confidence_recipient in transfers:
        if donor_map_name in duplicated_names:
            donor_map = gene_tree.search_nodes(name=duplicated_names[donor_map_name])[0]
        else:
            donor_map = gene_tree.search_nodes(name=donor_map_name)[0]

        # Find which child of the donor branch is itself mapped to the
        # recipient species node.
        # NOTE(review): names are interpolated unescaped and without a
        # delimiter, so "n1" also matches a line mapped to "n12" -- verify.
        recipient_map_search = re.search(
            '^({children[0]}|{children[1]}).*Most Frequent mapping --> {recipient}'.format(
                recipient=recipient_name,
                children=[child.name for child in donor_map.children]),
            aggregated, re.M)

        if recipient_map_search:
            recipient_map_name = recipient_map_search.group(1)
            # An unnamed child yields an empty match; skip incomplete records.
            if not all([donor_name, recipient_name, donor_map_name, recipient_map_name]):
                continue
            selected_transfers.append({'donor':donor_name, 'recipient':recipient_name,
                                       'donor_map':donor_map_name, 'recipient_map':recipient_map_name,
                                       'bipartition_support':donor_map.support,
                                       'ranger_confidence':int(ranger_confidence),
                                       'ranger_confidence_donor':int(ranger_confidence_donor),
                                       'ranger_confidence_recipient':int(ranger_confidence_recipient)})
    transfers_df = pd.DataFrame(selected_transfers)
    transfers_df['family'] = group
    return [transfers_df, gene_tree]

In [15]:
%%add_to aggregate
def assess_dtl_dist(self, df, assess_donor_dtl=True, assess_recipient_dtl=False, gene_tree=None):
    """
    Add a ``donor_dtl_size_ratio`` column based on a single gene tree.

    For every distinct ``donor_map`` in ``df`` the matching node of
    ``gene_tree`` is written (``format=9``) after the species tree into
    ``tmp_ranger.input``; Ranger-DTL is run on that file and each reported
    minimum reconciliation cost is divided by the size (``len``) of the
    corresponding donor subtree.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``donor_map`` column. It is copied, never modified.
    assess_donor_dtl : bool
        When false the copy is returned untouched.
    assess_recipient_dtl : bool
        Accepted but not used by this method.
    gene_tree : tree object, optional
        Tree searched with ``iter_search_nodes(name=...)``. When omitted, a
        module/notebook-level variable called ``gene_tree`` is used, which is
        what this method previously relied on implicitly.

    Returns
    -------
    pandas.DataFrame
        The annotated copy of ``df``.

    Raises
    ------
    ValueError
        If no gene tree is given and no global ``gene_tree`` exists.
    """
    transfer_df = df.copy()

    if assess_donor_dtl:
        if gene_tree is None:
            gene_tree = globals().get('gene_tree')
        if gene_tree is None:
            raise ValueError('assess_dtl_dist needs a gene tree: pass it as `gene_tree=`')

        donor_subtrees      = []
        donor_maps          = []
        donor_subtree_sizes = []
        for donor_map in transfer_df.donor_map.unique():
            donor_maps.append(donor_map)
            gene_donor_branch = next(gene_tree.iter_search_nodes(name=donor_map))
            donor_subtrees.append(gene_donor_branch.write(format=9))
            donor_subtree_sizes.append(len(gene_donor_branch))

        # First line: species tree; then one donor subtree per line.
        with open('tmp_ranger.input', 'w') as out:
            out.write('%s\n' % self.species_tree.write(format=9))
            out.write('\n'.join(donor_subtrees))

        subprocess.call([
            '/work/ranger/CorePrograms/Ranger-DTL.mac',
            '-q',
            '-i', 'tmp_ranger.input',
            '-o', 'tmp_ranger.output'
        ])

        with open('tmp_ranger.output') as ranger_output:
            dtl_costs = re.findall(r'^The minimum reconciliation cost is: (\d+)',
                                   ranger_output.read(),
                                   re.M)

        # Costs are paired with donor subtrees by position in the output.
        for donor_map, subtree_size, dtl_cost in zip(donor_maps, donor_subtree_sizes, dtl_costs):
            transfer_df.loc[transfer_df.donor_map==donor_map,
                            'donor_dtl_size_ratio'] = int(dtl_cost)/subtree_size

    return transfer_df

In [15]:
%%add_to aggregate
def assess_dtl_cost(self, df, assess_donor_dtl=True, assess_recipient_dtl=False):
    """
    Add a ``donor_dtl_size_ratio`` column, family by family.

    For every family in ``df`` the named gene tree is read from line 8 of
    ``<reconciliation_folder>/<family>/<family>.output1``. Each distinct
    ``donor_map`` of that family is located in the tree, written
    (``format=9``) after the species tree into ``tmp_ranger.input``, and
    Ranger-DTL is run once on the whole file. Every reported minimum
    reconciliation cost is divided by the size (``len``) of the matching
    donor subtree.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``family`` and ``donor_map`` columns. It is copied, never
        modified.
    assess_donor_dtl : bool
        When false the copy is returned untouched.
    assess_recipient_dtl : bool
        Accepted but not used by this method.

    Returns
    -------
    pandas.DataFrame
        The annotated copy of ``df``.
    """
    transfer_df = df.copy()

    if assess_donor_dtl:
        donor_subtrees      = []
        donor_maps          = []
        donor_subtree_sizes = []
        for group in transfer_df.family.unique():
            with cd('%s/%s' % (self.reconciliation_folder, group)):
                gene_tree = ete3.Tree(linecache.getline('%s.output1' %group, 8), format=1)
            for donor_map in transfer_df.loc[transfer_df.family==group,
                                             'donor_map'].unique():
                donor_maps.append([group, donor_map])
                gene_donor_branch = next(gene_tree.iter_search_nodes(name=donor_map))
                donor_subtrees.append(gene_donor_branch.write(format=9))
                donor_subtree_sizes.append(len(gene_donor_branch))

        # First line: species tree; then one donor subtree per line.
        with open('tmp_ranger.input', 'w') as out:
            out.write('%s\n' % self.species_tree.write(format=9))
            out.write('\n'.join(donor_subtrees))

        subprocess.call([
            '/work/ranger/CorePrograms/Ranger-DTL.mac',
            '-q',
            '-i', 'tmp_ranger.input',
            '-o', 'tmp_ranger.output'
        ])

        # Read the costs through a context manager so the handle is closed.
        with open('tmp_ranger.output') as ranger_output:
            dtl_costs = re.findall(r'^The minimum reconciliation cost is: (\d+)',
                                   ranger_output.read(),
                                   re.M)

        # Costs are paired with donor subtrees by position in the output.
        for (group, donor_map), subtree_size, dtl_cost in zip(donor_maps,
                                                              donor_subtree_sizes,
                                                              dtl_costs):
            transfer_df.loc[(transfer_df.donor_map==donor_map) & (transfer_df.family==group),
                            'donor_dtl_size_ratio'] = int(dtl_cost)/subtree_size

    return transfer_df

In [1]:
%%add_to aggregate
def name_species_tree_nodes(self, reconciliation_file):
    """
    Copy internal-node names from a reconciliation file onto the species tree.

    Line 5 of ``reconciliation_file`` (a path) is parsed with
    ``ete3.Tree(..., format=1)``. For each internal node of that tree, the
    common ancestor of the same leaf names is looked up in
    ``self.species_tree``. If both nodes share a topology id, the species
    tree node receives the name, both as ``name`` and as a ``ranger_name``
    feature; otherwise a message naming the node is printed and the species
    tree is left alone. Nothing is returned.
    """
    newick_line = linecache.getline(reconciliation_file, 5)
    ranger_species_tree = ete3.Tree(newick_line, format=1)

    internal_nodes = (candidate
                      for candidate in ranger_species_tree.traverse()
                      if not candidate.is_leaf())
    for named in internal_nodes:
        counterpart = self.species_tree.get_common_ancestor(named.get_leaf_names())
        if counterpart.get_topology_id() != named.get_topology_id():
            print('missmatching node: %s' % named.name)
            continue
        counterpart.name = named.name
        counterpart.add_feature('ranger_name', named.name)

UsageError: Cell magic `%%add_to` not found.


In [None]:
%%add_to aggregate
def assess_transfer_distance(self, df):
    """
    Annotate a copy of ``df`` with species-tree distances and clade sizes.

    Columns added, in this order:

    * ``donor_recipient_distance`` -- ``species_tree.get_distance(donor,
      recipient)`` for each distinct donor/recipient pair;
    * ``donor_depth`` / ``recipient_depth`` -- ``get_distance(node,
      topology_only=True)``;
    * ``donor_subtree_size`` / ``recipient_subtree_size`` -- ``len`` of the
      first species-tree node whose ``ranger_name`` equals the value;
    * ``donor_depth/size_ratio`` / ``recipient_depth/size_ratio`` -- depth
      divided by subtree size.

    ``df`` needs ``donor`` and ``recipient`` columns and is not modified.
    """
    transfer_df = df.copy()
    species_tree = self.species_tree

    # One distance lookup per distinct (donor, recipient) pair.
    for donor, recipient in transfer_df.groupby(['donor','recipient']).groups:
        pair_rows = (transfer_df.donor==donor) & (transfer_df.recipient==recipient)
        transfer_df.loc[pair_rows,
                        'donor_recipient_distance'] = species_tree.get_distance(donor, recipient)

    # Donor and recipient get the same per-node annotations.
    for role in ('donor', 'recipient'):
        for node_name in transfer_df[role].unique():
            role_rows = transfer_df[role] == node_name
            transfer_df.loc[role_rows,
                            '%s_depth' % role] = species_tree.get_distance(node_name,
                                                                          topology_only=True)
            transfer_df.loc[role_rows,
                            '%s_subtree_size' % role] = len(
                next(species_tree.iter_search_nodes(ranger_name=node_name))
            )

    for role in ('donor', 'recipient'):
        depth = transfer_df['%s_depth' % role]
        size  = transfer_df['%s_subtree_size' % role]
        transfer_df['%s_depth/size_ratio' % role] = depth / size
    return transfer_df

In [335]:
%%add_to aggregate
def map_taxonomic_level(self, df, taxa_table=None):
    """
    Add a ``transfer_level`` column: the lowest rank donor and recipient share.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``donor`` and ``recipient`` columns holding ``ranger_name``
        values of species-tree nodes. It is copied, never modified.
    taxa_table : str
        Path to a tab-separated table with an unnamed first column (read as
        ``Unnamed: 0``), an ``accession`` column and a ``taxid`` column.
        Underscores and everything from the first ``.`` on are stripped from
        the first two before they are matched against species-tree leaf
        names. Required despite the ``None`` default.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df``; rows for which either side has no leaf with a known
        lineage get no ``transfer_level`` value.
    """
    ncbi     = ete3.NCBITaxa()

    taxa_df = pd.read_csv(taxa_table, sep='\t')
    taxa_df['Unnamed: 0'] = taxa_df['Unnamed: 0'].apply(lambda x: x.replace('_', '').split('.')[0])
    taxa_df['accession'] = taxa_df['accession'].apply(lambda x: x.replace('_', '').split('.')[0])
    taxa_df.set_index('Unnamed: 0', inplace=True)

    # One {rank: taxid} record per leaf with a known taxid. The frame is
    # built once from the records: DataFrame.append is gone in pandas 2 and
    # was quadratic when called in a loop.
    lineage_records = []
    for leaf in self.species_tree.get_leaf_names():
        if leaf in taxa_df.index:
            node_name = taxa_df.index[taxa_df.index == leaf][0]
        elif leaf in taxa_df.accession.values:
            node_name = taxa_df.query('accession==@leaf').index[0]
        else:
            continue

        if pd.notnull(taxa_df.loc[node_name, 'taxid']):
            taxid = taxa_df.loc[node_name, 'taxid']
            lineage = {rank: lineage_taxid
                       for lineage_taxid, rank in ncbi.get_rank(
                           ncbi.get_lineage(taxid)).items()
                      }
            lineage['leaf_name'] = leaf
            lineage_records.append(lineage)
    taxonomy_df = pd.DataFrame(lineage_records).set_index('leaf_name')

    kept_ranks = ['class', 'species', 'superkingdom', 'genus',
                  'order', 'phylum',  'family',       'kingdom']
    taxonomy_df = taxonomy_df.drop(
        [column for column in taxonomy_df.columns if column not in kept_ranks],
        axis='columns')

    transfer_df = df.copy()
    for index, row in transfer_df.iterrows():
        donor_descendants     = next(
            self.species_tree.iter_search_nodes(ranger_name=row.donor)
        ).get_leaf_names()
        recipient_descendants = next(
            self.species_tree.iter_search_nodes(ranger_name=row.recipient)
        ).get_leaf_names()

        donor_taxonomy = taxonomy_df.loc[[taxon for taxon in donor_descendants if taxon in taxonomy_df.index]]
        recipient_taxonomy = taxonomy_df.loc[[taxon for taxon  in recipient_descendants if taxon in taxonomy_df.index]]

        if not donor_taxonomy.shape[0] or not recipient_taxonomy.shape[0]:
            continue

        # Only ranks known for every descendant leaf are considered.
        donor_taxonomy = donor_taxonomy.dropna(axis=1, how='any')
        recipient_taxonomy = recipient_taxonomy.dropna(axis=1, how='any')

        # NOTE(review): this keeps the FIRST leaf's lineage (after removing
        # rank columns that duplicate an earlier column); it does not check
        # that all descendants agree at a rank -- confirm that is intended.
        donor_taxonomy = next(donor_taxonomy.loc[:,
                                                 np.invert(donor_taxonomy.T.duplicated().values)
                                                ].iterrows())[1]
        recipient_taxonomy = next(recipient_taxonomy.loc[:,
                                                         np.invert(recipient_taxonomy.T.duplicated().values)
                                                        ].iterrows())[1]

        common_ranks = donor_taxonomy.index.intersection(recipient_taxonomy.index)
        matching_ranks = common_ranks[donor_taxonomy[common_ranks]==recipient_taxonomy[common_ranks]]

        # Walk from the most specific rank up and stop at the first match.
        # NOTE(review): when nothing matches, the loop ends on 'superkingdom'
        # and that value is recorded -- confirm this fallback is wanted.
        for rank in ['species', 'genus',  'family',  'order',
                     'class',   'phylum', 'kingdom', 'superkingdom']:
            if rank in matching_ranks:
                break

        transfer_df.loc[index, 'transfer_level'] = rank

    return(transfer_df)

In [None]:
%%add_to aggregate
def cluster_redundant_transfers(self, df):
    """
    Group transfers that are nested within one another on the species tree.

    A transfer "matches" a reference row when its donor is the reference
    donor or has it among its ancestors, and the same holds for the
    recipient. Rows are visited in order; the matches of each row are merged
    into the first existing cluster they overlap with, or start a new one.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``donor`` and ``recipient`` columns whose values are node names
        findable with ``species_tree.iter_search_nodes(name=...)``. It is
        copied, never modified.

    Returns
    -------
    list of set
        One set of ``df`` index labels per cluster. (Previously the clusters
        were computed and then discarded, so the method returned ``None``.)

    NOTE(review): clusters are merged in a single pass and only into the
    first overlapping one, so two existing clusters bridged by a later row
    stay separate -- confirm whether a transitive merge is wanted.
    """
    extended_df = df.copy()

    def ancestor_names(node_name):
        # Names of every ancestor of the first species-tree node so named.
        node = next(self.species_tree.iter_search_nodes(name=node_name))
        return [ancestor.name for ancestor in node.get_ancestors()]

    extended_df['donor_ancestry'] = pd.Series(
        [ancestor_names(donor) for donor in extended_df.donor],
        index=extended_df.index, dtype=object)
    extended_df['recipient_ancestry'] = pd.Series(
        [ancestor_names(recipient) for recipient in extended_df.recipient],
        index=extended_df.index, dtype=object)

    clusters = []
    for name, row in extended_df.iterrows():
        matching_transfers = extended_df[(
                    (extended_df.donor_ancestry.apply(lambda x: row.donor in x)) |
                    (extended_df.donor == row.donor)
                ) &
                (
                    (extended_df.recipient_ancestry.apply(lambda x: row.recipient in x)) |
                    (extended_df.recipient == row.recipient)
                )].index
        for cluster in clusters:
            if not cluster.isdisjoint(matching_transfers):
                cluster.update(matching_transfers)
                break
        else:
            # No overlap with any existing cluster.
            clusters.append(set(matching_transfers))

    return clusters

In [None]:
%%add_to aggregate
def interactive_dynamic_plot(self, df, filename='./test.html'):
    """
    Write an interactive scatter plot of the transfers to an HTML file.

    Each family becomes one trace: x is ``donor_recipient_distance``, y is
    ``donor_depth/size_ratio``, the hover text is ``group_<n>#<row index>``
    and the marker colour encodes ``transfer_level`` (rows without a level
    get the top of the colour range). An extra invisible trace carries the
    colour bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``family`` (values starting with ``<int>_``),
        ``donor_recipient_distance``, ``donor_depth/size_ratio`` and
        ``transfer_level`` columns.
    filename : str
        Destination handed to ``plotly.offline.plot``. Defaults to the path
        that used to be hard-coded.

    Returns
    -------
    Whatever ``plotly.offline.plot`` returns for the written file
    (previously this value was discarded and ``None`` was returned).
    """
    tracers = []
    max_x   = 0
    max_y   = 0
    rank_order = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom']
    # Seven evenly spaced colour positions, one per rank; reused for the ticks.
    rank_ticks = np.linspace(0,1,15)[1::2]
    rank_color = dict(zip(rank_order, rank_ticks))
    rank_color['superkingdom'] = rank_color['kingdom']

    # Each breakpoint appears twice with consecutive colours, which makes the
    # scale change in steps rather than blend.
    colors = cl.scales['9']['qual']['Paired']
    colorscale = []
    for pos, value in enumerate(np.linspace(0,1,8)):
        colorscale.append([value, colors[pos]])
        colorscale.append([value, colors[pos+1]])

    for group in df.family.unique():
        points = {'x':[], 'y':[], 'text':[], 'marker_color':[], 'marker_size':10}
        group_number = int(group.split('_')[0])
        for index, row in df.query('family==@group').iterrows():
            points['x'   ].append(row.donor_recipient_distance)
            points['y'   ].append(row['donor_depth/size_ratio'])
            points['text'].append('group_%i#%i' % (group_number, index))
            points['marker_color'].append(rank_color[row.transfer_level]
                                          if pd.notnull(row.transfer_level) else 1)

        # Track the overall extent for the fixed axis ranges below.
        if np.max(points['x']) > max_x:
            max_x = np.max(points['x'])
        if np.max(points['y']) > max_y:
            max_y = np.max(points['y'])
        tracers.append(go.Scatter(x=points['x'],
                                  y=points['y'],
                                  mode='markers',
                                  text=points['text'],
                                  name='group_%s' % group.split('_')[0],
                                  hoverinfo='text', showlegend=True,
                                  marker=dict(size=points['marker_size'],
                                              color=points['marker_color'],
                                              colorscale=colorscale,
                                              cmax=1,
                                              cmin=0,
                                              symbol='circle',
                                              opacity=0.7)
                                 ))

    tracers = sorted(tracers, key = lambda x: int(x['name'].split('_')[1]))

    layout    = go.Layout(
        title='Interactive index HGT candidates plot!',
        hovermode='closest',
        width=1500, height=1000,
        xaxis=dict(title='Donor-Recipient distance',
                   autorange=False,
                   range=[0, max_x+max_x*0.01]),
        yaxis=dict(title='Donor depth/size ratio',
                   autorange=False,
                   range=[0, max_y+max_y*0.01]),
        updatemenus=[
            {'buttons':[{'label':'Show all',
                         'method':'restyle',
                         'args': [ 'visible', True]},
                        # The trailing True keeps the colour-bar trace shown.
                        {'label':'Hide all',
                         'method':'restyle',
                         'args': [ 'visible', ['legendonly']*len(tracers)+[True]]}]}
        ]
    )

    # Fully transparent marker whose only job is to own the colour bar.
    tracers.append(go.Scatter(x=[max_x],
                             y=[max_y],
                             mode='markers',
                             name='colorbar',
                             showlegend=False,
                             marker=dict(size=10,
                                        color=0,
                                        symbol='circle',
                                        opacity=0.0,
                                        colorscale=colorscale,
                                        cmin=0,
                                        cmax=1,
                                        colorbar=dict(title='HGT within:',
                                                      x=1.25,
                                                      titleside = 'top',
                                                      tickvals = rank_ticks,
                                                      ticktext = rank_order,
                                                      ticks = 'outside')
                                        )
                           )
                  )

    fig       = go.Figure(data=tracers, layout=layout)
    return plotly.offline.plot(fig, filename=filename, auto_open=False)

In [2]:
%%add_to aggregate
def visualize_in_figtree(self, df, taxa_table=None):
    """
    Write the species tree, annotated with transfers, as a NEXUS file.

    The output goes to ``species_tree-hgt.figTree`` in the current directory.
    Leaves found in ``taxa_table`` are annotated with their ``Organism`` and,
    when a taxid is known, the names of their class/phylum/order/family.
    Internal branches are annotated with their support, their
    ``ranger_name`` and, per family number, the row indexes of ``df`` in
    which they act as donor and/or recipient.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``donor``, ``recipient`` and ``family`` columns and an integer
        index.
    taxa_table : str
        Path to a tab-separated table with an unnamed first column (read as
        ``Unnamed: 0``) and ``accession``, ``taxid`` and ``Organism``
        columns. Required despite the ``None`` default.

    Nothing is returned. The annotation is done on a copy of
    ``self.species_tree``: internal nodes are renamed to placeholders while
    the Newick string is built, and doing that on the shared tree used to
    overwrite the node names other methods search for.
    """
    ncbi     = ete3.NCBITaxa()

    taxa_df = pd.read_csv(taxa_table, sep='\t')
    taxa_df['Unnamed: 0'] = taxa_df['Unnamed: 0'].apply(lambda x: x.replace('_', '').split('.')[0])
    taxa_df['accession'] = taxa_df['accession'].apply(lambda x: x.replace('_', '').split('.')[0])
    taxa_df.set_index('Unnamed: 0', inplace=True)

    annotated_tree = self.species_tree.copy()
    branch_names   = {}

    with open('species_tree-hgt.figTree', 'w') as out:
        out.write("#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" %len(annotated_tree))
        for count, node in enumerate(annotated_tree.traverse()):
            if node.is_leaf():
                leaf_name = node.name
                if leaf_name in taxa_df.index:
                    node_name = leaf_name
                elif leaf_name in taxa_df.accession.values:
                    node_name = taxa_df.query('accession==@leaf_name').index[0]
                else:
                    # Unknown leaf: emitted without any annotation.
                    out.write('\t%s\n' %(leaf_name))
                    continue

                comment = ['source_name="%s"' % taxa_df.loc[node_name, 'Organism']]
                out.write('\t%s ' % (leaf_name))
                if not pd.isnull(taxa_df.loc[node_name, 'taxid']):
                    taxid = taxa_df.loc[node_name, 'taxid']
                    lineage = {rank: lineage_taxid
                               for lineage_taxid, rank in ncbi.get_rank(
                                   ncbi.get_lineage(taxid)).items()
                              }
                    lineage_names = ncbi.get_taxid_translator(lineage.values())
                    for rank in ['class', 'phylum', 'order', 'family']:
                        if rank in lineage:
                            comment.append('tax_%s="%s"' % (rank, lineage_names[lineage[rank]]))
                out.write('[&%s]\n' %' '.join(comment))

            else:
                placeholder = '_branch_%i_' % count
                ranger_name = node.ranger_name
                annotation  = '&support=%i,ranger_name=%s' %(node.support, ranger_name)

                # Row indexes per family number, for each role of this node.
                as_donor     = {}
                as_recipient = {}
                for index, row in df.query('donor==@ranger_name').iterrows():
                    family_number = row.family.split('_')[0]
                    as_donor[family_number] = as_donor.get(family_number, '') + '#%i' % index
                for index, row in df.query('recipient==@ranger_name').iterrows():
                    family_number = row.family.split('_')[0]
                    as_recipient[family_number] = as_recipient.get(family_number, '') + '#%i' % index

                # Merge both roles per family so "/recipient..." lands on the
                # entry of its own family, not on whichever came last.
                roles = {}
                for family_number, tags in as_donor.items():
                    roles[family_number] = 'donor%s' % tags
                for family_number, tags in as_recipient.items():
                    if family_number in roles:
                        roles[family_number] += '/recipient%s' % tags
                    else:
                        roles[family_number] = 'recipient%s' % tags
                for family_number, role in roles.items():
                    annotation += ',group_%s=%s' % (family_number, role)

                branch_names[placeholder] = annotation
                node.name = placeholder

        # Placeholders survive Newick serialisation and are then swapped for
        # the bracketed annotations.
        newick_text = annotated_tree.write(format=1, dist_formatter='%.10f')
        for key, value in branch_names.items():
            newick_text = newick_text.replace(key, '[%s]' % value)
        out.write(';\nend;\n')
        out.write('begin trees;\n\ttree tree_1 = [&R] %s\nend;' %newick_text)

UsageError: Cell magic `%%add_to` not found.


In [304]:
%%add_to aggregate
def visualize_in_gene_figtree(self, df, taxa_table=None):
    """
    Write one annotated NEXUS gene tree per family in ``df``.

    For each family the rooted gene tree is read from
    ``gene_trees/<family>.treefile.rooted``, its leaves are renamed to
    ``<genome>_<gene>`` (underscores removed from both parts), its branches
    are named from ``ranger/<family>/<family>.output1`` and the result is
    written to ``highlighted_gene_trees/group_<n>-hgt.figTree``. That folder
    is created if missing and emptied otherwise. All paths are relative to
    the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``family`` (values starting with ``<int>_``), ``donor_map`` and
        ``recipient_map`` columns and an integer index.
    taxa_table : str
        Path to a tab-separated table with an unnamed first column (read as
        ``Unnamed: 0``) and ``accession``, ``taxid`` and ``Organism``
        columns. Required despite the ``None`` default.

    Nothing is returned.

    A leaf whose name matches none of the recognised layouts is printed, kept
    under its original name and given an empty ``genome``, so it is emitted
    without annotation. Previously such a leaf silently reused the gene and
    genome parsed from the preceding leaf (or raised ``NameError`` when it
    came first).
    """
    ncbi     = ete3.NCBITaxa()

    taxa_df = pd.read_csv(taxa_table, sep='\t')
    taxa_df['Unnamed: 0'] = taxa_df['Unnamed: 0'].apply(lambda x: x.split('.')[0])
    taxa_df['accession'] = taxa_df['accession'].apply(lambda x: x.split('.')[0])
    taxa_df.set_index('Unnamed: 0', inplace=True)

    folder = 'highlighted_gene_trees'
    if not os.path.isdir(folder):
        os.mkdir(folder)
    else:
        os.system('rm -rf %s/*' % folder)

    for group in df.family.unique():
        group_df    = df.query('family==@group')
        group_num   = int(group.split('_')[0])
        with open('gene_trees/%s.treefile.rooted' % group) as newick_handle:
            newick_text = newick_handle.read()
        # Drop a trailing bracketed comment before the final ';'.
        gene_tree   = ete3.Tree(re.sub(r'\[.*\];$', ';', newick_text.strip(), flags=re.M))

        for leaf in gene_tree.get_leaves():
            underscores = leaf.name.count('_')
            if underscores == 1:
                gene, genome = leaf.name.split('_')
            elif underscores > 1 and re.search('GC[AF]_', leaf.name):
                gene, genome = re.search(r'^([^.]+).+?(GC[AF]_\d+)', leaf.name, re.M).groups()
            elif underscores == 2 and re.search('_PRJ', leaf.name):
                gene, genome = re.search('^(.+)_(PRJ.+)$', leaf.name, re.M).groups()
            else:
                print(leaf.name)
                leaf.add_feature('true_name', leaf.name)
                leaf.add_feature('genome', '')
                continue
            gene = gene.split('.')[0]
            leaf.add_feature('true_name', leaf.name)
            leaf.add_feature('genome', genome)
            leaf.name = '%s_%s' % (genome.replace('_', ''), gene.replace('_', ''))

        with open('ranger/%s/%s.output1' % (group, group)) as reconciliation_handle:
            reconciliation_text = reconciliation_handle.read()
        named_tree, _duplicated = self.name_branches_as_reconciliation(reconciliation_text,
                                                                       gene_tree)

        branch_names = {}
        with open('%s/group_%i-hgt.figTree' % (folder, group_num), 'w') as out:
            out.write("#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" %len(gene_tree))
            for count, node in enumerate(named_tree.traverse()):

                if node.is_leaf():
                    if node.genome in taxa_df.index:
                        node_name = node.genome
                    elif node.genome in taxa_df.accession.values:
                        node_name = taxa_df.query('accession==@node.genome').index[0]
                    else:
                        # Unknown genome: original leaf name, no annotation.
                        out.write('\t%s\n' %(node.true_name))
                        continue

                    comment = ['source_name="%s"' % taxa_df.loc[node_name, 'Organism']]
                    out.write('\t%s ' % (node.name))
                    if not pd.isnull(taxa_df.loc[node_name, 'taxid']):
                        taxid = taxa_df.loc[node_name, 'taxid']
                        lineage = {rank: lineage_taxid
                                   for lineage_taxid, rank in ncbi.get_rank(
                                       ncbi.get_lineage(taxid)).items()
                                  }
                        lineage_names = ncbi.get_taxid_translator(lineage.values())
                        for rank in ['class', 'phylum', 'order', 'family']:
                            if rank in lineage:
                                comment.append('tax_%s="%s"' % (rank, lineage_names[lineage[rank]]))
                    out.write('[&%s]\n' %' '.join(comment))

                else:
                    placeholder = '_branch_%i_' % count
                    ranger_name = node.ranger_name
                    annotation  = '&ranger_name=%s' % ranger_name

                    # group_df holds a single family, so each role reduces to
                    # one string of row indexes.
                    donor_tags = ''.join(
                        '#%i' % index
                        for index in group_df.query('donor_map==@ranger_name').index)
                    recipient_tags = ''.join(
                        '#%i' % index
                        for index in group_df.query('recipient_map==@ranger_name').index)

                    if donor_tags:
                        annotation += ',role=donor%s' % donor_tags
                    if recipient_tags:
                        if donor_tags:
                            annotation += '/recipient%s' % recipient_tags
                        else:
                            annotation += ',role=recipient%s' % recipient_tags

                    branch_names[placeholder] = annotation
                    node.name = placeholder

            # Placeholders survive Newick serialisation and are then swapped
            # for the bracketed annotations.
            newick_text = named_tree.write(format=1, dist_formatter='%.10f')
            for key, value in branch_names.items():
                newick_text = newick_text.replace(key, '[%s]' % value)
            out.write(';\nend;\n')
            out.write('begin trees;\n\ttree tree_1 = [&R] %s\nend;' %newick_text)