In [1]:
import linecache
import multiprocessing
import os
import random
import re

import ete3
import jdc
import numpy as np
import pandas as pd

In [None]:
class cd:
    """
    Temporarily switch the process working directory for the body of a ``with``.

    On entry the current directory is remembered in ``savedPath`` and the process
    moves to ``newPath`` (with a leading ``~`` expanded). On exit the remembered
    directory is restored, whether or not the body raised. ``__enter__`` returns
    nothing, so ``with cd(path) as x`` binds ``x`` to ``None``.
    """
    def __init__(self, newPath):
        # Expand "~" once, at construction time, rather than on every entry.
        expanded = os.path.expanduser(newPath)
        self.newPath = expanded

    def __enter__(self):
        # Remember where we were *before* moving so __exit__ can go back.
        origin = os.getcwd()
        self.savedPath = origin
        os.chdir(self.newPath)

    def __exit__(self, etype, value, traceback):
        # The implicit None return lets any exception from the body propagate.
        os.chdir(self.savedPath)

In [2]:
class aggregate(object):
    """
    Holds the species tree, input folders and thresholds used when parsing
    aggregated reconciliation output.

    Only the constructor is defined here; the notebook attaches the remaining
    methods in later cells through the ``%%add_to aggregate`` cell magic.
    """

    def __init__(self, reference_tree, gene_tree_folder, aggregate_folder, reconciliation_folder,
                 branch_support_thresh=0.9,
                 ranger_confidence_threshold=0.9,
                 leaves_allowed=False):
        """
        Store the species tree and the run configuration.

        Parameters
        ----------
        reference_tree : str or tree object
            A string (or ``str`` subclass) is handed to ``ete3.Tree(..., format=1)``.
            Anything else must provide ``copy()``; the copy is stored so the
            caller's object is not shared.
        gene_tree_folder : str
            Folder from which ``parse_aggregated`` loads ``<group>.tree``.
        aggregate_folder : str
            Folder holding one aggregate file per group, named after the group.
        reconciliation_folder : str
            Folder holding one sub-directory per group.
        branch_support_thresh : float
            Stored as ``support_threshold``.
        ranger_confidence_threshold : float
            Stored as-is; ``parse_aggregated`` multiplies it by the number of
            processed replicates to get a minimum count.
        leaves_allowed : bool
            Stored as-is; ``parse_aggregated`` uses it to choose which node names
            are accepted as donor/recipient mappings.
        """
        # isinstance (rather than an exact type check) so str subclasses are
        # also treated as newick/path input instead of failing on .copy().
        if isinstance(reference_tree, str):
            self.species_tree            = ete3.Tree(reference_tree, format=1)
        else:
            self.species_tree            = reference_tree.copy()
        self.support_threshold           = branch_support_thresh
        self.ranger_confidence_threshold = ranger_confidence_threshold
        self.leaves_allowed              = leaves_allowed
        self.gene_tree_folder            = gene_tree_folder
        self.aggregate_folder            = aggregate_folder
        self.reconciliation_folder       = reconciliation_folder

In [5]:
%%add_to aggregate
def match_rooting(self, reference_root, tree_to_root):
    """
    Return a copy of ``tree_to_root`` re-rooted to mirror the root split of
    ``reference_root``.

    The child of ``reference_root`` with the fewest leaves (``len`` of the node;
    ties go to the first child) is used as the outgroup:

    * if that child is a leaf, the leaf with the same name in the copy becomes
      the outgroup;
    * otherwise the common ancestor of the child's leaf names becomes the
      outgroup. When those leaves are not monophyletic in the copy as currently
      rooted, the copy is first re-rooted on one of the leaves reported by
      ``check_monophyly`` and the common ancestor is taken afterwards.

    ``tree_to_root`` itself is never modified. If ``reference_root`` has no
    children there is no split to reproduce and the unchanged copy is returned.
    """
    tmp_tree = tree_to_root.copy()

    if not reference_root.children:
        # Nothing to mirror; hand back the untouched copy instead of None.
        return tmp_tree

    # Only the smallest side of the reference root is ever needed.
    outgroup_clade = min(reference_root.children, key=len)

    if outgroup_clade.is_leaf():
        leaf = tmp_tree.get_leaves_by_name(outgroup_clade.name)[0]
        tmp_tree.set_outgroup(leaf)
        return tmp_tree

    clade_leaf_names = outgroup_clade.get_leaf_names()
    is_it_monophyletic, clade_type, offending_leaves = tmp_tree.check_monophyly(
        clade_leaf_names,
        'name',
        unrooted=False
    )
    if not is_it_monophyletic:
        # Move the root onto a leaf that breaks monophyly before looking up the
        # clade's common ancestor under the new rooting.
        tmp_tree.set_outgroup(offending_leaves.pop())

    equivalent = tmp_tree.get_common_ancestor(clade_leaf_names)
    tmp_tree.set_outgroup(equivalent)
    return tmp_tree

In [6]:
%%add_to aggregate
def name_branches_as_reconciliation(self, reconciliation_file, tree):
    """
    Copy the ``m<N>`` branch names found in reconciliation text onto ``tree``.

    Parameters
    ----------
    reconciliation_file : str
        Text content (not a path) searched line by line for entries of the form
        ``m<N> = LCA[<leaf1>, <leaf2>]:``.
    tree : tree object
        Must provide ``get_common_ancestor(leaf1, leaf2)``. It is modified in
        place: each unnamed common ancestor receives the ``m<N>`` name.

    Returns
    -------
    (tree, duplicated_names)
        The same ``tree`` object, plus a dict mapping every ``m<N>`` whose
        common ancestor already carried a name to that existing name.
    """
    # Raw string: the pattern relies on regex escapes that are not valid
    # Python string escapes.
    branches         = re.findall(r'^(m\d+) = LCA\[(\S+), (\S+)\]:', reconciliation_file, re.M)
    duplicated_names = {}
    for name, leaf1, leaf2 in branches:
        node = tree.get_common_ancestor(leaf1, leaf2)
        if node.name:
            # Node was named earlier (or came in named); remember the alias
            # instead of overwriting it.
            duplicated_names[name] = node.name
        else:
            node.name = name
    return tree, duplicated_names

In [14]:
%%add_to aggregate
def parse_aggregated(self, group):
    """
    Extract well-supported transfer events for one gene family.

    Inputs read for ``group``:

    * ``<aggregate_folder>/<group>``: aggregated reconciliation text;
    * ``<reconciliation_folder>/<group>/<group>-MAD.ranger_out1``: line 8 is
      parsed as a newick tree (``format=1``) and used as the rooting reference;
    * ``<gene_tree_folder>/<group>.tree``: gene tree with branch supports.

    Returns
    -------
    dict
        ``{group: None}`` when the reconciliation directory or aggregate file is
        missing, or when the lower quartile of internal-node support is below 80.
        Otherwise ``{group: [selected_transfers, gene_tree]}``, where
        ``gene_tree`` is the re-rooted, branch-named support tree and
        ``selected_transfers`` is a list of dicts with keys ``donor``,
        ``recipient``, ``donor_map`` and ``recipient_map``.

    Raises
    ------
    AttributeError
        If the aggregate text does not start with ``Processed <N> files``.
    """
    reconciliation_dir = '%s/%s' % (self.reconciliation_folder, group)
    aggregate_path     = '%s/%s' % (self.aggregate_folder, group)
    if not os.path.isdir(reconciliation_dir) or not os.path.isfile(aggregate_path):
        return {group:None}

    with open(aggregate_path) as aggregate_handle:
        aggregated = aggregate_handle.read()

    # Read the reference tree by full path; no need to change the process-wide
    # working directory just to fetch one line.
    named_tree = ete3.Tree(
        linecache.getline('%s/%s-MAD.ranger_out1' % (reconciliation_dir, group), 8),
        format=1
    )

    gene_tree = self.match_rooting(
        named_tree,
        ete3.Tree('%s/%s.tree' % (self.gene_tree_folder, group))
    )
    gene_tree, duplicated_names = self.name_branches_as_reconciliation(aggregated, gene_tree)

    # Skip families whose gene tree is poorly supported overall.
    # NOTE(review): the 80 and 95 cut-offs below are hard-coded;
    # self.support_threshold is not consulted in this method -- confirm intent.
    ufboot_distribution = [node.support for node in gene_tree.traverse() if not node.is_leaf()]
    if np.percentile(ufboot_distribution, 25) < 80:
        return {group:None}

    num_replicates = float(re.match(r'Processed (\d+) files', aggregated).group(1))

    # Species-tree node names accepted as mappings: only internal "n<N>" names
    # unless leaves are allowed, in which case any non-blank token is accepted.
    node_pattern = r'\S+' if self.leaves_allowed else r'n\d+'
    # NOTE(review): "[^0]\d+?" needs at least two characters, so single-digit
    # transfer counts never match -- left as found, confirm this is intended.
    transfer_pattern = (
        r'^(m\d+) = .*, Transfers = [^0]\d+?\], '
        r'\[Most Frequent mapping --> ({node}), (\d+) times\], '
        r'\[Most Frequent recipient --> ({node}), (\d+) times\].'
    ).format(node=node_pattern)
    transfers = re.findall(transfer_pattern, aggregated, re.M)

    # Keep transfers whose donor and recipient mappings were each seen in at
    # least the configured fraction of replicates.
    minimum_count       = self.ranger_confidence_threshold*num_replicates
    confident_transfers = []
    for donor_map, donor, ranger_confidence_donor, recipient, ranger_confidence_recipient in transfers:
        if int(ranger_confidence_donor)     < minimum_count or \
           int(ranger_confidence_recipient) < minimum_count:
            continue
        confident_transfers.append((donor_map, donor, recipient))

    selected_transfers = []
    for donor_map_name, donor_name, recipient_name in confident_transfers:
        # A name that collided during branch naming lives on the tree under the
        # name recorded in duplicated_names.
        node_name = duplicated_names.get(donor_map_name, donor_map_name)
        donor_map = gene_tree.search_nodes(name=node_name)[0]
        if donor_map.support < 95:
            continue

        # Find which child of the donor branch has the recipient as its most
        # frequent mapping. Names are escaped and delimited so "m1" cannot match
        # a line for "m12" and "n1" cannot match a mapping to "n12".
        child_alternatives   = '|'.join(re.escape(child.name) for child in donor_map.children)
        recipient_map_search = re.search(
            r'^({children})(?!\w).*Most Frequent mapping --> {recipient},'.format(
                children=child_alternatives,
                recipient=re.escape(recipient_name)),
            aggregated, re.M)

        if recipient_map_search:
            recipient_map_name = recipient_map_search.group(1)
            # An empty name (e.g. an unnamed child) cannot be looked up later.
            if not all([donor_name, recipient_name, donor_map_name, recipient_map_name]):
                continue
            selected_transfers.append({'donor':donor_name, 'recipient':recipient_name,
                                       'donor_map':donor_map_name, 'recipient_map':recipient_map_name})
    return {group:[selected_transfers, gene_tree]}

In [15]:
%%add_to aggregate
def assess_dtl_dist(self, tmp_input):
    """
    Reconcile the donor-side subtree of each transfer and return a normalised
    reconciliation cost per transfer.

    Parameters
    ----------
    tmp_input : tuple
        ``(group, (transfer_data, gene_tree))``: one key/value pair of the dict
        produced by ``parse_aggregated``. Each transfer must carry a
        ``'recipient_map'`` naming a node of ``gene_tree``.

    Returns
    -------
    dict
        ``{group: dtl_distances}``, one float per reconciliation cost reported,
        paired in order with the donor subtrees. ``{group: []}`` when there are
        no transfers, in which case no files are written and nothing is run.

    Side effects
    ------------
    Writes ``tmp_ranger-<process name>.input`` / ``.output`` in the current
    working directory, expects ``species_tree.template`` there, and runs the
    Ranger-DTL binary at a hard-coded absolute path.
    """
    group, (transfer_data, gene_tree) = tmp_input

    donor_trees = []
    for transfer in transfer_data:
        recipient_branch = gene_tree.search_nodes(name=transfer['recipient_map'])[0]
        # The donor side is taken to be the first sister of the recipient branch.
        donor_branch     = recipient_branch.get_sisters()[0]
        donor_trees.append(donor_branch.write(format=9))

    if not donor_trees:
        # Nothing to reconcile; skip the external run entirely.
        return {group:[]}

    #
    # donor compatibility assessment
    # File names are keyed on the process name so parallel workers do not
    # overwrite each other's scratch files.
    process_name = multiprocessing.current_process().name
    input_path   = 'tmp_ranger-%s.input'  % process_name
    output_path  = 'tmp_ranger-%s.output' % process_name

    os.system('cp species_tree.template %s' % input_path)
    with open(input_path, 'a') as ranger_input:
        ranger_input.write('\n'.join(donor_trees))

    # Drop any output left by an earlier call in this process, so a failed run
    # cannot be mistaken for a fresh result.
    if os.path.exists(output_path):
        os.remove(output_path)

    os.system('/work/ranger/CorePrograms/Ranger-DTL.mac -q -i %s -o %s'
              % (input_path, output_path)
             )

    with open(output_path) as ranger_output:
        reconciliation_costs = re.findall(
            r'^The minimum reconciliation cost is: (\d+)',
            ranger_output.read(),
            re.M
        )

    # NOTE(review): donor_tree is the newick *string*, so len() is its character
    # count, not a leaf count -- confirm this normalisation is intended.
    dtl_distances = [
        float(reconciliation_cost)/len(donor_tree)
        for reconciliation_cost, donor_tree in zip(reconciliation_costs, donor_trees)
    ]

    return {group:dtl_distances}