In [79]:
import ete3
import os
import re
import multiprocessing
import pickle as pkl
import linecache
import pandas as pd
import numpy as np
import random
import plotly
import plotly.plotly as ptl
from plotly import graph_objs as go
import pyparsing as pp
import subprocess

# Read the plotly credentials from a local file. The first two
# whitespace-separated tokens are handed to `ptl.sign_in` below. A context
# manager is used so the file handle is closed as soon as it has been read.
with open('/Users/thiberio/plotly_accession') as credentials_handle:
    plotly_accession = credentials_handle.read().split()
ptl.sign_in(plotly_accession[0], plotly_accession[1])

# Relative paths used by the rest of the notebook are resolved from here.
os.chdir('/work/Alphas_and_Cyanos')

class cd:
    """
    Context manager for changing the current working directory.

    On entry the current directory is remembered and the process moves to
    ``newPath`` (``~`` is expanded); on exit, whether or not the body raised,
    the remembered directory is restored. Exceptions raised inside the
    ``with`` body are not suppressed.
    """
    def __init__(self, newPath):
        # Expand a leading "~" once, at construction time.
        self.newPath = os.path.expanduser(newPath)

    def __enter__(self):
        self.savedPath = os.getcwd()
        os.chdir(self.newPath)
        # Return the manager so `with cd(path) as ctx:` binds something useful
        # (e.g. `ctx.savedPath`) instead of None.
        return self

    def __exit__(self, etype, value, traceback):
        os.chdir(self.savedPath)

<h3>Functions!</h3>

In [2]:
def match_rooting(ref_tree, tree2):
    """
    Return a copy of ``tree2`` rooted on the clade matching one root child of ``ref_tree``.

    The children of the ``ref_tree`` root are sorted with ``key=len`` and only
    the first one is used as the template for the outgroup:

    * if that child is a leaf, the leaf with the same name in the copy becomes
      the outgroup;
    * otherwise the common ancestor of the child's leaf names in the copy
      becomes the outgroup. When ``check_monophyly`` reports those leaves as
      not monophyletic, the copy is first rooted on one element popped from the
      third value ``check_monophyly`` returned, and the common ancestor is
      looked up after that temporary re-rooting.

    ``tree2`` itself is never modified. If the reference root has no children
    the untouched copy is returned.
    """
    rerooted      = tree2.copy()
    root_children = sorted(ref_tree.children, key=len)
    if not root_children:
        return rerooted

    template_clade = root_children[0]

    if template_clade.is_leaf():
        outgroup_leaf = rerooted.get_leaves_by_name(template_clade.name)[0]
        rerooted.set_outgroup(outgroup_leaf)
        return rerooted

    monophyletic, _clade_type, offending_leaves = rerooted.check_monophyly(
        template_clade.get_leaf_names(), 'name', unrooted=False)
    if not monophyletic:
        # Root somewhere outside the target clade first, so that the common
        # ancestor fetched below is computed on the re-rooted topology.
        rerooted.set_outgroup(offending_leaves.pop())

    rerooted.set_outgroup(rerooted.get_common_ancestor(template_clade.get_leaf_names()))
    return rerooted

In [3]:
def rename_branches(reconciliation_file, tree):
    """
    Label internal nodes of ``tree`` with the branch names found in a reconciliation text.

    Every line of ``reconciliation_file`` shaped like ``m12 = LCA[leafA, leafB]:``
    is read; the common ancestor of the two leaves is looked up in ``tree``
    and, if it has no name yet, it is given the ``m`` name. ``tree`` is
    modified in place.

    Parameters
    ----------
    reconciliation_file : str
        Text content of the reconciliation (not a path).
    tree : object
        Tree exposing ``get_common_ancestor(leaf1, leaf2)`` whose result has a
        writable ``name`` attribute.

    Returns
    -------
    tuple
        ``(tree, duplicated_names)`` where ``duplicated_names`` maps each ``m``
        name whose node was already named to the name that node kept.
    """
    # Raw string: the pattern is unchanged, but "\d", "\S" and "\[" are no
    # longer invalid escape sequences of a normal string literal.
    branches = re.findall(r'^(m\d+) = LCA\[(\S+), (\S+)\]:', reconciliation_file, re.M)

    duplicated_names = {}
    for name, leaf1, leaf2 in branches:
        node = tree.get_common_ancestor(leaf1, leaf2)
        if node.name:
            # The node already carries a name: keep it and remember the alias.
            duplicated_names[name] = node.name
        else:
            node.name = name
    return tree, duplicated_names

In [77]:
def parse_aggregated(folder, threshold=0.9, leaves_allowed=False):
    """
    Select well-supported transfers from the aggregated reconciliation of one gene family.

    Parameters
    ----------
    folder : str
        Name of the family. It must be a directory relative to the current
        working directory and also the name of a file under the
        ``aggregated/mad_roots-stricter_branch_lengths`` directory.
    threshold : float
        Minimum fraction of the processed replicates in which both the most
        frequent donor mapping and the most frequent recipient must occur.
    leaves_allowed : bool
        When False only donors/recipients named ``n<digits>`` are accepted;
        when True any non-whitespace name is accepted.

    Returns
    -------
    dict
        ``{folder: None}`` when the inputs are missing or the 25th percentile
        of the internal-node supports is below 80; otherwise
        ``{folder: [selected_transfers, gene_tree]}`` where each selected
        transfer is a dict with the keys ``donor``, ``recipient``,
        ``donor_map`` and ``recipient_map``.

    Raises
    ------
    ValueError
        If the aggregated file does not start with ``Processed <N> files``.
    """
    aggregated_path = '/work/Alphas_and_Cyanos/aggregated/mad_roots-stricter_branch_lengths/%s' % folder
    if not os.path.isdir(folder) or not os.path.isfile(aggregated_path):
        return {folder:None}

    # Context manager so the handle is closed right after reading.
    with open(aggregated_path) as aggregated_handle:
        aggregated = aggregated_handle.read()

    # The tree is taken from line 8 of the first reconciliation output.
    with cd(folder):
        named_tree = ete3.Tree(linecache.getline('%s-MAD.ranger_out1' %folder, 8), format=1)

    # Root the input tree like the one above, then copy the branch names from
    # the aggregated file onto it.
    support_tree = match_rooting(
        named_tree,
        ete3.Tree('/work/Alphas_and_Cyanos/ranger_input_trees-no_long_branches/%s.tree' %folder))
    gene_tree, duplicated_names = rename_branches(aggregated, support_tree)

    ufboot_distribution = [node.support for node in gene_tree.traverse() if not node.is_leaf()]
    if np.percentile(ufboot_distribution, 25) < 80:
        return {folder:None}

    replicates_match = re.match(r'Processed (\d+) files', aggregated)
    if replicates_match is None:
        raise ValueError(
            'aggregated file for %s does not start with "Processed <N> files"' % folder)
    num_replicates = float(replicates_match.group(1))

    # The two original patterns differ only in what may name a donor/recipient.
    # NOTE(review): "[^0]\d+?" needs one non-"0" character followed by at least
    # one digit, so single-digit transfer counts never match — confirm this is
    # the intended filter.
    node_pattern   = r'\S+' if leaves_allowed else r'n\d+'
    transfer_regex = (
        r'^(m\d+) = .*, Transfers = [^0]\d+?\], '
        r'\[Most Frequent mapping --> (%s), (\d+) times\], '
        r'\[Most Frequent recipient --> (%s), (\d+) times\].' % (node_pattern, node_pattern)
    )
    transfers = re.findall(transfer_regex, aggregated, re.M)

    # Keep transfers whose donor and recipient both reach the replicate threshold.
    minimum_count       = threshold*num_replicates
    supported_transfers = [
        (donor_map, donor, recipient)
        for donor_map, donor, ranger_confidence_donor, recipient, ranger_confidence_recipient in transfers
        if int(ranger_confidence_donor)     >= minimum_count
        and int(ranger_confidence_recipient) >= minimum_count
    ]

    selected_transfers = []
    for donor_map_name, donor_name, recipient_name in supported_transfers:
        # Nodes that already had a name kept it; go through the alias table.
        node_name = duplicated_names.get(donor_map_name, donor_map_name)
        donor_map = gene_tree.search_nodes(name=node_name)[0]
        if donor_map.support < 95:
            continue

        # The recipient branch is whichever child of the donor mapping node is
        # itself most frequently mapped to the recipient.
        recipient_map_search = re.search(
            '^({children[0]}|{children[1]}).*Most Frequent mapping --> {recipient}'.format(
                recipient=recipient_name,
                children=[child.name for child in donor_map.children]),
            aggregated, re.M)
        if not recipient_map_search:
            continue

        recipient_map_name = recipient_map_search.group(1)
        if not all([donor_name, recipient_name, donor_map_name, recipient_map_name]):
            continue
        selected_transfers.append({'donor':donor_name, 'recipient':recipient_name,
                                   'donor_map':donor_map_name, 'recipient_map':recipient_map_name})
    return {folder:[selected_transfers, gene_tree]}

In [122]:
def assess_dtl_dist(tmp_input):
    """
    Reconcile the donor-side subtree of every transfer of one family against the species tree.

    Parameters
    ----------
    tmp_input : tuple
        ``(group, (transfer_data, gene_tree))``, i.e. one item of the
        transfers dictionary. Each transfer must carry a ``recipient_map`` key
        naming a node of ``gene_tree``.

    Returns
    -------
    dict
        ``{group: dtl_distances}`` with one float per reconciliation cost
        reported by Ranger-DTL, each divided by ``len()`` of the matching
        donor tree string.

    Notes
    -----
    Scratch files are named after the current process so pool workers do not
    overwrite each other's files. Paths are relative to the working directory,
    which must contain ``species_tree.template``.
    """
    group, (transfer_data, gene_tree) = tmp_input

    # The donor subtree is the first sister of the recipient branch.
    donor_trees = []
    for transfer in transfer_data:
        recipient_branch = gene_tree.search_nodes(name=transfer['recipient_map'])[0]
        donor_branch     = recipient_branch.get_sisters()[0]
        donor_trees.append(donor_branch.write(format=9))

    worker_name = multiprocessing.current_process().name
    input_path  = 'tmp_ranger-%s.input'  % worker_name
    output_path = 'tmp_ranger-%s.output' % worker_name

    #
    # donor compatibility assessment
    # Argument lists are used so no shell has to parse the command line.
    subprocess.call(['cp', 'species_tree.template', input_path])
    with open(input_path, 'a') as ranger_input:
        ranger_input.write('\n'.join(donor_trees))

    # A worker reuses the same file names for every family it handles. Remove
    # the previous output so a failed run cannot be mistaken for this one.
    if os.path.exists(output_path):
        os.remove(output_path)

    subprocess.call(['/work/ranger/CorePrograms/Ranger-DTL.mac', '-q',
                     '-i', input_path,
                     '-o', output_path])

    with open(output_path) as ranger_output:
        reconciliation_costs = re.findall(r'^The minimum reconciliation cost is: (\d+)',
                                          ranger_output.read(),
                                          re.M)

    # NOTE(review): `donor_tree` is the newick string written above, so the
    # cost is divided by the string length, not by a leaf count — confirm this
    # is the intended normalisation.
    dtl_distances = [float(reconciliation_cost)/len(donor_tree)
                     for reconciliation_cost, donor_tree in zip(reconciliation_costs, donor_trees)]

    return {group:dtl_distances}

<h3>real code...</h3>

In [6]:
import jdc
%run base_functions.ipynb

In [7]:
# NOTE(review): this call fails with the TypeError recorded in the cell output:
# `aggregate` is called without its four required positional arguments
# (reference_tree, gene_tree_folder, aggregate_folder, reconciliation_folder).
# `aggregate` is presumably provided by base_functions.ipynb — TODO confirm.
yeah = aggregate()

TypeError: __init__() missing 4 required positional arguments: 'reference_tree', 'gene_tree_folder', 'aggregate_folder', and 'reconciliation_folder'

In [80]:
# Parse every entry of the reconciliation directory in parallel. Each worker
# returns a single-key dictionary {folder: None} or {folder: [transfers, tree]}.
with cd('reconciliations/mad_roots-stricter_branch_lengths'):
    pool = multiprocessing.Pool(processes=15)
    try:
        results = pool.map(parse_aggregated, os.listdir('.'))
    finally:
        # Release the worker processes even when a task raised.
        pool.close()
        pool.join()

    # Keep only the families that were parsed and yielded at least one transfer.
    transfers = {}
    for filtered in results:
        for folder_name, parsed in filtered.items():
            if parsed is not None and parsed[0] != []:
                transfers[folder_name] = parsed

# Back in the project directory: cache the selected transfers on disk.
with open('aggregated/mad_transfers-test.pkl', 'wb') as out:
    pkl.dump(transfers, out)

In [82]:
# Write one "donor<TAB>recipient" constraint per selected transfer. The `with`
# block guarantees the file is flushed and closed before MaxTiC reads it.
with open('aggregated/maxtic.constrains-test', 'w') as out:
    for group, (transfer_data, gene_tree) in transfers.items():
        for transfer in transfer_data:
            out.write('%s\t%s\n' % (transfer['donor'], transfer['recipient']))

# Run MaxTiC on the named species tree with the constraints written above.
subprocess.call(['python',
                 '/work/ale/maxtic/MaxTiC.py',
                 'rooted_partitions-with_named_branches.treefile',
                 'aggregated/maxtic.constrains-test',
                 'ls=180'])

# Load the partial order produced by MaxTiC (space separated, no header row).
maxtic = pd.read_table('aggregated/maxtic.constrains-test_MT_output_partial_order',
                       header=None,
                       names=['donor', 'recipient', 'weight', 'no_idea'],
                       sep=' ')

In [111]:
# Keep, per family, only the transfers whose (donor, recipient) pair appears in
# the MaxTiC table. The pairs are collected once into a set so each transfer
# costs one hash lookup instead of a boolean-mask scan of the whole table.
maxtic_pairs = set(zip(maxtic.donor, maxtic.recipient))

maxtic_compatible_transfers = {}
for group, (transfer_data, gene_tree) in transfers.items():
    tmp_transfers = []
    for transfer in transfer_data:
        if (transfer['donor'], transfer['recipient']) in maxtic_pairs:
            tmp_transfers.append(transfer)
    # Families left without any compatible transfer are dropped entirely.
    if tmp_transfers:
        maxtic_compatible_transfers[group] = [tmp_transfers, gene_tree]

In [126]:
# Score the donor subtrees of every MaxTiC-compatible family in parallel.
pool = multiprocessing.Pool(processes=18)
try:
    results = pool.map(assess_dtl_dist, list(maxtic_compatible_transfers.items()))
finally:
    # Without close/join the worker processes stay alive for the lifetime of
    # the kernel.
    pool.close()
    pool.join()

In [129]:
# Merge the single-family dictionaries returned by the workers into one
# mapping; on a repeated key the later entry wins.
donor_dtl_distances = {}
for element in results:
    for result_group, result_distances in element.items():
        donor_dtl_distances[result_group] = result_distances

In [188]:
# Distances measured on the named reference tree:
#   transfer_distances     - donor-to-recipient branch-length distance, keyed by the unordered pair
#   donor_distance_to_root - branch-length distance returned by reference_tree.get_distance(donor)
#   donor_complexity_ratio - topology-only distance of the donor divided by len(donor_branch)
reference_tree         = ete3.Tree('rooted_partitions-with_named_branches.treefile', format=1)
transfer_distances     = {}
donor_distance_to_root = {}
donor_complexity_ratio = {}
for group, (transfer_data, gene_tree) in maxtic_compatible_transfers.items():
    for transfer in transfer_data:
        donor_name, recipient_name = transfer['donor'], transfer['recipient']
        pair         = frozenset([donor_name, recipient_name])
        donor_branch = reference_tree.search_nodes(name=donor_name)[0]

        # Each unordered donor/recipient pair is measured once.
        if pair not in transfer_distances:
            recipient_branch = reference_tree.search_nodes(name=recipient_name)[0]
            transfer_distances[pair] = donor_branch.get_distance(recipient_branch, topology_only=False)

        # Per-donor values are computed the first time a donor is seen.
        if donor_name in donor_distance_to_root:
            continue

        tmp_dist = reference_tree.get_distance(donor_name, topology_only=False)
        donor_distance_to_root[donor_name] = tmp_dist

        # Summed branch lengths below the donor; computed here but not read
        # anywhere else in this cell.
        donor_subtree_complexity = sum(node.dist
                                       for node in donor_branch.traverse()
                                       if node.name != donor_name)

        tmp_dist = reference_tree.get_distance(donor_name, topology_only=True)
        donor_complexity_ratio[donor_name] = tmp_dist/len(donor_branch)

In [206]:
# Show the MaxTiC `weight` of the transfer at index `position` of family `group`.
# NOTE(review): `group` and `position` are not assigned in this cell; it uses
# whatever values an earlier-executed loop left in the kernel namespace, so the
# result depends on execution order.
maxtic.loc[(maxtic.donor==maxtic_compatible_transfers[group][0][position]['donor']) &
                                (maxtic.recipient==maxtic_compatible_transfers[group][0][position]['recipient']), 'weight'].squeeze()


4.0

In [220]:
# Gather one plotted point per scored transfer:
#   x = donor-recipient distance, y = donor complexity ratio,
#   color = donor DTL distance, marker_size = 10 + 0.7 * MaxTiC weight.
tracer = {'color':[], 'x':[], 'y':[], 'text':[], 'marker_size':[]}
for group in donor_dtl_distances.keys():
    for position, dtl_distance in enumerate(donor_dtl_distances[group]):
        current_transfer = maxtic_compatible_transfers[group][0][position]
        donor_name       = current_transfer['donor']
        recipient_name   = current_transfer['recipient']

        # NOTE(review): `maxtic_compatible` is not assigned anywhere in the
        # visible notebook; this test relies on a name left in the kernel by
        # a cell that is not shown — confirm where it comes from.
        if [donor_name, recipient_name] not in maxtic_compatible:
            continue

        tracer['x'    ].append(transfer_distances[frozenset([donor_name, recipient_name])])
        tracer['y'    ].append(donor_complexity_ratio[donor_name])
        tracer['text' ].append('%s-#%i' %(group, position))
        tracer['color'].append(dtl_distance)

        is_this_transfer = (maxtic.donor==donor_name) & (maxtic.recipient==recipient_name)
        transfer_count   = maxtic.loc[is_this_transfer, 'weight'].squeeze()
        tracer['marker_size'].append(10+transfer_count*0.7)


# Split the colour values (donor DTL distances) into 100 evenly spaced bins so
# that every bin can become its own trace, toggled by the slider built below.
color_range          = np.linspace(np.min(tracer['color']), np.max(tracer['color']), 100)
tracer['color_bins'] = np.digitize(tracer['color'], color_range)
tracer_df = pd.DataFrame.from_dict(tracer)

binned_df = tracer_df.groupby(by='color_bins')

# The colour limits are the same for every trace, so compute them once.
color_min = tracer_df.color.values.min()
color_max = tracer_df.color.values.max()

bins        = []
# `bin_index` rather than `bin`, so the builtin `bin` is not shadowed for the
# rest of the kernel session.
for bin_index in binned_df.groups.keys():
    tmp_df = binned_df.get_group(bin_index)
    bins.append(
        go.Scatter(
            x=tmp_df.x.values,
            y=tmp_df.y.values,
            mode='markers',
            text=tmp_df.text.values,
            # np.digitize labels start at 1, hence the -1 to reach the bin edge.
            name=str(round(color_range[bin_index-1], 4)),
            hoverinfo='text',
            showlegend=False,
            marker=dict(
                size=tmp_df.marker_size.values,
                color=tmp_df.color.values,
                colorscale='RdBu',
                cmax=color_max,
                cmin=color_min,
                symbol='circle',
                opacity=.7,
            )
        )
    )

#
# source: https://plot.ly/python/sliders/
# Every visibility list has one flag per data trace plus a final flag for the
# invisible colour-bar trace appended at the end of this block; that final flag
# is always True.
trace_count = len(bins)
steps = [{'label': 'All',
          'method': 'restyle',
          'args': ['visible', [True] * (trace_count + 1)]}]
for i in range(trace_count):
    # Show only trace i (and the colour-bar trace).
    visibility    = [False] * trace_count + [True]
    visibility[i] = True
    step = {'label': bins[i]['name'],
            'method': 'restyle',
            'args': ['visible', visibility]}
    steps.append(step)
slider = dict(steps=steps, currentvalue={'prefix':'Donor subtree DTL: '}, pad={'t':50})

# Fully transparent two-point trace placed at the data extremes; it carries the
# colour bar for the figure.
colorbar_marker = dict(
    size=10,
    color=[0.5],
    colorscale='RdBu',
    cmax=np.max(tracer['color']),
    cmin=np.min(tracer['color']),
    symbol='circle',
    opacity=0,
    colorbar=dict(title='Donor subtree DTL cost')
)
bins.append(
    go.Scatter(
        x=[np.min(tracer['x']), np.max(tracer['x'])],
        y=[np.min(tracer['y']), np.max(tracer['y'])],
        showlegend=False,
        mode='markers',
        marker=colorbar_marker
    )
)

# Assemble the figure and write it to a standalone HTML file without opening
# a browser window.
layout = go.Layout(
    title='Donor/Recipient subtree reconciliation costs',
    hovermode='closest',
    width=1200,
    height=1000,
    xaxis={'title': 'Donor-Recipient distance'},
    yaxis={'title': 'Donor branch distance to root and donor subtree ratio'},
    sliders=[slider],
)
fig  = go.Figure(data=bins, layout=layout)
plot = plotly.offline.plot(fig, filename='./test.html', auto_open=False)