In [1]:
import pandas as pd
import re
import itertools
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import igraph as ig
from IPython.display import HTML
import multiprocessing
import pickle as pkl
import random
from collections import Counter
import os
import subprocess
from scipy.stats import mannwhitneyu
import ete3
from copy import deepcopy
from math import ceil

ncbi = ete3.NCBITaxa()
%run assess_connections-functions.ipynb

%cd ~/work/eggNOG/

/nobackup1b/users/thiberio/eggNOG


In [2]:
sampled_genomes = pd.read_csv('../kelsey/genomes.tab',
                              sep='\t',
                              index_col=0)

lineages = pd.DataFrame()
for taxid in sampled_genomes.species_taxid.unique():
    if pd.isna(taxid):
        continue
    lineages = lineages.append({tax_rank: tmp_taxid 
                                 for tmp_taxid, tax_rank in ncbi.get_rank(ncbi.get_lineage(taxid)).items()},
                                ignore_index=True)
lineages = lineages.reindex(columns=['class', 'family',  'genus', 'phylum',
                                     'order', 'species', 'superkingdom']).copy()
lineages = lineages.query('superkingdom == 2').copy()

In [3]:
sampled_phyla = [phylum for phylum in lineages.phylum.unique().astype(int) if phylum > 0]

In [4]:
working_groups  = pd.read_parquet('working_eggNOG_groups.parquet', engine='fastparquet')
working_trees   = pd.read_parquet('working_eggNOG_trees.parquet' , engine='fastparquet')
eggNOG_taxonomy = pd.read_parquet('eggNOG_taxonomy.parquet'      , engine='fastparquet')

In [5]:
with open('all_results.pkl', 'rb') as _:
    phylum_connections = pkl.load(_)

In [6]:
regular_connections     = []
significant_connections = []

for group_id, group_clusters in phylum_connections:
    if not group_clusters:
        continue
    
    for cluster_num, cluster_data in group_clusters.items():

        for ref_phylum, phyla_dists in cluster_data.items():
            
            if not phyla_dists['df'].shape[0]:
                continue

            closest_phylum = phyla_dists['df'].iloc[0, 0]
            if not closest_phylum in sampled_phyla:
                continue

            if phyla_dists['significant']:
                significant_connections.append( (ref_phylum,
                                                 phyla_dists['df'].iloc[0, 0],
                                                 phyla_dists['df'].iloc[0, 1], 
                                                 '%s#%i' % (group_id, cluster_num)) )

                for index, tmp_series in phyla_dists['df'].iloc[1:, :].iterrows():
                    if tmp_series.phylum in sampled_phyla:
                        regular_connections.append( (ref_phylum,
                                                         tmp_series.phylum,
                                                         tmp_series['median'], 
                                                         '%s#%i' % (group_id, cluster_num)) )

            else:
                for index, tmp_series in phyla_dists['df'].iterrows():
                    if tmp_series.phylum in sampled_phyla:
                        regular_connections.append( (ref_phylum,
                                                         tmp_series.phylum,
                                                         tmp_series['median'], 
                                                         '%s#%i' % (group_id, cluster_num)) )

significant_df = pd.DataFrame(data=significant_connections, columns=['source', 'target', 'distance', 'group'])
regular_df     = pd.DataFrame(data=regular_connections,     columns=['source', 'target', 'distance', 'group'])

significant_df.dropna(how='any', inplace=True)
regular_df.dropna(    how='any', inplace=True)

In [129]:
#
# iqtree
#

def assess_transfers_between_phyla(related_phyla):
# related_phyla = (1090, 1117)

    candidates    = set(significant_df.query('source=="%i" and target=="%i"' % related_phyla).group.values).union(
                        significant_df.query('target=="%i" and source=="%i"' % related_phyla).group.values
                    )
    descriptions  = {f'from {related_phyla[0]} to {related_phyla[1]}': set(),
                     f'from {related_phyla[1]} to {related_phyla[0]}': set(),
                      'sisters':                                       set(),
                      'no clear relationship between phyla':           set()}

    for candidate in candidates:

        group_id, cluster_num = candidate.split('#')

        if os.path.isfile('candidates/iqtree/%s-cluster%s.treefile.rooted' % (group_id, 
                                                                              cluster_num)):
            with cd('candidates/iqtree/'):
                #
                # we need the original tree cause mad removes support values, so we just transfer root positions
                try:
                    tmp_tree   = match_rooting(ete3.Tree('%s-cluster%s.treefile.rooted' % (group_id, 
                                                                                           cluster_num),
                                                         format=1),
                                               ete3.Tree('%s-cluster%s.treefile'        % (group_id, 
                                                                                           cluster_num),
                                                         format=1))
                except ete3.parser.newick.NewickError:
                    continue

        print(candidate)

        #
        # extract taxIDs to subsample taxonomy tables
        taxids = {}
        for leaf in tmp_tree.get_leaf_names():
            #
            # leaf names are composed by <taxid>.<locus_tag>
            #   ps: locus_tag may also have "." within it
            tmp_taxid = int(leaf.split('.')[0])

            if tmp_taxid not in taxids:
                taxids[tmp_taxid] = []
            taxids[tmp_taxid].append(leaf)

        #
        # select taxIDs from each assessed phylum...
        phylum1_taxonomy = eggNOG_taxonomy.loc[taxids].query('phylum==@related_phyla[0]')
        phylum2_taxonomy = eggNOG_taxonomy.loc[taxids].query('phylum==@related_phyla[1]')

        #
        # ... and their respect leaves
        phylum1_leaves = set()
        for taxid in phylum1_taxonomy.index:
            phylum1_leaves.update(taxids[taxid])

        phylum2_leaves = set()
        for taxid in phylum2_taxonomy.index:
            phylum2_leaves.update(taxids[taxid])
        
        all_leaves = phylum1_leaves.union(phylum2_leaves)

        #
        # as tree traversing through ete3 isn't very efficient, and doesn't scalate very well,
        #   we create an iGraph dag for more effient traversing
        dag  = tree_to_dag_bb_and_alrt(tmp_tree)          # we need a directed version...
        udag = dag.as_undirected(mode='each') # ... and an undirected one for different processes

        #
        # placeholders where we will add monophyletic clades for each phylum
        phylum1_clades = set() 
        phylum2_clades = set()

        ignored_nodes  = [] # descendants of monophyletic nodes should be ignored once their
                            #   ancestors have been stored.

        #
        # traverse through internal nodes
        for node in dag.vs:
            if node.index in ignored_nodes:
                continue

            node_leaves = get_leaf_names(node)

            if all_leaves.isdisjoint(node_leaves):
                ignored_nodes.extend(get_descendant_indices(node, leaves=True))

            #
            # if there aren't leaves from other phyla within this node it is monophyletic
            if phylum1_leaves.intersection(node_leaves) and phylum1_leaves.issuperset(node_leaves):
                phylum1_clades.add(node.index)
                ignored_nodes.extend(get_descendant_indices(node, leaves=True))

            elif phylum2_leaves.intersection(node_leaves) and phylum2_leaves.issuperset(node_leaves):
                phylum2_clades.add(node.index)
                ignored_nodes.extend(get_descendant_indices(node, leaves=True))

        #
        # now we add some flexibility to the monophyly of nodes for three reasons:
        #   1) donor nodes within gene trees will never be monophyletic since the recipient
        #      must be nested within it.
        #   2) if there are other transfers from the donor and/or recipient phyla to a 3rd one
        #      we still want to capture it.
        #   3) good ol' phylogenetic uncertainty
        phylum1_clades = merge_polyphyletic_clades(phylum1_clades, udag)
        phylum2_clades = merge_polyphyletic_clades(phylum2_clades, udag)

        #
        # flag if we can identify relations between phyla:
        #   1 nested within 2
        #   2 nested within 1
        #   1 and 2 are sisters
        #
        # if no relation is identified, leave as false
        phyla_relationship_flag = 0

        for clade1, clade2 in itertools.product(phylum1_clades, phylum2_clades):

            #
            # capture ancestors of clade1 by querying nodes between itself and the root node
            clade1_ancestors = udag.vs[clade1].get_shortest_paths(udag.vs[0])[0][1:]
            #
            # if clade2 within clade1 ancestors it means that clade1 is nested within clade2
            #   evidence of transfer from clade2 -> clade1
            if clade2 in clade1_ancestors:
                well_supported = False
                for intermediary in dag.vs[clade2].get_shortest_paths(dag.vs[clade1])[0][:-1]:
                    alrt   = dag.vs[intermediary].in_edges()[0]['alrt'  ]
                    ufboot = dag.vs[intermediary].in_edges()[0]['ufboot']

                    if alrt >= 80 and ufboot >= 95:
                        well_supported = True
                        break

                if not well_supported:
                    continue
                descriptions[f'from {related_phyla[1]} to {related_phyla[0]}'].add(candidate)
                phyla_relationship_flag = 1
                continue

            #
            # if clade1 within clade2 ancestors it means that clade2 is nested within clade1
            #   evidence of transfer from clade1 -> clade2
            clade2_ancestors = udag.vs[clade2].get_shortest_paths(udag.vs[0])[0][1:]
            if clade1 in clade2_ancestors:
                well_supported = False
                for intermediary in dag.vs[clade1].get_shortest_paths(dag.vs[clade2])[0][:-1]:
                    alrt   = dag.vs[intermediary].in_edges()[0]['alrt'  ]
                    ufboot = dag.vs[intermediary].in_edges()[0]['ufboot']

                    if alrt >= 80 and ufboot >= 95:
                        well_supported = True
                        break

                if not well_supported:
                    continue
                descriptions[f'from {related_phyla[0]} to {related_phyla[1]}'].add(candidate)
                phyla_relationship_flag = 1
                continue

            #
            # if clade1 and clade2 are have the same parent noe it means they are sisters
            #   evidence of hgt, but information about directionality
            if clade1_ancestors[0] == clade2_ancestors[0]:
#                 if dag.vs[clade1].predecessors()[0].index:
#                     support = dag.vs[clade1].predecessors()[0].in_edges()[0]['support']
#                     print('%i and %i are sisters              (%s)' % (related_phyla[0], related_phyla[1], candidate), support)
                descriptions['sisters'].add(candidate)
#                 else:
#                     print('%i and %i are sisters              (%s)' % (related_phyla[0], related_phyla[1], candidate), 'root')
                phyla_relationship_flag = 1
                continue

    #     with cd('candidates/'):
    #         with open('%s-cluster%s.figTree' % (group_id, cluster_num), 'w') as out:
    #             out.write(visualize_reconstruct_candidate(tmp_tree))

        #
        # if the flag still is False, we couldn't identify an relationship between phyla
        if not phyla_relationship_flag:
            descriptions['no clear relationship between phyla'].add(candidate)

    return(descriptions)

In [None]:
            well_supported = False
            for intermediary in dag.vs[clade1].get_shortest_paths(dag.vs[clade2])[0][:-1]:
                alrt   = dag.vs[intermediary].in_edges()[0]['alrt'  ]
                ufboot = dag.vs[intermediary].in_edges()[0]['ufboot']

                if alrt >= 80 and ufboot >= 95:
                    well_supported = True
                    break

            if not well_supported:
                continue


In [130]:
# descriptions = []
# for phylum_pair in itertools.combinations(sampled_phyla, 2):
#     if 1224 in phylum_pair:
#         continue
    
#     print(phylum_pair)
#     descriptions[phylum_pair] = assess_transfers_between_phyla(phylum_pair)
    
descriptions = assess_transfers_between_phyla((200795, 1117))

COG2452#1
COG3046#1
COG2896#1
COG4665#4
COG1075#1
COG4663#1
COG0612#4
COG0637#4
2Z87P#0
COG0857#0
COG0720#3
COG1666#2
COG2324#4
COG0115#2
COG1770#1
COG0345#1
COG3281#3
COG0001#4
COG0554#3
COG4118#7
COG3259#2
COG1300#1
COG1704#1
COG1186#2
COG1801#4
COG2981#4
COG3544#2
COG0275#0
COG4467#0
COG0725#2
COG3429#0
COG2848#0
COG2343#0
2Z8NV#0
COG4978#0
COG5493#2
COG1517#2
COG0783#3
COG0499#0
COG1894#0
COG0310#4
COG1086#8
COG0354#0
COG0258#0
COG1528#1
COG1785#0
COG1350#1
COG1351#7
COG0828#0
COG5000#0
COG0167#0
COG1941#3
COG3379#4
COG1905#1
COG0310#1
COG0049#2
COG1304#5
COG0769#4
COG1353#5
COG1336#1
COG1177#0
COG1554#0
COG5635#3
COG2326#1
COG5207#0
COG1314#5
COG0461#2


In [124]:
for key, value in descriptions.items():
    print(key, len(value))

from 200795 to 1117 0
from 1117 to 200795 0
sisters 0
no clear relationship between phyla 0


In [55]:
def tree_to_dag_bb_and_alrt(tree):
    for count, node in enumerate(tree.traverse()):
        node.add_feature('alrt', 0.0)
        node.add_feature('ufboot', 0.0)

        if not node.is_leaf():
            support_regex = re.match('^(\d+(?:\.\d+)?)\/(\d+(?:\.\d+)?)$', node.name)
            if support_regex:
                node.alrt = float(support_regex.group(1))
                node.ufboot = float(support_regex.group(2))
            node.name     = 'node_%i' % count

    edges = []
    for node in tree.traverse():
        if not node.is_leaf():
            for child in node.get_children():
                edges.append((node.name,
                              child.name,
                              child.dist,
                              child.alrt,
                              child.ufboot))

    dag  = ig.Graph.TupleList(edges     =tuple(edges), 
                              directed  =True,
                              edge_attrs=['weight', 'alrt', 'ufboot']
                             )
    dag.vs['is_leaf'] = [False if name.startswith('node_') else True
                         for name in dag.vs['name']]
    return(dag)

In [114]:
related_phyla = (200795, 1117)
descriptions  = {f'from {related_phyla[0]} to {related_phyla[1]}': set(),
                 f'from {related_phyla[1]} to {related_phyla[0]}': set(),
                  'sisters':                                       set(),
                  'no clear relationship between phyla':           set()}
candidates    = set(significant_df.query('source=="%i" and target=="%i"' % related_phyla).group.values).union(
                    significant_df.query('target=="%i" and source=="%i"' % related_phyla).group.values
                )

for candidate in candidates:

    group_id, cluster_num = candidate.split('#')

    if os.path.isfile('candidates/iqtree/%s-cluster%s.treefile.rooted' % (group_id, 
                                                                          cluster_num)):
        with cd('candidates/iqtree/'):
            #
            # we need the original tree cause mad removes support values, so we just transfer root positions
            try:
                tmp_tree   = match_rooting(ete3.Tree('%s-cluster%s.treefile.rooted' % (group_id, 
                                                                                       cluster_num),
                                                     format=1),
                                           ete3.Tree('%s-cluster%s.treefile'        % (group_id, 
                                                                                       cluster_num),
                                                     format=1))
            except ete3.parser.newick.NewickError:
                continue
    if os.path.isfile('candidates/missing_iqtree/%s-cluster%s.treefile.rooted' % (group_id, 
                                                                                  cluster_num)):
        with cd('candidates/missing_iqtree/'):
            #
            # we need the original tree cause mad removes support values, so we just transfer root positions
            try:
                tmp_tree   = match_rooting(ete3.Tree('%s-cluster%s.treefile.rooted' % (group_id, 
                                                                                       cluster_num),
                                                     format=1),
                                           ete3.Tree('%s-cluster%s.treefile'        % (group_id, 
                                                                                       cluster_num),
                                                     format=1))
            except ete3.parser.newick.NewickError:
                continue

    #
    # extract taxIDs to subsample taxonomy tables
    taxids = {}
    for leaf in tmp_tree.get_leaf_names():
        #
        # leaf names are composed by <taxid>.<locus_tag>
        #   ps: locus_tag may also have "." within it
        tmp_taxid = int(leaf.split('.')[0])

        if tmp_taxid not in taxids:
            taxids[tmp_taxid] = []
        taxids[tmp_taxid].append(leaf)

    #
    # select taxIDs from each assessed phylum...
    phylum1_taxonomy = eggNOG_taxonomy.loc[taxids].query('phylum==@related_phyla[0]')
    phylum2_taxonomy = eggNOG_taxonomy.loc[taxids].query('phylum==@related_phyla[1]')

    #
    # ... and their respect leaves
    phylum1_leaves = set()
    for taxid in phylum1_taxonomy.index:
        phylum1_leaves.update(taxids[taxid])

    phylum2_leaves = set()
    for taxid in phylum2_taxonomy.index:
        phylum2_leaves.update(taxids[taxid])

    all_leaves = phylum1_leaves.union(phylum2_leaves)

    #
    # as tree traversing through ete3 isn't very efficient, and doesn't scalate very well,
    #   we create an iGraph dag for more effient traversing
    dag  = tree_to_dag_bb_and_alrt(tmp_tree)          # we need a directed version...
    udag = dag.as_undirected(mode='each') # ... and an undirected one for different processes

    #
    # placeholders where we will add monophyletic clades for each phylum
    phylum1_clades = set() 
    phylum2_clades = set()

    ignored_nodes  = [] # descendants of monophyletic nodes should be ignored once their
                        #   ancestors have been stored.

    #
    # traverse through internal nodes
    for node in dag.vs:
        if node.index in ignored_nodes:
            continue

        node_leaves = get_leaf_names(node)

        if all_leaves.isdisjoint(node_leaves):
            ignored_nodes.extend(get_descendant_indices(node, leaves=True))

        #
        # if there aren't leaves from other phyla within this node it is monophyletic
        if phylum1_leaves.intersection(node_leaves) and phylum1_leaves.issuperset(node_leaves):
            phylum1_clades.add(node.index)
            ignored_nodes.extend(get_descendant_indices(node, leaves=True))

        elif phylum2_leaves.intersection(node_leaves) and phylum2_leaves.issuperset(node_leaves):
            phylum2_clades.add(node.index)
            ignored_nodes.extend(get_descendant_indices(node, leaves=True))

    #
    # now we add some flexibility to the monophyly of nodes for three reasons:
    #   1) donor nodes within gene trees will never be monophyletic since the recipient
    #      must be nested within it.
    #   2) if there are other transfers from the donor and/or recipient phyla to a 3rd one
    #      we still want to capture it.
    #   3) good ol' phylogenetic uncertainty
    phylum1_clades = merge_polyphyletic_clades(phylum1_clades, udag)
    phylum2_clades = merge_polyphyletic_clades(phylum2_clades, udag)

    #
    # flag if we can identify relations between phyla:
    #   1 nested within 2
    #   2 nested within 1
    #   1 and 2 are sisters
    #
    # if no relation is identified, leave as false
    phyla_relationship_flag = 0

    for clade1, clade2 in itertools.product(phylum1_clades, phylum2_clades):

        #
        # capture ancestors of clade1 by querying nodes between itself and the root node
        clade1_ancestors = udag.vs[clade1].get_shortest_paths(udag.vs[0])[0][1:]
        #
        # if clade2 within clade1 ancestors it means that clade1 is nested within clade2
        #   evidence of transfer from clade2 -> clade1
        if clade2 in clade1_ancestors:
            
            well_supported = False
            for intermediary in dag.vs[clade2].get_shortest_paths(dag.vs[clade1])[0][:-1]:
                alrt   = dag.vs[intermediary].in_edges()[0]['alrt'  ]
                ufboot = dag.vs[intermediary].in_edges()[0]['ufboot']

                if alrt >= 80 and ufboot >= 95:
                    well_supported = True
                    break

            if not well_supported:
                continue

            descriptions[f'from {related_phyla[1]} to {related_phyla[0]}'].add(candidate)
            phyla_relationship_flag = 1
            break

        #
        # if clade1 within clade2 ancestors it means that clade2 is nested within clade1
        #   evidence of transfer from clade1 -> clade2
        clade2_ancestors = udag.vs[clade2].get_shortest_paths(udag.vs[0])[0][1:]
        if clade1 in clade2_ancestors:
            
            well_supported = False
            for intermediary in dag.vs[clade1].get_shortest_paths(dag.vs[clade2])[0][:-1]:
                alrt   = dag.vs[intermediary].in_edges()[0]['alrt'  ]
                ufboot = dag.vs[intermediary].in_edges()[0]['ufboot']

                if alrt >= 80 and ufboot >= 95:
                    well_supported = True
                    break

            if not well_supported:
                continue

            descriptions[f'from {related_phyla[0]} to {related_phyla[1]}'].add(candidate)
            phyla_relationship_flag = 1
            break

        #
        # if clade1 and clade2 are have the same parent noe it means they are sisters
        #   evidence of hgt, but information about directionality
        if clade1_ancestors[0] == clade2_ancestors[0]:
#                 if dag.vs[clade1].predecessors()[0].index:
#                     support = dag.vs[clade1].predecessors()[0].in_edges()[0]['support']
#                     print('%i and %i are sisters              (%s)' % (related_phyla[0], related_phyla[1], candidate), support)
            descriptions['sisters'].add(candidate)
#                 else:
#                     print('%i and %i are sisters              (%s)' % (related_phyla[0], related_phyla[1], candidate), 'root')
            phyla_relationship_flag = 1
            continue

In [115]:
descriptions

{'from 200795 to 1117': {'COG0049#2',
  'COG0345#1',
  'COG0354#0',
  'COG0554#3',
  'COG0637#4',
  'COG1177#0',
  'COG1304#5',
  'COG1351#7',
  'COG1554#0',
  'COG2452#1',
  'COG4978#0'},
 'from 1117 to 200795': {'COG0115#2',
  'COG1941#3',
  'COG2343#0',
  'COG2981#4',
  'COG4467#0'},
 'sisters': {'COG0167#0',
  'COG0310#4',
  'COG1894#0',
  'COG4978#0',
  'COG5000#0',
  'COG5493#2'},
 'no clear relationship between phyla': set()}

In [100]:
dag.vs[129].get_shortest_paths(dag.vs[355])[0][:-1]

[129, 161, 197, 245, 301]

In [106]:
well_supported = False
for intermediary in dag.vs[129].get_shortest_paths(dag.vs[355])[0][:-1]:
    alrt   = dag.vs[intermediary].in_edges()[0]['alrt'  ]
    ufboot = dag.vs[intermediary].in_edges()[0]['ufboot']
    
    if alrt >= 80 and ufboot >= 95:
        well_supported = True
        break

print(alrt, ufboot)

97.3 84.0


In [107]:
well_supported

False

In [74]:
pwd

'/nobackup1b/users/thiberio/eggNOG'

In [80]:
tmp_tree.search_nodes(name='node_28')

[]

In [91]:
dag.vs[355].in_edges()[0]['ufboot'  ]

100.0

In [77]:
with open('candidates/COG0499#0.iqtree.figTree', 'w') as out:
    out.write(visualize_reconstructed_candidate(deepcopy(tmp_tree)))

In [72]:
def visualize_reconstructed_candidate(tree):

    
    out = "#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" %len(tree)

    count               = 0
    internal_node_names = {}
    for node in tree.traverse():
        if node.is_leaf():
            taxid, locus_tag = node.name.split('.')
            try:
                lineage = {j: i for i, j in ncbi.get_rank(ncbi.get_lineage(taxid)).items()}
            except ValueError:
                out += '\t%s\n' %(node.name)
                continue
            else:
                lineage_names = ncbi.get_taxid_translator(lineage.values())

            out += '\t%s ' %(node.name)
            comment = []
            for rank in ['class', 'phylum', 'order', 'family', 'species']:
                if rank in lineage:
                    comment.append('tax_%s="%s"' %(rank, lineage_names[lineage[rank]]))
            if 'tax_phylum="Cyanobacteria"' in comment:
                comment.append('!color=#00ff00')
            elif 'tax_phylum="Chloroflexi"' in comment:
                comment.append('!color=#ff0000')
                            
            out += '[&%s]\n' %' '.join(comment)

        else:
            internal_node_names['node_%i_' % count] = '[&node_name=%s,ufboot=%.2f, alrt=%.2f]' % (deepcopy(node.name), node.ufboot, node.alrt)
            node.name = 'node_%i_' % count
            count += 1

    newick_text = tree.write(format=1)
    for tmp_name, full_name in internal_node_names.items():
        newick_text = newick_text.replace(tmp_name, full_name)
        
    out += ';\nend;\n'
    out += 'begin trees;\n\ttree tree_1 = [&R] %s\nend;' %newick_text

    return(out)