In [6]:
import itertools
import multiprocessing
import os
import pickle as pkl
import random
import re
import subprocess
from collections import Counter

import chart_studio.plotly as ptl
import colorlover as cl
import community
import ete3
import igraph as ig
import networkx as nx
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
import seaborn as sns
from IPython.display import HTML
from matplotlib import pyplot as plt
from scipy.spatial.distance import squareform, pdist
from scipy.stats import mannwhitneyu
from sklearn import manifold
from sklearn import mixture

# Chart Studio credentials are read from the environment so the API key is
# never stored in the notebook. The key that used to be hard-coded here has
# been exposed in the notebook history and should be rotated.
# Raises KeyError if either variable is not set.
ptl.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])

# Handle to the local NCBI taxonomy database, used for every lineage lookup below.
ncbi = ete3.NCBITaxa()

%cd /work/eggNOG/

/work/eggNOG


In [3]:
# Tab-separated table of the sampled genomes; its first column is used as the row index.
sampled_genomes = pd.read_csv('/work/kelsey/genomes.tab', sep='\t', index_col=0)

In [4]:
# One row per sampled species, one column per taxonomic rank, holding the
# taxid found at that rank in the species' NCBI lineage.
# Rows are collected in a list and the frame is built once: growing a
# DataFrame with `.append` inside a loop is quadratic and `DataFrame.append`
# no longer exists in pandas >= 2.0.
lineage_records = []
for taxid in sampled_genomes.species_taxid.unique():
    if pd.isna(taxid):
        continue
    lineage_records.append({tax_rank: tmp_taxid
                            for tmp_taxid, tax_rank in ncbi.get_rank(ncbi.get_lineage(taxid)).items()})

lineages = pd.DataFrame(lineage_records)
lineages = lineages.reindex(columns=['class', 'family',  'genus', 'phylum',
                                     'order', 'species', 'superkingdom']).copy()
# Keep only lineages whose superkingdom taxid is 2.
lineages = lineages.query('superkingdom == 2').copy()

In [7]:
# Taxa covered by the eggNOG table. Lines starting with '#' are skipped as
# comments, so the column labels are supplied explicitly; 'Taxid' is the index.
eggNOG_sample = pd.read_csv(
    'e5.bacteria.taxid_info.tsv',
    sep='\t',
    comment='#',
    header=None,
    names=['Taxid', 'Sci.Name', 'Rank', 'Named Lineage', 'Taxid Lineage'],
    index_col=0,
)

In [8]:
# Rank -> taxid table for every eggNOG taxon, indexed by the taxon's own taxid.
# Rows are collected first and the frame is built once: `DataFrame.append` in
# a loop is quadratic and was removed in pandas 2.0.
lineage_taxids  = []
lineage_records = []
for taxid in eggNOG_sample.index.unique():
    if pd.isna(taxid):
        continue
    lineage_records.append({tax_rank: tmp_taxid
                            for tmp_taxid, tax_rank in ncbi.get_rank(ncbi.get_lineage(taxid)).items()})
    lineage_taxids.append(taxid)

eggNOG_lineage = pd.DataFrame(lineage_records, index=lineage_taxids)
eggNOG_lineage = eggNOG_lineage.reindex(columns=['class', 'family',  'genus', 'phylum',
                                                 'order', 'species', 'superkingdom']).copy()
# Keep only taxa whose superkingdom taxid is 2.
eggNOG_lineage = eggNOG_lineage.query('superkingdom == 2').copy()

# eggNOG taxa belonging to a phylum that is also present among the sampled genomes.
eggNOG_target_phyla = eggNOG_lineage[eggNOG_lineage.phylum.isin(lineages.phylum.unique())]


taxid 1344012 was translated into 480813


taxid 443255 was translated into 1901


taxid 1525715 was translated into 1545044


taxid 861530 was translated into 29382


taxid 1317118 was translated into 1379903


taxid 67281 was translated into 67351


taxid 1353531 was translated into 1708715


taxid 1288963 was translated into 1232681


taxid 1345697 was translated into 1921421


taxid 1552758 was translated into 1885902


taxid 469595 was translated into 1639133


taxid 469596 was translated into 100884


taxid 1434929 was translated into 1820025


taxid 1104325 was translated into 1158600


taxid 911239 was translated into 122355


taxid 265729 was translated into 246786


taxid 1122931 was translated into 1203610


taxid 1118055 was translated into 33037


taxid 1219084 was translated into 1123384


taxid 667632 was translated into 863227


taxid 1166016 was translated into 1905730


taxid 1408427 was translated into 1094555


taxid 520709 was translated into 1530123


taxid 13362

In [9]:
eggNOG_groups = pd.read_csv('2_members.tsv',
                            sep='\t',
                            header=None,
                            usecols=[1,2,3,4],
                            names=['group_id', 'num_proteins', 'num_taxa', 'members'])

# Each member is matched as '<digits>.<anything up to the next comma>'; the
# leading digits are kept as the member's taxid.
# The pattern is a raw string: '\d' and '\.' in a plain string literal are
# invalid escape sequences and trigger a warning on recent Python versions.
member_taxid_pattern = re.compile(r'(\d+)\.(?:[^,]+)')

eggNOG_groups = eggNOG_groups.assign(
    taxa=eggNOG_groups.members.map(
        lambda cell: [int(taxid) for taxid in member_taxid_pattern.findall(cell)]
    )
)

In [10]:
def get_phyla_overlap(taxa):
    """Return the phyla shared by a group's taxa and the sampled genomes.

    Parameters
    ----------
    taxa : list-like of taxids used as row labels of ``eggNOG_lineage``.

    Returns
    -------
    set
        Phylum taxids present both among ``taxa`` and in ``lineages.phylum``.
    """
    phyla_in_group = set(eggNOG_lineage.loc[taxa, 'phylum'].unique())
    return phyla_in_group.intersection(lineages.phylum.unique())

# Keep the groups whose members cover more than one of the sampled phyla.
spans_several_phyla  = eggNOG_groups.taxa.map(lambda cell: len(get_phyla_overlap(cell)) > 1)
eggNOG_target_groups = eggNOG_groups[spans_several_phyla]

In [11]:
# Newick trees indexed by group id, restricted (and ordered) to the target groups.
eggNOG_trees = pd.read_csv(
    '2_trees.tsv',
    sep='\t',
    header=None,
    usecols=[1,2,3],
    names=['group_id', 'fast', 'tree'],
    index_col=0,
).reindex(index=eggNOG_target_groups.group_id)

In [12]:
# Taxids of the eggNOG taxa assigned to phylum 200795 and to phylum 1117.
eggNOG_chloroflexi = eggNOG_lineage.query('phylum==200795').index
eggNOG_cyano       = eggNOG_lineage.query('phylum==1117'  ).index

# Number of distinct taxa from each of the two phyla found in every target group.
chloroflexi_count = eggNOG_target_groups.taxa.map(
    lambda group_taxa: len(eggNOG_chloroflexi.intersection(set(group_taxa)))
)
cyano_count       = eggNOG_target_groups.taxa.map(
    lambda group_taxa: len(eggNOG_cyano.intersection(set(group_taxa)))
)

In [13]:
def get_pairwise_distances(group_id):
    """Return the leaf-to-leaf path lengths of a group's tree.

    The tree stored for ``group_id`` in ``eggNOG_trees`` is turned into an
    undirected graph whose edges are the tree branches weighted by branch
    length; the distance between two leaves is the weighted shortest path
    between them.

    Parameters
    ----------
    group_id : label of a row of ``eggNOG_trees``.

    Returns
    -------
    pandas.DataFrame
        Square matrix indexed (rows and columns) by leaf name, in the order
        leaves are met by ``tree.traverse()``.
    """
    tree = ete3.Tree(eggNOG_trees.loc[group_id, 'tree'])

    # First pass: collect leaf names and give each internal node a unique
    # name, so every branch can be described as a (parent, child) pair.
    leaf_names = []
    for position, node in enumerate(tree.traverse()):
        if node.is_leaf():
            leaf_names.append(node.name)
            continue
        node.name = 'node_%i' % position
    leaf_names = np.array(leaf_names)

    # Second pass (after all internal nodes are named): one weighted edge per branch.
    branches = [(parent.name, child.name, child.dist)
                for parent in tree.traverse() if not parent.is_leaf()
                for child in parent.get_children()]

    branch_graph = ig.Graph.TupleList(edges=branches,
                                      directed=False,
                                      weights=True)

    path_lengths = np.array(branch_graph.shortest_paths(source=leaf_names,
                                                        target=leaf_names,
                                                        weights='weight'))
    return pd.DataFrame(index=leaf_names, columns=leaf_names, data=path_lengths)

In [14]:
def create_taxa_graph(dist_matrix, phyla):
    """Build the pairwise edge table and the similarity graph of a distance matrix.

    Parameters
    ----------
    dist_matrix : square DataFrame of distances, indexed by sequence name.
    phyla : array with one phylum value per row of ``dist_matrix``, same order.

    Returns
    -------
    (edge_list, graph)
        ``edge_list`` has one row per unordered pair of sequences (upper
        triangle, diagonal excluded) with the phyla, the names, the distance
        and ``inverse_dist = e ** -distance``; ``graph`` is the undirected
        igraph graph over those pairs, weighted by ``inverse_dist``.
    """
    first_pos, second_pos = np.triu_indices_from(dist_matrix, k=1)

    edge_list = pd.DataFrame({
        'phylum1':   phyla[first_pos],
        'phylum2':   phyla[second_pos],
        'sequence1': dist_matrix.index[first_pos],
        'sequence2': dist_matrix.index[second_pos],
        'distance':  dist_matrix.values[first_pos, second_pos],
    })
    # Larger weight for closer sequences.
    edge_list['inverse_dist'] = np.e ** -edge_list.distance

    weighted_pairs = edge_list[['sequence1', 'sequence2', 'inverse_dist']].itertuples(index=False)
    graph = ig.Graph.TupleList(edges=weighted_pairs, directed=False, weights=True)

    return edge_list, graph

In [15]:
# def assess_cluster(reference_phylum, minimal_freq_phyla, cluster_edges):
#     cluster_dists = pd.DataFrame(columns=['phylum', 'median', 'distances'])

#     for phylum1, phylum2 in itertools.combinations(minimal_freq_phyla, 2):
#         if   phylum1 == reference_phylum:
#             phylum = phylum2
#         elif phylum2 == reference_phylum:
#             phylum = phylum1
#         else:
#             continue

#         inter_phyla_dists = cluster_edges.loc[((cluster_edges.phylum1==phylum1)&(cluster_edges.phylum2==phylum2))|\
#                                               ((cluster_edges.phylum2==phylum1)&(cluster_edges.phylum1==phylum2)), 
#                                               'distance'].values

#         try:
#             phylum_3rd_quartile = np.median(inter_phyla_dists)
#         except IndexError:
#             continue        

#         cluster_dists = cluster_dists.append(pd.Series(data=[phylum, phylum_3rd_quartile, inter_phyla_dists], 
#                                            index=['phylum', 'median', 'distances']),
#                                  ignore_index=True)

#     return(cluster_dists)

In [16]:
def assess_cluster(reference_phylum, minimal_freq_phyla, cluster_edges, cluster_nodes):
    """Measure how far each phylum of a cluster is from the reference phylum.

    For every phylum in ``minimal_freq_phyla`` other than ``reference_phylum``,
    the (up to) five sequences of that phylum with the smallest summed
    distance to the reference-phylum sequences are selected, and the
    distances between them and all reference-phylum sequences are collected.

    Parameters
    ----------
    reference_phylum : phylum value every other phylum is compared against.
    minimal_freq_phyla : iterable of phylum values to consider.
    cluster_edges : DataFrame with columns ``phylum1``, ``phylum2``,
        ``sequence1``, ``sequence2`` and ``distance`` (one row per pair).
    cluster_nodes : DataFrame with columns ``name`` and ``phylum``.

    Returns
    -------
    pandas.DataFrame
        Columns ``phylum``, ``median`` (median of the collected distances) and
        ``distances`` (the collected distances as a 1-D array); one row per
        compared phylum, empty if nothing could be compared.
    """
    columns         = ['phylum', 'median', 'distances']
    # The reference sequences do not depend on the phylum being compared.
    # (This used to be hard-coded to phylum 1117, ignoring `reference_phylum`.)
    reference_names = cluster_nodes.loc[cluster_nodes.phylum == reference_phylum, 'name']
    records         = []

    for phylum1, phylum2 in itertools.combinations(minimal_freq_phyla, 2):
        if   phylum1 == reference_phylum:
            phylum = phylum2
        elif phylum2 == reference_phylum:
            phylum = phylum1
        else:
            continue

        #
        # source: https://stackoverflow.com/questions/53246086/convert-list-of-edges-to-adjacency-matrix
        #
        inter_phyla = cluster_edges.loc[((cluster_edges.phylum1==phylum1)&(cluster_edges.phylum2==phylum2))|
                                        ((cluster_edges.phylum2==phylum1)&(cluster_edges.phylum1==phylum2))]
        seq_names   = np.unique(inter_phyla[['sequence1', 'sequence2']].to_numpy())
        position    = pd.Index(seq_names).get_indexer
        rows        = position(inter_phyla.sequence1)
        cols        = position(inter_phyla.sequence2)
        pair_dists  = inter_phyla.distance.to_numpy()

        # Fill a plain array and wrap it afterwards: writing through
        # `DataFrame.values` only works while pandas hands out a view.
        matrix             = np.zeros((len(seq_names), len(seq_names)))
        matrix[rows, cols] = pair_dists
        matrix[cols, rows] += pair_dists
        adjacencies        = pd.DataFrame(matrix, index=seq_names, columns=seq_names)

        phylum_names      = cluster_nodes.loc[cluster_nodes.phylum == phylum, 'name']
        # Sequences of `phylum` ranked by their summed distance to the reference sequences.
        closest_to_phylum = adjacencies.loc[reference_names, phylum_names].sum().sort_values().index[:5]

        distances_to_reference_phylum = adjacencies.loc[reference_names, closest_to_phylum].to_numpy().flatten()
        if not distances_to_reference_phylum.size:
            # Nothing to compare (no reference or no phylum sequences).
            continue

        records.append({'phylum':    phylum,
                        'median':    np.median(distances_to_reference_phylum),
                        'distances': distances_to_reference_phylum})

    # Built once from the collected rows; `DataFrame.append` was removed in pandas 2.0.
    return pd.DataFrame(records, columns=columns)

In [17]:
def get_phyla_evol_distances(group_id):
    """Rank the phyla of each cluster of a group by their distance to phylum 1117.

    The sequences of the group's tree are clustered (multilevel community
    detection on the ``e ** -distance`` graph). Clusters holding less than
    30% of the group's phylum-1117 sequences are skipped. In the remaining
    clusters, phyla with at least five sequences are compared with phylum
    1117 through ``assess_cluster``; medians are divided by the median
    distance among the retained sequences of the cluster.

    Parameters
    ----------
    group_id : label of a row of ``eggNOG_trees``.

    Returns
    -------
    (group_id, dict)
        The dict maps a cluster number to ``{'df': DataFrame[phylum, median],
        'significant': bool}``. ``significant`` is True when a single phylum
        was compared, or when the closest phylum's distances are smaller than
        the second closest's (one-sided Mann-Whitney U, p < 0.01, and
        U / (n1 * n2) < 0.2).
    """
    dist_matrix = get_pairwise_distances(group_id)

    # Leaf names start with '<taxid>.'.
    leaf_taxids = [int(leaf.split('.')[0]) for leaf in dist_matrix.index]
    phyla       = eggNOG_lineage.loc[leaf_taxids, 'phylum'].values.astype(int)

    edge_list, graph = create_taxa_graph(dist_matrix, phyla)

    random.seed(12345)
    clusters = graph.community_multilevel(weights='weight')

    node_data = pd.DataFrame(columns=['name', 'phylum', 'cluster'],
                             data   =zip(dist_matrix.index, phyla, clusters.membership))

    family_cyano_count     = (node_data.phylum == 1117).sum()
    cluster_evol_relations = {}

    for cluster_num in set(clusters.membership):
        cluster_nodes = node_data[node_data.cluster == cluster_num]

        # Skip clusters holding too small a share of the phylum-1117 sequences.
        if (cluster_nodes.phylum == 1117).sum() < family_cyano_count*0.3:
            continue

        both_in_cluster = edge_list.sequence1.isin(cluster_nodes.name) & \
                          edge_list.sequence2.isin(cluster_nodes.name)
        cluster_edges   = edge_list.loc[both_in_cluster,
                                        ['phylum1', 'phylum2', 'sequence1', 'sequence2', 'distance']]

        # `phylum > 0` discards the non-positive values left by `astype(int)`.
        minimal_freq_phyla = [phylum
                              for phylum, frequency in Counter(cluster_nodes.phylum).items()
                              if frequency >= 5 and phylum > 0]

        both_frequent = cluster_edges.phylum1.isin(minimal_freq_phyla) & \
                        cluster_edges.phylum2.isin(minimal_freq_phyla)
        cluster_edges = cluster_edges[both_frequent]
        # The normaliser is taken before within-phylum pairs are dropped.
        normalizer    = np.median(cluster_edges.distance)
        cluster_edges = cluster_edges[cluster_edges.phylum1 != cluster_edges.phylum2]

        cluster_dists = assess_cluster(1117,
                                       minimal_freq_phyla,
                                       cluster_edges,
                                       cluster_nodes)
        cluster_dists = cluster_dists.sort_values('median')

        relation = {'df':          cluster_dists[['phylum', 'median']].copy(),
                    'significant': False}
        cluster_evol_relations[cluster_num] = relation

        compared_phyla = cluster_dists.shape[0]
        if not compared_phyla:
            continue

        relation['df']['median'] /= normalizer
        if compared_phyla == 1:
            relation['significant'] = True
            continue

        # Closest phylum versus second closest phylum.
        closest_dists = cluster_dists.iloc[0, 2]
        second_dists  = cluster_dists.iloc[1, 2]
        hypothesis    = mannwhitneyu(closest_dists,
                                     second_dists,
                                     alternative='less')
        effect_size   = hypothesis.statistic / (len(closest_dists)*len(second_dists))

        if hypothesis.pvalue < 0.01 and effect_size < 0.2:
            relation['significant'] = True

    return group_id, cluster_evol_relations

In [18]:
# NOTE: this cell re-defines `create_taxa_graph`; an identical definition
# appears in an earlier cell and is shadowed by this one. One of the two can
# be removed.
def create_taxa_graph(dist_matrix, phyla):
    """Build the pairwise edge table and the similarity graph of a distance matrix.

    Parameters
    ----------
    dist_matrix : square DataFrame of distances, indexed by sequence name.
    phyla : array with one phylum value per row of ``dist_matrix``, same order.

    Returns
    -------
    (edge_list, graph)
        ``edge_list`` has one row per unordered pair of sequences (upper
        triangle, diagonal excluded) with the phyla, the names, the distance
        and ``inverse_dist = e ** -distance``; ``graph`` is the undirected
        igraph graph over those pairs, weighted by ``inverse_dist``.
    """
    first_pos, second_pos = np.triu_indices_from(dist_matrix, k=1)

    edge_list = pd.DataFrame({
        'phylum1':   phyla[first_pos],
        'phylum2':   phyla[second_pos],
        'sequence1': dist_matrix.index[first_pos],
        'sequence2': dist_matrix.index[second_pos],
        'distance':  dist_matrix.values[first_pos, second_pos],
    })
    # Larger weight for closer sequences.
    edge_list['inverse_dist'] = np.e ** -edge_list.distance

    weighted_pairs = edge_list[['sequence1', 'sequence2', 'inverse_dist']].itertuples(index=False)
    graph = ig.Graph.TupleList(edges=weighted_pairs, directed=False, weights=True)

    return edge_list, graph

In [28]:
def visualize_in_figTree(group_id):
    """Write the tree of ``group_id`` as an annotated NEXUS file, ``<group_id>.figTree``.

    Every leaf is listed in the ``taxlabels`` block; when its taxid can be
    resolved in the NCBI taxonomy, the leaf gets a ``[&tax_<rank>="<name>" ...]``
    comment for the class, phylum, order, family and species ranks found in
    its lineage.

    Parameters
    ----------
    group_id : label of a row of ``eggNOG_trees``.
    """
    tree = ete3.Tree(eggNOG_trees.loc[group_id, 'tree'], format=0)

    # The context manager closes the file even if a lookup below raises.
    with open('%s.figTree' % group_id, 'w') as out:
        out.write("#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" %len(tree))

        for node in tree.traverse():
            if not node.is_leaf():
                continue

            # Only the part before the first '.' is the taxid; the rest of the
            # name may itself contain dots, so it must not be unpacked.
            taxid = node.name.split('.', 1)[0]
            try:
                lineage = {j: i for i, j in ncbi.get_rank(ncbi.get_lineage(taxid)).items()}
            except ValueError:
                # Taxid not resolved: list the leaf without annotation.
                out.write('\t%s\n' %(node.name))
                continue
            lineage_names = ncbi.get_taxid_translator(lineage.values())

            comment = ['tax_%s="%s"' %(rank, lineage_names[lineage[rank]])
                       for rank in ['class', 'phylum', 'order', 'family', 'species']
                       if rank in lineage]
            out.write('\t%s [&%s]\n' %(node.name, ' '.join(comment)))

        newick_text = tree.write(format=0)
        out.write(';\nend;\n')
        out.write('begin trees;\n\ttree tree_1 = [&R] %s\nend;' %newick_text)

In [19]:
def extract_cluster(clusterID):
    """Write the sequences of one cluster to ``alignment/<group>-cluster<n>.faa``.

    The group's tree is clustered exactly as in ``get_phyla_evol_distances``
    (same seed), and the FASTA entries of the group's alignment whose
    identifier belongs to the requested cluster are copied to the output
    file. The alignment is downloaded to ``alignment/<group>`` when absent.

    Parameters
    ----------
    clusterID : str, ``'<group_id>#<cluster number>'``.

    Raises
    ------
    subprocess.CalledProcessError
        If the download or the decompression of the alignment fails.
    """
    group_id, cluster_num = clusterID.split('#')
    dist_matrix = get_pairwise_distances(group_id)

    taxids = [int(leaf.split('.')[0]) for leaf in dist_matrix.index]
    phyla  = eggNOG_lineage.loc[taxids, 'phylum'].values.astype(int)

    edge_list, graph  = create_taxa_graph(dist_matrix, phyla)

    random.seed(12345)
    clusters = graph.community_multilevel(weights='weight')

    node_data = pd.DataFrame(columns=['name', 'phylum', 'cluster'],
                             data   =zip(dist_matrix.index,
                                         phyla,
                                         clusters.membership)
                            )

    # A set gives constant-time membership tests while scanning the FASTA file.
    cluster_seqs = set(node_data.loc[node_data.cluster==int(cluster_num), 'name'])

    alignment_path = 'alignment/%s' % group_id
    if not os.path.isfile(alignment_path):
        os.makedirs('alignment', exist_ok=True)
        # check_call: a failed download or decompression stops here instead of
        # surfacing later as a missing or corrupt alignment file.
        subprocess.check_call(['curl',
                               'http://eggnogapi5.embl.de/nog_data/text/raw_alg/%s' % group_id,
                               '--output', '%s.gz' % alignment_path])
        subprocess.check_call(['gzip', '-d', '%s.gz' % alignment_path])

    with open(alignment_path) as fasta_handle,\
         open('alignment/%s-cluster%s.faa' % (group_id, cluster_num), 'w') as out:
        for entry in fasta_handle.read().split('>'):
            # The first whitespace-delimited token of an entry is its identifier;
            # empty or whitespace-only chunks have none and are skipped.
            header = entry.split(None, 1)
            if header and header[0] in cluster_seqs:
                out.write('>%s' % entry)

In [20]:
def visualize_candidates(group_cluster):
    """Align one cluster, build its tree and export it as an annotated NEXUS file.

    Steps: ``extract_cluster`` writes ``alignment/<group>-cluster<n>.faa``;
    mafft aligns it into ``.aln``; FastTree builds ``.tree``; the tree is
    written to ``<group>-cluster<n>.fastFigTree`` with one
    ``[&tax_<rank>="<name>" ...]`` comment per leaf whose taxid resolves in
    the NCBI taxonomy.

    Parameters
    ----------
    group_cluster : str, ``'<group_id>#<cluster number>'``.

    Raises
    ------
    subprocess.CalledProcessError
        If mafft or FastTree exits with a non-zero status.
    """
    extract_cluster(group_cluster)

    group_id, cluster_num = group_cluster.split('#')
    prefix = 'alignment/%s-cluster%s' % (group_id, cluster_num)

    # check_call: stop at the failing tool rather than at a later step that
    # would trip over an empty or missing file.
    with open('%s.aln' % prefix, 'w') as out:
        subprocess.check_call(['mafft', '--auto', '--reorder', '%s.faa' % prefix],
                              stdout=out)

    # Use the original absolute path when it exists; otherwise resolve
    # FastTree through PATH, like mafft above.
    fasttree = '/Users/thiberio/anaconda2/bin/FastTree'
    if not os.path.isfile(fasttree):
        fasttree = 'FastTree'
    subprocess.check_call([fasttree,
                           '-gamma',
                           '-wag',
                           '-out', '%s.tree' % prefix,
                           '%s.aln' % prefix])

    tree = ete3.Tree('%s.tree' % prefix, format=0)

    # The context manager closes the file even if a lookup below raises.
    with open('%s-cluster%s.fastFigTree' % (group_id, cluster_num), 'w') as out:
        out.write("#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" %len(tree))

        for node in tree.traverse():
            if not node.is_leaf():
                continue

            # Only the part before the first '.' is the taxid; the rest of the
            # name may itself contain dots, so it must not be unpacked.
            taxid = node.name.split('.', 1)[0]
            try:
                lineage = {j: i for i, j in ncbi.get_rank(ncbi.get_lineage(taxid)).items()}
            except ValueError:
                # Taxid not resolved: list the leaf without annotation.
                out.write('\t%s\n' %(node.name))
                continue
            lineage_names = ncbi.get_taxid_translator(lineage.values())

            comment = ['tax_%s="%s"' %(rank, lineage_names[lineage[rank]])
                       for rank in ['class', 'phylum', 'order', 'family', 'species']
                       if rank in lineage]
            out.write('\t%s [&%s]\n' %(node.name, ' '.join(comment)))

        newick_text = tree.write(format=0)
        out.write(';\nend;\n')
        out.write('begin trees;\n\ttree tree_1 = [&R] %s\nend;' %newick_text)

In [32]:
%%time
# Run the analysis on the single group 'COG0415' and time it; the returned
# (group_id, per-cluster results) tuple is displayed as the cell output.
get_phyla_evol_distances('COG0415')

CPU times: user 15.6 s, sys: 1.64 s, total: 17.2 s
Wall time: 16.7 s


('COG0415', {1: {'df':     phylum    median
   7   200795  0.284011
   6     1297  0.388293
   0     1224  0.426472
   1   203682  0.639114
   2   201174  0.672776
   3      976  0.688130
   5     1239  1.105878
   4    57723  1.114026
   8   203691  1.162283
   10    1090  1.167904
   9   200918  1.199879
   11  200783  1.224844, 'significant': False}, 5: {'df':    phylum    median
   0    1224  0.706522
   1  200795  0.722727
   4     976  0.750055
   2   74201  0.807053
   3    1239  0.850086, 'significant': False}})

In [None]:
# Groups worth testing for cyano/chloroflexi distances: at least 10
# chloroflexi taxa and at least 70 cyano taxa among their members ...
well_sampled = (chloroflexi_count >= 10) & (cyano_count >= 70)
test_groups  = eggNOG_target_groups[well_sampled]

# ... and fewer than 10,000 proteins.
test_groups = test_groups[test_groups.num_proteins < 10_000]
print(test_groups.shape)

In [None]:
%%time
# Run get_phyla_evol_distances on every test group with 5 worker processes,
# each replaced after 5 tasks (maxtasksperchild).
# NOTE: map_async returns an AsyncResult, not the values themselves; once the
# pool has been joined they are available through `results.get()`, which also
# re-raises any exception raised in a worker.
pool    = multiprocessing.Pool(processes=5, maxtasksperchild=5)
results = pool.map_async(get_phyla_evol_distances, test_groups.group_id.values)
pool.close()
pool.join()

In [None]:
# Candidate clusters, whitespace-separated, each written as
# '<group id>#<cluster number>' (the format split on '#' by extract_cluster).
# The backslashes continue the string literal, so the indentation of the
# following lines is part of the string; it is consumed by `.split()`.
candidates = 'COG0049#3  COG0073#0 COG0180#1 COG0310#0\
              COG0415#2  COG0499#2 COG0685#8 COG1304#5\
              COG1633#10 COG1666#3 COG2264#0 COG2324#2'

In [None]:
# Build the alignment, tree and annotated NEXUS file of each candidate, one
# after the other; the name is printed first to follow progress.
# Expects `candidates` to be the whitespace-separated string form.
for candidate in candidates.split():
    print(candidate)
    visualize_candidates(candidate)

In [33]:
# Same export for three more clusters, given as '<group id>#<cluster number>'.
for candidate in ['COG0377#4', 'COG0540#4', 'COG0769#6']:
    print(candidate)
    visualize_candidates(candidate)

COG0377#4
COG0540#4
COG0769#6


In [34]:
# Export cluster 7 of group COG0003 (alignment, tree and annotated NEXUS file).
visualize_candidates('COG0003#7')

In [27]:
# Clusters to export, each written as '<group id>#<cluster number>'.
candidates = ['COG0003#7', 'COG0674#1', 'COG1013#1', 'COG1014#2', 'COG1610#4',
              'COG2867#5', 'COG3188#7', 'COG3349#5', 'COG5523#0']

# The context manager shuts the worker processes down once `map` has
# returned; the pool used to be left open, leaking its workers.
# visualize_candidates only writes files, so its return values are not kept.
with multiprocessing.Pool(processes=10) as pool:
    pool.map(visualize_candidates, candidates)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [35]:
# Write the full tree of group COG0499 to 'COG0499.figTree'.
visualize_in_figTree('COG0499')

In [None]:
# Export an already built cluster tree as an annotated NEXUS file, without
# re-running the alignment and tree inference done by visualize_candidates.
tree = ete3.Tree('alignment/COG0049-cluster3.tree', format=0)

# The context manager closes the file even if a lookup below raises.
with open('COG0049-cluster3.fastFigTree', 'w') as out:
    out.write("#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" %len(tree))

    for node in tree.traverse():
        if not node.is_leaf():
            continue

        # Only the part before the first '.' is the taxid; the rest of the
        # name may itself contain dots, so it must not be unpacked.
        taxid = node.name.split('.', 1)[0]
        try:
            lineage = {j: i for i, j in ncbi.get_rank(ncbi.get_lineage(taxid)).items()}
        except ValueError:
            # Taxid not resolved: list the leaf without annotation.
            out.write('\t%s\n' %(node.name))
            continue
        lineage_names = ncbi.get_taxid_translator(lineage.values())

        comment = ['tax_%s="%s"' %(rank, lineage_names[lineage[rank]])
                   for rank in ['class', 'phylum', 'order', 'family', 'species']
                   if rank in lineage]
        out.write('\t%s [&%s]\n' %(node.name, ' '.join(comment)))

    newick_text = tree.write(format=0)
    out.write(';\nend;\n')
    out.write('begin trees;\n\ttree tree_1 = [&R] %s\nend;' %newick_text)