In [1]:
import pandas as pd
import ete3
import re
import community
import networkx as nx
import itertools
import numpy as np
from sklearn import manifold
from scipy.spatial.distance import squareform
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist
import igraph as ig
import plotly
import chart_studio.plotly as ptl
import plotly.graph_objects as go
import colorlover as cl
from IPython.display import HTML
import multiprocessing
import pickle as pkl
import random
from sklearn import mixture
from collections import Counter

ptl.sign_in('lthiberiol', 'm15ikp59lt')
ncbi = ete3.NCBITaxa()
%cd /work/eggNOG/

/work/eggNOG


In [2]:
# Tab-separated table of the sampled genomes; its first column becomes the index.
sampled_genomes = pd.read_csv('/work/kelsey/genomes.tab', sep='\t', index_col=0)

In [3]:
# Build one {rank: taxid} record per sampled species and create the frame in a
# single call. Growing a DataFrame with .append() inside the loop copies the
# whole frame on every iteration, and DataFrame.append no longer exists in
# pandas >= 2.0.
lineage_records = []
for taxid in sampled_genomes.species_taxid.unique():
    if pd.isna(taxid):
        continue
    # get_rank() maps every taxid of the lineage to its rank name; invert it so
    # that ranks become column names.
    rank_by_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
    lineage_records.append({tax_rank: tmp_taxid
                            for tmp_taxid, tax_rank in rank_by_taxid.items()})

# Keep only the ranks used downstream; ranks missing from a lineage become NaN.
lineages = pd.DataFrame(lineage_records).reindex(columns=['class', 'family',  'genus', 'phylum',
                                                          'order', 'species', 'superkingdom'])
# Restrict to superkingdom taxid 2 (Bacteria in the NCBI taxonomy).
lineages = lineages.query('superkingdom == 2').copy()

In [4]:
# Per-taxon information shipped with eggNOG; the file has no header row (lines
# starting with '#' are skipped), so column names are supplied here and the
# taxid column is used as the index.
taxid_info_columns = ['Taxid', 'Sci.Name', 'Rank', 'Named Lineage', 'Taxid Lineage']
eggNOG_sample = pd.read_csv('e5.bacteria.taxid_info.tsv',
                            sep='\t',
                            comment='#',
                            header=None,
                            names=taxid_info_columns,
                            index_col=0)

In [5]:
# Build the rank table for every eggNOG taxid in one go. Appending one Series
# at a time with DataFrame.append is quadratic in the number of taxa and the
# method was removed in pandas 2.0.
eggNOG_taxids          = []
eggNOG_lineage_records = []
for taxid in eggNOG_sample.index.unique():
    if pd.isna(taxid):
        continue
    rank_by_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
    eggNOG_taxids.append(taxid)
    eggNOG_lineage_records.append({tax_rank: tmp_taxid
                                   for tmp_taxid, tax_rank in rank_by_taxid.items()})

# Rows are indexed by the eggNOG taxid so later cells can use .loc[taxids].
eggNOG_lineage = pd.DataFrame(eggNOG_lineage_records, index=eggNOG_taxids)
eggNOG_lineage = eggNOG_lineage.reindex(columns=['class', 'family',  'genus', 'phylum',
                                                 'order', 'species', 'superkingdom'])
# Restrict to superkingdom taxid 2 (Bacteria in the NCBI taxonomy).
eggNOG_lineage = eggNOG_lineage.query('superkingdom == 2').copy()

# eggNOG taxa that belong to a phylum also present among the sampled genomes.
eggNOG_target_phyla = eggNOG_lineage[eggNOG_lineage.phylum.isin(lineages.phylum.unique())]


taxid 1344012 was translated into 480813


taxid 443255 was translated into 1901


taxid 1525715 was translated into 1545044


taxid 861530 was translated into 29382


taxid 1317118 was translated into 1379903


taxid 67281 was translated into 67351


taxid 1353531 was translated into 1708715


taxid 1288963 was translated into 1232681


taxid 1345697 was translated into 1921421


taxid 1552758 was translated into 1885902


taxid 469595 was translated into 1639133


taxid 469596 was translated into 100884


taxid 1434929 was translated into 1820025


taxid 1104325 was translated into 1158600


taxid 911239 was translated into 122355


taxid 265729 was translated into 246786


taxid 1122931 was translated into 1203610


taxid 1118055 was translated into 33037


taxid 1219084 was translated into 1123384


taxid 667632 was translated into 863227


taxid 1166016 was translated into 1905730


taxid 1408427 was translated into 1094555


taxid 520709 was translated into 1530123


taxid 13362

In [6]:
# Orthologous-group membership table; only columns 1-4 of the file are read.
eggNOG_groups = pd.read_csv('2_members.tsv',
                            sep='\t',
                            header=None,
                            usecols=[1,2,3,4],
                            names=['group_id', 'num_proteins', 'num_taxa', 'members'])

# `members` is a comma-separated list of "<taxid>.<protein id>" entries; capture
# the taxid of each one. The pattern is a raw string: '\d' in a regular string
# literal is an invalid escape sequence and warns on recent Python versions.
member_taxid_pattern = re.compile(r'(\d+)\.(?:[^,]+)')

tmp           = eggNOG_groups.members.map(lambda cell: [int(taxid)
                                                        for taxid in member_taxid_pattern.findall(cell)])
tmp.name      = 'taxa'
eggNOG_groups = eggNOG_groups.join(tmp)

In [7]:
def get_phyla_overlap(taxa):
    """Return the phyla of `taxa` that also occur among the sampled genomes.

    `taxa` is a list of taxids used as row labels of the module-level
    `eggNOG_lineage` frame; the result is a set of phylum taxids that appear
    both there and in `lineages.phylum`.
    """
    sampled_phyla = set(lineages.phylum.unique())
    group_phyla   = set(eggNOG_lineage.loc[taxa, 'phylum'].unique())
    return group_phyla & sampled_phyla

# Keep the groups whose members cover more than one of the sampled phyla.
spans_multiple_phyla = eggNOG_groups.taxa.map(lambda cell: len(get_phyla_overlap(cell)) > 1)
eggNOG_target_groups = eggNOG_groups[spans_multiple_phyla]

In [8]:
# Tree strings per eggNOG group (indexed by group_id), restricted to and
# ordered like the target groups selected above.
eggNOG_trees = (
    pd.read_csv('2_trees.tsv',
                sep='\t',
                header=None,
                usecols=[1,2,3],
                index_col=0,
                names=['group_id', 'fast', 'tree'])
    .reindex(index=eggNOG_target_groups.group_id)
)

In [9]:
# Taxids of the eggNOG genomes assigned to each of the two phyla of interest.
eggNOG_chloroflexi = eggNOG_lineage.query('phylum==200795').index
eggNOG_cyano       = eggNOG_lineage.query('phylum==1117'  ).index

# Per target group: number of distinct taxa from each phylum among its members.
chloroflexi_count = eggNOG_target_groups.taxa.map(
    lambda group_taxa: len(eggNOG_chloroflexi.intersection(set(group_taxa)))
)
cyano_count       = eggNOG_target_groups.taxa.map(
    lambda group_taxa: len(eggNOG_cyano.intersection(set(group_taxa)))
)

In [10]:
# Groups interesting to test distances between cyano and chloroflexi: they must
# contain at least 10 chloroflexi taxa and at least 70 cyano taxa.
has_enough_chloroflexi = chloroflexi_count >= 10
has_enough_cyano       = cyano_count >= 70
test_groups = eggNOG_target_groups[has_enough_chloroflexi & has_enough_cyano]

In [13]:
# Drop very large families (5,000 proteins or more).
test_groups = test_groups.loc[test_groups.num_proteins < 5_000]

In [190]:
# Preview the first five candidate groups.
test_groups.head(n=5)

Unnamed: 0,group_id,num_proteins,num_taxa,members,taxa
659,2Z832,120,120,"102125.Xen7305DRAFT_00015630,102129.Lepto7375D...","[102125, 102129, 102232, 103690, 1089550, 1094..."
202326,COG0001,4694,3218,"1000565.METUNv1_01136,1000565.METUNv1_03305,10...","[1000565, 1000565, 1001240, 1001530, 1001585, ..."
202327,COG0002,3957,3641,"1000565.METUNv1_02881,1001240.GY21_06330,10015...","[1000565, 1001240, 1001530, 1001585, 100226, 1..."
202328,COG0003,2423,1167,"100226.SCO2128,100226.SCO3577,100226.SCO3578,1...","[100226, 100226, 100226, 1002339, 1003195, 100..."
202330,COG0005,3783,3028,"1000565.METUNv1_01305,1000569.HMPREF1040_0706,...","[1000565, 1000569, 1000570, 1000588, 1001240, ..."


In [223]:
# Screen a random sample of up to 50 candidate families. For each family the
# sequences are clustered by tree distance and, for every cluster holding at
# least 30% of the family's cyanobacterial sequences, the phylum whose members
# lie closest to those cyanobacteria is printed.
# NOTE(review): `random` is not seeded here, so the sampled families differ
# between runs.
candidate_group_ids = test_groups.group_id.to_list()
for group_id in random.sample(candidate_group_ids, min(50, len(candidate_group_ids))):
    print(group_id)

    tree = ete3.Tree(eggNOG_trees.loc[group_id, 'tree'])

    # Collect the leaf names and give every internal node a unique label so
    # the tree can be written as a named edge list.
    leaf_names = []
    for count, node in enumerate(tree.traverse()):
        if node.is_leaf():
            leaf_names.append(node.name)
        else:
            node.name = 'node_%i' % count
    leaf_names = np.array(leaf_names)

    # One (parent, child, branch length) record per edge of the tree.
    nodes         = []
    children      = []
    branch_length = []
    for node in tree.traverse():
        if not node.is_leaf():
            for child in node.get_children():
                nodes.append(         node.name)
                children.append(     child.name)
                branch_length.append(child.dist)

    branch_length_df = pd.DataFrame({'node':          nodes,
                                     'child':         children,
                                     'branch_length': branch_length})

    # Leaf-to-leaf distances are weighted shortest paths through the tree graph.
    dag  = ig.Graph.TupleList(edges=branch_length_df[['node',
                                                      'child',
                                                      'branch_length']].itertuples(index=False),
                                directed=False,
                                weights=True)
    matrix = np.array(dag.shortest_paths(source=leaf_names, target=leaf_names, weights='weight'))

    # Leaf names look like "<taxid>.<protein id>"; the taxid gives the phylum.
    taxids = [int(leaf.split('.')[0]) for leaf in leaf_names]
    phyla  = eggNOG_lineage.loc[taxids, 'phylum'].values.astype(int)

    # Long-format table with one row per unordered pair of leaves.
    triu_indices       = np.triu_indices_from(matrix, k=1)
    df                 = pd.DataFrame()
    df['phylum1']      = phyla[triu_indices[0]]
    df['phylum2']      = phyla[triu_indices[1]]
    df['sequence1']    = leaf_names[triu_indices[0]]
    df['sequence2']    = leaf_names[triu_indices[1]]
    df['distance']     = matrix[triu_indices]
    # Similarity weight for clustering, computed from the raw distance ...
    df['inverse_dist'] = np.e**np.negative(df.distance)
    # ... after which distances are rescaled by the family's first quartile.
    df['distance']    /= np.percentile(df.distance, 25)

    graph  = ig.Graph.TupleList(edges=df[['sequence1',
                                          'sequence2',
                                          'inverse_dist']].itertuples(index=False),
                                directed=False,
                                weights =True)
    clusters = graph.community_multilevel(weights='weight')

    # Vertices are created in order of first appearance in the edge list,
    # which for the upper-triangle pairs is the order of `leaf_names`/`phyla`.
    graph.vs['cluster'] = clusters.membership
    graph.vs['phylum']  = phyla

    family_cyano_count = len(graph.vs.select(phylum=1117))

    for cluster_num in set(clusters.membership):
        tmp_cluster = graph.vs.select(cluster=cluster_num)

        # The per-cluster count gets its own name: reusing `cyano_count` would
        # overwrite the per-group Series that the `test_groups` cell filters on.
        cluster_cyanos      = tmp_cluster.select(phylum=1117)
        cluster_cyano_count = len(cluster_cyanos)
        if cluster_cyano_count < family_cyano_count*0.3:
            continue

        cluster_not_cyanos = tmp_cluster.select(lambda node: bool(node['phylum'] != 1117))

        cluster_cyano_names     = cluster_cyanos.get_attribute_values(    'name')
        cluster_not_cyano_names = cluster_not_cyanos.get_attribute_values('name')

        # Cyano/non-cyano pairs inside this cluster, keeping the phylum of the
        # non-cyano partner whichever side of the pair it is on.
        cyano_is_sequence1 = df.sequence1.isin(cluster_cyano_names) & \
                             df.sequence2.isin(cluster_not_cyano_names)
        cyano_is_sequence2 = df.sequence2.isin(cluster_cyano_names) & \
                             df.sequence1.isin(cluster_not_cyano_names)

        tmp1 = df.loc[cyano_is_sequence1, ['phylum2', 'distance']].rename(columns={'phylum2':'phylum'})
        tmp2 = df.loc[cyano_is_sequence2, ['phylum1', 'distance']].rename(columns={'phylum1':'phylum'})

        sub_df = pd.concat([tmp1, tmp2])

        threshold     = np.inf
        closest_phyla = None
        for phylum in sub_df.phylum.unique():
            # Ignore phyla with fewer than 5 sequences in the cluster.
            if len(tmp_cluster.select(phylum=phylum)) < 5:
                continue

            # Score a phylum by the 3rd quartile of its 10 smallest distances
            # to the cluster's cyanobacteria.
            closest_distances   = sub_df.loc[sub_df.phylum==phylum, 'distance'].sort_values().iloc[:10]
            phylum_3rd_quartile = np.percentile(closest_distances, 75)
            if phylum_3rd_quartile < threshold:
                threshold     = phylum_3rd_quartile
                closest_phyla = phylum

        if closest_phyla is not None:
            print('  Closest to cyanos: %s (%.4f)' % (closest_phyla, threshold))

COG0164
  Closest to cyanos: 1224 (0.7323)
COG3956
  Closest to cyanos: 1224 (0.6270)
COG0440
  Closest to cyanos: 201174 (0.4823)
COG2059
  Closest to cyanos: 1224 (0.2091)
COG1008
  Closest to cyanos: 1239 (0.7764)
COG0040
  Closest to cyanos: 1224 (0.6313)
COG1007
COG0244
  Closest to cyanos: 74201 (0.6802)
COG1633
  Closest to cyanos: 200795 (0.2590)
COG0233
  Closest to cyanos: 203691 (0.6087)
COG0066
  Closest to cyanos: 1297 (0.6756)
COG0128
  Closest to cyanos: 1239 (0.3883)
COG0481
  Closest to cyanos: 1239 (0.5762)
COG1950
  Closest to cyanos: 1224 (0.4578)
COG0103
  Closest to cyanos: 1239 (0.5296)
COG0651
COG0192
  Closest to cyanos: 1224 (0.4446)
COG1200
  Closest to cyanos: 200795 (0.5268)
COG0432
  Closest to cyanos: 976 (0.2911)
  Closest to cyanos: 1224 (0.4156)
COG2255
  Closest to cyanos: 1239 (0.7476)
COG2374
  Closest to cyanos: 976 (0.2406)
  Closest to cyanos: 1224 (0.2024)
COG0812
  Closest to cyanos: 201174 (0.5836)
COG2343
  Closest to cyanos: 200795 (0.2299)


In [222]:
#
# test specific families
#

# Same analysis as the random-sample screen, run on one hand-picked family.
# The names `graph`, `clusters`, `sub_df` and `phylum` are left in the notebook
# namespace on purpose: the inspection cells further down read them.
tree = ete3.Tree(eggNOG_trees.loc['COG1235', 'tree'])

# Collect the leaf names and give every internal node a unique label so the
# tree can be written as a named edge list.
leaf_names = []
for count, node in enumerate(tree.traverse()):
    if node.is_leaf():
        leaf_names.append(node.name)
    else:
        node.name = 'node_%i' % count
leaf_names = np.array(leaf_names)

# One (parent, child, branch length) record per edge of the tree.
nodes         = []
children      = []
branch_length = []
for node in tree.traverse():
    if not node.is_leaf():
        for child in node.get_children():
            nodes.append(         node.name)
            children.append(     child.name)
            branch_length.append(child.dist)

branch_length_df = pd.DataFrame({'node':          nodes,
                                 'child':         children,
                                 'branch_length': branch_length})

# Leaf-to-leaf distances are weighted shortest paths through the tree graph.
dag  = ig.Graph.TupleList(edges=branch_length_df[['node',
                                                  'child',
                                                  'branch_length']].itertuples(index=False),
                            directed=False,
                            weights=True)
matrix = np.array(dag.shortest_paths(source=leaf_names, target=leaf_names, weights='weight'))

# Leaf names look like "<taxid>.<protein id>"; the taxid gives the phylum.
taxids = [int(leaf.split('.')[0]) for leaf in leaf_names]
phyla  = eggNOG_lineage.loc[taxids, 'phylum'].values.astype(int)

# Long-format table with one row per unordered pair of leaves.
triu_indices       = np.triu_indices_from(matrix, k=1)
df                 = pd.DataFrame()
df['phylum1']      = phyla[triu_indices[0]]
df['phylum2']      = phyla[triu_indices[1]]
df['sequence1']    = leaf_names[triu_indices[0]]
df['sequence2']    = leaf_names[triu_indices[1]]
df['distance']     = matrix[triu_indices]
# Similarity weight for clustering, computed from the raw distance ...
df['inverse_dist'] = np.e**np.negative(df.distance)
# ... after which distances are rescaled by the family's first quartile.
df['distance']    /= np.percentile(df.distance, 25)

graph  = ig.Graph.TupleList(edges=df[['sequence1',
                                      'sequence2',
                                      'inverse_dist']].itertuples(index=False),
                            directed=False,
                            weights =True)
clusters = graph.community_multilevel(weights='weight')

# Vertices are created in order of first appearance in the edge list, which
# for the upper-triangle pairs is the order of `leaf_names`/`phyla`.
graph.vs['cluster'] = clusters.membership
graph.vs['phylum']  = phyla

family_cyano_count = len(graph.vs.select(phylum=1117))

for cluster_num in set(clusters.membership):
    tmp_cluster = graph.vs.select(cluster=cluster_num)

    # The per-cluster count gets its own name: reusing `cyano_count` would
    # overwrite the per-group Series that the `test_groups` cell filters on.
    cluster_cyanos      = tmp_cluster.select(phylum=1117)
    cluster_cyano_count = len(cluster_cyanos)
    if cluster_cyano_count < family_cyano_count*0.3:
        continue

    cluster_not_cyanos = tmp_cluster.select(lambda node: bool(node['phylum'] != 1117))

    cluster_cyano_names     = cluster_cyanos.get_attribute_values(    'name')
    cluster_not_cyano_names = cluster_not_cyanos.get_attribute_values('name')

    # Cyano/non-cyano pairs inside this cluster, keeping the phylum of the
    # non-cyano partner whichever side of the pair it is on.
    cyano_is_sequence1 = df.sequence1.isin(cluster_cyano_names) & \
                         df.sequence2.isin(cluster_not_cyano_names)
    cyano_is_sequence2 = df.sequence2.isin(cluster_cyano_names) & \
                         df.sequence1.isin(cluster_not_cyano_names)

    tmp1 = df.loc[cyano_is_sequence1, ['phylum2', 'distance']].rename(columns={'phylum2':'phylum'})
    tmp2 = df.loc[cyano_is_sequence2, ['phylum1', 'distance']].rename(columns={'phylum1':'phylum'})

    sub_df = pd.concat([tmp1, tmp2])

    threshold     = np.inf
    closest_phyla = None
    for phylum in sub_df.phylum.unique():
        # Ignore phyla with fewer than 5 sequences in the cluster.
        if len(tmp_cluster.select(phylum=phylum)) < 5:
            continue

        # Score a phylum by the 3rd quartile of its 10 smallest distances to
        # the cluster's cyanobacteria.
        closest_distances   = sub_df.loc[sub_df.phylum==phylum, 'distance'].sort_values().iloc[:10]
        phylum_3rd_quartile = np.percentile(closest_distances, 75)
        if phylum_3rd_quartile < threshold:
            threshold     = phylum_3rd_quartile
            closest_phyla = phylum

    if closest_phyla is not None:
        print('  Closest to cyanos: %s (%.4f)' % (closest_phyla, threshold))

  Closest to cyanos: 1224 (0.2179)


In [221]:
# Inspect the 10 smallest cyano-to-`phylum` distances. `sub_df` and `phylum`
# are whatever the last executed analysis cell left in the namespace.
# NOTE(review): the saved output lists more than 10 rows, so it may have been
# produced before the `[:10]` slice was added - re-run to confirm.
sub_df.loc[sub_df.phylum==phylum, 'distance'].sort_values()[:10]

8107866    0.530517
8108117    0.534875
8107603    0.535898
8108326    0.535917
8107606    0.536312
8108116    0.537013
8107739    0.537890
8107605    0.538613
8107986    0.539905
8108237    0.540248
8108321    0.541015
8108432    0.541092
8108433    0.542177
8107867    0.543328
8108236    0.545622
8108434    0.546425
8107985    0.546635
8107740    0.546806
8107863    0.546837
8108435    0.547320
8107738    0.548814
8107604    0.548908
8108322    0.549000
8108436    0.549271
8108323    0.549553
8108325    0.549642
8108524    0.549780
8108439    0.550022
8108440    0.550258
8108235    0.550964
             ...   
8108641    0.710094
8108308    0.710535
8108515    0.710895
8108424    0.713334
8108108    0.716957
8108644    0.719162
8108517    0.719374
8108425    0.719438
8108110    0.720537
8108706    0.720860
8108705    0.721543
8108586    0.721939
8108518    0.722272
8108709    0.730095
8108707    0.733323
8108766    0.735517
8108710    0.738417
8108711    0.739520
8108765    0.746118


In [201]:
# Walk through the clusters and stop at the first one that contains at least
# one cyanobacterial sequence; the loop variables are left in the namespace
# for inspection in the following cells.
for cluster_num in set(clusters.membership):
    tmp_cluster    = graph.vs.select(cluster=cluster_num)
    cluster_size   = len(tmp_cluster)
    cluster_cyanos = tmp_cluster.select(phylum=1117)
    cyano_count    = len(cluster_cyanos)
    if cyano_count:
        break

In [205]:
# Number of cyanobacterial vertices in the currently selected cluster.
len(cluster_cyanos)

1

In [204]:
# Current value of `cyano_count` (set by whichever cell assigned it last).
cyano_count

1

In [210]:
# Total number of vertices in the current graph labelled with phylum 1117.
cyano_vertices     = graph.vs.select(phylum=1117)
family_cyano_count = len(cyano_vertices)

In [211]:
# Display the family-wide count computed in the previous cell.
family_cyano_count

548

In [None]:
def get_closest_phylum_to_cyano(group_id):
    """Cluster the same-phylum sequence pairs of one eggNOG family.

    NOTE(review): despite its name, this function does not report a closest
    phylum. It returns the table of intra-phylum leaf pairs annotated with the
    cluster of each sequence.

    Parameters
    ----------
    group_id
        Row label of the module-level ``eggNOG_trees`` frame whose ``tree``
        column holds the tree string to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per pair of leaves that share a phylum, with columns
        ``phylum1``, ``phylum2``, ``sequence1``, ``sequence2``, ``distance``,
        ``inverse_dist``, ``sequence1_cluster`` and ``sequence2_cluster``.
    """
    tree = ete3.Tree(eggNOG_trees.loc[group_id, 'tree'])

    # Collect the leaf names and give every internal node a unique label so
    # the tree can be written as a named edge list.
    leaf_names = []
    for count, node in enumerate(tree.traverse()):
        if node.is_leaf():
            leaf_names.append(node.name)
        else:
            node.name = 'node_%i' % count
    leaf_names = np.array(leaf_names)

    # One (parent, child, branch length) record per edge of the tree.
    edge_records = [(node.name, child.name, child.dist)
                    for node in tree.traverse()
                    if not node.is_leaf()
                    for child in node.get_children()]
    branch_length_df = pd.DataFrame(edge_records,
                                    columns=['node', 'child', 'branch_length'])

    # Leaf-to-leaf distances are weighted shortest paths through the tree graph.
    tree_graph = ig.Graph.TupleList(edges=branch_length_df.itertuples(index=False),
                                    directed=False,
                                    weights=True)
    matrix = np.array(tree_graph.shortest_paths(source=leaf_names,
                                                target=leaf_names,
                                                weights='weight'))

    # Leaf names look like "<taxid>.<protein id>"; the taxid gives the phylum.
    taxids = [int(leaf.split('.')[0]) for leaf in leaf_names]
    phyla  = eggNOG_lineage.loc[taxids, 'phylum'].values.astype(int)

    # Long-format table with one row per unordered pair of leaves.
    triu_indices    = np.triu_indices_from(matrix, k=1)
    df              = pd.DataFrame()
    df['phylum1']   = phyla[triu_indices[0]]
    df['phylum2']   = phyla[triu_indices[1]]
    df['sequence1'] = leaf_names[triu_indices[0]]
    df['sequence2'] = leaf_names[triu_indices[1]]
    df['distance']  = matrix[triu_indices]

    # Only pairs within the same phylum are clustered. (The community detection
    # previously run on all pairs was discarded unused and has been removed.)
    intra_phylum_df = df[df.phylum1==df.phylum2].copy()
    intra_phylum_df['inverse_dist'] = np.e**np.negative(intra_phylum_df.distance)

    graph  = ig.Graph.TupleList(edges=intra_phylum_df[['sequence1',
                                                       'sequence2',
                                                       'inverse_dist']].itertuples(index=False),
                                directed=False,
                                weights =True)
    clusters = graph.community_multilevel(weights='weight')

    # Sequence name -> cluster number, then annotate both ends of every pair.
    cluster_map = dict(zip(graph.vs['name'], clusters.membership))
    intra_phylum_df['sequence1_cluster'] = intra_phylum_df.sequence1.apply(lambda sequence_name: cluster_map[sequence_name])
    intra_phylum_df['sequence2_cluster'] = intra_phylum_df.sequence2.apply(lambda sequence_name: cluster_map[sequence_name])

    return(intra_phylum_df)

#     with open('chloroflexi_cyano_dists/%s.pkl' % group_id, 'wb') as out:
#         pkl.dump(cyano_VS_chloroflexi/np.median(monophyletic_intra_phylum_df.distance), out)
    
#     return(group_id)

In [None]:
def get_norm_patristic_distances(group_id):
    """Return clustered intra-phylum leaf distances for one eggNOG family.

    The family is kept only if, by one-sided Mann-Whitney U tests, distances
    within cyanobacteria (phylum 1117) and within chloroflexi (phylum 200795)
    are both smaller than the distances between the two phyla.

    NOTE(review): this notebook defines ``get_norm_patristic_distances`` again
    in a later cell; whichever definition runs last is the one in effect. No
    normalisation is applied to the returned distances.

    Parameters
    ----------
    group_id
        Row label of the module-level ``eggNOG_trees`` frame whose ``tree``
        column holds the tree string to analyse.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when either test has p-value > 0.05. Otherwise one row per
        pair of same-phylum leaves with columns ``phylum1``, ``phylum2``,
        ``sequence1``, ``sequence2``, ``distance``, ``inverse_dist``,
        ``sequence1_cluster`` and ``sequence2_cluster``.
    """
    # The notebook's import cell does not import mannwhitneyu, so the original
    # code raised NameError here; import it where it is used.
    from scipy.stats import mannwhitneyu

    tree = ete3.Tree(eggNOG_trees.loc[group_id, 'tree'])

    # Collect the leaf names and give every internal node a unique label so
    # the tree can be written as a named edge list.
    leaf_names = []
    for count, node in enumerate(tree.traverse()):
        if node.is_leaf():
            leaf_names.append(node.name)
        else:
            node.name = 'node_%i' % count
    leaf_names = np.array(leaf_names)

    # One (parent, child, branch length) record per edge of the tree.
    edge_records = [(node.name, child.name, child.dist)
                    for node in tree.traverse()
                    if not node.is_leaf()
                    for child in node.get_children()]
    branch_length_df = pd.DataFrame(edge_records,
                                    columns=['node', 'child', 'branch_length'])

    # Leaf-to-leaf distances are weighted shortest paths through the tree graph.
    tree_graph = ig.Graph.TupleList(edges=branch_length_df.itertuples(index=False),
                                    directed=False,
                                    weights=True)
    matrix = np.array(tree_graph.shortest_paths(source=leaf_names,
                                                target=leaf_names,
                                                weights='weight'))

    # Leaf names look like "<taxid>.<protein id>"; the taxid gives the phylum.
    taxids = [int(leaf.split('.')[0]) for leaf in leaf_names]
    phyla  = eggNOG_lineage.loc[taxids, 'phylum'].values.astype(int)

    # Long-format table with one row per unordered pair of leaves.
    triu_indices    = np.triu_indices_from(matrix, k=1)
    df              = pd.DataFrame()
    df['phylum1']   = phyla[triu_indices[0]]
    df['phylum2']   = phyla[triu_indices[1]]
    df['sequence1'] = leaf_names[triu_indices[0]]
    df['sequence2'] = leaf_names[triu_indices[1]]
    df['distance']  = matrix[triu_indices]

    # Discard every pair involving phylum 1224
    # (presumably Proteobacteria in the NCBI taxonomy - confirm).
    df = df[(df.phylum1!=1224) & (df.phylum2!=1224)]

    intra_phylum_df = df[df.phylum1==df.phylum2].copy()
    inter_phylum_df = df[df.phylum1!=df.phylum2]

    # Inter-phylum pairs whose two phyla are both in {chloroflexi, cyano},
    # i.e. one sequence from each.
    cyano_VS_chloroflexi = inter_phylum_df.loc[(inter_phylum_df.phylum1.isin([200795, 1117])) & \
                                               (inter_phylum_df.phylum2.isin([200795, 1117])),
                                               'distance'].values

    # Reject the family unless each phylum is internally closer than the two
    # phyla are to each other.
    for phylum_taxid in (1117, 200795):
        intra_distances = intra_phylum_df.loc[intra_phylum_df.phylum1 == phylum_taxid,
                                              'distance'].values
        hypothesis      = mannwhitneyu(intra_distances, cyano_VS_chloroflexi, alternative='less')
        if hypothesis.pvalue > 0.05:
            return(None)

    intra_phylum_df['inverse_dist'] = np.e**np.negative(intra_phylum_df.distance)

    graph  = ig.Graph.TupleList(edges=intra_phylum_df[['sequence1',
                                                       'sequence2',
                                                       'inverse_dist']].itertuples(index=False),
                                directed=False,
                                weights =True)
    clusters = graph.community_multilevel(weights='weight')

    # Sequence name -> cluster number, then annotate both ends of every pair.
    cluster_map = dict(zip(graph.vs['name'], clusters.membership))
    intra_phylum_df['sequence1_cluster'] = intra_phylum_df.sequence1.apply(lambda sequence_name: cluster_map[sequence_name])
    intra_phylum_df['sequence2_cluster'] = intra_phylum_df.sequence2.apply(lambda sequence_name: cluster_map[sequence_name])

    return(intra_phylum_df)

#     with open('chloroflexi_cyano_dists/%s.pkl' % group_id, 'wb') as out:
#         pkl.dump(cyano_VS_chloroflexi/np.median(monophyletic_intra_phylum_df.distance), out)
    
#     return(group_id)

In [None]:
# NOTE(review): in the visible notebook `group_ids` is only assigned by the
# loop cell that follows, so on a fresh kernel this line raises NameError.
# Presumably the sample should be drawn from `test_groups.group_id` - confirm
# the intended source and define it here.
group_sample = group_ids

In [None]:
# Collect the intra-phylum distance table of every sampled family that passes
# the tests inside get_norm_patristic_distances.
matrices  = []
group_ids = []
for group_id in group_sample:
    test = get_norm_patristic_distances(group_id)
    # The function returns None for rejected families; calling .copy() on
    # that result would raise AttributeError, so skip them.
    if test is None:
        continue
    matrices.append(test.copy())
    group_ids.append(group_id)
    print(len(matrices))

In [None]:
# One panel per collected family (at most 10): density of all intra-phylum
# distances versus only those pairs that also share a cluster, with a marker
# at each distribution's median.
fig, axs = plt.subplots(nrows=10, figsize=(10, 15))
for intra_df, ax in zip(matrices, axs):
    sns.kdeplot(intra_df.distance, shade=True, label='all intra phylum', ax=ax)

    same_cluster           = intra_df.sequence1_cluster == intra_df.sequence2_cluster
    monophyletic_distances = intra_df.loc[same_cluster, 'distance']
    sns.kdeplot(monophyletic_distances, shade=True, label='only intra cluster and intra phylum', ax=ax)

    # Median of all intra-phylum distances (red).
    y_low, y_high = ax.get_ylim()
    ax.fill_between([np.median(intra_df.distance)], [y_low], [y_high], color='r')

    # Median of the intra-cluster subset (green); the limits are read again
    # because the previous call may have changed them.
    y_low, y_high = ax.get_ylim()
    ax.fill_between([np.median(monophyletic_distances)], [y_low], [y_high], color='g')

In [None]:
def get_norm_patristic_distances(group_id):
    """Return clustered intra-phylum leaf distances for one eggNOG family.

    The family is kept only if, by one-sided Mann-Whitney U tests, distances
    within cyanobacteria (phylum 1117) and within chloroflexi (phylum 200795)
    are both smaller than the distances between the two phyla.

    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; running this cell replaces that definition. No
    normalisation is applied to the returned distances.

    Parameters
    ----------
    group_id
        Row label of the module-level ``eggNOG_trees`` frame whose ``tree``
        column holds the tree string to analyse.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when either test has p-value > 0.05. Otherwise one row per
        pair of same-phylum leaves with columns ``phylum1``, ``phylum2``,
        ``sequence1``, ``sequence2``, ``distance``, ``inverse_dist``,
        ``sequence1_cluster`` and ``sequence2_cluster``.
    """
    # The notebook's import cell does not import mannwhitneyu, so the original
    # code raised NameError here; import it where it is used.
    from scipy.stats import mannwhitneyu

    tree = ete3.Tree(eggNOG_trees.loc[group_id, 'tree'])

    # Collect the leaf names and give every internal node a unique label so
    # the tree can be written as a named edge list.
    leaf_names = []
    for count, node in enumerate(tree.traverse()):
        if node.is_leaf():
            leaf_names.append(node.name)
        else:
            node.name = 'node_%i' % count
    leaf_names = np.array(leaf_names)

    # One (parent, child, branch length) record per edge of the tree.
    edge_records = [(node.name, child.name, child.dist)
                    for node in tree.traverse()
                    if not node.is_leaf()
                    for child in node.get_children()]
    branch_length_df = pd.DataFrame(edge_records,
                                    columns=['node', 'child', 'branch_length'])

    # Leaf-to-leaf distances are weighted shortest paths through the tree graph.
    tree_graph = ig.Graph.TupleList(edges=branch_length_df.itertuples(index=False),
                                    directed=False,
                                    weights=True)
    matrix = np.array(tree_graph.shortest_paths(source=leaf_names,
                                                target=leaf_names,
                                                weights='weight'))

    # Leaf names look like "<taxid>.<protein id>"; the taxid gives the phylum.
    taxids = [int(leaf.split('.')[0]) for leaf in leaf_names]
    phyla  = eggNOG_lineage.loc[taxids, 'phylum'].values.astype(int)

    # Long-format table with one row per unordered pair of leaves.
    triu_indices    = np.triu_indices_from(matrix, k=1)
    df              = pd.DataFrame()
    df['phylum1']   = phyla[triu_indices[0]]
    df['phylum2']   = phyla[triu_indices[1]]
    df['sequence1'] = leaf_names[triu_indices[0]]
    df['sequence2'] = leaf_names[triu_indices[1]]
    df['distance']  = matrix[triu_indices]

    # Discard every pair involving phylum 1224
    # (presumably Proteobacteria in the NCBI taxonomy - confirm).
    df = df[(df.phylum1!=1224) & (df.phylum2!=1224)]

    intra_phylum_df = df[df.phylum1==df.phylum2].copy()
    inter_phylum_df = df[df.phylum1!=df.phylum2]

    # Inter-phylum pairs whose two phyla are both in {chloroflexi, cyano},
    # i.e. one sequence from each.
    cyano_VS_chloroflexi = inter_phylum_df.loc[(inter_phylum_df.phylum1.isin([200795, 1117])) & \
                                               (inter_phylum_df.phylum2.isin([200795, 1117])),
                                               'distance'].values

    # Reject the family unless each phylum is internally closer than the two
    # phyla are to each other.
    for phylum_taxid in (1117, 200795):
        intra_distances = intra_phylum_df.loc[intra_phylum_df.phylum1 == phylum_taxid,
                                              'distance'].values
        hypothesis      = mannwhitneyu(intra_distances, cyano_VS_chloroflexi, alternative='less')
        if hypothesis.pvalue > 0.05:
            return(None)

    # Computed from the intra-phylum rows directly; the previous version used
    # `df.distance` and relied on index alignment to get the same values.
    intra_phylum_df['inverse_dist'] = np.e**np.negative(intra_phylum_df.distance)

    graph  = ig.Graph.TupleList(edges=intra_phylum_df[['sequence1',
                                                       'sequence2',
                                                       'inverse_dist']].itertuples(index=False),
                                directed=False,
                                weights =True)
    clusters = graph.community_multilevel(weights='weight')

    # Sequence name -> cluster number, then annotate both ends of every pair.
    cluster_map = dict(zip(graph.vs['name'], clusters.membership))
    intra_phylum_df['sequence1_cluster'] = intra_phylum_df.sequence1.apply(lambda sequence_name: cluster_map[sequence_name])
    intra_phylum_df['sequence2_cluster'] = intra_phylum_df.sequence2.apply(lambda sequence_name: cluster_map[sequence_name])

    return(intra_phylum_df)

#     with open('chloroflexi_cyano_dists/%s.pkl' % group_id, 'wb') as out:
#         pkl.dump(cyano_VS_chloroflexi/np.median(monophyletic_intra_phylum_df.distance), out)
    
#     return(group_id)