In [1]:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas  as pd
import numpy   as np
import os
import re
import itertools
from scipy.spatial.distance import squareform
from collections import Counter
import ete3

# Execute the helper notebook inside this kernel. Later cells call
# get_matrix_from_tree and balance_matrices, which are not defined anywhere in
# this notebook -- presumably they come from the helper notebook (confirm).
%run /work/jupyter_notebooks/gene\ family\ distances/correlate_evolution.ipynb
# Change the working directory; the data files read by later cells are given as
# bare file names and are therefore resolved relative to this directory.
%cd  /work/clusterEvo/new_tests/eggNOG/

/work/clusterEvo/new_tests/eggNOG


In [2]:
import plotly
import chart_studio.plotly as ptl
import plotly.graph_objects as go

# Chart Studio credentials are read from the environment so that the API key is
# never stored in the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before
# starting the kernel; a missing variable raises KeyError here.
# NOTE(review): an API key used to be hard-coded on this line -- rotate it.
ptl.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])

In [3]:
# Load the group membership table. The file has no header row; columns 1-4 are
# read and named explicitly.
eggNOG_groups = pd.read_csv('2157_members.tsv',
                            sep='\t',
                            header=None,
                            usecols=[1,2,3,4],
                            names=['group_id', 'num_proteins', 'num_taxa', 'members'])

# Each entry of `members` is a comma-separated list of "<taxid>.<protein>"
# items; the pattern captures the numeric taxid in front of the first dot of
# every item. It is a raw string because "\d" and "\." inside a normal string
# literal are invalid escape sequences and trigger a warning on recent Python.
taxid_pattern = re.compile(r'(\d+)\.(?:[^,]+)')

# Add a `taxa` column: one list of integer taxids per group, with one entry per
# member protein (so a taxid is repeated when a taxon has several proteins).
eggNOG_groups = eggNOG_groups.assign(
    taxa=eggNOG_groups.members.map(
        lambda cell: [int(taxid) for taxid in taxid_pattern.findall(cell)]
    )
)

In [4]:
# Inclusive bounds on the number of taxa a group must contain. Both are 10, so
# only groups with exactly 10 taxa pass; edit these two values to widen the range.
MIN_TAXA = 10
MAX_TAXA = 10

# Keep groups that have exactly one protein more than they have taxa and whose
# taxon count lies within the bounds above. `@NAME` makes DataFrame.query read
# the Python variable of that name.
one_duplication = eggNOG_groups.query('(num_proteins == num_taxa + 1) & '
                                      '(num_taxa >= @MIN_TAXA) & '
                                      '(num_taxa <= @MAX_TAXA)')
print(one_duplication.shape)
one_duplication.head()

(31, 5)


Unnamed: 0,group_id,num_proteins,num_taxa,members,taxa
1392,arCOG00207,11,10,"1104324.P186_0784,1104324.P186_1686,1198449.AC...","[1104324, 1104324, 1198449, 178306, 272557, 41..."
1724,arCOG00599,11,10,"178306.PAE0153,178306.PAE1775,368408.Tpen_1003...","[178306, 178306, 368408, 384616, 397948, 41035..."
2260,arCOG01211,11,10,"1104324.P186_0847,1365176.N186_06385,178306.PA...","[1104324, 1365176, 178306, 368408, 368408, 391..."
3009,arCOG02069,11,10,"1202768.JROF01000004_gene3179,1220534.B655_117...","[1202768, 1220534, 1227453, 192952, 269797, 41..."
3449,arCOG02623,11,10,"1006006.Mcup_0813,273057.SSO1595,273063.STK_20...","[1006006, 273057, 273063, 330779, 399549, 4535..."


In [5]:
def taxa_jaccard(ids, groups=None):
    """Return the Jaccard index between the taxon sets of two groups.

    Parameters
    ----------
    ids : sequence
        Two group identifiers; ``ids[0]`` and ``ids[1]`` are looked up in the
        ``group_id`` column of ``groups``.
    groups : pandas.DataFrame, optional
        Table with a ``group_id`` column and a ``taxa`` column holding an
        iterable of taxon ids per row. Defaults to the notebook-level
        ``one_duplication`` table.

    Returns
    -------
    float
        ``len(A & B) / len(A | B)`` for the two taxon sets, or ``0.0`` when
        both sets are empty.

    Raises
    ------
    KeyError
        If an identifier is not present in ``groups.group_id``.
    ValueError
        If an identifier matches more than one row.
    """
    if groups is None:
        groups = one_duplication

    def _taxon_set(group_id):
        # Select the `taxa` cell of the single row belonging to this group.
        matches = groups.loc[groups.group_id == group_id, 'taxa']
        if matches.empty:
            raise KeyError(f'group_id {group_id!r} not found')
        if len(matches) > 1:
            raise ValueError(f'group_id {group_id!r} matches {len(matches)} rows')
        # set() collapses taxa that occur several times within the same group.
        return set(matches.iloc[0])

    taxon_set1 = _taxon_set(ids[0])
    taxon_set2 = _taxon_set(ids[1])

    union = taxon_set1 | taxon_set2
    if not union:
        # Both groups have no taxa: avoid dividing by zero.
        return 0.0

    return len(taxon_set1 & taxon_set2) / len(union)

In [6]:
# Enumerate every unordered pair of group ids once, then derive the three
# parallel lists (first id, second id, Jaccard index) from that pair list.
group_pairs = list(itertools.combinations(one_duplication.group_id.values, 2))

group1    = [pair[0] for pair in group_pairs]
group2    = [pair[1] for pair in group_pairs]
j_indices = [taxa_jaccard(pair) for pair in group_pairs]

In [7]:
# One row per group pair: the two group ids and the Jaccard index of their taxa.
jaccard_df = pd.DataFrame({'group1':  group1,
                           'group2':  group2,
                           'jaccard': j_indices})

In [8]:
# Display the group pairs whose taxon sets have a Jaccard index of at least 0.7.
jaccard_df.query('jaccard >= 0.7')

Unnamed: 0,group1,group2,jaccard
41,arCOG00599,arCOG05535,0.818182
294,arCOG05433,arCOG05535,0.818182


In [9]:
# Read the headerless tree table (columns 1-3, indexed by group id) and, in the
# same expression, keep only the rows for the groups in `one_duplication`, in
# that table's order.
eggNOG_trees = (
    pd.read_csv('2157_trees.tsv',
                sep='\t',
                header=None,
                usecols=[1,2,3],
                index_col=0,
                names=['group_id', 'fast', 'tree'])
      .reindex(index=one_duplication.group_id)
)

In [10]:
# Build one matrix per group of interest: parse the group's tree string with
# ete3 and pass the tree to get_matrix_from_tree. The generator is consumed in
# order, so m1, m2 and m3 correspond to the ids in the order listed.
m1, m2, m3 = (
    get_matrix_from_tree(ete3.Tree(eggNOG_trees.loc[group_id, 'tree']))
    for group_id in ('arCOG05535', 'arCOG00599', 'arCOG05433')
)

In [40]:
# Re-executes the same helper notebook that the first cell already ran.
# NOTE(review): the execution count jumps from 10 to 40 here, so this was
# presumably re-run to pick up edited helper definitions -- confirm whether the
# cell is still needed when running the notebook top to bottom.
%run /work/jupyter_notebooks/gene\ family\ distances/correlate_evolution.ipynb

In [54]:
# Work on fresh objects so m1 and m2 stay untouched. For the second matrix the
# row and the column labelled with `dropped_gene` are removed; drop() returns a
# new object rather than modifying m2.
dropped_gene = '178306.PAE1775'

matrix1 = m1.copy()
matrix2 = m2.drop(index=dropped_gene, columns=dropped_gene)

In [55]:
# balance_matrices is not defined in this notebook (presumably it comes from the
# helper notebook run earlier -- confirm). Both matrices are replaced by the
# versions it returns; the code below indexes matrix2 with positions computed
# from matrix1, so the returned matrices are assumed to share shape and ordering.
# NOTE(review): `taxax2` looks like a typo for `taxa2`; the name is left as is
# in case other cells refer to it.
matrix1, taxa1, matrix2, taxax2 = balance_matrices(matrix1, matrix2, gene_sep='.')

In [56]:
def _pair_labels(names, rows, cols):
    """Return one 'A VS B' label per (row, col) position pair of `names`."""
    return [f'{index1} VS {index2}' for index1, index2 in zip(names[rows], names[cols])]


# Positions of the strict upper triangle (diagonal excluded) of matrix1; the
# same positions are used to read matrix2.
triu_indices           = np.triu_indices_from(matrix1, k=1)
upper_rows, upper_cols = triu_indices

condensed1 = matrix1.values[upper_rows, upper_cols]
condensed2 = matrix2.values[upper_rows, upper_cols]

# Human-readable label for every upper-triangle cell of each matrix.
matrix1_label = _pair_labels(matrix1.index, upper_rows, upper_cols)
matrix2_label = _pair_labels(matrix2.index, upper_rows, upper_cols)

# Both labels of a point joined by an HTML line break.
combined_labels = []
for label1, label2 in zip(matrix1_label, matrix2_label):
    combined_labels.append(f'{label1}<br>{label2}')

# Red for any point whose combined label mentions '178306', black otherwise.
node_color = []
for label in combined_labels:
    node_color.append('red' if '178306' in label else 'black')

In [57]:
def _label_styles(labels, first_gene, second_gene, styles):
    """Pick one style per label.

    `styles` is (both, first, second, default): `both` when '178306' occurs
    more than once in the label, otherwise `first` if `first_gene` is a
    substring, otherwise `second` if `second_gene` is a substring, otherwise
    `default`.
    """
    both, first, second, default = styles
    chosen = []
    for label in labels:
        if label.count('178306') > 1:
            chosen.append(both)
        elif first_gene in label:
            chosen.append(first)
        elif second_gene in label:
            chosen.append(second)
        else:
            chosen.append(default)
    return chosen


# Marker colour is driven by the matrix1 labels, marker shape by the matrix2 labels.
node_color = _label_styles(matrix1_label,
                           '178306.PAE0154',
                           '178306.PAE1775a',
                           ('blue', 'red', 'green', 'black'))

node_shape = _label_styles(matrix2_label,
                           '178306.PAE0153',
                           '178306.PAE1775',
                           ('diamond', 'cross', 'triangle-up', 'circle'))


In [58]:
# Scatter coordinates: matrix1 values on x, matrix2 values on y, one point per
# upper-triangle cell. Copies are taken so the condensed arrays stay independent.
node_x    = condensed1.copy()
node_y    = condensed2.copy()
node_text = combined_labels

# Per-point symbol and colour, with a thin white outline around every marker.
marker_style = {
    'symbol':     node_shape,
    'color':      node_color,
    'size':       10,
    'line_width': 1,
    'line_color': 'white',
}

# Markers only; hovering shows just the text label of the point.
node_trace = go.Scatter(x=node_x,
                        y=node_y,
                        text=node_text,
                        mode='markers',
                        name='Proteins',
                        hoverinfo='text',
                        opacity=0.7,
                        marker=marker_style)

In [59]:
# Figure with the single scatter trace. The title font size is given through
# title.font.size because the `titlefont` layout shorthand is deprecated.
fig = go.Figure(data=[node_trace],
                layout=go.Layout(template          ='simple_white',
                                 title             =dict(text='Test',
                                                         font=dict(size=16)),
                                 showlegend        =True,
                                 legend_orientation='h',
                                 hovermode         ='closest',
                                 margin            =dict(b=20,
                                                         l=5,
                                                         r=5,
                                                         t=40),
                                )
                )

# Write a standalone HTML file (scroll-wheel zoom enabled) without opening a
# browser; the call returns the path of the file it wrote.
# NOTE(review): the output path is an absolute, user-specific location --
# consider making it configurable.
plotly.offline.plot(fig, filename='/Users/thiberio/test.html', config={'scrollZoom': True}, auto_open=False)

'/Users/thiberio/test.html'