In [60]:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas  as pd
import numpy   as np
import os
import re
import itertools
from scipy.spatial.distance import squareform
from collections import Counter
import ete3

%run /work/jupyter_notebooks/gene\ family\ distances/correlate_evolution.ipynb
%cd  /work/clusterEvo/new_tests/eggNOG/

/work/clusterEvo/new_tests/eggNOG


In [104]:
import plotly
import chart_studio.plotly as ptl
import plotly.graph_objects as go

# Credentials are read from the environment so that the API key is never
# stored in the notebook. A missing variable raises KeyError immediately.
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed in the notebook history and should be rotated.
ptl.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])

In [2]:
# Load the eggNOG group membership table. The TSV has no header; only columns
# 1-4 are read and given explicit names.
eggNOG_groups = pd.read_csv('2157_members.tsv',
                            sep='\t',
                            header=None,
                            usecols=[1,2,3,4],
                            names=['group_id', 'num_proteins', 'num_taxa', 'members'])

# `members` is a comma-separated list of "<taxid>.<gene>" entries. Collect the
# integer taxid of every entry (one per protein, so a taxon with several
# proteins in the group is listed several times).
# The pattern is a raw string: in a plain string `\d` and `\.` are invalid
# escape sequences, which newer Python versions warn about.
tmp           = eggNOG_groups.members.map(
    lambda cell: [int(taxid) for taxid in re.findall(r'(\d+)\.(?:[^,]+)', cell)]
)
tmp.name      = 'taxa'
eggNOG_groups = eggNOG_groups.join(tmp)

In [11]:
# Bounds (inclusive) on the number of taxa a group must span to be kept.
# Both are 10, i.e. only groups with exactly 10 taxa are selected; they are
# kept as two named values so the window can be widened in one place.
MIN_TAXA = 10
MAX_TAXA = 10

# Groups with exactly one more protein than taxa (num_proteins == num_taxa + 1)
# whose taxon count falls inside [MIN_TAXA, MAX_TAXA].
one_duplication = eggNOG_groups.query('(num_proteins == num_taxa + 1) & '
                                      '(num_taxa >= @MIN_TAXA) & '
                                      '(num_taxa <= @MAX_TAXA)')
print(one_duplication.shape)
one_duplication.head()

(31, 5)


Unnamed: 0,group_id,num_proteins,num_taxa,members,taxa
1392,arCOG00207,11,10,"1104324.P186_0784,1104324.P186_1686,1198449.AC...","[1104324, 1104324, 1198449, 178306, 272557, 41..."
1724,arCOG00599,11,10,"178306.PAE0153,178306.PAE1775,368408.Tpen_1003...","[178306, 178306, 368408, 384616, 397948, 41035..."
2260,arCOG01211,11,10,"1104324.P186_0847,1365176.N186_06385,178306.PA...","[1104324, 1365176, 178306, 368408, 368408, 391..."
3009,arCOG02069,11,10,"1202768.JROF01000004_gene3179,1220534.B655_117...","[1202768, 1220534, 1227453, 192952, 269797, 41..."
3449,arCOG02623,11,10,"1006006.Mcup_0813,273057.SSO1595,273063.STK_20...","[1006006, 273057, 273063, 330779, 399549, 4535..."


In [20]:
def taxa_jaccard(ids):
    """Return the Jaccard index between the taxon sets of two groups.

    `ids` holds two `group_id` values (positions 0 and 1). Each one is looked
    up in the global `one_duplication` table, its `taxa` entry is reduced to a
    set, and the result is |intersection| / |union| as a float.
    """
    def taxon_set(group_id):
        # Rows of the table that belong to this group; squeeze() unwraps the
        # single matching cell before it is turned into a set.
        belongs_to_group = one_duplication.group_id == group_id
        return set(one_duplication.loc[belongs_to_group, 'taxa'].squeeze())

    first_taxa  = taxon_set(ids[0])
    second_taxa = taxon_set(ids[1])

    shared   = first_taxa & second_taxa
    combined = first_taxa | second_taxa

    return len(shared) / len(combined)

In [25]:
# Jaccard index for every unordered pair of groups in `one_duplication`,
# stored as three parallel lists (first id, second id, index).
group_pairs = list(itertools.combinations(one_duplication.group_id.values, 2))

j_indices = [taxa_jaccard(pair) for pair in group_pairs]
group1    = [pair[0]            for pair in group_pairs]
group2    = [pair[1]            for pair in group_pairs]

In [27]:
# One row per pair of groups, built column-wise from the three parallel lists.
jaccard_df = pd.DataFrame({'group1':  group1,
                           'group2':  group2,
                           'jaccard': j_indices})

In [29]:
# Pairs of groups whose taxon sets have a Jaccard index of at least 0.7.
jaccard_df[jaccard_df.jaccard >= 0.7]

Unnamed: 0,group1,group2,jaccard
41,arCOG00599,arCOG05535,0.818182
294,arCOG05433,arCOG05535,0.818182


In [46]:
def get_duplicated_taxa(group_id):
    """Yield every taxid that occurs more than once among a group's members.

    The `members` string of `group_id` is read from the global
    `one_duplication` table. Members are comma-separated "<taxid>.<gene>"
    entries; the taxid is the run of digits found at the start of the string
    or right after a comma, up to the first dot.

    Yields:
        str: each taxid (as text, not int) counted more than once, in order
        of first appearance.
    """
    members = one_duplication.loc[one_duplication.group_id == group_id,
                                  'members'].squeeze()

    # Raw string: `\d` and `\.` are regex escapes, not string escapes.
    taxa = re.findall(r'(?:^|,)(\d+?)\.', members, re.M)

    for taxon, count in Counter(taxa).items():
        if count > 1:
            yield taxon

In [49]:
# First taxid that appears more than once among the members of arCOG05535
# (next() raises StopIteration if the group has no repeated taxid).
next(get_duplicated_taxa('arCOG05535'))

'178306'

In [50]:
# First taxid that appears more than once among the members of arCOG00599
# (next() raises StopIteration if the group has no repeated taxid).
next(get_duplicated_taxa('arCOG00599'))

'178306'

In [53]:
# First taxid that appears more than once among the members of arCOG05433
# (next() raises StopIteration if the group has no repeated taxid).
next(get_duplicated_taxa('arCOG05433'))

'178306'

In [54]:
# Read columns 1-3 of the headerless tree table, index it by group id, and
# keep (in the same order) only the rows for the groups in `one_duplication`.
eggNOG_trees = (
    pd.read_csv('2157_trees.tsv',
                sep='\t',
                header=None,
                names=['group_id', 'fast', 'tree'],
                usecols=[1, 2, 3],
                index_col=0)
      .reindex(index=one_duplication.group_id)
)

In [195]:
# Build one matrix per group from its stored tree string; the three groups
# are processed in order and unpacked into m1, m2 and m3.
m1, m2, m3 = (
    get_matrix_from_tree(ete3.Tree(eggNOG_trees.loc[tree_group, 'tree']))
    for tree_group in ('arCOG05535', 'arCOG00599', 'arCOG05433')
)

In [446]:
def _split_leaf_names(leaf_names):
    """Return a taxon/genome/gene table for "<genome>.<gene>" labels.

    `genome` is the leading run of digits before the first dot, `gene` is
    everything after that dot, and `taxon` is the unmodified label. A label
    that does not follow this layout raises AttributeError (no regex match).
    """
    rows = []
    for name in leaf_names:
        genome, gene = re.search(r'^(\d+?)\.(.*)$', name).groups()
        rows.append([name, genome, gene])

    return pd.DataFrame(columns=['taxon', 'genome', 'gene'],
                        data   =rows)

# Work on copies so m1/m2 stay untouched by later edits to the matrices.
matrix1 = m1.copy()
matrix2 = m2.copy()

taxa1 = _split_leaf_names(matrix1.index)
taxa2 = _split_leaf_names(matrix2.index)

# Only genomes present in both matrices are kept in the taxa tables.
shared_genomes = np.intersect1d(taxa1.genome.unique(),
                                taxa2.genome.unique())

taxa1 = taxa1[taxa1.genome.isin(shared_genomes)]
taxa2 = taxa2[taxa2.genome.isin(shared_genomes)]

In [447]:
# Number of entries each genome has in each taxa table, counted once before
# any row is added below.
taxa1_frequency = Counter(taxa1.genome)
taxa2_frequency = Counter(taxa2.genome)

for genome in shared_genomes:
    genome1_count = taxa1_frequency[genome]
    genome2_count = taxa2_frequency[genome]

    # A genome with several entries in table 1 gets its table-2 entries
    # duplicated (suffix ".<n>"), and the matching row/column of matrix2 is
    # copied under the new name, so both sides end up with the same number of
    # entries for that genome.
    if genome1_count > 1:
        for copy_number in range(genome1_count - 1):
            # Table-2 rows are visited in reverse order, while the table-1
            # branch below visits rows in forward order.
            # NOTE(review): presumably this makes the positional pairing of
            # rows cover the cross combinations when both tables hold several
            # copies of the same genome - confirm.
            for index, row in taxa2.iloc[::-1].query('genome == @genome').iterrows():
                row.taxon += f'.{copy_number}'
                # DataFrame.append no longer exists in pandas >= 2.0;
                # concat of a one-row frame is the equivalent operation.
                taxa2 = pd.concat([taxa2, row.to_frame().T], ignore_index=True)

                reference_name = re.sub(r'\.\d+$', '', row.taxon, flags=re.M)
                matrix2[    row.taxon] = matrix2[    reference_name]
                matrix2.loc[row.taxon] = matrix2.loc[reference_name]

    if genome2_count > 1:
        for copy_number in range(genome2_count - 1):
            for index, row in taxa1.query('genome == @genome').iterrows():
                row.taxon += f'.{copy_number}'
                taxa1 = pd.concat([taxa1, row.to_frame().T], ignore_index=True)

                reference_name = re.sub(r'\.\d+$', '', row.taxon, flags=re.M)
                matrix1[    row.taxon] = matrix1[    reference_name]
                matrix1.loc[row.taxon] = matrix1.loc[reference_name]

# Re-binding instead of sorting in place: taxa1/taxa2 may still be filtered
# slices of another frame at this point.
taxa1 = taxa1.sort_values('genome')
taxa2 = taxa2.sort_values('genome')

# Put both matrices in the row/column order of their taxa table (reindex
# returns a new object, so no explicit copy flag is needed).
matrix1 = matrix1.reindex(index  =taxa1.taxon,
                          columns=taxa1.taxon)
matrix2 = matrix2.reindex(index  =taxa2.taxon,
                          columns=taxa2.taxon)

In [603]:
# Values of the strict upper triangle of both matrices. The positions are
# computed from matrix1 and applied to matrix2 as well, so entry i of each
# vector refers to the same (row, column) position.
triu_indices = np.triu_indices_from(matrix1, k=1)
condensed1   = matrix1.values[triu_indices]
condensed2   = matrix2.values[triu_indices]

# Genome (column 1 of the taxa tables) at the row end and at the column end
# of every compared position, for each matrix.
x_genome_row = taxa1.iloc[triu_indices[0], 1].values
x_genome_col = taxa1.iloc[triu_indices[1], 1].values
y_genome_row = taxa2.iloc[triu_indices[0], 1].values
y_genome_col = taxa2.iloc[triu_indices[1], 1].values

#
# ignore comparisons within the same species: weight 0 when both ends share a
# genome, weight 1 otherwise
matrix1_distinct_genomes = (x_genome_row != x_genome_col).astype(int)
matrix2_distinct_genomes = (y_genome_row != y_genome_col).astype(int)

initial_slope = np.std(condensed2) / np.std(condensed1)

model = Model(line)
data  = Data(condensed1,
             condensed2,
             wd=matrix1_distinct_genomes,
             we=matrix2_distinct_genomes)
odr   = ODR(data, model, beta0=[initial_slope])

regression = odr.run()

# One row per compared position: labels and genomes of both ends in each
# matrix, plus the absolute x (delta) and y (eps) residuals of the fit.
residual_df = pd.DataFrame({
    'x_taxon1':   matrix1.index[triu_indices[0]].values,
    'x_genome1':  x_genome_row,
    'x_taxon2':   matrix1.index[triu_indices[1]].values,
    'x_genome2':  x_genome_col,

    'y_taxon1':   matrix2.index[triu_indices[0]].values,
    'y_genome1':  y_genome_row,
    'y_taxon2':   matrix2.index[triu_indices[1]].values,
    'y_genome2':  y_genome_col,

    'x_residual': abs(regression.delta),
    'y_residual': abs(regression.eps),
})
residual_df['residual_total'] = residual_df.x_residual + residual_df.y_residual

# Discard positions that compare a genome with itself in either matrix.
within_genomes = ((residual_df.x_genome1 == residual_df.x_genome2) |
                  (residual_df.y_genome1 == residual_df.y_genome2))

residual_df = residual_df.drop(index=residual_df.index[within_genomes])

In [613]:
# Entries of genome 178306 in each taxa table (including the ".<n>" copies).
matrix1_homologs = taxa1.loc[taxa1.genome=="178306",
                           'taxon'].values
matrix2_homologs = taxa2.loc[taxa2.genome=="178306",
                           'taxon'].values

# Sum of residuals for every (matrix1 entry, matrix2 entry) combination that
# has at least one row in residual_df. Records are collected in a list and
# turned into a frame once: DataFrame.append no longer exists in pandas >= 2.0
# and growing a frame row by row is quadratic.
combination_records = []
for homolog1, homolog2 in itertools.product(matrix1_homologs,
                                            matrix2_homologs):
    tmp_df = residual_df.query('(x_taxon1 == @homolog1 | x_taxon2 == @homolog1) &'
                               '(y_taxon1 == @homolog2 | y_taxon2 == @homolog2)')

    if not tmp_df.shape[0]:
        continue

    # Strip the trailing ".<n>" copy suffix so copies map back to the original
    # name. `\d+` matches the pattern used where the suffix is attached
    # (a single `\d` would leave two-digit suffixes in place).
    combination_records.append({
        'homolog1':     re.sub(r'\.\d+$', '', homolog1, flags=re.M),
        'homolog2':     re.sub(r'\.\d+$', '', homolog2, flags=re.M),
        'residual_sum': tmp_df.residual_total.sum(),
    })

homolog_combinations = pd.DataFrame(combination_records,
                                    columns=['homolog1',
                                             'homolog2',
                                             'residual_sum'])
homolog_combinations = homolog_combinations.sort_values('residual_sum')

# Greedy matching: repeatedly take the remaining combination with the smallest
# residual sum, then discard every combination that reuses either member.
best_pairs = set()
while homolog_combinations.shape[0]:
    first_row = homolog_combinations.iloc[0]
    best_pairs.add((first_row.homolog1, first_row.homolog2))
    homolog_combinations = homolog_combinations[
        (homolog_combinations.homolog1 != first_row.homolog1) &
        (homolog_combinations.homolog2 != first_row.homolog2)
    ].copy()

In [None]:
def match_copies(matrix1, matrix2, taxa1, taxa2, genome="178306"):
    """Pair up the copies of one genome between two matrices.

    The strict upper triangles of `matrix1` and `matrix2` are regressed
    against each other, and the copies of `genome` are matched greedily by
    increasing sum of absolute residuals.

    `Model`, `Data`, `ODR` and `line` are not defined in this cell.
    NOTE(review): presumably they come from the notebook loaded with `%run`
    (scipy.odr plus a linear model function) - confirm.

    Parameters:
        matrix1, matrix2: square DataFrames. The upper-triangle positions are
            computed from `matrix1` and applied to `matrix2` as well, so rows
            are paired by position.
        taxa1, taxa2: tables with a `taxon` and a `genome` column, row-aligned
            with the matrices; the genome is read from column position 1.
        genome: genome identifier whose copies are matched. Defaults to
            "178306", the value that used to be hard-coded.

    Returns:
        set of (homolog1, homolog2) tuples, with any trailing ".<n>" copy
        suffix removed from the names. Empty when no combination has
        residual rows.
    """
    triu_indices = np.triu_indices_from(matrix1, k=1)
    condensed1   = matrix1.values[triu_indices]
    condensed2   = matrix2.values[triu_indices]

    #
    # ignore comparisons within the same species: weight 0 when both ends of a
    # compared position share a genome, weight 1 otherwise
    matrix1_distinct_genomes = (taxa1.iloc[triu_indices[0], 1].values !=
                                taxa1.iloc[triu_indices[1], 1].values
                               ).astype(int)
    matrix2_distinct_genomes = (taxa2.iloc[triu_indices[0], 1].values !=
                                taxa2.iloc[triu_indices[1], 1].values
                               ).astype(int)

    model = Model(line)
    data  = Data(condensed1,
                 condensed2,
                 wd=matrix1_distinct_genomes,
                 we=matrix2_distinct_genomes
    )
    odr = ODR(data,
              model,
              beta0=[np.std(condensed2) /
                     np.std(condensed1)]
             )

    regression = odr.run()

    # One row per compared position: labels and genomes of both ends in each
    # matrix, plus the absolute x (delta) and y (eps) residuals of the fit.
    residual_df = pd.DataFrame(columns=['x_taxon1',   'x_genome1',
                                        'x_taxon2',   'x_genome2',

                                        'y_taxon1',   'y_genome1',
                                        'y_taxon2',   'y_genome2',

                                        'x_residual', 'y_residual'],
                               data   =zip(matrix1.index[triu_indices[0]],
                                           taxa1.iloc[triu_indices[0], 1].values,
                                           matrix1.index[triu_indices[1]],
                                           taxa1.iloc[triu_indices[1], 1].values,

                                           matrix2.index[triu_indices[0]],
                                           taxa2.iloc[triu_indices[0], 1].values,
                                           matrix2.index[triu_indices[1]],
                                           taxa2.iloc[triu_indices[1], 1].values,

                                           abs(regression.delta),
                                           abs(regression.eps))
                              )
    residual_df['residual_total'] = residual_df.x_residual + residual_df.y_residual

    within_genomes = ((residual_df.x_genome1 == residual_df.x_genome2) |
                      (residual_df.y_genome1 == residual_df.y_genome2))

    residual_df = residual_df.drop(index=residual_df.index[within_genomes])

    matrix1_homologs = taxa1.loc[taxa1.genome == genome, 'taxon'].values
    matrix2_homologs = taxa2.loc[taxa2.genome == genome, 'taxon'].values

    # Records are collected in a list and turned into a frame once:
    # DataFrame.append no longer exists in pandas >= 2.0.
    combination_records = []
    for homolog1, homolog2 in itertools.product(matrix1_homologs,
                                                matrix2_homologs):
        tmp_df = residual_df.query('(x_taxon1 == @homolog1 | x_taxon2 == @homolog1) &'
                                   '(y_taxon1 == @homolog2 | y_taxon2 == @homolog2)')

        if not tmp_df.shape[0]:
            continue

        # Strip the trailing ".<n>" copy suffix (any number of digits).
        combination_records.append({
            'homolog1':     re.sub(r'\.\d+$', '', homolog1, flags=re.M),
            'homolog2':     re.sub(r'\.\d+$', '', homolog2, flags=re.M),
            'residual_sum': tmp_df.residual_total.sum(),
        })

    homolog_combinations = pd.DataFrame(combination_records,
                                        columns=['homolog1',
                                                 'homolog2',
                                                 'residual_sum'])
    homolog_combinations = homolog_combinations.sort_values('residual_sum')

    # Greedy matching: take the remaining combination with the smallest
    # residual sum, then discard every combination reusing either member.
    best_pairs = set()
    while homolog_combinations.shape[0]:
        first_row = homolog_combinations.iloc[0]
        best_pairs.add((first_row.homolog1, first_row.homolog2))
        homolog_combinations = homolog_combinations[
            (homolog_combinations.homolog1 != first_row.homolog1) &
            (homolog_combinations.homolog2 != first_row.homolog2)
        ].copy()

    # The original body computed the pairs but never handed them back.
    return best_pairs

In [455]:
# Upper-triangle positions of matrix1, applied to both matrices.
triu_indices = np.triu_indices_from(matrix1, k=1)
condensed1   = matrix1.values[triu_indices]
condensed2   = matrix2.values[triu_indices]

# "<row label> VS <column label>" for every upper-triangle position.
matrix1_label = []
matrix2_label = []
for row_position, column_position in zip(triu_indices[0], triu_indices[1]):
    matrix1_label.append(
        f'{matrix1.index[row_position]} VS {matrix1.index[column_position]}'
    )
    matrix2_label.append(
        f'{matrix2.index[row_position]} VS {matrix2.index[column_position]}'
    )

# Hover text: the matrix1 label above the matrix2 label (HTML line break).
combined_labels = []
for first_label, second_label in zip(matrix1_label, matrix2_label):
    combined_labels.append(f'{first_label}<br>{second_label}')

# Red for any position that involves genome 178306, black otherwise.
node_color = []
for label in combined_labels:
    if '178306' in label:
        node_color.append('red')
    else:
        node_color.append('black')

In [456]:
def _marker_style(label, both_ends_style, gene_styles, default_style):
    """Pick the marker style for one "<a> VS <b>" label.

    A label mentioning 178306 more than once gets `both_ends_style`; otherwise
    the first gene in `gene_styles` contained in the label decides; a label
    matching nothing gets `default_style`.
    """
    if label.count('178306') > 1:
        return both_ends_style
    for gene_name, style in gene_styles:
        if gene_name in label:
            return style
    return default_style

# Colour is driven by the matrix1 labels, symbol by the matrix2 labels.
# Rule order matters: the first matching gene name wins.
color_rules = (('178306.PAE0154',  'red'),
               ('178306.PAE1775a', 'green'))
shape_rules = (('178306.PAE0153',  'cross'),
               ('178306.PAE1775',  'triangle-up'))

node_color = [_marker_style(label, 'blue',    color_rules, 'black')
              for label in matrix1_label]
node_shape = [_marker_style(label, 'diamond', shape_rules, 'circle')
              for label in matrix2_label]



In [457]:
# Scatter of matrix1 values (x) against matrix2 values (y), one marker per
# upper-triangle position, with the combined labels as hover text.
node_x    = condensed1.copy()
node_y    = condensed2.copy()
node_text = combined_labels

marker_style = {'symbol':     node_shape,
                'color':      node_color,
                'size':       10,
                'line_width': 1,
                'line_color': 'white'}

node_trace = go.Scatter(x=node_x,
                        y=node_y,
                        text=node_text,
                        mode='markers',
                        name='Proteins',
                        hoverinfo='text',
                        opacity=0.7,
                        marker=marker_style)

In [458]:
# Single-trace figure (only the scatter of nodes is drawn). The title size is
# set through layout.title.font rather than the deprecated `titlefont`
# attribute. Axis options are left at the template defaults.
fig = go.Figure(data=[node_trace],
                layout=go.Layout(template          ='simple_white',
                                 title             =dict(text='Test',
                                                         font=dict(size=16)),
                                 showlegend        =True,
                                 legend_orientation='h',
                                 hovermode         ='closest',
                                 margin            =dict(b=20,
                                                         l=5,
                                                         r=5,
                                                         t=40),
                                )
                )

# Write a standalone HTML file without opening a browser.
# NOTE(review): the target is an absolute path on one user's machine; consider
# a path relative to the working directory.
plotly.offline.plot(fig, filename='/Users/thiberio/test.html', config={'scrollZoom': True}, auto_open=False)

'/Users/thiberio/test.html'

In [428]:
# Display the whole matrix1 table (rows and columns are labelled by taxon).
matrix1

taxon,178306.PAE0154,178306.PAE1775a,384616.Pisl_1556,397948.Cmaq_0962,410359.Pcal_1246,444157.Tneu_0584,572478.Vdis_1352,698757.Pogu_1576,768679.TTX_1300,985053.VMUT_2172
taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
178306.PAE0154,0.0,2.412437,2.558183,0.943222,2.288786,2.567279,2.187575,2.445332,2.526642,2.267118
178306.PAE1775a,2.412437,0.0,1.012768,3.022081,0.743371,1.021864,1.421092,0.265907,0.981227,1.500635
384616.Pisl_1556,2.558183,1.012768,0.0,3.167827,0.848433,0.130635,1.566838,1.045663,0.395001,1.646381
397948.Cmaq_0962,0.943222,3.022081,3.167827,0.0,2.89843,3.176923,2.797219,3.054976,3.136286,2.876762
410359.Pcal_1246,2.288786,0.743371,0.848433,2.89843,0.0,0.85753,1.297441,0.776266,0.816892,1.376984
444157.Tneu_0584,2.567279,1.021864,0.130635,3.176923,0.85753,0.0,1.575934,1.054759,0.404098,1.655477
572478.Vdis_1352,2.187575,1.421092,1.566838,2.797219,1.297441,1.575935,0.0,1.453987,1.535297,0.379897
698757.Pogu_1576,2.445332,0.265907,1.045663,3.054976,0.776266,1.054759,1.453987,0.0,1.014122,1.53353
768679.TTX_1300,2.526642,0.981227,0.395001,3.136286,0.816892,0.404098,1.535297,1.014122,0.0,1.61484
985053.VMUT_2172,2.267118,1.500635,1.646381,2.876762,1.376984,1.655477,0.379897,1.53353,1.61484,0.0
