In [17]:
from scipy.odr              import Model, Data, RealData, ODR
from scipy.stats            import linregress
from scipy.optimize         import curve_fit
from scipy.spatial.distance import squareform
from matplotlib             import pyplot as plt
from sklearn.linear_model   import HuberRegressor
from copy                   import deepcopy
from collections            import Counter
from scipy.stats            import pearsonr
from io import BytesIO

import igraph     as ig
import numpy      as np
import seaborn    as sns
import pandas     as pd
import ipywidgets as widgets

import sys
import random
import os
import subprocess
import re
import ete3

In [2]:
class cd:
    """
    Context manager for changing the current working directory
    """
    def __init__(self, newPath):
        self.newPath = os.path.expanduser(newPath)

    def __enter__(self):
        self.savedPath = os.getcwd()
        os.chdir(self.newPath)

    def __exit__(self, etype, value, traceback):
        os.chdir(self.savedPath)

In [3]:
def cles(lessers, greaters):
    """Common-Language Effect Size
    Probability that a random draw from `greater` is in fact greater
    than a random draw from `lesser`.
    Args:
      lesser, greater: Iterables of comparables.
      
      https://github.com/ajschumacher/cles/blob/master/cles.py
    """
    if len(lessers) == 0 and len(greaters) == 0:
        raise ValueError('At least one argument must be non-empty')
    # These values are a bit arbitrary, but make some sense.
    # (It might be appropriate to warn for these cases.)
    if len(lessers) == 0:
        return 1
    if len(greaters) == 0:
        return 0
    numerator = 0
    lessers, greaters = sorted(lessers), sorted(greaters)
    lesser_index = 0
    for greater in greaters:
        while lesser_index < len(lessers) and lessers[lesser_index] < greater:
            lesser_index += 1
        numerator += lesser_index  # the count less than the greater
    denominator = len(lessers) * len(greaters)
    return float(numerator) / denominator

#### Base linear model

In [4]:
def line(x, slope):
    """Basic linear regression 'model'"""
    return (slope * x) + 0

#### Basic wODR function

In [6]:
def run_odr(x, y, x_weights, y_weights):
    mod = Model(line)
    dat = Data(x, 
               y, 
               wd=x_weights, 
               we=y_weights
    )
    odr = ODR(dat, 
              mod,
              beta0=[np.std(y)/np.std(x)])
    return(odr.run())

#### Preliminary wODR weight estimation

In [5]:
def estimate_weights(x, y, weight_estimation='gm'):
    if weight_estimation == 'gm':
        slope = np.std(y)/np.std(x)
        x_res = abs(x - line(y, 
                             slope))
        y_res = abs(y - line(x, 
                             slope))

    elif weight_estimation == 'huber':
        huber_xy  = HuberRegressor(fit_intercept=False).fit(x.reshape(-1, 1), y)
        huber_yx  = HuberRegressor(fit_intercept=False).fit(y.reshape(-1, 1), x)

        y_res     = abs(y - line(x, 
                                 huber_xy.coef_))

        x_res     = abs(x - line(y, 
                                 huber_yx.coef_))
        
    elif weight_estimation == 'ols':
        xy_params = curve_fit(line, x, y)
        y_res     = abs(y - line(x, 
                                 xy_params[0]))
        
        yx_params = curve_fit(line, y, x)
        x_res     = abs(x - line(y, 
                                 yx_params[0]))
    else:
        raise Exception('weight_estimation must be "gm", "huber", or "ols"')

    #
    # if residuals are equal do zero it drives the weight to infinity,
    #     and it is good practice not weigh things infinitely
    x_res[x_res==0] = 1e-10
    y_res[y_res==0] = 1e-10
    return(1/abs(x_res), 
           1/abs(y_res))

#### Load data from ".mldist" file

In [7]:
def load_matrix(aln_file=None):
        
    dist_matrix = pd.read_csv(aln_file, 
                              delim_whitespace = True, 
                              skiprows         = 1, 
                              header           = None,
                              index_col        = 0)
    dist_matrix.columns = dist_matrix.index
    
    return(dist_matrix)

#### Alternativelly, load pairwise distances from newick file

In [8]:
def get_matrix_from_tree(newick_txt=None):
    tree = ete3.Tree(newick_txt, format=1)

    leaf_names = tree.get_leaf_names()
    for count, node in enumerate(tree.traverse()):
        if not node.is_leaf():
            node.name = 'node_%i' % count

    edges = []
    for node in tree.traverse():
        if not node.is_leaf():
            for child in node.get_children():
                edges.append((node.name,
                              child.name,
                              child.dist))

    dag  = ig.Graph.TupleList(edges     =tuple(edges), 
                              directed  =False,
                              edge_attrs=['weight']
                             )
    
    patristic_distances     = np.array(dag.shortest_paths(source=leaf_names, 
                                                          target=leaf_names, 
                                                          weights='weight'))
                                       
    np.fill_diagonal(patristic_distances, 0.0)
    
    dist_matrix = pd.DataFrame(index  =leaf_names, 
                               columns=leaf_names, 
                               data   =patristic_distances)
    return(dist_matrix)

#### Match possibly co-evolving genes within a genome by looking for pairs minimzing wODR residuals

In [9]:
def match_copies(matrix1, matrix2, taxa1, taxa2):
    
    all_taxon_pairs           = pd.DataFrame()
    all_taxon_pairs['taxon1'] = [re.sub('\|\d$', '', taxon, flags=re.M)
                                for taxon in taxa1.taxon]
    all_taxon_pairs['taxon2'] = [re.sub('\|\d$', '', taxon, flags=re.M)
                                for taxon in taxa2.taxon]

    triu_indices = np.triu_indices_from(matrix1, k=1)
    condensed1   = matrix1.values[triu_indices]
    condensed2   = matrix2.values[triu_indices]

    model = Model(line)
    data  = Data(condensed1, 
                 condensed2)
    odr   = ODR(data, 
                model,
                beta0=[np.std(condensed2) /
                       np.std(condensed1)]
               )

    regression = odr.run()

    residual_df = pd.DataFrame(columns=['x_taxon1',   'x_genome1', 
                                        'x_taxon2',   'x_genome2', 

                                        'y_taxon1',   'y_genome1', 
                                        'y_taxon2',   'y_genome2', 

                                        'x_residual', 'y_residual'],
                               data=zip(matrix1.index[triu_indices[0]],        #x_taxon1
                                        taxa1.iloc[triu_indices[0], 1].values, #x_genome1
                                        matrix1.index[triu_indices[1]],        #x_taxon2
                                        taxa1.iloc[triu_indices[1], 1].values, #x_genome2

                                        matrix2.index[triu_indices[0]],        #y_taxon1
                                        taxa2.iloc[triu_indices[0], 1].values, #y_genome1
                                        matrix2.index[triu_indices[1]],        #y_taxon2
                                        taxa2.iloc[triu_indices[1], 1].values, #y_genome2

                                        abs(regression.delta),                 #x_residual
                                        abs(regression.eps))                   #y_residual
                              )
    residual_df['combined_residual'] = residual_df.x_residual + residual_df.y_residual

    within_genomes = ((residual_df.x_genome1 == residual_df.x_genome2) | 
                      (residual_df.y_genome1 == residual_df.y_genome2))

    residual_df.drop(index  =residual_df.index[within_genomes], 
                     inplace=True)
    
    for genome in taxa1.genome[taxa1.genome.duplicated()].unique():
    
        matrix1_homologs = taxa1.loc[taxa1.genome==genome, 
                                     'taxon'].values
        matrix2_homologs = taxa2.loc[taxa2.genome==genome, 
                                     'taxon'].values

        homolog_combinations = pd.DataFrame(columns=['homolog1', 
                                                     'homolog2', 
                                                     'residual_sum'])
        for homolog1, homolog2 in itertools.product(matrix1_homologs,
                                                    matrix2_homologs):
            tmp_df = residual_df.query('(x_taxon1 == @homolog1 | x_taxon2 == @homolog1) &'
                                       '(y_taxon1 == @homolog2 | y_taxon2 == @homolog2)')

            if not tmp_df.shape[0]:
                continue

            #
            # remove "|<num>" sufix from taxon names to obtain original name
            homolog1 = re.sub('\|\d$', 
                              '',
                              homolog1, 
                              flags=re.M)
            homolog2 = re.sub('\|\d$',
                              '', 
                              homolog2, 
                              flags=re.M)

            # add all residuals related to each pair of possibly co-evolving genes
            #     within a single genome to a dataframe
            homolog_combinations = homolog_combinations.append(
                pd.Series(index=['homolog1', 
                                 'homolog2', 
                                 'residual_sum'],
                          data =[homolog1, 
                                 homolog2, 
                                 tmp_df.combined_residual.sum()]), 
                ignore_index=True
            )

        homolog_combinations.sort_values('residual_sum', inplace=True)
        best_pairs = set()
        while homolog_combinations.shape[0]:
            first_row = homolog_combinations.iloc[0]
            best_pairs.add((first_row.homolog1, first_row.homolog2))
            homolog_combinations = homolog_combinations.query(f'(homolog1 != "{first_row.homolog1}") & '
                                                              f'(homolog2 != "{first_row.homolog2}")').copy()
            
        # drop all gene combinations where one is not each other's best pairing
        for homolog1, homolog2 in best_pairs:
            indices_to_drop = all_taxon_pairs.query(
                '(taxon1 == @homolog1 & taxon2 != @homolog2) |'
                '(taxon1 != @homolog1 & taxon2 == @homolog2)'
            ).index

            all_taxon_pairs.drop(index=indices_to_drop, 
                                inplace=True)

            taxa1.drop(index  =indices_to_drop, 
                       inplace=True)
            taxa2.drop(index  =indices_to_drop, 
                       inplace=True)
    
    matrix1 = matrix1.reindex(index  =taxa1.taxon, 
                              columns=taxa1.taxon, 
                              copy   =True)
    matrix2 = matrix2.reindex(index  =taxa2.taxon, 
                              columns=taxa2.taxon, 
                              copy   =True)
    
    return(matrix1, taxa1, matrix2, taxa2)

#### Balance distance matrices, duplicate rows/columns to reflect multiples copies in the compared gene families

In [46]:
def balance_matrices(matrix1, matrix2):

    if gene_ids.value:
        shared_genomes = np.intersect1d(matrix1.index,
                                        matrix2.index)

        matrix1 = matrix1.reindex(index  =shared_genomes,
                                  columns=shared_genomes,
                                  copy   =True)
        matrix2 = matrix2.reindex(index  =shared_genomes,
                                  columns=shared_genomes,
                                  copy   =True)

        return (matrix1, None,
                matrix2, None)
    
    tmp_taxa = []
    for index in matrix1.index:
        genome, gene = re.search(parse_leaf, index).groups()
        tmp_taxa.append([index, genome, gene])
    taxa1 = pd.DataFrame(columns=['taxon', 'genome', 'gene'],
                         data   =tmp_taxa)

    tmp_taxa = []
    for index in matrix2.index:
        genome, gene = re.search(parse_leaf, index).groups()
        tmp_taxa.append([index, genome, gene])
    taxa2 = pd.DataFrame(columns=['taxon', 'genome', 'gene'],
                         data=tmp_taxa)

    shared_genomes = np.intersect1d(taxa1.genome.unique(), 
                                    taxa2.genome.unique())

    taxa1 = taxa1[taxa1.genome.isin(shared_genomes)]
    taxa2 = taxa2[taxa2.genome.isin(shared_genomes)]

    if not taxa1.genome.is_unique or not taxa2.genome.is_unique:
    
        taxa1_frequency = taxa1.genome.value_counts() 
        taxa2_frequency = taxa2.genome.value_counts() 

        for genome in shared_genomes:
            genome1_count = taxa1_frequency[genome]
            genome2_count = taxa2_frequency[genome]

            if genome1_count > 1:
                #
                # one of the matrices must be traversed in the inversed order to make sure an 
                #     all VS all combination is obtained. That is the reason of the "iloc[::-1]"
                #     during the querying
                tmp_df = taxa2.iloc[::-1].query('genome == @genome').copy()
                for _ in range(genome1_count - 1):
                    for index, row in tmp_df.iterrows():
                        tmp_row = row.copy()
                        tmp_row.taxon += f'|{_}'
                        taxa2      = taxa2.append(tmp_row, ignore_index=True)

                        reference_name = re.sub('\|\d+$', '', tmp_row.taxon, flags=re.M)
                        matrix2[    tmp_row.taxon] = matrix2[    reference_name]
                        matrix2.loc[tmp_row.taxon] = matrix2.loc[reference_name]


            if genome2_count > 1:
                #
                # as we queried the other matrix in the reverse order, we traverse this one regularly
                tmp_df = taxa1.query('genome == @genome').copy()
                for _ in range(genome2_count - 1):
                    for index, row in tmp_df.iterrows():
                        tmp_row = row.copy()
                        tmp_row.taxon += f'|{_}'
                        taxa1 = taxa1.append(tmp_row, ignore_index=True)

                        reference_name = re.sub('\|\d+$', '', tmp_row.taxon, flags=re.M)
                        matrix1[    tmp_row.taxon] = matrix1[    reference_name]
                        matrix1.loc[tmp_row.taxon] = matrix1.loc[reference_name]

    #
    # sort both taxa tables according to genomes for properly matching
    taxa1.sort_values('genome', inplace=True)
    taxa2.sort_values('genome', inplace=True)

    taxa1.reset_index(drop=True, inplace=True)
    taxa2.reset_index(drop=True, inplace=True)
        
    #
    # match matrices index and column sorting as taxa tables
    matrix1 = matrix1.reindex(index  =taxa1.taxon, 
                              columns=taxa1.taxon, 
                              copy   =True)
    matrix2 = matrix2.reindex(index  =taxa2.taxon, 
                              columns=taxa2.taxon, 
                              copy   =True)
    
    if not taxa1.genome.is_unique or not taxa2.genome.is_unique:
        matrix1, taxa1, matrix2, taxa2 = match_copies(matrix1, matrix2, taxa1, taxa2)
    
    return(matrix1, taxa1, 
           matrix2, taxa2)

### Where the magic happens

In [35]:
def assess_coevolution(matrix1, matrix2):

    matrix1, taxa1, matrix2, taxa2 = balance_matrices(matrix1.copy(), 
                                                      matrix2.copy())

    min_overlap = True
    if       gene_ids.value and taxa1.genome.unique().shape[0] < min_taxa_overlap.value:
        min_overlap = False
    elif not gene_ids.value and               matrix1.shape[0] < min_taxa_overlap.value:
        min_overlap = False

    if not min_overlap:
        print(f'Assessed matrices have less than {min_taxa_overlap.value} taxa overlap. '
               'To change this behavior adjust overlap parameter.',
              file=sys.stderr)
        return([None, None])

    condensed1 = squareform(matrix1.values, checks=False)
    condensed2 = squareform(matrix2.values, checks=False)
    
    odr_weights = estimate_weights(condensed1, condensed2)
    
    regression = run_odr(condensed1, 
                         condensed2, 
                         *odr_weights)
    
    #
    # calculate R^2 from wODR model.
    mean_x = np.mean(condensed1)
    mean_y = np.mean(condensed2)

    mean_pred_x = regression.xplus.mean()
    mean_pred_y = regression.y.mean()

    x_SSres = sum(regression.delta**2)
    y_SSres = sum(regression.eps  **2)
    SSres   = x_SSres + y_SSres

    x_SSreg = sum(
        (regression.xplus - mean_pred_x)**2
    )
    y_SSreg = sum(
        (regression.y     - mean_pred_y)**2
    )
    SSreg   = x_SSreg + y_SSreg

    x_SStot = sum(
        (condensed1 - mean_x)**2
    )
    y_SStot = sum(
        (condensed2 - mean_y)**2
    )
    SStot   = x_SStot + y_SStot

    r2 = 1 - SSres/SStot
#     r2 = SSreg/SStot
    
    return(regression, r2)

In [None]:
# dist1 = run_dist_matrix('/work/clusterEvo/distance_matrices/000284/000284')
# dist2 = run_dist_matrix('/work/clusterEvo/distance_matrices/000302/000302')

# regression, r2 = assess_coevolution(dist1, dist2)

# regression.pprint()
# print(f'\nR**2 = {r2}')

## Beta: [0.49615544]
## Beta Std Error: [0.00033644]
## Beta Covariance: [[1.99398167e-06]]
## Residual Variance: 0.056766828371899364
## Inverse Condition #: 1.0
## Reason(s) for Halting:
##   Sum of squares convergence
##
## R**2 = 0.8947962483462916

## Crappy notebook interface, but still an interface!

In [12]:
evol_dist_source = widgets.Dropdown(
    options    =[('',                       0       ), 
                 ('FASTA files',            'fasta' ), 
                 ('IQTree ".mldist" files', 'matrix'), 
                 ('newick files',           'tree'  )],
    disabled   =False,
    indent     =False,
    value      =0,
    layout     ={'width':'auto'}
)

must_align = widgets.Checkbox(
    value   =False,  
    disabled=True,
    indent  =False,
    description='Provided FASTAS are not yet aligned',
    layout     ={'width':'auto'}
)

gene_ids = widgets.Checkbox(
    value   =False,  
    disabled=False,
    indent  =False,
    description='Sequences are identified by genome only (all sequences from the same genome have the same name)',
    layout     ={'width':'auto'}
)

min_taxa_overlap = widgets.IntText(value      =5, 
                                   indent     =False,
                                   disabled   =False)

genome_gene_sep = widgets.Dropdown(
    options    =[('',                                     0  ), 
                 ('<genome>_<gene>', '_'), 
                 ('<genome>|<gene>', '|'), 
                 ('<genome>.<gene>', '.')],
    disabled   =False,
    indent     =True,
    value      =0,
    layout     ={'width':'auto'}
)

input_files = widgets.FileUpload(
    accept  ='',   # Accepted file extension e.g. '.txt', '.pdf', 'image/*', 'image/*,.pdf'
    multiple=True, # True to accept multiple files upload else False
    disabled=True
)

def toggle_align_widgets(dropdown_source):
    input_files.disabled = not dropdown_source.new
    
    if dropdown_source.new == 'fasta':
        must_align.disabled = False
    else:
        must_align.disabled = True
        must_align.value    = False
    
def toggle_genome_gene_sep(checkbox):
    genome_gene_sep.disabled = checkbox.new
    if checkbox.new:
        genome_gene_sep.value = 0
        
def clear_uploads(*args):
    input_files.value.clear()
    input_files._counter = 0
    
clear_button = widgets.Button(description='Clear upload',
                              button_style='warning',
                              tooltip     ='Click to clear uploaded files')
clear_button.on_click(clear_uploads)

evol_dist_source.observe(toggle_align_widgets, names='value')
gene_ids.observe(toggle_genome_gene_sep,       names='value')

In [13]:
display(widgets.HBox([widgets.Label('Source of pairwise distances: '), 
                      evol_dist_source]),
        must_align,
        gene_ids,
        widgets.HBox([widgets.Label('Genome and gene ids are separated by which character: '),
                      genome_gene_sep]),
        
        widgets.HBox([widgets.Label('Minimum taxa containing both assessed gene families: '),
                      min_taxa_overlap]),
        
        input_files,
        clear_button,
       )

HBox(children=(Label(value='Source of pairwise distances: '), Dropdown(layout=Layout(width='auto'), options=((…

Checkbox(value=False, description='Provided FASTAS are not yet aligned', disabled=True, indent=False, layout=L…

Checkbox(value=False, description='Sequences are identified by genome only (all sequences from the same genome…

HBox(children=(Label(value='Genome and gene ids are separated by which character: '), Dropdown(layout=Layout(w…

HBox(children=(Label(value='Minimum taxa containing both assessed gene families: '), IntText(value=5)))

FileUpload(value={}, description='Upload', disabled=True, multiple=True)



#### Parsing provided data and parameters

In [14]:
if not input_files._counter > 1:
    raise ValueError('You must upload at least two files!')

In [18]:
dist_matrices = []
group_names   = []

if evol_dist_source.value == 'tree':
    for file_name, file_itself in input_files.value.items():
        dist_matrices.append( get_matrix_from_tree(file_itself['content'].decode('utf-8')) )
        group_names.append( file_name )
        
elif evol_dist_source.value == 'matrix':
    for file_name, file_itself in input_files.value.items():
        dist_matrices.append( load_matrix(BytesIO(file_itself['content'])) )
        group_names.append( file_name )

In [19]:
if   genome_gene_sep.value == '_':
    parse_leaf = re.compile('^(GC[AF]_\d+(?:\.\d)?)[_|](.*)$')
elif genome_gene_sep.value == '|':
    parse_leaf = re.compile('^(\S+?)\|(\S+)$')
elif genome_gene_sep.value == '.':
    parse_leaf = re.compile('^(\d+?)\.(.*)$')
    
if min_taxa_overlap.value < 2:
    min_taxa_overlap.value = 2

In [56]:
a.beta, b

(array([0.49615582]), 0.8947960994077019)

In [48]:
group_names

['000302.mldist', '000284.mldist']

In [38]:
m1 = dist_matrices[0].copy()
m2 = dist_matrices[1].copy()

balance_matrices(m1, m2)

(Empty DataFrame
 Columns: []
 Index: [], None, Empty DataFrame
 Columns: []
 Index: [], None)

In [55]:
a, b = assess_coevolution(m2, m1)

In [57]:
yeah

NameError: name 'yeah' is not defined