In [180]:
from scipy.odr              import Model, Data, RealData, ODR
from scipy.stats            import linregress
from scipy.optimize         import curve_fit
from scipy.spatial.distance import squareform
from matplotlib             import pyplot as plt
from sklearn.linear_model   import HuberRegressor
from copy                   import deepcopy
from collections            import Counter
import numpy   as np
import seaborn as sns
import pandas  as pd
import random
import os
import subprocess
import re

In [63]:
class cd:
    """
    Temporarily switch the process working directory.

    Used as ``with cd(some_path): ...``. The directory that was current on
    entry is restored on exit, whether or not the body raised.
    """
    def __init__(self, newPath):
        # A leading "~" is expanded once, at construction time.
        expanded = os.path.expanduser(newPath)
        self.newPath = expanded

    def __enter__(self):
        # Remember where we came from before moving; nothing is bound by
        # ``with ... as``, since this method returns None.
        self.savedPath = os.getcwd()
        os.chdir(self.newPath)
        return None

    def __exit__(self, etype, value, traceback):
        # Go back to the saved directory; returning None lets any exception
        # raised inside the ``with`` body propagate.
        os.chdir(self.savedPath)
        return None

In [19]:
def line(x, slope):
    """
    Linear model through the origin: ``slope * x`` with a zero intercept.

    Works element-wise when either argument is a NumPy array.
    """
    scaled = slope * x
    return scaled + 0

In [14]:
def estimate_weights(x, y, huber=True):
    """
    Derive per-point weights for ``x`` and ``y`` from regression residuals.

    A zero-intercept line (see ``line``) is fitted in both directions
    (``y ~ x`` and ``x ~ y``) with ``curve_fit``. Each point's weight is the
    reciprocal of its absolute residual from the corresponding fit.

    Parameters
    ----------
    x, y : numpy arrays of equal length
        With ``huber=True`` they must support ``.reshape`` and boolean-mask
        indexing.
    huber : bool, default True
        When true, points flagged as outliers by a ``HuberRegressor``
        (no intercept) in either direction are excluded from the line fits.
        Residuals, and therefore weights, are still computed for every point.

    Returns
    -------
    tuple
        ``(x_weights, y_weights)``.

    NOTE(review): a residual of exactly zero yields an infinite weight
    (division by zero) -- confirm whether downstream consumers tolerate that.
    """
    if huber:
        # Flag outliers in each regression direction, then keep only the
        # points that neither fit considers an outlier.
        flagged_xy = HuberRegressor(fit_intercept=False).fit(x.reshape(-1, 1), y).outliers_
        flagged_yx = HuberRegressor(fit_intercept=False).fit(y.reshape(-1, 1), x).outliers_
        keep = ~(flagged_xy | flagged_yx)
        x_fit, y_fit = x[keep], y[keep]
    else:
        x_fit, y_fit = x, y

    # curve_fit returns (optimal parameters, covariance); only the slope is used.
    slope_xy, _ = curve_fit(line, x_fit, y_fit)
    slope_yx, _ = curve_fit(line, y_fit, x_fit)

    # Residuals are evaluated on the full data, not only on the fitted subset.
    y_res = abs(y - line(x, slope_xy))
    x_res = abs(x - line(y, slope_yx))

    return (1 / x_res, 1 / y_res)

In [219]:
def run_odr(x, y, x_weights, y_weights):
    """
    Fit the through-origin ``line`` model by orthogonal distance regression.

    Parameters
    ----------
    x, y : array-like
        Explanatory and response observations.
    x_weights, y_weights : array-like
        Passed to ``scipy.odr.Data`` as ``wd`` (weights on ``x``) and ``we``
        (weights on ``y``) respectively.

    Returns
    -------
    The object returned by ``ODR.run()``.
    """
    observations = Data(x, y, wd=x_weights, we=y_weights)

    # scipy.odr invokes the model as fcn(beta, x), so ``line`` receives the
    # parameter vector as its ``x`` argument and the data as ``slope``. The
    # result is still correct because ``line`` is a plain product.
    solver = ODR(observations, Model(line), beta0=[1])

    return solver.run()

In [90]:
def run_dist_matrix(aln_file=None, iqtree_path='iqtree', num_threads=1):
    """
    Load the IQ-TREE pairwise distance matrix for an alignment.

    If ``<aln_file>.mldist`` does not exist yet, IQ-TREE is executed in the
    alignment's directory (model ``LG+G``, ``-te BIONJ``, ``-nt AUTO`` with
    ``-ntmax num_threads``) and the ``.mldist`` file is then read. An existing
    ``.mldist`` file is reused without running IQ-TREE.

    Parameters
    ----------
    aln_file : str
        Path to the alignment. A bare file name refers to the current
        directory; a leading ``~`` is expanded.
    iqtree_path : str, default 'iqtree'
        IQ-TREE executable to invoke.
    num_threads : int, default 1
        Value passed to ``-ntmax``.

    Returns
    -------
    pandas.DataFrame
        Square matrix whose index and columns are both the taxon labels from
        the first column of the ``.mldist`` file.

    Raises
    ------
    ValueError
        If ``aln_file`` is not provided.
    """
    if aln_file is None:
        raise ValueError('aln_file is required')

    directory, filename = os.path.split(os.path.expanduser(aln_file))
    # A bare file name has no directory component; fall back to the current
    # directory instead of handing an empty path to the OS.
    directory = directory or os.curdir
    mldist_path = os.path.join(directory, f'{filename}.mldist')

    if not os.path.isfile(mldist_path):
        # Run IQ-TREE inside the alignment's directory via ``cwd`` rather
        # than changing the working directory of the whole kernel.
        subprocess.call([iqtree_path,
                         '-s',     filename,
                         '-m',     'LG+G',
                         '-te',    'BIONJ',
                         '-nt',    'AUTO',
                         '-ntmax', str(num_threads),
                         '-keep-ident', '-safe', '-quiet'],
                        cwd=directory)

    # The first line of the file is skipped; every remaining row is a taxon
    # label followed by whitespace-separated distances.
    dist_matrix = pd.read_csv(mldist_path,
                              sep       = r'\s+',
                              skiprows  = 1,
                              header    = None,
                              index_col = 0)
    dist_matrix.columns = dist_matrix.index

    return dist_matrix

In [182]:
def _parse_taxa(labels):
    """Split taxon labels into a table with columns taxon, genome and gene."""
    records = []
    for label in labels:
        # <GCA|GCF>_<digits>[.<version>] followed by "_" or "|" and the gene id.
        match = re.search(r'^(GC[AF]_\d+(?:\.\d+)?)[_|](.*)$', label)
        if match is None:
            raise ValueError(
                f'cannot parse a genome accession from taxon label {label!r}'
            )
        genome, gene = match.groups()
        records.append([label, genome, gene])

    return pd.DataFrame(columns=['taxon', 'genome', 'gene'],
                        data=records)


def balance_matrices(matrix1, matrix2):
    """
    Restrict two distance matrices to the genomes they share, in the same order.

    Row labels of both matrices are expected to look like
    ``GCA_000000000.1|gene`` or ``GCF_000000000.1_gene``; the part before the
    separator identifies the genome. Both matrices are cut down to the genomes
    present in both and reordered by genome accession, so that position ``i``
    in one matrix refers to the same genome as position ``i`` in the other.
    Missing values in the resulting matrices are replaced by ``0.0``. The
    inputs are not modified.

    Parameters
    ----------
    matrix1, matrix2 : pandas.DataFrame
        Square matrices whose index and columns carry the same taxon labels.

    Returns
    -------
    tuple or None
        ``(matrix1, taxa1, matrix2, taxa2)``, where each ``taxa`` frame has the
        columns ``taxon``, ``genome`` and ``gene``. ``None`` is returned when a
        shared genome is represented by more than one taxon in either matrix;
        such multi-copy cases are not supported.

    Raises
    ------
    ValueError
        If a label does not match the expected accession pattern.
    """
    taxa1 = _parse_taxa(matrix1.index)
    taxa2 = _parse_taxa(matrix2.index)

    shared_genomes = np.intersect1d(taxa1.genome.unique(),
                                    taxa2.genome.unique())

    taxa1 = taxa1[taxa1.genome.isin(shared_genomes)]
    taxa2 = taxa2[taxa2.genome.isin(shared_genomes)]

    # One taxon per genome is required for a one-to-one pairing of the matrices.
    if not taxa1.genome.is_unique or not taxa2.genome.is_unique:
        return None

    # Sorting both tables by genome puts the two matrices in matching order.
    taxa1 = taxa1.sort_values('genome')
    taxa2 = taxa2.sort_values('genome')

    matrix1 = matrix1.reindex(index  =taxa1.taxon,
                              columns=taxa1.taxon).fillna(0.0)
    matrix2 = matrix2.reindex(index  =taxa2.taxon,
                              columns=taxa2.taxon).fillna(0.0)

    return (matrix1, taxa1, matrix2, taxa2)

In [98]:
def assess_coevolution(matrix1, matrix2):
    """
    Balance two distance matrices, run the weighted ODR fit and report counts.

    The matrices are first aligned with ``balance_matrices``. Their condensed
    (upper-triangle) distance vectors are then weighted with
    ``estimate_weights`` and regressed against each other with ``run_odr``.

    Parameters
    ----------
    matrix1, matrix2 : pandas.DataFrame
        Pairwise distance matrices accepted by ``balance_matrices``.

    Returns
    -------
    tuple of int or None
        ``(n_genomes_1, n_genomes_2, n_shared)``: the number of distinct
        genomes in each balanced taxa table and the number common to both.
        ``None`` when ``balance_matrices`` cannot pair the matrices.

    NOTE(review): the regression result is computed but not returned --
    confirm whether it should be part of the return value.
    """
    balanced = balance_matrices(matrix1, matrix2)
    if balanced is None:
        # Matrices could not be paired one-to-one; nothing to regress.
        return None
    matrix1, taxa1, matrix2, taxa2 = balanced

    condensed1 = squareform(matrix1.values)
    condensed2 = squareform(matrix2.values)

    odr_weights = estimate_weights(condensed1, condensed2)

    regression = run_odr(condensed1,
                         condensed2,
                         *odr_weights)

    genomes1 = taxa1.genome.unique()
    genomes2 = taxa2.genome.unique()
    # Computed locally: the shared-genome set inside balance_matrices is not
    # visible from this function.
    shared_genomes = np.intersect1d(genomes1, genomes2)

    return (len(genomes1), len(genomes2), len(shared_genomes))

In [92]:
# Pairwise distance matrices for the two alignments being compared.
dist1 = run_dist_matrix(aln_file='/work/clusterEvo/distance_matrices/000284/000284')
dist2 = run_dist_matrix(aln_file='/work/clusterEvo/distance_matrices/000302/000302')

In [220]:
# Work on deep copies so that dist1 / dist2 keep the matrices as loaded.
matrix1, matrix2 = deepcopy(dist1), deepcopy(dist2)

# Restrict both matrices to their shared genomes, in matching order.
matrix1, taxa1, matrix2, taxa2 = balance_matrices(matrix1, matrix2)

# Flatten each square matrix into its condensed distance vector.
condensed1, condensed2 = squareform(matrix1.values), squareform(matrix2.values)

# Residual-based weights: element 0 belongs to x, element 1 to y.
odr_weights = estimate_weights(condensed1, condensed2)

regression = run_odr(condensed1, condensed2, odr_weights[0], odr_weights[1])