In [3]:
import os
import pandas as pd
import re
import ete3
from scipy.spatial.distance import squareform
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, HuberRegressor
import numpy as np
from sklearn.metrics import mean_squared_error

from scipy.stats import pearsonr, linregress
import seaborn as sns
from Bio import SeqIO, SearchIO, AlignIO, Align, Alphabet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import random

import matplotlib.colors as colors
import matplotlib.cm as cmx
import subprocess
import itertools
from Bio import SeqIO, SearchIO, AlignIO, Align, Alphabet
import multiprocessing
from copy import deepcopy

# Make the project directory the working directory: the relative paths used by
# the cells below (the .mldist inputs and the saved PNG figures) resolve
# against it.
# NOTE(review): hard-coded absolute path -- the notebook only runs on a machine
# where this directory exists.
os.chdir('/work/site_rate/hug_et_al')

In [20]:
# Handle to ete3's NCBITaxa interface.
# NOTE(review): `ncbi` is not referenced by any cell visible here -- presumably
# used further down; confirm before removing.
ncbi = ete3.NCBITaxa()

def read_mldist(filename):
    """Load a whitespace-delimited distance matrix into a pandas DataFrame.

    The first line of ``filename`` is discarded (presumably the taxon count
    of an IQ-TREE style ``.mldist`` file -- confirm against the producer).
    Every remaining line is expected to hold a row label followed by that
    row's distances.

    As a side effect a tab-separated copy, ``<filename>.tab``, is written
    next to the input with the row labels repeated as the header line; the
    returned frame is parsed from that copy.

    Parameters
    ----------
    filename : str
        Path of the distance matrix file.

    Returns
    -------
    pandas.DataFrame
        Matrix indexed by the first token of each row, with the same labels
        as column names.
    """
    # Context managers guarantee both handles are released, even on error.
    with open(filename) as handle:
        rows = handle.read().split('\n')
    rows.pop(0)  # drop the leading header line
    genomes = re.findall(r'^(\S+)', '\n'.join(rows), re.M)

    tab_path = '%s.tab' % filename
    # Text mode: the payload is str, which a binary handle rejects on Python 3.
    with open(tab_path, 'w') as out:
        out.write('\t%s\n' % '\t'.join(genomes))
        for line in rows:
            # Collapse any run of whitespace into single tab separators.
            # Blank lines are written as blank lines and skipped by pandas.
            out.write('%s\n' % '\t'.join(line.split()))

    return pd.read_table(tab_path, index_col=0)

# Reference distance matrix, loaded as a labelled square DataFrame.
original           = read_mldist('mldistances.mldist')
# Condensed 1-D vector of pairwise distances derived from the square matrix;
# later cells use it as the x values and as the basis for the distance bins.
condensed_original = squareform(original.values)
# Evenly spaced grid from the smallest to the largest original distance
# (np.linspace's default number of points).
x_plot             = np.linspace(condensed_original.min(),
                                 condensed_original.max())

In [22]:
# Distance matrix of every simulated site-rate category (1 through 12),
# keyed by the category number.
simulations = {}
for category in range(1, 13):
    simulations[category] = read_mldist('categories/%i_mldist.mldist' % category)

# Disabled alternative: percentile-based bin edges instead of equal widths.
#pw_dist_bins = [np.percentile(condensed_original, decile) for decile in range(20, 81, 20)]

# Six evenly spaced edges over the range of original distances, i.e. five
# equal-width bins.
pw_dist_bins = np.linspace(condensed_original.min(), condensed_original.max(), num=6)

# np.digitize assigns a value equal to the top edge to bin 6; cap the bin
# number at 5 so that value falls in the last real bin.
binning = np.minimum(np.digitize(condensed_original, pw_dist_bins), 5)

# One plotting colour per bin, indexed as colors[bin - 1].
# NOTE(review): this rebinds `colors`, which the import cell bound to
# matplotlib.colors.
colors = '#6B242E #30862D #335A99 #782D86 #FF7733'.split()

In [35]:
# Sanity check: print the smallest and largest original pairwise distance
# assigned to each of the five bins.
# The loop variable is deliberately not called `bin`: at notebook scope that
# would replace the builtin bin() for every later cell.
for bin_id in range(1, 6):
    bin_distances = condensed_original[binning == bin_id]
    # Single-argument print(...) gives the same "min max" line on Python 2 and 3.
    print('%s %s' % (bin_distances.min(), bin_distances.max()))

1e-06 1.3466729
1.346673 2.6933302
2.6933637999999998 4.0400158
4.0400169 5.3865433
5.386791400000001 6.733360799999999


In [25]:
# For each simulated site-rate category: scatter the simulated pairwise
# distances against the original ones, overlay one through-origin linear fit
# per distance bin, and save the figure as '<category>.png'.
# Categories are visited in ascending key order so the printed report and the
# files are produced deterministically.
for category in sorted(simulations):
    df = simulations[category]
    print('Site-rate category %i' % category)

    condensed_simulation = squareform(df.values)

    fig, ax = plt.subplots()
    ax.scatter(condensed_original, condensed_simulation,
               color='black', edgecolor='none', alpha=0.3)

    # np.unique yields only the occupied bins, in ascending order. The loop
    # variable is not named `bin` so the builtin bin() stays usable.
    for bin_id in np.unique(binning):
        in_bin = binning == bin_id
        x = condensed_original[in_bin]
        y = condensed_simulation[in_bin]

        # Grid restricted to this bin's range of original distances. Kept in
        # its own name so the notebook-level `x_plot` is not overwritten.
        x_grid = np.linspace(x.min(), x.max())

        # Through-origin fit predicting original distance from simulated.
        lm_yx = LinearRegression(fit_intercept=False)
        lm_yx.fit(y.reshape(-1, 1), x)

        # Through-origin fit predicting simulated distance from original;
        # this is the line that gets drawn.
        lm_xy = LinearRegression(fit_intercept=False)
        lm_xy.fit(x.reshape(-1, 1), y)

        prediction_y = lm_xy.predict(x_grid.reshape(-1, 1))
        # NOTE(review): the error is evaluated over ALL pairs, not only the
        # pairs in this bin, using the model fitted on this bin -- confirm
        # that this is the intended meaning of the per-bin figure.
        mse = mean_squared_error(lm_yx.predict(condensed_simulation.reshape(-1, 1)),
                                 condensed_original)

        print('\tbin %i mean of squared residuals: %.4f' % (bin_id, mse))

        ax.plot(x_grid, prediction_y, linewidth=3,
                color=colors[bin_id - 1], label='mse=%.3f' % mse)

    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.035), ncol=5, frameon=False)
    fig.set_size_inches(10, 6)
    fig.tight_layout()
    fig.savefig('%i.png' % category, dpi=300)
    # Close this specific figure so figures do not accumulate across iterations.
    plt.close(fig)


Site-rate category 1
	bin 1 mean of squared residuals: 3.9341
	bin 2 mean of squared residuals: 3.7459
	bin 3 mean of squared residuals: 4.5433
	bin 4 mean of squared residuals: 5.3557
	bin 5 mean of squared residuals: 9.4036
Site-rate category 2
	bin 1 mean of squared residuals: 3.4859
	bin 2 mean of squared residuals: 3.2256
	bin 3 mean of squared residuals: 4.3129
	bin 4 mean of squared residuals: 5.4369
	bin 5 mean of squared residuals: 7.6194
Site-rate category 3
	bin 1 mean of squared residuals: 3.0626
	bin 2 mean of squared residuals: 2.7154
	bin 3 mean of squared residuals: 4.1655
	bin 4 mean of squared residuals: 5.4727
	bin 5 mean of squared residuals: 6.5669
Site-rate category 4
	bin 1 mean of squared residuals: 2.7782
	bin 2 mean of squared residuals: 2.3695
	bin 3 mean of squared residuals: 4.1542
	bin 4 mean of squared residuals: 5.6353
	bin 5 mean of squared residuals: 7.9345
Site-rate category 5
	bin 1 mean of squared residuals: 2.4468
	bin 2 mean of squared residuals: 