In [None]:
# Notebook parameters: worker count and the locations of every input table.
# Number of parallel jobs; forwarded as `n_jobs` to the grid search below.
cores = 20
# Remote tab-separated tables (strain metadata and per-condition phenotypes).
strains = 'https://evocellnet.github.io/ecoref/data/strains.tsv'
phenotypes = 'https://evocellnet.github.io/ecoref/data/phenotypic_data.tsv'
# Local inputs, given relative to the notebook's working directory.
pathogenicity = '../data/phenotypes/phenotypes.tsv'
# Directory scanned for `*.fasta` files to decide which strains have a genome.
gdir = '../data/genomes/'
filtered = '../out/associations/summary_cont_lmm_kmer.tsv'
rtab = '../out/roary/gene_presence_absence.Rtab'

In [None]:
# Coerce to int before it is used as `n_jobs`; presumably the value can arrive
# as a string when the notebook is parameterized externally — TODO confirm.
cores = int(cores)

In [None]:
# plotting imports
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text

# Plain white seaborn theme, then one shared set of font sizes for all figures.
sns.set_style('white')

plt.rcParams.update({
    'font.size': 11,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'axes.labelsize': 12,
    'axes.titlesize': 12,
    'legend.fontsize': 11,
})

In [None]:
import os
import random
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

In [None]:
def match(x, y, c,
          xcolumn='s-scores',
          ycolumn='killed'):
    """Pair the `xcolumn` values of one condition with the matching rows of `y`.

    Parameters
    ----------
    x : DataFrame whose first row-index level contains `c`; `x.loc[c, xcolumn]`
        must yield a Series (in this notebook, one value per strain).
    y : DataFrame holding `ycolumn`, indexed by the same kind of labels.
    c : label selected on the first index level of `x`.
    xcolumn, ycolumn : column names read from `x` and `y`.

    Returns
    -------
    (j, k) : two Series restricted to the labels present in both tables, in
        the order they appear in `x`. When no label is shared the function
        returns `(np.nan, np.nan)`.
    """
    j = x.loc[c, xcolumn]
    # Intersect *before* indexing `y`: `.loc` with a list of labels raises
    # KeyError on current pandas as soon as one label is missing, so labels
    # that only exist in `x` must be discarded first.
    idx = j.index.intersection(y.index)
    if len(idx) == 0:
        return np.nan, np.nan
    return j.loc[idx], y.loc[idx, ycolumn]

def correlate(x, y, c,
              method='pearson',
              xcolumn='s-scores',
              ycolumn='killed'):
    """Correlate the `xcolumn` values of condition `c` with `y[ycolumn]`.

    The two vectors are aligned with `match`. `method='pearson'` uses
    `scipy.stats.pearsonr`; any other value uses `scipy.stats.spearmanr`.

    Returns
    -------
    (statistic, p-value). `(np.nan, np.nan)` is returned when the two tables
    share fewer than two labels, since no correlation can be computed.
    """
    j, k = match(x, y, c, xcolumn, ycolumn)
    # match() reports "nothing in common" as a pair of scalar NaNs; passing
    # those (or a single observation) to scipy would raise instead of
    # yielding a missing value for this condition.
    if np.ndim(j) == 0 or len(j) < 2:
        return np.nan, np.nan
    if method == 'pearson':
        return stats.pearsonr(j, k)
    return stats.spearmanr(j, k)

In [None]:
# Association summary, restricted to rows with at least one specific hit.
f = pd.read_table(filtered, index_col=0)
f = f.loc[f['specific_hits'] > 0]
# Same rows taken from the table at `rtab`, as an independent copy.
r = pd.read_table(rtab, index_col=0).loc[f.index].copy(deep=True)

In [None]:
# Column-wise total over the selected rows, kept as a one-column frame 'hpi'.
hpi = r.sum().rename('hpi').to_frame()

In [None]:
# Strains left out of the analysis even though a genome file exists for them.
exclude = {'IAI29', 'IAI64', 'NILS80'}
# A genome is identified by the part of its file name before the first dot.
genomes = {fname.partition('.')[0]
           for fname in os.listdir(gdir)
           if fname.endswith('.fasta')}.difference(exclude)

In [None]:
# Strain metadata, indexed by the second column of the table.
s = pd.read_table(strains, index_col=1)
# Labels starting with 'NILS' lose their spaces, presumably so that they match
# the genome file names collected in `genomes` — TODO confirm.
s.index = [x.replace(' ', '')
           if x.startswith('NILS')
           else x
           for x in s.index]
# Keep the rows that have a genome and whose first remaining column is not
# 'NT12008'. iterrows() walks the same (label, row values) pairs as the former
# `s.T.iteritems()`, which no longer exists in pandas >= 2.0.
s = s.loc[[name for name, row in s.iterrows()
           if name in genomes
           and row.values[0] != 'NT12008']][['Strain Identifier', 'Full Strain Name After Genome Analysis']].dropna()
s = s['Strain Identifier']
# Round-trip through the identifier index to drop the 'NT12008' identifier,
# then restore the original labels as index.
s = s.reset_index().set_index(
    'Strain Identifier'
    ).drop('NT12008').reset_index().set_index(
    'index')['Strain Identifier']
# Reverse lookup: strain identifier -> index label.
# Series.items() replaces Series.iteritems(), also removed in pandas 2.0.
d = {v: k
     for k, v in s.items()}

In [None]:
# Phenotype table: strain identifiers are translated through `d`; identifiers
# without an entry in `d` are kept unchanged.
p = pd.read_table(phenotypes).set_index('strain')
p.index = pd.Index([d.get(strain, strain) for strain in p.index],
                   name='strain')
# Restrict to the strains kept in `s`, then index by (condition, strain).
p = (p.reset_index()
      .set_index(['strain', 'condition'])
      .loc[s.index]
      .reset_index()
      .set_index(['condition', 'strain'])
      .sort_index())
# Sign of the first value column times -log10 of the second one. The
# unpacking requires exactly three value columns; assumes they are
# (score, q-value, flag) in that order — TODO confirm against the input file.
p['signed-qvalue'] = [np.sign(score) * -np.log10(qvalue)
                      for score, qvalue, _ in p.values]
# Disabled filter: would keep only the conditions with at least one
# growth-defect phenotype.
# idx = p.groupby('condition')['growth-defect-phenotype'
#         ].sum()[p.groupby('condition')[
#       'growth-defect-phenotype'].sum() > 0].index
# p = p.loc[idx]

In [None]:
# Pathogenicity table indexed by its first column; later cells read its
# 'killed' and 'phenotype' columns.
k = pd.read_table(pathogenicity,
                  index_col=0)

In [None]:
# Correlate every condition's phenotype values with the two targets.
res = []
# (target table, target column, suffix appended to the result label)
targets = ((k, 'killed', ''),
           (hpi, 'hpi', '.hpi'))
# (column read from `p`, label stored in the results). The label for
# 's-scores' intentionally has no trailing "s": later cells filter on it.
xcolumns = (('s-scores', 's-score'),
            ('signed-qvalue', 'signed-qvalue'))
# Iterate over the unique values of the first index level in index order, so
# the row order of the result is the same on every run (a set is not ordered).
for c in p.index.get_level_values(0).unique():
    for target, ycol, suffix in targets:
        for xcol, label in xcolumns:
            for method in ('pearson', 'spearman'):
                # `rho` rather than `r`: the loop must not rebind the
                # notebook-level `r` on every iteration.
                rho, pval = correlate(p, target, c,
                                      method=method,
                                      xcolumn=xcol,
                                      ycolumn=ycol)
                res.append((c, method, label + suffix, rho, pval))
# From here on `r` is the long-format table of correlation results.
r = pd.DataFrame(res,
                 columns=['condition',
                          'method',
                          'column',
                          'r',
                          'p'])

In [None]:
# Two panels of Pearson's r against -log10(p), one point per condition:
# left for the 's-score' results, right for the 's-score.hpi' results.
plt.figure(figsize=(12, 7))

for position, column, title in ((121, 's-score', 's-scores Vs. pathogenicity'),
                                (122, 's-score.hpi', 's-scores Vs. HPI island')):
    plt.subplot(position)

    selected = (r['method'] == 'pearson') & (r['column'] == column)
    r1 = r[selected].copy(deep=True).set_index('condition')
    logp = -np.log10(r1['p'])

    plt.scatter(r1['r'], logp,
                color='k', marker='o', alpha=0.5)

    # Only conditions above the 4.5 cut-off get a text label; adjust_text then
    # moves the labels apart (it runs before the axis limits are changed).
    texts = [plt.text(v['r'], -np.log10(v['p']), t,
                      ha='center', va='center')
             for t, v in r1[logp > 4.5].iterrows()]
    adjust_text(texts,
                arrowprops=dict(arrowstyle='->', color='k'),
                force_points=2)

    plt.title(title)
    plt.ylim(-0.5, 9)
    plt.xlabel('pearson\'s $r$')
    plt.ylabel('$-log_{10}(pvalue)$')

In [None]:
# Two panels of Spearman's r against -log10(p), one point per condition:
# left for the 'signed-qvalue' results, right for 'signed-qvalue.hpi'.
plt.figure(figsize=(12, 7))

for position, column, title in ((121, 'signed-qvalue', 'signed-qvalue Vs. pathogenicity'),
                                (122, 'signed-qvalue.hpi', 'signed-qvalue Vs. HPI island')):
    plt.subplot(position)

    selected = (r['method'] == 'spearman') & (r['column'] == column)
    r1 = r[selected].copy(deep=True).set_index('condition')
    logp = -np.log10(r1['p'])

    plt.scatter(r1['r'], logp,
                color='k', marker='o', alpha=0.5)

    # Only conditions above the 4.5 cut-off get a text label; adjust_text then
    # moves the labels apart (it runs before the axis limits are changed).
    texts = [plt.text(v['r'], -np.log10(v['p']), t,
                      ha='center', va='center')
             for t, v in r1[logp > 4.5].iterrows()]
    adjust_text(texts,
                arrowprops=dict(arrowstyle='->', color='k'),
                force_points=2)

    plt.title(title)
    plt.ylim(-0.5, 9.5)
    plt.xlabel('spearman\'s $r$')
    plt.ylabel('$-log_{10}(pvalue)$')

In [None]:
def rndmfrst(x, y, xcolumn='s-scores'):
    """Fit a random forest predicting `y['phenotype']` from `x[xcolumn]`.

    `x[xcolumn]` is unstacked, rows containing missing values are dropped and
    the table is transposed, so the samples are the labels of the innermost
    index level of `x` and the features are the labels of the outer level.
    One stratified 67/33 shuffle split holds out a test set. `n_estimators`
    and `max_features` are tuned on the training part with a grid search
    scored by F1 over ten stratified shuffle splits (run with the
    notebook-level `cores` as `n_jobs`), and a forest with the best
    parameters is then fitted on the training part.

    Side effects: prints the test-set F1 followed by the best
    cross-validated F1, and draws a figure with the ROC curve and the
    precision-recall curve of the test set.

    Parameters
    ----------
    x : DataFrame with a two-level row index holding the column `xcolumn`.
    y : DataFrame with a 'phenotype' column, indexed by the sample labels.
        Presumably binary with 1 as the positive class (F1, ROC and the
        baseline line rely on that) — TODO confirm.
    xcolumn : name of the column of `x` used as feature values.

    Returns
    -------
    The RandomForestClassifier fitted on the training part.
    """
    c = x[xcolumn].unstack().dropna().T
    c['mouse'] = y.loc[c.index, 'phenotype']

    # Features in sorted order; callers rely on this order when labelling
    # `feature_importances_`.
    df = c[sorted(set(c.columns) - {'mouse'})]
    target = c['mouse']

    cv = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                test_size=0.33,
                                                random_state=np.random.RandomState(42))
    train_idx, test_idx = next(cv.split(df, target))

    # The splitter yields integer positions, so both the frame and the target
    # are sliced with .iloc. Plain `target[positions]` only works through a
    # deprecated positional fallback and would be label-based on an integer
    # index.
    df_train, target_train = df.iloc[train_idx], target.iloc[train_idx]
    df_test, target_test = df.iloc[test_idx], target.iloc[test_idx]

    param_grid = {'n_estimators': [int(n) for n in np.logspace(1, 3, 6)],
                  'max_features': range(int(np.sqrt(df.shape[1])), int(df.shape[1] *0.7), 5)}

    clf = RandomForestClassifier(random_state=np.random.RandomState(42))
    grid_search = model_selection.GridSearchCV(clf,
                                               param_grid=param_grid,
                                               cv=model_selection.StratifiedShuffleSplit(n_splits=10,
                                                              test_size=0.33,
                                                              random_state=np.random.RandomState(42)),
                                               scoring='f1',
                                               n_jobs=cores)
    grid_search.fit(df_train, target_train)

    clf = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'],
                                 max_features=grid_search.best_params_['max_features'],
                                 random_state=np.random.RandomState(42))
    clf.fit(df_train, target_train)
    predict_test = pd.Series(clf.predict(df_test),
                             index=target_test.index,
                             name='prediction')
    combined = target_test.to_frame().join(predict_test.to_frame())
    print(metrics.f1_score(combined['mouse'],
                           combined['prediction']),
          grid_search.best_score_)

    # Probability of the second class, computed once for both curves.
    proba_test = clf.predict_proba(df_test)[:, 1]

    plt.figure(figsize=(7, 3.5))

    plt.subplot(121)

    fpr, tpr, _ = metrics.roc_curve(combined['mouse'], proba_test)

    plt.plot(fpr, tpr,
             'k-',
             label='AUC %.2f' % metrics.auc(fpr, tpr))
    # Diagonal reference line.
    plt.plot([0, 1],
             [0, 1],
             '--',
             color=sns.xkcd_rgb['grey'])

    plt.legend(frameon=True)

    plt.xlabel('fpr')
    plt.ylabel('tpr')

    plt.subplot(122)

    prec, rec, _ = metrics.precision_recall_curve(combined['mouse'], proba_test)

    plt.plot(rec, prec,
             'k-',)

    # Reference line: fraction of test samples whose target equals 1.
    plt.axhline(target_test[target_test == 1].shape[0] / target_test.shape[0],
                color=sns.xkcd_rgb['grey'],
                ls='dashed')

    plt.xlabel('recall')
    plt.ylabel('precision')

    plt.tight_layout()

    return clf

In [None]:
# Forest trained on the 's-scores' column of `p` against `k['phenotype']`.
clf1 = rndmfrst(p, k, 's-scores')

In [None]:
# Forest trained on the 'signed-qvalue' column of `p` against `k['phenotype']`.
clf2 = rndmfrst(p, k, 'signed-qvalue')

In [None]:
# Pearson results for 's-score' with the feature importances of clf1 drawn as
# marker size; conditions without an importance value are dropped.
fi = pd.Series(clf1.feature_importances_,
               p['s-scores'].unstack().dropna().T.columns)
selected = (r['method'] == 'pearson') & (r['column'] == 's-score')
r1 = r[selected].copy(deep=True).set_index('condition')
r1['fi'] = np.nan
r1.loc[fi.index, 'fi'] = fi.values
r1 = r1.dropna()
logp = -np.log10(r1['p'])

plt.figure(figsize=(6, 7))

plt.scatter(r1['r'], logp,
            s=r1['fi']*2000,
            color='k', marker='o', alpha=0.5)

# Label the conditions above the cut-off of 2, then spread the labels out.
texts = [plt.text(v['r'], -np.log10(v['p']), t,
                  ha='center', va='center')
         for t, v in r1[logp > 2].iterrows()]
adjust_text(texts,
            arrowprops=dict(arrowstyle='->', color='k'),
            force_points=2)

plt.xlabel('pearson\'s $r$')
plt.ylabel('$-log_{10}(pvalue)$');

In [None]:
# Spearman results for 's-score' with the feature importances of clf1 drawn as
# marker size; conditions without an importance value are dropped.
fi = pd.Series(clf1.feature_importances_,
               p['s-scores'].unstack().dropna().T.columns)
selected = (r['method'] == 'spearman') & (r['column'] == 's-score')
r1 = r[selected].copy(deep=True).set_index('condition')
r1['fi'] = np.nan
r1.loc[fi.index, 'fi'] = fi.values
r1 = r1.dropna()
logp = -np.log10(r1['p'])

plt.figure(figsize=(6, 7))

plt.scatter(r1['r'], logp,
            s=r1['fi']*2000,
            color='k', marker='o', alpha=0.5)

# Label the conditions above the cut-off of 2, then spread the labels out.
texts = [plt.text(v['r'], -np.log10(v['p']), t,
                  ha='center', va='center')
         for t, v in r1[logp > 2].iterrows()]
adjust_text(texts,
            arrowprops=dict(arrowstyle='->', color='k'),
            force_points=2)

plt.xlabel('spearman\'s $r$')
plt.ylabel('$-log_{10}(pvalue)$');

In [None]:
# Pearson results for 'signed-qvalue' with the feature importances of clf2
# drawn as marker size.
# clf2 was trained on the 'signed-qvalue' table, so its importances must be
# labelled with the columns of that table: dropna() may keep a different set
# of conditions for 'signed-qvalue' than for 's-scores', which would either
# fail on a length mismatch or attach importances to the wrong conditions.
fi = pd.Series(clf2.feature_importances_,
               p['signed-qvalue'].unstack().dropna().T.columns)
r1 = r[(r['method'] == 'pearson') &
       (r['column'] == 'signed-qvalue')].copy(deep=True)
r1['fi'] = [np.nan for x in range(r1.shape[0])]
r1 = r1.set_index('condition')
r1.loc[fi.index,
       'fi'] = fi.values
# Conditions without an importance value (not used as features) are dropped.
r1 = r1.dropna()

plt.figure(figsize=(6, 7))

plt.scatter(r1['r'],
            -np.log10(r1['p']),
            s=r1['fi']*2000,
            color='k',
            marker='o',
            alpha=0.5)

# Label the conditions above the cut-off of 2, then spread the labels out.
texts = []
for t, v in r1[-np.log10(r1['p']) > 2].iterrows():
    texts.append(plt.text(v['r'],
                          -np.log10(v['p']),
                          t,
                          ha='center',
                          va='center'))
adjust_text(texts,
            arrowprops=dict(arrowstyle='->', color='k'),
            force_points=2)

plt.xlabel('pearson\'s $r$')
plt.ylabel('$-log_{10}(pvalue)$');

In [None]:
# Spearman results for 'signed-qvalue' with the feature importances of clf2
# drawn as marker size.
# clf2 was trained on the 'signed-qvalue' table, so its importances must be
# labelled with the columns of that table: dropna() may keep a different set
# of conditions for 'signed-qvalue' than for 's-scores', which would either
# fail on a length mismatch or attach importances to the wrong conditions.
fi = pd.Series(clf2.feature_importances_,
               p['signed-qvalue'].unstack().dropna().T.columns)
r1 = r[(r['method'] == 'spearman') &
       (r['column'] == 'signed-qvalue')].copy(deep=True)
r1['fi'] = [np.nan for x in range(r1.shape[0])]
r1 = r1.set_index('condition')
r1.loc[fi.index,
       'fi'] = fi.values
# Conditions without an importance value (not used as features) are dropped.
r1 = r1.dropna()

plt.figure(figsize=(6, 7))

plt.scatter(r1['r'],
            -np.log10(r1['p']),
            s=r1['fi']*2000,
            color='k',
            marker='o',
            alpha=0.5)

# Label the conditions above the cut-off of 2, then spread the labels out.
texts = []
for t, v in r1[-np.log10(r1['p']) > 2].iterrows():
    texts.append(plt.text(v['r'],
                          -np.log10(v['p']),
                          t,
                          ha='center',
                          va='center'))
adjust_text(texts,
            arrowprops=dict(arrowstyle='->', color='k'),
            force_points=2)

plt.xlabel('spearman\'s $r$')
plt.ylabel('$-log_{10}(pvalue)$');