<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Pathway-analysis" data-toc-modified-id="Pathway-analysis-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Pathway analysis</a></span></li><li><span><a href="#Pathway-aggregation" data-toc-modified-id="Pathway-aggregation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pathway aggregation</a></span></li><li><span><a href="#Retrieve-pathways-and-compounds" data-toc-modified-id="Retrieve-pathways-and-compounds-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Retrieve pathways and compounds</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import biom
import arviz as az
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
from scipy.spatial.distance import euclidean
import seaborn as sns

from util import extract_differentials, ranking, btest, read_kegg_dict

%matplotlib inline

In [2]:
# Load the posterior differentials and rank them. `lr` is the table consumed
# by the plotting and pathway cells below, which read its 'pvalue',
# 'effect_size', 'prob_lr', 'mean', '5%' and '95%' columns.
fname = '../sfari/data/sra/Combined/age_sex_matched_posterior/differential_posterior.nc'
# extract_differentials / ranking come from the local `util` module; their
# exact output schema is not visible here -- presumably one row per taxon
# (TODO confirm).
differentials = extract_differentials(fname)
lr = ranking(differentials)

Plot the results

In [None]:
# Rank plot of the differentials: the 5%-95% band of every row (blue), the
# band of the significantly ASD-associated rows (red) and the mean (cyan),
# with rows ordered along x by their mean.

# `name` was never assigned in the notebook, so this cell raised a NameError
# on a fresh kernel. '16S' matches the labels used for the later outputs
# ('16S pathways', '16S_KEGG.csv') -- NOTE(review): confirm the intended label.
name = '16S'
genes = {}  # label -> significant rows with a positive effect size

# Global side effect: every later figure in the notebook is drawn dark too.
plt.style.use('dark_background')

lr = lr.sort_values('effect_size')
# Significance cut-off: 1e-3 divided by the number of rows tested.
threshold = 1e-3 / len(lr)

# top ASD microbes
positive_hits = (lr['pvalue'] < threshold) & (lr['effect_size'] > 0)
genes[name] = lr.loc[positive_hits].sort_values('effect_size')

# Order rows by mean and store each row's x position in an 'index' column.
# `lr` is left sorted by 'mean' because later cells keep working on it.
lr = lr.sort_values('mean')
lr['index'] = np.arange(len(lr))

# Rows highlighted in red: significant, and both prob_lr and effect_size > 0.
asd_mask = (
    (lr['pvalue'] < threshold)
    & (lr['prob_lr'] > 0)
    & (lr['effect_size'] > 0)
)

# The unused colormap / TwoSlopeNorm were dropped: nothing consumed them
# (the colorbar call was commented out), matplotlib.cm.get_cmap no longer
# exists in recent Matplotlib, and TwoSlopeNorm raises ValueError unless
# vmin < vcenter < vmax.
fig, ax = plt.subplots()
ax.fill_between(lr['index'], lr['5%'], lr['95%'], fc='b')
ax.fill_between(lr.loc[asd_mask, 'index'],
                lr.loc[asd_mask, '5%'],
                lr.loc[asd_mask, '95%'],
                fc='r')
ax.plot(lr['index'], lr['mean'], c='c')
ax.set_xlabel('Taxa', fontsize=18)
ax.set_xticks([])
ax.set_title(name, fontsize=18)
ax.set_ylabel('log(ASD/Control) + K', labelpad=90, rotation=0, fontsize=14)
# The layout must be tightened before the figure is shown; calling it after
# plt.show() has no effect on the rendered output.
fig.tight_layout()
plt.show()

In [None]:
# NOTE(review): this import belongs in the import cell at the top of the
# notebook; `hotelling_ttest` is imported but not used in the visible cells.
from q2_matchmaker._stats import hotelling_ttest, spherical_test
# NOTE(review): `x` is not defined in any visible cell, so this line raises a
# NameError under Restart & Run All. It is presumably a table whose transposed
# values are what spherical_test expects -- confirm which variable was meant.
spherical_test(x.values.T)

# Pathway analysis

In [None]:
# Split the ranked table into significantly ASD- and control-associated rows.
rrna_lr = lr

# Significance cut-off: 0.001 divided by the number of rows tested.
is_significant = rrna_lr['pvalue'] < 0.001 / len(rrna_lr)
# A direction is only assigned when prob_lr and effect_size agree in sign.
is_asd = (rrna_lr['prob_lr'] > 0) & (rrna_lr['effect_size'] > 0)
is_con = (rrna_lr['prob_lr'] < 0) & (rrna_lr['effect_size'] < 0)

rrna_asd = rrna_lr.loc[is_significant & is_asd]
rrna_con = rrna_lr.loc[is_significant & is_con]

In [None]:
# Compare the predicted KO content of the ASD-associated and
# control-associated rows, then tidy the ASD-side result into a table.
ko16S = pd.read_table(
    '../sfari/data/sra/Combined/picrust2_out_pipeline/KO_predicted.tsv',
    index_col=0)

# Rows without a KO prediction become all-NaN after reindex and are dropped.
asd_ko = ko16S.reindex(rrna_asd.index).dropna()
con_ko = ko16S.reindex(rrna_con.index).dropna()
# NOTE(review): btest (from util) receives (asd, con) but its outputs are
# unpacked as (con, asd) -- confirm this matches btest's return order.
kegg_con_16S, kegg_asd_16S = btest(asd_ko, con_ko)

kegg_asd_16S = kegg_asd_16S.reset_index()
kegg_asd_16S.columns = ['KEGG', 'pvalue']
# The 1e-200 offset keeps the log finite when a p-value is exactly 0.
kegg_asd_16S['-log(pvalue)'] = np.negative(np.log(kegg_asd_16S['pvalue'] + 1e-200))

In [None]:
# Keep only the rows whose p-value passes the cut-off
# (0.001 divided by the number of rows in the table).
# NOTE(review): the cell overwrites kegg_asd_16S, so re-running it recomputes
# the threshold from the already filtered table.
threshold = 0.001 / len(kegg_asd_16S)
passes_cutoff = kegg_asd_16S['pvalue'].lt(threshold)
kegg_asd_16S = kegg_asd_16S.loc[passes_cutoff]

In [None]:
# Display the rows that passed the significance cut-off.
kegg_asd_16S

In [None]:
# Save the significant KO table ('KEGG', 'pvalue', '-log(pvalue)' columns).
kegg_asd_16S.to_csv('../results/16S_KEGG.csv')

# Pathway aggregation

In [None]:
# Build the pathway -> KO table, annotated with each pathway's name and an
# integer id per name.
kegg_dir = '../results/kegg'

pwy2kegg = read_kegg_dict(f'{kegg_dir}/pathway-to-ko.txt', ['Pathway', 'KO'])

pwy_name = pd.read_table(f'{kegg_dir}/pathway_name.txt', header=None)
pwy_name.columns = ['Pathway', 'Name']

pwy2kegg = pd.merge(pwy2kegg, pwy_name, on='Pathway')

# Ids follow the value_counts() order of the names, so 0 is the name with
# the most rows in the table.
names_by_frequency = pwy2kegg['Name'].value_counts().index
lookup = dict(zip(names_by_frequency, range(len(names_by_frequency))))
pwy2kegg['ID'] = pwy2kegg['Name'].apply(lookup.__getitem__)

In [None]:
# Annotate the significant KOs with the pathways that contain them.
#
# This cell re-indexes pwy2kegg by 'Pathway' at its end. On a re-run the
# table is therefore flattened first; otherwise the merge would silently lose
# the 'Pathway' column and set_index would raise KeyError.
pwy2kegg_flat = pwy2kegg.reset_index() if pwy2kegg.index.name == 'Pathway' else pwy2kegg
sig_16S = pd.merge(
    pwy2kegg_flat, kegg_asd_16S, left_on='KO', right_on='KEGG'
).drop_duplicates()
pwy2kegg = pwy2kegg_flat.set_index('Pathway')

In [None]:
# Number of rows in sig_16S per pathway name.
sig_16S['Name'].value_counts()

In [None]:
# NOTE(review): this import belongs in the import cell at the top.
from gneiss.sort import mean_niche_estimator

# Heatmap of -log(pvalue) for the significant KOs (columns) across the `n`
# pathways (rows) that have the most rows in sig_16S.
n = 30
sig_16S_ = pd.pivot(sig_16S, index='Pathway', columns='KO', values='-log(pvalue)').fillna(0)

# focus on most representative pathways
pwys = sig_16S['Pathway'].value_counts().head(n).index
# Look up the name of each selected pathway directly. The labels used to come
# from a separate value_counts() on 'Name', which only lines up with `pwys`
# if both counts happen to sort identically; otherwise rows were mislabelled.
pwy_to_name = sig_16S.drop_duplicates('Pathway').set_index('Pathway')['Name']
pnames = pd.Index(pwy_to_name.loc[pwys].to_numpy())
sig_16S_ = sig_16S_.loc[pwys]
sig_16S_.index = pnames

# Drop KO columns whose summed -log(pvalue) over the selected pathways is
# not above 3 (this includes KOs that occur in none of them).
# NOTE(review): the original comment called this "filter one-off genes", but
# the sum is over scores, not a count of pathways -- confirm the intent.
sig_16S_ = sig_16S_.loc[:, sig_16S_.sum(axis=0) > 3]

# Make it look more pretty: order the KO columns by the value
# mean_niche_estimator returns for each of them, using the pathway ids as
# the gradient.
pwy2name = sig_16S[['Name', 'ID']].set_index('Name').drop_duplicates()
kegg_order = mean_niche_estimator(sig_16S_, pwy2name.loc[sig_16S_.index, 'ID'])
kegg_order = kegg_order.dropna().sort_values()
sig_16S_ = sig_16S_.loc[:, kegg_order.index]

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(sig_16S_, ax=ax)
ax.set_xticks([])
ax.set_xlabel('KEGG IDs', fontsize=14)
ax.set_ylabel('Pathway', fontsize=14)
ax.set_title('16S pathways', fontsize=18)
plt.show()

# Retrieve pathways and compounds

In [None]:
# Attach the compounds of each pathway to the significant KOs and save the
# resulting table.
pwy2compound = read_kegg_dict(f'{kegg_dir}/pathway-to-compound.txt',
                              ['Pathway', 'Compound'])
# This cell overwrites sig_16S. On a re-run the table already carries a
# 'Compound' column, and merging again would create Compound_x / Compound_y
# and multiply the rows. Strip the earlier annotation first so the cell gives
# the same result however often it is run.
if 'Compound' in sig_16S.columns:
    sig_16S = sig_16S.drop(columns='Compound').drop_duplicates()
sig_16S = pd.merge(sig_16S, pwy2compound, on='Pathway')
sig_16S.to_csv('../results/16S_compounds.csv')

In [None]:
# Sanity check: (rows, columns) of sig_16S.
sig_16S.shape