<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Pathway-analysis" data-toc-modified-id="Pathway-analysis-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Pathway analysis</a></span></li><li><span><a href="#Pathway-aggregation" data-toc-modified-id="Pathway-aggregation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pathway aggregation</a></span></li><li><span><a href="#Pathway-class-level" data-toc-modified-id="Pathway-class-level-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Pathway class level</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import biom
import arviz as az
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
from scipy.spatial.distance import euclidean
from scipy.stats import ttest_1samp
import seaborn as sns

from util import ranking, read_kegg_dict

%matplotlib inline

In [2]:
fname = '../sfari/data/recount3/age_sex_matched_posterior/differential_posterior.nc'
posterior = az.from_netcdf(fname)

name = 'RNAseq_combined'
x = posterior['posterior']['diff'].to_dataframe().reset_index().pivot(
    index='features', columns=['chain', 'draw'], values='diff')

lr = ranking(x)

In [3]:
genes = defaultdict()
plt.style.use('dark_background')
lr = lr.sort_values('effect_size')
threshold = 1e-3 / len(lr)
cond = np.logical_and(lr['pvalue'] < threshold, lr['effect_size'] > 0)
genes[name] = lr.loc[cond]  # top ASD microbes
genes[name] = genes[name].sort_values('effect_size')
idx = np.arange(len(lr))
lr = lr.sort_values('mean')
cmap = matplotlib.cm.get_cmap('seismic')
# norm = matplotlib.colors.Normalize(vmin=min(lr['prob_lr']), vmax=max(lr['prob_lr']))
norm = matplotlib.colors.TwoSlopeNorm(
    vmin=min(lr['prob_lr']), vcenter=0., vmax=max(lr['prob_lr']))
lr['index'] = idx
plt.fill_between(lr['index'], lr['5%'], lr['95%'], fc='b')
idx = np.logical_and(lr['prob_lr'] > 0, lr['effect_size'] > 0)
i = np.logical_and(lr['pvalue'] < threshold, idx)
plt.fill_between(lr.loc[i, 'index'], lr.loc[i, '5%'], lr.loc[i, '95%'], fc='r')
plt.plot(lr['index'], lr['mean'], c='c')
plt.xlabel('Transcripts', fontsize=18)
plt.xticks([])
plt.title(name, fontsize=18)
plt.ylabel('log(ASD/Control) + K', labelpad=90, rotation=0, fontsize=14)
#plt.colorbar(cbar)
plt.show()
plt.tight_layout()

KeyError: 'prob_lr'

In [None]:
from q2_matchmaker._stats import hotelling_ttest, spherical_test
spherical_test(x.values.T)

In [None]:
fname = '/mnt/home/jmorton/ceph/sfari/data/recount3/ensembl2kegg.txt'
ensembl2ko = pd.read_table(fname, index_col=0)

In [None]:
open('../sfari/data/recount3/ensembl_ids.txt', 'w').write('\n'.join(list(x.index)))

# Pathway analysis

In [None]:
idx1 = lr['pvalue'] < 0.001 / len(lr)
idx2 = np.logical_and(lr['prob_lr'] > 0, lr['effect_size'] > 0)
idx3 = np.logical_and(lr['prob_lr'] < 0, lr['effect_size'] < 0)

rna_asd = lr.loc[np.logical_and(idx1, idx2)]
rna_con = lr.loc[np.logical_and(idx1, idx3)]

In [None]:
fname = '/mnt/home/jmorton/ceph/sfari/data/recount3/ensembl2kegg.txt'
ensembl2ko = pd.read_table(fname, index_col=0)
ensembl2ko = ensembl2ko.set_index('gene_stable_id')

In [None]:
idx = list(map(lambda x: x.split('.')[0], rna_asd.index))
rna_asd = rna_asd.reset_index()
rna_asd.index = idx

In [None]:
kegg_asd_RNA = pd.merge(ensembl2ko, rna_asd, left_index=True, right_index=True)
kegg_asd_RNA = kegg_asd_RNA.rename(columns={'KO': 'KEGG'})
kegg_asd_RNA['HSA'] = kegg_asd_RNA['KEGG'].apply(lambda x: x.split(':')[-1])
kegg_asd_RNA['-log(pvalue)'] = -np.log(kegg_asd_RNA['pvalue'] + 1e-200)

kegg_asd_RNA.to_csv('../results/RNA_KEGG.csv')
kegg_asd_RNA['KEGG'].drop_duplicates().to_csv('../results/hsa_kegg.csv', index=None, header=None)

# Pathway aggregation

In [None]:
kegg_dir = '../results/hsa_kegg'

pwy2kegg = read_kegg_dict(f'{kegg_dir}/ko-to-pathway.txt', 
                          ['HSA', 'Pathway'])

pwy_name = pd.read_table(f'{kegg_dir}/pathway_name.txt', header=None)
pwy_name.columns = ['Pathway', 'Name']
pwy2kegg = pd.merge(pwy2kegg, pwy_name, left_on='Pathway', right_on='Pathway')
lookup = {d: i for i, d in enumerate(pwy2kegg['Name'].value_counts().index)}
pwy2kegg['ID'] = pwy2kegg['Name'].apply(lambda x: lookup[x])

In [None]:
sig_RNA = kegg_asd_RNA
sig_RNA = pd.merge(pwy2kegg, sig_RNA, left_on='HSA', right_on='HSA').drop_duplicates()
pwy2kegg = pwy2kegg.set_index('Pathway')

In [None]:
kegg_asd_RNA

In [None]:
pwy2kegg

In [None]:
sig_RNA['Name'].value_counts()

In [None]:
sig_RNA

# Pathway class level

In [None]:
pwy2compound = read_kegg_dict(f'{kegg_dir}/pathway-to-compound.txt', 
                              ['Pathway', 'Compound'])
sig_RNA = pd.merge(sig_RNA, pwy2compound, left_on='Pathway', right_on='Pathway')
sig_RNA.to_csv('../results/RNA_compounds.csv')

In [None]:
sig_RNA