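The only purpose of this notebook is to compute $R^2$ values for the posterior predictions of each dataset (16S amplicon, whole genome shotgun, RNAseq, and virome).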

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Datasets" data-toc-modified-id="Datasets-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Datasets</a></span><ul class="toc-item"><li><span><a href="#16S-amplicon-datasets" data-toc-modified-id="16S-amplicon-datasets-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>16S amplicon datasets</a></span></li></ul></li><li><span><a href="#Bioinformatics-pipeline" data-toc-modified-id="Bioinformatics-pipeline-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Bioinformatics pipeline</a></span></li><li><span><a href="#Differential-Ranking-approach" data-toc-modified-id="Differential-Ranking-approach-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Differential Ranking approach</a></span></li></ul></div>

In [1]:
# This block is just for importing the necessary libraries.  
from collections import defaultdict
# Numerical libraries
import pandas as pd
import numpy as np
import biom
import arviz as az
from scipy.spatial.distance import euclidean
# Plotting libraries
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib_venn import venn2, venn3
# custom utility and plotting functions
from util import (extract_differentials, select_features, 
                  get_genomic_data, collapse_transcripts, 
                  aggregate_pathways,
                  ranking, btest, log_pvalue, read_kegg_dict,
                  ilr_transform_differentials,
                  rename_clades, all_feature_ids, 
                  match_all_differentials,
                  create_projection,
                  project_data
                 )
from plot import rankplot, networkplot, barplot_annotate_brackets, vectorplot
# directory paths (relative to the notebook's working directory)
# 16S amplicon data
amp_directory = '../sfari/data/sra/Combined'
# whole genome shotgun data
wgs_directory = '../sfari/data/sra_shotgun/Combined'
# viral data; currently the same location as wgs_directory
vir_directory = '../sfari/data/sra_shotgun/Combined'
# RNAseq data
rna_directory = '../sfari/data/recount3'
# NOTE(review): kang_directory and sepp_directory are not referenced by any
# cell shown in this notebook -- confirm whether they are still needed.
kang_directory = '../sfari/data/sra/Kang2017'
sepp_directory = '../sfari/data/sra/Kang2017/deblur/sepp-v2'

# KEGG result directories (also not referenced by the cells shown here)
kegg_dir = '../results/kegg'
hsa_dir = '../results/hsa_kegg'

%matplotlib inline

In [2]:
# Turn on Dask support in ArviZ and start a distributed client that the
# later cells run against.
from arviz.utils import Dask
from dask.distributed import Client
import dask
# NOTE(review): dask_kwargs are presumably forwarded by ArviZ to
# xarray.apply_ufunc -- confirm against the ArviZ Dask documentation.
Dask.enable_dask(dask_kwargs={"dask": "parallelized", "output_dtypes": [float]})

# Client() is created with default settings; the recorded output shows a
# scheduler on 127.0.0.1, i.e. a cluster local to this machine.
client = Client()
# Bare expression so the notebook displays the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:46247  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 16  Cores: 128  Memory: 0.98 TiB


In [3]:
# Load the fitted posteriors for the 16S amplicon, whole genome shotgun,
# RNAseq and viral datasets.
posterior_name = 'age_sex_matched_posterior'
amp_fname = f'{amp_directory}/{posterior_name}/amp_differentials-v4.nc'
wgs_fname = f'{wgs_directory}/{posterior_name}/ogus_differentials-v4.nc'
rna_fname = f'{rna_directory}/{posterior_name}/rna_differentials-v4.nc'
# Build the viral path from the dedicated vir_directory (as the viral count
# table does) instead of borrowing wgs_directory. Both currently hold the
# same path, so the file that gets opened is unchanged.
vir_fname = f'{vir_directory}/{posterior_name}/viral_differentials-v4.nc'

# Per-group options handed to az.from_netcdf: only the `posterior` group is
# opened in chunks of 100 along `features`.
# NOTE(review): the `posterior_predictive` group used for the R2 computation
# is not chunked here -- confirm that loading it eagerly is intended.
group_kwargs = {'posterior': {'chunks': {'features': 100}}}

amp_posterior = az.from_netcdf(amp_fname, group_kwargs=group_kwargs)
wgs_posterior = az.from_netcdf(wgs_fname, group_kwargs=group_kwargs)
rna_posterior = az.from_netcdf(rna_fname, group_kwargs=group_kwargs)
vir_posterior = az.from_netcdf(vir_fname, group_kwargs=group_kwargs)

In [4]:
# Read the observed count tables (BIOM format) for each dataset. Each name is
# bound directly to the loaded table rather than first to its path string.
amp_data = biom.load_table(f'{amp_directory}/age_sex_matched.biom')
wgs_data = biom.load_table(f'{wgs_directory}/ogus_table.biom')
rna_data = biom.load_table(f'{rna_directory}/table.biom')
vir_data = biom.load_table(f'{vir_directory}/viral_table.biom')

Compute the baseline $R^2$: the score obtained by predicting zero for every entry of each table.

In [5]:
from sklearn.metrics import r2_score


def _flatten_counts(table):
    """Return every entry of a BIOM table as a single flat array."""
    return table.to_dataframe().values.ravel()


# Baseline: score an all-zero prediction against the observed entries of each
# table, so the model R2 values have a reference point.
vir_raveled = _flatten_counts(vir_data)
vir_zeros = np.zeros(vir_raveled.size)
print('virome r2', r2_score(vir_raveled, vir_zeros))

amp_raveled = _flatten_counts(amp_data)
amp_zeros = np.zeros(amp_raveled.size)
print('amplicon r2', r2_score(amp_raveled, amp_zeros))

wgs_raveled = _flatten_counts(wgs_data)
wgs_zeros = np.zeros(wgs_raveled.size)
print('whole genome r2', r2_score(wgs_raveled, wgs_zeros))

rna_raveled = _flatten_counts(rna_data)
rna_zeros = np.zeros(rna_raveled.size)
print('rnaseq r2', r2_score(rna_raveled, rna_zeros))

virome r2 -0.0022478813936464004
amplicon r2 -0.004547557053765194
whole genome r2 -0.004193836069174228
rnaseq r2 -0.0024745415684750416


Compute the $R^2$ of the posterior predictive draws against the observed counts.

In [6]:
# For every dataset, merge the chain and draw dimensions of the posterior
# predictive group into a single `sample` dimension and keep `y_predict`.
amp_pred, wgs_pred, rna_pred, vir_pred = (
    inference_data['posterior_predictive'].stack(sample=("chain", "draw"))['y_predict']
    for inference_data in (amp_posterior, wgs_posterior, rna_posterior, vir_posterior)
)

In [7]:
# Collapse the (features, samples) grid of each prediction into one `feature`
# dimension, then replace missing predictions with 0.
# Note: this rebinds the *_pred names, so the cell is meant to run once after
# the cell that extracts `y_predict`.
amp_pred, wgs_pred, rna_pred, vir_pred = (
    prediction.stack(feature=('features', 'samples')).fillna(0)
    for prediction in (amp_pred, wgs_pred, rna_pred, vir_pred)
)

Match the RNAseq dataset to its predictions by keeping only the features present in the posterior predictive.

In [8]:
# Ids of the features that have posterior predictive draws.
rna_features = set(rna_posterior.posterior_predictive.features.values)


def filter_f(v, i, m):
    """Keep an observation only when its id is one of the modelled features."""
    return i in rna_features


# With biom's default settings, Table.filter modifies rna_data in place; the
# returned table is the cell's displayed output.
# NOTE(review): this only subsets the table. The R2 computation compares
# observed and predicted values by position, so row order is assumed to
# already match the posterior's feature order -- confirm.
rna_data.filter(filter_f, axis='observation')

47992 x 98 <class 'biom.table.Table'> with 3500618 nonzero entries (74% dense)

In [9]:
import xarray as xr
amp_obs = xr.DataArray(amp_data.matrix_data.todense()).chunk(chunks={'dim_0': 100})
wgs_obs = xr.DataArray(wgs_data.matrix_data.todense()).chunk(chunks={'dim_0': 100})
rna_obs = xr.DataArray(rna_data.matrix_data.todense()).chunk(chunks={'dim_0': 100})
vir_obs = xr.DataArray(vir_data.matrix_data.todense()).chunk(chunks={'dim_0': 100})

amp_obs = amp_obs.stack(feature=('dim_0', 'dim_1'))
wgs_obs = wgs_obs.stack(feature=('dim_0', 'dim_1'))
rna_obs = rna_obs.stack(feature=('dim_0', 'dim_1'))
vir_obs = vir_obs.stack(feature=('dim_0', 'dim_1'))

In [10]:
# ArviZ R2 of the virome posterior predictive draws against the observed
# table; `.values` pulls both arrays into memory before scoring.
az.r2_score(vir_obs.values, vir_pred.values)

r2        0.969386
r2_std    0.142991
dtype: float64

In [11]:
# ArviZ R2 of the 16S amplicon posterior predictive draws against the observed
# table; `.values` pulls both arrays into memory before scoring.
az.r2_score(amp_obs.values, amp_pred.values)

r2        0.923055
r2_std    0.217568
dtype: float64

In [12]:
# ArviZ R2 of the whole genome shotgun posterior predictive draws against the
# observed table; `.values` pulls both arrays into memory before scoring.
az.r2_score(wgs_obs.values, wgs_pred.values)

r2        0.986130
r2_std    0.088631
dtype: float64

In [13]:
# ArviZ R2 of the RNAseq posterior predictive draws against the observed table
# (already restricted to the modelled features); `.values` pulls both arrays
# into memory before scoring.
az.r2_score(rna_obs.values, rna_pred.values)

r2        0.992813
r2_std    0.053691
dtype: float64

# Datasets

## 16S amplicon datasets

- Zou2020
- Chen2020
- Fasano2020
- Berding2020
- Fouquier2021
- Cao2021
- Dan2020
- Zurita2019

In [None]:
# Shut down the Dask client created near the top of the notebook once the
# R2 computations are finished.
client.close()

# Bioinformatics pipeline

# Differential Ranking approach