<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Phylogenetic-heatmap" data-toc-modified-id="Phylogenetic-heatmap-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Phylogenetic heatmap</a></span></li><li><span><a href="#ILR-transform" data-toc-modified-id="ILR-transform-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>ILR transform</a></span></li></ul></div>

In [1]:
# This block is just for importing the necessary libraries.  
import os
from collections import defaultdict
# Numerical libraries
import pandas as pd
import numpy as np
import biom
import arviz as az
from scipy.spatial.distance import euclidean
from gneiss.balances import sparse_balance_basis
# Plotting libraries
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib_venn import venn2, venn3
# custom utility and plotting functions
from util import (extract_differentials, select_features, 
                  get_genomic_data, collapse_transcripts, 
                  aggregate_pathways,
                  ranking, btest, log_pvalue, read_kegg_dict,
                  ilr_transform_differentials,
                  rename_clades, create_projection,
                  match_all_differentials,
                  match_all_differentials_and_tree)
from plot import (rankplot, networkplot, vectorplot)
import random

# Seed both the stdlib and the numpy global generators so that any
# stochastic step further down the notebook is repeatable.
random.seed(0)
np.random.seed(0)

# Input data locations
rna_directory = '../sfari/data/recount3'
wgs_directory = '../sfari/data/sra_shotgun/Combined'
amp_directory = '../sfari/data/sra/Combined'
kang_directory = '../sfari/data/sra/Kang2017'
sepp_directory = f'{kang_directory}/deblur/sepp-v2'

# Result locations, all rooted at `results_dir`
results_dir = '../results'
hsa_dir, kegg_dir = f'{results_dir}/hsa_kegg', f'{results_dir}/kegg'
%matplotlib inline

In [2]:
# Locate the differentials files for the 16S, whole genome shotgun
# (microbial and viral) and RNAseq layers.
posterior_name = 'age_sex_matched_posterior'
# Earlier 16S input, kept for reference:
#amp_fname = f'{amp_directory}/{posterior_name}/amp_differentials-v4.nc'
amp_fname = f'{kang_directory}/week0_ogu/differentials-v8.nc'
wgs_fname = f'{wgs_directory}/{posterior_name}/ogus_differentials-v5.nc'
rna_fname = f'{rna_directory}/{posterior_name}/rna_differentials-v4.nc'
vir_fname = f'{wgs_directory}/{posterior_name}/viral_differentials-v5.nc'

# Read every layer, in the same order as the file names above.
amp_diffs, wgs_diffs, rna_diffs, vir_diffs = map(
    extract_differentials,
    (amp_fname, wgs_fname, rna_fname, vir_fname),
)

# Rank the features of each layer with an identical setting.
# NOTE(review): reference_percentile=90 is presumably what restricts the
# focus to the top 10% of features -- confirm against util.ranking.
amp_stats, wgs_stats, rna_stats, vir_stats = (
    ranking(layer_diffs, reference_percentile=90)
    for layer_diffs in (amp_diffs, wgs_diffs, rna_diffs, vir_diffs)
)

Load taxonomy

In [3]:
# Two-column identifier map, read without a header and kept as strings.
taxid = pd.read_table('~/databases/wol/taxonomy/taxid.map', header=None, dtype=str)
taxid.columns = ['GOTU', 'genome']
# Rank table, indexed by its first column.
taxonomy = pd.read_table('~/databases/wol/taxonomy/ranks.tsv', index_col=0)
# Attach the ranks to each identifier pair (matching `GOTU` against the
# rank table index), then key the result by the `genome` column.
mapping = (
    taxid
    .merge(taxonomy, left_on='GOTU', right_index=True)
    .set_index('genome')
)

Combine 16S, shotgun metagenomic sequencing (SMS) and viral host information into a single readout

In [4]:
# One row per feature: the `mean` column of the 16S and shotgun (SMS) rankings.
otu_md = pd.DataFrame({'16S': amp_stats['mean'], 'SMS': wgs_stats['mean']})
all_edges = pd.read_table('../results/interaction_metadata.txt', index_col=0)
# Use the builtin `str`: `np.str` was only an alias for it, deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
viral_hosts = set(all_edges['microbe'].unique().astype(str))
# Flag (1/0) the features whose id, compared as a string, appears in the
# `microbe` column of the interaction table.
otu_md['viral_host'] = [int(str(fid) in viral_hosts) for fid in otu_md.index]
otu_md['viral_host'] = otu_md['viral_host'].astype(np.int64)
# A feature missing from one of the two layers gets a value of 0 there.
otu_md['16S'] = otu_md['16S'].fillna(0)
otu_md['SMS'] = otu_md['SMS'].fillna(0)

# integrate taxonomy information
# Left join keeps every feature, including those without a taxonomy entry.
otu_md = pd.merge(otu_md, mapping, left_index=True, right_index=True, how='left')
otu_md = otu_md.reset_index().set_index('GOTU')

# fill in the nans
# Every remaining missing column value is labelled 'Other'.
otu_md = otu_md.fillna('Other')

# scale 16S and SMS data
def scale_f(x):
    """Clamp a value to [-4, 4] and round anything in between.

    Parameters
    ----------
    x : number
        Value to rescale.

    Returns
    -------
    number
        ``4`` when ``x >= 4``, ``-4`` when ``x <= -4``, otherwise
        ``np.round(x)``.
    """
    limit = 4
    if x >= limit:
        return limit
    if x <= -limit:
        return -limit
    return np.round(x)
# Rescale both layers with `scale_f` and store the result as 64-bit integers.
for layer in ('16S', 'SMS'):
    otu_md[layer] = otu_md[layer].apply(scale_f).astype(np.int64)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  viral_hosts = set(all_edges['microbe'].unique().astype(np.str))


In [5]:
def new_phylum(x):
    """Keep one of four named phyla and relabel anything else as 'Other'.

    Parameters
    ----------
    x : str
        Phylum label.

    Returns
    -------
    str
        ``x`` unchanged when it is exactly 'Firmicutes', 'Proteobacteria',
        'Bacteroidetes' or 'Actinobacteria' (case-sensitive); 'Other'
        otherwise.
    """
    kept_phyla = frozenset(
        ('Firmicutes', 'Proteobacteria', 'Bacteroidetes', 'Actinobacteria')
    )
    return x if x in kept_phyla else 'Other'
# Relabel the phylum column value by value using `new_phylum`.
otu_md['phylum'] = otu_md['phylum'].map(new_phylum)

In [6]:
from skbio import TreeNode
# NOTE(review): absolute, user-specific path; the taxonomy files above are
# read from '~/databases/wol/...'. Confirm whether this should follow suit.
tree_fname = '/mnt/home/jmorton/databases/wol/trees/tree.nwk'
tree = TreeNode.read(tree_fname)

# take intersection over tree tips and metadata file
tids = {n.name for n in tree.tips()}
tids = tids & set(otu_md.index)
# Filter with a boolean mask instead of `.loc[<set>]`: a set indexer is
# deprecated since pandas 1.5 and raises TypeError from pandas 2.0, and its
# arbitrary iteration order made the row order differ between runs. The mask
# keeps the rows in their existing order.
otu_md = otu_md.loc[otu_md.index.isin(tids)]
# Prune the tree down to the tips that still have a metadata row.
tree = tree.shear(otu_md.index)

In [7]:
res_dir = '../results/phylogenetic'
# Create the output directory when it does not exist yet, so the writes
# below do not fail on a fresh checkout.
os.makedirs(res_dir, exist_ok=True)
# Name the index so the first column of the metadata table is `featureid`.
otu_md.index.name = 'featureid'
# Write the tree and its matching tab-separated feature metadata.
tree.write(f'{res_dir}/tree.nwk')
otu_md.to_csv(f'{res_dir}/otu_metadata.txt', sep='\t')

# Phylogenetic heatmap

# ILR transform