<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Phylogenetic-heatmap" data-toc-modified-id="Phylogenetic-heatmap-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Phylogenetic heatmap</a></span></li><li><span><a href="#ILR-transform" data-toc-modified-id="ILR-transform-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>ILR transform</a></span></li></ul></div>

In [1]:
# This block is just for importing the necessary libraries.  
import os
from collections import defaultdict
# Numerical libraries
import pandas as pd
import numpy as np
import biom
import arviz as az
from scipy.spatial.distance import euclidean
from gneiss.balances import sparse_balance_basis
# Plotting libraries
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib_venn import venn2, venn3
# custom utility and plotting functions
from util import (extract_differentials, select_features, 
                  get_genomic_data, collapse_transcripts, 
                  aggregate_pathways,
                  ranking, btest, log_pvalue, read_kegg_dict,
                  ilr_transform_differentials,
                  rename_clades, create_projection,
                  match_all_differentials,
                  match_all_differentials_and_tree)
from plot import (rankplot, networkplot, vectorplot)
import random

# directory paths
# All input locations are relative paths, so they resolve against the
# directory the notebook kernel is started from.
amp_directory = '../sfari/data/sra/Combined'
wgs_directory = '../sfari/data/sra_shotgun/Combined'
rna_directory = '../sfari/data/recount3'
kang_directory = '../sfari/data/sra/Kang2017'
sepp_directory = '../sfari/data/sra/Kang2017/deblur/sepp-v2'
# Result locations; the KEGG directories are derived from results_dir.
results_dir = '../results'
kegg_dir = f'{results_dir}/kegg'
hsa_dir = f'{results_dir}/hsa_kegg'
# Seed both the NumPy and the standard-library random number generators.
np.random.seed(0)
random.seed(0)
%matplotlib inline

In [2]:
# Echo the shotgun data directory configured in the setup cell.
wgs_directory

'../sfari/data/sra_shotgun/Combined'

In [3]:
# Locate the posterior differentials for each data layer:
# 16S amplicon, whole genome shotgun, RNAseq and viral.
posterior_name = 'age_sex_matched_posterior'
amp_fname = f'{amp_directory}/{posterior_name}/amp_differentials-v5.nc'
# Alternative 16S input kept for reference:
#amp_fname = f'{kang_directory}/week0_ogu/differentials-v8.nc'
wgs_fname = f'{wgs_directory}/{posterior_name}/ogus_differentials-v7.nc'
rna_fname = f'{rna_directory}/{posterior_name}/rna_differentials-v4.nc'
vir_fname = f'{wgs_directory}/{posterior_name}/viral_differentials-v5.nc'

# Read the differentials, one layer at a time, in the order listed above.
amp_diffs, wgs_diffs, rna_diffs, vir_diffs = (
    extract_differentials(path)
    for path in (amp_fname, wgs_fname, rna_fname, vir_fname)
)

# Compute the per-layer ranking statistics with the same reference
# percentile for every layer (per the original note, this focuses on
# the top 10% of the features).
amp_stats, wgs_stats, rna_stats, vir_stats = (
    ranking(diffs, reference_percentile=90)
    for diffs in (amp_diffs, wgs_diffs, rna_diffs, vir_diffs)
)

Load the taxonomy tables for the 16S (gg2) and shotgun (wol2) features

In [4]:
# Taxonomy table for the 16S features (presumably Greengenes2, judging by
# the gg2 path), restricted to the features present in amp_stats.
# NOTE(review): both paths in this cell point at a user-specific filesystem;
# consider moving them next to the other directory settings.
fname = '/mnt/home/jmorton/ceph/gg2/2022.8/2022.8.taxonomy.id.tsv'
taxonomy = pd.read_csv(fname, sep='\t', index_col=0)
gg_taxonomy = taxonomy.loc[amp_stats.index]

# Headerless lineage table for the shotgun features, restricted to the
# features present in wgs_stats. Column 1 holds the lineage string.
taxonomy = pd.read_csv('~/ceph/wol2/wol2/taxonomy/lineages.txt',
                       sep='\t', index_col=0, header=None)
wol_taxonomy = taxonomy.loc[wgs_stats.index]

def fix_taxon(x):
    """Split a lineage string into its rank names without rank prefixes.

    Every occurrence of the prefixes ``d__``, ``p__``, ``c__``, ``o__``,
    ``f__``, ``g__`` and ``s__`` is removed (in that order), then the
    remaining string is split on ``'; '``.

    Parameters
    ----------
    x : str
        Lineage string such as ``'d__Bacteria; p__Firmicutes'``.

    Returns
    -------
    list of str
        One entry per ``'; '``-separated field.
    """
    lineage = x
    for rank_letter in 'dpcofgs':
        lineage = lineage.replace(f'{rank_letter}__', '')
    return lineage.split('; ')
# Expand each lineage string into one column per taxonomic rank.
rank_names = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
gg_lineages = gg_taxonomy['Taxon'].apply(fix_taxon)
gg_taxonomy = pd.DataFrame(gg_lineages.tolist(),
                           index=gg_taxonomy.index,
                           columns=rank_names)
wol_lineages = wol_taxonomy[1].apply(fix_taxon)
wol_taxonomy = pd.DataFrame(wol_lineages.tolist(),
                            index=wol_taxonomy.index,
                            columns=rank_names)

# Keep only the part of the 16S phylum name before the first underscore.
gg_taxonomy['phylum'] = gg_taxonomy['phylum'].map(
    lambda name: name.split('_')[0])

# Stack both tables; when a feature id occurs more than once, the first
# occurrence (the 16S table comes first) wins.
taxonomy = pd.concat([gg_taxonomy, wol_taxonomy])
taxonomy = taxonomy.loc[~taxonomy.index.duplicated()]

Combine the 16S amplicon, shotgun metagenomic sequencing (SMS), and viral host information into a single per-feature metadata table

In [5]:
# Align the mean differentials of the 16S and SMS layers on feature id.
# The outer merge keeps features seen in only one layer; because both inputs
# carry a column named 'mean', pandas suffixes them with _x and _y.
otu_md = pd.merge(amp_stats[['mean']], wgs_stats[['mean']],
                  left_index=True, right_index=True, how='outer')
otu_md = otu_md.rename(columns={'mean_x': '16S', 'mean_y': 'SMS'})

# Flag the features that are listed in the 'microbe' column of the
# interaction metadata.
all_edges = pd.read_table('../results/interaction_metadata.txt', index_col=0)
# The builtin str replaces the np.str alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
viral_hosts = set(all_edges['microbe'].unique().astype(str))
otu_md['viral_host'] = [str(feature) in viral_hosts
                        for feature in otu_md.index]
otu_md['viral_host'] = otu_md['viral_host'].astype(np.int64)

# A feature missing from one layer gets a differential of 0 there.
otu_md['16S'] = otu_md['16S'].fillna(0)
otu_md['SMS'] = otu_md['SMS'].fillna(0)

# integrate taxonomy information
otu_md = pd.merge(otu_md, taxonomy, left_index=True, right_index=True, how='left')

# Features without a taxonomy entry get 'Other' in every rank column.
otu_md = otu_md.fillna('Other')
# scale 16S and SMS data
def scale_f(x, limit=4):
    """Clamp ``x`` to ``[-limit, limit]`` and round it to the nearest integer.

    Parameters
    ----------
    x : float
        Value to scale.
    limit : int, optional
        Non-negative bound of the symmetric clamp. Defaults to 4, the value
        that was previously hard-coded.

    Returns
    -------
    number
        ``-limit`` or ``limit`` when ``x`` lies on or beyond a bound,
        otherwise ``np.round(x)`` (which rounds halves to the even integer).
    """
    if x <= -limit:
        return -limit
    if x >= limit:
        return limit
    return np.round(x)
# Clamp and round both differential columns, then store them as integers.
for layer in ('16S', 'SMS'):
    otu_md[layer] = otu_md[layer].apply(scale_f).astype(np.int64)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  viral_hosts = set(all_edges['microbe'].unique().astype(np.str))


In [6]:
# Display the combined per-feature metadata table.
otu_md

Unnamed: 0_level_0,16S,SMS,viral_host,kingdom,phylum,class,order,family,genus,species
features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAVN02000007,2,0,0,Bacteria,Actinobacteriota,Coriobacteriia,Coriobacteriales,Coriobacteriaceae,Collinsella,Collinsella aerofaciens_F
AB004744,0,0,0,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Enterobacter_B,Enterobacter_B asburiae
AB037875,0,0,0,Bacteria,Firmicutes,Clostridia_A,Peptostreptococcales,Anaerovoracaceae,Mogibacterium,Mogibacterium timidum
AB100804,4,0,0,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Lactococcus_A,Lactococcus_A lactis
AB116294,0,0,0,Bacteria,Actinobacteriota,Actinomycetia,Actinomycetales,Bifidobacteriaceae,Bifidobacterium,Bifidobacterium catenulatum
...,...,...,...,...,...,...,...,...,...,...
Y13364,-1,0,0,Bacteria,Firmicutes,Bacilli,Staphylococcales,Gemellaceae,Gemella,
Y17657,-1,0,0,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella_A,Klebsiella_A pneumoniae
Y18175,1,0,0,Bacteria,Firmicutes,Clostridia_A,Clostridiales,Clostridiaceae,Clostridium_T,Clostridium_T sartagoforme_A
Y18176,0,0,0,Bacteria,Firmicutes,Clostridia_A,Clostridiales,Clostridiaceae,Clostridium_T,Clostridium_T disporicum_A


In [7]:
def new_phylum(x):
    """Return ``x`` if it is one of the five retained phyla, else ``'Other'``.

    The retained phyla are Firmicutes, Proteobacteria, Bacteroidota,
    Actinobacteriota and Desulfobacterota.
    """
    retained_phyla = frozenset({
        'Firmicutes',
        'Proteobacteria',
        'Bacteroidota',
        'Actinobacteriota',
        'Desulfobacterota',
    })
    return x if x in retained_phyla else 'Other'
# Relabel every phylum that new_phylum does not retain as 'Other'.
otu_md['phylum'] = otu_md['phylum'].apply(new_phylum)

In [8]:
from bp import parse_newick, to_skbio_treenode, to_skbio_treearray

# NOTE(review): this phylogeny path is from the 2022.7 release while the
# taxonomy loaded earlier is from 2022.8 -- confirm the mismatch is intended.
fname = '/mnt/home/jmorton/ceph/gg2/2022.7/2022.7.phylogeny.id.nwk'
# Read the newick text inside a context manager so the file handle is
# closed as soon as parsing is done.
with open(fname) as newick_file:
    bp_tree = parse_newick(newick_file.read())
# Shear the tree against the feature ids in otu_md, then convert it to a
# scikit-bio TreeNode and prune it.
bp_tree = bp_tree.shear(set(otu_md.index))
tree = to_skbio_treenode(bp_tree)
tree.prune()

In [9]:
# Output location for the tree and the per-feature metadata.
res_dir = '../results/phylogenetic'
# Create the directory up front so the writes below also succeed on a
# fresh checkout where it does not exist yet.
os.makedirs(res_dir, exist_ok=True)
# Name the index so it gets a column header in the tab-separated file.
otu_md.index.name = 'featureid'
tree.write(f'{res_dir}/tree.nwk')
otu_md.to_csv(f'{res_dir}/otu_metadata.txt', sep='\t')

# Phylogenetic heatmap

# ILR transform