In [1]:
import biom
from gneiss.balances import sparse_balance_basis
import numpy as np
import pandas as pd
from bp import parse_newick, to_skbio_treenode

In [2]:
def _biom_to_frame(path):
    """Load the BIOM table at ``path`` and return it as a transposed DataFrame."""
    return biom.load_table(path).to_dataframe().T


# Abundance tables, transposed after conversion (later cells index their rows
# by sample / host subject).
micro_table = _biom_to_frame('../data/hcc-data/46119_otu_table.biom')
metab_table = _biom_to_frame('../data/hcc-data/metabolites.biom')

# Sample metadata and per-metabolite annotations; the first column of each
# file becomes the index.
metadata = pd.read_table(
    '../data/sample_information_from_prep_2458.tsv',
    index_col=0,
)
metabolite_md = pd.read_table(
    '../data/hcc-data/metabolite_feature_metadata.txt',
    index_col=0,
)

In [3]:
# Re-key the sample metadata by host subject, then keep only the subjects in
# the metabolite table, in that table's row order.
metadata = (
    metadata
    .reset_index()
    .set_index('host_subject_id')
    .loc[metab_table.index]
)

# Select the microbe rows by sample id, then relabel them by host subject so
# both abundance tables share the same row index.
subject_sample_ids = metadata['sample_id'].values
micro_table = micro_table.loc[subject_sample_ids]
micro_table.index = metadata.index
metab_table = metab_table.loc[metadata.index]

# drop infrequent taxa: keep columns with a value > 0 in at least 10 rows
n_rows_present = (micro_table > 0).sum(axis=0)
micro_table = micro_table.loc[:, n_rows_present >= 10]

In [4]:
# phylogenetic tree for microbes
# NOTE(review): machine-specific absolute path -- this cell only runs where
# the Greengenes 13_5 trees live at exactly this location; consider moving it
# to a shared, configurable data directory.
gg_dir='/Users/mortonjt/Documents/research/databases/gg/gg_13_5_otus/trees'

# Read the Newick text inside a context manager so the file handle is closed
# as soon as parsing is done instead of being left open for the kernel's life.
with open(f'{gg_dir}/97_otus.tree') as tree_fh:
    bp_tree = parse_newick(tree_fh.read())

# Shear the reference tree using the set of retained microbe column names,
# then convert it to a scikit-bio TreeNode.
bp_tree2 = bp_tree.shear(set(micro_table.columns))
sktree = to_skbio_treenode(bp_tree2)
sktree.prune()

# rename internal nodes: every unnamed node gets a unique 'clade<N>' label,
# numbered in level order
i = 0
for n in sktree.levelorder():
    if n.name is None:
        n.name = f'clade{i}'
        i += 1

In [5]:
# Keep only metabolites that have both abundance data and feature metadata.
# The ids are sorted because the iteration order of a set of strings can
# differ between interpreter sessions; an unsorted list would make the column
# order (and anything derived from it downstream) vary from run to run.
common_ids = sorted(set(metab_table.columns) & set(metabolite_md.index))
metab_table = metab_table[common_ids]
metabolite_md = metabolite_md.loc[common_ids]

In [6]:
# The text before the first underscore of each metabolite id is parsed as a
# float and stored in the 'mz' column.
metabolite_md['mz'] = [
    float(feature_id.split('_')[0]) for feature_id in metabolite_md.index
]

In [7]:
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage
from skbio import TreeNode
from gneiss.util import rename_internal_nodes, match_tips

# Build a tree over metabolites from their 'mz' values alone: pairwise
# distances on the single-column array, then average-linkage clustering.
dm = pdist(metabolite_md[['mz']].values)
lm = linkage(dm, method='average')

# Convert the linkage matrix into a TreeNode whose tips are labelled with the
# metabolite ids, then name its internal nodes.
met_tree = rename_internal_nodes(
    TreeNode.from_linkage_matrix(lm, metabolite_md.index)
)

In [8]:
# Align each abundance table with its own tree. match_tips returns a
# (table, tree) pair that replaces the inputs here.
# NOTE(review): presumably the pair is restricted to shared features and the
# table columns are ordered to follow the tree's tips -- confirm against the
# gneiss documentation.
micro_table, sktree = match_tips(micro_table, sktree)
metab_table, met_tree = match_tips(metab_table, met_tree)

Combine trees

In [9]:
# Join the microbe and metabolite trees under a single root named 'Head' and
# place the two abundance tables side by side.
combined_tree = TreeNode()
combined_tree.append(sktree)
combined_tree.append(met_tree)
combined_tree.name = 'Head'
combined_table = pd.concat((micro_table, metab_table), axis=1)

# Each table was aligned only against its own subtree. Concatenating them
# does not by itself guarantee that the combined columns follow the tip order
# of the combined tree, which is what a basis built from that tree is indexed
# by. Re-matching the combined pair lets gneiss impose its own table/tree
# ordering, so the table stays aligned with the basis computed later.
combined_table, combined_tree = match_tips(combined_table, combined_tree)

In [26]:
# (sparse_balance_basis is already imported in the first cell; this import is
# a repeat.)
from gneiss.balances import sparse_balance_basis
# Build the balance basis for the combined tree. `Psi` is the basis matrix and
# `nodes` the labels later used as the balance column names.
Psi, nodes = sparse_balance_basis(combined_tree)

In [29]:
# Sanity check: the table's column count should equal Psi's column count,
# otherwise the projection onto the basis cannot be computed.
combined_table.shape, Psi.shape

((438, 13364), (13363, 13364))

In [32]:
# Project every row of the combined table onto the balance basis.
# NOTE(review): the raw table values are multiplied by the basis with no log
# transform beforehand -- confirm that this is the intended transform.
balance_values = combined_table.values @ Psi.T

# One row per table row, one column per basis node.
combined_ilr = pd.DataFrame(
    data=balance_values,
    index=combined_table.index,
    columns=nodes,
)

In [35]:
# Persist the balances, the combined tree and the re-indexed sample metadata.
# Each writer is paired with its destination and run in the listed order.
output_writers = (
    (combined_ilr.to_csv, '../data/combined/balances.csv'),
    (combined_tree.write, '../data/combined/tree.nwk'),
    (metadata.to_csv, '../data/combined/metadata.csv'),
)
for write_output, destination in output_writers:
    write_output(destination)