In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project's src package) are picked up before each cell runs,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import anndata
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA

# sys.path.append("/home/phil/mixture_embeddings/notebooks/pac")
sys.path.append("/home/phil/mixture_embeddings")
# from util import mixture_embedding
from src.embeddings.mixture_embeddings import (
    get_mixture_embeddings,
    convert_geometry_after,
)


INFO: Using numpy backend


In [3]:
# Select the geomstats backend via the GEOMSTATS_BACKEND environment variable.
# The numpy backend is active; the pytorch line is kept as a disabled alternative.
#
# NOTE(review): the import cell above already printed "INFO: Using numpy backend",
# so geomstats appears to have been imported (and to have chosen its backend)
# before this assignment runs. Presumably this value only affects processes
# started afterwards (e.g. parallel workers) -- confirm, and move this
# assignment above the imports if it is meant to control the kernel process.

# os.environ["GEOMSTATS_BACKEND"] = "pytorch"
os.environ["GEOMSTATS_BACKEND"] = "numpy"


In [4]:
# Load data
#
# The active input is the table that already carries previously computed
# embeddings; the other inputs are kept as disabled alternatives.
# adata = anndata.read_h5ad("data/big_table.h5ad")
# adata = anndata.read_h5ad("data/americangut_embeddings.h5ad")
adata = anndata.read_h5ad("data/big_table_with_embeddings.h5ad")

# Divide X by its per-row sums; the intent is that every row sums to 1
# (verified for the first rows by the check at the end of this cell).
# NOTE(review): a row whose sum is 0 would divide by zero here -- confirm the
# input has no empty rows.
adata.X /= adata.X.sum(axis=1)  # normalize

# DataFrame copy of the normalized table; the embedding functions read this.
adata_df = adata.to_df()

# Truncate (disabled): uncomment to work on the first 1000 rows only.

# adata.X = adata.X.tocsr()
# adata = adata[:1000]

print(adata)
# Reuse adata_df instead of calling adata.to_df() again: it was built from the
# same normalized X just above, so the printed sums are identical, and the
# whole table is not converted to a DataFrame a second time.
print(adata_df.iloc[:10].sum(axis=1))  # check that the data is normalized


AnnData object with n_obs × n_vars = 32608 × 37215
    obs: 'acid_reflux', 'acne_medication', 'acne_medication_otc', 'add_adhd', 'age_cat', 'age_years', 'alcohol_consumption', 'alcohol_frequency', 'alcohol_types_beercider', 'alcohol_types_red_wine', 'alcohol_types_sour_beers', 'alcohol_types_spiritshard_alcohol', 'alcohol_types_unspecified', 'alcohol_types_white_wine', 'allergic_to_i_have_no_food_allergies_that_i_know_of', 'allergic_to_peanuts', 'allergic_to_shellfish', 'allergic_to_tree_nuts', 'allergic_to_unspecified', 'alzheimers', 'animal_age', 'animal_free_text', 'animal_gender', 'animal_origin', 'animal_type', 'animal_type_free_text', 'anonymized_name', 'antibiotic_history', 'appendix_removed', 'artificial_gi_disorder_types_constipation', 'artificial_gi_disorder_types_diarrhea', 'artificial_gi_disorder_types_soft_stools', 'artificial_gi_disorder_types_stomachache', 'artificial_gi_disorder_types_unspecified', 'artificial_gi_disorders', 'artificial_sweeteners', 'artificial_sweetene

In [18]:
# Functions for embeddings

# Euclidean mixture embedding:
def get_euc_mix(ndim):
    """Build Euclidean mixture embeddings of size ``ndim`` for ``adata_df``.

    Reads the per-OTU Euclidean embeddings for this dimensionality from CSV
    (first column parsed as ``str`` and used as the index), selects the rows
    named by ``adata_df.columns`` in that order, and passes both tables to
    ``get_mixture_embeddings``.

    Parameters
    ----------
    ndim : int
        Embedding dimensionality; selects the CSV file and is forwarded as
        ``embedding_size``.

    Returns
    -------
    Whatever ``get_mixture_embeddings`` returns for ``space="euclidean"``
    (callers in this notebook access ``.values`` on it, so presumably a
    DataFrame -- confirm against the helper).
    """
    csv_path = f"/data/phil/otu_embeddings/embeddings_euclidean_{ndim}.csv"
    raw_table = pd.read_csv(csv_path, dtype={0: str})

    # The first CSV column holds the OTU identifiers; index by it so rows can
    # be looked up by the column labels of the OTU table.
    id_column = raw_table.columns[0]
    by_otu = raw_table.set_index(id_column)
    aligned = by_otu.loc[adata_df.columns]

    return get_mixture_embeddings(
        otu_table_df=adata_df,
        otu_embeddings_df=aligned,
        space="euclidean",
        embedding_size=ndim,
        return_percent_converged=False,
        fmean_model=None,
        mode=None,
    )


# PCA:
def get_pca(ndim):
    """Project the normalized table onto its first principal components.

    Parameters
    ----------
    ndim : int
        Requested number of components. It is clamped to the smaller of the
        two axes of ``adata`` because PCA cannot produce more components than
        ``min(n_samples, n_features)``; the previous clamp looked at the
        number of observations only, so a request larger than the number of
        variables would still be rejected by scikit-learn.

    Returns
    -------
    The array returned by ``PCA.fit_transform`` on the densified ``adata.X``,
    one row per observation.
    """
    n_components = min(ndim, *adata.shape)
    pca = PCA(n_components=n_components)
    # toarray() densifies X: the whole matrix is held in memory at once.
    return pca.fit_transform(adata.X.toarray())


# Hyperbolic mixture embedding:
def get_hyp_mix(ndim):
    """Build hyperbolic mixture embeddings of size ``ndim`` for ``adata_df``.

    Loads the per-OTU hyperbolic embeddings for this dimensionality from CSV
    (first column parsed as ``str`` and used as the index), aligns them to
    ``adata_df.columns``, and calls ``get_mixture_embeddings`` with the
    Poincare model and the geomstats mode.

    Parameters
    ----------
    ndim : int
        Embedding dimensionality; selects the CSV file and is forwarded as
        ``embedding_size``.

    Returns
    -------
    Whatever ``get_mixture_embeddings`` returns for these settings (callers
    in this notebook access ``.values`` on it, so presumably a DataFrame --
    confirm against the helper).
    """
    source_csv = f"/data/phil/otu_embeddings/embeddings_hyperbolic_{ndim}.csv"
    table = pd.read_csv(source_csv, dtype={0: str})
    table = table.set_index(table.columns[0])

    settings = dict(
        embedding_size=ndim,
        space="hyperbolic",
        fmean_model="poincare",  # disabled alternative: "hyperboloid"
        mode="geomstats",  # disabled alternative: "manual"
        return_percent_converged=False,
        convert_back=False,
        # presumably "use all available cores" -- confirm in get_mixture_embeddings
        n_jobs=-1,
    )
    return get_mixture_embeddings(
        otu_table_df=adata_df,
        otu_embeddings_df=table.loc[adata_df.columns],
        **settings,
    )


In [6]:
def save_adata():
    """Write the module-level ``adata`` to ``data/big_table_with_embeddings.h5ad``.

    Before writing, ``adata.X`` is converted to CSR and every ``obsm`` entry
    that is a pandas DataFrame (or a subclass of it) is replaced by its
    underlying array via ``.values``.

    NOTE(review): the output path is the same file the load cell reads, so
    calling this overwrites the notebook's input.
    """
    adata.X = adata.X.tocsr()

    # Snapshot the keys first so that reassigning entries cannot interfere
    # with the iteration over the mapping.
    for embedding in list(adata.obsm.keys()):
        value = adata.obsm[embedding]
        # isinstance rather than an exact type comparison, so DataFrame
        # subclasses are converted as well.
        if isinstance(value, pd.DataFrame):
            adata.obsm[embedding] = value.values

    adata.write_h5ad("data/big_table_with_embeddings.h5ad")


# save_adata()


In [19]:
# Compute embeddings for each target dimensionality and store them in
# adata.obsm. Only the hyperbolic (Poincare / geomstats) run for ndim=2 is
# enabled; the commented-out lines are other dimension sweeps and the
# Euclidean, PCA and hyperboloid-converted variants.
# for ndim in [2, 4, 8, 16, 32, 64, 128]:
for ndim in [2]:
    # for ndim in [16, 32, 64, 128]:
    # for ndim in [128]:
    # for ndim in [8]:
    print(ndim)
    # print("\teuc")
    # adata.obsm[f"euc_mix_{ndim}"] = get_euc_mix(ndim).values
    # print("\tpca")
    # adata.obsm[f"pca_{ndim}"] = get_pca(ndim)
    # print("\thyp")
    # .values stores the underlying array rather than the object returned by
    # get_hyp_mix.
    hyp_me = get_hyp_mix(ndim).values
    adata.obsm[f"poi_mix_geomstats_{ndim}"] = hyp_me
    # adata.obsm[f"hyp_mix_{ndim}"] = hyp_me
    # adata.obsm[f"poi_mix_{ndim}"] = np.array(
        # [convert_geometry_after("hyperboloid", "poincare", x) for x in hyp_me]
    # )
    # Saving is disabled inside the loop; nothing is written to disk here.
    # save_adata()


2



[A
[AINFO: Using numpy backend
INFO: Using numpy backend
INFO: Using numpy backend
INFO: Using numpy backend
INFO: Using numpy backend
INFO: Using numpy backend
INFO: Using numpy backend
INFO: Using numpy backend
INFO: Using numpy backend
INFO: Using numpy backend
INFO: Using numpy backend
INFO: Using numpy backend






































































































































































































































































































































































































[A

In [13]:
# Write adata, including its obsm embeddings, to
# data/big_table_with_embeddings.h5ad -- the same path the load cell reads, so
# the input file is overwritten.
# NOTE(review): this cell's execution count (13) is lower than the embedding
# loop's (19); re-run it after the loop if the latest embeddings must be saved.
save_adata()
