### Creating and publishing a hugging face dataset with references to anndata files. 

Can for example be used to train multimodal models with mmcontext

Use the initial embedder to include some initial embeddings into our anndata object which can then be used later.

In [39]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
import anndata

adata = anndata.read_h5ad("../data/scRNA/raw/bowel_disease.h5ad")

In [75]:
# Delete objects that are not needed and are taking up space
del adata.obsm["X_cellwhisperer_umap"]

In [41]:
# get a subsampke of 1000 random cells
# import numpy as np
# np.random.seed(0)
# adata = adata[np.random.choice(adata.obs.index, 1000, replace=False), :]

In [76]:
adata.layers["counts"] = adata.X.copy()

In [77]:
from adata_hf_datasets.initial_embedder import InitialEmbedder

method = "pca"
embedder = InitialEmbedder(method=method)
embedder.fit(adata)
embedder.embed(adata)

In [78]:
from adata_hf_datasets.utils import split_anndata
import os

train_path = f"../data/scRNA/processed/{method}/train/bowel_disease.h5ad"
test_path = f"../data/scRNA/processed/{method}/test/bowel_disease.h5ad"
os.makedirs(os.path.dirname(train_path), exist_ok=True)
os.makedirs(os.path.dirname(test_path), exist_ok=True)
train_data, test_adata = split_anndata(adata, train_size=0.9)
train_data.write(train_path)
test_adata.write(test_path)

In [79]:
train_remote_path = f"datasets/{method}/train/bowel_disease.h5ad"
test_remote_path = f"datasets/{method}/test/bowel_disease.h5ad"

In [80]:
from dotenv import load_dotenv

load_dotenv(override=True)
nextcloud_config = {
    "url": "https://nxc-fredato.imbi.uni-freiburg.de",
    "username": "NEXTCLOUD_USER",  # env will we obtained within code
    "password": "NEXTCLOUD_PASSWORD",
    "remote_path": train_remote_path,
}

In [81]:
from adata_hf_datasets.adata_ref_ds import AnnDataSetConstructor
from adata_hf_datasets.adata_ref_ds import SimpleCaptionConstructor

# Create caption constructor with desired obs keys
for split, path in zip(["train", "test"], [train_remote_path, test_remote_path]):
    caption_constructor = SimpleCaptionConstructor(obs_keys=["cluster_label"])
    constructor = AnnDataSetConstructor(
        caption_constructor=caption_constructor,
        store_nextcloud=True,
        nextcloud_config=nextcloud_config,
        push_to_hf=True,
        hf_username="jo-mengr",
        dataset_name=f"bowel_disease_{method}_{split}",
    )
    constructor.add_anndata(file_path=train_path)
    # Get dataset
    dataset = constructor.get_dataset()

Directory already exists: datasets
Directory already exists: pca
Directory already exists: train
File is a valid .h5ad file.
Directory already exists: datasets
Directory already exists: pca
Directory already exists: train
File is a valid .h5ad file.
