### Creating and publishing a hugging face dataset with references to anndata files. 

Can for example be used to train multimodal models with mmcontext

Use the initial embedder to include some initial embeddings into our anndata object which can then be used later.

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from adata_hf_datasets.utils import setup_logging

setup_logging()

<RootLogger root (INFO)>

In [4]:
import anndata

adata = anndata.read_h5ad("../data/scRNA/raw/bowel_disease.h5ad")

In [5]:
# Delete objects that are not needed and are taking up space
del adata.obsm["X_cellwhisperer_umap"]

In [6]:
from adata_hf_datasets.initial_embedder import InitialEmbedder

method = "pca"
dataset_name = f"bowel_disease_{method}"

embedder = InitialEmbedder(method=method)
embedder.fit(adata)
embedder.embed(adata)

2025-02-07 09:39:36,165 - adata_hf_datasets.initial_embedder - INFO - Fitting method 'pca' with embedding_dim=64
2025-02-07 09:39:36,166 - adata_hf_datasets.initial_embedder - INFO - Fitting PCA with 64 components.
2025-02-07 09:39:44,168 - adata_hf_datasets.initial_embedder - INFO - Embedding data using method pca. Storing embeddings in X_pca.


In [29]:
from adata_hf_datasets.utils import split_anndata
import os
from pathlib import Path

project_dir = Path().resolve().parents[0]
train_path = f"{project_dir}/data/scRNA/processed/{method}/train/bowel_disease.h5ad"
test_path = f"{project_dir}/data/scRNA/processed/{method}/test/bowel_disease.h5ad"
os.makedirs(os.path.dirname(train_path), exist_ok=True)
os.makedirs(os.path.dirname(test_path), exist_ok=True)
train_data, test_adata = split_anndata(adata, train_size=0.9)
train_data.write(train_path)
test_adata.write(test_path)

In [30]:
train_remote_path = f"datasets/{method}/train/bowel_disease.h5ad"
test_remote_path = f"datasets/{method}/test/bowel_disease.h5ad"

In [31]:
from dotenv import load_dotenv

load_dotenv(override=True)
nextcloud_config = {
    "url": "https://nxc-fredato.imbi.uni-freiburg.de",
    "username": "NEXTCLOUD_USER",  # env will we obtained within code
    "password": "NEXTCLOUD_PASSWORD",
    "remote_path": train_remote_path,
}

In [33]:
from adata_hf_datasets.adata_ref_ds import AnnDataSetConstructor
from adata_hf_datasets.adata_ref_ds import SimpleCaptionConstructor
from datasets import DatasetDict

hf_dataset = DatasetDict()
# Create caption constructor with desired obs keys
for split, path in zip(["train", "test"], [train_remote_path, test_remote_path]):
    caption_constructor = SimpleCaptionConstructor(obs_keys=["cluster_label"])
    constructor = AnnDataSetConstructor(
        caption_constructor=caption_constructor,
        store_nextcloud=True,
        nextcloud_config=nextcloud_config,
    )
    constructor.add_anndata(file_path=train_path)
    # Get dataset
    dataset = constructor.get_dataset()
    hf_dataset[split] = dataset

2025-02-07 10:08:22,511 - root - INFO - File saved locally at /Users/mengerj/repos/adata_hf_datasets/data/scRNA/processed/pca/train/bowel_disease.h5ad


Directory already exists: datasets
Directory already exists: pca
Directory already exists: train


2025-02-07 10:08:37,247 - root - INFO - File uploaded to Nextcloud at datasets/pca/train/bowel_disease.h5ad with status code 204
2025-02-07 10:08:50,904 - adata_hf_datasets.adata_ref_ds - INFO - Successfully added anndata file: /Users/mengerj/repos/adata_hf_datasets/data/scRNA/processed/pca/train/bowel_disease.h5ad


File is a valid .h5ad file.


... storing 'caption' as categorical
2025-02-07 10:09:10,551 - root - INFO - File saved locally at /Users/mengerj/repos/adata_hf_datasets/data/scRNA/processed/pca/train/bowel_disease.h5ad


Directory already exists: datasets
Directory already exists: pca
Directory already exists: train


2025-02-07 10:09:20,361 - root - INFO - File uploaded to Nextcloud at datasets/pca/train/bowel_disease.h5ad with status code 423
2025-02-07 10:09:33,937 - adata_hf_datasets.adata_ref_ds - INFO - Successfully added anndata file: /Users/mengerj/repos/adata_hf_datasets/data/scRNA/processed/pca/train/bowel_disease.h5ad


File is a valid .h5ad file.


... storing 'caption' as categorical


In [36]:
from adata_hf_datasets.utils import annotate_and_push_dataset

caption_generation = f"""Captions were generated with the SimpleCaptionConstructor class. That means the previosly added annotation from the
                following obs_keys were concatenated: {caption_constructor.obs_keys}."""

embedding_generation = f"""Embeddings were generated with the InitialEmbedder class from the adata_hf_datasets package, with method = {method}, they have 
        {embedder.embedding_dim} dimensions, and are stored in adata.obsm['X_{method}']"""

annotate_and_push_dataset(
    dataset=dataset,
    caption_generation=caption_generation,
    embedding_generation=embedding_generation,
    repo_id=f"jo-mengr/{dataset_name}",
    readme_template_name="cellwhisperer",
)

TypeError: annotate_and_push_dataset() missing 1 required positional argument: 'self'