### Creating and publishing a Hugging Face dataset with references to AnnData files

Such a dataset can be used, for example, to train multimodal models with mmcontext. Each entry of the dataset contains a reference to a sample of an AnnData file,
which can be stored locally or remotely on Nextcloud.

The initial embedder is used to add initial embeddings to the AnnData object, so that they can be used later on.

In [37]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
from adata_hf_datasets.utils import setup_logging

# Configure logging through the project helper. The call is the last
# expression of the cell, so its return value (the root logger, per the
# recorded output) is displayed.
setup_logging()

<RootLogger root (INFO)>

In [39]:
import anndata

# Dataset identifier; it is reused further down for the output directory and
# for the name of the published dataset.
data_name = "cellxgene_pseudo_bulk"

# Location of the raw file, relative to the notebook's working directory.
raw_h5ad_path = f"../data/RNA/raw/{data_name}.h5ad"
adata = anndata.read_h5ad(raw_h5ad_path)

KeyboardInterrupt: 

In [29]:
# Inspect the natural-language annotation column of `adata.obs`
# (one entry per cell_id, as the recorded output shows).
adata.obs["natural_language_annotation"]

cell_id
census_82f6af6d-5313-439a-9936-5e844be49a70_0      Glutamatergic neuron from the prefrontal corte...
census_82f6af6d-5313-439a-9936-5e844be49a70_1      Glutamatergic neuron from the parietal lobe of...
census_82f6af6d-5313-439a-9936-5e844be49a70_2      Nucleus sample from a 7-month-old human glutam...
census_82f6af6d-5313-439a-9936-5e844be49a70_3      Glutamatergic neuron cell type, obtained from ...
census_82f6af6d-5313-439a-9936-5e844be49a70_4      Glutamatergic neuron from the parietal lobe of...
                                                                         ...                        
census_e8681d74-ac9e-4be5-be14-1cf1bbd54dd7_753    Central nervous system macrophage derived from...
census_e8681d74-ac9e-4be5-be14-1cf1bbd54dd7_754    A central nervous system macrophage cell type,...
census_e8681d74-ac9e-4be5-be14-1cf1bbd54dd7_755    Central nervous system macrophage cell type, d...
census_e8681d74-ac9e-4be5-be14-1cf1bbd54dd7_756    Central nervous system macrophag

In [30]:
# Delete objects that are not needed and are taking up space.
# The membership check keeps this cell re-runnable: a plain `del` on a key that
# was already removed by an earlier run of the cell raises KeyError.
replicates_key = "natural_language_annotation_replicates"
if replicates_key in adata.obsm:
    del adata.obsm[replicates_key]
del adata.layers

In [31]:
from adata_hf_datasets.initial_embedder import InitialEmbedder

# Embedding method; also used below in the local output directory and in the
# remote paths.
method = "pca"
# Name under which the dataset is published (it becomes part of the repo id).
dataset_name = f"{data_name}_{method}"

embedder = InitialEmbedder(method=method)
# Fit on the full AnnData object, then embed that same object.
# Per the recorded log the embeddings are stored in X_pca, and the fit took
# roughly 15 minutes on this dataset.
embedder.fit(adata)
embedder.embed(adata)

2025-02-07 14:17:53,352 - adata_hf_datasets.initial_embedder - INFO - Fitting method 'pca' with embedding_dim=64
2025-02-07 14:17:53,353 - adata_hf_datasets.initial_embedder - INFO - Fitting PCA with 64 components.
2025-02-07 14:32:49,058 - adata_hf_datasets.initial_embedder - INFO - Embedding data using method pca. Storing embeddings in X_pca.


In [32]:
from adata_hf_datasets.utils import split_anndata
import os
from pathlib import Path

# Parent of the current working directory; assumes the notebook is started from
# a direct sub-folder of the project root.
project_dir = Path().resolve().parents[0]

# Both splits are written to the same directory, so the directory is built and
# created once instead of once per split.
# NOTE(review): the raw file is read from data/RNA/raw/ but the splits go to
# data/scRNA/processed/ -- confirm the differing top-level folder is intended.
processed_dir = f"{project_dir}/data/scRNA/processed/{method}/{data_name}"
train_path = f"{processed_dir}/train.h5ad"
val_path = f"{processed_dir}/val.h5ad"
os.makedirs(processed_dir, exist_ok=True)

# Split into a training and a validation AnnData object (train_size=0.9).
# NOTE(review): no seed is passed here; confirm split_anndata is deterministic
# if the split has to be reproducible.
train_data, val_adata = split_anndata(adata, train_size=0.9)
train_data.write(train_path)
val_adata.write(val_path)

In [33]:
# Remote target paths for the two splits (assigned to
# nextcloud_config["remote_path"] when the datasets are built).
# The file name is derived from `data_name` rather than the hard-coded
# "bowel_disease.h5ad", which did not match the dataset processed in this
# notebook and would make different datasets share one remote file.
train_remote_path = f"datasets/{method}/train/{data_name}.h5ad"
val_remote_path = f"datasets/{method}/val/{data_name}.h5ad"

In [34]:
from dotenv import load_dotenv

# Read variables from the .env file; override=True lets them replace values
# that are already present in the environment.
load_dotenv(override=True)

# Settings for the Nextcloud upload. "username" and "password" carry the names
# of environment variables; the values are looked up inside the package code.
nextcloud_config = dict(
    url="https://nxc-fredato.imbi.uni-freiburg.de",
    username="NEXTCLOUD_USER",
    password="NEXTCLOUD_PASSWORD",
    remote_path="",  # set per split before the dataset is constructed
)

In [35]:
from adata_hf_datasets.adata_ref_ds import AnnDataSetConstructor
from adata_hf_datasets.adata_ref_ds import SimpleCaptionConstructor
from datasets import DatasetDict

hf_dataset = DatasetDict()

# Explicit table: split name -> (local .h5ad file, remote target path).
# This replaces the former eval(f"{split}_remote_path") lookup, which resolved
# variable names from strings and hid the dependency on the two *_remote_path
# variables.
split_files = {
    "train": (train_path, train_remote_path),
    "val": (val_path, val_remote_path),
}

for split, (path, remote_path) in split_files.items():
    # Create caption constructor with desired obs keys.
    # NOTE(review): captions use "cluster_label", while the column inspected
    # earlier in this notebook is "natural_language_annotation" -- confirm
    # this is the intended key for this dataset.
    caption_constructor = SimpleCaptionConstructor(obs_keys=["cluster_label"])
    # Point the shared config at this split's remote target before it is
    # handed to the constructor.
    nextcloud_config["remote_path"] = remote_path
    constructor = AnnDataSetConstructor(
        caption_constructor=caption_constructor,
        store_nextcloud=True,
        nextcloud_config=nextcloud_config,
    )
    constructor.add_anndata(file_path=path)
    # Get dataset
    dataset = constructor.get_dataset()
    hf_dataset[split] = dataset

2025-02-07 14:36:00,605 - datasets - INFO - PyTorch version 2.6.0 available.
2025-02-07 14:38:57,374 - root - INFO - File saved locally at /Users/mengerj/repos/adata_hf_datasets/data/scRNA/processed/pca/cellxgene_pseudo_bulk/train.h5ad


Directory already exists: datasets
Directory already exists: pca
Directory already exists: train


In [69]:
from adata_hf_datasets.utils import annotate_and_push_dataset

# Free-text description of how the captions were produced; interpolates the
# obs keys of the caption constructor created in the dataset-building cell.
# NOTE(review): the text contains the typo "previosly" and embedded
# newline/indentation whitespace; presumably it ends up in the published
# dataset description -- consider cleaning it up.
caption_generation = f"""Captions were generated with the SimpleCaptionConstructor class. That means the previosly added annotation from the
                following obs_keys were concatenated: {caption_constructor.obs_keys}."""

# Free-text description of how the embeddings were produced (method, dimension
# and the obsm key they are stored under).
embedding_generation = f"""Embeddings were generated with the InitialEmbedder class from the adata_hf_datasets package, with method = {method}, they have 
        {embedder.embedding_dim} dimensions, and are stored in adata.obsm['X_{method}']"""

# Hand the DatasetDict and both descriptions to the project helper, which
# (per the recorded output) uploads the dataset shards to the given repo.
annotate_and_push_dataset(
    dataset=hf_dataset,
    caption_generation=caption_generation,
    embedding_generation=embedding_generation,
    repo_id=f"jo-mengr/{dataset_name}",
    readme_template_name="cellxgene_pseudo_bulk",
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

In [70]:
from datasets import load_dataset

# Load the dataset back from the same repo id it was pushed to
# (presumably as a round-trip check of the upload).
dataset_loaded = load_dataset(f"jo-mengr/{dataset_name}")

train-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [59]:
# Display the splits and features of the re-loaded dataset.
# NOTE(review): the recorded output lists "train"/"test" splits with identical
# row counts, whereas this notebook builds "train"/"val" splits -- the output
# looks stale (see the out-of-order execution counts); re-run to confirm.
dataset_loaded

DatasetDict({
    train: Dataset({
        features: ['anndata_ref', 'caption', 'label'],
        num_rows: 20114
    })
    test: Dataset({
        features: ['anndata_ref', 'caption', 'label'],
        num_rows: 20114
    })
})