From bb1984b1821b971af8a54efc087e60b1ee340ac2 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Wed, 11 Jun 2025 12:15:41 +0000 Subject: [PATCH 1/3] Add conversion to TileDB SOMA --- .../config.vsh.yaml | 157 +++++ .../from_h5mu_or_h5ad_to_tiledb/script.py | 614 ++++++++++++++++++ .../from_h5mu_or_h5ad_to_tiledb/test.py | 447 +++++++++++++ 3 files changed, 1218 insertions(+) create mode 100644 src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml create mode 100644 src/convert/from_h5mu_or_h5ad_to_tiledb/script.py create mode 100644 src/convert/from_h5mu_or_h5ad_to_tiledb/test.py diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml b/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml new file mode 100644 index 00000000000..b90e133a20f --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml @@ -0,0 +1,157 @@ +name: "from_h5mu_or_h5ad_to_tiledb" +namespace: "convert" +scope: "public" +description: | + Convert a MuData or AnnData object to tiledb +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +argument_groups: + - name: "Input" + arguments: + - name: --input + description: | + Input AnnData or MuData file. When an AnnData file is provided, it is automatically assumed to + contain transcriptome counts. + type: file + required: true + example: "input.h5mu" + direction: input + - name: "RNA modality" + arguments: + - name: --rna_modality + type: string + default: rna + description: | + The name used for the RNA modality. Used when input file is a MuData object. + - name: --rna_raw_layer_input + type: string + required: true + example: X + description: | + Location of the layer containing the raw transcriptome counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + - name: --rna_normalized_layer_input + type: string + required: true + example: log_normalized + description: | + Location of the layer containing the normalized counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + - name: --rna_var_gene_names_input + type: string + required: true + example: "gene_symbol" + description: | + Column in .var that provides the gene names. If not specified, the index from the input is used. + + - name: "Protein modality" + arguments: + - name: --prot_modality + description: | + The name used for the protein modality. Used when input file is a MuData object. + When not specified, the protein modality will not be processed. + type: string + required: false + example: prot + - name: --prot_raw_layer_input + type: string + example: X + description: | + Location of the layer containing the raw protein counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + - name: --prot_normalized_layer_input + type: string + example: clr + description: | + Location of the layer containing the normalized counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + + - name: "Output slots" + arguments: + - name: "--rna_modality_output" + type: string + default: "rna" + description: | + TileDB Measurement name where the RNA modality will be stored. + - name: "--prot_modality_output" + type: string + default: "prot" + description: | + Name of the TileDB Measurement where the protein modality will be stored. 
+ - name: "--obs_index_name_output" + description: | + Name of the index that is used to describe the cells (observations). + type: string + default: cell_id + - name: --rna_var_index_name_output + description: | + Output name of the index that is used to describe the genes. + type: string + default: rna_index + - name: --rna_raw_layer_output + description: | + Output location for the raw transcriptomics counts. + type: string + default: "X" + - name: --rna_normalized_layer_output + type: string + default: "log_normalized" + description: | + Output location for the normalized RNA counts. + - name: --rna_var_gene_names_output + type: string + default: "gene_symbol" + description: | + Name of the .var column that specifies the gene games. + - name: --prot_var_index_name_output + description: | + Output name of the index that is used to describe the proteins. + type: string + default: prot_index + - name: --prot_raw_layer_output + type: string + default: "X" + description: | + Output location for the raw protein counts. + - name: --prot_normalized_layer_output + type: string + default: "log_normalized" + description: | + Output location for the normalized protein counts. + + - name: "Output arguments" + arguments: + - name: "--tiledb_dir" + type: file + direction: output + description: | + Directory where the TileDB output will be written to. + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - tiledbsoma + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, singlecpu] \ No newline at end of file diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/script.py b/src/convert/from_h5mu_or_h5ad_to_tiledb/script.py new file mode 100644 index 00000000000..9650157cfd0 --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/script.py @@ -0,0 +1,614 @@ +import sys +import anndata +import numpy as np +import tiledbsoma + +# This import seems redundant, but it is required! 
+import tiledbsoma.io
+import pandas as pd
+from functools import singledispatch
+from collections.abc import Mapping
+import h5py
+from shutil import copy2, rmtree
+from collections.abc import Callable
+from functools import partial
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+import pyarrow as pa
+from collections import defaultdict
+
+anndata.settings.allow_write_nullable_strings = True
+
+
+## VIASH START
+par = {
+    "input": "/home/di/code/openpipelines-multisample/pmbc_process_samples.h5mu",
+    # modality names
+    "rna_modality": "rna",
+    "rna_modality_output": "rna",
+    "prot_modality": "prot",
+    "prot_modality_output": "prot",
+    # rna var
+    "rna_var_gene_names_input": "gene_symbol",
+    "rna_var_gene_names_output": "gene_foo",
+    "rna_var_index_name_output": "rna_index",
+    # rna layers
+    "rna_raw_layer_input": "X",
+    "rna_raw_layer_output": "X",
+    "rna_normalized_layer_input": "log_normalized",
+    "rna_normalized_layer_output": "log_normalized",
+    # prot layers
+    "prot_raw_layer_input": "X",
+    "prot_raw_layer_output": "X",
+    "prot_normalized_layer_input": "clr",
+    "prot_normalized_layer_output": "log_normalized",
+    # prot var
+    "prot_var_index_name_output": "prot_index",
+    # observation index name
+    "obs_index_name_output": "cell_id",
+    # TileDB output directory
+    "tiledb_dir": "output",
+}
+
+
+meta = {
+    "name": "from_h5mu_or_h5ad_to_tiledb",
+    "resources_dir": "src/utils/",
+    "temp_dir": "/tmp",
+}
+## VIASH END
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+
+logger = setup_logger()
+
+LAYER_ARGUMENTS = (
+    "rna_raw_layer_input",
+    "rna_raw_layer_output",
+    "rna_normalized_layer_input",
+    "rna_normalized_layer_output",
+    "prot_raw_layer_input",
+    "prot_raw_layer_output",
+    "prot_normalized_layer_input",
+    "prot_normalized_layer_output",
+)
+
+
+def _to_pandas_nullable(table_or_df: pa.Table | pd.DataFrame) -> pd.DataFrame:
+    """
+    Convert pyarrow Table or pandas dataframe to pandas dataframe that uses nullable data types.
+
+    This function takes the following into account:
+    1. AnnData supports writing BooleanDtype, IntegerDtype and StringDtype (which use pd.NA)
+    2. AnnData does *not* support FloatingDtype
+    3. AnnData does support numpy-style floating and integer dtypes (which use np.nan).
+    4. TileDB SOMA's `update_obs` does not allow casting floats to integers (potential information loss).
+    5. When pandas's `convert_dtypes` convert_integer argument is set to True,
+       floats will be converted to integer if they can be 'faithfully' casted.
+
+    Given the previous constraints, it is *not* possible to only convert integers using `convert_dtypes` because
+    it might cause some float columns to be converted to integers (which is incompatible with tiledbsoma).
+    Therefore, this function will leave integers and floats as-is.
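+    For example, an Arrow `large_string` column is returned as pandas' nullable StringDtype and
+    boolean columns become BooleanDtype (both using pd.NA), while integer and float columns
+    keep their numpy dtypes (missing float values remain np.nan).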
+
+    Based on the conversion provided by this function, a suitable na representation (pd.NA or np.nan) for a column can
+    be chosen using `pd.api.types.is_extension_array_dtype(column_dtype)`.
+    """
+    try:
+        df = table_or_df.to_pandas(
+            types_mapper=defaultdict(None, {pa.large_string(): pd.StringDtype()}).get
+        )
+    except AttributeError:
+        df = table_or_df.convert_dtypes(
+            infer_objects=False,
+            convert_string=True,
+            convert_integer=False,
+            convert_boolean=False,
+            convert_floating=False,
+        )
+    return df.convert_dtypes(
+        infer_objects=False,
+        convert_string=False,
+        convert_integer=False,
+        convert_boolean=True,
+        convert_floating=False,
+    )
+
+
+def _get_temp_h5ad():
+    return Path(NamedTemporaryFile(suffix=".h5ad", dir=meta["temp_dir"]).name)
+
+
+def _h5mu_to_h5ad(h5mu_path, modality_name):
+    """
+    Create an anndata file from a modality in a mudata file by
+    copying over the relevant h5 objects.
+    """
+    result = _get_temp_h5ad()
+    with h5py.File(h5mu_path, "r") as open_anndata:
+        with h5py.File(result, "w") as file_to_write:
+            for h5_key in open_anndata["mod"][modality_name].keys():
+                open_anndata.copy(
+                    source=f"/mod/{modality_name}/{h5_key}", dest=file_to_write
+                )
+    return result
+
+
+def _handle_layer_location(layer_name):
+    """
+    Handle the special case where 'X' is specified as layer:
+    this layer is not present in `/layers/X` but in `/X`.
+    """
+    prefix = "layers/" if layer_name != "X" else ""
+    return f"{prefix}{layer_name}"
+
+
+def _check_input_args(par, input, is_mudata):
+    """
+    Check the input arguments:
+    * Arguments for protein modalities are not allowed when the input is not a MuData file.
+    * Arguments for input layers may not point to the same layer
+    * Output locations for layers may not be the same
+    * Input modalities must not be the same
+    * Input modalities must exist
+    """
+    if not is_mudata:
+        for par_name in (
+            "prot_modality",
+            "prot_raw_layer_input",
+            "prot_normalized_layer_input",
+        ):
+            if par[par_name]:
+                raise ValueError(
+                    f"'{par_name}' can not be used when the input is not a MuData file."
+                )
+
+    if par["prot_modality"]:
+        for prot_arg in ("prot_raw_layer_input", "prot_raw_layer_output"):
+            if not par[prot_arg]:
+                raise ValueError(
+                    f"When providing 'prot_modality', '{prot_arg}' must also be set."
+                )
+
+    NOT_SAME_VALUE = (
+        ("rna_raw_layer_input", "rna_normalized_layer_input"),
+        ("prot_raw_layer_input", "prot_normalized_layer_input"),
+        ("rna_raw_layer_output", "rna_normalized_layer_output"),
+        ("prot_raw_layer_output", "prot_normalized_layer_output"),
+        ("rna_modality", "prot_modality"),
+    )
+    for layer1_arg, layer2_arg in NOT_SAME_VALUE:
+        arg1_val, arg2_val = par[layer1_arg], par[layer2_arg]
+        if (arg1_val == arg2_val) and arg1_val is not None:
+            raise ValueError(
+                f"The value for argument '{layer1_arg}' ({arg1_val}) must not be the same as for argument '{layer2_arg}' ({arg2_val})"
+            )
+
+    with h5py.File(input, "r") as open_h5:
+        if "mod" in open_h5:
+            available_modalities = tuple(open_h5["mod"].keys())
+            for mod in (par["rna_modality"], par["prot_modality"]):
+                if mod is not None and f"/mod/{mod}" not in open_h5:
+                    raise ValueError(
+                        f"Modality '{mod}' was not found in object, available modalities: {available_modalities}"
+                    )
+
+
+def _copy_layer(source: str, dest: str, input_h5ad: Path, mod_obj: h5py.Group):
+    """
+    Copy a h5py object from one location (source) to a new location (dest).
+    The source location must be located in the input AnnData file, the destination
+    will be written to a h5py Group object.
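+    Both `source` and `dest` are expected in the form produced by `_handle_layer_location`,
+    i.e. either "X" or "layers/<layer name>".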
+ """ + logger.info("Copying layer '%s' from '%s' to '%s'", source, input_h5ad, dest) + if source == f"{mod_obj.name}/{dest}": + logger.info("Layer is already at the correct location. Nothing to do.") + return + if not input_h5ad.is_file(): + raise FileNotFoundError(f"Could not link to {input_h5ad}: file does not exist.") + with h5py.File(input_h5ad) as open_input: + if source not in open_input: + raise ValueError(f"Layer {source} was not found in {input_h5ad}.") + if dest in mod_obj: + logger.info(f"{dest} is already present. Removing before creating link.") + del mod_obj[dest] + # ExternalLink could have been used instead of 'copy', but it does not seem to work with tileDB + with h5py.File(str(input_h5ad), "r") as open_input: + open_input.copy( + source=open_input[source], dest=mod_obj, name=dest, expand_external=True + ) + logger.info("Copying complete.") + + +def _copy_column(src, dest, _, df_h5_obj): + """ + Duplicate a column in a dataframe and give a new name. + The input dataframe must be h5py object that is encoded by AnnData to store + a pandas dataframe (e.g. .obs and .var). + """ + logger.info("Copying column '%s' to '%s' for '%s'.", src, dest, df_h5_obj.name) + if src == dest: + logger.info("Source and destination for the column are the same, not copying.") + return None + if dest in df_h5_obj: + raise ValueError(f"Column {dest} already exists in {df_h5_obj.name}.") + df_h5_obj[dest] = src + df_h5_obj.attrs["column-order"] = np.append(df_h5_obj.attrs["column-order"], dest) + + +def _set_index_name(name, _, df_h5_obj): + """ + Set the index for a dataframe to an existing column. + The input dataframe must be h5py object that is encoded by AnnData to store + a pandas dataframe (e.g. .obs and .var). + """ + logger.info("Setting index name of '%s' to '%s'.", df_h5_obj.name, name) + original_index_name = df_h5_obj.attrs["_index"] + df_h5_obj.move(original_index_name, name) + df_h5_obj.attrs["_index"] = name + logger.info("Done setting index name") + + +def _delete_all_keys(_, group_obj, exception=None): + """ + Delete all keys from a h5py.Group object; which optional exceptions. + """ + if exception is None: + exception = () + logger.info( + "Deleting all keys from '%s'%s.", + group_obj.name, + f" except {', '.join(exception)}" if exception else "", + ) + keys_to_delete = set(group_obj.keys()) - set(exception) + if not keys_to_delete: + logger.info("No keys to delete.") + return + for key in keys_to_delete: + del group_obj[key] + logger.info("Done deleting keys %s.", ", ".join(keys_to_delete)) + + +def _set_index_to_string_dtype(_, df_obj): + """ + Change the index dataype for a dataframe to string when it is encoded as categorical. + The dataframe must be provided as a h5py Group object that has been encoded by AnnData. 
+ """ + logger.info( + "Checking if index of object '%s' is encoded as a categorical.", df_obj.name + ) + index_col_name = df_obj.attrs["_index"] + index_col = df_obj[index_col_name] + logger.info("Column '%s' is being used as the index.", index_col_name) + index_col_dtype = index_col.attrs["encoding-type"] + if index_col_dtype == "categorical": + logger.info( + "Column '%s' is encoded as categorical, converting to string.", + index_col_name, + ) + is_ordered = bool(index_col.attrs.get("ordered", False)) + codes, categories = index_col["codes"], index_col["categories"] + column = pd.Categorical.from_codes(codes, categories, is_ordered) + column_str = column.astype(str).astype(object) + del df_obj[index_col_name] + df_obj.create_dataset(index_col_name, data=column_str) + index_col.attrs["encoding-type"] = "string-array" + logger.info("Conversion to string dtype complete.") + return + logger.info( + "Column '%s' not encoded as categorical. Leaving as is.", index_col_name + ) + + +def _get_rna_conversion_specification(par): + rna_spec = [ + partial(_copy_layer, par["rna_raw_layer_input"], par["rna_raw_layer_output"]), + partial( + _copy_layer, + par["rna_normalized_layer_input"], + par["rna_normalized_layer_output"], + ), + { + "uns": _delete_all_keys, + "obsp": _delete_all_keys, + "var": [ + partial( + _copy_column, + par["rna_var_gene_names_input"], + par["rna_var_gene_names_output"], + ), + partial(_set_index_name, par["rna_var_index_name_output"]), + _set_index_to_string_dtype, + ], + "obs": partial(_set_index_name, par["obs_index_name_output"]), + "layers": partial( + _delete_all_keys, + exception=( + par["rna_raw_layer_output"].removeprefix("layers/"), + par["rna_normalized_layer_output"].removeprefix("layers/"), + ), + ), + }, + ] + return rna_spec + + +def _get_prot_conversion_specification(par): + prot_spec = [ + partial(_copy_layer, par["prot_raw_layer_input"], par["prot_raw_layer_output"]), + partial( + _copy_layer, + par["prot_normalized_layer_input"], + par["prot_normalized_layer_output"], + ), + { + "var": [ + partial(_set_index_name, par["prot_var_index_name_output"]), + _set_index_to_string_dtype, + ], + "uns": _delete_all_keys, + "obsp": _delete_all_keys, + # "varm": _delete_all_keys, + "layers": partial( + _delete_all_keys, + exception=( + par["prot_raw_layer_output"].removeprefix("layers/"), + par["prot_normalized_layer_output"].removeprefix("layers/"), + ), + ), + "obs": partial(_set_index_name, par["obs_index_name_output"]), + }, + ] + + return prot_spec + + +def _copy_anndata_and_convert_inplace(anndata_path, conversion_specification): + converted_anndata = _get_temp_h5ad() + copy2(anndata_path, converted_anndata) + with h5py.File(converted_anndata, "r+") as open_prot_h5: + apply_conversion(conversion_specification, anndata_path, open_prot_h5) + return converted_anndata + + +@singledispatch +def apply_conversion(conversion_specification, input_h5ad, open_h5): + raise NotImplementedError( + f"Error: function 'apply_conversion' is not implemented for type '{type(conversion_specification)}'" + ) + + +@apply_conversion.register +def _(conversion_specification: list, input_h5ad, open_h5): + for item in conversion_specification: + apply_conversion(item, input_h5ad, open_h5) + + +@apply_conversion.register +def _(conversion_specifications: Mapping, input_h5ad, open_h5): + for h5_key, conversion_spec in conversion_specifications.items(): + if hasattr(open_h5, h5_key): + h5_obj = getattr(open_h5, h5_key) + else: + h5_obj = open_h5[h5_key] + apply_conversion(conversion_spec, 
input_h5ad, h5_obj)
+
+
+@apply_conversion.register
+def _(conversion_specifications: Callable, input_h5ad, open_h5):
+    conversion_specifications(input_h5ad, open_h5)
+
+
+def _remove_directory(dir_path):
+    """
+    Check if a directory exists and, if it does, remove its contents.
+    """
+    if dir_path.exists():
+        try:
+            # Delete the directory and all its contents
+            for item in dir_path.iterdir():
+                if item.is_file() or item.is_symlink():
+                    item.unlink()  # Removes files or symbolic links
+                elif item.is_dir():
+                    rmtree(item)  # Removes directories recursively
+            logger.info("Directory '%s' has been deleted successfully.", dir_path)
+        except FileNotFoundError:
+            logger.info("Directory '%s' not found.", dir_path)
+        except PermissionError as e:
+            raise ValueError(
+                f"Permission denied: Unable to delete '{dir_path}'."
+            ) from e
+
+
+def _align_obs_columns(experiment_df, anndata_df):
+    def _create_na_columns(df_to_write, columns, dtype_df):
+        """
+        Create new columns in a dataframe with NA's as content while
+        making sure that the datatype of the newly created columns match
+        with those from another dataframe.
+        """
+        for col in columns:
+            dtype = dtype_df[col].dtype
+            na = pd.NA if pd.api.types.is_extension_array_dtype(dtype) else np.nan
+            df_to_write.loc[:, col] = pd.Series(
+                na, index=df_to_write.index, dtype=dtype
+            )
+        return df_to_write
+
+    logger.info("Making sure obs columns are aligned.")
+    contents_columns = set(experiment_df.columns)
+    logger.info("Already ingested .obs columns: %s", ", ".join(contents_columns))
+    logger.info("Retrieving obs columns to be added.")
+
+    obs_to_add_columns = set(anndata_df.columns)
+    logger.info("Will add the following columns: %s", ", ".join(obs_to_add_columns))
+    missing_columns_in_old = obs_to_add_columns - contents_columns
+    logger.info(
+        "Adding the following columns to the schema: %s.",
+        ", ".join(missing_columns_in_old),
+    )
+    experiment_df = _create_na_columns(
+        experiment_df, missing_columns_in_old, anndata_df
+    )
+
+    logger.info("Adjusting .obs succeeded!")
+    missing_columns_in_new = (
+        contents_columns
+        - obs_to_add_columns
+        - {"soma_joinid", par["obs_index_name_output"]}
+    )
+    logger.info(
+        "Adding the following columns to modality to be ingested: %s",
+        ", ".join(missing_columns_in_new),
+    )
+    anndata_df = _create_na_columns(anndata_df, missing_columns_in_new, experiment_df)
+    return experiment_df, anndata_df
+
+
+def main(par):
+    logger.info(f"Component {meta['name']} started.")
+    par["input"], par["tiledb_dir"] = Path(par["input"]), Path(par["tiledb_dir"])
+
+    # Handle the case where the input is a MuData file
+    with open(par["input"], "rb") as open_input_file:
+        # It is possible to create a MuData compatible h5 file without using
+        # MuData, and those might not have "MuData" as the first bytes.
+        # But that is really an edge case and this check should hold.
+        is_mudata = open_input_file.read(6) == b"MuData"
+
+    _check_input_args(par, par["input"], is_mudata)
+
+    # Layers other than "X" are referenced as "layers/<layer name>"
+    for arg_name in LAYER_ARGUMENTS:
+        par[arg_name] = _handle_layer_location(par[arg_name])
+
+    rna_input = par["input"]
+    # Create a temporary file that holds the converted RNA h5ad
+    if is_mudata:
+        if not par["rna_modality"]:
+            raise ValueError(
+                "'rna_modality' argument must be set if the input is a MuData file."
+ ) + # If the input is a MuData file, take the RNA modality and use that as input instead + rna_input = _h5mu_to_h5ad(par["input"], par["rna_modality"]) + + # Put a copy of the RNA input in the output file and convert it in-place + rna_converted = _copy_anndata_and_convert_inplace( + rna_input, _get_rna_conversion_specification(par) + ) + + if par["prot_modality"]: + prot_input = _h5mu_to_h5ad(par["input"], par["prot_modality"]) + prot_converted = _copy_anndata_and_convert_inplace( + prot_input, _get_prot_conversion_specification(par) + ) + + tiledbsoma.logging.info() + _remove_directory(par["tiledb_dir"]) + + logger.info("Ingesting RNA modality") + rna_from_h5ad_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": rna_converted, + "measurement_name": par["rna_modality_output"], + "uns_keys": [], + "X_layer_name": "raw", + } + rna_from_h5ad_str = [ + f"\t{param}: {param_val}\n" for param, param_val in rna_from_h5ad_args.items() + ] + logger.info( + "Calling tiledbsoma.io.from_h5ad with arguments: \n%s", + "".join(rna_from_h5ad_str).rstrip(), + ) + tiledbsoma.io.from_h5ad(**rna_from_h5ad_args) + logger.info("Done ingesting RNA modality") + + if par["prot_modality"]: + logger.info("Ingesting protein modality") + logger.info( + "Making sure that obs columns are shared between the two modalities." + ) + with tiledbsoma.DataFrame.open(f"{par['tiledb_dir']}/obs") as obs_arr: + contents = _to_pandas_nullable(obs_arr.read().concat()) + + with h5py.File(prot_converted, "r") as open_h5: + obs_to_add = _to_pandas_nullable(anndata.io.read_elem(open_h5["obs"])) + + contents, obs_to_add = _align_obs_columns(contents, obs_to_add) + + with h5py.File(prot_converted, "r+") as open_h5: + anndata.io.write_elem(open_h5, "obs", obs_to_add) + + with tiledbsoma.Experiment.open(str(par["tiledb_dir"]), "w") as open_experiment: + tiledbsoma.io.update_obs( + exp=open_experiment, + new_data=contents, + default_index_name=par["obs_index_name_output"], + ) + + logger.info("Initalizing prot modality schema.") + prot_from_h5ad_schema_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": prot_converted, + "measurement_name": par["prot_modality_output"], + "uns_keys": [], + "ingest_mode": "schema_only", + "X_layer_name": "raw", + } + prot_from_h5ad_schema_str = [ + f"\t{param}: {param_val}\n" + for param, param_val in prot_from_h5ad_schema_args.items() + ] + logger.info( + "Calling tiledbsoma.io.from_h5ad with arguments:\n%s", + "".join(prot_from_h5ad_schema_str).rstrip(), + ) + # Note the 'schema_only' + tiledbsoma.io.from_h5ad(**prot_from_h5ad_schema_args) + + logger.info("Registering prot modality anndata") + # Register the second anndata object in the protein measurement + register_prot_args = { + "experiment_uri": str(par["tiledb_dir"]), + "h5ad_file_names": [prot_converted], + "measurement_name": par["prot_modality_output"], + "obs_field_name": par["obs_index_name_output"], + "var_field_name": par["prot_var_index_name_output"], + "append_obsm_varm": True, + } + register_prot_args_str = [ + f"\t{param}: {param_val}\n" + for param, param_val in register_prot_args.items() + ] + logger.info( + "Calling tiledbsoma.io.register_h5ads with arguments:\n%s", + "".join(register_prot_args_str).rstrip(), + ) + rd = tiledbsoma.io.register_h5ads(**register_prot_args) + + rd.prepare_experiment(str(par["tiledb_dir"])) + + # Ingest the second anndata object into the protein measurement + prot_write_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": prot_converted, + "measurement_name": 
par["prot_modality_output"], + "registration_mapping": rd, + "ingest_mode": "write", + "uns_keys": [], + "X_layer_name": "raw", + } + prot_write_args_str = [ + f"\t{param}: {param_val}\n" for param, param_val in prot_write_args.items() + ] + logger.info( + "Calling tiledbsoma.io.from_h5ad with arguments:\n%s", + "".join(prot_write_args_str).rstrip(), + ) + tiledbsoma.io.from_h5ad(**prot_write_args) + + logger.info("Finished!") + + +if __name__ == "__main__": + main(par) diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/test.py b/src/convert/from_h5mu_or_h5ad_to_tiledb/test.py new file mode 100644 index 00000000000..32bdd08a1ef --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/test.py @@ -0,0 +1,447 @@ +import sys +import pytest +import re +import anndata as ad +import mudata as md +import numpy as np +import pandas as pd +import tiledbsoma +import pyarrow as pa +from subprocess import CalledProcessError + + +## VIASH START +meta = { + "resources_dir": "resources_test", + "executable": "./target/executable/convert/from_h5mu_or_h5ad_to_tiledb/from_h5mu_or_h5ad_to_tiledb", + "config": "./src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml", +} +## VIASH END + + +@pytest.fixture +def sample_1_modality_1(): + """ + >>> ad1.obs + Obs1 Shared_obs_col + obs1 A B + obs2 C D + + >>> ad1.var + Feat1 Shared_feat_col + var1 a b + var2 c d + overlapping_var_mod1 e f + + >>> ad1.X + array([[1, 2, 3], + [4, 5, 6]]) + """ + + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["obs1", "obs2"], + columns=["var1", "var2", "overlapping_var_mod1"], + ) + + layer_1 = np.array([[94, 81, 60], [50, 88, 69]]) + + obs = pd.DataFrame( + [["A", "B", True, 0.9, 1], ["C", "D", False, 0.2, 2]], + index=df.index, + columns=["Obs1", "Shared_obs_col", "bool_column", "float_column", "int_column"], + ) + var = pd.DataFrame( + [["a", "b"], ["c", "d"], ["e", "f"]], + index=df.columns, + columns=["Feat1", "Shared_feat_col"], + ) + varm = np.random.rand(df.columns.size, 5) + ad1 = ad.AnnData( + X=df, + layers={"layer1": layer_1}, + obs=obs, + var=var, + varm={"random_vals_mod1": varm}, + uns={ + "uns_unique_to_sample1": pd.DataFrame( + ["foo"], index=["bar"], columns=["col1"] + ), + "overlapping_uns_key": pd.DataFrame( + ["jing"], index=["jang"], columns=["col2"] + ), + }, + ) + return ad1 + + +@pytest.fixture +def sample_1_modality_2(): + """ + >>> ad2.X + array([[ 7, 8], + [ 9, 10], + [11, 12]]) + + >>> ad2.obs + Obs2 Obs3 Shared_obs_col + obs3 E F G + obs2 H I J + obs5 K L M + + >>> ad2.var + Feat2 Shared_feat_col + overlapping_var_mod1 d e + var5 f g + + """ + df = pd.DataFrame( + [[7, 8], [9, 10], [11, 12]], + index=["obs3", "obs2", "obs5"], + columns=["overlapping_var_mod1", "var4"], + ) + + layer_1 = np.array([[20, 35], [76, 93], [100, 38]]) + + obs = pd.DataFrame( + [["E", "F", "G"], ["H", "I", "J"], ["K", "L", "M"]], + index=df.index, + columns=["Obs2", "Obs3", "Shared_obs_col"], + ) + var = pd.DataFrame( + [["d", "e"], ["f", "g"]], index=df.columns, columns=["Feat2", "Shared_feat_col"] + ) + ad2 = ad.AnnData(df, obs=obs, var=var, layers={"layer_foo": layer_1}) + return ad2 + + +@pytest.fixture +def sample_1_mudata(sample_1_modality_1, sample_1_modality_2): + return md.MuData({"mod1": sample_1_modality_1, "mod2": sample_1_modality_2}) + + +@pytest.fixture +def random_h5ad_path(random_path): + def wrapper(): + return random_path(extension="h5ad") + + return wrapper + + +@pytest.fixture +def sample_1_mudata_file(sample_1_mudata, random_h5mu_path): + output_path = random_h5mu_path() + 
sample_1_mudata.write(output_path) + return output_path + + +@pytest.fixture +def sample_1_modality_1_anndata_file(sample_1_modality_1, random_h5ad_path): + output_path = random_h5ad_path() + sample_1_modality_1.write(output_path) + return output_path + + +def test_convert_anndata(run_component, sample_1_modality_1_anndata_file, tmp_path): + output_path = tmp_path / "output" + run_component( + [ + "--input", + sample_1_modality_1_anndata_file, + "--rna_modality", + "mod1", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + ) + with tiledbsoma.DataFrame.open(f"{output_path}/obs") as obs_arr: + obs_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("cell_id", pa.large_string()), + pa.field("Obs1", pa.large_string()), + pa.field("Shared_obs_col", pa.large_string()), + pa.field("bool_column", pa.bool_()), + pa.field("float_column", pa.float64()), + pa.field("int_column", pa.int64()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1]), + pa.array(["obs1", "obs2"]), + pa.array(["A", "C"]), + pa.array(["B", "D"]), + pa.array([True, False]), + pa.array([0.9, 0.2]), + pa.array([1, 2]), + ], + schema=expected_schema, + ) + assert expected.equals(obs_contents) + + with tiledbsoma.DataFrame.open(f"{output_path}/ms/rna/var") as obs_arr: + var_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("rna_index", pa.large_string()), + pa.field("Feat1", pa.large_string()), + pa.field("Shared_feat_col", pa.large_string()), + pa.field("gene_symbol", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2]), + pa.array(["var1", "var2", "overlapping_var_mod1"]), + pa.array(["a", "c", "e"]), + pa.array(["b", "d", "f"]), + pa.array(["Feat1", "Feat1", "Feat1"]), + ], + schema=expected_schema, + ) + assert expected.equals(var_contents) + with tiledbsoma.Experiment.open(f"{output_path}") as exp: + with tiledbsoma.ExperimentAxisQuery( + experiment=exp, measurement_name="rna" + ) as query: + assert query.n_obs == 2 + data = query.to_anndata("raw").X.todense() + expected = ( + [ + [1, 2, 3], + [4, 5, 6], + ], + ) + np.testing.assert_allclose(data, np.asmatrix(np.array(expected))) + log_normalized_data = query.to_anndata("log_normalized").X.todense() + expected_log_normalized = [ + [94, 81, 60], + [50, 88, 69], + ] + np.testing.assert_allclose( + log_normalized_data, np.asmatrix(np.array(expected_log_normalized)) + ) + + +@pytest.mark.parametrize( + "extra_arg", + [ + ("--prot_modality", "mod2"), + ("--prot_raw_layer_input", "X"), + ("--prot_normalized_layer_input", "layer_foo"), + ], +) +def test_convert_anndata_extra_prot_arguments_raises( + run_component, sample_1_modality_1_anndata_file, tmp_path, extra_arg +): + output_path = tmp_path / "output" + args = [ + "--input", + sample_1_modality_1_anndata_file, + "--rna_modality", + "mod1", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + args += extra_arg + with pytest.raises(CalledProcessError) as err: + run_component(args) + assert re.search( + rf"'{extra_arg[0].lstrip('--')}' can not be used when the input is not a MuData file\.", + err.value.stdout.decode("utf-8"), + ) + + +@pytest.mark.parametrize( + "params", + [ + 
[("--rna_raw_layer_input", "X"), ("--rna_normalized_layer_input", "X")], + [("--rna_raw_layer_output", "X"), ("--rna_normalized_layer_output", "X")], + [("--rna_modality", "rna"), ("--prot_modality", "rna")], + [("--prot_raw_layer_input", "X"), ("--prot_normalized_layer_input", "X")], + [("--prot_raw_layer_output", "X"), ("--prot_normalized_layer_output", "X")], + ], +) +def test_arg_values_should_not_be_the_same( + run_component, sample_1_mudata_file, tmp_path, params +): + output_path = tmp_path / "output" + args = [ + "--input", + sample_1_mudata_file, + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + args += [param for param_tup in params for param in param_tup] + # The following avoids complaints about misszing input argument + if "--rna_raw_layer_input" not in args: + args += [ + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "log_normalized", + ] + if "--rna_modality" not in args: + args += ["--rna_modality", "mod1"] + + with pytest.raises(CalledProcessError) as err: + run_component(args) + assert re.search( + rf"The value for argument '{params[0][0].lstrip('-')}' \({params[0][1]}\) must not be the same as for argument '{params[1][0].lstrip('-')}' \({params[1][1]}\)", + err.value.stdout.decode("utf-8"), + ) + + +def test_output_directory_already_exists(run_component, sample_1_mudata_file, tmp_path): + output_path = tmp_path / "output" + output_path.mkdir() + run_component( + [ + "--input", + sample_1_mudata_file, + "--rna_modality", + "mod1", + "--prot_modality", + "mod2", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--prot_raw_layer_input", + "X", + "--prot_normalized_layer_input", + "layer_foo", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + ) + + +def test_convert_mudata(run_component, sample_1_mudata_file, tmp_path): + output_path = tmp_path / "output" + run_component( + [ + "--input", + sample_1_mudata_file, + "--rna_modality", + "mod1", + "--prot_modality", + "mod2", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--prot_raw_layer_input", + "X", + "--prot_normalized_layer_input", + "layer_foo", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + ) + + assert output_path.is_dir() + with tiledbsoma.DataFrame.open(f"{output_path}/obs") as obs_arr: + obs_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("cell_id", pa.large_string()), + pa.field("Obs1", pa.large_string()), + pa.field("Shared_obs_col", pa.large_string()), + pa.field("bool_column", pa.bool_()), + pa.field("float_column", pa.float64()), + pa.field("int_column", pa.int64()), + pa.field("Obs2", pa.large_string()), + pa.field("Obs3", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2, 3]), + pa.array(["obs1", "obs2", "obs3", "obs5"]), + pa.array(["A", "C", None, None]), + pa.array(["B", "D", "G", "M"]), + pa.array([True, False, None, None]), + pa.array([0.9, 0.2, None, None]), + pa.array([1, 2, None, None]), + pa.array([None, None, "E", "K"]), + pa.array([None, None, "F", "L"]), + ], + schema=expected_schema, + ) + assert expected.equals(obs_contents) + + with tiledbsoma.DataFrame.open(f"{output_path}/ms/rna/var") as obs_arr: + var_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("rna_index", pa.large_string()), + 
pa.field("Feat1", pa.large_string()), + pa.field("Shared_feat_col", pa.large_string()), + pa.field("gene_symbol", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2]), + pa.array(["var1", "var2", "overlapping_var_mod1"]), + pa.array(["a", "c", "e"]), + pa.array(["b", "d", "f"]), + pa.array(["Feat1", "Feat1", "Feat1"]), + ], + schema=expected_schema, + ) + assert expected.equals(var_contents) + with tiledbsoma.Experiment.open(f"{output_path}") as exp: + with tiledbsoma.ExperimentAxisQuery( + experiment=exp, measurement_name="rna" + ) as query: + assert query.n_obs == 4 + data = query.to_anndata("raw").X.todense() + expected = ([[1, 2, 3], [4, 5, 6], [0, 0, 0], [0, 0, 0]],) + np.testing.assert_allclose(data, np.asmatrix(np.array(expected))) + log_normalized_data = query.to_anndata("log_normalized").X.todense() + expected_log_normalized = [[94, 81, 60], [50, 88, 69], [0, 0, 0], [0, 0, 0]] + np.testing.assert_allclose( + log_normalized_data, np.asmatrix(np.array(expected_log_normalized)) + ) + + with tiledbsoma.ExperimentAxisQuery( + experiment=exp, measurement_name="prot" + ) as query: + assert query.n_obs == 4 + data = query.to_anndata("raw").X.todense() + expected = ([[0, 0], [9, 10], [7, 8], [11, 12]],) + np.testing.assert_allclose(data, np.asmatrix(np.array(expected))) + log_normalized_data = query.to_anndata("log_normalized").X.todense() + expected_log_normalized = [[0, 0], [76, 93], [20, 35], [100, 38]] + np.testing.assert_allclose( + log_normalized_data, np.asmatrix(np.array(expected_log_normalized)) + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) From 5a020553f30ab88656de1bd664fc2f883c47c079 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Wed, 11 Jun 2025 12:21:47 +0000 Subject: [PATCH 2/3] Update CHANGELOG entry --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d71adf59b7..e8506967c79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,11 @@ * `download_file` has been deprecated and will be removed in openpipeline 3.0 (PR #1015). +## NEW FUNCTIONALITY + +* (Experimental) Added `from_h5mu_or_h5ad_to_tiledb` component. Warning: the functionality in this component is experimental + and its behavior may change in future releases (PR #1034). + ## MAJOR CHANGES * `mapping/cellranger_*`: Upgrade CellRanger to v9.0 (PR #992 and #1006). From 059b0ecb7e76719e43756f7992a441d57732d4a2 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Wed, 11 Jun 2025 12:25:28 +0000 Subject: [PATCH 3/3] Update config --- src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml b/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml index b90e133a20f..55f741f92cf 100644 --- a/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml @@ -2,7 +2,9 @@ name: "from_h5mu_or_h5ad_to_tiledb" namespace: "convert" scope: "public" description: | - Convert a MuData or AnnData object to tiledb + Convert a MuData or AnnData object to tiledb. Currently, transcriptome and protein modalities are supported. + + NOTE: The functionality provided by this component is experimental and may be subject to change. 
authors: - __merge__: /src/authors/dries_schaumont.yaml roles: [ author, maintainer ] @@ -154,4 +156,4 @@ runners: - type: executable - type: nextflow directives: - label: [lowmem, singlecpu] \ No newline at end of file + label: [midmem, midcpu] \ No newline at end of file
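Below is a minimal usage sketch of how the TileDB-SOMA experiment produced by this component could be inspected, assuming it was run with `--tiledb_dir output` and the default output slot names ("rna" measurement, "raw" and "log_normalized" X layers, "cell_id" obs index); it only uses tiledbsoma calls that also appear in test.py.

import tiledbsoma

# Read the shared cell annotations written to <tiledb_dir>/obs.
with tiledbsoma.DataFrame.open("output/obs") as obs_arr:
    obs_table = obs_arr.read().concat()  # pyarrow.Table: soma_joinid, cell_id and the obs columns

# Query one measurement and export it back to AnnData.
with tiledbsoma.Experiment.open("output") as exp:
    with tiledbsoma.ExperimentAxisQuery(experiment=exp, measurement_name="rna") as query:
        raw = query.to_anndata("raw")                    # raw counts
        normalized = query.to_anndata("log_normalized")  # normalized counts
        print(query.n_obs, raw.X.shape)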