From bb1984b1821b971af8a54efc087e60b1ee340ac2 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Wed, 11 Jun 2025 12:15:41 +0000 Subject: [PATCH 1/3] Add conversion to TileDB SOMA --- .../config.vsh.yaml | 157 +++++ .../from_h5mu_or_h5ad_to_tiledb/script.py | 614 ++++++++++++++++++ .../from_h5mu_or_h5ad_to_tiledb/test.py | 447 +++++++++++++ 3 files changed, 1218 insertions(+) create mode 100644 src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml create mode 100644 src/convert/from_h5mu_or_h5ad_to_tiledb/script.py create mode 100644 src/convert/from_h5mu_or_h5ad_to_tiledb/test.py diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml b/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml new file mode 100644 index 00000000000..b90e133a20f --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml @@ -0,0 +1,157 @@ +name: "from_h5mu_or_h5ad_to_tiledb" +namespace: "convert" +scope: "public" +description: | + Convert a MuData or AnnData object to tiledb +authors: + - __merge__: /src/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +argument_groups: + - name: "Input" + arguments: + - name: --input + description: | + Input AnnData or MuData file. When an AnnData file is provided, it is automatically assumed to + contain transcriptome counts. + type: file + required: true + example: "input.h5mu" + direction: input + - name: "RNA modality" + arguments: + - name: --rna_modality + type: string + default: rna + description: | + The name used for the RNA modality. Used when input file is a MuData object. + - name: --rna_raw_layer_input + type: string + required: true + example: X + description: | + Location of the layer containing the raw transcriptome counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + - name: --rna_normalized_layer_input + type: string + required: true + example: log_normalized + description: | + Location of the layer containing the normalized counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + - name: --rna_var_gene_names_input + type: string + required: true + example: "gene_symbol" + description: | + Column in .var that provides the gene names. If not specified, the index from the input is used. + + - name: "Protein modality" + arguments: + - name: --prot_modality + description: | + The name used for the protein modality. Used when input file is a MuData object. + When not specified, the protein modality will not be processed. + type: string + required: false + example: prot + - name: --prot_raw_layer_input + type: string + example: X + description: | + Location of the layer containing the raw protein counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + - name: --prot_normalized_layer_input + type: string + example: clr + description: | + Location of the layer containing the normalized counts. Layers are looked for in .layers, + except when using the value 'X'; in which case .X is used. + + - name: "Output slots" + arguments: + - name: "--rna_modality_output" + type: string + default: "rna" + description: | + TileDB Measurement name where the RNA modality will be stored. + - name: "--prot_modality_output" + type: string + default: "prot" + description: | + Name of the TileDB Measurement where the protein modality will be stored. 
+ - name: "--obs_index_name_output" + description: | + Name of the index that is used to describe the cells (observations). + type: string + default: cell_id + - name: --rna_var_index_name_output + description: | + Output name of the index that is used to describe the genes. + type: string + default: rna_index + - name: --rna_raw_layer_output + description: | + Output location for the raw transcriptomics counts. + type: string + default: "X" + - name: --rna_normalized_layer_output + type: string + default: "log_normalized" + description: | + Output location for the normalized RNA counts. + - name: --rna_var_gene_names_output + type: string + default: "gene_symbol" + description: | + Name of the .var column that specifies the gene games. + - name: --prot_var_index_name_output + description: | + Output name of the index that is used to describe the proteins. + type: string + default: prot_index + - name: --prot_raw_layer_output + type: string + default: "X" + description: | + Output location for the raw protein counts. + - name: --prot_normalized_layer_output + type: string + default: "log_normalized" + description: | + Output location for the normalized protein counts. + + - name: "Output arguments" + arguments: + - name: "--tiledb_dir" + type: file + direction: output + description: | + Directory where the TileDB output will be written to. + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - tiledbsoma + __merge__: [/src/base/requirements/anndata_mudata.yaml, .] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, singlecpu] \ No newline at end of file diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/script.py b/src/convert/from_h5mu_or_h5ad_to_tiledb/script.py new file mode 100644 index 00000000000..9650157cfd0 --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/script.py @@ -0,0 +1,614 @@ +import sys +import anndata +import numpy as np +import tiledbsoma + +# This import seems redundant, but it is required! 
+import tiledbsoma.io
+import pandas as pd
+from functools import singledispatch
+from collections.abc import Mapping
+import h5py
+from shutil import copy2, rmtree
+from collections.abc import Callable
+from functools import partial
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+import pyarrow as pa
+from collections import defaultdict
+
+anndata.settings.allow_write_nullable_strings = True
+
+
+## VIASH START
+par = {
+    "input": "/home/di/code/openpipelines-multisample/pmbc_process_samples.h5mu",
+    # modality names
+    "rna_modality": "rna",
+    "rna_modality_output": "rna",
+    "prot_modality": "prot",
+    "prot_modality_output": "prot",
+    # rna var
+    "rna_var_gene_names_input": "gene_symbol",
+    "rna_var_gene_names_output": "gene_foo",
+    "rna_var_index_name_output": "rna_index",
+    # rna layers
+    "rna_raw_layer_input": "X",
+    "rna_raw_layer_output": "X",
+    "rna_normalized_layer_input": "log_normalized",
+    "rna_normalized_layer_output": "log_normalized",
+    # prot layers
+    "prot_raw_layer_input": "X",
+    "prot_raw_layer_output": "X",
+    "prot_normalized_layer_input": "clr",
+    "prot_normalized_layer_output": "log_normalized",
+    # prot var
+    "prot_var_index_name_output": "prot_index",
+    # observation index name
+    "obs_index_name_output": "cell_id",
+    # TileDB output directory
+    "tiledb_dir": "output",
+}
+
+
+meta = {
+    "name": "from_h5mu_or_h5ad_to_tiledb",
+    "resources_dir": "src/utils/",
+    "temp_dir": "/tmp",
+}
+## VIASH END
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+
+logger = setup_logger()
+
+LAYER_ARGUMENTS = (
+    "rna_raw_layer_input",
+    "rna_raw_layer_output",
+    "rna_normalized_layer_input",
+    "rna_normalized_layer_output",
+    "prot_raw_layer_input",
+    "prot_raw_layer_output",
+    "prot_normalized_layer_input",
+    "prot_normalized_layer_output",
+)
+
+
+def _to_pandas_nullable(table_or_df: pa.Table | pd.DataFrame) -> pd.DataFrame:
+    """
+    Convert pyarrow Table or pandas dataframe to pandas dataframe that uses nullable data types.
+
+    This function takes the following into account:
+    1. AnnData supports writing BooleanDtype, IntegerDtype and StringDtype (which use pd.NA)
+    2. AnnData does *not* support FloatingDtype
+    3. AnnData does support numpy-style floating and integer dtypes (which use np.nan).
+    4. TileDB SOMA's `update_obs` does not allow casting floats to integers (potential information loss).
+    5. When pandas's `convert_dtypes` convert_integer argument is set to True,
+       floats will be converted to integer if they can be 'faithfully' casted.
+
+    Given the previous constraints, it is *not* possible to only convert integers using `convert_dtypes` because
+    it might cause some float columns to be converted to integers (which is incompatible with tiledbsoma).
+    Therefore, this function will leave integers and floats as-is.
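+    For example, an Arrow `large_string` column is returned as pandas' nullable StringDtype and
+    boolean columns become BooleanDtype (both using pd.NA), while integer and float columns
+    keep their numpy dtypes (missing float values remain np.nan).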
+
+    Based on the conversion provided by this function, a suitable na representation (pd.NA or np.nan) for a column can
+    be chosen using `pd.api.types.is_extension_array_dtype(column_dtype)`.
+    """
+    try:
+        df = table_or_df.to_pandas(
+            types_mapper=defaultdict(None, {pa.large_string(): pd.StringDtype()}).get
+        )
+    except AttributeError:
+        df = table_or_df.convert_dtypes(
+            infer_objects=False,
+            convert_string=True,
+            convert_integer=False,
+            convert_boolean=False,
+            convert_floating=False,
+        )
+    return df.convert_dtypes(
+        infer_objects=False,
+        convert_string=False,
+        convert_integer=False,
+        convert_boolean=True,
+        convert_floating=False,
+    )
+
+
+def _get_temp_h5ad():
+    return Path(NamedTemporaryFile(suffix=".h5ad", dir=meta["temp_dir"]).name)
+
+
+def _h5mu_to_h5ad(h5mu_path, modality_name):
+    """
+    Create an anndata file from a modality in a mudata file by
+    copying over the relevant h5 objects.
+    """
+    result = _get_temp_h5ad()
+    with h5py.File(h5mu_path, "r") as open_anndata:
+        with h5py.File(result, "w") as file_to_write:
+            for h5_key in open_anndata["mod"][modality_name].keys():
+                open_anndata.copy(
+                    source=f"/mod/{modality_name}/{h5_key}", dest=file_to_write
+                )
+    return result
+
+
+def _handle_layer_location(layer_name):
+    """
+    Handle the special case where 'X' is specified as layer:
+    this layer is not present in `/layers/X` but in `/X`.
+    """
+    prefix = "layers/" if layer_name != "X" else ""
+    return f"{prefix}{layer_name}"
+
+
+def _check_input_args(par, input, is_mudata):
+    """
+    Check the input arguments:
+    * Arguments for protein modalities are not allowed when the input is not a MuData file.
+    * Arguments for input layers may not point to the same layer
+    * Output locations for layers may not be the same
+    * Input modalities must not be the same
+    * Input modalities must exist
+    """
+    if not is_mudata:
+        for par_name in (
+            "prot_modality",
+            "prot_raw_layer_input",
+            "prot_normalized_layer_input",
+        ):
+            if par[par_name]:
+                raise ValueError(
+                    f"'{par_name}' can not be used when the input is not a MuData file."
+                )
+
+    if par["prot_modality"]:
+        for prot_arg in ("prot_raw_layer_input", "prot_raw_layer_output"):
+            if not par[prot_arg]:
+                raise ValueError(
+                    f"When providing 'prot_modality', '{prot_arg}' must also be set."
+                )
+
+    NOT_SAME_VALUE = (
+        ("rna_raw_layer_input", "rna_normalized_layer_input"),
+        ("prot_raw_layer_input", "prot_normalized_layer_input"),
+        ("rna_raw_layer_output", "rna_normalized_layer_output"),
+        ("prot_raw_layer_output", "prot_normalized_layer_output"),
+        ("rna_modality", "prot_modality"),
+    )
+    for layer1_arg, layer2_arg in NOT_SAME_VALUE:
+        arg1_val, arg2_val = par[layer1_arg], par[layer2_arg]
+        if (arg1_val == arg2_val) and arg1_val is not None:
+            raise ValueError(
+                f"The value for argument '{layer1_arg}' ({arg1_val}) must not be the same as for argument '{layer2_arg}' ({arg2_val})"
+            )
+
+    with h5py.File(input, "r") as open_h5:
+        if "mod" in open_h5:
+            available_modalities = tuple(open_h5["mod"].keys())
+            for mod in (par["rna_modality"], par["prot_modality"]):
+                if mod is not None and f"/mod/{mod}" not in open_h5:
+                    raise ValueError(
+                        f"Modality '{mod}' was not found in object, available modalities: {available_modalities}"
+                    )
+
+
+def _copy_layer(source: str, dest: str, input_h5ad: Path, mod_obj: h5py.Group):
+    """
+    Copy a h5py object from one location (source) to a new location (dest).
+    The source location must be located in the input AnnData file, the destination
+    will be written to a h5py Group object.
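+    Both `source` and `dest` are expected in the form produced by `_handle_layer_location`,
+    i.e. either "X" or "layers/<layer name>".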
+ """ + logger.info("Copying layer '%s' from '%s' to '%s'", source, input_h5ad, dest) + if source == f"{mod_obj.name}/{dest}": + logger.info("Layer is already at the correct location. Nothing to do.") + return + if not input_h5ad.is_file(): + raise FileNotFoundError(f"Could not link to {input_h5ad}: file does not exist.") + with h5py.File(input_h5ad) as open_input: + if source not in open_input: + raise ValueError(f"Layer {source} was not found in {input_h5ad}.") + if dest in mod_obj: + logger.info(f"{dest} is already present. Removing before creating link.") + del mod_obj[dest] + # ExternalLink could have been used instead of 'copy', but it does not seem to work with tileDB + with h5py.File(str(input_h5ad), "r") as open_input: + open_input.copy( + source=open_input[source], dest=mod_obj, name=dest, expand_external=True + ) + logger.info("Copying complete.") + + +def _copy_column(src, dest, _, df_h5_obj): + """ + Duplicate a column in a dataframe and give a new name. + The input dataframe must be h5py object that is encoded by AnnData to store + a pandas dataframe (e.g. .obs and .var). + """ + logger.info("Copying column '%s' to '%s' for '%s'.", src, dest, df_h5_obj.name) + if src == dest: + logger.info("Source and destination for the column are the same, not copying.") + return None + if dest in df_h5_obj: + raise ValueError(f"Column {dest} already exists in {df_h5_obj.name}.") + df_h5_obj[dest] = src + df_h5_obj.attrs["column-order"] = np.append(df_h5_obj.attrs["column-order"], dest) + + +def _set_index_name(name, _, df_h5_obj): + """ + Set the index for a dataframe to an existing column. + The input dataframe must be h5py object that is encoded by AnnData to store + a pandas dataframe (e.g. .obs and .var). + """ + logger.info("Setting index name of '%s' to '%s'.", df_h5_obj.name, name) + original_index_name = df_h5_obj.attrs["_index"] + df_h5_obj.move(original_index_name, name) + df_h5_obj.attrs["_index"] = name + logger.info("Done setting index name") + + +def _delete_all_keys(_, group_obj, exception=None): + """ + Delete all keys from a h5py.Group object; which optional exceptions. + """ + if exception is None: + exception = () + logger.info( + "Deleting all keys from '%s'%s.", + group_obj.name, + f" except {', '.join(exception)}" if exception else "", + ) + keys_to_delete = set(group_obj.keys()) - set(exception) + if not keys_to_delete: + logger.info("No keys to delete.") + return + for key in keys_to_delete: + del group_obj[key] + logger.info("Done deleting keys %s.", ", ".join(keys_to_delete)) + + +def _set_index_to_string_dtype(_, df_obj): + """ + Change the index dataype for a dataframe to string when it is encoded as categorical. + The dataframe must be provided as a h5py Group object that has been encoded by AnnData. 
+ """ + logger.info( + "Checking if index of object '%s' is encoded as a categorical.", df_obj.name + ) + index_col_name = df_obj.attrs["_index"] + index_col = df_obj[index_col_name] + logger.info("Column '%s' is being used as the index.", index_col_name) + index_col_dtype = index_col.attrs["encoding-type"] + if index_col_dtype == "categorical": + logger.info( + "Column '%s' is encoded as categorical, converting to string.", + index_col_name, + ) + is_ordered = bool(index_col.attrs.get("ordered", False)) + codes, categories = index_col["codes"], index_col["categories"] + column = pd.Categorical.from_codes(codes, categories, is_ordered) + column_str = column.astype(str).astype(object) + del df_obj[index_col_name] + df_obj.create_dataset(index_col_name, data=column_str) + index_col.attrs["encoding-type"] = "string-array" + logger.info("Conversion to string dtype complete.") + return + logger.info( + "Column '%s' not encoded as categorical. Leaving as is.", index_col_name + ) + + +def _get_rna_conversion_specification(par): + rna_spec = [ + partial(_copy_layer, par["rna_raw_layer_input"], par["rna_raw_layer_output"]), + partial( + _copy_layer, + par["rna_normalized_layer_input"], + par["rna_normalized_layer_output"], + ), + { + "uns": _delete_all_keys, + "obsp": _delete_all_keys, + "var": [ + partial( + _copy_column, + par["rna_var_gene_names_input"], + par["rna_var_gene_names_output"], + ), + partial(_set_index_name, par["rna_var_index_name_output"]), + _set_index_to_string_dtype, + ], + "obs": partial(_set_index_name, par["obs_index_name_output"]), + "layers": partial( + _delete_all_keys, + exception=( + par["rna_raw_layer_output"].removeprefix("layers/"), + par["rna_normalized_layer_output"].removeprefix("layers/"), + ), + ), + }, + ] + return rna_spec + + +def _get_prot_conversion_specification(par): + prot_spec = [ + partial(_copy_layer, par["prot_raw_layer_input"], par["prot_raw_layer_output"]), + partial( + _copy_layer, + par["prot_normalized_layer_input"], + par["prot_normalized_layer_output"], + ), + { + "var": [ + partial(_set_index_name, par["prot_var_index_name_output"]), + _set_index_to_string_dtype, + ], + "uns": _delete_all_keys, + "obsp": _delete_all_keys, + # "varm": _delete_all_keys, + "layers": partial( + _delete_all_keys, + exception=( + par["prot_raw_layer_output"].removeprefix("layers/"), + par["prot_normalized_layer_output"].removeprefix("layers/"), + ), + ), + "obs": partial(_set_index_name, par["obs_index_name_output"]), + }, + ] + + return prot_spec + + +def _copy_anndata_and_convert_inplace(anndata_path, conversion_specification): + converted_anndata = _get_temp_h5ad() + copy2(anndata_path, converted_anndata) + with h5py.File(converted_anndata, "r+") as open_prot_h5: + apply_conversion(conversion_specification, anndata_path, open_prot_h5) + return converted_anndata + + +@singledispatch +def apply_conversion(conversion_specification, input_h5ad, open_h5): + raise NotImplementedError( + f"Error: function 'apply_conversion' is not implemented for type '{type(conversion_specification)}'" + ) + + +@apply_conversion.register +def _(conversion_specification: list, input_h5ad, open_h5): + for item in conversion_specification: + apply_conversion(item, input_h5ad, open_h5) + + +@apply_conversion.register +def _(conversion_specifications: Mapping, input_h5ad, open_h5): + for h5_key, conversion_spec in conversion_specifications.items(): + if hasattr(open_h5, h5_key): + h5_obj = getattr(open_h5, h5_key) + else: + h5_obj = open_h5[h5_key] + apply_conversion(conversion_spec, 
input_h5ad, h5_obj)
+
+
+@apply_conversion.register
+def _(conversion_specifications: Callable, input_h5ad, open_h5):
+    conversion_specifications(input_h5ad, open_h5)
+
+
+def _remove_directory(dir_path):
+    """
+    Check if a directory exists and, if it does, remove its contents.
+    """
+    if dir_path.exists():
+        try:
+            # Delete the directory and all its contents
+            for item in dir_path.iterdir():
+                if item.is_file() or item.is_symlink():
+                    item.unlink()  # Removes files or symbolic links
+                elif item.is_dir():
+                    rmtree(item)  # Removes directories recursively
+            logger.info("Directory '%s' has been deleted successfully.", dir_path)
+        except FileNotFoundError:
+            logger.info("Directory '%s' not found.", dir_path)
+        except PermissionError as e:
+            raise ValueError(
+                f"Permission denied: Unable to delete '{dir_path}'."
+            ) from e
+
+
+def _align_obs_columns(experiment_df, anndata_df):
+    def _create_na_columns(df_to_write, columns, dtype_df):
+        """
+        Create new columns in a dataframe with NA's as content while
+        making sure that the datatype of the newly created columns match
+        with those from another dataframe.
+        """
+        for col in columns:
+            dtype = dtype_df[col].dtype
+            na = pd.NA if pd.api.types.is_extension_array_dtype(dtype) else np.nan
+            df_to_write.loc[:, col] = pd.Series(
+                na, index=df_to_write.index, dtype=dtype
+            )
+        return df_to_write
+
+    logger.info("Making sure obs columns are aligned.")
+    contents_columns = set(experiment_df.columns)
+    logger.info("Already ingested .obs columns: %s", ", ".join(contents_columns))
+    logger.info("Retrieving obs columns to be added.")
+
+    obs_to_add_columns = set(anndata_df.columns)
+    logger.info("Will add the following columns: %s", ", ".join(obs_to_add_columns))
+    missing_columns_in_old = obs_to_add_columns - contents_columns
+    logger.info(
+        "Adding the following columns to the schema: %s.",
+        ", ".join(missing_columns_in_old),
+    )
+    experiment_df = _create_na_columns(
+        experiment_df, missing_columns_in_old, anndata_df
+    )
+
+    logger.info("Adjusting .obs succeeded!")
+    missing_columns_in_new = (
+        contents_columns
+        - obs_to_add_columns
+        - {"soma_joinid", par["obs_index_name_output"]}
+    )
+    logger.info(
+        "Adding the following columns to modality to be ingested: %s",
+        ", ".join(missing_columns_in_new),
+    )
+    anndata_df = _create_na_columns(anndata_df, missing_columns_in_new, experiment_df)
+    return experiment_df, anndata_df
+
+
+def main(par):
+    logger.info(f"Component {meta['name']} started.")
+    par["input"], par["tiledb_dir"] = Path(par["input"]), Path(par["tiledb_dir"])
+
+    # Handle the case where the input is a MuData file
+    with open(par["input"], "rb") as open_input_file:
+        # It is possible to create a MuData compatible h5 file without using
+        # MuData, and those might not have "MuData" as the first bytes.
+        # But that is really an edge case and this check should hold.
+        is_mudata = open_input_file.read(6) == b"MuData"
+
+    _check_input_args(par, par["input"], is_mudata)
+
+    # Layers other than "X" are referenced as "layers/<layer name>"
+    for arg_name in LAYER_ARGUMENTS:
+        par[arg_name] = _handle_layer_location(par[arg_name])
+
+    rna_input = par["input"]
+    # Create a temporary file that holds the converted RNA h5ad
+    if is_mudata:
+        if not par["rna_modality"]:
+            raise ValueError(
+                "'rna_modality' argument must be set if the input is a MuData file."
+ ) + # If the input is a MuData file, take the RNA modality and use that as input instead + rna_input = _h5mu_to_h5ad(par["input"], par["rna_modality"]) + + # Put a copy of the RNA input in the output file and convert it in-place + rna_converted = _copy_anndata_and_convert_inplace( + rna_input, _get_rna_conversion_specification(par) + ) + + if par["prot_modality"]: + prot_input = _h5mu_to_h5ad(par["input"], par["prot_modality"]) + prot_converted = _copy_anndata_and_convert_inplace( + prot_input, _get_prot_conversion_specification(par) + ) + + tiledbsoma.logging.info() + _remove_directory(par["tiledb_dir"]) + + logger.info("Ingesting RNA modality") + rna_from_h5ad_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": rna_converted, + "measurement_name": par["rna_modality_output"], + "uns_keys": [], + "X_layer_name": "raw", + } + rna_from_h5ad_str = [ + f"\t{param}: {param_val}\n" for param, param_val in rna_from_h5ad_args.items() + ] + logger.info( + "Calling tiledbsoma.io.from_h5ad with arguments: \n%s", + "".join(rna_from_h5ad_str).rstrip(), + ) + tiledbsoma.io.from_h5ad(**rna_from_h5ad_args) + logger.info("Done ingesting RNA modality") + + if par["prot_modality"]: + logger.info("Ingesting protein modality") + logger.info( + "Making sure that obs columns are shared between the two modalities." + ) + with tiledbsoma.DataFrame.open(f"{par['tiledb_dir']}/obs") as obs_arr: + contents = _to_pandas_nullable(obs_arr.read().concat()) + + with h5py.File(prot_converted, "r") as open_h5: + obs_to_add = _to_pandas_nullable(anndata.io.read_elem(open_h5["obs"])) + + contents, obs_to_add = _align_obs_columns(contents, obs_to_add) + + with h5py.File(prot_converted, "r+") as open_h5: + anndata.io.write_elem(open_h5, "obs", obs_to_add) + + with tiledbsoma.Experiment.open(str(par["tiledb_dir"]), "w") as open_experiment: + tiledbsoma.io.update_obs( + exp=open_experiment, + new_data=contents, + default_index_name=par["obs_index_name_output"], + ) + + logger.info("Initalizing prot modality schema.") + prot_from_h5ad_schema_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": prot_converted, + "measurement_name": par["prot_modality_output"], + "uns_keys": [], + "ingest_mode": "schema_only", + "X_layer_name": "raw", + } + prot_from_h5ad_schema_str = [ + f"\t{param}: {param_val}\n" + for param, param_val in prot_from_h5ad_schema_args.items() + ] + logger.info( + "Calling tiledbsoma.io.from_h5ad with arguments:\n%s", + "".join(prot_from_h5ad_schema_str).rstrip(), + ) + # Note the 'schema_only' + tiledbsoma.io.from_h5ad(**prot_from_h5ad_schema_args) + + logger.info("Registering prot modality anndata") + # Register the second anndata object in the protein measurement + register_prot_args = { + "experiment_uri": str(par["tiledb_dir"]), + "h5ad_file_names": [prot_converted], + "measurement_name": par["prot_modality_output"], + "obs_field_name": par["obs_index_name_output"], + "var_field_name": par["prot_var_index_name_output"], + "append_obsm_varm": True, + } + register_prot_args_str = [ + f"\t{param}: {param_val}\n" + for param, param_val in register_prot_args.items() + ] + logger.info( + "Calling tiledbsoma.io.register_h5ads with arguments:\n%s", + "".join(register_prot_args_str).rstrip(), + ) + rd = tiledbsoma.io.register_h5ads(**register_prot_args) + + rd.prepare_experiment(str(par["tiledb_dir"])) + + # Ingest the second anndata object into the protein measurement + prot_write_args = { + "experiment_uri": str(par["tiledb_dir"]), + "input_path": prot_converted, + "measurement_name": 
par["prot_modality_output"], + "registration_mapping": rd, + "ingest_mode": "write", + "uns_keys": [], + "X_layer_name": "raw", + } + prot_write_args_str = [ + f"\t{param}: {param_val}\n" for param, param_val in prot_write_args.items() + ] + logger.info( + "Calling tiledbsoma.io.from_h5ad with arguments:\n%s", + "".join(prot_write_args_str).rstrip(), + ) + tiledbsoma.io.from_h5ad(**prot_write_args) + + logger.info("Finished!") + + +if __name__ == "__main__": + main(par) diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/test.py b/src/convert/from_h5mu_or_h5ad_to_tiledb/test.py new file mode 100644 index 00000000000..32bdd08a1ef --- /dev/null +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/test.py @@ -0,0 +1,447 @@ +import sys +import pytest +import re +import anndata as ad +import mudata as md +import numpy as np +import pandas as pd +import tiledbsoma +import pyarrow as pa +from subprocess import CalledProcessError + + +## VIASH START +meta = { + "resources_dir": "resources_test", + "executable": "./target/executable/convert/from_h5mu_or_h5ad_to_tiledb/from_h5mu_or_h5ad_to_tiledb", + "config": "./src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml", +} +## VIASH END + + +@pytest.fixture +def sample_1_modality_1(): + """ + >>> ad1.obs + Obs1 Shared_obs_col + obs1 A B + obs2 C D + + >>> ad1.var + Feat1 Shared_feat_col + var1 a b + var2 c d + overlapping_var_mod1 e f + + >>> ad1.X + array([[1, 2, 3], + [4, 5, 6]]) + """ + + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["obs1", "obs2"], + columns=["var1", "var2", "overlapping_var_mod1"], + ) + + layer_1 = np.array([[94, 81, 60], [50, 88, 69]]) + + obs = pd.DataFrame( + [["A", "B", True, 0.9, 1], ["C", "D", False, 0.2, 2]], + index=df.index, + columns=["Obs1", "Shared_obs_col", "bool_column", "float_column", "int_column"], + ) + var = pd.DataFrame( + [["a", "b"], ["c", "d"], ["e", "f"]], + index=df.columns, + columns=["Feat1", "Shared_feat_col"], + ) + varm = np.random.rand(df.columns.size, 5) + ad1 = ad.AnnData( + X=df, + layers={"layer1": layer_1}, + obs=obs, + var=var, + varm={"random_vals_mod1": varm}, + uns={ + "uns_unique_to_sample1": pd.DataFrame( + ["foo"], index=["bar"], columns=["col1"] + ), + "overlapping_uns_key": pd.DataFrame( + ["jing"], index=["jang"], columns=["col2"] + ), + }, + ) + return ad1 + + +@pytest.fixture +def sample_1_modality_2(): + """ + >>> ad2.X + array([[ 7, 8], + [ 9, 10], + [11, 12]]) + + >>> ad2.obs + Obs2 Obs3 Shared_obs_col + obs3 E F G + obs2 H I J + obs5 K L M + + >>> ad2.var + Feat2 Shared_feat_col + overlapping_var_mod1 d e + var5 f g + + """ + df = pd.DataFrame( + [[7, 8], [9, 10], [11, 12]], + index=["obs3", "obs2", "obs5"], + columns=["overlapping_var_mod1", "var4"], + ) + + layer_1 = np.array([[20, 35], [76, 93], [100, 38]]) + + obs = pd.DataFrame( + [["E", "F", "G"], ["H", "I", "J"], ["K", "L", "M"]], + index=df.index, + columns=["Obs2", "Obs3", "Shared_obs_col"], + ) + var = pd.DataFrame( + [["d", "e"], ["f", "g"]], index=df.columns, columns=["Feat2", "Shared_feat_col"] + ) + ad2 = ad.AnnData(df, obs=obs, var=var, layers={"layer_foo": layer_1}) + return ad2 + + +@pytest.fixture +def sample_1_mudata(sample_1_modality_1, sample_1_modality_2): + return md.MuData({"mod1": sample_1_modality_1, "mod2": sample_1_modality_2}) + + +@pytest.fixture +def random_h5ad_path(random_path): + def wrapper(): + return random_path(extension="h5ad") + + return wrapper + + +@pytest.fixture +def sample_1_mudata_file(sample_1_mudata, random_h5mu_path): + output_path = random_h5mu_path() + 
sample_1_mudata.write(output_path) + return output_path + + +@pytest.fixture +def sample_1_modality_1_anndata_file(sample_1_modality_1, random_h5ad_path): + output_path = random_h5ad_path() + sample_1_modality_1.write(output_path) + return output_path + + +def test_convert_anndata(run_component, sample_1_modality_1_anndata_file, tmp_path): + output_path = tmp_path / "output" + run_component( + [ + "--input", + sample_1_modality_1_anndata_file, + "--rna_modality", + "mod1", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + ) + with tiledbsoma.DataFrame.open(f"{output_path}/obs") as obs_arr: + obs_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("cell_id", pa.large_string()), + pa.field("Obs1", pa.large_string()), + pa.field("Shared_obs_col", pa.large_string()), + pa.field("bool_column", pa.bool_()), + pa.field("float_column", pa.float64()), + pa.field("int_column", pa.int64()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1]), + pa.array(["obs1", "obs2"]), + pa.array(["A", "C"]), + pa.array(["B", "D"]), + pa.array([True, False]), + pa.array([0.9, 0.2]), + pa.array([1, 2]), + ], + schema=expected_schema, + ) + assert expected.equals(obs_contents) + + with tiledbsoma.DataFrame.open(f"{output_path}/ms/rna/var") as obs_arr: + var_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("rna_index", pa.large_string()), + pa.field("Feat1", pa.large_string()), + pa.field("Shared_feat_col", pa.large_string()), + pa.field("gene_symbol", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2]), + pa.array(["var1", "var2", "overlapping_var_mod1"]), + pa.array(["a", "c", "e"]), + pa.array(["b", "d", "f"]), + pa.array(["Feat1", "Feat1", "Feat1"]), + ], + schema=expected_schema, + ) + assert expected.equals(var_contents) + with tiledbsoma.Experiment.open(f"{output_path}") as exp: + with tiledbsoma.ExperimentAxisQuery( + experiment=exp, measurement_name="rna" + ) as query: + assert query.n_obs == 2 + data = query.to_anndata("raw").X.todense() + expected = ( + [ + [1, 2, 3], + [4, 5, 6], + ], + ) + np.testing.assert_allclose(data, np.asmatrix(np.array(expected))) + log_normalized_data = query.to_anndata("log_normalized").X.todense() + expected_log_normalized = [ + [94, 81, 60], + [50, 88, 69], + ] + np.testing.assert_allclose( + log_normalized_data, np.asmatrix(np.array(expected_log_normalized)) + ) + + +@pytest.mark.parametrize( + "extra_arg", + [ + ("--prot_modality", "mod2"), + ("--prot_raw_layer_input", "X"), + ("--prot_normalized_layer_input", "layer_foo"), + ], +) +def test_convert_anndata_extra_prot_arguments_raises( + run_component, sample_1_modality_1_anndata_file, tmp_path, extra_arg +): + output_path = tmp_path / "output" + args = [ + "--input", + sample_1_modality_1_anndata_file, + "--rna_modality", + "mod1", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + args += extra_arg + with pytest.raises(CalledProcessError) as err: + run_component(args) + assert re.search( + rf"'{extra_arg[0].lstrip('--')}' can not be used when the input is not a MuData file\.", + err.value.stdout.decode("utf-8"), + ) + + +@pytest.mark.parametrize( + "params", + [ + 
[("--rna_raw_layer_input", "X"), ("--rna_normalized_layer_input", "X")], + [("--rna_raw_layer_output", "X"), ("--rna_normalized_layer_output", "X")], + [("--rna_modality", "rna"), ("--prot_modality", "rna")], + [("--prot_raw_layer_input", "X"), ("--prot_normalized_layer_input", "X")], + [("--prot_raw_layer_output", "X"), ("--prot_normalized_layer_output", "X")], + ], +) +def test_arg_values_should_not_be_the_same( + run_component, sample_1_mudata_file, tmp_path, params +): + output_path = tmp_path / "output" + args = [ + "--input", + sample_1_mudata_file, + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + args += [param for param_tup in params for param in param_tup] + # The following avoids complaints about misszing input argument + if "--rna_raw_layer_input" not in args: + args += [ + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "log_normalized", + ] + if "--rna_modality" not in args: + args += ["--rna_modality", "mod1"] + + with pytest.raises(CalledProcessError) as err: + run_component(args) + assert re.search( + rf"The value for argument '{params[0][0].lstrip('-')}' \({params[0][1]}\) must not be the same as for argument '{params[1][0].lstrip('-')}' \({params[1][1]}\)", + err.value.stdout.decode("utf-8"), + ) + + +def test_output_directory_already_exists(run_component, sample_1_mudata_file, tmp_path): + output_path = tmp_path / "output" + output_path.mkdir() + run_component( + [ + "--input", + sample_1_mudata_file, + "--rna_modality", + "mod1", + "--prot_modality", + "mod2", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--prot_raw_layer_input", + "X", + "--prot_normalized_layer_input", + "layer_foo", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + ) + + +def test_convert_mudata(run_component, sample_1_mudata_file, tmp_path): + output_path = tmp_path / "output" + run_component( + [ + "--input", + sample_1_mudata_file, + "--rna_modality", + "mod1", + "--prot_modality", + "mod2", + "--rna_raw_layer_input", + "X", + "--rna_normalized_layer_input", + "layer1", + "--prot_raw_layer_input", + "X", + "--prot_normalized_layer_input", + "layer_foo", + "--rna_var_gene_names_input", + "Feat1", + "--tiledb_dir", + output_path, + ] + ) + + assert output_path.is_dir() + with tiledbsoma.DataFrame.open(f"{output_path}/obs") as obs_arr: + obs_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("cell_id", pa.large_string()), + pa.field("Obs1", pa.large_string()), + pa.field("Shared_obs_col", pa.large_string()), + pa.field("bool_column", pa.bool_()), + pa.field("float_column", pa.float64()), + pa.field("int_column", pa.int64()), + pa.field("Obs2", pa.large_string()), + pa.field("Obs3", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2, 3]), + pa.array(["obs1", "obs2", "obs3", "obs5"]), + pa.array(["A", "C", None, None]), + pa.array(["B", "D", "G", "M"]), + pa.array([True, False, None, None]), + pa.array([0.9, 0.2, None, None]), + pa.array([1, 2, None, None]), + pa.array([None, None, "E", "K"]), + pa.array([None, None, "F", "L"]), + ], + schema=expected_schema, + ) + assert expected.equals(obs_contents) + + with tiledbsoma.DataFrame.open(f"{output_path}/ms/rna/var") as obs_arr: + var_contents = obs_arr.read().concat() + expected_schema = pa.schema( + [ + pa.field("soma_joinid", pa.int64(), nullable=False), + pa.field("rna_index", pa.large_string()), + 
pa.field("Feat1", pa.large_string()), + pa.field("Shared_feat_col", pa.large_string()), + pa.field("gene_symbol", pa.large_string()), + ] + ) + expected = pa.Table.from_arrays( + [ + pa.array([0, 1, 2]), + pa.array(["var1", "var2", "overlapping_var_mod1"]), + pa.array(["a", "c", "e"]), + pa.array(["b", "d", "f"]), + pa.array(["Feat1", "Feat1", "Feat1"]), + ], + schema=expected_schema, + ) + assert expected.equals(var_contents) + with tiledbsoma.Experiment.open(f"{output_path}") as exp: + with tiledbsoma.ExperimentAxisQuery( + experiment=exp, measurement_name="rna" + ) as query: + assert query.n_obs == 4 + data = query.to_anndata("raw").X.todense() + expected = ([[1, 2, 3], [4, 5, 6], [0, 0, 0], [0, 0, 0]],) + np.testing.assert_allclose(data, np.asmatrix(np.array(expected))) + log_normalized_data = query.to_anndata("log_normalized").X.todense() + expected_log_normalized = [[94, 81, 60], [50, 88, 69], [0, 0, 0], [0, 0, 0]] + np.testing.assert_allclose( + log_normalized_data, np.asmatrix(np.array(expected_log_normalized)) + ) + + with tiledbsoma.ExperimentAxisQuery( + experiment=exp, measurement_name="prot" + ) as query: + assert query.n_obs == 4 + data = query.to_anndata("raw").X.todense() + expected = ([[0, 0], [9, 10], [7, 8], [11, 12]],) + np.testing.assert_allclose(data, np.asmatrix(np.array(expected))) + log_normalized_data = query.to_anndata("log_normalized").X.todense() + expected_log_normalized = [[0, 0], [76, 93], [20, 35], [100, 38]] + np.testing.assert_allclose( + log_normalized_data, np.asmatrix(np.array(expected_log_normalized)) + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) From 5a020553f30ab88656de1bd664fc2f883c47c079 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Wed, 11 Jun 2025 12:21:47 +0000 Subject: [PATCH 2/3] Update CHANGELOG entry --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d71adf59b7..e8506967c79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,11 @@ * `download_file` has been deprecated and will be removed in openpipeline 3.0 (PR #1015). +## NEW FUNCTIONALITY + +* (Experimental) Added `from_h5mu_or_h5ad_to_tiledb` component. Warning: the functionality in this component is experimental + and its behavior may change in future releases (PR #1034). + ## MAJOR CHANGES * `mapping/cellranger_*`: Upgrade CellRanger to v9.0 (PR #992 and #1006). From 059b0ecb7e76719e43756f7992a441d57732d4a2 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Wed, 11 Jun 2025 12:25:28 +0000 Subject: [PATCH 3/3] Update config --- src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml b/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml index b90e133a20f..55f741f92cf 100644 --- a/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml +++ b/src/convert/from_h5mu_or_h5ad_to_tiledb/config.vsh.yaml @@ -2,7 +2,9 @@ name: "from_h5mu_or_h5ad_to_tiledb" namespace: "convert" scope: "public" description: | - Convert a MuData or AnnData object to tiledb + Convert a MuData or AnnData object to tiledb. Currently, transcriptome and protein modalities are supported. + + NOTE: The functionality provided by this component is experimental and may be subject to change. 
authors: - __merge__: /src/authors/dries_schaumont.yaml roles: [ author, maintainer ] @@ -154,4 +156,4 @@ runners: - type: executable - type: nextflow directives: - label: [lowmem, singlecpu] \ No newline at end of file + label: [midmem, midcpu] \ No newline at end of file
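Below is a minimal usage sketch of how the TileDB-SOMA experiment produced by this component could be inspected, assuming it was run with `--tiledb_dir output` and the default output slot names ("rna" measurement, "raw" and "log_normalized" X layers, "cell_id" obs index); it only uses tiledbsoma calls that also appear in test.py.

import tiledbsoma

# Read the shared cell annotations written to <tiledb_dir>/obs.
with tiledbsoma.DataFrame.open("output/obs") as obs_arr:
    obs_table = obs_arr.read().concat()  # pyarrow.Table: soma_joinid, cell_id and the obs columns

# Query one measurement and export it back to AnnData.
with tiledbsoma.Experiment.open("output") as exp:
    with tiledbsoma.ExperimentAxisQuery(experiment=exp, measurement_name="rna") as query:
        raw = query.to_anndata("raw")                    # raw counts
        normalized = query.to_anndata("log_normalized")  # normalized counts
        print(query.n_obs, raw.X.shape)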