Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,16 @@

* `metadata/add_id` and `metadata/grep_annotation_column`: Bump python to 3.11 (PR #697).

## Minor changes
## BUG FIXES

* `dataflow/split_modalities`: remove unused `compression` argument. Use `output_compression` instead (PR #714).

## MINOR CHANGES

* Bump viash to 0.8.5 (PR #697).

* `dataflow/split_modalities`: add more logging output and bump python to 3.12 (PR #714).

# openpipelines 1.0.0rc1

## BREAKING CHANGES
Expand Down
10 changes: 8 additions & 2 deletions src/base/openpipelinetestutils/fixtures.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
from uuid import uuid4
import pytest

@pytest.fixture
def random_path(tmp_path):
def wrapper(extension=None):
extension = "" if not extension else extension
return tmp_path / f"{uuid4()}.{extension}"
return wrapper

@pytest.fixture
def random_h5mu_path(tmp_path):
def random_h5mu_path(random_path):
def wrapper():
return tmp_path / f"{uuid4()}.h5mu"
return random_path(extension="h5mu")
return wrapper

@pytest.fixture
Expand Down
11 changes: 2 additions & 9 deletions src/dataflow/split_modalities/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,30 +34,23 @@ functionality:
direction: output
example: types.csv
description: A csv containing the base filename and modality type per output file.
- name: "--compression"
type: string
description: The compression format to be used on the final h5mu object.
default: "gzip"
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu
platforms:
- type: docker
image: python:3.10-slim
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: /src/base/requirements/anndata_mudata.yaml
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
- type: native
- type: nextflow
directives:
Expand Down
34 changes: 22 additions & 12 deletions src/dataflow/split_modalities/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@
"input": "./resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
"output": "foo/",
"output_types": "foo_types.csv",
"compression": "gzip",
"output_compression": "gzip",
}
meta = {
"resources_dir": "."
}
### VIASH END

Expand All @@ -35,28 +38,35 @@ def setup_logger():

def main() -> None:
output_dir = Path(par["output"])
logger.info("Creating output directory '%s' if it does not exist", output_dir)
if not output_dir.is_dir():
logger.info("Creating %s", output_dir)
output_dir.mkdir(parents=True)

logger.info('Reading input file %s', par['input'])
sample = md.read_h5mu(par["input"].strip())
input_file = Path(par["input"])

logger.info('Creating output types csv')
logger.info("Reading input file '%s'", par['input'])
input_file = Path(par["input"].strip())
sample = md.read_h5mu(input_file)

logger.info('Creating output types CSV.')
modalities = list(sample.mod.keys())

logger.info("Found the following modalities:\n%s", "\n".join(modalities))
names = {mod_name: f"{input_file.stem}_{mod_name}.h5mu"
for mod_name in sample.mod.keys() }
df = pd.DataFrame({"name": list(names.keys()), "filename": list(names.values())})
for mod_name in modalities}
output_files = list(names.values())
logger.info("Will be creating the following output .h5mu files:\n%s", "\n".join(output_files))
df = pd.DataFrame({"name": modalities, "filename": output_files})
logger.info("Writing output_types CSV file to '%s'.", par["output_types"])
df.to_csv(par["output_types"], index=False)

logger.info('Splitting up modalities %s', ", ".join(sample.mod.keys()))
logger.info('Splitting input file into unimodal output files.')
for mod_name, mod in sample.mod.items():
logger.info("Processing modality '%s'", mod_name)
new_sample = md.MuData({mod_name: mod})
logger.info('Writing to %s', names[mod_name])
logger.info("Writing to '%s', with compression '%s'", names[mod_name], par["output_compression"])
new_sample.write_h5mu(output_dir / names[mod_name], compression=par["output_compression"])

logger.info("Done writing output file.")
logger.info("Finished")


if __name__ == "__main__":
main()
103 changes: 75 additions & 28 deletions src/dataflow/split_modalities/test.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,102 @@
import sys
import pytest
import mudata as md
import anndata as ad
import pandas as pd
import re
from openpipelinetestutils.asserters import assert_annotation_objects_equal
from textwrap import dedent

## VIASH START
meta = {
'functionality_name': './target/native/dataflow/split_modalities/split_modalities',
'resources_dir': './resources_test/'
'resources_dir': './resources_test/',
'config': './src/dataflow/split_modalities/config.vsh.yaml',
'executable': './target/docker/dataflow/split_modalities/split_modalities'
}
## VIASH END

input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
@pytest.fixture
def input_modality_1():
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"])
obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"])
var = pd.DataFrame([["a"], ["b"], ["c"]],
index=df.columns, columns=["Feat"])
ad1 = ad.AnnData(df, obs=obs, var=var)
return ad1

def test_split(run_component, tmp_path):
output_dir = tmp_path / "foo"
output_types = tmp_path / "foo.csv"

run_component([
"--input", input_file,

@pytest.fixture
def input_modality_2():
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"])
var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"])
obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"])
ad2 = ad.AnnData(df, obs=obs2, var=var2)
return ad2


@pytest.fixture
def input_h5mu(input_modality_1, input_modality_2):
tmp_mudata = md.MuData({'mod1': input_modality_1, 'mod2': input_modality_2})
return tmp_mudata


@pytest.fixture
def input_h5mu_path(write_mudata_to_file, input_h5mu):
return write_mudata_to_file(input_h5mu)

@pytest.mark.parametrize("compression", ["gzip", None])
def test_split(run_component, random_path, input_h5mu, input_h5mu_path,
input_modality_1, input_modality_2, compression):
output_dir = random_path()
output_types = random_path(extension="csv")
args = [
"--input", input_h5mu_path,
"--output", str(output_dir),
"--output_types", str(output_types),
"--output_compression", "gzip"
])
]
if compression:
args += ["--output_compression", compression]
run_component(args)
assert output_types.is_file()
assert output_dir.is_dir()

# todo: check whether contents of output_types is correct

# check output dir
dir_content = [h5mu_file for h5mu_file in output_dir.iterdir() if h5mu_file.suffix == ".h5mu"]
rna_file = output_dir / "pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5mu"
prot_file = output_dir / "pbmc_1k_protein_v3_filtered_feature_bc_matrix_prot.h5mu"
assert set(dir_content), set([prot_file == rna_file])
input_file_contents = md.read_h5mu(input_file)
rna = md.read_h5mu(rna_file)
prot = md.read_h5mu(prot_file)
dir_content = [h5mu_file for h5mu_file in output_dir.iterdir()
if h5mu_file.suffix == ".h5mu" and h5mu_file != input_h5mu_path]
mod1_file = output_dir / f"{input_h5mu_path.stem}_mod1.h5mu"
mod2_file = output_dir / f"{input_h5mu_path.stem}_mod2.h5mu"
assert set(dir_content) == set([mod1_file, mod2_file])
mod1 = md.read_h5mu(mod1_file)
mod2 = md.read_h5mu(mod2_file)
assert mod1.n_mod == 1
assert mod2.n_mod == 1

assert rna.n_mod == 1
assert prot.n_mod == 1
assert_annotation_objects_equal(mod1.mod['mod1'], input_modality_1)
assert_annotation_objects_equal(mod2.mod['mod2'], input_modality_2)

assert rna.n_obs == input_file_contents.n_obs
assert prot.n_obs == input_file_contents.n_obs
assert mod1.n_obs == input_h5mu.n_obs
assert mod2.n_obs == input_h5mu.n_obs

# When a var_key is only present for one modality, it is prefixed by the name of the
# modality followed by a colon and the name of the key (in the global .var).
replace_regex = r"(^rna:|^prot:)"
expected_var_keys = {re.sub(replace_regex, "", col_name) for col_name in input_file_contents.var_keys()}
assert set(rna.var_keys()) | set(prot.var_keys()) == expected_var_keys
replace_regex = r"(^mod1:|^mod2:)"
expected_var_keys = {re.sub(replace_regex, "", col_name) for col_name in input_h5mu.var_keys()}
assert set(mod1.var_keys()) | set(mod2.var_keys()) == expected_var_keys

assert set(mod1.var_keys()) == set(input_h5mu.mod['mod1'].var.columns)
assert set(mod2.var_keys()) == set(input_h5mu.mod['mod2'].var.columns)

assert set(rna.var_keys()) == set(input_file_contents.mod['rna'].var.columns)
assert set(rna.var_keys()) == set(input_file_contents.mod['prot'].var.columns)
expected_csv_output = dedent(
f"""\
name,filename
mod1,{mod1_file.name}
mod2,{mod2_file.name}
"""
)
with open(output_types, 'r') as open_csv_file:
result = open_csv_file.read()
assert result == expected_csv_output

if __name__ == "__main__":
sys.exit(pytest.main([__file__]))