diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b75055cd93..d111d96fec7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # openpipelines 3.0.0 +## BREAKING CHANGES + +* Removed `split_h5mu_train_test` component (PR #1020). + ## MAJOR CHANGES * `mapping/cellranger_*`: Upgrade CellRanger to v9.0 (PR #992 and #1006). diff --git a/src/dataflow/split_h5mu_train_test/config.vsh.yaml b/src/dataflow/split_h5mu_train_test/config.vsh.yaml deleted file mode 100644 index 17d4c45a2e9..00000000000 --- a/src/dataflow/split_h5mu_train_test/config.vsh.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: split_h5mu_train_test -namespace: dataflow -description: Split mudata object into training and testing (and validation) datasets based on observations into separate mudata objects. -authors: - - __merge__: /src/authors/jakub_majercik.yaml - roles: [ author ] - -argument_groups: - - name: Inputs - description: Input dataset in mudata format. - arguments: - - name: "--input" - type: file - description: The input (query) data to be labeled. Should be a .h5mu file. - direction: input - required: true - example: input.h5mu - - name: "--modality" - description: Which modality to process. - type: string - default: "rna" - required: false - - name: Outputs - description: Output arguments. - arguments: - - name: "--output_train" - type: file - description: The output training data in mudata format. - direction: output - example: output_train.h5mu - required: true - - name: "--output_test" - type: file - description: The output testing data in mudata format. - direction: output - example: output_test.h5mu - required: true - - name: "--output_val" - type: file - description: The output validation data in mudata format. - direction: output - required: false - example: output_val.h5mu - - name: "--compression" - type: string - choices: ["gzip", "lzf"] - required: false - example: "gzip" - - name: Split arguments - description: Model arguments. - arguments: - - name: "--test_size" - type: double - description: The proportion of the dataset to include in the test split. - default: 0.2 - min: 0.0 - max: 1.0 - - name: "--val_size" - type: double - description: The proportion of the dataset to include in the validation split. - min: 0.0 - max: 1.0 - - name: "--shuffle" - type: boolean_true - description: Whether or not to shuffle the data before splitting. - - name: "--random_state" - type: integer - description: The seed used by the random number generator. - -resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py - -test_resources: - - type: python_script - path: test.py - - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu - -engines: - - type: docker - image: python:3.10-slim - setup: - - type: apt - packages: - - libhdf5-dev - - procps - - type: python - __merge__: [ /src/base/requirements/scanpy.yaml, .] - - type: python - packages: - - scikit-learn==1.4.2 - - type: python - __merge__: [ /src/base/requirements/anndata_mudata.yaml, .] - __merge__: [ /src/base/requirements/python_test_setup.yaml, .] -runners: - - type: executable - - type: nextflow \ No newline at end of file diff --git a/src/dataflow/split_h5mu_train_test/script.py b/src/dataflow/split_h5mu_train_test/script.py deleted file mode 100644 index 551311824b7..00000000000 --- a/src/dataflow/split_h5mu_train_test/script.py +++ /dev/null @@ -1,77 +0,0 @@ -import mudata as mu -from sklearn.model_selection import train_test_split -import sys - -### VIASH START -par = { - "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", - "modality": "rna", - "test_size": 0.2, - "val_size": None, - "random_state": 42, - "output_train": "train.h5mu", - "output_val": "val.h5mu", - "output_test": "test.h5mu", - "compression": "gzip", - "shuffle": True, -} -### VIASH END - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger - -logger = setup_logger() - - -def main(): - input_mudata = mu.read_h5mu(par["input"]) - input_modality = input_mudata.mod[par["modality"]] - - n_obs = input_modality.n_obs - train_idx, test_idx = train_test_split( - range(n_obs), - test_size=par["test_size"], - random_state=par["random_state"], - shuffle=par["shuffle"], - ) - - if bool(par["val_size"]) != bool(par["output_val"]): - raise ValueError( - "Both --val_size and --output_val must be set to use validation set." - ) - - elif par["val_size"] and par["output_val"]: - if par["val_size"] + par["test_size"] > 1: - raise ValueError("Sum of test_size and val_size must not exceed 1.") - - val_size_relative = par["val_size"] / (1 - par["test_size"]) - train_idx, val_idx = train_test_split( - train_idx, - test_size=val_size_relative, - random_state=par["random_state"], - shuffle=par["shuffle"], - ) - - train_modality = input_modality[train_idx].copy() - val_modality = input_modality[val_idx].copy() - test_modality = input_modality[test_idx].copy() - - train_mudata = mu.MuData({par["modality"]: train_modality}) - val_mudata = mu.MuData({par["modality"]: val_modality}) - test_mudata = mu.MuData({par["modality"]: test_modality}) - - val_mudata.write_h5mu(par["output_val"], compression=par["compression"]) - - else: - train_modality = input_modality[train_idx].copy() - test_modality = input_modality[test_idx].copy() - - train_mudata = mu.MuData({par["modality"]: train_modality}) - test_mudata = mu.MuData({par["modality"]: test_modality}) - - train_mudata.write_h5mu(par["output_train"], compression=par["compression"]) - test_mudata.write_h5mu(par["output_test"], compression=par["compression"]) - - -if __name__ == "__main__": - main() diff --git a/src/dataflow/split_h5mu_train_test/test.py b/src/dataflow/split_h5mu_train_test/test.py deleted file mode 100644 index 3d2731e7194..00000000000 --- a/src/dataflow/split_h5mu_train_test/test.py +++ /dev/null @@ -1,173 +0,0 @@ -import sys -import os -import pytest -import subprocess -import re -import mudata as mu - - -## VIASH START -meta = {"resources_dir": "resources_test"} -## VIASH END - -input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" - - -def test_train_test(run_component, random_h5mu_path): - output_train = random_h5mu_path() - output_test = random_h5mu_path() - - run_component( - [ - "--input", - input_file, - "--modality", - "rna", - "--test_size", - "0.2", - "--output_train", - output_train, - "--output_test", - output_test, - ] - ) - - assert os.path.exists(output_train), "train file does not exist" - assert os.path.exists(output_test), "test file does not exist" - - input_mudata = mu.read_h5mu(input_file) - train_mudata = mu.read_h5mu(output_train) - test_mudata = mu.read_h5mu(output_test) - - assert list(train_mudata.mod.keys()) == list(test_mudata.mod.keys()) == ["rna"] - - assert ( - train_mudata.mod["rna"].n_obs + test_mudata.mod["rna"].n_obs - == input_mudata.mod["rna"].n_obs - ), "train and test data do not sum up to input data" - - assert ( - abs(train_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.8)) - <= 1 - ), "train data has wrong size" - assert ( - abs(test_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.2)) - <= 1 - ), "test data has wrong size" - - -def test_train_val_test(run_component, random_h5mu_path): - output_train = random_h5mu_path() - output_val = random_h5mu_path() - output_test = random_h5mu_path() - - run_component( - [ - "--input", - input_file, - "--modality", - "rna", - "--test_size", - "0.2", - "--val_size", - "0.1", - "--output_train", - output_train, - "--output_val", - output_val, - "--output_test", - output_test, - ] - ) - - assert os.path.exists(output_train), "train file does not exist" - assert os.path.exists(output_val), "val file does not exist" - assert os.path.exists(output_test), "test file does not exist" - - input_mudata = mu.read_h5mu(input_file) - train_mudata = mu.read_h5mu(output_train) - val_mudata = mu.read_h5mu(output_val) - test_mudata = mu.read_h5mu(output_test) - - assert ( - list(train_mudata.mod.keys()) - == list(val_mudata.mod.keys()) - == list(test_mudata.mod.keys()) - == ["rna"] - ) - - assert ( - train_mudata.mod["rna"].n_obs - + val_mudata.mod["rna"].n_obs - + test_mudata.mod["rna"].n_obs - == input_mudata.mod["rna"].n_obs - ), "train, val and test data do not sum up to input data" - - assert ( - abs(train_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.7)) - <= 1 - ), "train data has wrong size" - assert ( - abs(val_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.1)) - <= 1 - ), "val data has wrong size" - assert ( - abs(test_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.2)) - <= 1 - ), "test data has wrong size" - - -def test_raise_test_val_size(run_component): - with pytest.raises(subprocess.CalledProcessError) as err: - run_component( - [ - "--input", - input_file, - "--modality", - "rna", - "--test_size", - "0.9", - "--val_size", - "0.5", - "--output_train", - "train.h5mu", - "--output_val", - "val.h5mu", - "--output_test", - "test.h5mu", - ] - ) - - assert re.search( - r"Sum of test_size and val_size must not exceed 1.", - err.value.stdout.decode("utf-8"), - ) - - -def test_raise_invalid_val_out(run_component, random_h5mu_path): - with pytest.raises(subprocess.CalledProcessError) as err: - run_component( - [ - "--input", - input_file, - "--modality", - "rna", - "--test_size", - "0.2", - "--val_size", - "0.1", - "--output_train", - "train.h5mu", - "--output_test", - "test.h5mu", - ] - ) - - assert re.search( - r"Both --val_size and --output_val must be set to use validation set.", - err.value.stdout.decode("utf-8"), - ) - - -if __name__ == "__main__": - sys.exit(pytest.main([__file__]))