Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

## MINOR CHANGES

* `filter/filter_with_scrublet`: provide cleaner error message when running scrublet on an empty modality (PR #929).

* Several component (cleanup): remove workaround for using being able to use shared utility functions with Nextflow Fusion (PR #920).

* Several annotation (`src/annotate/`) components (`onclass`, `celltypist`, `random_forest_annotation`, `scanvi`, `svm_annotation`): Updated input parameteres to ensure uniformity across components, implemented functionality to cross-check the overlap of genes between query and reference (model) datasets and implemented logic to allow for subsetting of genes (PR #919).
Expand Down
6 changes: 2 additions & 4 deletions src/filter/filter_with_scrublet/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,8 @@ engines:
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
packages:
- scrublet
- annoy==1.16.3
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
- annoy==1.17.3
__merge__: [/src/base/requirements/python_test_setup.yaml, .]

runners:
- type: executable
Expand Down
6 changes: 6 additions & 0 deletions src/filter/filter_with_scrublet/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@
logger.info("Using layer '%s'.", "X" if not par["layer"] else par["layer"])
input_layer = data.X if not par["layer"] else data.layers[par["layer"]]

if 0 in input_layer.shape:
raise ValueError(
f"Modality {mod} of input Mudata {par['input']} appears "
f"to be empty (shape: {input_layer.shape})."
)

logger.info("\tRunning scrublet")
scrub = scr.Scrublet(input_layer)

Expand Down
187 changes: 117 additions & 70 deletions src/filter/filter_with_scrublet/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,40 +7,74 @@
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
from scipy.sparse import csr_matrix, csr_array

## VIASH START
meta = {
"name": "foo",
"resources_dir": "resources_test/",
"executable": "target/executable/filter/filter_with_scrublet/filter_with_scrublet",
"config": "./src/filter/filter_with_scrublet/config.vsh.yaml",
}
# def run_component(args_as_list):
# try:
# subprocess_args = [meta['executable']] + args_as_list
# print(" ".join(subprocess_args), flush=True)
# subprocess.check_output(subprocess_args, stderr=subprocess.STDOUT)
# except subprocess.CalledProcessError as e:
# print(e.stdout.decode("utf-8"), flush=True)
# raise e
## VIASH END

# read input file
input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
input_mu = mu.read_h5mu(input_path)
orig_obs = input_mu.mod["rna"].n_obs
orig_vars = input_mu.mod["rna"].n_vars
orig_prot_obs = input_mu.mod["prot"].n_obs
orig_prot_vars = input_mu.mod["prot"].n_vars


def test_filter_a_little_bit(run_component):
output_mu = "output-1.h5mu"
@pytest.fixture
def input_mudata_path():
return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"


@pytest.fixture
def input_mudata(input_mudata_path):
return mu.read_h5mu(input_mudata_path)


@pytest.fixture
def input_with_failed_run(random_h5mu_path, input_mudata_path):
new_mudata_path = random_h5mu_path()

mudata_in = mu.read_h5mu(input_mudata_path)

# Make test reproducable
np.random.seed(4)

# Simulate a failed scrublet run by passing very little cells
mudata = mudata_in[152].copy()
nobs = 100
x_data = np.repeat(mudata.mod["rna"].X.todense(), nobs, axis=0)

# Random perturbations because otherwise the detection fails in other ways (PCA cannot be run)
replace_rate = 0.000001
mask = np.random.choice(
[0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate)
).astype("bool")
r = np.random.rand(*x_data.shape) * np.max(x_data)
x_data[mask] = r[mask]

# create obs
obs_name = mudata.mod["rna"].obs.index.to_list()[0]
obs_data = pd.DataFrame([], index=[f"{obs_name}_{i}" for i in range(nobs)])

# create resulting mudata
mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod["rna"].var)
new_mudata = mu.MuData({"rna": mod})
new_mudata.update()
new_mudata.write(new_mudata_path)

return new_mudata_path


def test_filter_a_little_bit(
run_component, random_h5mu_path, input_mudata_path, input_mudata
):
output_mu = random_h5mu_path()

run_component(
[
"--input",
input_path,
input_mudata_path,
"--output",
output_mu,
"--min_counts",
Expand All @@ -56,13 +90,17 @@ def test_filter_a_little_bit(run_component):

new_obs = mu_out.mod["rna"].n_obs
new_vars = mu_out.mod["rna"].n_vars
assert new_obs == orig_obs, "No RNA obs should have been filtered"
assert new_vars == orig_vars, "No RNA vars should have been filtered"
assert (
mu_out.mod["prot"].n_obs == orig_prot_obs
new_obs == input_mudata.mod["rna"].n_obs
), "No RNA obs should have been filtered"
assert (
new_vars == input_mudata.mod["rna"].n_vars
), "No RNA vars should have been filtered"
assert (
mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs
), "No prot obs should have been filtered"
assert (
mu_out.mod["prot"].n_vars == orig_prot_vars
mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars
), "No prot vars should have been filtered"
assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [
"Gene Expression"
Expand All @@ -72,13 +110,15 @@ def test_filter_a_little_bit(run_component):
], "Feature types of prot modality should be Antibody Capture"


def test_filtering_a_lot(run_component):
output_mu = "output-2.h5mu"
def test_filtering_a_lot(
run_component, random_h5mu_path, input_mudata_path, input_mudata
):
output_mu = random_h5mu_path()

run_component(
[
"--input",
input_path,
input_mudata_path,
"--output",
output_mu,
"--modality",
Expand All @@ -95,11 +135,17 @@ def test_filtering_a_lot(run_component):
mu_out = mu.read_h5mu(output_mu)
new_obs = mu_out.mod["rna"].n_obs
new_vars = mu_out.mod["rna"].n_vars
assert new_obs < orig_obs, "Some cells should have been filtered"
assert new_vars == orig_vars, "No genes should have been filtered"
assert mu_out.mod["prot"].n_obs == orig_obs, "No prot obs should have been filtered"
assert (
mu_out.mod["prot"].n_vars == orig_prot_vars
new_obs < input_mudata.mod["rna"].n_obs
), "Some cells should have been filtered"
assert (
new_vars == input_mudata.mod["rna"].n_vars
), "No genes should have been filtered"
assert (
mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs
), "No prot obs should have been filtered"
assert (
mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars
), "No prot vars should have been filtered"
assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [
"Gene Expression"
Expand All @@ -109,49 +155,42 @@ def test_filtering_a_lot(run_component):
], "Feature types of prot modality should be Antibody Capture"


@pytest.fixture(scope="module")
def input_with_failed_run():
new_mudata_path = "pbmc-perturbed.h5mu"

mudata_in = mu.read_h5mu(input_path)

# Make test reproducable
np.random.seed(4)

# Simulate a failed scrublet run by passing very little cells
mudata = mudata_in[152].copy()
nobs = 100
x_data = np.repeat(mudata.mod["rna"].X.todense(), nobs, axis=0)

# Random perturbations because otherwise the detection fails in other ways (PCA cannot be run)
replace_rate = 0.000001
mask = np.random.choice(
[0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate)
).astype("bool")
r = np.random.rand(*x_data.shape) * np.max(x_data)
x_data[mask] = r[mask]

# create obs
obs_name = mudata.mod["rna"].obs.index.to_list()[0]
obs_data = pd.DataFrame([], index=[f"{obs_name}_{i}" for i in range(nobs)])

# create resulting mudata
mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod["rna"].var)
new_mudata = mu.MuData({"rna": mod})
new_mudata.update()
new_mudata.write(new_mudata_path)
def test_empty_mudata(run_component, random_h5mu_path):
output_mu = random_h5mu_path()
empty_mudata_path = random_h5mu_path()
empty_mudata = mu.MuData(
{
modality: ad.AnnData(csr_array((5, 0), dtype=np.int8))
for modality in ("rna",)
}
)

return new_mudata_path
empty_mudata.write(empty_mudata_path)
with pytest.raises(subprocess.CalledProcessError) as err:
run_component(
[
"--input",
empty_mudata_path,
"--output",
output_mu,
"--output_compression",
"gzip",
]
)
assert re.search(
"ValueError: Modality rna of input Mudata .* appears to be empty",
err.value.stdout.decode("utf-8"),
)


@pytest.mark.xfail(strict=False)
def test_doublet_automatic_threshold_detection_fails(
run_component, input_with_failed_run
run_component, input_with_failed_run, random_h5mu_path
):
"""
Test if the component fails if doublet score threshold could not automatically be set
"""
output_mu = "output-4.h5mu"
output_mu = random_h5mu_path()

with pytest.raises(subprocess.CalledProcessError) as e_info:
run_component(
Expand Down Expand Up @@ -217,9 +256,11 @@ def test_doublet_automatic_threshold_detection_fails_recovery(
assert mu_out.mod["rna"].obs["filter_with_scrublet"].isna().all()


def test_selecting_input_layer(run_component, tmp_path):
output_mu = "output-2.h5mu"
input_data = mu.read_h5mu(input_path)
def test_selecting_input_layer(
run_component, tmp_path, random_h5mu_path, input_mudata, input_mudata_path
):
output_mu = random_h5mu_path()
input_data = mu.read_h5mu(input_mudata_path)
input_data.mod["rna"].layers["test_layer"] = input_data.mod["rna"].X
input_data.mod["rna"].X = None

Expand Down Expand Up @@ -248,11 +289,17 @@ def test_selecting_input_layer(run_component, tmp_path):
mu_out = mu.read_h5mu(output_mu)
new_obs = mu_out.mod["rna"].n_obs
new_vars = mu_out.mod["rna"].n_vars
assert new_obs < orig_obs, "Some cells should have been filtered"
assert new_vars == orig_vars, "No genes should have been filtered"
assert mu_out.mod["prot"].n_obs == orig_obs, "No prot obs should have been filtered"
assert (
mu_out.mod["prot"].n_vars == orig_prot_vars
new_obs < input_mudata.mod["rna"].n_obs
), "Some cells should have been filtered"
assert (
new_vars == input_mudata.mod["rna"].n_vars
), "No genes should have been filtered"
assert (
mu_out.mod["prot"].n_obs == input_mudata.mod["prot"].n_obs
), "No prot obs should have been filtered"
assert (
mu_out.mod["prot"].n_vars == input_mudata.mod["prot"].n_vars
), "No prot vars should have been filtered"
assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [
"Gene Expression"
Expand Down
Loading