diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b8c16e61ad..7d0231086e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,11 @@ * `from_h5mu_or_h5ad_to_seurat`: converts an h5ad file or a single modality from an h5mu file to a seurat object (PR #1046). +* (Experimental) `differential_expression/create_pseudobulk`: Added a component to generate pseudobulk samples from single-cell transcriptomics data, + to create bulk-like expression profiles suitable for differential expression analysis with methods designed for bulk differential expression analysis. + Warning: the functionality in this component is experimental and its behavior may change in future releases (PR #1042). + + ## MAJOR CHANGES * `mapping/cellranger_*`: Upgrade CellRanger to v9.0 (PR #992 and #1006). diff --git a/resources_test_scripts/annotation_test_data.sh b/resources_test_scripts/annotation_test_data.sh old mode 100644 new mode 100755 index 1aea9ec3291..b9db9855e11 --- a/resources_test_scripts/annotation_test_data.sh +++ b/resources_test_scripts/annotation_test_data.sh @@ -34,15 +34,20 @@ wget "https://zenodo.org/record/7580707/files/pretrained_models_Blood_ts.tar.gz? # Process Tabula Sapiens Blood reference h5ad # (Select one individual and 100 cells per cell type) # normalize and log1p transform data +# Add treatment and disease columns python <=n].groupby('cell_ontology_class').head(n).index] -# assert sub_ref_adata_final.shape == (500, 58870) + +# Normalize and log1p transform data data_for_scanpy = ad.AnnData(X=sub_ref_adata_final.X) sc.pp.normalize_total(data_for_scanpy, target_sum=10000) sc.pp.log1p( @@ -50,8 +55,17 @@ sc.pp.log1p( base=None, layer=None, copy=False, -) +) sub_ref_adata_final.layers["log_normalized"] = data_for_scanpy.X + +# Add treatment and disease columns +n_cells = sub_ref_adata_final.n_obs +treatment = np.random.choice(["ctrl", "stim"], size=n_cells, p=[0.5, 0.5]) +disease = np.random.choice(["healthy", "diseased"], size=n_cells, p=[0.5, 0.5]) +sub_ref_adata_final.obs["treatment"] = treatment +sub_ref_adata_final.obs["disease"] = disease + +# Write out data sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip') HEREDOC @@ -115,3 +129,13 @@ viash run src/integrate/scanvi/config.vsh.yaml --engine docker -- \ rm "${OUT}/scanvi_output.h5mu" rm "${OUT}/scvi_output.h5mu" rm -r "${OUT}/Pretrained_model/" + +echo "> Creating Pseudobulk Data for DGEA" +viash run src/differential_expression/create_pseudobulk/config.vsh.yaml --engine docker -- \ + --input "${OUT}/TS_Blood_filtered.h5mu" \ + --obs_grouping "cell_type" \ + --obs_sample_conditions "donor_id" \ + --obs_sample_conditions "treatment" \ + --obs_sample_conditions "disease" \ + --min_num_cells_per_sample 5 \ + --output "${OUT}/TS_Blood_filtered_pseudobulk.h5mu" \ No newline at end of file diff --git a/src/differential_expression/create_pseudobulk/config.vsh.yaml b/src/differential_expression/create_pseudobulk/config.vsh.yaml new file mode 100644 index 00000000000..aa90b260be1 --- /dev/null +++ b/src/differential_expression/create_pseudobulk/config.vsh.yaml @@ -0,0 +1,111 @@ +name: differential_expression +namespace: "create_pseudobulk" +scope: "public" +description: | + Generation of pseudobulk samples from single-cell transcriptomics data, + by aggregating raw gene expression counts from individual cells to create + bulk-like expression profiles suitable for differential expression analysis + with methods designed for bulk differential expression analysis. + Note that this componentonly considers factors as explanatory variables, + and excludes covariates from the analysis. + +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ contributor ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file. + direction: input + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. If None, X is used. This layer must contain raw counts." + - name: "--obs_label" + type: string + required: true + description: ".obs field containing the variable to group on. Typically this field contains cell groups such as annotated cell types or clusters." + - name: "--obs_groups" + type: string + multiple: true + description: ".obs fields containing the experimental condition(s) to create pseudobulk samples for." + + - name: Arguments + arguments: + - name: "--aggregation_method" + type: string + choices: ["sum", "mean"] + default: "sum" + description: "Method to aggregate the raw counts for pseudoreplicates. Either sum or mean." + - name: "--min_obs_per_sample" + type: integer + min: 1 + default: 30 + description: "Minimum number of cells per pseudobulk sample." + - name: --random_state + type: integer + description: | + The random seed for sampling. + min: 0 + default: 0 + required: false + - name: "--obs_cell_count" + type: string + default: "n_cells" + description: "The .obs field to store the number of observations per pseudobulk sample." + + - name: Outputs + arguments: + - name: "--output" + type: file + description: Output h5mu file containing the aggregated pseudobulk samples. + direction: output + required: true + - name: "--output_compression" + type: string + description: | + The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + +engines: +- type: docker + image: python:3.12 + setup: + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [lowcpu, lowmem] diff --git a/src/differential_expression/create_pseudobulk/script.py b/src/differential_expression/create_pseudobulk/script.py new file mode 100644 index 00000000000..a9af50f6d34 --- /dev/null +++ b/src/differential_expression/create_pseudobulk/script.py @@ -0,0 +1,107 @@ +import numpy as np +import mudata as mu +import pandas as pd +import sys +import scanpy as sc +import scipy.sparse as sp + +## VIASH START +par = { + "input": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + "modality": "rna", + "input_layer": None, + "obs_label": "cell_type", + "obs_groups": ["treatment", "donor_id", "disease"], + "obs_cell_count": "n_cells", + "min_obs_per_sample": 5, + "random_state": 0, + "output": "test.h5mu", + "output_compression": "gzip", +} +meta = {"resources_dir": "src/utils"} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +def is_normalized(layer): + if sp.issparse(layer): + row_sums = np.array(layer.sum(axis=1)).flatten() + else: + row_sums = layer.sum(axis=1) + + return np.allclose(row_sums, 1) + + +def count_obs(adata, pb_adata, obs_cols): + counts = [] + for i in range(pb_adata.n_obs): + values = pb_adata.obs.iloc[i][obs_cols] + + mask = pd.Series([True] * adata.n_obs) + for col in obs_cols: + mask &= (adata.obs[col] == values[col]).values + + count = mask.sum() + counts.append(count) + + return counts + + +def main(): + # Read in data + logger.info(f"Reading input data {par['input']}...") + adata = mu.read_h5ad(par["input"], mod=par["modality"]).copy() + + # Make sure .X contains raw counts + if par["input_layer"]: + adata.X = adata.layers[par["input_layer"]] + if is_normalized(adata.X): + raise ValueError("Input layer must contain raw counts.") + + # Sanitize pseudobulk aggregation fields + pseudobulk_cols = [par["obs_label"]] + if par["obs_groups"]: + pseudobulk_cols += par["obs_groups"] + + for col in pseudobulk_cols: + if col not in adata.obs.columns: + raise ValueError(f"Required column '{col}' not found in .obs.") + adata.obs[col] = ( + adata.obs[col] + .astype(str) + .str.replace(" ", "_") + .str.replace("+", "") + .astype("category") + ) + + # Aggregate pseudobulk data per cell group + logger.info("Creating pseudobulk samples...") + adata_pb = sc.get.aggregate(adata, by=pseudobulk_cols, func="sum", axis=0) + adata_pb.X = adata_pb.layers["sum"] + del adata_pb.layers["sum"] + + adata_pb.obs_names = [ + "_".join(values) + for values in zip(*[adata_pb.obs[col] for col in pseudobulk_cols]) + ] + + # Filter pseudobulk samples based on minimum observation count + logger.info("Filtering pseudobulk samples based on minimum observation count...") + adata_pb.obs[par["obs_cell_count"]] = count_obs(adata, adata_pb, pseudobulk_cols) + adata_pb = adata_pb[adata_pb.obs[par["obs_cell_count"]] > par["min_obs_per_sample"]] + + logger.info( + f"Final dataset: {adata_pb.n_obs} pseudobulk samples, {adata_pb.n_vars} genes" + ) + + logger.info("Writing output data...") + mdata_pb = mu.MuData({"rna": adata_pb}) + mdata_pb.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == "__main__": + main() diff --git a/src/differential_expression/create_pseudobulk/test.py b/src/differential_expression/create_pseudobulk/test.py new file mode 100644 index 00000000000..f40963ee553 --- /dev/null +++ b/src/differential_expression/create_pseudobulk/test.py @@ -0,0 +1,113 @@ +import os +import mudata as mu +import sys +import pytest + +## VIASH START +meta = {"resources_dir": "resources_test/"} +## VIASH END + +sys.path.append(meta["resources_dir"]) + +input_path = meta["resources_dir"] + "/TS_Blood_filtered.h5mu" + + +def test_simple_execution(run_component, random_h5mu_path): + output_path = random_h5mu_path() + + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--obs_label", + "cell_type", + "--obs_groups", + "treatment", + "--output_compression", + "gzip", + ] + ) + + assert os.path.exists(output_path), "Output query file does not exist" + mdata = mu.read_h5mu(output_path) + adata = mdata.mod["rna"] + + expected_obs = ["treatment", "cell_type", "n_cells"] + assert all(col in adata.obs for col in expected_obs), ( + f"Expected columns {expected_obs} not found in .obs" + ) + assert adata.shape[0] == 8, "Expected a total of 8 pseudobulk samples in the output" + + +def test_multiple_factors(run_component, random_h5mu_path): + output_path = random_h5mu_path() + + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--obs_label", + "cell_type", + "--obs_groups", + "disease", + "--obs_groups", + "donor_id", + "--obs_groups", + "treatment", + "--min_obs_per_sample", + "5", + "--output_compression", + "gzip", + ] + ) + + assert os.path.exists(output_path), "Output query file does not exist" + mdata = mu.read_h5mu(output_path) + adata = mdata.mod["rna"] + + expected_obs = ["treatment", "cell_type", "n_cells"] + assert all(col in adata.obs for col in expected_obs), ( + f"Expected columns {expected_obs} not found in .obs" + ) + assert adata.shape[0] == 16, ( + "Expected a total of 16 pseudobulk samples in the output" + ) + + +def test_filtering(run_component, random_h5mu_path): + output_path = random_h5mu_path() + + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--obs_label", + "cell_type", + "--obs_groups", + "treatment", + "--min_obs_per_sample", + "50", + "--output_compression", + "gzip", + ] + ) + + assert os.path.exists(output_path), "Output query file does not exist" + mdata = mu.read_h5mu(output_path) + adata = mdata.mod["rna"] + + expected_obs = ["treatment", "cell_type", "n_cells"] + assert all(col in adata.obs for col in expected_obs), ( + f"Expected columns {expected_obs} not found in .obs" + ) + assert adata.shape[0] == 4, "Expected a total of 8 pseudobulk samples in the output" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__]))