openpipelines-bio · jakubmajercik · Jul 24, 2025 · Jul 2, 2025 · Jul 2, 2025 · Jul 2, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,6 +27,11 @@
 
 * `from_h5mu_or_h5ad_to_seurat`: converts an h5ad file or a single modality from an h5mu file to a seurat object (PR #1046).
 
+* (Experimental) `differential_expression/create_pseudobulk`: Added a component to generate pseudobulk samples from single-cell transcriptomics data, 
+  to create bulk-like expression profiles suitable for differential expression analysis with methods designed for bulk differential expression analysis. 
+  Warning: the functionality in this component is experimental and its behavior may change in future releases (PR #1042).
+
+
 ## MAJOR CHANGES
 
 * `mapping/cellranger_*`: Upgrade CellRanger to v9.0 (PR #992 and #1006).

diff --git a/resources_test_scripts/annotation_test_data.sh b/resources_test_scripts/annotation_test_data.sh
@@ -34,24 +34,38 @@ wget "https://zenodo.org/record/7580707/files/pretrained_models_Blood_ts.tar.gz?
 # Process Tabula Sapiens Blood reference h5ad
 # (Select one individual and 100 cells per cell type)
 # normalize and log1p transform data
+# Add treatment and disease columns
 python <<HEREDOC
 import anndata as ad
 import scanpy as sc
+import numpy as np
+
+# Read in data
 ref_adata = ad.read_h5ad("${OUT}/tmp_TS_Blood_filtered.h5ad")
 sub_ref_adata = ref_adata[ref_adata.obs["donor_assay"] == "TSP14_10x 3' v3"] 
 n=100
 s=sub_ref_adata.obs.groupby('cell_ontology_class').cell_ontology_class.transform('count')
 sub_ref_adata_final = sub_ref_adata[sub_ref_adata.obs[s>=n].groupby('cell_ontology_class').head(n).index]
-# assert sub_ref_adata_final.shape == (500, 58870)
+
+# Normalize and log1p transform data
 data_for_scanpy = ad.AnnData(X=sub_ref_adata_final.X)
 sc.pp.normalize_total(data_for_scanpy, target_sum=10000)
 sc.pp.log1p(
     data_for_scanpy,
     base=None,
     layer=None,
     copy=False,
-)  
+)
 sub_ref_adata_final.layers["log_normalized"] = data_for_scanpy.X
+
+# Add treatment and disease columns
+n_cells = sub_ref_adata_final.n_obs
+treatment = np.random.choice(["ctrl", "stim"], size=n_cells, p=[0.5, 0.5])
+disease = np.random.choice(["healthy", "diseased"], size=n_cells, p=[0.5, 0.5])
+sub_ref_adata_final.obs["treatment"] = treatment
+sub_ref_adata_final.obs["disease"] = disease
+
+# Write out data
 sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip')
 HEREDOC
 
@@ -115,3 +129,13 @@ viash run src/integrate/scanvi/config.vsh.yaml --engine docker -- \
 rm "${OUT}/scanvi_output.h5mu"
 rm "${OUT}/scvi_output.h5mu"
 rm -r "${OUT}/Pretrained_model/"
+
+echo "> Creating Pseudobulk Data for DGEA"
+viash run src/differential_expression/create_pseudobulk/config.vsh.yaml --engine docker -- \
+    --input "${OUT}/TS_Blood_filtered.h5mu" \
+    --obs_grouping "cell_type" \
+    --obs_sample_conditions "donor_id" \
+    --obs_sample_conditions "treatment" \
+    --obs_sample_conditions "disease" \
+    --min_num_cells_per_sample 5 \
+    --output "${OUT}/TS_Blood_filtered_pseudobulk.h5mu" 
diff --git a/src/differential_expression/create_pseudobulk/config.vsh.yaml b/src/differential_expression/create_pseudobulk/config.vsh.yaml
@@ -0,0 +1,111 @@
+name: differential_expression
+namespace: "create_pseudobulk"
+scope: "public"
+description: |
+  Generation of pseudobulk samples from single-cell transcriptomics data,
+  by aggregating raw gene expression counts from individual cells to create 
+  bulk-like expression profiles suitable for differential expression analysis
+  with methods designed for bulk differential expression analysis.
+  Note that this componentonly considers factors as explanatory variables, 
+  and excludes covariates from the analysis.
+
+authors:
+  - __merge__: /src/authors/dorien_roosen.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/dries_de_maeyer.yaml
+    roles: [ contributor ]
+  - __merge__: /src/authors/weiwei_schultz.yaml
+    roles: [ contributor ]
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        alternatives: ["-i"]
+        type: file
+        description: Input h5mu file.
+        direction: input
+        required: true
+      - name: "--modality"
+        description: |
+          Which modality from the input MuData file to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        required: false
+        description: "Input layer to use. If None, X is used. This layer must contain raw counts."
+      - name: "--obs_label"
+        type: string
+        required: true
+        description: ".obs field containing the variable to group on. Typically this field contains cell groups such as annotated cell types or clusters."
+      - name: "--obs_groups"
+        type: string
+        multiple: true
+        description: ".obs fields containing the experimental condition(s) to create pseudobulk samples for."
+
+  - name: Arguments
+    arguments:
+      - name: "--aggregation_method"
+        type: string
+        choices: ["sum", "mean"]
+        default: "sum"
+        description: "Method to aggregate the raw counts for pseudoreplicates. Either sum or mean."
+      - name: "--min_obs_per_sample"
+        type: integer
+        min: 1
+        default: 30
+        description: "Minimum number of cells per pseudobulk sample."
+      - name: --random_state
+        type: integer
+        description: |
+          The random seed for sampling.
+        min: 0
+        default: 0
+        required: false
+      - name: "--obs_cell_count"
+        type: string
+        default: "n_cells"
+        description: "The .obs field to store the number of observations per pseudobulk sample."
+
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file containing the aggregated pseudobulk samples.
+        direction: output
+        required: true
+      - name: "--output_compression"
+        type: string
+        description: |
+          The compression format to be used on the output h5mu object.
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+
+engines:
+- type: docker
+  image: python:3.12
+  setup:
+    - type: python
+      __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ]
+  __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
+
+runners:
+- type: executable
+- type: nextflow
+  directives:
+    label: [lowcpu, lowmem]
diff --git a/src/differential_expression/create_pseudobulk/script.py b/src/differential_expression/create_pseudobulk/script.py
@@ -0,0 +1,107 @@
+import numpy as np
+import mudata as mu
+import pandas as pd
+import sys
+import scanpy as sc
+import scipy.sparse as sp
+
+## VIASH START
+par = {
+    "input": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
+    "modality": "rna",
+    "input_layer": None,
+    "obs_label": "cell_type",
+    "obs_groups": ["treatment", "donor_id", "disease"],
+    "obs_cell_count": "n_cells",
+    "min_obs_per_sample": 5,
+    "random_state": 0,
+    "output": "test.h5mu",
+    "output_compression": "gzip",
+}
+meta = {"resources_dir": "src/utils"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+
+logger = setup_logger()
+
+
+def is_normalized(layer):
+    if sp.issparse(layer):
+        row_sums = np.array(layer.sum(axis=1)).flatten()
+    else:
+        row_sums = layer.sum(axis=1)
+
+    return np.allclose(row_sums, 1)
+
+
+def count_obs(adata, pb_adata, obs_cols):
+    counts = []
+    for i in range(pb_adata.n_obs):
+        values = pb_adata.obs.iloc[i][obs_cols]
+
+        mask = pd.Series([True] * adata.n_obs)
+        for col in obs_cols:
+            mask &= (adata.obs[col] == values[col]).values
+
+        count = mask.sum()
+        counts.append(count)
+
+    return counts
+
+
+def main():
+    # Read in data
+    logger.info(f"Reading input data {par['input']}...")
+    adata = mu.read_h5ad(par["input"], mod=par["modality"]).copy()
+
+    # Make sure .X contains raw counts
+    if par["input_layer"]:
+        adata.X = adata.layers[par["input_layer"]]
+    if is_normalized(adata.X):
+        raise ValueError("Input layer must contain raw counts.")
+
+    # Sanitize pseudobulk aggregation fields
+    pseudobulk_cols = [par["obs_label"]]
+    if par["obs_groups"]:
+        pseudobulk_cols += par["obs_groups"]
+
+    for col in pseudobulk_cols:
+        if col not in adata.obs.columns:
+            raise ValueError(f"Required column '{col}' not found in .obs.")
+        adata.obs[col] = (
+            adata.obs[col]
+            .astype(str)
+            .str.replace(" ", "_")
+            .str.replace("+", "")
+            .astype("category")
+        )
+
+    # Aggregate pseudobulk data per cell group
+    logger.info("Creating pseudobulk samples...")
+    adata_pb = sc.get.aggregate(adata, by=pseudobulk_cols, func="sum", axis=0)
+    adata_pb.X = adata_pb.layers["sum"]
+    del adata_pb.layers["sum"]
+
+    adata_pb.obs_names = [
+        "_".join(values)
+        for values in zip(*[adata_pb.obs[col] for col in pseudobulk_cols])
+    ]
+
+    # Filter pseudobulk samples based on minimum observation count
+    logger.info("Filtering pseudobulk samples based on minimum observation count...")
+    adata_pb.obs[par["obs_cell_count"]] = count_obs(adata, adata_pb, pseudobulk_cols)
+    adata_pb = adata_pb[adata_pb.obs[par["obs_cell_count"]] > par["min_obs_per_sample"]]
+
+    logger.info(
+        f"Final dataset: {adata_pb.n_obs} pseudobulk samples, {adata_pb.n_vars} genes"
+    )
+
+    logger.info("Writing output data...")
+    mdata_pb = mu.MuData({"rna": adata_pb})
+    mdata_pb.write_h5mu(par["output"], compression=par["output_compression"])
+
+
+if __name__ == "__main__":
+    main()