openpipelines-bio · DriesSchaumont · Sep 2, 2025 · Jul 2, 2025 · Jul 3, 2025 · Jul 3, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,21 @@
 # openpipelines 3.x.x
 
+## BREAKING
+
+* `differential_expression/create_pseudobulk`: Removed the functionality to filter pseudobulk samples based on a minimum number of aggregated cells per sample, as this functionality is now covered by `filter/delimit_counts` (PR #1044).
+
 ## NEW FUNCTIONALITY
 
 * `filter/filter_with_pattern`: Filters a MuData object based on gene names using a regex pattern (PR #1070).
 
 * `filter/delimit_counts`: Turns an .obs column of a MuData file containing count data into a boolean column based on thresholds (PR #1069)
 
+## EXPERIMENTAL
+
+* `differential_expression/deseq2`: Performs differential expression analysis using DESeq2 on bulk or pseudobulk datasets (PR #1044).
+
+* `workflows/differential_expression/pseudobulk_deseq2`: Workflow for generating pseudobulk samples from single-cell data followed by DESeq2 differential expression analysis (PR #1044).
+
 # openpipelines 3.0.0
 
 ## BREAKING CHANGES

diff --git a/src/differential_expression/create_pseudobulk/config.vsh.yaml b/src/differential_expression/create_pseudobulk/config.vsh.yaml
@@ -1,5 +1,5 @@
-name: differential_expression
-namespace: "create_pseudobulk"
+name: create_pseudobulk
+namespace: differential_expression
 scope: "public"
 description: |
   Generation of pseudobulk samples from single-cell transcriptomics data,
@@ -54,11 +54,6 @@ argument_groups:
         choices: ["sum", "mean"]
         default: "sum"
         description: "Method to aggregate the raw counts for pseudoreplicates. Either sum or mean."
-      - name: "--min_obs_per_sample"
-        type: integer
-        min: 1
-        default: 30
-        description: "Minimum number of cells per pseudobulk sample."
       - name: --random_state
         type: integer
         description: |

diff --git a/src/differential_expression/create_pseudobulk/script.py b/src/differential_expression/create_pseudobulk/script.py
@@ -13,7 +13,6 @@
     "obs_label": "cell_type",
     "obs_groups": ["treatment", "donor_id", "disease"],
     "obs_cell_count": "n_cells",
-    "min_obs_per_sample": 5,
     "random_state": 0,
     "output": "test.h5mu",
     "output_compression": "gzip",
@@ -92,7 +91,6 @@ def main():
     # Filter pseudobulk samples based on minimum observation count
     logger.info("Filtering pseudobulk samples based on minimum observation count...")
     adata_pb.obs[par["obs_cell_count"]] = count_obs(adata, adata_pb, pseudobulk_cols)
-    adata_pb = adata_pb[adata_pb.obs[par["obs_cell_count"]] > par["min_obs_per_sample"]]
 
     logger.info(
         f"Final dataset: {adata_pb.n_obs} pseudobulk samples, {adata_pb.n_vars} genes"

diff --git a/src/differential_expression/create_pseudobulk/test.py b/src/differential_expression/create_pseudobulk/test.py
@@ -58,8 +58,6 @@ def test_multiple_factors(run_component, random_h5mu_path):
             "donor_id",
             "--obs_groups",
             "treatment",
-            "--min_obs_per_sample",
-            "5",
             "--output_compression",
             "gzip",
         ]
@@ -78,36 +76,5 @@ def test_multiple_factors(run_component, random_h5mu_path):
     )
 
 
-def test_filtering(run_component, random_h5mu_path):
-    output_path = random_h5mu_path()
-
-    run_component(
-        [
-            "--input",
-            input_path,
-            "--output",
-            output_path,
-            "--obs_label",
-            "cell_type",
-            "--obs_groups",
-            "treatment",
-            "--min_obs_per_sample",
-            "50",
-            "--output_compression",
-            "gzip",
-        ]
-    )
-
-    assert os.path.exists(output_path), "Output query file does not exist"
-    mdata = mu.read_h5mu(output_path)
-    adata = mdata.mod["rna"]
-
-    expected_obs = ["treatment", "cell_type", "n_cells"]
-    assert all(col in adata.obs for col in expected_obs), (
-        f"Expected columns {expected_obs} not found in .obs"
-    )
-    assert adata.shape[0] == 4, "Expected a total of 8 pseudobulk samples in the output"
-
-
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__]))
diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml
@@ -0,0 +1,136 @@
+name: deseq2
+namespace: differential_expression
+description: |
+  Performs differential expression analysis using DESeq2 on bulk samples or pseudobulk samples aggregated from single-cell data.
+  Note that this component only considers factors as explanatory variables, and excludes covariates from the analysis.
+
+authors:
+  - __merge__: /src/authors/jakub_majercik.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/dorien_roosen.yaml
+    roles: [ author ]
+  - __merge__: /src/authors/dries_de_maeyer.yaml
+    roles: [ contributor ]
+  - __merge__: /src/authors/weiwei_schultz.yaml
+    roles: [ contributor ]
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        alternatives: ["-i"]
+        type: file
+        description: Input h5mu file containing (pseudo-)bulk transcriptomic samples.
+        direction: input
+        required: true
+      - name: "--modality"
+        description: |
+          Which modality from the input MuData file to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        required: false
+        description: "Input layer to use. If None, X is used. This layer must contain raw counts."
+      - name: "--var_gene_names"
+        type: string
+        description: |
+          Name of the .var field that contains gene symbols. If not provided, .var.index will be used.
+        required: false
+        example: "gene_symbol"
+      - name: "--obs_cell_group"
+        type: string
+        description: |
+          .obs field containing the cell group information, for example per cell type or per cell cluster.
+          If provided, performs per-cell-group analysis with cell-group-specific results.
+
+  - name: Arguments
+    arguments:
+      - name: "--design_formula"
+        type: string
+        description: |
+          The formula should be a tilde (~) followed by the variables with plus signs between them. 
+          The design formula is used to estimate the dispersions and to estimate the log2 fold changes of the model.
+        required: true
+        example: "~ disease + treatment"
+      - name: "--contrast_column"
+        type: string
+        description: |
+          Column in the metadata to use for the contrast.
+          This column should contain the conditions to compare.
+        required: true
+        example: "treatment"
+      - name: "--contrast_values"
+        type: string
+        multiple: true
+        description: |
+          Values to compare in the contrast column.
+          First value is the control group, following values are comparison groups. 
+          Values must be present in the fields specified for the design formula.
+        required: true
+        example: ["ctrl", "stim"]
+      - name: "--p_adj_threshold"
+        type: double
+        min: 0
+        max: 1
+        description: |
+          Adjusted p-value threshold for significance, corrected with the Benjamini and Hochberg method.
+          Genes with adjusted p-values below this threshold will be considered significant.
+        default: 0.05
+        required: false
+      - name: "--log2fc_threshold"
+        type: double
+        min: 0
+        description: |
+          Log2 fold change threshold for significance.
+          Genes with absolute log2 fold change above this threshold will be considered significant.
+        default: 0.0
+        required: false
+
+  - name: Outputs
+    arguments:
+      - name: "--output_dir"
+        alternatives: ["-o"]
+        type: file
+        description: |
+          Output directory for DESeq2 results. 
+          If cell groups are defined (using `--obs_cell_group`), the output folder will contain one CSV per cell group, otherwise it will contain a single output file.
+        direction: output
+        required: true
+      - name: "--output_prefix"
+        type: string
+        description: |
+          Prefix for output CSV files. 
+          If no cell groups are specified, the output file will be named "{prefix}.csv".
+          If cell groups are specified, files will be named "{prefix}_{cell_group}.csv".
+        default: "deseq2_analysis"
+        required: false
+
+resources:
+  - type: r_script
+    path: script.R
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/TS_Blood_filtered_pseudobulk.h5mu
+
+engines:
+- type: docker
+  image: rocker/r2u:22.04
+  setup:
+    - type: apt
+      packages: [ libhdf5-dev, libgeos-dev, hdf5-tools ]
+    - type: r
+      cran: [ hdf5r ]
+      # Use pinned version to avoid H5 file access issues with latest stable release (0.99.0)
+      github: scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a
+      bioc: [ DESeq2 ]
+  test_setup:
+    - type: apt
+      packages: [ python3, python3-pip, python3-dev, python-is-python3 ]
+  __merge__: [ ., /src/base/requirements/python_test_setup.yaml]
+
+runners:
+- type: executable
+- type: nextflow