From 99f8f70eeddf4be43e3797f29b69a78b23bbae29 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Wed, 2 Jul 2025 09:53:43 +0200 Subject: [PATCH 01/52] WIP create pseudobulk samples --- src/dgea/create_pseudobulk_samples/script.py | 112 +++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 src/dgea/create_pseudobulk_samples/script.py diff --git a/src/dgea/create_pseudobulk_samples/script.py b/src/dgea/create_pseudobulk_samples/script.py new file mode 100644 index 00000000000..17913e0a876 --- /dev/null +++ b/src/dgea/create_pseudobulk_samples/script.py @@ -0,0 +1,112 @@ +import scanpy as sc +import mudata as mu +import random +import numpy as np + +## VIASH START +par = { + "input": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", + "mod": "rna", + "min_cells": 100, + "target_cluster_id": "donor_id", + "grouping_column": "cell_type", # Add this parameter + "min_cells_per_sample": 50, # Add this parameter + "n_replicates": 1, # Add this parameter +} + +meta = {"resources_dir": "src/utils"} +### VIASH END + +import sys +sys.path.append(meta["resources_dir"]) + +from setup_logger import setup_logger + +logger = setup_logger() + +mdata = mu.read_h5mu(par["input"]) +mod = mdata.mod[par["mod"]] + +sc.pp.filter_genes( + mod, + min_cells=par["min_cells"], +) + +mod.obs_names_make_unique() +mod.var_names_make_unique() + +mod.obs[par["target_cluster_id"]] = mod.obs[par["target_cluster_id"]].cat.remove_unused_categories() + +mod_dge = mod.copy() + +pbs = {} +for cluster in mod_dge.obs[par["target_cluster_id"]].cat.categories: + logger.info(f"Processing cluster {cluster}") + + pbs[cluster] = [] + + # Get cells from this cluster + cluster_cells = mod_dge[mod_dge.obs[par["target_cluster_id"]] == cluster] + + grouping_values = cluster_cells.obs[par["grouping_column"]].unique() + + # Process each group within this cluster + for group in grouping_values: + group_cells = cluster_cells[cluster_cells.obs[par["grouping_column"]] == group] + + if len(group_cells) < par["min_cells_per_sample"]: + logger.warning(f"Skipping {cluster}-{group}: only {len(group_cells)} cells (minimum: {par['min_cells_per_sample']})") + continue + + # Create pseudoreplicates + indices = list(group_cells.obs_names) + random.shuffle(indices) + indices_split = np.array_split(np.array(indices), par["n_replicates"]) + + for i, pseudo_rep_indices in enumerate(indices_split): + if len(pseudo_rep_indices) >= par["min_cells_per_sample"]: + # Sum expression across cells to create pseudobulk sample + pseudobulk_cells = group_cells[pseudo_rep_indices] + + rep_adata = sc.AnnData( + X=pseudobulk_cells.X.sum(axis=0), + var=pseudobulk_cells.var[[]] # Empty var dataframe with same index + ) + + # Add metadata + rep_adata.obs_names = [f'{group}_{cluster}_rep{i}'] + rep_adata.obs['replicate'] = i + rep_adata.obs['group'] = str(group) + rep_adata.obs['cluster'] = str(cluster) + rep_adata.obs['n_cells'] = len(pseudo_rep_indices) + + # Add original metadata if available + first_cell_metadata = pseudobulk_cells.obs.iloc[0] + for col in pseudobulk_cells.obs.columns: + if col not in ['replicate', 'group', 'cluster', 'n_cells']: + rep_adata.obs[col] = str(first_cell_metadata[col]) + + pbs[cluster].append(rep_adata) + logger.info(f"Created pseudobulk sample: {group}_{cluster}_rep{i} with {len(pseudo_rep_indices)} cells") + +# Concatenate all pseudobulk samples +logger.info("Concatenating pseudobulk samples...") +all_pseudobulk_samples = [] +for cluster, samples in pbs.items(): + if samples: # Only add if there are samples + all_pseudobulk_samples.extend(samples) + +if all_pseudobulk_samples: + adata_pseudo = sc.concat(all_pseudobulk_samples) + logger.info(f"Created {len(all_pseudobulk_samples)} pseudobulk samples with shape: {adata_pseudo.shape}") + + # Convert object columns to string for compatibility + for col in adata_pseudo.obs.columns: + if adata_pseudo.obs[col].dtype == 'object': + adata_pseudo.obs[col] = adata_pseudo.obs[col].astype(str) + + logger.info("Pseudobulk generation completed successfully") +else: + logger.error("No pseudobulk samples were generated!") + +print("here") \ No newline at end of file From 8f0078a63a820a429fbe6a8bfdd5e087820536d3 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 3 Jul 2025 10:35:31 +0200 Subject: [PATCH 02/52] add deseq2 component script --- src/dgea/deseq2/script.py | 155 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 src/dgea/deseq2/script.py diff --git a/src/dgea/deseq2/script.py b/src/dgea/deseq2/script.py new file mode 100644 index 00000000000..dd7add314ca --- /dev/null +++ b/src/dgea/deseq2/script.py @@ -0,0 +1,155 @@ +import scanpy as sc +import mudata as mu +import pandas as pd +import numpy as np +from pydeseq2.dds import DeseqDataSet +from pydeseq2.ds import DeseqStats + +## VIASH START +par = { + "input": "pseudobulk_samples.h5mu", + "output": "deseq2_results.csv", + "design_factors": ["disease", "treatment"], + "contrast_column": "treatment", + "contrast_baseline": "ctrl", + "contrast_treatment": "stim", + "filter_genes_min_samples": None, + "padj_threshold": 0.05, + "log2fc_threshold": 0.0, + "filter_gene_patterns": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], + "var_gene_name" : "feature_name", +} + +meta = {"resources_dir": "src/utils"} +### VIASH END + +import sys +sys.path.append(meta["resources_dir"]) + +from setup_logger import setup_logger + +logger = setup_logger() + + +# Load pseudobulk data +logger.info(f"Loading pseudobulk data from {par['input']}") +mdata = mu.read_h5mu(par["input"]) +mod = mdata.mod["rna"] + +logger.info(f"Input pseudobulk data shape: {mod.shape}") +logger.info(f"Available metadata columns: {list(mod.obs.columns)}") + +# Validate required columns exist +required_columns = par["design_factors"] + [par["contrast_column"]] +missing_columns = [col for col in required_columns if col not in mod.obs.columns] +if missing_columns: + raise ValueError(f"Missing required columns in metadata: {missing_columns}") + +# Prepare counts matrix +logger.info("Preparing counts matrix for DESeq2") +counts = pd.DataFrame( + mod.X, + columns=mod.var_names, + index=mod.obs_names +) + +# Ensure counts are integers (required for DESeq2) +counts = counts.astype(int) + +metadata = mod.obs.copy() + +logger.info(f"Counts matrix shape: {counts.shape}") +logger.info(f"Design factors: {par['design_factors']}") + +# Check contrast values exist +contrast_values = metadata[par["contrast_column"]].unique() +logger.info(f"Available values in {par['contrast_column']}: {contrast_values}") + +if par["contrast_baseline"] not in contrast_values: + raise ValueError(f"Baseline '{par['contrast_baseline']}' not found in {par['contrast_column']}") +if par["contrast_treatment"] not in contrast_values: + raise ValueError(f"Treatment '{par['contrast_treatment']}' not found in {par['contrast_column']}") + +# Create DESeq2 dataset +logger.info("Creating DESeq2 dataset") +try: + dds = DeseqDataSet( + counts=counts, + metadata=metadata, + design_factors=par["design_factors"], + ) + + # Optional gene filtering + if par["filter_genes_min_samples"] is not None: + logger.info(f"Filtering genes expressed in at least {par['filter_genes_min_samples']} samples") + sc.pp.filter_genes(dds, min_cells=par["filter_genes_min_samples"]) + else: + # Default: filter genes expressed in all samples + sc.pp.filter_genes(dds, min_cells=len(mod.obs)) + + logger.info(f"After filtering: {dds.shape}") + + # Run DESeq2 analysis + logger.info("Running DESeq2 analysis...") + dds.deseq2() + + # Perform statistical test + logger.info(f"Performing contrast: {par['contrast_column']} {par['contrast_treatment']} vs {par['contrast_baseline']}") + stat_res = DeseqStats( + dds, + contrast=(par["contrast_column"], par["contrast_treatment"], par["contrast_baseline"]) + ) + stat_res.summary() + + # Get results + results = stat_res.results_df.reset_index() + logger.info(f"DESeq2 analysis completed. Found {len(results)} genes.") + + # Filter results based on significance + significant_genes = results[ + (results['padj'] < par["padj_threshold"]) & + (results['log2FoldChange'].abs() > par["log2fc_threshold"]) + ] + logger.info(f"Significant genes (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes)}") + + # Filter out unwanted gene patterns if specified + if par["filter_gene_patterns"]: + pattern_string = "|".join(par["filter_gene_patterns"]) + before_filter = len(results) + results_filtered = results[~results[par["var_gene_name"]].str.contains(pattern_string, na=False)] + logger.info(f"Filtered out genes matching patterns {par['filter_gene_patterns']}: {before_filter} -> {len(results_filtered)}") + results = results_filtered + + # Sort by log2FoldChange + results = results.sort_values('log2FoldChange', ascending=False) + + # Add additional statistics + results['abs_log2FoldChange'] = results['log2FoldChange'].abs() + results['significant'] = ( + (results['padj'] < par["padj_threshold"]) & + (results['log2FoldChange'].abs() > par["log2fc_threshold"]) + ) + + # Save results + logger.info(f"Saving results to {par['output']}") + results.to_csv(par["output"], index=False) + + # Log summary statistics + upregulated = results[(results['log2FoldChange'] > 0) & results['significant']] + downregulated = results[(results['log2FoldChange'] < 0) & results['significant']] + + logger.info(f"Summary:") + logger.info(f" Total genes analyzed: {len(results)}") + logger.info(f" Significant upregulated: {len(upregulated)}") + logger.info(f" Significant downregulated: {len(downregulated)}") + logger.info(f" Top upregulated gene: {upregulated.iloc[0][par['var_gene_name']] if len(upregulated) > 0 else 'None'}") + logger.info(f" Top downregulated gene: {downregulated.iloc[-1][par['var_gene_name']] if len(downregulated) > 0 else 'None'}") + +except Exception as e: + logger.error(f"DESeq2 analysis failed: {str(e)}") + logger.error(f"Analysis context: {par['contrast_column']} contrast '{par['contrast_treatment']}' vs '{par['contrast_baseline']}', {len(mod.obs)} samples, {len(mod.var)} genes") + logger.warning("Check your input data, design factors, and contrast specifications") + + raise e + +logger.info("DESeq2 analysis completed successfully") \ No newline at end of file From e07bb99d3fa867a82386719ffdc5af043db98849 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 3 Jul 2025 10:54:54 +0200 Subject: [PATCH 03/52] add deseq2 config --- src/dgea/deseq2/config.vsh.yaml | 139 ++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 src/dgea/deseq2/config.vsh.yaml diff --git a/src/dgea/deseq2/config.vsh.yaml b/src/dgea/deseq2/config.vsh.yaml new file mode 100644 index 00000000000..de3f79277d9 --- /dev/null +++ b/src/dgea/deseq2/config.vsh.yaml @@ -0,0 +1,139 @@ +name: deseq2 +namespace: dgea +description: | + Perform differential expression analysis using DESeq2 on pseudobulk samples aggregated from single-cell data. + This component aggregates single-cell data into pseudobulk samples based on specified design factors, + and then performs differential expression analysis using DESeq2. + +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ contributor ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file. + direction: input + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. If None, X is used. This layer must contain raw counts." + + - name: Arguments + arguments: + - name: "--design_factors" + type: string + multiple: true + description: | + List of design factors to use for pseudobulk aggregation. + These factors will be used to group cells into pseudobulk samples. + required: true + example: ["disease", "treatment"] + - name: "--contrast_column" + type: string + description: | + Column in the metadata to use for the contrast. + This column should contain the conditions to compare. + required: true + example: "treatment" + - name: "--contrast_baseline" + type: string + description: | + Baseline condition for the contrast. + This is the condition against which the other conditions will be compared. + required: true + example: "ctrl" + - name: "--contrast_treatment" + type: string + description: | + Treatment condition for the contrast. + This is the condition that will be compared against the baseline. + required: true + example: "stim" + - name: "--filter_genes_min_samples" + type: integer + description: | + Minimum number of samples a gene must be expressed in to be included in the analysis. + If None, no filtering is applied. + required: false + - name: "--padj_threshold" + type: double + description: | + Adjusted p-value threshold for significance. + Genes with adjusted p-values below this threshold will be considered significant. + default: 0.05 + required: false + - name: "--log2fc_threshold" + type: double + description: | + Log2 fold change threshold for significance. + Genes with absolute log2 fold change above this threshold will be considered significant. + default: 0.0 + required: false + - name: "--filter_gene_patterns" + type: string + multiple: true + description: | + List of regex patterns to filter out genes. + Genes matching any of these patterns will be excluded from the analysis. + required: false + example: ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"] + - name: "--var_gene_name" + type: string + description: | + Column name in the variable features table that contains gene symbols. + This is used for reporting gene names in the output. + default: "feature_name" + required: false + + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: Output CSV file with DESeq2 results. + direction: output + required: true + + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +# test_resources: +# - type: python_script +# path: test.py +# - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + +engines: +- type: docker + image: python:3.12 + setup: + - type: python + packages: + - pydeseq2~=0.5.2 + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml] + __merge__: [/src/base/requirements/scanpy.yaml] + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + + +runners: +- type: executable +- type: nextflow \ No newline at end of file From c1f48c184fca7eccf81db5607c501b2426615a6b Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 3 Jul 2025 10:55:04 +0200 Subject: [PATCH 04/52] fix script typo --- src/dgea/deseq2/script.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/dgea/deseq2/script.py b/src/dgea/deseq2/script.py index dd7add314ca..9f12448231b 100644 --- a/src/dgea/deseq2/script.py +++ b/src/dgea/deseq2/script.py @@ -19,9 +19,7 @@ "filter_gene_patterns": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], "var_gene_name" : "feature_name", } - -meta = {"resources_dir": "src/utils"} -### VIASH END +## VIASH END import sys sys.path.append(meta["resources_dir"]) From 2c2c946936ffb7e9fe93d53c840b3a3dfa538b47 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 3 Jul 2025 11:01:30 +0200 Subject: [PATCH 05/52] add input layer arg --- src/dgea/deseq2/script.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/dgea/deseq2/script.py b/src/dgea/deseq2/script.py index 9f12448231b..d09acc07d74 100644 --- a/src/dgea/deseq2/script.py +++ b/src/dgea/deseq2/script.py @@ -2,6 +2,7 @@ import mudata as mu import pandas as pd import numpy as np +import scipy.sparse as sp from pydeseq2.dds import DeseqDataSet from pydeseq2.ds import DeseqStats @@ -9,6 +10,7 @@ par = { "input": "pseudobulk_samples.h5mu", "output": "deseq2_results.csv", + "input_layer": None, "design_factors": ["disease", "treatment"], "contrast_column": "treatment", "contrast_baseline": "ctrl", @@ -21,6 +23,15 @@ } ## VIASH END +def is_normalized(layer): + if sp.issparse(layer): + row_sums = np.array(layer.sum(axis=1)).flatten() + else: + row_sums = layer.sum(axis=1) + + return np.allclose(row_sums, 1) + + import sys sys.path.append(meta["resources_dir"]) @@ -45,6 +56,15 @@ # Prepare counts matrix logger.info("Preparing counts matrix for DESeq2") + + +# Use specified layer if provided +if par["input_layer"]: + mod.X = mod.layers[par["input_layer"]] + +if is_normalized(mod.X): + raise ValueError("Input layer must contain raw counts.") + counts = pd.DataFrame( mod.X, columns=mod.var_names, From 631231638be981d571fb2c2be6a6854582aef70c Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Wed, 16 Jul 2025 13:25:18 +0200 Subject: [PATCH 06/52] add design formula --- src/dgea/deseq2/config.vsh.yaml | 35 ++++++++-------------- src/dgea/deseq2/script.py | 53 ++++++++++++++++++++------------- 2 files changed, 46 insertions(+), 42 deletions(-) diff --git a/src/dgea/deseq2/config.vsh.yaml b/src/dgea/deseq2/config.vsh.yaml index de3f79277d9..0d2e0ae7cf1 100644 --- a/src/dgea/deseq2/config.vsh.yaml +++ b/src/dgea/deseq2/config.vsh.yaml @@ -37,14 +37,13 @@ argument_groups: - name: Arguments arguments: - - name: "--design_factors" + - name: "--design_formula" type: string - multiple: true description: | - List of design factors to use for pseudobulk aggregation. - These factors will be used to group cells into pseudobulk samples. + Design formula for DESeq2 analysis in R-style format. + Specifies the statistical model to account for various factors. required: true - example: ["disease", "treatment"] + example: "~ cell_type + disease + treatment" - name: "--contrast_column" type: string description: | @@ -52,20 +51,14 @@ argument_groups: This column should contain the conditions to compare. required: true example: "treatment" - - name: "--contrast_baseline" - type: string - description: | - Baseline condition for the contrast. - This is the condition against which the other conditions will be compared. - required: true - example: "ctrl" - - name: "--contrast_treatment" + - name: "--contrast_values" type: string + multiple: true description: | - Treatment condition for the contrast. - This is the condition that will be compared against the baseline. + Values to compare in the contrast column. + First value is treatment, second is baseline (e.g., ["stim", "ctrl"]). required: true - example: "stim" + example: ["stim", "ctrl"] - name: "--filter_genes_min_samples" type: integer description: | @@ -111,15 +104,14 @@ argument_groups: direction: output required: true - resources: - type: python_script path: script.py - path: /src/utils/setup_logger.py -# test_resources: -# - type: python_script -# path: test.py -# - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/TS_Blood_filtered_annotated_pseudobulk.h5mu engines: - type: docker @@ -133,7 +125,6 @@ engines: __merge__: [/src/base/requirements/scanpy.yaml] __merge__: [ /src/base/requirements/python_test_setup.yaml, .] - runners: - type: executable - type: nextflow \ No newline at end of file diff --git a/src/dgea/deseq2/script.py b/src/dgea/deseq2/script.py index d09acc07d74..fc0d9e269c6 100644 --- a/src/dgea/deseq2/script.py +++ b/src/dgea/deseq2/script.py @@ -5,22 +5,24 @@ import scipy.sparse as sp from pydeseq2.dds import DeseqDataSet from pydeseq2.ds import DeseqStats +import re ## VIASH START par = { - "input": "pseudobulk_samples.h5mu", + "input": "resources_test/annotation_test_data/TS_Blood_filtered_annotated_pseudobulk.h5mu", "output": "deseq2_results.csv", "input_layer": None, - "design_factors": ["disease", "treatment"], + "design_formula": "~ cell_type + disease + treatment", "contrast_column": "treatment", - "contrast_baseline": "ctrl", - "contrast_treatment": "stim", + "contrast_values": ["stim", "ctrl"], # [treatment, baseline] or [group1, group2, group3, ...] "filter_genes_min_samples": None, "padj_threshold": 0.05, "log2fc_threshold": 0.0, "filter_gene_patterns": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], "var_gene_name" : "feature_name", } +meta = {"resources_dir": "src/utils"} + ## VIASH END def is_normalized(layer): @@ -46,10 +48,16 @@ def is_normalized(layer): mod = mdata.mod["rna"] logger.info(f"Input pseudobulk data shape: {mod.shape}") -logger.info(f"Available metadata columns: {list(mod.obs.columns)}") +logger.info(f"Available metadata columns: {list(mod.obs.columns)}") + +# Parse design formula to extract factors +design_factors = re.findall(r'\b(?!~)\w+', par["design_formula"]) +logger.info(f"Design formula: {par['design_formula']}") +logger.info(f"Extracted factors: {design_factors}") # Validate required columns exist -required_columns = par["design_factors"] + [par["contrast_column"]] +contrast_column = par["contrast_column"] +required_columns = design_factors + [contrast_column] if contrast_column not in design_factors else design_factors missing_columns = [col for col in required_columns if col not in mod.obs.columns] if missing_columns: raise ValueError(f"Missing required columns in metadata: {missing_columns}") @@ -57,7 +65,6 @@ def is_normalized(layer): # Prepare counts matrix logger.info("Preparing counts matrix for DESeq2") - # Use specified layer if provided if par["input_layer"]: mod.X = mod.layers[par["input_layer"]] @@ -77,16 +84,25 @@ def is_normalized(layer): metadata = mod.obs.copy() logger.info(f"Counts matrix shape: {counts.shape}") -logger.info(f"Design factors: {par['design_factors']}") +logger.info(f"Design formula: {par['design_formula']}") # Check contrast values exist -contrast_values = metadata[par["contrast_column"]].unique() -logger.info(f"Available values in {par['contrast_column']}: {contrast_values}") +contrast_column = par["contrast_column"] +contrast_values = par["contrast_values"] +available_values = metadata[contrast_column].unique() +logger.info(f"Available values in {contrast_column}: {available_values}") + +# Validate all contrast values exist +missing_values = [val for val in contrast_values if val not in available_values] +if missing_values: + raise ValueError(f"Contrast values {missing_values} not found in {contrast_column}") + +if len(contrast_values) < 2: + raise ValueError(f"Need at least 2 values for contrast, got: {contrast_values}") -if par["contrast_baseline"] not in contrast_values: - raise ValueError(f"Baseline '{par['contrast_baseline']}' not found in {par['contrast_column']}") -if par["contrast_treatment"] not in contrast_values: - raise ValueError(f"Treatment '{par['contrast_treatment']}' not found in {par['contrast_column']}") +treatment, baseline = contrast_values +contrast_tuple = (contrast_column, treatment, baseline) +logger.info(f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}") # Create DESeq2 dataset logger.info("Creating DESeq2 dataset") @@ -94,7 +110,7 @@ def is_normalized(layer): dds = DeseqDataSet( counts=counts, metadata=metadata, - design_factors=par["design_factors"], + design=par["design_formula"], ) # Optional gene filtering @@ -112,10 +128,9 @@ def is_normalized(layer): dds.deseq2() # Perform statistical test - logger.info(f"Performing contrast: {par['contrast_column']} {par['contrast_treatment']} vs {par['contrast_baseline']}") stat_res = DeseqStats( dds, - contrast=(par["contrast_column"], par["contrast_treatment"], par["contrast_baseline"]) + contrast=contrast_tuple ) stat_res.summary() @@ -160,12 +175,10 @@ def is_normalized(layer): logger.info(f" Total genes analyzed: {len(results)}") logger.info(f" Significant upregulated: {len(upregulated)}") logger.info(f" Significant downregulated: {len(downregulated)}") - logger.info(f" Top upregulated gene: {upregulated.iloc[0][par['var_gene_name']] if len(upregulated) > 0 else 'None'}") - logger.info(f" Top downregulated gene: {downregulated.iloc[-1][par['var_gene_name']] if len(downregulated) > 0 else 'None'}") except Exception as e: logger.error(f"DESeq2 analysis failed: {str(e)}") - logger.error(f"Analysis context: {par['contrast_column']} contrast '{par['contrast_treatment']}' vs '{par['contrast_baseline']}', {len(mod.obs)} samples, {len(mod.var)} genes") + logger.error(f"Analysis context: {contrast_column} contrast {contrast_values}, {len(mod.obs)} samples, {len(mod.var)} genes") logger.warning("Check your input data, design factors, and contrast specifications") raise e From e03545735ccb8bce11ea3eebc6aff14cf66c9f78 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Wed, 16 Jul 2025 13:39:19 +0200 Subject: [PATCH 07/52] add unit tests --- src/dgea/deseq2/test.py | 116 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 src/dgea/deseq2/test.py diff --git a/src/dgea/deseq2/test.py b/src/dgea/deseq2/test.py new file mode 100644 index 00000000000..752324c07f9 --- /dev/null +++ b/src/dgea/deseq2/test.py @@ -0,0 +1,116 @@ +import os +import subprocess +import mudata as mu +import sys +import pytest +import pandas as pd +import re + + +## VIASH START +meta = { + "resources_dir": "resources_test/" +} +## VIASH END + +sys.path.append(meta["resources_dir"]) + + +@pytest.fixture +def pseudobulk_test_data_path(): + """Path to the pseudobulk test data""" + return f"{meta['resources_dir']}/TS_Blood_filtered_annotated_pseudobulk.h5mu" + + +def test_simple_deseq2_execution(run_component, tmp_path, pseudobulk_test_data_path): + """Test basic DESeq2 execution with minimal parameters""" + output_path = tmp_path / "deseq2_results.csv" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output", + str(output_path), + "--design_formula", + "~ treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_path.exists(), "Output CSV file does not exist" + + # Check the output file + results = pd.read_csv(output_path) + expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] + assert all(col in results.columns for col in expected_columns), f"Expected columns {expected_columns} not found" + assert len(results) > 0, "No results found in output" + + +def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_path): + """Test DESeq2 with complex design formula accounting for multiple factors""" + output_path = tmp_path / "deseq2_complex_results.csv" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output", + str(output_path), + "--design_formula", + "~ cell_type + disease + treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + "--padj_threshold", + "0.1", + "--log2fc_threshold", + "0.5", + ] + ) + + assert output_path.exists(), "Output CSV file does not exist" + + results = pd.read_csv(output_path) + assert "significant" in results.columns, "Significance column not found" + assert len(results) > 0, "No results found for complex design" + + +def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_path): + """Test that invalid contrast column raises appropriate error""" + output_path = tmp_path / "deseq2_invalid_results.csv" + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output", + str(output_path), + "--design_formula", + "~ treatment", + "--contrast_column", + "nonexistent_column", + "--contrast_values", + "group1", + "--contrast_values", + "group2", + ] + ) + + assert re.search( + r"Missing required columns in metadata: \['nonexistent_column'\]", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) \ No newline at end of file From a1dc501db91fa90329a07b7ad1a2db0728f504aa Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Wed, 16 Jul 2025 13:41:17 +0200 Subject: [PATCH 08/52] add changelog entry --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf4d09fb93f..f87ec3001a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,8 @@ * (Experimental) Added `from_h5mu_or_h5ad_to_tiledb` component. Warning: the functionality in this component is experimental and its behavior may change in future releases (PR #1034). +* Add `dgea/deseq2` Deseq2 component for differential gene expression analysis (PR #1044). + ## MAJOR CHANGES * `mapping/cellranger_*`: Upgrade CellRanger to v9.0 (PR #992 and #1006). From af0b2f933b4396a92a23e35fd0701cd92c3a77c4 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 17 Jul 2025 16:58:56 +0200 Subject: [PATCH 09/52] add per cell type option --- src/dgea/deseq2/config.vsh.yaml | 13 +++ src/dgea/deseq2/script.py | 156 +++++++++++++++++++++++--------- 2 files changed, 127 insertions(+), 42 deletions(-) diff --git a/src/dgea/deseq2/config.vsh.yaml b/src/dgea/deseq2/config.vsh.yaml index 0d2e0ae7cf1..abffe89216b 100644 --- a/src/dgea/deseq2/config.vsh.yaml +++ b/src/dgea/deseq2/config.vsh.yaml @@ -94,6 +94,19 @@ argument_groups: This is used for reporting gene names in the output. default: "feature_name" required: false + - name: "--per_cell_type" + type: boolean_true + description: | + Whether to run DESeq2 analysis separately for each cell type. + If true, performs per-cell-type analysis with cell-type-specific results. + If false (default), runs multi-factor analysis accounting for cell type as a covariate. + - name: "--cell_type_column" + type: string + description: | + Column name in .obs that contains cell type information. + Only used when per_cell_type is true. + default: "cell_type" + required: false - name: Outputs arguments: diff --git a/src/dgea/deseq2/script.py b/src/dgea/deseq2/script.py index fc0d9e269c6..8cfb066f338 100644 --- a/src/dgea/deseq2/script.py +++ b/src/dgea/deseq2/script.py @@ -20,6 +20,8 @@ "log2fc_threshold": 0.0, "filter_gene_patterns": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], "var_gene_name" : "feature_name", + "per_cell_type": False, # New parameter + "cell_type_column": "cell_type", # Which column contains cell types } meta = {"resources_dir": "src/utils"} @@ -83,7 +85,19 @@ def is_normalized(layer): metadata = mod.obs.copy() -logger.info(f"Counts matrix shape: {counts.shape}") +# Filter out unwanted gene patterns if specified (before DESeq2 analysis) +if par["filter_gene_patterns"]: + pattern_string = "|".join(par["filter_gene_patterns"]) + before_filter = counts.shape[1] + + # Filter genes based on column names (gene names) + genes_to_keep = ~pd.Series(counts.columns).str.contains(pattern_string, na=False) + counts_filtered = counts.loc[:, genes_to_keep] + + logger.info(f"Filtered out genes matching patterns {par['filter_gene_patterns']}: {before_filter} -> {counts_filtered.shape[1]}") + counts = counts_filtered + +logger.info(f"Counts matrix shape after gene pattern filtering: {counts.shape}") logger.info(f"Design formula: {par['design_formula']}") # Check contrast values exist @@ -118,50 +132,108 @@ def is_normalized(layer): logger.info(f"Filtering genes expressed in at least {par['filter_genes_min_samples']} samples") sc.pp.filter_genes(dds, min_cells=par["filter_genes_min_samples"]) else: - # Default: filter genes expressed in all samples - sc.pp.filter_genes(dds, min_cells=len(mod.obs)) + # Default: only filter genes with zero counts across all samples + logger.info("Default filtering: removing genes with zero counts across all samples") + sc.pp.filter_genes(dds, min_cells=1) logger.info(f"After filtering: {dds.shape}") - # Run DESeq2 analysis - logger.info("Running DESeq2 analysis...") - dds.deseq2() - - # Perform statistical test - stat_res = DeseqStats( - dds, - contrast=contrast_tuple - ) - stat_res.summary() - - # Get results - results = stat_res.results_df.reset_index() - logger.info(f"DESeq2 analysis completed. Found {len(results)} genes.") - - # Filter results based on significance - significant_genes = results[ - (results['padj'] < par["padj_threshold"]) & - (results['log2FoldChange'].abs() > par["log2fc_threshold"]) - ] - logger.info(f"Significant genes (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes)}") - - # Filter out unwanted gene patterns if specified - if par["filter_gene_patterns"]: - pattern_string = "|".join(par["filter_gene_patterns"]) - before_filter = len(results) - results_filtered = results[~results[par["var_gene_name"]].str.contains(pattern_string, na=False)] - logger.info(f"Filtered out genes matching patterns {par['filter_gene_patterns']}: {before_filter} -> {len(results_filtered)}") - results = results_filtered - - # Sort by log2FoldChange - results = results.sort_values('log2FoldChange', ascending=False) - - # Add additional statistics - results['abs_log2FoldChange'] = results['log2FoldChange'].abs() - results['significant'] = ( - (results['padj'] < par["padj_threshold"]) & - (results['log2FoldChange'].abs() > par["log2fc_threshold"]) - ) + # Check if running per cell type or multi-factor + if par["per_cell_type"]: + logger.info("Running DESeq2 analysis per cell type") + + # Remove cell_type from design formula for per-cell-type analysis + design_no_celltype = par["design_formula"].replace(f"+ {par['cell_type_column']}", "").replace(f"{par['cell_type_column']} +", "") + + all_results = [] + for cell_type in metadata[par["cell_type_column"]].unique(): + logger.info(f"Analyzing cell type: {cell_type}") + + # Subset data for this cell type + cell_mask = metadata[par["cell_type_column"]] == cell_type + counts_subset = counts.loc[cell_mask, :] + metadata_subset = metadata.loc[cell_mask, :] + + # Run DESeq2 for this cell type + dds_cell = DeseqDataSet( + counts=counts_subset, + metadata=metadata_subset, + design=design_no_celltype, + ) + + # Run DESeq2 analysis + logger.info("Running DESeq2 analysis...") + dds_cell.deseq2() + + # Perform statistical test + stat_res_cell = DeseqStats( + dds_cell, + contrast=contrast_tuple + ) + stat_res_cell.summary() + + # Get results + cell_results = stat_res_cell.results_df.reset_index() + logger.info(f"DESeq2 analysis for cell type {cell_type} completed. Found {len(cell_results)} genes.") + + # Filter results based on significance + significant_genes_cell = cell_results[ + (cell_results['padj'] < par["padj_threshold"]) & + (cell_results['log2FoldChange'].abs() > par["log2fc_threshold"]) + ] + logger.info(f"Significant genes for cell type {cell_type} (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes_cell)}") + + # Sort by log2FoldChange + cell_results = cell_results.sort_values('log2FoldChange', ascending=False) + + # Add additional statistics + cell_results['abs_log2FoldChange'] = cell_results['log2FoldChange'].abs() + cell_results['significant'] = ( + (cell_results['padj'] < par["padj_threshold"]) & + (cell_results['log2FoldChange'].abs() > par["log2fc_threshold"]) + ) + + # Add cell_type column to results + cell_results['cell_type'] = cell_type + all_results.append(cell_results) + + # Combine all results + results = pd.concat(all_results, ignore_index=True) + + else: + # Current multi-factor approach + logger.info("Running multi-factor DESeq2 analysis") + # Run DESeq2 analysis + logger.info("Running DESeq2 analysis...") + dds.deseq2() + + # Perform statistical test + stat_res = DeseqStats( + dds, + contrast=contrast_tuple + ) + stat_res.summary() + + # Get results + results = stat_res.results_df.reset_index() + logger.info(f"DESeq2 analysis completed. Found {len(results)} genes.") + + # Filter results based on significance + significant_genes = results[ + (results['padj'] < par["padj_threshold"]) & + (results['log2FoldChange'].abs() > par["log2fc_threshold"]) + ] + logger.info(f"Significant genes (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes)}") + + # Sort by log2FoldChange + results = results.sort_values('log2FoldChange', ascending=False) + + # Add additional statistics + results['abs_log2FoldChange'] = results['log2FoldChange'].abs() + results['significant'] = ( + (results['padj'] < par["padj_threshold"]) & + (results['log2FoldChange'].abs() > par["log2fc_threshold"]) + ) # Save results logger.info(f"Saving results to {par['output']}") From da6eb529019c460167f9a273e1f1daede74b7ccb Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 17 Jul 2025 17:53:13 +0200 Subject: [PATCH 10/52] fix test resources --- src/dgea/deseq2/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dgea/deseq2/config.vsh.yaml b/src/dgea/deseq2/config.vsh.yaml index abffe89216b..22a62c46347 100644 --- a/src/dgea/deseq2/config.vsh.yaml +++ b/src/dgea/deseq2/config.vsh.yaml @@ -124,7 +124,7 @@ resources: test_resources: - type: python_script path: test.py - - path: /resources_test/annotation_test_data/TS_Blood_filtered_annotated_pseudobulk.h5mu + - path: /resources_test/annotation_test_data/TS_Blood_filtered_pseudobulk.h5mu engines: - type: docker From b6919b79591778f498fec5b19197d863ec1a9146 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Tue, 22 Jul 2025 14:06:08 +0200 Subject: [PATCH 11/52] pin zarr version <3.0.0 --- src/dgea/deseq2/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dgea/deseq2/config.vsh.yaml b/src/dgea/deseq2/config.vsh.yaml index 22a62c46347..350030e9273 100644 --- a/src/dgea/deseq2/config.vsh.yaml +++ b/src/dgea/deseq2/config.vsh.yaml @@ -133,6 +133,7 @@ engines: - type: python packages: - pydeseq2~=0.5.2 + - zarr<3.0.0 - type: python __merge__: [/src/base/requirements/anndata_mudata.yaml] __merge__: [/src/base/requirements/scanpy.yaml] From 4c82a7952a2a5872c50bff8a03b41175c5094871 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Tue, 22 Jul 2025 16:48:46 +0200 Subject: [PATCH 12/52] fix tests --- src/dgea/deseq2/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dgea/deseq2/test.py b/src/dgea/deseq2/test.py index 752324c07f9..66d2974c45b 100644 --- a/src/dgea/deseq2/test.py +++ b/src/dgea/deseq2/test.py @@ -19,7 +19,7 @@ @pytest.fixture def pseudobulk_test_data_path(): """Path to the pseudobulk test data""" - return f"{meta['resources_dir']}/TS_Blood_filtered_annotated_pseudobulk.h5mu" + return f"{meta['resources_dir']}/TS_Blood_filtered_pseudobulk.h5mu" def test_simple_deseq2_execution(run_component, tmp_path, pseudobulk_test_data_path): From 6b15c8d5cff8a324bcbefb456577591e64e00ee9 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 24 Jul 2025 09:07:37 +0200 Subject: [PATCH 13/52] liint --- src/dgea/create_pseudobulk_samples/script.py | 112 ------------ .../deseq2}/deseq2/config.vsh.yaml | 16 +- .../deseq2}/deseq2/script.py | 168 ++++++++++-------- .../deseq2}/deseq2/test.py | 20 +-- 4 files changed, 108 insertions(+), 208 deletions(-) delete mode 100644 src/dgea/create_pseudobulk_samples/script.py rename src/{dgea => differential_expression/deseq2}/deseq2/config.vsh.yaml (91%) rename src/{dgea => differential_expression/deseq2}/deseq2/script.py (66%) rename src/{dgea => differential_expression/deseq2}/deseq2/test.py (93%) diff --git a/src/dgea/create_pseudobulk_samples/script.py b/src/dgea/create_pseudobulk_samples/script.py deleted file mode 100644 index 17913e0a876..00000000000 --- a/src/dgea/create_pseudobulk_samples/script.py +++ /dev/null @@ -1,112 +0,0 @@ -import scanpy as sc -import mudata as mu -import random -import numpy as np - -## VIASH START -par = { - "input": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", - "mod": "rna", - "min_cells": 100, - "target_cluster_id": "donor_id", - "grouping_column": "cell_type", # Add this parameter - "min_cells_per_sample": 50, # Add this parameter - "n_replicates": 1, # Add this parameter -} - -meta = {"resources_dir": "src/utils"} -### VIASH END - -import sys -sys.path.append(meta["resources_dir"]) - -from setup_logger import setup_logger - -logger = setup_logger() - -mdata = mu.read_h5mu(par["input"]) -mod = mdata.mod[par["mod"]] - -sc.pp.filter_genes( - mod, - min_cells=par["min_cells"], -) - -mod.obs_names_make_unique() -mod.var_names_make_unique() - -mod.obs[par["target_cluster_id"]] = mod.obs[par["target_cluster_id"]].cat.remove_unused_categories() - -mod_dge = mod.copy() - -pbs = {} -for cluster in mod_dge.obs[par["target_cluster_id"]].cat.categories: - logger.info(f"Processing cluster {cluster}") - - pbs[cluster] = [] - - # Get cells from this cluster - cluster_cells = mod_dge[mod_dge.obs[par["target_cluster_id"]] == cluster] - - grouping_values = cluster_cells.obs[par["grouping_column"]].unique() - - # Process each group within this cluster - for group in grouping_values: - group_cells = cluster_cells[cluster_cells.obs[par["grouping_column"]] == group] - - if len(group_cells) < par["min_cells_per_sample"]: - logger.warning(f"Skipping {cluster}-{group}: only {len(group_cells)} cells (minimum: {par['min_cells_per_sample']})") - continue - - # Create pseudoreplicates - indices = list(group_cells.obs_names) - random.shuffle(indices) - indices_split = np.array_split(np.array(indices), par["n_replicates"]) - - for i, pseudo_rep_indices in enumerate(indices_split): - if len(pseudo_rep_indices) >= par["min_cells_per_sample"]: - # Sum expression across cells to create pseudobulk sample - pseudobulk_cells = group_cells[pseudo_rep_indices] - - rep_adata = sc.AnnData( - X=pseudobulk_cells.X.sum(axis=0), - var=pseudobulk_cells.var[[]] # Empty var dataframe with same index - ) - - # Add metadata - rep_adata.obs_names = [f'{group}_{cluster}_rep{i}'] - rep_adata.obs['replicate'] = i - rep_adata.obs['group'] = str(group) - rep_adata.obs['cluster'] = str(cluster) - rep_adata.obs['n_cells'] = len(pseudo_rep_indices) - - # Add original metadata if available - first_cell_metadata = pseudobulk_cells.obs.iloc[0] - for col in pseudobulk_cells.obs.columns: - if col not in ['replicate', 'group', 'cluster', 'n_cells']: - rep_adata.obs[col] = str(first_cell_metadata[col]) - - pbs[cluster].append(rep_adata) - logger.info(f"Created pseudobulk sample: {group}_{cluster}_rep{i} with {len(pseudo_rep_indices)} cells") - -# Concatenate all pseudobulk samples -logger.info("Concatenating pseudobulk samples...") -all_pseudobulk_samples = [] -for cluster, samples in pbs.items(): - if samples: # Only add if there are samples - all_pseudobulk_samples.extend(samples) - -if all_pseudobulk_samples: - adata_pseudo = sc.concat(all_pseudobulk_samples) - logger.info(f"Created {len(all_pseudobulk_samples)} pseudobulk samples with shape: {adata_pseudo.shape}") - - # Convert object columns to string for compatibility - for col in adata_pseudo.obs.columns: - if adata_pseudo.obs[col].dtype == 'object': - adata_pseudo.obs[col] = adata_pseudo.obs[col].astype(str) - - logger.info("Pseudobulk generation completed successfully") -else: - logger.error("No pseudobulk samples were generated!") - -print("here") \ No newline at end of file diff --git a/src/dgea/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/deseq2/config.vsh.yaml similarity index 91% rename from src/dgea/deseq2/config.vsh.yaml rename to src/differential_expression/deseq2/deseq2/config.vsh.yaml index 350030e9273..607b0f23571 100644 --- a/src/dgea/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/deseq2/config.vsh.yaml @@ -1,15 +1,13 @@ name: deseq2 -namespace: dgea +namespace: differential_expression description: | - Perform differential expression analysis using DESeq2 on pseudobulk samples aggregated from single-cell data. - This component aggregates single-cell data into pseudobulk samples based on specified design factors, - and then performs differential expression analysis using DESeq2. + Perform differential expression analysis using DESeq2 on bulk samples or pseudobulk samples aggregated from single-cell data. authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ author ] - __merge__: /src/authors/jakub_majercik.yaml roles: [ author ] + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] - __merge__: /src/authors/dries_de_maeyer.yaml roles: [ contributor ] - __merge__: /src/authors/weiwei_schultz.yaml @@ -21,7 +19,7 @@ argument_groups: - name: "--input" alternatives: ["-i"] type: file - description: Input h5mu file. + description: Input h5mu file containing bulk or pseudobulk transcriptomic samples. direction: input required: true - name: "--modality" @@ -104,8 +102,8 @@ argument_groups: type: string description: | Column name in .obs that contains cell type information. - Only used when per_cell_type is true. - default: "cell_type" + Required when per_cell_type is true. + example: "cell_type" required: false - name: Outputs diff --git a/src/dgea/deseq2/script.py b/src/differential_expression/deseq2/deseq2/script.py similarity index 66% rename from src/dgea/deseq2/script.py rename to src/differential_expression/deseq2/deseq2/script.py index 8cfb066f338..437ab44591f 100644 --- a/src/dgea/deseq2/script.py +++ b/src/differential_expression/deseq2/deseq2/script.py @@ -14,12 +14,15 @@ "input_layer": None, "design_formula": "~ cell_type + disease + treatment", "contrast_column": "treatment", - "contrast_values": ["stim", "ctrl"], # [treatment, baseline] or [group1, group2, group3, ...] + "contrast_values": [ + "stim", + "ctrl", + ], # [treatment, baseline] or [group1, group2, group3, ...] "filter_genes_min_samples": None, "padj_threshold": 0.05, "log2fc_threshold": 0.0, "filter_gene_patterns": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], - "var_gene_name" : "feature_name", + "var_gene_name": "feature_name", "per_cell_type": False, # New parameter "cell_type_column": "cell_type", # Which column contains cell types } @@ -27,6 +30,7 @@ ## VIASH END + def is_normalized(layer): if sp.issparse(layer): row_sums = np.array(layer.sum(axis=1)).flatten() @@ -37,6 +41,7 @@ def is_normalized(layer): import sys + sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger @@ -47,19 +52,23 @@ def is_normalized(layer): # Load pseudobulk data logger.info(f"Loading pseudobulk data from {par['input']}") mdata = mu.read_h5mu(par["input"]) -mod = mdata.mod["rna"] +mod = mdata.mod[par["modality"]] -logger.info(f"Input pseudobulk data shape: {mod.shape}") -logger.info(f"Available metadata columns: {list(mod.obs.columns)}") +logger.info(f"Input (pseudo-)bulk data shape: {mod.shape}") +logger.info(f"Available metadata columns: {list(mod.obs.columns)}") # Parse design formula to extract factors -design_factors = re.findall(r'\b(?!~)\w+', par["design_formula"]) +design_factors = re.findall(r"\b(?!~)\w+", par["design_formula"]) logger.info(f"Design formula: {par['design_formula']}") logger.info(f"Extracted factors: {design_factors}") # Validate required columns exist contrast_column = par["contrast_column"] -required_columns = design_factors + [contrast_column] if contrast_column not in design_factors else design_factors +required_columns = ( + design_factors + [contrast_column] + if contrast_column not in design_factors + else design_factors +) missing_columns = [col for col in required_columns if col not in mod.obs.columns] if missing_columns: raise ValueError(f"Missing required columns in metadata: {missing_columns}") @@ -74,11 +83,7 @@ def is_normalized(layer): if is_normalized(mod.X): raise ValueError("Input layer must contain raw counts.") -counts = pd.DataFrame( - mod.X, - columns=mod.var_names, - index=mod.obs_names -) +counts = pd.DataFrame(mod.X, columns=mod.var_names, index=mod.obs_names) # Ensure counts are integers (required for DESeq2) counts = counts.astype(int) @@ -89,12 +94,14 @@ def is_normalized(layer): if par["filter_gene_patterns"]: pattern_string = "|".join(par["filter_gene_patterns"]) before_filter = counts.shape[1] - + # Filter genes based on column names (gene names) genes_to_keep = ~pd.Series(counts.columns).str.contains(pattern_string, na=False) counts_filtered = counts.loc[:, genes_to_keep] - - logger.info(f"Filtered out genes matching patterns {par['filter_gene_patterns']}: {before_filter} -> {counts_filtered.shape[1]}") + + logger.info( + f"Filtered out genes matching patterns {par['filter_gene_patterns']}: {before_filter} -> {counts_filtered.shape[1]}" + ) counts = counts_filtered logger.info(f"Counts matrix shape after gene pattern filtering: {counts.shape}") @@ -116,7 +123,9 @@ def is_normalized(layer): treatment, baseline = contrast_values contrast_tuple = (contrast_column, treatment, baseline) -logger.info(f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}") +logger.info( + f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}" +) # Create DESeq2 dataset logger.info("Creating DESeq2 dataset") @@ -126,133 +135,140 @@ def is_normalized(layer): metadata=metadata, design=par["design_formula"], ) - + # Optional gene filtering if par["filter_genes_min_samples"] is not None: - logger.info(f"Filtering genes expressed in at least {par['filter_genes_min_samples']} samples") + logger.info( + f"Filtering genes expressed in at least {par['filter_genes_min_samples']} samples" + ) sc.pp.filter_genes(dds, min_cells=par["filter_genes_min_samples"]) else: # Default: only filter genes with zero counts across all samples - logger.info("Default filtering: removing genes with zero counts across all samples") + logger.info( + "Default filtering: removing genes with zero counts across all samples" + ) sc.pp.filter_genes(dds, min_cells=1) - + logger.info(f"After filtering: {dds.shape}") - + # Check if running per cell type or multi-factor if par["per_cell_type"]: logger.info("Running DESeq2 analysis per cell type") - + # Remove cell_type from design formula for per-cell-type analysis - design_no_celltype = par["design_formula"].replace(f"+ {par['cell_type_column']}", "").replace(f"{par['cell_type_column']} +", "") - + design_no_celltype = ( + par["design_formula"] + .replace(f"+ {par['cell_type_column']}", "") + .replace(f"{par['cell_type_column']} +", "") + ) + all_results = [] for cell_type in metadata[par["cell_type_column"]].unique(): logger.info(f"Analyzing cell type: {cell_type}") - + # Subset data for this cell type cell_mask = metadata[par["cell_type_column"]] == cell_type counts_subset = counts.loc[cell_mask, :] metadata_subset = metadata.loc[cell_mask, :] - + # Run DESeq2 for this cell type dds_cell = DeseqDataSet( counts=counts_subset, metadata=metadata_subset, design=design_no_celltype, ) - + # Run DESeq2 analysis logger.info("Running DESeq2 analysis...") dds_cell.deseq2() - + # Perform statistical test - stat_res_cell = DeseqStats( - dds_cell, - contrast=contrast_tuple - ) + stat_res_cell = DeseqStats(dds_cell, contrast=contrast_tuple) stat_res_cell.summary() - + # Get results cell_results = stat_res_cell.results_df.reset_index() - logger.info(f"DESeq2 analysis for cell type {cell_type} completed. Found {len(cell_results)} genes.") - + logger.info( + f"DESeq2 analysis for cell type {cell_type} completed. Found {len(cell_results)} genes." + ) + # Filter results based on significance significant_genes_cell = cell_results[ - (cell_results['padj'] < par["padj_threshold"]) & - (cell_results['log2FoldChange'].abs() > par["log2fc_threshold"]) + (cell_results["padj"] < par["padj_threshold"]) + & (cell_results["log2FoldChange"].abs() > par["log2fc_threshold"]) ] - logger.info(f"Significant genes for cell type {cell_type} (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes_cell)}") - + logger.info( + f"Significant genes for cell type {cell_type} (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes_cell)}" + ) + # Sort by log2FoldChange - cell_results = cell_results.sort_values('log2FoldChange', ascending=False) - + cell_results = cell_results.sort_values("log2FoldChange", ascending=False) + # Add additional statistics - cell_results['abs_log2FoldChange'] = cell_results['log2FoldChange'].abs() - cell_results['significant'] = ( - (cell_results['padj'] < par["padj_threshold"]) & - (cell_results['log2FoldChange'].abs() > par["log2fc_threshold"]) - ) - + cell_results["abs_log2FoldChange"] = cell_results["log2FoldChange"].abs() + cell_results["significant"] = ( + cell_results["padj"] < par["padj_threshold"] + ) & (cell_results["log2FoldChange"].abs() > par["log2fc_threshold"]) + # Add cell_type column to results - cell_results['cell_type'] = cell_type + cell_results["cell_type"] = cell_type all_results.append(cell_results) - + # Combine all results results = pd.concat(all_results, ignore_index=True) - + else: # Current multi-factor approach logger.info("Running multi-factor DESeq2 analysis") # Run DESeq2 analysis logger.info("Running DESeq2 analysis...") dds.deseq2() - + # Perform statistical test - stat_res = DeseqStats( - dds, - contrast=contrast_tuple - ) + stat_res = DeseqStats(dds, contrast=contrast_tuple) stat_res.summary() - + # Get results results = stat_res.results_df.reset_index() logger.info(f"DESeq2 analysis completed. Found {len(results)} genes.") - + # Filter results based on significance significant_genes = results[ - (results['padj'] < par["padj_threshold"]) & - (results['log2FoldChange'].abs() > par["log2fc_threshold"]) + (results["padj"] < par["padj_threshold"]) + & (results["log2FoldChange"].abs() > par["log2fc_threshold"]) ] - logger.info(f"Significant genes (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes)}") - + logger.info( + f"Significant genes (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes)}" + ) + # Sort by log2FoldChange - results = results.sort_values('log2FoldChange', ascending=False) - + results = results.sort_values("log2FoldChange", ascending=False) + # Add additional statistics - results['abs_log2FoldChange'] = results['log2FoldChange'].abs() - results['significant'] = ( - (results['padj'] < par["padj_threshold"]) & - (results['log2FoldChange'].abs() > par["log2fc_threshold"]) + results["abs_log2FoldChange"] = results["log2FoldChange"].abs() + results["significant"] = (results["padj"] < par["padj_threshold"]) & ( + results["log2FoldChange"].abs() > par["log2fc_threshold"] ) - # Save results logger.info(f"Saving results to {par['output']}") results.to_csv(par["output"], index=False) - + # Log summary statistics - upregulated = results[(results['log2FoldChange'] > 0) & results['significant']] - downregulated = results[(results['log2FoldChange'] < 0) & results['significant']] - - logger.info(f"Summary:") + upregulated = results[(results["log2FoldChange"] > 0) & results["significant"]] + downregulated = results[(results["log2FoldChange"] < 0) & results["significant"]] + + logger.info("Summary:") logger.info(f" Total genes analyzed: {len(results)}") logger.info(f" Significant upregulated: {len(upregulated)}") logger.info(f" Significant downregulated: {len(downregulated)}") except Exception as e: logger.error(f"DESeq2 analysis failed: {str(e)}") - logger.error(f"Analysis context: {contrast_column} contrast {contrast_values}, {len(mod.obs)} samples, {len(mod.var)} genes") + logger.error( + f"Analysis context: {contrast_column} contrast {contrast_values}, {len(mod.obs)} samples, {len(mod.var)} genes" + ) logger.warning("Check your input data, design factors, and contrast specifications") - + raise e -logger.info("DESeq2 analysis completed successfully") \ No newline at end of file +logger.info("DESeq2 analysis completed successfully") diff --git a/src/dgea/deseq2/test.py b/src/differential_expression/deseq2/deseq2/test.py similarity index 93% rename from src/dgea/deseq2/test.py rename to src/differential_expression/deseq2/deseq2/test.py index 66d2974c45b..77e85f67dd6 100644 --- a/src/dgea/deseq2/test.py +++ b/src/differential_expression/deseq2/deseq2/test.py @@ -1,6 +1,4 @@ -import os import subprocess -import mudata as mu import sys import pytest import pandas as pd @@ -8,9 +6,7 @@ ## VIASH START -meta = { - "resources_dir": "resources_test/" -} +meta = {"resources_dir": "resources_test/"} ## VIASH END sys.path.append(meta["resources_dir"]) @@ -38,17 +34,19 @@ def test_simple_deseq2_execution(run_component, tmp_path, pseudobulk_test_data_p "treatment", "--contrast_values", "stim", - "--contrast_values", + "--contrast_values", "ctrl", ] ) assert output_path.exists(), "Output CSV file does not exist" - + # Check the output file results = pd.read_csv(output_path) expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] - assert all(col in results.columns for col in expected_columns), f"Expected columns {expected_columns} not found" + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found" + ) assert len(results) > 0, "No results found in output" @@ -78,7 +76,7 @@ def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_pa ) assert output_path.exists(), "Output CSV file does not exist" - + results = pd.read_csv(output_path) assert "significant" in results.columns, "Significance column not found" assert len(results) > 0, "No results found for complex design" @@ -105,7 +103,7 @@ def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_p "group2", ] ) - + assert re.search( r"Missing required columns in metadata: \['nonexistent_column'\]", err.value.stdout.decode("utf-8"), @@ -113,4 +111,4 @@ def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_p if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) From c2eb0ecf5217d6fe0bed77091cb7f8822765466f Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 24 Jul 2025 11:13:26 +0200 Subject: [PATCH 14/52] wip --- .../deseq2/deseq2/config.vsh.yaml | 40 +- .../deseq2/deseq2/script.py | 423 +++++++++--------- 2 files changed, 235 insertions(+), 228 deletions(-) diff --git a/src/differential_expression/deseq2/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/deseq2/config.vsh.yaml index 607b0f23571..a0db9f3fb8e 100644 --- a/src/differential_expression/deseq2/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/deseq2/config.vsh.yaml @@ -1,7 +1,8 @@ name: deseq2 namespace: differential_expression description: | - Perform differential expression analysis using DESeq2 on bulk samples or pseudobulk samples aggregated from single-cell data. + Performs differential expression analysis using DESeq2 on bulk samples or pseudobulk samples aggregated from single-cell data. + Note that this component focuses exclusively on factorial experimental design variables, and excludes e.g. covariates from the analysis. authors: - __merge__: /src/authors/jakub_majercik.yaml @@ -19,7 +20,7 @@ argument_groups: - name: "--input" alternatives: ["-i"] type: file - description: Input h5mu file containing bulk or pseudobulk transcriptomic samples. + description: Input h5mu file containing (pseudo-)bulk transcriptomic samples. direction: input required: true - name: "--modality" @@ -32,6 +33,18 @@ argument_groups: type: string required: false description: "Input layer to use. If None, X is used. This layer must contain raw counts." + - name: "--var_gene_name" + type: string + description: | + Column name in the variable features table that contains gene symbols. + This is used for reporting gene names in the output. + default: "feature_name" + required: false + - name: "--obs_cell_group" + type: boolean_true + description: | + Whether to run DESeq2 analysis separately for each cell group, for example per cell type or per cell cluster. + If true, performs per-cell-group analysis with cell-group-specific results. - name: Arguments arguments: @@ -59,11 +72,12 @@ argument_groups: example: ["stim", "ctrl"] - name: "--filter_genes_min_samples" type: integer + min: 1 description: | Minimum number of samples a gene must be expressed in to be included in the analysis. If None, no filtering is applied. required: false - - name: "--padj_threshold" + - name: "--p_adj_threshold" type: double description: | Adjusted p-value threshold for significance. @@ -85,26 +99,6 @@ argument_groups: Genes matching any of these patterns will be excluded from the analysis. required: false example: ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"] - - name: "--var_gene_name" - type: string - description: | - Column name in the variable features table that contains gene symbols. - This is used for reporting gene names in the output. - default: "feature_name" - required: false - - name: "--per_cell_type" - type: boolean_true - description: | - Whether to run DESeq2 analysis separately for each cell type. - If true, performs per-cell-type analysis with cell-type-specific results. - If false (default), runs multi-factor analysis accounting for cell type as a covariate. - - name: "--cell_type_column" - type: string - description: | - Column name in .obs that contains cell type information. - Required when per_cell_type is true. - example: "cell_type" - required: false - name: Outputs arguments: diff --git a/src/differential_expression/deseq2/deseq2/script.py b/src/differential_expression/deseq2/deseq2/script.py index 437ab44591f..34616c8fcd7 100644 --- a/src/differential_expression/deseq2/deseq2/script.py +++ b/src/differential_expression/deseq2/deseq2/script.py @@ -6,12 +6,15 @@ from pydeseq2.dds import DeseqDataSet from pydeseq2.ds import DeseqStats import re +import sys ## VIASH START par = { - "input": "resources_test/annotation_test_data/TS_Blood_filtered_annotated_pseudobulk.h5mu", + "input": "resources_test/annotation_test_data/TS_Blood_filtered_pseudobulk.h5mu", "output": "deseq2_results.csv", "input_layer": None, + "modality": "rna", + "obs_cell_group": None, "design_formula": "~ cell_type + disease + treatment", "contrast_column": "treatment", "contrast_values": [ @@ -19,17 +22,20 @@ "ctrl", ], # [treatment, baseline] or [group1, group2, group3, ...] "filter_genes_min_samples": None, - "padj_threshold": 0.05, + "p_adj_threshold": 0.05, "log2fc_threshold": 0.0, "filter_gene_patterns": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], "var_gene_name": "feature_name", - "per_cell_type": False, # New parameter - "cell_type_column": "cell_type", # Which column contains cell types } meta = {"resources_dir": "src/utils"} - ## VIASH END +sys.path.append(meta["resources_dir"]) + +from setup_logger import setup_logger + +logger = setup_logger() + def is_normalized(layer): if sp.issparse(layer): @@ -40,235 +46,242 @@ def is_normalized(layer): return np.allclose(row_sums, 1) -import sys - -sys.path.append(meta["resources_dir"]) - -from setup_logger import setup_logger +def parse_design_formula(design_formula): + design_factors = re.findall(r"\b(?!~)\w+", design_formula) + logger.info( + f"Design formula: {design_formula}\nExtracted factors: {design_factors}" + ) + return design_factors -logger = setup_logger() +def prepare_contrast_matrix(design_factors, contrast_column, metadata): + # Validate required columns exist + required_columns = ( + design_factors + [contrast_column] + if contrast_column not in design_factors + else design_factors + ) + missing_columns = [col for col in required_columns if col not in metadata.columns] + if missing_columns: + raise ValueError( + f"Missing required columns in metadata: {missing_columns}\n" + f"Available metadata columns: {list(metadata.columns)}" + ) -# Load pseudobulk data -logger.info(f"Loading pseudobulk data from {par['input']}") -mdata = mu.read_h5mu(par["input"]) -mod = mdata.mod[par["modality"]] + # Check contrast values exist + contrast_values = par["contrast_values"] + available_values = metadata[contrast_column].unique() + missing_values = [val for val in contrast_values if val not in available_values] + if missing_values: + raise ValueError( + f"Contrast values {missing_values} not found in {contrast_column}.\n" + f"Available values: {available_values}" + ) -logger.info(f"Input (pseudo-)bulk data shape: {mod.shape}") -logger.info(f"Available metadata columns: {list(mod.obs.columns)}") + if len(contrast_values) < 2: + raise ValueError(f"Need at least 2 values for contrast, got: {contrast_values}") -# Parse design formula to extract factors -design_factors = re.findall(r"\b(?!~)\w+", par["design_formula"]) -logger.info(f"Design formula: {par['design_formula']}") -logger.info(f"Extracted factors: {design_factors}") + treatment, baseline = contrast_values + contrast_tuple = (contrast_column, treatment, baseline) + logger.info( + f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}" + ) + return contrast_tuple -# Validate required columns exist -contrast_column = par["contrast_column"] -required_columns = ( - design_factors + [contrast_column] - if contrast_column not in design_factors - else design_factors -) -missing_columns = [col for col in required_columns if col not in mod.obs.columns] -if missing_columns: - raise ValueError(f"Missing required columns in metadata: {missing_columns}") -# Prepare counts matrix -logger.info("Preparing counts matrix for DESeq2") +def prepare_counts_matrix(layer, mod): + counts = pd.DataFrame(layer, columns=mod.var_names, index=mod.obs_names) + counts = counts.astype(int) # Ensure counts are integers (required for DESeq2) + return counts -# Use specified layer if provided -if par["input_layer"]: - mod.X = mod.layers[par["input_layer"]] -if is_normalized(mod.X): - raise ValueError("Input layer must contain raw counts.") +def filter_genes_by_pattern(counts, gene_pattern): + pattern_string = "|".join(gene_pattern) + before_filter = counts.shape[1] -counts = pd.DataFrame(mod.X, columns=mod.var_names, index=mod.obs_names) + # Filter genes based on column names (gene names) + genes_to_keep = list( + ~pd.Series(counts.columns).str.contains(pattern_string, na=False) + ) + counts = counts.loc[:, genes_to_keep] -# Ensure counts are integers (required for DESeq2) -counts = counts.astype(int) + logger.info( + f"Filtered out genes matching patterns {par['filter_gene_patterns']}:\n" + f"Counts matrix shape before bene pattern filtering: {before_filter}\n" + f"Counts matrix shape after gene pattern filtering: {counts.shape}" + ) -metadata = mod.obs.copy() + return counts -# Filter out unwanted gene patterns if specified (before DESeq2 analysis) -if par["filter_gene_patterns"]: - pattern_string = "|".join(par["filter_gene_patterns"]) - before_filter = counts.shape[1] - - # Filter genes based on column names (gene names) - genes_to_keep = ~pd.Series(counts.columns).str.contains(pattern_string, na=False) - counts_filtered = counts.loc[:, genes_to_keep] +def deseq2_analysis(adata, contrast_tuple): + adata.deseq2() + # Perform statistical test + stat_res = DeseqStats(adata, contrast=contrast_tuple) + results = stat_res.results_df.reset_index() + # Sort by log2FoldChange + results = results.sort_values("log2FoldChange", ascending=False) + # Add additional statistics + results["abs_log2FoldChange"] = results["log2FoldChange"].abs() + results["significant"] = (results["padj"] < par["p_adj_threshold"]) & ( + results["log2FoldChange"].abs() > par["log2fc_threshold"] + ) + # Filter results based on significance + significant_genes = results[ + (results["padj"] < par["p_adj_threshold"]) + & (results["log2FoldChange"].abs() > par["log2fc_threshold"]) + ] logger.info( - f"Filtered out genes matching patterns {par['filter_gene_patterns']}: {before_filter} -> {counts_filtered.shape[1]}" + f"Significant genes (padj < {par['p_adj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes)}" ) - counts = counts_filtered - -logger.info(f"Counts matrix shape after gene pattern filtering: {counts.shape}") -logger.info(f"Design formula: {par['design_formula']}") - -# Check contrast values exist -contrast_column = par["contrast_column"] -contrast_values = par["contrast_values"] -available_values = metadata[contrast_column].unique() -logger.info(f"Available values in {contrast_column}: {available_values}") - -# Validate all contrast values exist -missing_values = [val for val in contrast_values if val not in available_values] -if missing_values: - raise ValueError(f"Contrast values {missing_values} not found in {contrast_column}") - -if len(contrast_values) < 2: - raise ValueError(f"Need at least 2 values for contrast, got: {contrast_values}") - -treatment, baseline = contrast_values -contrast_tuple = (contrast_column, treatment, baseline) -logger.info( - f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}" -) - -# Create DESeq2 dataset -logger.info("Creating DESeq2 dataset") -try: - dds = DeseqDataSet( - counts=counts, - metadata=metadata, - design=par["design_formula"], + return results + + +def main(): + # Load pseudobulk data + logger.info(f"Loading pseudobulk data from {par['input']}") + mdata = mu.read_h5mu(par["input"]) + mod = mdata.mod[par["modality"]] + metadata = mod.obs.copy() + layer = mod.layers[par["input_layer"]] if par["input_layer"] else mod.X + if is_normalized(layer): + raise ValueError("Input layer must contain raw counts.") + + # Parse design formula to extract factors + logger.info("Preparing design formula") + design_factors = parse_design_formula(par["design_formula"]) + + # Preparing contrast matrix + logger.info("Preparing contrast matrix") + contrast_tuple = prepare_contrast_matrix( + design_factors, par["contrast_column"], metadata ) - # Optional gene filtering - if par["filter_genes_min_samples"] is not None: - logger.info( - f"Filtering genes expressed in at least {par['filter_genes_min_samples']} samples" + # Prepare counts matrix + logger.info("Preparing counts matrix for DESeq2") + counts = prepare_counts_matrix(layer, mod) + + # Filter out unwanted gene patterns if specified (before DESeq2 analysis) + if par["filter_gene_patterns"]: + logger.info("Filtering genes based on gene patterns") + counts = filter_genes_by_pattern(counts, par["filter_gene_patterns"]) + + # Run DESeq2 analysis + try: + logger.info("Creating DESeq2 dataset") + dds = DeseqDataSet( + counts=counts, + metadata=metadata, + design=par["design_formula"], ) - sc.pp.filter_genes(dds, min_cells=par["filter_genes_min_samples"]) - else: - # Default: only filter genes with zero counts across all samples + # Filtering genes based on presence across samples + min_cells = par.get("filter_genes_min_samples", 1) logger.info( - "Default filtering: removing genes with zero counts across all samples" + f"Filtering genes by counts: removing genes that are present in less than {min_cells} samples" ) - sc.pp.filter_genes(dds, min_cells=1) - - logger.info(f"After filtering: {dds.shape}") - - # Check if running per cell type or multi-factor - if par["per_cell_type"]: - logger.info("Running DESeq2 analysis per cell type") - - # Remove cell_type from design formula for per-cell-type analysis - design_no_celltype = ( - par["design_formula"] - .replace(f"+ {par['cell_type_column']}", "") - .replace(f"{par['cell_type_column']} +", "") - ) - - all_results = [] - for cell_type in metadata[par["cell_type_column"]].unique(): - logger.info(f"Analyzing cell type: {cell_type}") - - # Subset data for this cell type - cell_mask = metadata[par["cell_type_column"]] == cell_type - counts_subset = counts.loc[cell_mask, :] - metadata_subset = metadata.loc[cell_mask, :] - - # Run DESeq2 for this cell type - dds_cell = DeseqDataSet( - counts=counts_subset, - metadata=metadata_subset, - design=design_no_celltype, - ) + sc.pp.filter_genes(dds, min_cells=min_cells) - # Run DESeq2 analysis - logger.info("Running DESeq2 analysis...") - dds_cell.deseq2() + # Check if running per cell group or multi-factor + if par["obs_cell_group"]: + logger.info("Running DESeq2 analysis per cell group") - # Perform statistical test - stat_res_cell = DeseqStats(dds_cell, contrast=contrast_tuple) - stat_res_cell.summary() - - # Get results - cell_results = stat_res_cell.results_df.reset_index() - logger.info( - f"DESeq2 analysis for cell type {cell_type} completed. Found {len(cell_results)} genes." - ) - - # Filter results based on significance - significant_genes_cell = cell_results[ - (cell_results["padj"] < par["padj_threshold"]) - & (cell_results["log2FoldChange"].abs() > par["log2fc_threshold"]) - ] - logger.info( - f"Significant genes for cell type {cell_type} (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes_cell)}" + # Remove cell group from design formula for per-cell-type analysis + design_no_celltype = ( + par["design_formula"] + .replace(f"+ {par['obs_cell_group']}", "") + .replace(f"{par['obs_cell_group']} +", "") ) - # Sort by log2FoldChange - cell_results = cell_results.sort_values("log2FoldChange", ascending=False) - - # Add additional statistics - cell_results["abs_log2FoldChange"] = cell_results["log2FoldChange"].abs() - cell_results["significant"] = ( - cell_results["padj"] < par["padj_threshold"] - ) & (cell_results["log2FoldChange"].abs() > par["log2fc_threshold"]) - - # Add cell_type column to results - cell_results["cell_type"] = cell_type - all_results.append(cell_results) - - # Combine all results - results = pd.concat(all_results, ignore_index=True) - - else: - # Current multi-factor approach - logger.info("Running multi-factor DESeq2 analysis") - # Run DESeq2 analysis - logger.info("Running DESeq2 analysis...") - dds.deseq2() - - # Perform statistical test - stat_res = DeseqStats(dds, contrast=contrast_tuple) - stat_res.summary() - - # Get results - results = stat_res.results_df.reset_index() - logger.info(f"DESeq2 analysis completed. Found {len(results)} genes.") - - # Filter results based on significance - significant_genes = results[ - (results["padj"] < par["padj_threshold"]) - & (results["log2FoldChange"].abs() > par["log2fc_threshold"]) + all_results = [] + for cell_group in metadata[par["obs_cell_group"]].unique(): + logger.info(f"Analyzing cell group: {cell_group}") + + # Subset data for this cell group + cell_mask = metadata[par["obs_cell_group"]] == cell_group + counts_subset = counts.loc[cell_mask, :] + metadata_subset = metadata.loc[cell_mask, :] + + # Run DESeq2 for this cell group + dds_cell = DeseqDataSet( + counts=counts_subset, + metadata=metadata_subset, + design=design_no_celltype, + ) + + # Run DESeq2 analysis + logger.info("Running DESeq2 analysis...") + dds_cell.deseq2() + + # Perform statistical test + stat_res_cell = DeseqStats(dds_cell, contrast=contrast_tuple) + stat_res_cell.summary() + + # Get results + cell_results = stat_res_cell.results_df.reset_index() + logger.info( + f"DESeq2 analysis for cell group {cell_group} completed. Found {len(cell_results)} genes." + ) + + # Filter results based on significance + significant_genes_cell = cell_results[ + (cell_results["padj"] < par["p_adj_threshold"]) + & (cell_results["log2FoldChange"].abs() > par["log2fc_threshold"]) + ] + logger.info( + f"Significant genes for cell group {cell_group} (padj < {par['p_adj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes_cell)}" + ) + + # Sort by log2FoldChange + cell_results = cell_results.sort_values( + "log2FoldChange", ascending=False + ) + + # Add additional statistics + cell_results["abs_log2FoldChange"] = cell_results[ + "log2FoldChange" + ].abs() + cell_results["significant"] = ( + cell_results["padj"] < par["p_adj_threshold"] + ) & (cell_results["log2FoldChange"].abs() > par["log2fc_threshold"]) + + # Add cell_group column to results + cell_results["cell_group"] = cell_group + all_results.append(cell_results) + + # Combine all results + results = pd.concat(all_results, ignore_index=True) + + else: + results = deseq2_analysis(dds, contrast_tuple) + + # Log summary statistics + upregulated = results[(results["log2FoldChange"] > 0) & results["significant"]] + downregulated = results[ + (results["log2FoldChange"] < 0) & results["significant"] ] + logger.info( - f"Significant genes (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes)}" + "Summary:\n" + f"Total genes analyzed: {len(results)}\n" + f" Significant upregulated: {len(upregulated)}\n" + f" Significant downregulated: {len(downregulated)}\n" ) - # Sort by log2FoldChange - results = results.sort_values("log2FoldChange", ascending=False) + # Save results + logger.info(f"Saving results to {par['output']}") + results.to_csv(par["output"], index=False) - # Add additional statistics - results["abs_log2FoldChange"] = results["log2FoldChange"].abs() - results["significant"] = (results["padj"] < par["padj_threshold"]) & ( - results["log2FoldChange"].abs() > par["log2fc_threshold"] - ) - # Save results - logger.info(f"Saving results to {par['output']}") - results.to_csv(par["output"], index=False) - - # Log summary statistics - upregulated = results[(results["log2FoldChange"] > 0) & results["significant"]] - downregulated = results[(results["log2FoldChange"] < 0) & results["significant"]] - - logger.info("Summary:") - logger.info(f" Total genes analyzed: {len(results)}") - logger.info(f" Significant upregulated: {len(upregulated)}") - logger.info(f" Significant downregulated: {len(downregulated)}") - -except Exception as e: - logger.error(f"DESeq2 analysis failed: {str(e)}") - logger.error( - f"Analysis context: {contrast_column} contrast {contrast_values}, {len(mod.obs)} samples, {len(mod.var)} genes" - ) - logger.warning("Check your input data, design factors, and contrast specifications") + except Exception as e: + # logger.error( + # f"DESeq2 analysis failed: {str(e)}:\n" + # f"Analysis context: {contrast_column} contrast {contrast_values}, {len(mod.obs)} samples, {len(mod.var)} genes" + # ) + logger.warning("Check input data, design factors, and contrast specifications") + + raise e + + logger.info("DESeq2 analysis completed successfully") - raise e -logger.info("DESeq2 analysis completed successfully") +if __name__ == "__main__": + main() From c99221ac3e5c3567a74fea277fe7df27581619e6 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 24 Jul 2025 11:30:07 +0200 Subject: [PATCH 15/52] cleanup --- src/dgea/create_pseudobulk_samples/script.py | 112 ------------------- src/dgea/deseq2/script.py | 6 +- 2 files changed, 2 insertions(+), 116 deletions(-) delete mode 100644 src/dgea/create_pseudobulk_samples/script.py diff --git a/src/dgea/create_pseudobulk_samples/script.py b/src/dgea/create_pseudobulk_samples/script.py deleted file mode 100644 index 17913e0a876..00000000000 --- a/src/dgea/create_pseudobulk_samples/script.py +++ /dev/null @@ -1,112 +0,0 @@ -import scanpy as sc -import mudata as mu -import random -import numpy as np - -## VIASH START -par = { - "input": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu", - "mod": "rna", - "min_cells": 100, - "target_cluster_id": "donor_id", - "grouping_column": "cell_type", # Add this parameter - "min_cells_per_sample": 50, # Add this parameter - "n_replicates": 1, # Add this parameter -} - -meta = {"resources_dir": "src/utils"} -### VIASH END - -import sys -sys.path.append(meta["resources_dir"]) - -from setup_logger import setup_logger - -logger = setup_logger() - -mdata = mu.read_h5mu(par["input"]) -mod = mdata.mod[par["mod"]] - -sc.pp.filter_genes( - mod, - min_cells=par["min_cells"], -) - -mod.obs_names_make_unique() -mod.var_names_make_unique() - -mod.obs[par["target_cluster_id"]] = mod.obs[par["target_cluster_id"]].cat.remove_unused_categories() - -mod_dge = mod.copy() - -pbs = {} -for cluster in mod_dge.obs[par["target_cluster_id"]].cat.categories: - logger.info(f"Processing cluster {cluster}") - - pbs[cluster] = [] - - # Get cells from this cluster - cluster_cells = mod_dge[mod_dge.obs[par["target_cluster_id"]] == cluster] - - grouping_values = cluster_cells.obs[par["grouping_column"]].unique() - - # Process each group within this cluster - for group in grouping_values: - group_cells = cluster_cells[cluster_cells.obs[par["grouping_column"]] == group] - - if len(group_cells) < par["min_cells_per_sample"]: - logger.warning(f"Skipping {cluster}-{group}: only {len(group_cells)} cells (minimum: {par['min_cells_per_sample']})") - continue - - # Create pseudoreplicates - indices = list(group_cells.obs_names) - random.shuffle(indices) - indices_split = np.array_split(np.array(indices), par["n_replicates"]) - - for i, pseudo_rep_indices in enumerate(indices_split): - if len(pseudo_rep_indices) >= par["min_cells_per_sample"]: - # Sum expression across cells to create pseudobulk sample - pseudobulk_cells = group_cells[pseudo_rep_indices] - - rep_adata = sc.AnnData( - X=pseudobulk_cells.X.sum(axis=0), - var=pseudobulk_cells.var[[]] # Empty var dataframe with same index - ) - - # Add metadata - rep_adata.obs_names = [f'{group}_{cluster}_rep{i}'] - rep_adata.obs['replicate'] = i - rep_adata.obs['group'] = str(group) - rep_adata.obs['cluster'] = str(cluster) - rep_adata.obs['n_cells'] = len(pseudo_rep_indices) - - # Add original metadata if available - first_cell_metadata = pseudobulk_cells.obs.iloc[0] - for col in pseudobulk_cells.obs.columns: - if col not in ['replicate', 'group', 'cluster', 'n_cells']: - rep_adata.obs[col] = str(first_cell_metadata[col]) - - pbs[cluster].append(rep_adata) - logger.info(f"Created pseudobulk sample: {group}_{cluster}_rep{i} with {len(pseudo_rep_indices)} cells") - -# Concatenate all pseudobulk samples -logger.info("Concatenating pseudobulk samples...") -all_pseudobulk_samples = [] -for cluster, samples in pbs.items(): - if samples: # Only add if there are samples - all_pseudobulk_samples.extend(samples) - -if all_pseudobulk_samples: - adata_pseudo = sc.concat(all_pseudobulk_samples) - logger.info(f"Created {len(all_pseudobulk_samples)} pseudobulk samples with shape: {adata_pseudo.shape}") - - # Convert object columns to string for compatibility - for col in adata_pseudo.obs.columns: - if adata_pseudo.obs[col].dtype == 'object': - adata_pseudo.obs[col] = adata_pseudo.obs[col].astype(str) - - logger.info("Pseudobulk generation completed successfully") -else: - logger.error("No pseudobulk samples were generated!") - -print("here") \ No newline at end of file diff --git a/src/dgea/deseq2/script.py b/src/dgea/deseq2/script.py index 8cfb066f338..33151047fc2 100644 --- a/src/dgea/deseq2/script.py +++ b/src/dgea/deseq2/script.py @@ -20,8 +20,8 @@ "log2fc_threshold": 0.0, "filter_gene_patterns": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], "var_gene_name" : "feature_name", - "per_cell_type": False, # New parameter - "cell_type_column": "cell_type", # Which column contains cell types + "per_cell_type": False, + "cell_type_column": "cell_type", } meta = {"resources_dir": "src/utils"} @@ -201,9 +201,7 @@ def is_normalized(layer): results = pd.concat(all_results, ignore_index=True) else: - # Current multi-factor approach logger.info("Running multi-factor DESeq2 analysis") - # Run DESeq2 analysis logger.info("Running DESeq2 analysis...") dds.deseq2() From 313a6ec51163e4f3263045141872224f64dfe924 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 24 Jul 2025 11:51:33 +0200 Subject: [PATCH 16/52] wip --- .../deseq2/deseq2/config.vsh.yaml | 2 +- .../deseq2/deseq2/script.py | 104 +++++++----------- 2 files changed, 41 insertions(+), 65 deletions(-) diff --git a/src/differential_expression/deseq2/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/deseq2/config.vsh.yaml index a0db9f3fb8e..1c01965f887 100644 --- a/src/differential_expression/deseq2/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/deseq2/config.vsh.yaml @@ -2,7 +2,7 @@ name: deseq2 namespace: differential_expression description: | Performs differential expression analysis using DESeq2 on bulk samples or pseudobulk samples aggregated from single-cell data. - Note that this component focuses exclusively on factorial experimental design variables, and excludes e.g. covariates from the analysis. + Note that this componentonly considers factors as explanatory variables, and excludes covariates from the analysis. authors: - __merge__: /src/authors/jakub_majercik.yaml diff --git a/src/differential_expression/deseq2/deseq2/script.py b/src/differential_expression/deseq2/deseq2/script.py index 34616c8fcd7..5e072f6fba1 100644 --- a/src/differential_expression/deseq2/deseq2/script.py +++ b/src/differential_expression/deseq2/deseq2/script.py @@ -114,18 +114,43 @@ def filter_genes_by_pattern(counts, gene_pattern): return counts -def deseq2_analysis(adata, contrast_tuple): +def deseq2_analysis(counts, metadata, contrast_tuple, design_formula): + # Creating DESeq2 dataset + logger.info("Creating DESeq2 dataset") + adata = DeseqDataSet( + counts=counts, + metadata=metadata, + design=design_formula, + ) + + # Filtering genes based on presence across samples + sample_count = ( + par["filter_genes_min_samples"] if par["filter_genes_min_samples"] else 1 + ) + logger.info( + f"Filtering genes by counts: removing genes that are present in less than {sample_count} samples" + ) + sc.pp.filter_genes(adata, min_cells=sample_count) + + # Run DESeq2 analysis + logger.info("Running DESeq2 analysis") adata.deseq2() + # Perform statistical test + logger.info("Performing statistical test") stat_res = DeseqStats(adata, contrast=contrast_tuple) + stat_res.summary() results = stat_res.results_df.reset_index() + # Sort by log2FoldChange results = results.sort_values("log2FoldChange", ascending=False) + # Add additional statistics results["abs_log2FoldChange"] = results["log2FoldChange"].abs() results["significant"] = (results["padj"] < par["p_adj_threshold"]) & ( results["log2FoldChange"].abs() > par["log2fc_threshold"] ) + # Filter results based on significance significant_genes = results[ (results["padj"] < par["p_adj_threshold"]) @@ -168,23 +193,9 @@ def main(): # Run DESeq2 analysis try: - logger.info("Creating DESeq2 dataset") - dds = DeseqDataSet( - counts=counts, - metadata=metadata, - design=par["design_formula"], - ) - # Filtering genes based on presence across samples - min_cells = par.get("filter_genes_min_samples", 1) - logger.info( - f"Filtering genes by counts: removing genes that are present in less than {min_cells} samples" - ) - sc.pp.filter_genes(dds, min_cells=min_cells) - # Check if running per cell group or multi-factor if par["obs_cell_group"]: logger.info("Running DESeq2 analysis per cell group") - # Remove cell group from design formula for per-cell-type analysis design_no_celltype = ( par["design_formula"] @@ -194,56 +205,17 @@ def main(): all_results = [] for cell_group in metadata[par["obs_cell_group"]].unique(): - logger.info(f"Analyzing cell group: {cell_group}") - # Subset data for this cell group cell_mask = metadata[par["obs_cell_group"]] == cell_group counts_subset = counts.loc[cell_mask, :] metadata_subset = metadata.loc[cell_mask, :] # Run DESeq2 for this cell group - dds_cell = DeseqDataSet( - counts=counts_subset, - metadata=metadata_subset, - design=design_no_celltype, - ) - - # Run DESeq2 analysis - logger.info("Running DESeq2 analysis...") - dds_cell.deseq2() - - # Perform statistical test - stat_res_cell = DeseqStats(dds_cell, contrast=contrast_tuple) - stat_res_cell.summary() - - # Get results - cell_results = stat_res_cell.results_df.reset_index() - logger.info( - f"DESeq2 analysis for cell group {cell_group} completed. Found {len(cell_results)} genes." + logger.info(f"Running DESeq2 analysis for cell group {cell_group}...") + cell_results = deseq2_analysis( + counts_subset, metadata_subset, contrast_tuple, design_no_celltype ) - # Filter results based on significance - significant_genes_cell = cell_results[ - (cell_results["padj"] < par["p_adj_threshold"]) - & (cell_results["log2FoldChange"].abs() > par["log2fc_threshold"]) - ] - logger.info( - f"Significant genes for cell group {cell_group} (padj < {par['p_adj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes_cell)}" - ) - - # Sort by log2FoldChange - cell_results = cell_results.sort_values( - "log2FoldChange", ascending=False - ) - - # Add additional statistics - cell_results["abs_log2FoldChange"] = cell_results[ - "log2FoldChange" - ].abs() - cell_results["significant"] = ( - cell_results["padj"] < par["p_adj_threshold"] - ) & (cell_results["log2FoldChange"].abs() > par["log2fc_threshold"]) - # Add cell_group column to results cell_results["cell_group"] = cell_group all_results.append(cell_results) @@ -252,7 +224,9 @@ def main(): results = pd.concat(all_results, ignore_index=True) else: - results = deseq2_analysis(dds, contrast_tuple) + results = deseq2_analysis( + counts, metadata, contrast_tuple, par["design_formula"] + ) # Log summary statistics upregulated = results[(results["log2FoldChange"] > 0) & results["significant"]] @@ -262,7 +236,7 @@ def main(): logger.info( "Summary:\n" - f"Total genes analyzed: {len(results)}\n" + f" Total genes analyzed: {len(results)}\n" f" Significant upregulated: {len(upregulated)}\n" f" Significant downregulated: {len(downregulated)}\n" ) @@ -272,11 +246,13 @@ def main(): results.to_csv(par["output"], index=False) except Exception as e: - # logger.error( - # f"DESeq2 analysis failed: {str(e)}:\n" - # f"Analysis context: {contrast_column} contrast {contrast_values}, {len(mod.obs)} samples, {len(mod.var)} genes" - # ) - logger.warning("Check input data, design factors, and contrast specifications") + logger.warning( + "Check input data, design factors, and contrast specifications\n" + f"Contrast column: {par['contrast_column']}\n" + f"Contrast values: {par['contrast_values']}\n" + f"Number of samples: {len(mod.obs)}\n" + f"Number of genes: {len(mod.var)}\n" + ) raise e From f5526880cb604f839b775e3178d8511f84bf9f06 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 24 Jul 2025 11:53:15 +0200 Subject: [PATCH 17/52] wip --- src/differential_expression/deseq2/{deseq2 => }/config.vsh.yaml | 0 src/differential_expression/deseq2/{deseq2 => }/script.py | 0 src/differential_expression/deseq2/{deseq2 => }/test.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename src/differential_expression/deseq2/{deseq2 => }/config.vsh.yaml (100%) rename src/differential_expression/deseq2/{deseq2 => }/script.py (100%) rename src/differential_expression/deseq2/{deseq2 => }/test.py (100%) diff --git a/src/differential_expression/deseq2/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml similarity index 100% rename from src/differential_expression/deseq2/deseq2/config.vsh.yaml rename to src/differential_expression/deseq2/config.vsh.yaml diff --git a/src/differential_expression/deseq2/deseq2/script.py b/src/differential_expression/deseq2/script.py similarity index 100% rename from src/differential_expression/deseq2/deseq2/script.py rename to src/differential_expression/deseq2/script.py diff --git a/src/differential_expression/deseq2/deseq2/test.py b/src/differential_expression/deseq2/test.py similarity index 100% rename from src/differential_expression/deseq2/deseq2/test.py rename to src/differential_expression/deseq2/test.py From 84668b1ed50a5c7e3fc0a3c2182f772041c9f6c2 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 24 Jul 2025 13:22:49 +0200 Subject: [PATCH 18/52] expand tests --- .../deseq2/config.vsh.yaml | 4 +-- src/differential_expression/deseq2/script.py | 24 +++++++------ src/differential_expression/deseq2/test.py | 36 +++++++++++++++++++ 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index 1c01965f887..e41295a0057 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -41,9 +41,9 @@ argument_groups: default: "feature_name" required: false - name: "--obs_cell_group" - type: boolean_true + type: string description: | - Whether to run DESeq2 analysis separately for each cell group, for example per cell type or per cell cluster. + .obs field containing the cell group information, for example per cell type or per cell cluster. If true, performs per-cell-group analysis with cell-group-specific results. - name: Arguments diff --git a/src/differential_expression/deseq2/script.py b/src/differential_expression/deseq2/script.py index 5e072f6fba1..38b58bbe9d2 100644 --- a/src/differential_expression/deseq2/script.py +++ b/src/differential_expression/deseq2/script.py @@ -14,8 +14,8 @@ "output": "deseq2_results.csv", "input_layer": None, "modality": "rna", - "obs_cell_group": None, - "design_formula": "~ cell_type + disease + treatment", + "obs_cell_group": "cell_type", + "design_formula": "~ disease + treatment", "contrast_column": "treatment", "contrast_values": [ "stim", @@ -114,8 +114,7 @@ def filter_genes_by_pattern(counts, gene_pattern): return counts -def deseq2_analysis(counts, metadata, contrast_tuple, design_formula): - # Creating DESeq2 dataset +def create_deseq2_dataset(counts, metadata, design_formula): logger.info("Creating DESeq2 dataset") adata = DeseqDataSet( counts=counts, @@ -132,7 +131,10 @@ def deseq2_analysis(counts, metadata, contrast_tuple, design_formula): ) sc.pp.filter_genes(adata, min_cells=sample_count) - # Run DESeq2 analysis + return adata + + +def deseq2_analysis(adata, contrast_tuple): logger.info("Running DESeq2 analysis") adata.deseq2() @@ -212,21 +214,21 @@ def main(): # Run DESeq2 for this cell group logger.info(f"Running DESeq2 analysis for cell group {cell_group}...") - cell_results = deseq2_analysis( - counts_subset, metadata_subset, contrast_tuple, design_no_celltype + cell_dds = create_deseq2_dataset( + counts_subset, metadata_subset, design_no_celltype ) + cell_results = deseq2_analysis(cell_dds, contrast_tuple) # Add cell_group column to results - cell_results["cell_group"] = cell_group + cell_results[par["obs_cell_group"]] = cell_group all_results.append(cell_results) # Combine all results results = pd.concat(all_results, ignore_index=True) else: - results = deseq2_analysis( - counts, metadata, contrast_tuple, par["design_formula"] - ) + dds = create_deseq2_dataset(counts, metadata, par["design_formula"]) + results = deseq2_analysis(dds, contrast_tuple) # Log summary statistics upregulated = results[(results["log2FoldChange"] > 0) & results["significant"]] diff --git a/src/differential_expression/deseq2/test.py b/src/differential_expression/deseq2/test.py index 77e85f67dd6..299b6cb4ddb 100644 --- a/src/differential_expression/deseq2/test.py +++ b/src/differential_expression/deseq2/test.py @@ -50,6 +50,42 @@ def test_simple_deseq2_execution(run_component, tmp_path, pseudobulk_test_data_p assert len(results) > 0, "No results found in output" +def test_simple_deseq2_with_cell_group( + run_component, tmp_path, pseudobulk_test_data_path +): + """Test basic DESeq2 execution with minimal parameters""" + output_path = tmp_path / "deseq2_results.csv" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output", + str(output_path), + "--obs_cell_group", + "cell_type", + "--design_formula", + "~ treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_path.exists(), "Output CSV file does not exist" + + # Check the output file + results = pd.read_csv(output_path) + expected_columns = ["log2FoldChange", "pvalue", "padj", "significant", "cell_type"] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found" + ) + assert len(results) > 0, "No results found in output" + + def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_path): """Test DESeq2 with complex design formula accounting for multiple factors""" output_path = tmp_path / "deseq2_complex_results.csv" From dac24b0841fdb7e763e2b919d52918d7c7cd82cf Mon Sep 17 00:00:00 2001 From: dorien-er Date: Thu, 24 Jul 2025 17:28:14 +0200 Subject: [PATCH 19/52] remove previous script --- src/dgea/deseq2/script.py | 273 -------------------------------------- 1 file changed, 273 deletions(-) delete mode 100644 src/dgea/deseq2/script.py diff --git a/src/dgea/deseq2/script.py b/src/dgea/deseq2/script.py deleted file mode 100644 index 5ffcb915b3d..00000000000 --- a/src/dgea/deseq2/script.py +++ /dev/null @@ -1,273 +0,0 @@ -import scanpy as sc -import mudata as mu -import pandas as pd -import numpy as np -import scipy.sparse as sp -from pydeseq2.dds import DeseqDataSet -from pydeseq2.ds import DeseqStats -import re - -## VIASH START -par = { - "input": "resources_test/annotation_test_data/TS_Blood_filtered_annotated_pseudobulk.h5mu", - "output": "deseq2_results.csv", - "input_layer": None, - "design_formula": "~ cell_type + disease + treatment", - "contrast_column": "treatment", - "contrast_values": [ - "stim", - "ctrl", - ], # [treatment, baseline] or [group1, group2, group3, ...] - "filter_genes_min_samples": None, - "padj_threshold": 0.05, - "log2fc_threshold": 0.0, - "filter_gene_patterns": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], - "var_gene_name": "feature_name", - "per_cell_type": False, - "cell_type_column": "cell_type", -} -meta = {"resources_dir": "src/utils"} - -## VIASH END - - -def is_normalized(layer): - if sp.issparse(layer): - row_sums = np.array(layer.sum(axis=1)).flatten() - else: - row_sums = layer.sum(axis=1) - - return np.allclose(row_sums, 1) - - -import sys - -sys.path.append(meta["resources_dir"]) - -from setup_logger import setup_logger - -logger = setup_logger() - - -# Load pseudobulk data -logger.info(f"Loading pseudobulk data from {par['input']}") -mdata = mu.read_h5mu(par["input"]) -mod = mdata.mod["rna"] - -logger.info(f"Input pseudobulk data shape: {mod.shape}") -logger.info(f"Available metadata columns: {list(mod.obs.columns)}") - -# Parse design formula to extract factors -design_factors = re.findall(r"\b(?!~)\w+", par["design_formula"]) -logger.info(f"Design formula: {par['design_formula']}") -logger.info(f"Extracted factors: {design_factors}") - -# Validate required columns exist -contrast_column = par["contrast_column"] -required_columns = ( - design_factors + [contrast_column] - if contrast_column not in design_factors - else design_factors -) -missing_columns = [col for col in required_columns if col not in mod.obs.columns] -if missing_columns: - raise ValueError(f"Missing required columns in metadata: {missing_columns}") - -# Prepare counts matrix -logger.info("Preparing counts matrix for DESeq2") - -# Use specified layer if provided -if par["input_layer"]: - mod.X = mod.layers[par["input_layer"]] - -if is_normalized(mod.X): - raise ValueError("Input layer must contain raw counts.") - -counts = pd.DataFrame(mod.X, columns=mod.var_names, index=mod.obs_names) - -# Ensure counts are integers (required for DESeq2) -counts = counts.astype(int) - -metadata = mod.obs.copy() - -# Filter out unwanted gene patterns if specified (before DESeq2 analysis) -if par["filter_gene_patterns"]: - pattern_string = "|".join(par["filter_gene_patterns"]) - before_filter = counts.shape[1] - - # Filter genes based on column names (gene names) - genes_to_keep = ~pd.Series(counts.columns).str.contains(pattern_string, na=False) - counts_filtered = counts.loc[:, genes_to_keep] - - logger.info( - f"Filtered out genes matching patterns {par['filter_gene_patterns']}: {before_filter} -> {counts_filtered.shape[1]}" - ) - counts = counts_filtered - -logger.info(f"Counts matrix shape after gene pattern filtering: {counts.shape}") -logger.info(f"Design formula: {par['design_formula']}") - -# Check contrast values exist -contrast_column = par["contrast_column"] -contrast_values = par["contrast_values"] -available_values = metadata[contrast_column].unique() -logger.info(f"Available values in {contrast_column}: {available_values}") - -# Validate all contrast values exist -missing_values = [val for val in contrast_values if val not in available_values] -if missing_values: - raise ValueError(f"Contrast values {missing_values} not found in {contrast_column}") - -if len(contrast_values) < 2: - raise ValueError(f"Need at least 2 values for contrast, got: {contrast_values}") - -treatment, baseline = contrast_values -contrast_tuple = (contrast_column, treatment, baseline) -logger.info( - f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}" -) - -# Create DESeq2 dataset -logger.info("Creating DESeq2 dataset") -try: - dds = DeseqDataSet( - counts=counts, - metadata=metadata, - design=par["design_formula"], - ) - - # Optional gene filtering - if par["filter_genes_min_samples"] is not None: - logger.info( - f"Filtering genes expressed in at least {par['filter_genes_min_samples']} samples" - ) - sc.pp.filter_genes(dds, min_cells=par["filter_genes_min_samples"]) - else: - # Default: only filter genes with zero counts across all samples - logger.info( - "Default filtering: removing genes with zero counts across all samples" - ) - sc.pp.filter_genes(dds, min_cells=1) - - logger.info(f"After filtering: {dds.shape}") - - # Check if running per cell type or multi-factor - if par["per_cell_type"]: - logger.info("Running DESeq2 analysis per cell type") - - # Remove cell_type from design formula for per-cell-type analysis - design_no_celltype = ( - par["design_formula"] - .replace(f"+ {par['cell_type_column']}", "") - .replace(f"{par['cell_type_column']} +", "") - ) - - all_results = [] - for cell_type in metadata[par["cell_type_column"]].unique(): - logger.info(f"Analyzing cell type: {cell_type}") - - # Subset data for this cell type - cell_mask = metadata[par["cell_type_column"]] == cell_type - counts_subset = counts.loc[cell_mask, :] - metadata_subset = metadata.loc[cell_mask, :] - - # Run DESeq2 for this cell type - dds_cell = DeseqDataSet( - counts=counts_subset, - metadata=metadata_subset, - design=design_no_celltype, - ) - - # Run DESeq2 analysis - logger.info("Running DESeq2 analysis...") - dds_cell.deseq2() - - # Perform statistical test - stat_res_cell = DeseqStats(dds_cell, contrast=contrast_tuple) - stat_res_cell.summary() - - # Get results - cell_results = stat_res_cell.results_df.reset_index() - logger.info( - f"DESeq2 analysis for cell type {cell_type} completed. Found {len(cell_results)} genes." - ) - - # Filter results based on significance - significant_genes_cell = cell_results[ - (cell_results["padj"] < par["padj_threshold"]) - & (cell_results["log2FoldChange"].abs() > par["log2fc_threshold"]) - ] - logger.info( - f"Significant genes for cell type {cell_type} (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes_cell)}" - ) - - # Sort by log2FoldChange - cell_results = cell_results.sort_values("log2FoldChange", ascending=False) - - # Add additional statistics - cell_results["abs_log2FoldChange"] = cell_results["log2FoldChange"].abs() - cell_results["significant"] = ( - cell_results["padj"] < par["padj_threshold"] - ) & (cell_results["log2FoldChange"].abs() > par["log2fc_threshold"]) - - # Add cell_type column to results - cell_results["cell_type"] = cell_type - all_results.append(cell_results) - - # Combine all results - results = pd.concat(all_results, ignore_index=True) - - else: - logger.info("Running multi-factor DESeq2 analysis") - logger.info("Running DESeq2 analysis...") - dds.deseq2() - - # Perform statistical test - stat_res = DeseqStats(dds, contrast=contrast_tuple) - stat_res.summary() - - # Get results - results = stat_res.results_df.reset_index() - logger.info(f"DESeq2 analysis completed. Found {len(results)} genes.") - - # Filter results based on significance - significant_genes = results[ - (results["padj"] < par["padj_threshold"]) - & (results["log2FoldChange"].abs() > par["log2fc_threshold"]) - ] - logger.info( - f"Significant genes (padj < {par['padj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes)}" - ) - - # Sort by log2FoldChange - results = results.sort_values("log2FoldChange", ascending=False) - - # Add additional statistics - results["abs_log2FoldChange"] = results["log2FoldChange"].abs() - results["significant"] = (results["padj"] < par["padj_threshold"]) & ( - results["log2FoldChange"].abs() > par["log2fc_threshold"] - ) - - # Save results - logger.info(f"Saving results to {par['output']}") - results.to_csv(par["output"], index=False) - - # Log summary statistics - upregulated = results[(results["log2FoldChange"] > 0) & results["significant"]] - downregulated = results[(results["log2FoldChange"] < 0) & results["significant"]] - - logger.info("Summary:") - logger.info(f" Total genes analyzed: {len(results)}") - logger.info(f" Significant upregulated: {len(upregulated)}") - logger.info(f" Significant downregulated: {len(downregulated)}") - -except Exception as e: - logger.error(f"DESeq2 analysis failed: {str(e)}") - logger.error( - f"Analysis context: {contrast_column} contrast {contrast_values}, {len(mod.obs)} samples, {len(mod.var)} genes" - ) - logger.warning("Check your input data, design factors, and contrast specifications") - - raise e - -logger.info("DESeq2 analysis completed successfully") From fc30feee1bfe428deca523c66ebc9353473e1b1f Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Wed, 30 Jul 2025 14:58:20 +0200 Subject: [PATCH 20/52] add multiple treatment conditions --- src/differential_expression/deseq2/script.py | 98 ++++++++++++-------- 1 file changed, 61 insertions(+), 37 deletions(-) diff --git a/src/differential_expression/deseq2/script.py b/src/differential_expression/deseq2/script.py index 38b58bbe9d2..dfa4609325d 100644 --- a/src/differential_expression/deseq2/script.py +++ b/src/differential_expression/deseq2/script.py @@ -78,17 +78,26 @@ def prepare_contrast_matrix(design_factors, contrast_column, metadata): f"Available values: {available_values}" ) - if len(contrast_values) < 2: + # Handle multi-level contrasts + if len(contrast_values) == 2: + # Pairwise comparison (current behavior) + treatment, baseline = contrast_values + contrast_tuple = (contrast_column, treatment, baseline) + logger.info(f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}") + return [contrast_tuple] + + elif len(contrast_values) > 2: + # Multiple comparisons - all vs first (baseline) + baseline = contrast_values[0] + contrast_tuples = [] + for treatment in contrast_values[1:]: + contrast_tuples.append((contrast_column, treatment, baseline)) + logger.info(f"Performing multiple contrasts against baseline '{baseline}': {[t[1] for t in contrast_tuples]}") + return contrast_tuples + + else: raise ValueError(f"Need at least 2 values for contrast, got: {contrast_values}") - treatment, baseline = contrast_values - contrast_tuple = (contrast_column, treatment, baseline) - logger.info( - f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}" - ) - return contrast_tuple - - def prepare_counts_matrix(layer, mod): counts = pd.DataFrame(layer, columns=mod.var_names, index=mod.obs_names) counts = counts.astype(int) # Ensure counts are integers (required for DESeq2) @@ -134,34 +143,49 @@ def create_deseq2_dataset(counts, metadata, design_formula): return adata -def deseq2_analysis(adata, contrast_tuple): +def deseq2_analysis(adata, contrast_tuples): logger.info("Running DESeq2 analysis") adata.deseq2() - # Perform statistical test - logger.info("Performing statistical test") - stat_res = DeseqStats(adata, contrast=contrast_tuple) - stat_res.summary() - results = stat_res.results_df.reset_index() - - # Sort by log2FoldChange - results = results.sort_values("log2FoldChange", ascending=False) - - # Add additional statistics - results["abs_log2FoldChange"] = results["log2FoldChange"].abs() - results["significant"] = (results["padj"] < par["p_adj_threshold"]) & ( - results["log2FoldChange"].abs() > par["log2fc_threshold"] - ) - - # Filter results based on significance - significant_genes = results[ - (results["padj"] < par["p_adj_threshold"]) - & (results["log2FoldChange"].abs() > par["log2fc_threshold"]) - ] - logger.info( - f"Significant genes (padj < {par['p_adj_threshold']}, |log2FC| > {par['log2fc_threshold']}): {len(significant_genes)}" - ) - return results + all_results = [] + + # Handle multiple contrasts + if not isinstance(contrast_tuples, list): + contrast_tuples = [contrast_tuples] + + for contrast_tuple in contrast_tuples: + logger.info(f"Performing statistical test for contrast: {contrast_tuple}") + stat_res = DeseqStats(adata, contrast=contrast_tuple) + stat_res.summary() + results = stat_res.results_df.reset_index() + + # Add contrast information + results['contrast'] = f"{contrast_tuple[1]}_vs_{contrast_tuple[2]}" + results['treatment'] = contrast_tuple[1] + results['baseline'] = contrast_tuple[2] + + # Sort by log2FoldChange + results = results.sort_values("log2FoldChange", ascending=False) + + # Add additional statistics + results["abs_log2FoldChange"] = results["log2FoldChange"].abs() + results["significant"] = (results["padj"] < par["p_adj_threshold"]) & ( + results["log2FoldChange"].abs() > par["log2fc_threshold"] + ) + + all_results.append(results) + + # Combine all contrast results + combined_results = pd.concat(all_results, ignore_index=True) + + # Log summary per contrast + for contrast_tuple in contrast_tuples: + contrast_name = f"{contrast_tuple[1]}_vs_{contrast_tuple[2]}" + contrast_results = combined_results[combined_results['contrast'] == contrast_name] + significant_genes = contrast_results[contrast_results['significant']] + logger.info(f"Contrast {contrast_name}: {len(significant_genes)} significant genes") + + return combined_results def main(): @@ -180,7 +204,7 @@ def main(): # Preparing contrast matrix logger.info("Preparing contrast matrix") - contrast_tuple = prepare_contrast_matrix( + contrast_tuples = prepare_contrast_matrix( design_factors, par["contrast_column"], metadata ) @@ -217,7 +241,7 @@ def main(): cell_dds = create_deseq2_dataset( counts_subset, metadata_subset, design_no_celltype ) - cell_results = deseq2_analysis(cell_dds, contrast_tuple) + cell_results = deseq2_analysis(cell_dds, contrast_tuples) # Add cell_group column to results cell_results[par["obs_cell_group"]] = cell_group @@ -228,7 +252,7 @@ def main(): else: dds = create_deseq2_dataset(counts, metadata, par["design_formula"]) - results = deseq2_analysis(dds, contrast_tuple) + results = deseq2_analysis(dds, contrast_tuples) # Log summary statistics upregulated = results[(results["log2FoldChange"] > 0) & results["significant"]] From 40f786e5e3ef1a6635257a3a0e89b44cdefb438a Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Wed, 30 Jul 2025 15:01:25 +0200 Subject: [PATCH 21/52] fix contrast_Values arg --- src/differential_expression/deseq2/config.vsh.yaml | 4 ++-- src/differential_expression/deseq2/script.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index e41295a0057..b4027e8f6e4 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -67,9 +67,9 @@ argument_groups: multiple: true description: | Values to compare in the contrast column. - First value is treatment, second is baseline (e.g., ["stim", "ctrl"]). + First value is baseline, following values are different treatments. required: true - example: ["stim", "ctrl"] + example: ["ctrl", "stim"] - name: "--filter_genes_min_samples" type: integer min: 1 diff --git a/src/differential_expression/deseq2/script.py b/src/differential_expression/deseq2/script.py index dfa4609325d..2c9f519258a 100644 --- a/src/differential_expression/deseq2/script.py +++ b/src/differential_expression/deseq2/script.py @@ -18,8 +18,8 @@ "design_formula": "~ disease + treatment", "contrast_column": "treatment", "contrast_values": [ - "stim", "ctrl", + "stim", ], # [treatment, baseline] or [group1, group2, group3, ...] "filter_genes_min_samples": None, "p_adj_threshold": 0.05, From c6661991d3259f90c4393439da7158c62cf87877 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Wed, 30 Jul 2025 17:47:53 +0200 Subject: [PATCH 22/52] add workflow config --- .../pseudobulk_deseq2/config.vsh.yaml | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml new file mode 100644 index 00000000000..9120ae13680 --- /dev/null +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -0,0 +1,151 @@ +name: pseudobulk_deseq2 +namespace: differential_expression +scope: public +description: | + Performs pseudobulk generation and subsequent differential expression analysis using DESeq2. +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + required: true + description: ID of the sample. + example: foo + - name: --input + type: string + required: true + description: Input h5mu file. + example: /path/to/input_file.h5mu + - name: --modality + type: string + default: rna + description: Which modality from the input MuData file to process. + - name: --input_layer + type: string + required: false + description: Input layer to use. If None, X is used. This layer must contain raw counts. + - name: --obs_label + type: string + required: true + description: .obs field containing the variable to group on. Typically this field contains cell groups such as annotated cell types or clusters. + example: cell_type + - name: --obs_groups + type: string + multiple: true + description: .obs fields containing the experimental condition(s) to create pseudobulk samples for. + example: ["treatment", "disease"] + - name: "--var_gene_name" + type: string + description: | + Column name in the variable features table that contains gene symbols. + This is used for reporting gene names in the output. + default: "feature_name" + required: false + - name: --obs_cell_group + type: string + description: | + .obs field containing the cell group information, for example per cell type or per cell cluster. + If true, performs per-cell-group analysis with cell-group-specific results. + required: false + + - name: Pseudobulk Options + arguments: + - name: --aggregation_method + type: string + choices: ["sum", "mean"] + default: sum + description: Method to aggregate the raw counts for pseudoreplicates. Either sum or mean. + - name: --min_obs_per_sample + type: integer + min: 1 + default: 30 + description: Minimum number of cells per pseudobulk sample. + - name: --random_state + type: integer + min: 0 + default: 0 + description: The random seed for sampling. + - name: --obs_cell_count + type: string + required: false + description: .obs field containing the cell count per pseudobulk sample. + - name: DGEA options + arguments: + - name: --design_formula + type: string + required: true + description: | + Design formula for DESeq2 analysis in R-style format. + Specifies the statistical model to account for various factors. + example: "~ cell_type + disease + treatment" + - name: --contrast_column + type: string + required: true + description: | + Column in the metadata to use for the contrast. + This column should contain the conditions to compare. + example: "treatment" + - name: --contrast_values + type: string + multiple: true + required: true + description: | + Values to compare in the contrast column. + First value is baseline, following values are different treatments. + example: ["ctrl", "stim"] + - name: --filter_genes_min_samples + type: integer + min: 1 + description: | + Minimum number of samples a gene must be expressed in to be included in the analysis. + If None, no filtering is applied. + required: false + - name: --p_adj_threshold + type: double + default: 0.05 + description: | + Adjusted p-value threshold for significance. + Genes with adjusted p-values below this threshold will be considered significant. + required: false + - name: --log2fc_threshold + type: double + default: 0.0 + description: | + Log2 fold change threshold for significance. + Genes with absolute log2 fold change above this threshold will be considered significant. + required: false + - name: --filter_gene_patterns + type: string + multiple: true + description: | + List of regex patterns to filter out genes. + Genes matching any of these patterns will be excluded from the analysis. + required: false + + - name: Outputs + arguments: + - name: --output + type: file + description: Output CSV file with DESeq2 results. + direction: output + required: true + +dependencies: + - name: differential_expression/create_pseudobulk + alias: create_pseudobulk + - name: differential_expression/deseq2 + alias: deseq2 + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +runners: + - type: nextflow \ No newline at end of file From 802948e5195bd5e37b269ea705b85fb2d2a1a261 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 31 Jul 2025 13:51:24 +0200 Subject: [PATCH 23/52] add workflow --- .../pseudobulk_deseq2/main.nf | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 src/workflows/differential_expression/pseudobulk_deseq2/main.nf diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf new file mode 100644 index 00000000000..cf46ce64af2 --- /dev/null +++ b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf @@ -0,0 +1,47 @@ +workflow run_wf { + take: + input_ch + + main: + + output_ch = input_ch + | create_pseudobulk.run( + fromState: [ + id: "id", + input: "input", + modality: "modality", + input_layer: "input_layer", + obs_label: "obs_label", + obs_groups: "obs_groups", + aggregation_method: "aggregation_method", + min_obs_per_sample: "min_obs_per_sample", + random_state: "random_state", + obs_cell_count: "obs_cell_count" + ], + toState: [ "input": "output" ] + ) + + | deseq2.run( + fromState: [ + id: "id", + input: "input", + modality: "modality", + input_layer: "input_layer", + var_gene_name: "var_gene_name", + obs_cell_group: "obs_cell_group", + design_formula: "design_formula", + contrast_column: "contrast_column", + contrast_values: "contrast_values", + filter_genes_min_samples: "filter_genes_min_samples", + p_adj_threshold: "p_adj_threshold", + log2fc_threshold: "log2fc_threshold", + filter_gene_patterns: "filter_gene_patterns", + ], + toState: [ "output": "output" ] + ) + + | setState([ "output" ]) + + emit: + output_ch +} \ No newline at end of file From d4df4aaf01091a6b6e4518520c34eab4728aa273 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Tue, 5 Aug 2025 10:27:09 +0200 Subject: [PATCH 24/52] add workflow test --- .../create_pseudobulk/config.vsh.yaml | 4 +- .../pseudobulk_deseq2/config.vsh.yaml | 12 ++++-- .../pseudobulk_deseq2/integration_test.sh | 15 +++++++ .../pseudobulk_deseq2/nextflow.config | 10 +++++ .../pseudobulk_deseq2/test.nf | 40 +++++++++++++++++++ 5 files changed, 75 insertions(+), 6 deletions(-) create mode 100755 src/workflows/differential_expression/pseudobulk_deseq2/integration_test.sh create mode 100644 src/workflows/differential_expression/pseudobulk_deseq2/nextflow.config create mode 100644 src/workflows/differential_expression/pseudobulk_deseq2/test.nf diff --git a/src/differential_expression/create_pseudobulk/config.vsh.yaml b/src/differential_expression/create_pseudobulk/config.vsh.yaml index aa90b260be1..855566a39b0 100644 --- a/src/differential_expression/create_pseudobulk/config.vsh.yaml +++ b/src/differential_expression/create_pseudobulk/config.vsh.yaml @@ -1,5 +1,5 @@ -name: differential_expression -namespace: "create_pseudobulk" +name: create_pseudobulk +namespace: differential_expression scope: "public" description: | Generation of pseudobulk samples from single-cell transcriptomics data, diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml index 9120ae13680..1267ed5b9c9 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -1,5 +1,5 @@ name: pseudobulk_deseq2 -namespace: differential_expression +namespace: workflows/differential_expression scope: public description: | Performs pseudobulk generation and subsequent differential expression analysis using DESeq2. @@ -18,7 +18,7 @@ argument_groups: description: ID of the sample. example: foo - name: --input - type: string + type: file required: true description: Input h5mu file. example: /path/to/input_file.h5mu @@ -138,14 +138,18 @@ argument_groups: dependencies: - name: differential_expression/create_pseudobulk - alias: create_pseudobulk - name: differential_expression/deseq2 - alias: deseq2 resources: - type: nextflow_script path: main.nf entrypoint: run_wf +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + runners: - type: nextflow \ No newline at end of file diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/integration_test.sh b/src/workflows/differential_expression/pseudobulk_deseq2/integration_test.sh new file mode 100755 index 00000000000..c712b294512 --- /dev/null +++ b/src/workflows/differential_expression/pseudobulk_deseq2/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/differential_expression/pseudobulk_deseq2/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config \ diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/nextflow.config b/src/workflows/differential_expression/pseudobulk_deseq2/nextflow.config new file mode 100644 index 00000000000..059100c489c --- /dev/null +++ b/src/workflows/differential_expression/pseudobulk_deseq2/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf new file mode 100644 index 00000000000..7fb716bab68 --- /dev/null +++ b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf @@ -0,0 +1,40 @@ +nextflow.enable.dsl=2 + +include { pseudobulk_deseq2 } from params.rootDir + "/target/nextflow/workflows/differential_expression/pseudobulk_deseq2/main.nf" +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + // allow changing the resources_test dir + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "simple_execution_test", + input: resources_test.resolve("/annotation_test_data/TS_Blood_filtered.h5mu"), + obs_label: "cell_type", + obs_groups: "treatment", + design_formula: "~ cell_type + disease + treatment", + contrast_column: "treatment", + contrast_values: ["ctrl", "stim"] + ] + ]) + | map{ state -> [state.id, state] } + | pseudobulk_deseq2 + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") : "Output ID should be same as input ID" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".csv") : "Output file should end with '.csv'. Found: ${state.output}" + + "Output: $output" + } +} \ No newline at end of file From d18c043b3fbc6eb4bec78bfe08a7bb7c678f28ae Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Tue, 5 Aug 2025 10:29:50 +0200 Subject: [PATCH 25/52] fix typo --- src/workflows/differential_expression/pseudobulk_deseq2/test.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf index 7fb716bab68..8915c605fa1 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf @@ -11,7 +11,7 @@ workflow test_wf { [ [ id: "simple_execution_test", - input: resources_test.resolve("/annotation_test_data/TS_Blood_filtered.h5mu"), + input: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), obs_label: "cell_type", obs_groups: "treatment", design_formula: "~ cell_type + disease + treatment", From 5976197552730de97728adfab7ad3fd5860af218 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Tue, 5 Aug 2025 11:43:51 +0200 Subject: [PATCH 26/52] integration test working --- src/differential_expression/deseq2/config.vsh.yaml | 3 +-- .../differential_expression/pseudobulk_deseq2/main.nf | 1 + .../differential_expression/pseudobulk_deseq2/test.nf | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index b4027e8f6e4..ba537f754dc 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -127,8 +127,7 @@ engines: - pydeseq2~=0.5.2 - zarr<3.0.0 - type: python - __merge__: [/src/base/requirements/anndata_mudata.yaml] - __merge__: [/src/base/requirements/scanpy.yaml] + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml] __merge__: [ /src/base/requirements/python_test_setup.yaml, .] runners: diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf index cf46ce64af2..146275a4e8d 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf @@ -36,6 +36,7 @@ workflow run_wf { p_adj_threshold: "p_adj_threshold", log2fc_threshold: "log2fc_threshold", filter_gene_patterns: "filter_gene_patterns", + output: "output" ], toState: [ "output": "output" ] ) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf index 8915c605fa1..76a97f08bb4 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf @@ -14,9 +14,10 @@ workflow test_wf { input: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), obs_label: "cell_type", obs_groups: "treatment", - design_formula: "~ cell_type + disease + treatment", + design_formula: "~ cell_type + treatment", contrast_column: "treatment", - contrast_values: ["ctrl", "stim"] + contrast_values: ["ctrl", "stim"], + output: "simple_execution_test_output.csv" ] ]) | map{ state -> [state.id, state] } From f2b912327cc714e1dc800531fe5547523152c165 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 5 Aug 2025 13:19:29 +0200 Subject: [PATCH 27/52] linting --- src/differential_expression/deseq2/script.py | 47 ++++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/src/differential_expression/deseq2/script.py b/src/differential_expression/deseq2/script.py index 2c9f519258a..440dd561848 100644 --- a/src/differential_expression/deseq2/script.py +++ b/src/differential_expression/deseq2/script.py @@ -83,21 +83,26 @@ def prepare_contrast_matrix(design_factors, contrast_column, metadata): # Pairwise comparison (current behavior) treatment, baseline = contrast_values contrast_tuple = (contrast_column, treatment, baseline) - logger.info(f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}") + logger.info( + f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}" + ) return [contrast_tuple] - + elif len(contrast_values) > 2: # Multiple comparisons - all vs first (baseline) baseline = contrast_values[0] contrast_tuples = [] for treatment in contrast_values[1:]: contrast_tuples.append((contrast_column, treatment, baseline)) - logger.info(f"Performing multiple contrasts against baseline '{baseline}': {[t[1] for t in contrast_tuples]}") + logger.info( + f"Performing multiple contrasts against baseline '{baseline}': {[t[1] for t in contrast_tuples]}" + ) return contrast_tuples - + else: raise ValueError(f"Need at least 2 values for contrast, got: {contrast_values}") + def prepare_counts_matrix(layer, mod): counts = pd.DataFrame(layer, columns=mod.var_names, index=mod.obs_names) counts = counts.astype(int) # Ensure counts are integers (required for DESeq2) @@ -148,43 +153,47 @@ def deseq2_analysis(adata, contrast_tuples): adata.deseq2() all_results = [] - + # Handle multiple contrasts if not isinstance(contrast_tuples, list): contrast_tuples = [contrast_tuples] - + for contrast_tuple in contrast_tuples: logger.info(f"Performing statistical test for contrast: {contrast_tuple}") stat_res = DeseqStats(adata, contrast=contrast_tuple) stat_res.summary() results = stat_res.results_df.reset_index() - + # Add contrast information - results['contrast'] = f"{contrast_tuple[1]}_vs_{contrast_tuple[2]}" - results['treatment'] = contrast_tuple[1] - results['baseline'] = contrast_tuple[2] - + results["contrast"] = f"{contrast_tuple[1]}_vs_{contrast_tuple[2]}" + results["treatment"] = contrast_tuple[1] + results["baseline"] = contrast_tuple[2] + # Sort by log2FoldChange results = results.sort_values("log2FoldChange", ascending=False) - + # Add additional statistics results["abs_log2FoldChange"] = results["log2FoldChange"].abs() results["significant"] = (results["padj"] < par["p_adj_threshold"]) & ( results["log2FoldChange"].abs() > par["log2fc_threshold"] ) - + all_results.append(results) - + # Combine all contrast results combined_results = pd.concat(all_results, ignore_index=True) - + # Log summary per contrast for contrast_tuple in contrast_tuples: contrast_name = f"{contrast_tuple[1]}_vs_{contrast_tuple[2]}" - contrast_results = combined_results[combined_results['contrast'] == contrast_name] - significant_genes = contrast_results[contrast_results['significant']] - logger.info(f"Contrast {contrast_name}: {len(significant_genes)} significant genes") - + contrast_results = combined_results[ + combined_results["contrast"] == contrast_name + ] + significant_genes = contrast_results[contrast_results["significant"]] + logger.info( + f"Contrast {contrast_name}: {len(significant_genes)} significant genes" + ) + return combined_results From d9094b5a0faf607ca0d15db6d84c966545d44ad0 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Sun, 24 Aug 2025 16:45:09 +0200 Subject: [PATCH 28/52] add per-cell group outputs --- .../deseq2/config.vsh.yaml | 17 +- src/differential_expression/deseq2/script.py | 55 +++-- src/differential_expression/deseq2/test.py | 189 +++++++++++++++--- 3 files changed, 210 insertions(+), 51 deletions(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index ba537f754dc..d421059772b 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -2,7 +2,7 @@ name: deseq2 namespace: differential_expression description: | Performs differential expression analysis using DESeq2 on bulk samples or pseudobulk samples aggregated from single-cell data. - Note that this componentonly considers factors as explanatory variables, and excludes covariates from the analysis. + Note that this component only considers factors as explanatory variables, and excludes covariates from the analysis. authors: - __merge__: /src/authors/jakub_majercik.yaml @@ -67,7 +67,8 @@ argument_groups: multiple: true description: | Values to compare in the contrast column. - First value is baseline, following values are different treatments. + First value is the control group, following values are comparison groups. + Values must be present in the fields specified for the design formula. required: true example: ["ctrl", "stim"] - name: "--filter_genes_min_samples" @@ -102,12 +103,20 @@ argument_groups: - name: Outputs arguments: - - name: "--output" + - name: "--output_dir" alternatives: ["-o"] type: file - description: Output CSV file with DESeq2 results. + description: Output directory for DESeq2 results. direction: output required: true + - name: "--output_prefix" + type: string + description: | + Prefix for output CSV files. + If no cell groups are specified, the output file will be named "{prefix}.csv". + If cell groups are specified, files will be named "{prefix}_{cell_group}.csv". + default: "deseq2_results" + required: false resources: - type: python_script diff --git a/src/differential_expression/deseq2/script.py b/src/differential_expression/deseq2/script.py index 440dd561848..820e08e46c2 100644 --- a/src/differential_expression/deseq2/script.py +++ b/src/differential_expression/deseq2/script.py @@ -7,11 +7,14 @@ from pydeseq2.ds import DeseqStats import re import sys +import os +from pathlib import Path ## VIASH START par = { "input": "resources_test/annotation_test_data/TS_Blood_filtered_pseudobulk.h5mu", - "output": "deseq2_results.csv", + "output_dir": "./", + "output_prefix": "deseq2_results", "input_layer": None, "modality": "rna", "obs_cell_group": "cell_type", @@ -197,6 +200,24 @@ def deseq2_analysis(adata, contrast_tuples): return combined_results +def save_results_and_log_summary(results, output_file, cell_group=None): + """Save results to CSV and log summary statistics""" + logger.info(f"Saving results{f' for {cell_group}' if cell_group else ''} to {output_file}") + results.to_csv(output_file, index=False) + + # Log summary statistics + upregulated = results[(results["log2FoldChange"] > 0) & results["significant"]] + downregulated = results[(results["log2FoldChange"] < 0) & results["significant"]] + + summary_prefix = f"Summary{f' for {cell_group}' if cell_group else ''}:\n" + logger.info( + f"{summary_prefix}" + f" Total genes analyzed: {len(results)}\n" + f" Significant upregulated: {len(upregulated)}\n" + f" Significant downregulated: {len(downregulated)}\n" + ) + + def main(): # Load pseudobulk data logger.info(f"Loading pseudobulk data from {par['input']}") @@ -228,6 +249,10 @@ def main(): # Run DESeq2 analysis try: + # Create output directory + output_dir = Path(par["output_dir"]) + output_dir.mkdir(parents=True, exist_ok=True) + # Check if running per cell group or multi-factor if par["obs_cell_group"]: logger.info("Running DESeq2 analysis per cell group") @@ -238,7 +263,6 @@ def main(): .replace(f"{par['obs_cell_group']} +", "") ) - all_results = [] for cell_group in metadata[par["obs_cell_group"]].unique(): # Subset data for this cell group cell_mask = metadata[par["obs_cell_group"]] == cell_group @@ -254,31 +278,20 @@ def main(): # Add cell_group column to results cell_results[par["obs_cell_group"]] = cell_group - all_results.append(cell_results) - # Combine all results - results = pd.concat(all_results, ignore_index=True) + # Save results for this cell group + # Create safe filename by replacing problematic characters + safe_cell_group = str(cell_group).replace("/", "_").replace(" ", "_").replace("(", "").replace(")", "") + output_file = output_dir / f"{par['output_prefix']}_{safe_cell_group}.csv" + save_results_and_log_summary(cell_results, output_file, cell_group) else: dds = create_deseq2_dataset(counts, metadata, par["design_formula"]) results = deseq2_analysis(dds, contrast_tuples) - # Log summary statistics - upregulated = results[(results["log2FoldChange"] > 0) & results["significant"]] - downregulated = results[ - (results["log2FoldChange"] < 0) & results["significant"] - ] - - logger.info( - "Summary:\n" - f" Total genes analyzed: {len(results)}\n" - f" Significant upregulated: {len(upregulated)}\n" - f" Significant downregulated: {len(downregulated)}\n" - ) - - # Save results - logger.info(f"Saving results to {par['output']}") - results.to_csv(par["output"], index=False) + # Save results + output_file = output_dir / f"{par['output_prefix']}.csv" + save_results_and_log_summary(results, output_file) except Exception as e: logger.warning( diff --git a/src/differential_expression/deseq2/test.py b/src/differential_expression/deseq2/test.py index 299b6cb4ddb..057dcd638bc 100644 --- a/src/differential_expression/deseq2/test.py +++ b/src/differential_expression/deseq2/test.py @@ -3,6 +3,7 @@ import pytest import pandas as pd import re +from pathlib import Path ## VIASH START @@ -20,14 +21,14 @@ def pseudobulk_test_data_path(): def test_simple_deseq2_execution(run_component, tmp_path, pseudobulk_test_data_path): """Test basic DESeq2 execution with minimal parameters""" - output_path = tmp_path / "deseq2_results.csv" + output_dir = tmp_path / "deseq2_output" run_component( [ "--input", pseudobulk_test_data_path, - "--output", - str(output_path), + "--output_dir", + str(output_dir), "--design_formula", "~ treatment", "--contrast_column", @@ -39,10 +40,12 @@ def test_simple_deseq2_execution(run_component, tmp_path, pseudobulk_test_data_p ] ) - assert output_path.exists(), "Output CSV file does not exist" + assert output_dir.exists(), "Output directory does not exist" + output_file = output_dir / "deseq2_results.csv" # Default prefix + assert output_file.exists(), "Output CSV file does not exist" # Check the output file - results = pd.read_csv(output_path) + results = pd.read_csv(output_file) expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] assert all(col in results.columns for col in expected_columns), ( f"Expected columns {expected_columns} not found" @@ -53,15 +56,15 @@ def test_simple_deseq2_execution(run_component, tmp_path, pseudobulk_test_data_p def test_simple_deseq2_with_cell_group( run_component, tmp_path, pseudobulk_test_data_path ): - """Test basic DESeq2 execution with minimal parameters""" - output_path = tmp_path / "deseq2_results.csv" + """Test DESeq2 execution with cell groups - should create separate files""" + output_dir = tmp_path / "deseq2_output" run_component( [ "--input", pseudobulk_test_data_path, - "--output", - str(output_path), + "--output_dir", + str(output_dir), "--obs_cell_group", "cell_type", "--design_formula", @@ -75,27 +78,35 @@ def test_simple_deseq2_with_cell_group( ] ) - assert output_path.exists(), "Output CSV file does not exist" - - # Check the output file - results = pd.read_csv(output_path) - expected_columns = ["log2FoldChange", "pvalue", "padj", "significant", "cell_type"] - assert all(col in results.columns for col in expected_columns), ( - f"Expected columns {expected_columns} not found" - ) - assert len(results) > 0, "No results found in output" + assert output_dir.exists(), "Output directory does not exist" + + # Check that multiple CSV files exist (one per cell group) + csv_files = list(output_dir.glob("deseq2_results_*.csv")) # Default prefix + assert len(csv_files) > 0, "No cell group-specific CSV files found" + + # Check each file has expected structure + for csv_file in csv_files: + results = pd.read_csv(csv_file) + expected_columns = ["log2FoldChange", "pvalue", "padj", "significant", "cell_type"] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found in {csv_file}" + ) + assert len(results) > 0, f"No results found in {csv_file}" + + # Check that all rows have the same cell_type value + assert results["cell_type"].nunique() == 1, f"Multiple cell types found in {csv_file}" def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_path): """Test DESeq2 with complex design formula accounting for multiple factors""" - output_path = tmp_path / "deseq2_complex_results.csv" + output_dir = tmp_path / "deseq2_output" run_component( [ "--input", pseudobulk_test_data_path, - "--output", - str(output_path), + "--output_dir", + str(output_dir), "--design_formula", "~ cell_type + disease + treatment", "--contrast_column", @@ -111,24 +122,26 @@ def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_pa ] ) - assert output_path.exists(), "Output CSV file does not exist" + assert output_dir.exists(), "Output directory does not exist" + output_file = output_dir / "deseq2_results.csv" # Default prefix + assert output_file.exists(), "Output CSV file does not exist" - results = pd.read_csv(output_path) + results = pd.read_csv(output_file) assert "significant" in results.columns, "Significance column not found" assert len(results) > 0, "No results found for complex design" def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_path): """Test that invalid contrast column raises appropriate error""" - output_path = tmp_path / "deseq2_invalid_results.csv" + output_dir = tmp_path / "deseq2_output" with pytest.raises(subprocess.CalledProcessError) as err: run_component( [ "--input", pseudobulk_test_data_path, - "--output", - str(output_path), + "--output_dir", + str(output_dir), "--design_formula", "~ treatment", "--contrast_column", @@ -146,5 +159,129 @@ def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_p ) +def test_output_without_cell_group(run_component, tmp_path, pseudobulk_test_data_path): + """Test that without cell group specified, a single CSV is created in output directory""" + output_dir = tmp_path / "deseq2_output" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--design_formula", + "~ treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_dir.exists(), "Output directory does not exist" + + # Should have exactly one CSV file named deseq2_results.csv + output_file = output_dir / "deseq2_results.csv" # Default prefix + assert output_file.exists(), "Main output CSV file does not exist" + + # Check no cell group specific files exist + csv_files = list(output_dir.glob("deseq2_results_*.csv")) # Default prefix + assert len(csv_files) == 0, "Found cell group-specific files when none should exist" + + # Check the main file structure + results = pd.read_csv(output_file) + expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found" + ) + assert len(results) > 0, "No results found in output" + + +def test_custom_output_prefix(run_component, tmp_path, pseudobulk_test_data_path): + """Test custom output prefix functionality""" + output_dir = tmp_path / "deseq2_output" + custom_prefix = "my_custom_analysis" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--output_prefix", + custom_prefix, + "--design_formula", + "~ treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_dir.exists(), "Output directory does not exist" + + # Should have exactly one CSV file with custom prefix + output_file = output_dir / f"{custom_prefix}.csv" + assert output_file.exists(), f"Custom prefix output CSV file {output_file} does not exist" + + # Check the main file structure + results = pd.read_csv(output_file) + expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found" + ) + assert len(results) > 0, "No results found in output" + + +def test_custom_output_prefix_with_cell_groups(run_component, tmp_path, pseudobulk_test_data_path): + """Test custom output prefix with cell groups""" + output_dir = tmp_path / "deseq2_output" + custom_prefix = "celltype_analysis" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--output_prefix", + custom_prefix, + "--obs_cell_group", + "cell_type", + "--design_formula", + "~ treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_dir.exists(), "Output directory does not exist" + + # Check that multiple CSV files exist with custom prefix + csv_files = list(output_dir.glob(f"{custom_prefix}_*.csv")) + assert len(csv_files) > 0, f"No cell group-specific CSV files found with prefix {custom_prefix}" + + # Check each file has expected structure + for csv_file in csv_files: + results = pd.read_csv(csv_file) + expected_columns = ["log2FoldChange", "pvalue", "padj", "significant", "cell_type"] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found in {csv_file}" + ) + assert len(results) > 0, f"No results found in {csv_file}" + + # Check that all rows have the same cell_type value + assert results["cell_type"].nunique() == 1, f"Multiple cell types found in {csv_file}" + + if __name__ == "__main__": sys.exit(pytest.main([__file__])) From 3295ba0278780344ee224e2bab09e0ef0826bc6a Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Sun, 24 Aug 2025 17:18:56 +0200 Subject: [PATCH 29/52] fix linting --- src/differential_expression/deseq2/script.py | 6 ++-- src/differential_expression/deseq2/test.py | 37 +++++++++++++++----- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/differential_expression/deseq2/script.py b/src/differential_expression/deseq2/script.py index 820e08e46c2..181ec269f0d 100644 --- a/src/differential_expression/deseq2/script.py +++ b/src/differential_expression/deseq2/script.py @@ -98,7 +98,8 @@ def prepare_contrast_matrix(design_factors, contrast_column, metadata): for treatment in contrast_values[1:]: contrast_tuples.append((contrast_column, treatment, baseline)) logger.info( - f"Performing multiple contrasts against baseline '{baseline}': {[t[1] for t in contrast_tuples]}" + f"Performing multiple contrasts against baseline '{baseline}': " + f"{[t[1] for t in contrast_tuples]}" ) return contrast_tuples @@ -144,7 +145,8 @@ def create_deseq2_dataset(counts, metadata, design_formula): par["filter_genes_min_samples"] if par["filter_genes_min_samples"] else 1 ) logger.info( - f"Filtering genes by counts: removing genes that are present in less than {sample_count} samples" + f"Filtering genes by counts: removing genes that are present in " + f"less than {sample_count} samples" ) sc.pp.filter_genes(adata, min_cells=sample_count) diff --git a/src/differential_expression/deseq2/test.py b/src/differential_expression/deseq2/test.py index 057dcd638bc..415f025f683 100644 --- a/src/differential_expression/deseq2/test.py +++ b/src/differential_expression/deseq2/test.py @@ -3,7 +3,6 @@ import pytest import pandas as pd import re -from pathlib import Path ## VIASH START @@ -87,14 +86,22 @@ def test_simple_deseq2_with_cell_group( # Check each file has expected structure for csv_file in csv_files: results = pd.read_csv(csv_file) - expected_columns = ["log2FoldChange", "pvalue", "padj", "significant", "cell_type"] + expected_columns = [ + "log2FoldChange", + "pvalue", + "padj", + "significant", + "cell_type" + ] assert all(col in results.columns for col in expected_columns), ( f"Expected columns {expected_columns} not found in {csv_file}" ) assert len(results) > 0, f"No results found in {csv_file}" # Check that all rows have the same cell_type value - assert results["cell_type"].nunique() == 1, f"Multiple cell types found in {csv_file}" + assert results["cell_type"].nunique() == 1, ( + f"Multiple cell types found in {csv_file}" + ) def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_path): @@ -227,7 +234,9 @@ def test_custom_output_prefix(run_component, tmp_path, pseudobulk_test_data_path # Should have exactly one CSV file with custom prefix output_file = output_dir / f"{custom_prefix}.csv" - assert output_file.exists(), f"Custom prefix output CSV file {output_file} does not exist" + assert output_file.exists(), ( + f"Custom prefix output CSV file {output_file} does not exist" + ) # Check the main file structure results = pd.read_csv(output_file) @@ -238,7 +247,9 @@ def test_custom_output_prefix(run_component, tmp_path, pseudobulk_test_data_path assert len(results) > 0, "No results found in output" -def test_custom_output_prefix_with_cell_groups(run_component, tmp_path, pseudobulk_test_data_path): +def test_custom_output_prefix_with_cell_groups( + run_component, tmp_path, pseudobulk_test_data_path +): """Test custom output prefix with cell groups""" output_dir = tmp_path / "deseq2_output" custom_prefix = "celltype_analysis" @@ -268,19 +279,29 @@ def test_custom_output_prefix_with_cell_groups(run_component, tmp_path, pseudobu # Check that multiple CSV files exist with custom prefix csv_files = list(output_dir.glob(f"{custom_prefix}_*.csv")) - assert len(csv_files) > 0, f"No cell group-specific CSV files found with prefix {custom_prefix}" + assert len(csv_files) > 0, ( + f"No cell group-specific CSV files found with prefix {custom_prefix}" + ) # Check each file has expected structure for csv_file in csv_files: results = pd.read_csv(csv_file) - expected_columns = ["log2FoldChange", "pvalue", "padj", "significant", "cell_type"] + expected_columns = [ + "log2FoldChange", + "pvalue", + "padj", + "significant", + "cell_type" + ] assert all(col in results.columns for col in expected_columns), ( f"Expected columns {expected_columns} not found in {csv_file}" ) assert len(results) > 0, f"No results found in {csv_file}" # Check that all rows have the same cell_type value - assert results["cell_type"].nunique() == 1, f"Multiple cell types found in {csv_file}" + assert results["cell_type"].nunique() == 1, ( + f"Multiple cell types found in {csv_file}" + ) if __name__ == "__main__": From ef8a418db961f3af8e82d97e3d2a053ee9b9ce73 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Mon, 25 Aug 2025 09:15:53 +0200 Subject: [PATCH 30/52] fix linting --- src/differential_expression/deseq2/script.py | 17 ++++++-- src/differential_expression/deseq2/test.py | 46 ++++++++++---------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/src/differential_expression/deseq2/script.py b/src/differential_expression/deseq2/script.py index 181ec269f0d..930f9dec9d2 100644 --- a/src/differential_expression/deseq2/script.py +++ b/src/differential_expression/deseq2/script.py @@ -7,7 +7,6 @@ from pydeseq2.ds import DeseqStats import re import sys -import os from pathlib import Path ## VIASH START @@ -204,7 +203,9 @@ def deseq2_analysis(adata, contrast_tuples): def save_results_and_log_summary(results, output_file, cell_group=None): """Save results to CSV and log summary statistics""" - logger.info(f"Saving results{f' for {cell_group}' if cell_group else ''} to {output_file}") + logger.info( + f"Saving results{f' for {cell_group}' if cell_group else ''} to {output_file}" + ) results.to_csv(output_file, index=False) # Log summary statistics @@ -283,8 +284,16 @@ def main(): # Save results for this cell group # Create safe filename by replacing problematic characters - safe_cell_group = str(cell_group).replace("/", "_").replace(" ", "_").replace("(", "").replace(")", "") - output_file = output_dir / f"{par['output_prefix']}_{safe_cell_group}.csv" + safe_cell_group = ( + str(cell_group) + .replace("/", "_") + .replace(" ", "_") + .replace("(", "") + .replace(")", "") + ) + output_file = ( + output_dir / f"{par['output_prefix']}_{safe_cell_group}.csv" + ) save_results_and_log_summary(cell_results, output_file, cell_group) else: diff --git a/src/differential_expression/deseq2/test.py b/src/differential_expression/deseq2/test.py index 415f025f683..00b515f5ece 100644 --- a/src/differential_expression/deseq2/test.py +++ b/src/differential_expression/deseq2/test.py @@ -78,27 +78,27 @@ def test_simple_deseq2_with_cell_group( ) assert output_dir.exists(), "Output directory does not exist" - + # Check that multiple CSV files exist (one per cell group) csv_files = list(output_dir.glob("deseq2_results_*.csv")) # Default prefix assert len(csv_files) > 0, "No cell group-specific CSV files found" - + # Check each file has expected structure for csv_file in csv_files: results = pd.read_csv(csv_file) expected_columns = [ - "log2FoldChange", - "pvalue", - "padj", - "significant", - "cell_type" + "log2FoldChange", + "pvalue", + "padj", + "significant", + "cell_type", ] assert all(col in results.columns for col in expected_columns), ( f"Expected columns {expected_columns} not found in {csv_file}" ) assert len(results) > 0, f"No results found in {csv_file}" - - # Check that all rows have the same cell_type value + + # Check that all rows have the same cell_type value assert results["cell_type"].nunique() == 1, ( f"Multiple cell types found in {csv_file}" ) @@ -188,15 +188,15 @@ def test_output_without_cell_group(run_component, tmp_path, pseudobulk_test_data ) assert output_dir.exists(), "Output directory does not exist" - + # Should have exactly one CSV file named deseq2_results.csv output_file = output_dir / "deseq2_results.csv" # Default prefix assert output_file.exists(), "Main output CSV file does not exist" - + # Check no cell group specific files exist csv_files = list(output_dir.glob("deseq2_results_*.csv")) # Default prefix assert len(csv_files) == 0, "Found cell group-specific files when none should exist" - + # Check the main file structure results = pd.read_csv(output_file) expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] @@ -231,13 +231,13 @@ def test_custom_output_prefix(run_component, tmp_path, pseudobulk_test_data_path ) assert output_dir.exists(), "Output directory does not exist" - + # Should have exactly one CSV file with custom prefix output_file = output_dir / f"{custom_prefix}.csv" assert output_file.exists(), ( f"Custom prefix output CSV file {output_file} does not exist" ) - + # Check the main file structure results = pd.read_csv(output_file) expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] @@ -276,29 +276,29 @@ def test_custom_output_prefix_with_cell_groups( ) assert output_dir.exists(), "Output directory does not exist" - + # Check that multiple CSV files exist with custom prefix csv_files = list(output_dir.glob(f"{custom_prefix}_*.csv")) assert len(csv_files) > 0, ( f"No cell group-specific CSV files found with prefix {custom_prefix}" ) - + # Check each file has expected structure for csv_file in csv_files: results = pd.read_csv(csv_file) expected_columns = [ - "log2FoldChange", - "pvalue", - "padj", - "significant", - "cell_type" + "log2FoldChange", + "pvalue", + "padj", + "significant", + "cell_type", ] assert all(col in results.columns for col in expected_columns), ( f"Expected columns {expected_columns} not found in {csv_file}" ) assert len(results) > 0, f"No results found in {csv_file}" - - # Check that all rows have the same cell_type value + + # Check that all rows have the same cell_type value assert results["cell_type"].nunique() == 1, ( f"Multiple cell types found in {csv_file}" ) From c6a0700c658c6f2b6d37748a33ae53f7b98e8728 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Mon, 25 Aug 2025 15:02:07 +0200 Subject: [PATCH 31/52] update config, expand tests --- .../deseq2/config.vsh.yaml | 34 +- src/differential_expression/deseq2/script.R | 457 ++++++++++++++++++ src/differential_expression/deseq2/script.py | 322 ------------ src/differential_expression/deseq2/test.py | 102 ++-- 4 files changed, 532 insertions(+), 383 deletions(-) create mode 100644 src/differential_expression/deseq2/script.R delete mode 100644 src/differential_expression/deseq2/script.py diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index d421059772b..d8c3ac8c6f8 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -54,7 +54,7 @@ argument_groups: Design formula for DESeq2 analysis in R-style format. Specifies the statistical model to account for various factors. required: true - example: "~ cell_type + disease + treatment" + example: "~ disease + treatment" - name: "--contrast_column" type: string description: | @@ -106,7 +106,9 @@ argument_groups: - name: "--output_dir" alternatives: ["-o"] type: file - description: Output directory for DESeq2 results. + description: | + Output directory for DESeq2 results. + If cell groups are defined (using `--obs_cell_group`), the output folder will contain one CSV per cell group, otherwise it will contain a single output file. direction: output required: true - name: "--output_prefix" @@ -115,13 +117,12 @@ argument_groups: Prefix for output CSV files. If no cell groups are specified, the output file will be named "{prefix}.csv". If cell groups are specified, files will be named "{prefix}_{cell_group}.csv". - default: "deseq2_results" + default: "deseq2_analysis" required: false resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py + - type: r_script + path: script.R test_resources: - type: python_script path: test.py @@ -129,15 +130,20 @@ test_resources: engines: - type: docker - image: python:3.12 + image: rocker/r2u:22.04 setup: - - type: python - packages: - - pydeseq2~=0.5.2 - - zarr<3.0.0 - - type: python - __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml] - __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + - type: apt + packages: [ libhdf5-dev, libgeos-dev, hdf5-tools ] + - type: r + cran: [ hdf5r ] + github: scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a + bioc: [ DESeq2, "SingleCellExperiment" ] + test_setup: + - type: apt + packages: [ python3, python3-pip, python3-dev, python-is-python3 ] + # - type: python + # __merge__: [ /src/base/requirements/anndata_mudata.yaml, . ] + __merge__: [ ., /src/base/requirements/python_test_setup.yaml] runners: - type: executable diff --git a/src/differential_expression/deseq2/script.R b/src/differential_expression/deseq2/script.R new file mode 100644 index 00000000000..a669fcf7995 --- /dev/null +++ b/src/differential_expression/deseq2/script.R @@ -0,0 +1,457 @@ +library(DESeq2) +library(anndataR) +library(hdf5r) + +## VIASH START +par <- list( + input = paste0( + "resources_test/annotation_test_data/", + "TS_Blood_filtered_pseudobulk.h5mu" + ), + output_dir = "./test_deseq2_no_cellgroup/", + output_prefix = "deseq2_analysis", + input_layer = NULL, + modality = "rna", + obs_cell_group = NULL, + design_formula = "~ disease + treatment", + contrast_column = "treatment", + contrast_values = c("ctrl", "stim"), + filter_genes_min_samples = NULL, + p_adj_threshold = 0.05, + log2fc_threshold = 0.0, + filter_gene_patterns = c( + "MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+" + ), + var_gene_name = "feature_name" +) +meta <- list(resources_dir = "src/utils") +## VIASH END + +cat("Starting DESeq2 analysis...\n") + +h5mu_to_h5ad <- function(h5mu_path, modality_name) { + tmp_path <- tempfile(fileext = ".h5ad") + mod_location <- paste("mod", modality_name, sep = "/") + h5src <- hdf5r::H5File$new(h5mu_path, "r") + h5dest <- hdf5r::H5File$new(tmp_path, "w") + # Copy over the child objects and the child attributes from root + # Root cannot be copied directly because it always exists and + # copying does not allow overwriting. + children <- hdf5r::list.objects(h5src, + path = mod_location, + full.names = FALSE, recursive = FALSE + ) + for (child in children) { + h5dest$obj_copy_from( + h5src, paste(mod_location, child, sep = "/"), + paste0("/", child) + ) + } + # Also copy the root attributes + root_attrs <- hdf5r::h5attr_names(x = h5src) + for (attr in root_attrs) { + h5a <- h5src$attr_open(attr_name = attr) + robj <- h5a$read() + h5dest$create_attr_by_name( + attr_name = attr, + obj_name = ".", + robj = robj, + space = h5a$get_space(), + dtype = h5a$get_type() + ) + } + h5src$close() + h5dest$close() + + tmp_path +} + +# Check if expression data is normalized (row sums ≈ 1) +is_normalized <- function(layer) { + row_sums <- if (is(layer, "sparseMatrix") || is(layer, "dgCMatrix")) { + Matrix::rowSums(layer) + } else { + rowSums(layer) + } + + all(abs(row_sums - 1) < 1e-6, na.rm = TRUE) +} + +# Extract design factors from formula +parse_design_formula <- function(design_formula) { + design_factors <- all.vars(as.formula(design_formula)) + cat("Design formula:", design_formula, "\n") + cat("Extracted factors:", paste(design_factors, collapse = ", "), "\n") + design_factors +} + +# Validate and prepare contrast specifications +prepare_contrast_matrix <- function( + design_factors, contrast_column, metadata) { + # Validate required columns exist + required_columns <- unique(c(design_factors, contrast_column)) + missing_columns <- setdiff(required_columns, colnames(metadata)) + + if (length(missing_columns) > 0) { + stop(sprintf( + paste( + "Missing required columns in metadata: %s\n", + "Available metadata columns: %s" + ), + paste(missing_columns, collapse = ", "), + paste(colnames(metadata), collapse = ", ") + )) + } + + # Check contrast values exist + contrast_values <- par$contrast_values + available_values <- unique(metadata[[contrast_column]]) + missing_values <- setdiff(contrast_values, available_values) + + if (length(missing_values) > 0) { + stop(sprintf( + paste("Contrast values %s not found in %s.", + "Available values: %s", + sep = "\n" + ), + paste(missing_values, collapse = ", "), + contrast_column, + paste(available_values, collapse = ", ") + )) + } + + # Handle different contrast scenarios + if (length(contrast_values) == 2) { + # Pairwise comparison + treatment <- contrast_values[1] + baseline <- contrast_values[2] + contrast_spec <- c(contrast_column, treatment, baseline) + cat( + "Performing pairwise contrast:", contrast_column, + treatment, "vs", baseline, "\n" + ) + contrast_spec + } else if (length(contrast_values) > 2) { + # Multiple comparisons against first value (baseline) + baseline <- contrast_values[1] + contrast_specs <- list() + for (i in 2:length(contrast_values)) { + treatment <- contrast_values[i] + contrast_specs[[length(contrast_specs) + 1]] <- + c(contrast_column, treatment, baseline) + } + cat( + "Performing multiple contrasts against baseline '", baseline, "':", + paste(sapply(contrast_specs, function(x) x[2]), collapse = ", "), "\n" + ) + contrast_specs + } else { + stop(sprintf( + "Need at least 2 values for contrast, got: %s", + paste(contrast_values, collapse = ", ") + )) + } +} + +# Convert expression matrix to counts data frame +prepare_counts_matrix <- function(layer, var_names, obs_names) { + counts <- if (is(layer, "sparseMatrix") || is(layer, "dgCMatrix")) { + as.matrix(layer) + } else { + layer + } + + # Create properly named data frame (transpose for DESeq2 format) + counts_df <- data.frame(counts) + colnames(counts_df) <- var_names + rownames(counts_df) <- obs_names + + # Ensure integer counts (required for DESeq2) + counts_df[] <- lapply(counts_df, function(x) as.integer(round(x))) + counts_df +} + +# Filter genes based on regex patterns +filter_genes_by_pattern <- function(counts, gene_patterns) { + if (is.null(gene_patterns) || length(gene_patterns) == 0) { + return(counts) + } + + pattern_string <- paste(gene_patterns, collapse = "|") + before_filter <- ncol(counts) + genes_to_keep <- !grepl(pattern_string, colnames(counts)) + counts_filtered <- counts[, genes_to_keep, drop = FALSE] + + cat( + "Filtered out genes matching patterns:", + paste(gene_patterns, collapse = ", "), "\n" + ) + cat( + "Matrix shape before filtering:", nrow(counts), "x", before_filter, + "\n" + ) + cat( + "Matrix shape after filtering:", nrow(counts_filtered), "x", + ncol(counts_filtered), "\n" + ) + + counts_filtered +} + +# Create and configure DESeq2 dataset +create_deseq2_dataset <- function(counts, metadata, design_formula) { + cat("Creating DESeq2 dataset\n") + + # Ensure matching samples between counts and metadata + common_samples <- intersect(rownames(counts), rownames(metadata)) + if (length(common_samples) == 0) { + stop("No common samples found between counts and metadata") + } + + counts <- counts[common_samples, , drop = FALSE] + metadata <- metadata[common_samples, , drop = FALSE] + + # Create DESeqDataSet (transpose for gene x sample format) + dds <- DESeq2::DESeqDataSetFromMatrix( + countData = t(counts), + colData = metadata, + design = as.formula(design_formula) + ) + + # Filtering genes based on presence across samples + sample_count <- if ( + !is.null(par$filter_genes_min_samples) + ) { + par$filter_genes_min_samples + } else { + 1 + } + cat( + "Filtering genes by counts: removing genes that are present in less than", + sample_count, "samples\n" + ) + + keep <- rowSums(counts(dds) >= 1) >= sample_count + dds <- dds[keep, ] + + dds +} + +# Perform DESeq2 differential expression analysis +deseq2_analysis <- function(dds, contrast_specs) { + cat("Running DESeq2 analysis\n") + + dds <- DESeq2::DESeq(dds) + + # Ensure contrast_specs is a list + if (!is.list(contrast_specs)) { + contrast_specs <- list(contrast_specs) + } + + all_results <- lapply(seq_along(contrast_specs), function(i) { + contrast_spec <- contrast_specs[[i]] + cat( + "Performing statistical test for contrast:", + paste(contrast_spec, collapse = " "), "\n" + ) + + # Get DESeq2 results for this contrast + res <- DESeq2::results( + dds, + contrast = contrast_spec, alpha = par$p_adj_threshold + ) + + # Convert to data frame and add metadata + results_df <- as.data.frame(res) + results_df$gene_id <- rownames(results_df) + results_df$contrast <- paste0(contrast_spec[2], "_vs_", contrast_spec[3]) + results_df$treatment <- contrast_spec[2] + results_df$baseline <- contrast_spec[3] + results_df$abs_log2FoldChange <- abs(results_df$log2FoldChange) + results_df$significant <- ( + results_df$padj < par$p_adj_threshold & + !is.na(results_df$padj) & + abs(results_df$log2FoldChange) > par$log2fc_threshold + ) + + # Sort by effect size + results_df[order(results_df$log2FoldChange, decreasing = TRUE), ] + }) + + # Combine all results + combined_results <- do.call(rbind, all_results) + + # Log summary statistics + for (i in seq_along(contrast_specs)) { + contrast_spec <- contrast_specs[[i]] + contrast_name <- paste0(contrast_spec[2], "_vs_", contrast_spec[3]) + contrast_subset <- combined_results[ + combined_results$contrast == contrast_name, + ] + n_significant <- sum(contrast_subset$significant, na.rm = TRUE) + cat("Contrast", contrast_name, ":", n_significant, "significant genes\n") + } + + combined_results +} + +# Save results and print summary statistics +save_results_and_log_summary <- function( + results, output_file, cell_group = NULL) { + group_text <- if (!is.null(cell_group)) paste(" for", cell_group) else "" + cat("Saving results", group_text, "to", output_file, "\n") + + write.csv(results, output_file, row.names = FALSE) + + # Calculate summary statistics + sig_results <- results[results$significant & !is.na(results$significant), ] + upregulated <- sig_results[sig_results$log2FoldChange > 0, ] + downregulated <- sig_results[sig_results$log2FoldChange < 0, ] + + cat("Summary", group_text, ":\n") + cat(" Total genes analyzed:", nrow(results), "\n") + cat(" Significant upregulated:", nrow(upregulated), "\n") + cat(" Significant downregulated:", nrow(downregulated), "\n") +} + +# Main analysis workflow +main <- function() { + cat("Loading pseudobulk data from", par$input, "\n") + + # Load and prepare data + h5ad_path <- h5mu_to_h5ad(par$input, par$modality) + mod <- anndataR::read_h5ad(h5ad_path, as = "InMemoryAnnData") + metadata <- as.data.frame(mod$obs) + + # Get expression matrix + layer <- if (!is.null(par$input_layer)) { + mod$layers[[par$input_layer]] + } else { + mod$X + } + + if (is_normalized(layer)) { + stop("Input layer must contain raw counts.") + } + + # Prepare analysis components + cat("Preparing design formula\n") + design_factors <- parse_design_formula(par$design_formula) + + cat("Preparing contrast matrix\n") + contrast_specs <- prepare_contrast_matrix( + design_factors, par$contrast_column, metadata + ) + + cat("Preparing counts matrix for DESeq2\n") + var_names <- if ( + !is.null(par$var_gene_name) && par$var_gene_name %in% colnames(mod$var) + ) { + mod$var[[par$var_gene_name]] + } else { + mod$var_names + } + obs_names <- mod$obs_names + counts <- prepare_counts_matrix(layer, var_names, obs_names) + + # Apply gene filtering if specified + if ( + !is.null(par$filter_gene_patterns) && + length(par$filter_gene_patterns) > 0 + ) { + cat("Filtering genes based on patterns\n") + counts <- filter_genes_by_pattern(counts, par$filter_gene_patterns) + } + + # Ensure output directory exists + if (!dir.exists(par$output_dir)) { + dir.create(par$output_dir, recursive = TRUE) + } + + # Run analysis (per cell group or overall) + tryCatch( + { + if ( + !is.null(par$obs_cell_group) && + par$obs_cell_group %in% colnames(metadata) + ) { + run_per_cell_group_analysis(counts, metadata, contrast_specs) + } else { + run_overall_analysis(counts, metadata, contrast_specs) + } + cat("DESeq2 analysis completed successfully\n") + }, + error = function(e) { + cat("Error in analysis. Check input data and parameters:\n") + cat("Contrast column:", par$contrast_column, "\n") + cat("Contrast values:", paste(par$contrast_values, collapse = ", "), "\n") + cat("Number of samples:", nrow(metadata), "\n") + cat("Number of genes:", ncol(counts), "\n") + stop(e) + } + ) +} + +# Run analysis per cell group +run_per_cell_group_analysis <- function(counts, metadata, contrast_specs) { + cat("Running DESeq2 analysis per cell group\n") + + # Remove cell group from design formula + design_no_celltype <- gsub( + paste0("\\+\\s*", par$obs_cell_group), "", par$design_formula + ) + design_no_celltype <- gsub( + paste0(par$obs_cell_group, "\\s*\\+"), "", design_no_celltype + ) + design_no_celltype <- gsub("\\s+", " ", design_no_celltype) + + cell_groups <- unique(metadata[[par$obs_cell_group]]) + + for (cell_group in cell_groups) { + cat("Processing cell group:", cell_group, "\n") + + # Subset data + cell_mask <- metadata[[par$obs_cell_group]] == cell_group + counts_subset <- counts[cell_mask, , drop = FALSE] + metadata_subset <- metadata[cell_mask, , drop = FALSE] + + # Skip if insufficient samples + if (nrow(counts_subset) < 2) { + cat("Skipping cell group", cell_group, "- too few samples\n") + next + } + + # Run analysis + dds <- create_deseq2_dataset( + counts_subset, metadata_subset, design_no_celltype + ) + results <- deseq2_analysis(dds, contrast_specs) + results[[par$obs_cell_group]] <- cell_group + + # Save results + safe_name <- gsub("[/ \\(\\)]", "_", as.character(cell_group)) + safe_name <- gsub("_+", "_", safe_name) + output_file <- file.path( + par$output_dir, + paste0(par$output_prefix, "_", safe_name, ".csv") + ) + save_results_and_log_summary(results, output_file, cell_group) + } +} + +# Run overall analysis (all samples together) +run_overall_analysis <- function(counts, metadata, contrast_specs) { + dds <- create_deseq2_dataset(counts, metadata, par$design_formula) + results <- deseq2_analysis(dds, contrast_specs) + + output_file <- file.path( + par$output_dir, + paste0(par$output_prefix, ".csv") + ) + save_results_and_log_summary(results, output_file) +} + +# Run main function if script is executed directly +if (!interactive()) { + main() +} diff --git a/src/differential_expression/deseq2/script.py b/src/differential_expression/deseq2/script.py deleted file mode 100644 index 930f9dec9d2..00000000000 --- a/src/differential_expression/deseq2/script.py +++ /dev/null @@ -1,322 +0,0 @@ -import scanpy as sc -import mudata as mu -import pandas as pd -import numpy as np -import scipy.sparse as sp -from pydeseq2.dds import DeseqDataSet -from pydeseq2.ds import DeseqStats -import re -import sys -from pathlib import Path - -## VIASH START -par = { - "input": "resources_test/annotation_test_data/TS_Blood_filtered_pseudobulk.h5mu", - "output_dir": "./", - "output_prefix": "deseq2_results", - "input_layer": None, - "modality": "rna", - "obs_cell_group": "cell_type", - "design_formula": "~ disease + treatment", - "contrast_column": "treatment", - "contrast_values": [ - "ctrl", - "stim", - ], # [treatment, baseline] or [group1, group2, group3, ...] - "filter_genes_min_samples": None, - "p_adj_threshold": 0.05, - "log2fc_threshold": 0.0, - "filter_gene_patterns": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], - "var_gene_name": "feature_name", -} -meta = {"resources_dir": "src/utils"} -## VIASH END - -sys.path.append(meta["resources_dir"]) - -from setup_logger import setup_logger - -logger = setup_logger() - - -def is_normalized(layer): - if sp.issparse(layer): - row_sums = np.array(layer.sum(axis=1)).flatten() - else: - row_sums = layer.sum(axis=1) - - return np.allclose(row_sums, 1) - - -def parse_design_formula(design_formula): - design_factors = re.findall(r"\b(?!~)\w+", design_formula) - logger.info( - f"Design formula: {design_formula}\nExtracted factors: {design_factors}" - ) - return design_factors - - -def prepare_contrast_matrix(design_factors, contrast_column, metadata): - # Validate required columns exist - required_columns = ( - design_factors + [contrast_column] - if contrast_column not in design_factors - else design_factors - ) - missing_columns = [col for col in required_columns if col not in metadata.columns] - if missing_columns: - raise ValueError( - f"Missing required columns in metadata: {missing_columns}\n" - f"Available metadata columns: {list(metadata.columns)}" - ) - - # Check contrast values exist - contrast_values = par["contrast_values"] - available_values = metadata[contrast_column].unique() - missing_values = [val for val in contrast_values if val not in available_values] - if missing_values: - raise ValueError( - f"Contrast values {missing_values} not found in {contrast_column}.\n" - f"Available values: {available_values}" - ) - - # Handle multi-level contrasts - if len(contrast_values) == 2: - # Pairwise comparison (current behavior) - treatment, baseline = contrast_values - contrast_tuple = (contrast_column, treatment, baseline) - logger.info( - f"Performing pairwise contrast: {contrast_column} {treatment} vs {baseline}" - ) - return [contrast_tuple] - - elif len(contrast_values) > 2: - # Multiple comparisons - all vs first (baseline) - baseline = contrast_values[0] - contrast_tuples = [] - for treatment in contrast_values[1:]: - contrast_tuples.append((contrast_column, treatment, baseline)) - logger.info( - f"Performing multiple contrasts against baseline '{baseline}': " - f"{[t[1] for t in contrast_tuples]}" - ) - return contrast_tuples - - else: - raise ValueError(f"Need at least 2 values for contrast, got: {contrast_values}") - - -def prepare_counts_matrix(layer, mod): - counts = pd.DataFrame(layer, columns=mod.var_names, index=mod.obs_names) - counts = counts.astype(int) # Ensure counts are integers (required for DESeq2) - return counts - - -def filter_genes_by_pattern(counts, gene_pattern): - pattern_string = "|".join(gene_pattern) - before_filter = counts.shape[1] - - # Filter genes based on column names (gene names) - genes_to_keep = list( - ~pd.Series(counts.columns).str.contains(pattern_string, na=False) - ) - counts = counts.loc[:, genes_to_keep] - - logger.info( - f"Filtered out genes matching patterns {par['filter_gene_patterns']}:\n" - f"Counts matrix shape before bene pattern filtering: {before_filter}\n" - f"Counts matrix shape after gene pattern filtering: {counts.shape}" - ) - - return counts - - -def create_deseq2_dataset(counts, metadata, design_formula): - logger.info("Creating DESeq2 dataset") - adata = DeseqDataSet( - counts=counts, - metadata=metadata, - design=design_formula, - ) - - # Filtering genes based on presence across samples - sample_count = ( - par["filter_genes_min_samples"] if par["filter_genes_min_samples"] else 1 - ) - logger.info( - f"Filtering genes by counts: removing genes that are present in " - f"less than {sample_count} samples" - ) - sc.pp.filter_genes(adata, min_cells=sample_count) - - return adata - - -def deseq2_analysis(adata, contrast_tuples): - logger.info("Running DESeq2 analysis") - adata.deseq2() - - all_results = [] - - # Handle multiple contrasts - if not isinstance(contrast_tuples, list): - contrast_tuples = [contrast_tuples] - - for contrast_tuple in contrast_tuples: - logger.info(f"Performing statistical test for contrast: {contrast_tuple}") - stat_res = DeseqStats(adata, contrast=contrast_tuple) - stat_res.summary() - results = stat_res.results_df.reset_index() - - # Add contrast information - results["contrast"] = f"{contrast_tuple[1]}_vs_{contrast_tuple[2]}" - results["treatment"] = contrast_tuple[1] - results["baseline"] = contrast_tuple[2] - - # Sort by log2FoldChange - results = results.sort_values("log2FoldChange", ascending=False) - - # Add additional statistics - results["abs_log2FoldChange"] = results["log2FoldChange"].abs() - results["significant"] = (results["padj"] < par["p_adj_threshold"]) & ( - results["log2FoldChange"].abs() > par["log2fc_threshold"] - ) - - all_results.append(results) - - # Combine all contrast results - combined_results = pd.concat(all_results, ignore_index=True) - - # Log summary per contrast - for contrast_tuple in contrast_tuples: - contrast_name = f"{contrast_tuple[1]}_vs_{contrast_tuple[2]}" - contrast_results = combined_results[ - combined_results["contrast"] == contrast_name - ] - significant_genes = contrast_results[contrast_results["significant"]] - logger.info( - f"Contrast {contrast_name}: {len(significant_genes)} significant genes" - ) - - return combined_results - - -def save_results_and_log_summary(results, output_file, cell_group=None): - """Save results to CSV and log summary statistics""" - logger.info( - f"Saving results{f' for {cell_group}' if cell_group else ''} to {output_file}" - ) - results.to_csv(output_file, index=False) - - # Log summary statistics - upregulated = results[(results["log2FoldChange"] > 0) & results["significant"]] - downregulated = results[(results["log2FoldChange"] < 0) & results["significant"]] - - summary_prefix = f"Summary{f' for {cell_group}' if cell_group else ''}:\n" - logger.info( - f"{summary_prefix}" - f" Total genes analyzed: {len(results)}\n" - f" Significant upregulated: {len(upregulated)}\n" - f" Significant downregulated: {len(downregulated)}\n" - ) - - -def main(): - # Load pseudobulk data - logger.info(f"Loading pseudobulk data from {par['input']}") - mdata = mu.read_h5mu(par["input"]) - mod = mdata.mod[par["modality"]] - metadata = mod.obs.copy() - layer = mod.layers[par["input_layer"]] if par["input_layer"] else mod.X - if is_normalized(layer): - raise ValueError("Input layer must contain raw counts.") - - # Parse design formula to extract factors - logger.info("Preparing design formula") - design_factors = parse_design_formula(par["design_formula"]) - - # Preparing contrast matrix - logger.info("Preparing contrast matrix") - contrast_tuples = prepare_contrast_matrix( - design_factors, par["contrast_column"], metadata - ) - - # Prepare counts matrix - logger.info("Preparing counts matrix for DESeq2") - counts = prepare_counts_matrix(layer, mod) - - # Filter out unwanted gene patterns if specified (before DESeq2 analysis) - if par["filter_gene_patterns"]: - logger.info("Filtering genes based on gene patterns") - counts = filter_genes_by_pattern(counts, par["filter_gene_patterns"]) - - # Run DESeq2 analysis - try: - # Create output directory - output_dir = Path(par["output_dir"]) - output_dir.mkdir(parents=True, exist_ok=True) - - # Check if running per cell group or multi-factor - if par["obs_cell_group"]: - logger.info("Running DESeq2 analysis per cell group") - # Remove cell group from design formula for per-cell-type analysis - design_no_celltype = ( - par["design_formula"] - .replace(f"+ {par['obs_cell_group']}", "") - .replace(f"{par['obs_cell_group']} +", "") - ) - - for cell_group in metadata[par["obs_cell_group"]].unique(): - # Subset data for this cell group - cell_mask = metadata[par["obs_cell_group"]] == cell_group - counts_subset = counts.loc[cell_mask, :] - metadata_subset = metadata.loc[cell_mask, :] - - # Run DESeq2 for this cell group - logger.info(f"Running DESeq2 analysis for cell group {cell_group}...") - cell_dds = create_deseq2_dataset( - counts_subset, metadata_subset, design_no_celltype - ) - cell_results = deseq2_analysis(cell_dds, contrast_tuples) - - # Add cell_group column to results - cell_results[par["obs_cell_group"]] = cell_group - - # Save results for this cell group - # Create safe filename by replacing problematic characters - safe_cell_group = ( - str(cell_group) - .replace("/", "_") - .replace(" ", "_") - .replace("(", "") - .replace(")", "") - ) - output_file = ( - output_dir / f"{par['output_prefix']}_{safe_cell_group}.csv" - ) - save_results_and_log_summary(cell_results, output_file, cell_group) - - else: - dds = create_deseq2_dataset(counts, metadata, par["design_formula"]) - results = deseq2_analysis(dds, contrast_tuples) - - # Save results - output_file = output_dir / f"{par['output_prefix']}.csv" - save_results_and_log_summary(results, output_file) - - except Exception as e: - logger.warning( - "Check input data, design factors, and contrast specifications\n" - f"Contrast column: {par['contrast_column']}\n" - f"Contrast values: {par['contrast_values']}\n" - f"Number of samples: {len(mod.obs)}\n" - f"Number of genes: {len(mod.var)}\n" - ) - - raise e - - logger.info("DESeq2 analysis completed successfully") - - -if __name__ == "__main__": - main() diff --git a/src/differential_expression/deseq2/test.py b/src/differential_expression/deseq2/test.py index 00b515f5ece..83acb173a94 100644 --- a/src/differential_expression/deseq2/test.py +++ b/src/differential_expression/deseq2/test.py @@ -40,16 +40,19 @@ def test_simple_deseq2_execution(run_component, tmp_path, pseudobulk_test_data_p ) assert output_dir.exists(), "Output directory does not exist" - output_file = output_dir / "deseq2_results.csv" # Default prefix + output_file = output_dir / "deseq2_analysis.csv" # Default prefix assert output_file.exists(), "Output CSV file does not exist" + # Assert one CSV file was found + csv_files = list(output_dir.glob("deseq2_analysis*.csv")) # Default prefix + assert len(csv_files) == 1, "Should find exactly one CSV file in output folder" + # Check the output file results = pd.read_csv(output_file) expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] assert all(col in results.columns for col in expected_columns), ( f"Expected columns {expected_columns} not found" ) - assert len(results) > 0, "No results found in output" def test_simple_deseq2_with_cell_group( @@ -80,7 +83,7 @@ def test_simple_deseq2_with_cell_group( assert output_dir.exists(), "Output directory does not exist" # Check that multiple CSV files exist (one per cell group) - csv_files = list(output_dir.glob("deseq2_results_*.csv")) # Default prefix + csv_files = list(output_dir.glob("deseq2_analysis_*.csv")) # Default prefix assert len(csv_files) > 0, "No cell group-specific CSV files found" # Check each file has expected structure @@ -115,7 +118,7 @@ def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_pa "--output_dir", str(output_dir), "--design_formula", - "~ cell_type + disease + treatment", + "~ disease + treatment", "--contrast_column", "treatment", "--contrast_values", @@ -130,7 +133,7 @@ def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_pa ) assert output_dir.exists(), "Output directory does not exist" - output_file = output_dir / "deseq2_results.csv" # Default prefix + output_file = output_dir / "deseq2_analysis.csv" # Default prefix assert output_file.exists(), "Output CSV file does not exist" results = pd.read_csv(output_file) @@ -138,6 +141,50 @@ def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_pa assert len(results) > 0, "No results found for complex design" +def test_complex_design_formula_with_cell_groups( + run_component, tmp_path, pseudobulk_test_data_path +): + """Test that without cell group specified, a single CSV is created in output directory""" + output_dir = tmp_path / "deseq2_output" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--design_formula", + "~ treatment + disease", + "--obs_cell_group", + "cell_type", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_dir.exists(), "Output directory does not exist" + + # Check that cell group specific files exist + csv_files = list(output_dir.glob("deseq2_analysis_*.csv")) # Default prefix + assert len(csv_files) >= 1, "Could not find cell group-specific files" + + # Check the main file structure + for csv_file in csv_files: + results = pd.read_csv(csv_file) + expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found in {csv_file}" + ) + assert len(results) > 0, f"No results found in {csv_file}" + assert results["cell_type"].nunique() == 1, ( + f"Multiple cell types found in {csv_file} - should be one per file" + ) + + def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_path): """Test that invalid contrast column raises appropriate error""" output_dir = tmp_path / "deseq2_output" @@ -160,50 +207,11 @@ def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_p ] ) + # Updated regex to match actual R error format (no square brackets) assert re.search( - r"Missing required columns in metadata: \['nonexistent_column'\]", + r"Missing required columns in metadata: nonexistent_column", err.value.stdout.decode("utf-8"), - ) - - -def test_output_without_cell_group(run_component, tmp_path, pseudobulk_test_data_path): - """Test that without cell group specified, a single CSV is created in output directory""" - output_dir = tmp_path / "deseq2_output" - - run_component( - [ - "--input", - pseudobulk_test_data_path, - "--output_dir", - str(output_dir), - "--design_formula", - "~ treatment", - "--contrast_column", - "treatment", - "--contrast_values", - "stim", - "--contrast_values", - "ctrl", - ] - ) - - assert output_dir.exists(), "Output directory does not exist" - - # Should have exactly one CSV file named deseq2_results.csv - output_file = output_dir / "deseq2_results.csv" # Default prefix - assert output_file.exists(), "Main output CSV file does not exist" - - # Check no cell group specific files exist - csv_files = list(output_dir.glob("deseq2_results_*.csv")) # Default prefix - assert len(csv_files) == 0, "Found cell group-specific files when none should exist" - - # Check the main file structure - results = pd.read_csv(output_file) - expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] - assert all(col in results.columns for col in expected_columns), ( - f"Expected columns {expected_columns} not found" - ) - assert len(results) > 0, "No results found in output" + ), f"Expected error message not found: {err.value.stdout.decode('utf-8')}" def test_custom_output_prefix(run_component, tmp_path, pseudobulk_test_data_path): From 3a3eacaa9f422eddab082f030a7348745ed894de Mon Sep 17 00:00:00 2001 From: dorien-er Date: Mon, 25 Aug 2025 15:29:07 +0200 Subject: [PATCH 32/52] expand tests --- .../deseq2/config.vsh.yaml | 2 +- src/differential_expression/deseq2/script.R | 23 ++++--- src/differential_expression/deseq2/test.py | 68 +++++++++++++++++-- 3 files changed, 77 insertions(+), 16 deletions(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index d8c3ac8c6f8..5e88c53e2d4 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -137,7 +137,7 @@ engines: - type: r cran: [ hdf5r ] github: scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a - bioc: [ DESeq2, "SingleCellExperiment" ] + bioc: [ DESeq2 ] test_setup: - type: apt packages: [ python3, python3-pip, python3-dev, python-is-python3 ] diff --git a/src/differential_expression/deseq2/script.R b/src/differential_expression/deseq2/script.R index a669fcf7995..66e8c9f044b 100644 --- a/src/differential_expression/deseq2/script.R +++ b/src/differential_expression/deseq2/script.R @@ -123,25 +123,26 @@ prepare_contrast_matrix <- function( # Handle different contrast scenarios if (length(contrast_values) == 2) { # Pairwise comparison - treatment <- contrast_values[1] - baseline <- contrast_values[2] - contrast_spec <- c(contrast_column, treatment, baseline) + comparison_group <- contrast_values[1] + control_group <- contrast_values[2] + contrast_spec <- c(contrast_column, comparison_group, control_group) cat( "Performing pairwise contrast:", contrast_column, - treatment, "vs", baseline, "\n" + comparison_group, "vs", control_group, "\n" ) contrast_spec } else if (length(contrast_values) > 2) { - # Multiple comparisons against first value (baseline) - baseline <- contrast_values[1] + # Multiple comparisons against first value (control_group) + control_group <- contrast_values[1] contrast_specs <- list() for (i in 2:length(contrast_values)) { - treatment <- contrast_values[i] + comparison_group <- contrast_values[i] contrast_specs[[length(contrast_specs) + 1]] <- - c(contrast_column, treatment, baseline) + c(contrast_column, comparison_group, control_group) } cat( - "Performing multiple contrasts against baseline '", baseline, "':", + "Performing multiple contrasts against control_group '", + control_group, "':", paste(sapply(contrast_specs, function(x) x[2]), collapse = ", "), "\n" ) contrast_specs @@ -265,8 +266,8 @@ deseq2_analysis <- function(dds, contrast_specs) { results_df <- as.data.frame(res) results_df$gene_id <- rownames(results_df) results_df$contrast <- paste0(contrast_spec[2], "_vs_", contrast_spec[3]) - results_df$treatment <- contrast_spec[2] - results_df$baseline <- contrast_spec[3] + results_df$comparison_group <- contrast_spec[2] + results_df$control_group <- contrast_spec[3] results_df$abs_log2FoldChange <- abs(results_df$log2FoldChange) results_df$significant <- ( results_df$padj < par$p_adj_threshold & diff --git a/src/differential_expression/deseq2/test.py b/src/differential_expression/deseq2/test.py index 83acb173a94..bac124b53a4 100644 --- a/src/differential_expression/deseq2/test.py +++ b/src/differential_expression/deseq2/test.py @@ -40,20 +40,53 @@ def test_simple_deseq2_execution(run_component, tmp_path, pseudobulk_test_data_p ) assert output_dir.exists(), "Output directory does not exist" - output_file = output_dir / "deseq2_analysis.csv" # Default prefix - assert output_file.exists(), "Output CSV file does not exist" # Assert one CSV file was found csv_files = list(output_dir.glob("deseq2_analysis*.csv")) # Default prefix assert len(csv_files) == 1, "Should find exactly one CSV file in output folder" + output_file = output_dir / "deseq2_analysis.csv" # Default prefix + assert output_file.exists(), "Output CSV file does not exist" # Check the output file results = pd.read_csv(output_file) - expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] + expected_columns = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "significant", + "gene_id", + "contrast", + "comparison_group", + "control_group", + "abs_log2FoldChange", + ] assert all(col in results.columns for col in expected_columns), ( f"Expected columns {expected_columns} not found" ) + expected_float_cols = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "abs_log2FoldChange", + ] + float_cols = results.select_dtypes(include=["float"]).columns.tolist() + assert all(col in float_cols for col in expected_float_cols), ( + f"Expected float columns {expected_float_cols} not found" + ) + + expected_obj_cols = ["gene_id", "contrast", "comparison_group", "control_group"] + obj_cols = results.select_dtypes(include=["object"]).columns.tolist() + assert all(col in obj_cols for col in expected_obj_cols), ( + f"Expected object columns {expected_obj_cols} not found" + ) + def test_simple_deseq2_with_cell_group( run_component, tmp_path, pseudobulk_test_data_path @@ -90,11 +123,18 @@ def test_simple_deseq2_with_cell_group( for csv_file in csv_files: results = pd.read_csv(csv_file) expected_columns = [ + "baseMean", "log2FoldChange", + "lfcSE", + "stat", "pvalue", "padj", "significant", - "cell_type", + "gene_id", + "contrast", + "comparison_group", + "control_group", + "abs_log2FoldChange", ] assert all(col in results.columns for col in expected_columns), ( f"Expected columns {expected_columns} not found in {csv_file}" @@ -106,6 +146,26 @@ def test_simple_deseq2_with_cell_group( f"Multiple cell types found in {csv_file}" ) + expected_float_cols = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "abs_log2FoldChange", + ] + float_cols = results.select_dtypes(include=["float"]).columns.tolist() + assert all(col in float_cols for col in expected_float_cols), ( + f"Expected float columns {expected_float_cols} not found" + ) + + expected_obj_cols = ["gene_id", "contrast", "comparison_group", "control_group"] + obj_cols = results.select_dtypes(include=["object"]).columns.tolist() + assert all(col in obj_cols for col in expected_obj_cols), ( + f"Expected object columns {expected_obj_cols} not found" + ) + def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_path): """Test DESeq2 with complex design formula accounting for multiple factors""" From 34588f21f836c094abc788f01461c584b7c607ff Mon Sep 17 00:00:00 2001 From: dorien-er Date: Mon, 25 Aug 2025 18:41:49 +0200 Subject: [PATCH 33/52] update workflow and integration tests --- src/differential_expression/deseq2/script.R | 4 +- .../pseudobulk_deseq2/config.vsh.yaml | 24 +++++--- .../pseudobulk_deseq2/main.nf | 7 ++- .../pseudobulk_deseq2/test.nf | 61 ++++++++++++++++--- .../pseudobulk_deseq2/config.vsh.yaml | 25 ++++++++ .../pseudobulk_deseq2/script.py | 59 ++++++++++++++++++ 6 files changed, 158 insertions(+), 22 deletions(-) create mode 100644 src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml create mode 100644 src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/script.py diff --git a/src/differential_expression/deseq2/script.R b/src/differential_expression/deseq2/script.R index 66e8c9f044b..e34fa4a0e31 100644 --- a/src/differential_expression/deseq2/script.R +++ b/src/differential_expression/deseq2/script.R @@ -12,8 +12,8 @@ par <- list( output_prefix = "deseq2_analysis", input_layer = NULL, modality = "rna", - obs_cell_group = NULL, - design_formula = "~ disease + treatment", + obs_cell_group = "cell_type", + design_formula = "~ treatment", contrast_column = "treatment", contrast_values = c("ctrl", "stim"), filter_genes_min_samples = NULL, diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml index 1267ed5b9c9..13aa4e16d6b 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -30,10 +30,12 @@ argument_groups: type: string required: false description: Input layer to use. If None, X is used. This layer must contain raw counts. - - name: --obs_label + - name: --obs_cell_group type: string required: true - description: .obs field containing the variable to group on. Typically this field contains cell groups such as annotated cell types or clusters. + description: | + .obs field containing the variable to group on. Typically this field contains cell groups such as annotated cell types or clusters. + If provided, will generate DESeq2 analysis per cell group. example: cell_type - name: --obs_groups type: string @@ -47,12 +49,6 @@ argument_groups: This is used for reporting gene names in the output. default: "feature_name" required: false - - name: --obs_cell_group - type: string - description: | - .obs field containing the cell group information, for example per cell type or per cell cluster. - If true, performs per-cell-group analysis with cell-group-specific results. - required: false - name: Pseudobulk Options arguments: @@ -130,11 +126,19 @@ argument_groups: - name: Outputs arguments: - - name: --output + - name: "--output" + alternatives: ["-o"] type: file - description: Output CSV file with DESeq2 results. + description: | + Output directory for DESeq2 results, containing one CSV file per cell group (as defined by `--obs_cell_group`) direction: output required: true + - name: "--output_prefix" + type: string + description: | + Prefix for output CSV files: files will be named "{prefix}_{cell_group}.csv" + default: "deseq2_analysis" + required: false dependencies: - name: differential_expression/create_pseudobulk diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf index 146275a4e8d..c4ee6ac1013 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf @@ -11,7 +11,7 @@ workflow run_wf { input: "input", modality: "modality", input_layer: "input_layer", - obs_label: "obs_label", + obs_label: "obs_cell_group", obs_groups: "obs_groups", aggregation_method: "aggregation_method", min_obs_per_sample: "min_obs_per_sample", @@ -36,9 +36,10 @@ workflow run_wf { p_adj_threshold: "p_adj_threshold", log2fc_threshold: "log2fc_threshold", filter_gene_patterns: "filter_gene_patterns", - output: "output" + output_dir: "output", + output_prefix: "output_prefix" ], - toState: [ "output": "output" ] + toState: [ "output": "output_dir" ] ) | setState([ "output" ]) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf index 76a97f08bb4..02de4ed17d9 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf @@ -1,6 +1,8 @@ nextflow.enable.dsl=2 include { pseudobulk_deseq2 } from params.rootDir + "/target/nextflow/workflows/differential_expression/pseudobulk_deseq2/main.nf" +include { pseudobulk_deseq2_test } from params.rootDir + "/target/_test/nextflow/test_workflows/differential_expression/pseudobulk_deseq2_test/main.nf" + params.resources_test = params.rootDir + "/resources_test" workflow test_wf { @@ -12,12 +14,13 @@ workflow test_wf { [ id: "simple_execution_test", input: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), - obs_label: "cell_type", - obs_groups: "treatment", - design_formula: "~ cell_type + treatment", + obs_cell_group: "cell_type", + obs_groups: ["treatment", "disease"], + min_obs_per_sample: 5, + design_formula: "~ treatment", contrast_column: "treatment", contrast_values: ["ctrl", "stim"], - output: "simple_execution_test_output.csv" + output: "simple_execution_test_output" ] ]) | map{ state -> [state.id, state] } @@ -32,10 +35,54 @@ workflow test_wf { // check output def state = output[1] assert state instanceof Map : "State should be a map. Found: ${state}" - assert state.containsKey("output") : "Output should contain key 'output'." - assert state.output.isFile() : "'output' should be a file." - assert state.output.toString().endsWith(".csv") : "Output file should end with '.csv'. Found: ${state.output}" + assert state.containsKey("output") : "Output should contain key 'output'. found: ${state}" + assert state.output.isDirectory() : "'output' should be a directory." + def deseqFiles = state.output.listFiles().findAll { file -> + file.name.matches(/deseq2_analysis_.*\.csv/) + } + assert deseqFiles.size() == 4, "Output directory should contain exactly 4 deseq2_analysis_*.csv files. Found: ${deseqFiles.size()} files: ${deseqFiles.collect { it.name }}" "Output: $output" } + + | flatMap { output-> + def id = output[0] + def state = output[1] + + // Find all CSV files and create separate entries + def deseqFiles = state.output.listFiles().findAll { file -> + file.name.matches(/deseq2_analysis_.*\.csv/) + } + + deseqFiles.collect { csvFile -> + def cellType = csvFile.name.replaceAll(/deseq2_analysis_(.*)\.csv/, '$1') + def newId = "${id}_${cellType}" + def newState = state.clone() + [newId, ["input": csvFile]] + + } + } + + | view { output -> "After flatMap: $output" } + | pseudobulk_deseq2_test.run( + fromState: [ + "input": "input" + ] + ) + + output_ch + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 4 : "output channel should contain 4 events" + + def sortedIds = output_list.collect{it[0]}.sort() + def expectedIds = [ + "simple_execution_test_classical_monocyte", + "simple_execution_test_erythrocyte", + "simple_execution_test_neutrophil", + "simple_execution_test_plasma_cell" + ] + + assert sortedIds == expectedIds : "IDs don't match. Expected: ${expectedIds}, found: ${sortedIds}" + } } \ No newline at end of file diff --git a/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml new file mode 100644 index 00000000000..55be62a1379 --- /dev/null +++ b/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -0,0 +1,25 @@ +name: "pseudobulk_deseq2_test" +namespace: "test_workflows/differential_expression" +scope: "test" +description: "This component tests the output of the pseudobulk_deseq2 differential expression analysis workflow." +authors: + - __merge__: /src/authors/dorien_roosen.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu + +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: python:3.13-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/script.py b/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/script.py new file mode 100644 index 00000000000..297bac204c7 --- /dev/null +++ b/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/script.py @@ -0,0 +1,59 @@ +import pandas as pd +import sys +import pytest + +## VIASH START +par = {"input": "test/deseq_analysis_classical_monocyte.csv"} +## VIASH END + + +def test_run(): + results = pd.read_csv(par["input"]) + + # Make sure expected columns are present + expected_columns = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "significant", + "gene_id", + "contrast", + "comparison_group", + "control_group", + "abs_log2FoldChange", + ] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found" + ) + + # Make sure columns have expected dtypes + assert results["significant"].dtypes == "bool", ( + "Expected 'significant' column to be of type bool" + ) + + expected_float_cols = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "abs_log2FoldChange", + ] + float_cols = results.select_dtypes(include=["float"]).columns.tolist() + assert all(col in float_cols for col in expected_float_cols), ( + f"Expected float columns {expected_float_cols} not found" + ) + + expected_obj_cols = ["gene_id", "contrast", "comparison_group", "control_group"] + obj_cols = results.select_dtypes(include=["object"]).columns.tolist() + assert all(col in obj_cols for col in expected_obj_cols), ( + f"Expected object columns {expected_obj_cols} not found" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"])) From f090399a42b837ef602d46e4ee6c5d2cb1e5e845 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Mon, 25 Aug 2025 18:44:51 +0200 Subject: [PATCH 34/52] update changelog --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6caffa3d0e3..76cb380f223 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +# openpipelines 3.x.x + +## EXPERIMENTAL + +* `differential_expression/deseq2`: Performs differential expression analysis using DESeq2 on bulk or pseudobulk datasets (PR #1044). + +* `workflows/differential_expression/pseudobulk_deseq2`: Workflow for generating pseudobulk samples from single-cell data followed by DESeq2 differential expression analysis (PR #1044) + # openpipelines 3.0.0 ## BREAKING CHANGES From 5201d710467c8e076fbd796aa45e20fedfc9d60c Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 09:30:36 +0200 Subject: [PATCH 35/52] test asci --- src/differential_expression/deseq2/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index 5e88c53e2d4..a1716810834 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -54,7 +54,7 @@ argument_groups: Design formula for DESeq2 analysis in R-style format. Specifies the statistical model to account for various factors. required: true - example: "~ disease + treatment" + # example: "~ disease + treatment" - name: "--contrast_column" type: string description: | From ddf4f85cba1122c9efb5a61be8b0279a7535f4ba Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 10:03:58 +0200 Subject: [PATCH 36/52] remove non ascii --- src/differential_expression/deseq2/config.vsh.yaml | 2 +- src/differential_expression/deseq2/script.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index a1716810834..5e88c53e2d4 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -54,7 +54,7 @@ argument_groups: Design formula for DESeq2 analysis in R-style format. Specifies the statistical model to account for various factors. required: true - # example: "~ disease + treatment" + example: "~ disease + treatment" - name: "--contrast_column" type: string description: | diff --git a/src/differential_expression/deseq2/script.R b/src/differential_expression/deseq2/script.R index e34fa4a0e31..17bc924beeb 100644 --- a/src/differential_expression/deseq2/script.R +++ b/src/differential_expression/deseq2/script.R @@ -66,7 +66,7 @@ h5mu_to_h5ad <- function(h5mu_path, modality_name) { tmp_path } -# Check if expression data is normalized (row sums ≈ 1) +# Check if expression data is normalized (row sums =~ 1) is_normalized <- function(layer) { row_sums <- if (is(layer, "sparseMatrix") || is(layer, "dgCMatrix")) { Matrix::rowSums(layer) From 38c78c92b3d7307000167a918f379a704a9a8c6b Mon Sep 17 00:00:00 2001 From: Dorien <41797896+dorien-er@users.noreply.github.com> Date: Tue, 26 Aug 2025 12:26:12 +0200 Subject: [PATCH 37/52] Apply suggestions from code review Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> --- src/differential_expression/deseq2/config.vsh.yaml | 8 +++++--- .../pseudobulk_deseq2/config.vsh.yaml | 10 ++-------- .../differential_expression/pseudobulk_deseq2/test.nf | 2 -- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index 5e88c53e2d4..6691010f140 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -33,12 +33,11 @@ argument_groups: type: string required: false description: "Input layer to use. If None, X is used. This layer must contain raw counts." - - name: "--var_gene_name" + - name: "--var_gene_names" type: string description: | Column name in the variable features table that contains gene symbols. This is used for reporting gene names in the output. - default: "feature_name" required: false - name: "--obs_cell_group" type: string @@ -76,10 +75,12 @@ argument_groups: min: 1 description: | Minimum number of samples a gene must be expressed in to be included in the analysis. - If None, no filtering is applied. + If no value is provided, no filtering is applied. required: false - name: "--p_adj_threshold" type: double + min: 0 + max: 1 description: | Adjusted p-value threshold for significance. Genes with adjusted p-values below this threshold will be considered significant. @@ -87,6 +88,7 @@ argument_groups: required: false - name: "--log2fc_threshold" type: double + min: 0 description: | Log2 fold change threshold for significance. Genes with absolute log2 fold change above this threshold will be considered significant. diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml index 13aa4e16d6b..48d2d58e12e 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -12,11 +12,6 @@ authors: argument_groups: - name: Inputs arguments: - - name: --id - type: string - required: true - description: ID of the sample. - example: foo - name: --input type: file required: true @@ -42,13 +37,12 @@ argument_groups: multiple: true description: .obs fields containing the experimental condition(s) to create pseudobulk samples for. example: ["treatment", "disease"] - - name: "--var_gene_name" + - name: "--var_gene_names" type: string description: | Column name in the variable features table that contains gene symbols. This is used for reporting gene names in the output. - default: "feature_name" - required: false + required: true - name: Pseudobulk Options arguments: diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf index 02de4ed17d9..e7150b420e8 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf @@ -57,9 +57,7 @@ workflow test_wf { deseqFiles.collect { csvFile -> def cellType = csvFile.name.replaceAll(/deseq2_analysis_(.*)\.csv/, '$1') def newId = "${id}_${cellType}" - def newState = state.clone() [newId, ["input": csvFile]] - } } From 35ba8c5af16add4448dad23f2b70992bc8d3a008 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 14:39:22 +0200 Subject: [PATCH 38/52] wip --- .../deseq2/config.vsh.yaml | 9 +++--- src/differential_expression/deseq2/script.R | 6 ++-- .../pseudobulk_deseq2/config.vsh.yaml | 3 ++ .../pseudobulk_deseq2/main.nf | 29 +++++++++++++++++++ 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index 6691010f140..67c678c546c 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -50,8 +50,8 @@ argument_groups: - name: "--design_formula" type: string description: | - Design formula for DESeq2 analysis in R-style format. - Specifies the statistical model to account for various factors. + The formula should be a tilde (~) followed by the variables with plus signs between them. + The design formula is used to estimate the dispersions and to estimate the log2 fold changes of the model. required: true example: "~ disease + treatment" - name: "--contrast_column" @@ -82,7 +82,7 @@ argument_groups: min: 0 max: 1 description: | - Adjusted p-value threshold for significance. + Adjusted p-value threshold for significance, corrected with the Benjamini and Hochberg method. Genes with adjusted p-values below this threshold will be considered significant. default: 0.05 required: false @@ -138,13 +138,12 @@ engines: packages: [ libhdf5-dev, libgeos-dev, hdf5-tools ] - type: r cran: [ hdf5r ] + # Use pinned version to avoid H5 file access issues with latest stable release (0.99.0) github: scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a bioc: [ DESeq2 ] test_setup: - type: apt packages: [ python3, python3-pip, python3-dev, python-is-python3 ] - # - type: python - # __merge__: [ /src/base/requirements/anndata_mudata.yaml, . ] __merge__: [ ., /src/base/requirements/python_test_setup.yaml] runners: diff --git a/src/differential_expression/deseq2/script.R b/src/differential_expression/deseq2/script.R index 17bc924beeb..dea8078918f 100644 --- a/src/differential_expression/deseq2/script.R +++ b/src/differential_expression/deseq2/script.R @@ -22,7 +22,7 @@ par <- list( filter_gene_patterns = c( "MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+" ), - var_gene_name = "feature_name" + var_gene_names = "feature_name" ) meta <- list(resources_dir = "src/utils") ## VIASH END @@ -346,9 +346,9 @@ main <- function() { cat("Preparing counts matrix for DESeq2\n") var_names <- if ( - !is.null(par$var_gene_name) && par$var_gene_name %in% colnames(mod$var) + !is.null(par$var_gene_names) && par$var_gene_names %in% colnames(mod$var) ) { - mod$var[[par$var_gene_name]] + mod$var[[par$var_gene_names]] } else { mod$var_names } diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml index 48d2d58e12e..320de83defb 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -137,6 +137,9 @@ argument_groups: dependencies: - name: differential_expression/create_pseudobulk - name: differential_expression/deseq2 + - name: filter/filter_with_counts + - name: filter/filter_genes_by_pattern + - name: filter/do_filter resources: - type: nextflow_script diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf index c4ee6ac1013..d193087d453 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf @@ -21,6 +21,35 @@ workflow run_wf { toState: [ "input": "output" ] ) + | filter_with_counts.run( + fromState: [ + id: "id", + input: "input", + modality: "modality", + layer: "input_layer", + min_counts: "min_obs_per_sample", + + obs_cell_count: "obs_cell_count" + ], + args: [ + do_subset: "True", + + ] + toState: [ "input": "output" ] + ) + + | do_filter.run( + + ) + + | filter_genes_by_pattern.run( + + ) + + | do_filter.run( + + ) + | deseq2.run( fromState: [ id: "id", From cea3fbf3a958a91b5ad2bbade11e13b6f4086844 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 15:38:07 +0200 Subject: [PATCH 39/52] implement compponent --- .../filter_with_pattern/config.vsh.yaml | 79 ++++++++++++ src/filter/filter_with_pattern/script.py | 65 ++++++++++ src/filter/filter_with_pattern/test.py | 114 ++++++++++++++++++ 3 files changed, 258 insertions(+) create mode 100644 src/filter/filter_with_pattern/config.vsh.yaml create mode 100644 src/filter/filter_with_pattern/script.py create mode 100644 src/filter/filter_with_pattern/test.py diff --git a/src/filter/filter_with_pattern/config.vsh.yaml b/src/filter/filter_with_pattern/config.vsh.yaml new file mode 100644 index 00000000000..7a7f6aaa6cf --- /dev/null +++ b/src/filter/filter_with_pattern/config.vsh.yaml @@ -0,0 +1,79 @@ +name: filter_with_pattern +namespace: "filter" +scope: "public" +description: | + Filter a MuData object based on gene names using a regex. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + - name: "--var_gene_names" + type: string + description: | + The .var field containing the gene names to be filtered. If not provided, `.var.index` will be used. + example: "gene_symbol" + required: false + - name: "--pattern" + type: string + multiple: true + example: ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"] + description: | + A regex pattern to filter the gene names. + - name: Outputs + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--do_subset" + type: boolean_true + description: Whether to subset before storing the output. + - name: "--var_name_filter" + type: string + default: "filter_with_pattern" + description: In which .var slot to store a boolean array corresponding to which variables should be removed. + __merge__: [., /src/base/h5_compression_argument.yaml] + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/utils/compress_h5mu.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] \ No newline at end of file diff --git a/src/filter/filter_with_pattern/script.py b/src/filter/filter_with_pattern/script.py new file mode 100644 index 00000000000..0707c8a9de8 --- /dev/null +++ b/src/filter/filter_with_pattern/script.py @@ -0,0 +1,65 @@ +import mudata as mu +import sys +import pandas as pd + +### VIASH START +par = { + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "rna", + "output": "output.h5mu", + "pattern": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], + "var_name_filter": "filter_with_pattern", + "var_gene_names": "gene_symbol", + "do_subset": True, + "output_compression": None, +} +meta = {"resources_dir": "src/utils"} +### VIASH END + + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger +from compress_h5mu import write_h5ad_to_h5mu_with_compression + +logger = setup_logger() + +logger.info("Reading input data from %s, modality %s", par["input"], par["modality"]) +modality_data = mu.read_h5ad(par["input"], mod=par["modality"]) + +logger.info("\tUnfiltered data: %s", modality_data) + +gene_names = ( + modality_data.var[par["var_gene_names"]].values + if par["var_gene_names"] + else modality_data.var_names.values +) + +pattern_string = "|".join(par["pattern"]) + +# Filter genes based on regex pattern +genes_to_keep = list(~pd.Series(gene_names).str.contains(pattern_string, na=False)) + +logger.info( + f"Number of genes to keep: {sum(genes_to_keep)} out of {len(genes_to_keep)}" +) +modality_data.var[par["var_name_filter"]] = genes_to_keep + +if par["do_subset"]: + modality_data = modality_data[:, genes_to_keep] + logger.info("\tFiltered data: %s", modality_data) + +logger.info( + "Writing output data to %s with compression %s", + par["output"], + par["output_compression"], +) +write_h5ad_to_h5mu_with_compression( + par["output"], + par["input"], + par["modality"], + modality_data, + par["output_compression"], +) + + +logger.info("Finished") diff --git a/src/filter/filter_with_pattern/test.py b/src/filter/filter_with_pattern/test.py new file mode 100644 index 00000000000..e16afeaa53e --- /dev/null +++ b/src/filter/filter_with_pattern/test.py @@ -0,0 +1,114 @@ +import mudata as mu +import sys +from pathlib import Path +import pytest +import numpy as np + +## VIASH START +meta = { + "executable": "./target/executable/filter/filter_with_pattern/filter_with_pattern", + "resources_dir": "resources_test/", + "config": "/home/di/code/openpipeline/src/filter/filter_with_pattern/config.vsh.yaml", +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() + + +@pytest.fixture +def input_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + + +def test_simple_execution(run_component, input_path, tmp_path): + output_path = tmp_path / "output.h5mu" + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--var_gene_names", + "gene_symbol", + "--pattern", + "MIR\\d+", + "--pattern", + "MIR\\d+", + "--pattern", + "LINC\\d+", + "--pattern", + "AC\\d+", + "--pattern", + "AP\\d+", + "--do_subset", + "True", + "--output_compression", + "gzip", + ] + ) + assert Path(output_path).is_file() + mu_out = mu.read_h5ad(output_path, mod="rna") + mu_in = mu.read_h5ad(input_path, mod="rna") + + var_out = mu_out.var + + assert "filter_with_pattern" in var_out, ( + "Expected column with filter results to be present" + ) + assert var_out["filter_with_pattern"].dtype == np.dtype("bool"), ( + "Expected filter column to be boolean" + ) + + mu_in.shape[0] == mu_out.shape[0], "Number of observations should be unaltered" + mu_out.shape[1] < mu_in.shape[1], "Genes should have been filtered" + + +def test_filter_without_subset(run_component, input_path, tmp_path): + output_path = tmp_path / "output.h5mu" + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--var_gene_names", + "gene_symbol", + "--pattern", + "MIR\\d+", + "--pattern", + "MIR\\d+", + "--pattern", + "LINC\\d+", + "--pattern", + "AC\\d+", + "--pattern", + "AP\\d+", + "--do_subset", + "False", + "--output_compression", + "gzip", + ] + ) + assert Path(output_path).is_file() + mu_out = mu.read_h5ad(output_path, mod="rna") + mu_in = mu.read_h5ad(input_path, mod="rna") + + var_out = mu_out.var + + assert "filter_with_pattern" in var_out, ( + "Expected column with filter results to be present" + ) + assert var_out["filter_with_pattern"].dtype == np.dtype("bool"), ( + "Expected filter column to be boolean" + ) + + mu_in.shape[0] == mu_out.shape[0], "Number of observations should be unaltered" + mu_out.shape[1] == mu_in.shape[1], "Genes should not have been filtered" + + +if __name__ == "__main__": + exit(pytest.main([__file__])) From 678571d591ba88073aac3758af1370ec0170e76b Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 15:40:30 +0200 Subject: [PATCH 40/52] update changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6caffa3d0e3..b05367ba1ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# openpipelines 3.x.x + +## NEW FUNCTIONALITY + +* `filter/filter_with_pattern`: Filters a MuData object based on gene names using a regex pattern (PR #1070). + # openpipelines 3.0.0 ## BREAKING CHANGES From 4acfa710976817f5bc62c226c3db20633252676e Mon Sep 17 00:00:00 2001 From: Dorien <41797896+dorien-er@users.noreply.github.com> Date: Tue, 26 Aug 2025 15:53:29 +0200 Subject: [PATCH 41/52] Update src/filter/filter_with_pattern/config.vsh.yaml Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> --- src/filter/filter_with_pattern/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/filter/filter_with_pattern/config.vsh.yaml b/src/filter/filter_with_pattern/config.vsh.yaml index 7a7f6aaa6cf..a1df69a59fb 100644 --- a/src/filter/filter_with_pattern/config.vsh.yaml +++ b/src/filter/filter_with_pattern/config.vsh.yaml @@ -76,4 +76,4 @@ runners: - type: executable - type: nextflow directives: - label: [singlecpu, lowmem] \ No newline at end of file + label: [lowcpu, midmem] \ No newline at end of file From 7bcb1b2f42ec47a1e255f37b9deffe7a6669471f Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 15:55:06 +0200 Subject: [PATCH 42/52] update param requirements --- src/filter/filter_with_pattern/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/filter/filter_with_pattern/config.vsh.yaml b/src/filter/filter_with_pattern/config.vsh.yaml index a1df69a59fb..637172151e2 100644 --- a/src/filter/filter_with_pattern/config.vsh.yaml +++ b/src/filter/filter_with_pattern/config.vsh.yaml @@ -28,6 +28,7 @@ argument_groups: example: "gene_symbol" required: false - name: "--pattern" + required: true type: string multiple: true example: ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"] From e78c06cb8383b7f44dd057e061f9132db070640c Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 16:07:57 +0200 Subject: [PATCH 43/52] wip --- .../differential_expression/pseudobulk_deseq2/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml index 320de83defb..971ae6d067e 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -139,6 +139,7 @@ dependencies: - name: differential_expression/deseq2 - name: filter/filter_with_counts - name: filter/filter_genes_by_pattern + - name: filter/delimit_counts - name: filter/do_filter resources: From ca0788533c37c4439afd7d71454bf350aafe28cb Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 16:08:16 +0200 Subject: [PATCH 44/52] wip --- src/differential_expression/deseq2/config.vsh.yaml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index 67c678c546c..fb8561dc870 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -94,14 +94,6 @@ argument_groups: Genes with absolute log2 fold change above this threshold will be considered significant. default: 0.0 required: false - - name: "--filter_gene_patterns" - type: string - multiple: true - description: | - List of regex patterns to filter out genes. - Genes matching any of these patterns will be excluded from the analysis. - required: false - example: ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"] - name: Outputs arguments: From f1a9a8892e1d92c7b4d8a0770e799aa65291226b Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 16:16:08 +0200 Subject: [PATCH 45/52] wip --- .../create_pseudobulk/config.vsh.yaml | 3 +- src/differential_expression/deseq2/script.R | 39 ------------------- .../pseudobulk_deseq2/config.vsh.yaml | 5 +-- .../pseudobulk_deseq2/main.nf | 32 ++++++++++----- 4 files changed, 26 insertions(+), 53 deletions(-) diff --git a/src/differential_expression/create_pseudobulk/config.vsh.yaml b/src/differential_expression/create_pseudobulk/config.vsh.yaml index 855566a39b0..07bd54dac18 100644 --- a/src/differential_expression/create_pseudobulk/config.vsh.yaml +++ b/src/differential_expression/create_pseudobulk/config.vsh.yaml @@ -57,7 +57,8 @@ argument_groups: - name: "--min_obs_per_sample" type: integer min: 1 - default: 30 + example: 30 + required: false description: "Minimum number of cells per pseudobulk sample." - name: --random_state type: integer diff --git a/src/differential_expression/deseq2/script.R b/src/differential_expression/deseq2/script.R index dea8078918f..3245df52032 100644 --- a/src/differential_expression/deseq2/script.R +++ b/src/differential_expression/deseq2/script.R @@ -19,9 +19,6 @@ par <- list( filter_genes_min_samples = NULL, p_adj_threshold = 0.05, log2fc_threshold = 0.0, - filter_gene_patterns = c( - "MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+" - ), var_gene_names = "feature_name" ) meta <- list(resources_dir = "src/utils") @@ -172,33 +169,6 @@ prepare_counts_matrix <- function(layer, var_names, obs_names) { counts_df } -# Filter genes based on regex patterns -filter_genes_by_pattern <- function(counts, gene_patterns) { - if (is.null(gene_patterns) || length(gene_patterns) == 0) { - return(counts) - } - - pattern_string <- paste(gene_patterns, collapse = "|") - before_filter <- ncol(counts) - genes_to_keep <- !grepl(pattern_string, colnames(counts)) - counts_filtered <- counts[, genes_to_keep, drop = FALSE] - - cat( - "Filtered out genes matching patterns:", - paste(gene_patterns, collapse = ", "), "\n" - ) - cat( - "Matrix shape before filtering:", nrow(counts), "x", before_filter, - "\n" - ) - cat( - "Matrix shape after filtering:", nrow(counts_filtered), "x", - ncol(counts_filtered), "\n" - ) - - counts_filtered -} - # Create and configure DESeq2 dataset create_deseq2_dataset <- function(counts, metadata, design_formula) { cat("Creating DESeq2 dataset\n") @@ -355,15 +325,6 @@ main <- function() { obs_names <- mod$obs_names counts <- prepare_counts_matrix(layer, var_names, obs_names) - # Apply gene filtering if specified - if ( - !is.null(par$filter_gene_patterns) && - length(par$filter_gene_patterns) > 0 - ) { - cat("Filtering genes based on patterns\n") - counts <- filter_genes_by_pattern(counts, par$filter_gene_patterns) - } - # Ensure output directory exists if (!dir.exists(par$output_dir)) { dir.create(par$output_dir, recursive = TRUE) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml index 971ae6d067e..81361754a59 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -61,10 +61,7 @@ argument_groups: min: 0 default: 0 description: The random seed for sampling. - - name: --obs_cell_count - type: string - required: false - description: .obs field containing the cell count per pseudobulk sample. + - name: DGEA options arguments: - name: --design_formula diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf index d193087d453..99fa07d069d 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf @@ -7,23 +7,40 @@ workflow run_wf { output_ch = input_ch | create_pseudobulk.run( fromState: [ - id: "id", input: "input", modality: "modality", input_layer: "input_layer", obs_label: "obs_cell_group", obs_groups: "obs_groups", - aggregation_method: "aggregation_method", - min_obs_per_sample: "min_obs_per_sample", - random_state: "random_state", - obs_cell_count: "obs_cell_count" + aggregation_method: "aggregation_method" + random_state: "random_state" ], + args:[ + obs_cell_count: "n_cells" + ] toState: [ "input": "output" ] ) + | delimit_counts.run( + fromState: [ + input: "input", + modality: "modality", + obs_count_column: "input_layer", + min_counts: "min_obs_per_sample", + + ], + args: [ + do_subset: "True", + obs_cell_count: "n_cells", + obs_name_filter: "delimit_samples_per_pseudobulk", + ] + toState: [ "input": "output" ] + ) + | filter_genes_by_pattern.run( + + ) | filter_with_counts.run( fromState: [ - id: "id", input: "input", modality: "modality", layer: "input_layer", @@ -42,9 +59,7 @@ workflow run_wf { ) - | filter_genes_by_pattern.run( - ) | do_filter.run( @@ -52,7 +67,6 @@ workflow run_wf { | deseq2.run( fromState: [ - id: "id", input: "input", modality: "modality", input_layer: "input_layer", From 99ee20f34a8e679e78c9e9d856c6a2d275b31fea Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 16:26:01 +0200 Subject: [PATCH 46/52] wip --- .../deseq2/config.vsh.yaml | 7 ---- src/differential_expression/deseq2/script.R | 18 ---------- .../pseudobulk_deseq2/main.nf | 36 ++++++++++--------- .../pseudobulk_deseq2/test.nf | 9 ++--- 4 files changed, 21 insertions(+), 49 deletions(-) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index fb8561dc870..c2cc23ef618 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -70,13 +70,6 @@ argument_groups: Values must be present in the fields specified for the design formula. required: true example: ["ctrl", "stim"] - - name: "--filter_genes_min_samples" - type: integer - min: 1 - description: | - Minimum number of samples a gene must be expressed in to be included in the analysis. - If no value is provided, no filtering is applied. - required: false - name: "--p_adj_threshold" type: double min: 0 diff --git a/src/differential_expression/deseq2/script.R b/src/differential_expression/deseq2/script.R index 3245df52032..5661d7de85a 100644 --- a/src/differential_expression/deseq2/script.R +++ b/src/differential_expression/deseq2/script.R @@ -16,7 +16,6 @@ par <- list( design_formula = "~ treatment", contrast_column = "treatment", contrast_values = c("ctrl", "stim"), - filter_genes_min_samples = NULL, p_adj_threshold = 0.05, log2fc_threshold = 0.0, var_gene_names = "feature_name" @@ -188,23 +187,6 @@ create_deseq2_dataset <- function(counts, metadata, design_formula) { colData = metadata, design = as.formula(design_formula) ) - - # Filtering genes based on presence across samples - sample_count <- if ( - !is.null(par$filter_genes_min_samples) - ) { - par$filter_genes_min_samples - } else { - 1 - } - cat( - "Filtering genes by counts: removing genes that are present in less than", - sample_count, "samples\n" - ) - - keep <- rowSums(counts(dds) >= 1) >= sample_count - dds <- dds[keep, ] - dds } diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf index 99fa07d069d..5aacdbb476f 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf @@ -31,38 +31,40 @@ workflow run_wf { ], args: [ do_subset: "True", - obs_cell_count: "n_cells", - obs_name_filter: "delimit_samples_per_pseudobulk", + var_name_filter: "filter_with_gene_pattern", ] toState: [ "input": "output" ] ) + | filter_genes_by_pattern.run( - - ) - | filter_with_counts.run( fromState: [ input: "input", modality: "modality", - layer: "input_layer", + var_gene_names: "var_gene_names", + pattern: "filter_gene_patterns", min_counts: "min_obs_per_sample", - obs_cell_count: "obs_cell_count" ], args: [ do_subset: "True", - + obs_cell_count: "n_cells", + obs_name_filter: "delimit_samples_per_pseudobulk", ] toState: [ "input": "output" ] ) - | do_filter.run( - - ) - - - - | do_filter.run( - + | filter_with_counts.run( + fromState: [ + input: "input", + modality: "modality", + layer: "input_layer", + min_cells_per_gene: "filter_genes_min_samples", + ], + args: [ + do_subset: "True", + var_name_filter: "filter_with_counts" + ] + toState: [ "input": "output" ] ) | deseq2.run( @@ -70,7 +72,7 @@ workflow run_wf { input: "input", modality: "modality", input_layer: "input_layer", - var_gene_name: "var_gene_name", + var_gene_names: "var_gene_names", obs_cell_group: "obs_cell_group", design_formula: "design_formula", contrast_column: "contrast_column", diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf index e7150b420e8..cbe11cddf99 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf @@ -37,9 +37,7 @@ workflow test_wf { assert state instanceof Map : "State should be a map. Found: ${state}" assert state.containsKey("output") : "Output should contain key 'output'. found: ${state}" assert state.output.isDirectory() : "'output' should be a directory." - def deseqFiles = state.output.listFiles().findAll { file -> - file.name.matches(/deseq2_analysis_.*\.csv/) - } + def deseqFiles = files("${state.output}/deseq2_analysis_.*.csv") assert deseqFiles.size() == 4, "Output directory should contain exactly 4 deseq2_analysis_*.csv files. Found: ${deseqFiles.size()} files: ${deseqFiles.collect { it.name }}" "Output: $output" @@ -50,10 +48,7 @@ workflow test_wf { def state = output[1] // Find all CSV files and create separate entries - def deseqFiles = state.output.listFiles().findAll { file -> - file.name.matches(/deseq2_analysis_.*\.csv/) - } - + def deseqFiles = files("${state.output}/deseq2_analysis_.*.csv") deseqFiles.collect { csvFile -> def cellType = csvFile.name.replaceAll(/deseq2_analysis_(.*)\.csv/, '$1') def newId = "${id}_${cellType}" From ef15226f13cd0ce4b46246ac0b85803163fb07bb Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 17:09:32 +0200 Subject: [PATCH 47/52] wip --- .../create_pseudobulk/config.vsh.yaml | 6 ---- .../create_pseudobulk/script.py | 2 -- .../create_pseudobulk/test.py | 33 ------------------- .../deseq2/config.vsh.yaml | 4 +-- src/differential_expression/deseq2/script.R | 32 ++++++++++++++---- src/differential_expression/deseq2/test.py | 29 ++++++++++++++++ .../pseudobulk_deseq2/config.vsh.yaml | 11 ++++--- .../pseudobulk_deseq2/main.nf | 22 +++++++------ .../pseudobulk_deseq2/test.nf | 3 +- 9 files changed, 76 insertions(+), 66 deletions(-) diff --git a/src/differential_expression/create_pseudobulk/config.vsh.yaml b/src/differential_expression/create_pseudobulk/config.vsh.yaml index 07bd54dac18..6b0f0aa3588 100644 --- a/src/differential_expression/create_pseudobulk/config.vsh.yaml +++ b/src/differential_expression/create_pseudobulk/config.vsh.yaml @@ -54,12 +54,6 @@ argument_groups: choices: ["sum", "mean"] default: "sum" description: "Method to aggregate the raw counts for pseudoreplicates. Either sum or mean." - - name: "--min_obs_per_sample" - type: integer - min: 1 - example: 30 - required: false - description: "Minimum number of cells per pseudobulk sample." - name: --random_state type: integer description: | diff --git a/src/differential_expression/create_pseudobulk/script.py b/src/differential_expression/create_pseudobulk/script.py index a9af50f6d34..257893c6e69 100644 --- a/src/differential_expression/create_pseudobulk/script.py +++ b/src/differential_expression/create_pseudobulk/script.py @@ -13,7 +13,6 @@ "obs_label": "cell_type", "obs_groups": ["treatment", "donor_id", "disease"], "obs_cell_count": "n_cells", - "min_obs_per_sample": 5, "random_state": 0, "output": "test.h5mu", "output_compression": "gzip", @@ -92,7 +91,6 @@ def main(): # Filter pseudobulk samples based on minimum observation count logger.info("Filtering pseudobulk samples based on minimum observation count...") adata_pb.obs[par["obs_cell_count"]] = count_obs(adata, adata_pb, pseudobulk_cols) - adata_pb = adata_pb[adata_pb.obs[par["obs_cell_count"]] > par["min_obs_per_sample"]] logger.info( f"Final dataset: {adata_pb.n_obs} pseudobulk samples, {adata_pb.n_vars} genes" diff --git a/src/differential_expression/create_pseudobulk/test.py b/src/differential_expression/create_pseudobulk/test.py index f40963ee553..859be8874b1 100644 --- a/src/differential_expression/create_pseudobulk/test.py +++ b/src/differential_expression/create_pseudobulk/test.py @@ -58,8 +58,6 @@ def test_multiple_factors(run_component, random_h5mu_path): "donor_id", "--obs_groups", "treatment", - "--min_obs_per_sample", - "5", "--output_compression", "gzip", ] @@ -78,36 +76,5 @@ def test_multiple_factors(run_component, random_h5mu_path): ) -def test_filtering(run_component, random_h5mu_path): - output_path = random_h5mu_path() - - run_component( - [ - "--input", - input_path, - "--output", - output_path, - "--obs_label", - "cell_type", - "--obs_groups", - "treatment", - "--min_obs_per_sample", - "50", - "--output_compression", - "gzip", - ] - ) - - assert os.path.exists(output_path), "Output query file does not exist" - mdata = mu.read_h5mu(output_path) - adata = mdata.mod["rna"] - - expected_obs = ["treatment", "cell_type", "n_cells"] - assert all(col in adata.obs for col in expected_obs), ( - f"Expected columns {expected_obs} not found in .obs" - ) - assert adata.shape[0] == 4, "Expected a total of 8 pseudobulk samples in the output" - - if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml index c2cc23ef618..d5385a5f834 100644 --- a/src/differential_expression/deseq2/config.vsh.yaml +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -36,9 +36,9 @@ argument_groups: - name: "--var_gene_names" type: string description: | - Column name in the variable features table that contains gene symbols. - This is used for reporting gene names in the output. + Name of the .var field that contains gene symbols. If not provided, .var.index will be used. required: false + example: "gene_symbol" - name: "--obs_cell_group" type: string description: | diff --git a/src/differential_expression/deseq2/script.R b/src/differential_expression/deseq2/script.R index 5661d7de85a..a6bde13b473 100644 --- a/src/differential_expression/deseq2/script.R +++ b/src/differential_expression/deseq2/script.R @@ -30,9 +30,6 @@ h5mu_to_h5ad <- function(h5mu_path, modality_name) { mod_location <- paste("mod", modality_name, sep = "/") h5src <- hdf5r::H5File$new(h5mu_path, "r") h5dest <- hdf5r::H5File$new(tmp_path, "w") - # Copy over the child objects and the child attributes from root - # Root cannot be copied directly because it always exists and - # copying does not allow overwriting. children <- hdf5r::list.objects(h5src, path = mod_location, full.names = FALSE, recursive = FALSE @@ -75,6 +72,19 @@ is_normalized <- function(layer) { # Extract design factors from formula parse_design_formula <- function(design_formula) { + if (!grepl("^~\\s*\\w+(\\s*\\+\\s*\\w+)*$", design_formula)) { + stop( + sprintf( + paste( + "Invalid design formula: '%s'.", + "Formula must start with '~' and contain factors separated by '+'.", + "Example: '~ disease + treatment'", + sep = "\n" + ), + design_formula + ) + ) + } design_factors <- all.vars(as.formula(design_formula)) cat("Design formula:", design_formula, "\n") cat("Extracted factors:", paste(design_factors, collapse = ", "), "\n") @@ -297,10 +307,18 @@ main <- function() { ) cat("Preparing counts matrix for DESeq2\n") - var_names <- if ( - !is.null(par$var_gene_names) && par$var_gene_names %in% colnames(mod$var) - ) { - mod$var[[par$var_gene_names]] + var_names <- if (!is.null(par$var_gene_names)) { + if (par$var_gene_names %in% colnames(mod$var)) { + mod$var[[par$var_gene_names]] + } else { + stop( + sprintf( + "var_gene_names '%s' not found in mod$var columns: %s", + par$var_gene_names, + paste(colnames(mod$var), collapse = ", ") + ) + ) + } } else { mod$var_names } diff --git a/src/differential_expression/deseq2/test.py b/src/differential_expression/deseq2/test.py index bac124b53a4..63a05f0f4e6 100644 --- a/src/differential_expression/deseq2/test.py +++ b/src/differential_expression/deseq2/test.py @@ -274,6 +274,35 @@ def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_p ), f"Expected error message not found: {err.value.stdout.decode('utf-8')}" +def test_invalid_design_column(run_component, tmp_path, pseudobulk_test_data_path): + """Test that invalid contrast column raises appropriate error""" + output_dir = tmp_path / "deseq2_output" + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--design_formula", + "malformed formula", + "--contrast_column", + "treatment", + "--contrast_values", + "ctrl", + "--contrast_values", + "stim", + ] + ) + + # Updated regex to match actual R error format (no square brackets) + assert re.search( + r"Invalid design formula: 'malformed formula'", + err.value.stdout.decode("utf-8"), + ), f"Expected error message not found: {err.value.stdout.decode('utf-8')}" + + def test_custom_output_prefix(run_component, tmp_path, pseudobulk_test_data_path): """Test custom output prefix functionality""" output_dir = tmp_path / "deseq2_output" diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml index 81361754a59..e44fa7a3dcd 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -40,10 +40,10 @@ argument_groups: - name: "--var_gene_names" type: string description: | - Column name in the variable features table that contains gene symbols. - This is used for reporting gene names in the output. - required: true - + Name of the .var field that contains gene symbols. If not provided, .var.index will be used. + required: false + example: "gene_symbol" + - name: Pseudobulk Options arguments: - name: --aggregation_method @@ -110,6 +110,7 @@ argument_groups: - name: --filter_gene_patterns type: string multiple: true + example: ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"] description: | List of regex patterns to filter out genes. Genes matching any of these patterns will be excluded from the analysis. @@ -135,7 +136,7 @@ dependencies: - name: differential_expression/create_pseudobulk - name: differential_expression/deseq2 - name: filter/filter_with_counts - - name: filter/filter_genes_by_pattern + - name: filter/filter_with_pattern - name: filter/delimit_counts - name: filter/do_filter diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf index 5aacdbb476f..36f68295d28 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf @@ -12,12 +12,12 @@ workflow run_wf { input_layer: "input_layer", obs_label: "obs_cell_group", obs_groups: "obs_groups", - aggregation_method: "aggregation_method" + aggregation_method: "aggregation_method", random_state: "random_state" ], args:[ obs_cell_count: "n_cells" - ] + ], toState: [ "input": "output" ] ) @@ -25,18 +25,21 @@ workflow run_wf { fromState: [ input: "input", modality: "modality", - obs_count_column: "input_layer", min_counts: "min_obs_per_sample", ], args: [ do_subset: "True", - var_name_filter: "filter_with_gene_pattern", - ] + obs_count_column: "n_cells", + obs_name_filter: "delimit_samples_per_pseudobulk", + ], toState: [ "input": "output" ] ) - | filter_genes_by_pattern.run( + | filter_with_pattern.run( + runIf: { id, state -> + state.filter_gene_patterns + }, fromState: [ input: "input", modality: "modality", @@ -47,9 +50,8 @@ workflow run_wf { ], args: [ do_subset: "True", - obs_cell_count: "n_cells", - obs_name_filter: "delimit_samples_per_pseudobulk", - ] + var_name_filter: "filter_with_gene_pattern" + ], toState: [ "input": "output" ] ) @@ -63,7 +65,7 @@ workflow run_wf { args: [ do_subset: "True", var_name_filter: "filter_with_counts" - ] + ], toState: [ "input": "output" ] ) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf index cbe11cddf99..8120d47d8ea 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf @@ -37,7 +37,8 @@ workflow test_wf { assert state instanceof Map : "State should be a map. Found: ${state}" assert state.containsKey("output") : "Output should contain key 'output'. found: ${state}" assert state.output.isDirectory() : "'output' should be a directory." - def deseqFiles = files("${state.output}/deseq2_analysis_.*.csv") + def deseqFiles = state.output.glob("deseq2_analysis_*.csv").toList() + "deseqFiles: $deseqFiles" assert deseqFiles.size() == 4, "Output directory should contain exactly 4 deseq2_analysis_*.csv files. Found: ${deseqFiles.size()} files: ${deseqFiles.collect { it.name }}" "Output: $output" From cc3879eaa8e032da86d9e9460a7e7c13cb982377 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 26 Aug 2025 17:23:49 +0200 Subject: [PATCH 48/52] update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b86598dd7ee..8367a8aff6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # openpipelines 3.x.x +## BREAKING + +* `differential_expression/create_pseudobulks`: Removed functionality to filter psuedobulk samples based on number of aggregated samples threshold, as this functionality is now covered in `filter/delimit_count` (PR #1044). + ## NEW FUNCTIONALITY * `filter/filter_with_pattern`: Filters a MuData object based on gene names using a regex pattern (PR #1070). From 8338093040b8cbd0483a63aa6e7d53f37690ac30 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Wed, 27 Aug 2025 07:54:43 +0200 Subject: [PATCH 49/52] remove redundant component --- .../filter_with_pattern/config.vsh.yaml | 80 ------------ src/filter/filter_with_pattern/script.py | 65 ---------- src/filter/filter_with_pattern/test.py | 114 ------------------ .../pseudobulk_deseq2/test.nf | 10 +- 4 files changed, 7 insertions(+), 262 deletions(-) delete mode 100644 src/filter/filter_with_pattern/config.vsh.yaml delete mode 100644 src/filter/filter_with_pattern/script.py delete mode 100644 src/filter/filter_with_pattern/test.py diff --git a/src/filter/filter_with_pattern/config.vsh.yaml b/src/filter/filter_with_pattern/config.vsh.yaml deleted file mode 100644 index 637172151e2..00000000000 --- a/src/filter/filter_with_pattern/config.vsh.yaml +++ /dev/null @@ -1,80 +0,0 @@ -name: filter_with_pattern -namespace: "filter" -scope: "public" -description: | - Filter a MuData object based on gene names using a regex. -authors: - - __merge__: /src/authors/dorien_roosen.yaml - roles: [ author ] - -argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - description: Input h5mu file - direction: input - required: true - example: input.h5mu - - name: "--modality" - description: | - Which modality from the input MuData file to process. - type: string - default: "rna" - - name: "--var_gene_names" - type: string - description: | - The .var field containing the gene names to be filtered. If not provided, `.var.index` will be used. - example: "gene_symbol" - required: false - - name: "--pattern" - required: true - type: string - multiple: true - example: ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"] - description: | - A regex pattern to filter the gene names. - - name: Outputs - arguments: - - name: "--output" - type: file - description: Output h5mu file. - direction: output - example: output.h5mu - - name: "--do_subset" - type: boolean_true - description: Whether to subset before storing the output. - - name: "--var_name_filter" - type: string - default: "filter_with_pattern" - description: In which .var slot to store a boolean array corresponding to which variables should be removed. - __merge__: [., /src/base/h5_compression_argument.yaml] - -resources: - - type: python_script - path: script.py - - path: /src/utils/setup_logger.py - - path: /src/utils/compress_h5mu.py -test_resources: - - type: python_script - path: test.py - - path: /resources_test/pbmc_1k_protein_v3 - -engines: -- type: docker - image: python:3.12-slim - setup: - - type: apt - packages: - - procps - - type: python - __merge__: /src/base/requirements/anndata_mudata.yaml - test_setup: - - type: python - __merge__: [ /src/base/requirements/viashpy.yaml, .] - -runners: -- type: executable -- type: nextflow - directives: - label: [lowcpu, midmem] \ No newline at end of file diff --git a/src/filter/filter_with_pattern/script.py b/src/filter/filter_with_pattern/script.py deleted file mode 100644 index 0707c8a9de8..00000000000 --- a/src/filter/filter_with_pattern/script.py +++ /dev/null @@ -1,65 +0,0 @@ -import mudata as mu -import sys -import pandas as pd - -### VIASH START -par = { - "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", - "modality": "rna", - "output": "output.h5mu", - "pattern": ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"], - "var_name_filter": "filter_with_pattern", - "var_gene_names": "gene_symbol", - "do_subset": True, - "output_compression": None, -} -meta = {"resources_dir": "src/utils"} -### VIASH END - - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger -from compress_h5mu import write_h5ad_to_h5mu_with_compression - -logger = setup_logger() - -logger.info("Reading input data from %s, modality %s", par["input"], par["modality"]) -modality_data = mu.read_h5ad(par["input"], mod=par["modality"]) - -logger.info("\tUnfiltered data: %s", modality_data) - -gene_names = ( - modality_data.var[par["var_gene_names"]].values - if par["var_gene_names"] - else modality_data.var_names.values -) - -pattern_string = "|".join(par["pattern"]) - -# Filter genes based on regex pattern -genes_to_keep = list(~pd.Series(gene_names).str.contains(pattern_string, na=False)) - -logger.info( - f"Number of genes to keep: {sum(genes_to_keep)} out of {len(genes_to_keep)}" -) -modality_data.var[par["var_name_filter"]] = genes_to_keep - -if par["do_subset"]: - modality_data = modality_data[:, genes_to_keep] - logger.info("\tFiltered data: %s", modality_data) - -logger.info( - "Writing output data to %s with compression %s", - par["output"], - par["output_compression"], -) -write_h5ad_to_h5mu_with_compression( - par["output"], - par["input"], - par["modality"], - modality_data, - par["output_compression"], -) - - -logger.info("Finished") diff --git a/src/filter/filter_with_pattern/test.py b/src/filter/filter_with_pattern/test.py deleted file mode 100644 index e16afeaa53e..00000000000 --- a/src/filter/filter_with_pattern/test.py +++ /dev/null @@ -1,114 +0,0 @@ -import mudata as mu -import sys -from pathlib import Path -import pytest -import numpy as np - -## VIASH START -meta = { - "executable": "./target/executable/filter/filter_with_pattern/filter_with_pattern", - "resources_dir": "resources_test/", - "config": "/home/di/code/openpipeline/src/filter/filter_with_pattern/config.vsh.yaml", -} - -## VIASH END - -sys.path.append(meta["resources_dir"]) -from setup_logger import setup_logger - -logger = setup_logger() - - -@pytest.fixture -def input_path(): - return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" - - -def test_simple_execution(run_component, input_path, tmp_path): - output_path = tmp_path / "output.h5mu" - run_component( - [ - "--input", - input_path, - "--output", - output_path, - "--var_gene_names", - "gene_symbol", - "--pattern", - "MIR\\d+", - "--pattern", - "MIR\\d+", - "--pattern", - "LINC\\d+", - "--pattern", - "AC\\d+", - "--pattern", - "AP\\d+", - "--do_subset", - "True", - "--output_compression", - "gzip", - ] - ) - assert Path(output_path).is_file() - mu_out = mu.read_h5ad(output_path, mod="rna") - mu_in = mu.read_h5ad(input_path, mod="rna") - - var_out = mu_out.var - - assert "filter_with_pattern" in var_out, ( - "Expected column with filter results to be present" - ) - assert var_out["filter_with_pattern"].dtype == np.dtype("bool"), ( - "Expected filter column to be boolean" - ) - - mu_in.shape[0] == mu_out.shape[0], "Number of observations should be unaltered" - mu_out.shape[1] < mu_in.shape[1], "Genes should have been filtered" - - -def test_filter_without_subset(run_component, input_path, tmp_path): - output_path = tmp_path / "output.h5mu" - run_component( - [ - "--input", - input_path, - "--output", - output_path, - "--var_gene_names", - "gene_symbol", - "--pattern", - "MIR\\d+", - "--pattern", - "MIR\\d+", - "--pattern", - "LINC\\d+", - "--pattern", - "AC\\d+", - "--pattern", - "AP\\d+", - "--do_subset", - "False", - "--output_compression", - "gzip", - ] - ) - assert Path(output_path).is_file() - mu_out = mu.read_h5ad(output_path, mod="rna") - mu_in = mu.read_h5ad(input_path, mod="rna") - - var_out = mu_out.var - - assert "filter_with_pattern" in var_out, ( - "Expected column with filter results to be present" - ) - assert var_out["filter_with_pattern"].dtype == np.dtype("bool"), ( - "Expected filter column to be boolean" - ) - - mu_in.shape[0] == mu_out.shape[0], "Number of observations should be unaltered" - mu_out.shape[1] == mu_in.shape[1], "Genes should not have been filtered" - - -if __name__ == "__main__": - exit(pytest.main([__file__])) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf index 8120d47d8ea..565682f58fd 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf @@ -25,6 +25,7 @@ workflow test_wf { ]) | map{ state -> [state.id, state] } | pseudobulk_deseq2 + | view { output -> assert output.size() == 2 : "Outputs should contain two elements; [id, state]" @@ -37,8 +38,9 @@ workflow test_wf { assert state instanceof Map : "State should be a map. Found: ${state}" assert state.containsKey("output") : "Output should contain key 'output'. found: ${state}" assert state.output.isDirectory() : "'output' should be a directory." - def deseqFiles = state.output.glob("deseq2_analysis_*.csv").toList() - "deseqFiles: $deseqFiles" + def deseqFiles = state.output.listFiles().findAll { file -> + file.name.matches(/deseq2_analysis_.*\.csv/) + } assert deseqFiles.size() == 4, "Output directory should contain exactly 4 deseq2_analysis_*.csv files. Found: ${deseqFiles.size()} files: ${deseqFiles.collect { it.name }}" "Output: $output" @@ -49,7 +51,9 @@ workflow test_wf { def state = output[1] // Find all CSV files and create separate entries - def deseqFiles = files("${state.output}/deseq2_analysis_.*.csv") + def deseqFiles = state.output.listFiles().findAll { file -> + file.name.matches(/deseq2_analysis_.*\.csv/) + } deseqFiles.collect { csvFile -> def cellType = csvFile.name.replaceAll(/deseq2_analysis_(.*)\.csv/, '$1') def newId = "${id}_${cellType}" From 8beacf0367ceb1e52f362073d026a6d22bc89233 Mon Sep 17 00:00:00 2001 From: dorien-er Date: Wed, 27 Aug 2025 08:33:33 +0200 Subject: [PATCH 50/52] cleanup --- .../pseudobulk_deseq2/config.vsh.yaml | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml index e44fa7a3dcd..4daeb970440 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -51,17 +51,35 @@ argument_groups: choices: ["sum", "mean"] default: sum description: Method to aggregate the raw counts for pseudoreplicates. Either sum or mean. - - name: --min_obs_per_sample - type: integer - min: 1 - default: 30 - description: Minimum number of cells per pseudobulk sample. - name: --random_state type: integer min: 0 default: 0 description: The random seed for sampling. + - name: Filtering options + arguments: + - name: --min_obs_per_sample + type: integer + min: 1 + default: 30 + description: Minimum number of observations per pseudobulk sample. + - name: --filter_genes_min_samples + type: integer + min: 1 + default: 1 + description: | + Minimum number of samples a gene must be expressed in to be included in the analysis. + If None, no filtering is applied. + - name: --filter_gene_patterns + type: string + multiple: true + example: ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"] + description: | + List of regex patterns to filter out genes. + Genes matching any of these patterns will be excluded from the analysis. + required: false + - name: DGEA options arguments: - name: --design_formula @@ -86,13 +104,6 @@ argument_groups: Values to compare in the contrast column. First value is baseline, following values are different treatments. example: ["ctrl", "stim"] - - name: --filter_genes_min_samples - type: integer - min: 1 - description: | - Minimum number of samples a gene must be expressed in to be included in the analysis. - If None, no filtering is applied. - required: false - name: --p_adj_threshold type: double default: 0.05 @@ -107,14 +118,6 @@ argument_groups: Log2 fold change threshold for significance. Genes with absolute log2 fold change above this threshold will be considered significant. required: false - - name: --filter_gene_patterns - type: string - multiple: true - example: ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"] - description: | - List of regex patterns to filter out genes. - Genes matching any of these patterns will be excluded from the analysis. - required: false - name: Outputs arguments: From 7340956bc9d76e65eff439631f7616ec963dc98f Mon Sep 17 00:00:00 2001 From: dorien-er Date: Wed, 27 Aug 2025 11:47:09 +0200 Subject: [PATCH 51/52] fix --- .../differential_expression/pseudobulk_deseq2/config.vsh.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml index 4daeb970440..256824bdadd 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -67,7 +67,6 @@ argument_groups: - name: --filter_genes_min_samples type: integer min: 1 - default: 1 description: | Minimum number of samples a gene must be expressed in to be included in the analysis. If None, no filtering is applied. From 961eec68d4780f93c165fd5c6e3122aaa435b61c Mon Sep 17 00:00:00 2001 From: dorien-er Date: Tue, 2 Sep 2025 14:03:35 +0200 Subject: [PATCH 52/52] remove unused arguments --- src/workflows/differential_expression/pseudobulk_deseq2/main.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf index 36f68295d28..a9fec8dc6d9 100644 --- a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf +++ b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf @@ -79,10 +79,8 @@ workflow run_wf { design_formula: "design_formula", contrast_column: "contrast_column", contrast_values: "contrast_values", - filter_genes_min_samples: "filter_genes_min_samples", p_adj_threshold: "p_adj_threshold", log2fc_threshold: "log2fc_threshold", - filter_gene_patterns: "filter_gene_patterns", output_dir: "output", output_prefix: "output_prefix" ],