diff --git a/CHANGELOG.md b/CHANGELOG.md index 583acc98579..8367a8aff6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,21 @@ # openpipelines 3.x.x +## BREAKING + +* `differential_expression/create_pseudobulks`: Removed functionality to filter psuedobulk samples based on number of aggregated samples threshold, as this functionality is now covered in `filter/delimit_count` (PR #1044). + ## NEW FUNCTIONALITY * `filter/filter_with_pattern`: Filters a MuData object based on gene names using a regex pattern (PR #1070). * `filter/delimit_counts`: Turns an .obs column of a MuData file containing count data into a boolean column based on thresholds (PR #1069) +## EXPERIMENTAL + +* `differential_expression/deseq2`: Performs differential expression analysis using DESeq2 on bulk or pseudobulk datasets (PR #1044). + +* `workflows/differential_expression/pseudobulk_deseq2`: Workflow for generating pseudobulk samples from single-cell data followed by DESeq2 differential expression analysis (PR #1044) + # openpipelines 3.0.0 ## BREAKING CHANGES diff --git a/src/differential_expression/create_pseudobulk/config.vsh.yaml b/src/differential_expression/create_pseudobulk/config.vsh.yaml index aa90b260be1..6b0f0aa3588 100644 --- a/src/differential_expression/create_pseudobulk/config.vsh.yaml +++ b/src/differential_expression/create_pseudobulk/config.vsh.yaml @@ -1,5 +1,5 @@ -name: differential_expression -namespace: "create_pseudobulk" +name: create_pseudobulk +namespace: differential_expression scope: "public" description: | Generation of pseudobulk samples from single-cell transcriptomics data, @@ -54,11 +54,6 @@ argument_groups: choices: ["sum", "mean"] default: "sum" description: "Method to aggregate the raw counts for pseudoreplicates. Either sum or mean." - - name: "--min_obs_per_sample" - type: integer - min: 1 - default: 30 - description: "Minimum number of cells per pseudobulk sample." - name: --random_state type: integer description: | diff --git a/src/differential_expression/create_pseudobulk/script.py b/src/differential_expression/create_pseudobulk/script.py index a9af50f6d34..257893c6e69 100644 --- a/src/differential_expression/create_pseudobulk/script.py +++ b/src/differential_expression/create_pseudobulk/script.py @@ -13,7 +13,6 @@ "obs_label": "cell_type", "obs_groups": ["treatment", "donor_id", "disease"], "obs_cell_count": "n_cells", - "min_obs_per_sample": 5, "random_state": 0, "output": "test.h5mu", "output_compression": "gzip", @@ -92,7 +91,6 @@ def main(): # Filter pseudobulk samples based on minimum observation count logger.info("Filtering pseudobulk samples based on minimum observation count...") adata_pb.obs[par["obs_cell_count"]] = count_obs(adata, adata_pb, pseudobulk_cols) - adata_pb = adata_pb[adata_pb.obs[par["obs_cell_count"]] > par["min_obs_per_sample"]] logger.info( f"Final dataset: {adata_pb.n_obs} pseudobulk samples, {adata_pb.n_vars} genes" diff --git a/src/differential_expression/create_pseudobulk/test.py b/src/differential_expression/create_pseudobulk/test.py index f40963ee553..859be8874b1 100644 --- a/src/differential_expression/create_pseudobulk/test.py +++ b/src/differential_expression/create_pseudobulk/test.py @@ -58,8 +58,6 @@ def test_multiple_factors(run_component, random_h5mu_path): "donor_id", "--obs_groups", "treatment", - "--min_obs_per_sample", - "5", "--output_compression", "gzip", ] @@ -78,36 +76,5 @@ def test_multiple_factors(run_component, random_h5mu_path): ) -def test_filtering(run_component, random_h5mu_path): - output_path = random_h5mu_path() - - run_component( - [ - "--input", - input_path, - "--output", - output_path, - "--obs_label", - "cell_type", - "--obs_groups", - "treatment", - "--min_obs_per_sample", - "50", - "--output_compression", - "gzip", - ] - ) - - assert os.path.exists(output_path), "Output query file does not exist" - mdata = mu.read_h5mu(output_path) - adata = mdata.mod["rna"] - - expected_obs = ["treatment", "cell_type", "n_cells"] - assert all(col in adata.obs for col in expected_obs), ( - f"Expected columns {expected_obs} not found in .obs" - ) - assert adata.shape[0] == 4, "Expected a total of 8 pseudobulk samples in the output" - - if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/differential_expression/deseq2/config.vsh.yaml b/src/differential_expression/deseq2/config.vsh.yaml new file mode 100644 index 00000000000..d5385a5f834 --- /dev/null +++ b/src/differential_expression/deseq2/config.vsh.yaml @@ -0,0 +1,136 @@ +name: deseq2 +namespace: differential_expression +description: | + Performs differential expression analysis using DESeq2 on bulk samples or pseudobulk samples aggregated from single-cell data. + Note that this component only considers factors as explanatory variables, and excludes covariates from the analysis. + +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + - __merge__: /src/authors/dries_de_maeyer.yaml + roles: [ contributor ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: Input h5mu file containing (pseudo-)bulk transcriptomic samples. + direction: input + required: true + - name: "--modality" + description: | + Which modality from the input MuData file to process. + type: string + default: "rna" + required: false + - name: "--input_layer" + type: string + required: false + description: "Input layer to use. If None, X is used. This layer must contain raw counts." + - name: "--var_gene_names" + type: string + description: | + Name of the .var field that contains gene symbols. If not provided, .var.index will be used. + required: false + example: "gene_symbol" + - name: "--obs_cell_group" + type: string + description: | + .obs field containing the cell group information, for example per cell type or per cell cluster. + If true, performs per-cell-group analysis with cell-group-specific results. + + - name: Arguments + arguments: + - name: "--design_formula" + type: string + description: | + The formula should be a tilde (~) followed by the variables with plus signs between them. + The design formula is used to estimate the dispersions and to estimate the log2 fold changes of the model. + required: true + example: "~ disease + treatment" + - name: "--contrast_column" + type: string + description: | + Column in the metadata to use for the contrast. + This column should contain the conditions to compare. + required: true + example: "treatment" + - name: "--contrast_values" + type: string + multiple: true + description: | + Values to compare in the contrast column. + First value is the control group, following values are comparison groups. + Values must be present in the fields specified for the design formula. + required: true + example: ["ctrl", "stim"] + - name: "--p_adj_threshold" + type: double + min: 0 + max: 1 + description: | + Adjusted p-value threshold for significance, corrected with the Benjamini and Hochberg method. + Genes with adjusted p-values below this threshold will be considered significant. + default: 0.05 + required: false + - name: "--log2fc_threshold" + type: double + min: 0 + description: | + Log2 fold change threshold for significance. + Genes with absolute log2 fold change above this threshold will be considered significant. + default: 0.0 + required: false + + - name: Outputs + arguments: + - name: "--output_dir" + alternatives: ["-o"] + type: file + description: | + Output directory for DESeq2 results. + If cell groups are defined (using `--obs_cell_group`), the output folder will contain one CSV per cell group, otherwise it will contain a single output file. + direction: output + required: true + - name: "--output_prefix" + type: string + description: | + Prefix for output CSV files. + If no cell groups are specified, the output file will be named "{prefix}.csv". + If cell groups are specified, files will be named "{prefix}_{cell_group}.csv". + default: "deseq2_analysis" + required: false + +resources: + - type: r_script + path: script.R +test_resources: + - type: python_script + path: test.py + - path: /resources_test/annotation_test_data/TS_Blood_filtered_pseudobulk.h5mu + +engines: +- type: docker + image: rocker/r2u:22.04 + setup: + - type: apt + packages: [ libhdf5-dev, libgeos-dev, hdf5-tools ] + - type: r + cran: [ hdf5r ] + # Use pinned version to avoid H5 file access issues with latest stable release (0.99.0) + github: scverse/anndataR@36f3caad9a7f360165c1510bbe0c62657580415a + bioc: [ DESeq2 ] + test_setup: + - type: apt + packages: [ python3, python3-pip, python3-dev, python-is-python3 ] + __merge__: [ ., /src/base/requirements/python_test_setup.yaml] + +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/differential_expression/deseq2/script.R b/src/differential_expression/deseq2/script.R new file mode 100644 index 00000000000..a6bde13b473 --- /dev/null +++ b/src/differential_expression/deseq2/script.R @@ -0,0 +1,419 @@ +library(DESeq2) +library(anndataR) +library(hdf5r) + +## VIASH START +par <- list( + input = paste0( + "resources_test/annotation_test_data/", + "TS_Blood_filtered_pseudobulk.h5mu" + ), + output_dir = "./test_deseq2_no_cellgroup/", + output_prefix = "deseq2_analysis", + input_layer = NULL, + modality = "rna", + obs_cell_group = "cell_type", + design_formula = "~ treatment", + contrast_column = "treatment", + contrast_values = c("ctrl", "stim"), + p_adj_threshold = 0.05, + log2fc_threshold = 0.0, + var_gene_names = "feature_name" +) +meta <- list(resources_dir = "src/utils") +## VIASH END + +cat("Starting DESeq2 analysis...\n") + +h5mu_to_h5ad <- function(h5mu_path, modality_name) { + tmp_path <- tempfile(fileext = ".h5ad") + mod_location <- paste("mod", modality_name, sep = "/") + h5src <- hdf5r::H5File$new(h5mu_path, "r") + h5dest <- hdf5r::H5File$new(tmp_path, "w") + children <- hdf5r::list.objects(h5src, + path = mod_location, + full.names = FALSE, recursive = FALSE + ) + for (child in children) { + h5dest$obj_copy_from( + h5src, paste(mod_location, child, sep = "/"), + paste0("/", child) + ) + } + # Also copy the root attributes + root_attrs <- hdf5r::h5attr_names(x = h5src) + for (attr in root_attrs) { + h5a <- h5src$attr_open(attr_name = attr) + robj <- h5a$read() + h5dest$create_attr_by_name( + attr_name = attr, + obj_name = ".", + robj = robj, + space = h5a$get_space(), + dtype = h5a$get_type() + ) + } + h5src$close() + h5dest$close() + + tmp_path +} + +# Check if expression data is normalized (row sums =~ 1) +is_normalized <- function(layer) { + row_sums <- if (is(layer, "sparseMatrix") || is(layer, "dgCMatrix")) { + Matrix::rowSums(layer) + } else { + rowSums(layer) + } + + all(abs(row_sums - 1) < 1e-6, na.rm = TRUE) +} + +# Extract design factors from formula +parse_design_formula <- function(design_formula) { + if (!grepl("^~\\s*\\w+(\\s*\\+\\s*\\w+)*$", design_formula)) { + stop( + sprintf( + paste( + "Invalid design formula: '%s'.", + "Formula must start with '~' and contain factors separated by '+'.", + "Example: '~ disease + treatment'", + sep = "\n" + ), + design_formula + ) + ) + } + design_factors <- all.vars(as.formula(design_formula)) + cat("Design formula:", design_formula, "\n") + cat("Extracted factors:", paste(design_factors, collapse = ", "), "\n") + design_factors +} + +# Validate and prepare contrast specifications +prepare_contrast_matrix <- function( + design_factors, contrast_column, metadata) { + # Validate required columns exist + required_columns <- unique(c(design_factors, contrast_column)) + missing_columns <- setdiff(required_columns, colnames(metadata)) + + if (length(missing_columns) > 0) { + stop(sprintf( + paste( + "Missing required columns in metadata: %s\n", + "Available metadata columns: %s" + ), + paste(missing_columns, collapse = ", "), + paste(colnames(metadata), collapse = ", ") + )) + } + + # Check contrast values exist + contrast_values <- par$contrast_values + available_values <- unique(metadata[[contrast_column]]) + missing_values <- setdiff(contrast_values, available_values) + + if (length(missing_values) > 0) { + stop(sprintf( + paste("Contrast values %s not found in %s.", + "Available values: %s", + sep = "\n" + ), + paste(missing_values, collapse = ", "), + contrast_column, + paste(available_values, collapse = ", ") + )) + } + + # Handle different contrast scenarios + if (length(contrast_values) == 2) { + # Pairwise comparison + comparison_group <- contrast_values[1] + control_group <- contrast_values[2] + contrast_spec <- c(contrast_column, comparison_group, control_group) + cat( + "Performing pairwise contrast:", contrast_column, + comparison_group, "vs", control_group, "\n" + ) + contrast_spec + } else if (length(contrast_values) > 2) { + # Multiple comparisons against first value (control_group) + control_group <- contrast_values[1] + contrast_specs <- list() + for (i in 2:length(contrast_values)) { + comparison_group <- contrast_values[i] + contrast_specs[[length(contrast_specs) + 1]] <- + c(contrast_column, comparison_group, control_group) + } + cat( + "Performing multiple contrasts against control_group '", + control_group, "':", + paste(sapply(contrast_specs, function(x) x[2]), collapse = ", "), "\n" + ) + contrast_specs + } else { + stop(sprintf( + "Need at least 2 values for contrast, got: %s", + paste(contrast_values, collapse = ", ") + )) + } +} + +# Convert expression matrix to counts data frame +prepare_counts_matrix <- function(layer, var_names, obs_names) { + counts <- if (is(layer, "sparseMatrix") || is(layer, "dgCMatrix")) { + as.matrix(layer) + } else { + layer + } + + # Create properly named data frame (transpose for DESeq2 format) + counts_df <- data.frame(counts) + colnames(counts_df) <- var_names + rownames(counts_df) <- obs_names + + # Ensure integer counts (required for DESeq2) + counts_df[] <- lapply(counts_df, function(x) as.integer(round(x))) + counts_df +} + +# Create and configure DESeq2 dataset +create_deseq2_dataset <- function(counts, metadata, design_formula) { + cat("Creating DESeq2 dataset\n") + + # Ensure matching samples between counts and metadata + common_samples <- intersect(rownames(counts), rownames(metadata)) + if (length(common_samples) == 0) { + stop("No common samples found between counts and metadata") + } + + counts <- counts[common_samples, , drop = FALSE] + metadata <- metadata[common_samples, , drop = FALSE] + + # Create DESeqDataSet (transpose for gene x sample format) + dds <- DESeq2::DESeqDataSetFromMatrix( + countData = t(counts), + colData = metadata, + design = as.formula(design_formula) + ) + dds +} + +# Perform DESeq2 differential expression analysis +deseq2_analysis <- function(dds, contrast_specs) { + cat("Running DESeq2 analysis\n") + + dds <- DESeq2::DESeq(dds) + + # Ensure contrast_specs is a list + if (!is.list(contrast_specs)) { + contrast_specs <- list(contrast_specs) + } + + all_results <- lapply(seq_along(contrast_specs), function(i) { + contrast_spec <- contrast_specs[[i]] + cat( + "Performing statistical test for contrast:", + paste(contrast_spec, collapse = " "), "\n" + ) + + # Get DESeq2 results for this contrast + res <- DESeq2::results( + dds, + contrast = contrast_spec, alpha = par$p_adj_threshold + ) + + # Convert to data frame and add metadata + results_df <- as.data.frame(res) + results_df$gene_id <- rownames(results_df) + results_df$contrast <- paste0(contrast_spec[2], "_vs_", contrast_spec[3]) + results_df$comparison_group <- contrast_spec[2] + results_df$control_group <- contrast_spec[3] + results_df$abs_log2FoldChange <- abs(results_df$log2FoldChange) + results_df$significant <- ( + results_df$padj < par$p_adj_threshold & + !is.na(results_df$padj) & + abs(results_df$log2FoldChange) > par$log2fc_threshold + ) + + # Sort by effect size + results_df[order(results_df$log2FoldChange, decreasing = TRUE), ] + }) + + # Combine all results + combined_results <- do.call(rbind, all_results) + + # Log summary statistics + for (i in seq_along(contrast_specs)) { + contrast_spec <- contrast_specs[[i]] + contrast_name <- paste0(contrast_spec[2], "_vs_", contrast_spec[3]) + contrast_subset <- combined_results[ + combined_results$contrast == contrast_name, + ] + n_significant <- sum(contrast_subset$significant, na.rm = TRUE) + cat("Contrast", contrast_name, ":", n_significant, "significant genes\n") + } + + combined_results +} + +# Save results and print summary statistics +save_results_and_log_summary <- function( + results, output_file, cell_group = NULL) { + group_text <- if (!is.null(cell_group)) paste(" for", cell_group) else "" + cat("Saving results", group_text, "to", output_file, "\n") + + write.csv(results, output_file, row.names = FALSE) + + # Calculate summary statistics + sig_results <- results[results$significant & !is.na(results$significant), ] + upregulated <- sig_results[sig_results$log2FoldChange > 0, ] + downregulated <- sig_results[sig_results$log2FoldChange < 0, ] + + cat("Summary", group_text, ":\n") + cat(" Total genes analyzed:", nrow(results), "\n") + cat(" Significant upregulated:", nrow(upregulated), "\n") + cat(" Significant downregulated:", nrow(downregulated), "\n") +} + +# Main analysis workflow +main <- function() { + cat("Loading pseudobulk data from", par$input, "\n") + + # Load and prepare data + h5ad_path <- h5mu_to_h5ad(par$input, par$modality) + mod <- anndataR::read_h5ad(h5ad_path, as = "InMemoryAnnData") + metadata <- as.data.frame(mod$obs) + + # Get expression matrix + layer <- if (!is.null(par$input_layer)) { + mod$layers[[par$input_layer]] + } else { + mod$X + } + + if (is_normalized(layer)) { + stop("Input layer must contain raw counts.") + } + + # Prepare analysis components + cat("Preparing design formula\n") + design_factors <- parse_design_formula(par$design_formula) + + cat("Preparing contrast matrix\n") + contrast_specs <- prepare_contrast_matrix( + design_factors, par$contrast_column, metadata + ) + + cat("Preparing counts matrix for DESeq2\n") + var_names <- if (!is.null(par$var_gene_names)) { + if (par$var_gene_names %in% colnames(mod$var)) { + mod$var[[par$var_gene_names]] + } else { + stop( + sprintf( + "var_gene_names '%s' not found in mod$var columns: %s", + par$var_gene_names, + paste(colnames(mod$var), collapse = ", ") + ) + ) + } + } else { + mod$var_names + } + obs_names <- mod$obs_names + counts <- prepare_counts_matrix(layer, var_names, obs_names) + + # Ensure output directory exists + if (!dir.exists(par$output_dir)) { + dir.create(par$output_dir, recursive = TRUE) + } + + # Run analysis (per cell group or overall) + tryCatch( + { + if ( + !is.null(par$obs_cell_group) && + par$obs_cell_group %in% colnames(metadata) + ) { + run_per_cell_group_analysis(counts, metadata, contrast_specs) + } else { + run_overall_analysis(counts, metadata, contrast_specs) + } + cat("DESeq2 analysis completed successfully\n") + }, + error = function(e) { + cat("Error in analysis. Check input data and parameters:\n") + cat("Contrast column:", par$contrast_column, "\n") + cat("Contrast values:", paste(par$contrast_values, collapse = ", "), "\n") + cat("Number of samples:", nrow(metadata), "\n") + cat("Number of genes:", ncol(counts), "\n") + stop(e) + } + ) +} + +# Run analysis per cell group +run_per_cell_group_analysis <- function(counts, metadata, contrast_specs) { + cat("Running DESeq2 analysis per cell group\n") + + # Remove cell group from design formula + design_no_celltype <- gsub( + paste0("\\+\\s*", par$obs_cell_group), "", par$design_formula + ) + design_no_celltype <- gsub( + paste0(par$obs_cell_group, "\\s*\\+"), "", design_no_celltype + ) + design_no_celltype <- gsub("\\s+", " ", design_no_celltype) + + cell_groups <- unique(metadata[[par$obs_cell_group]]) + + for (cell_group in cell_groups) { + cat("Processing cell group:", cell_group, "\n") + + # Subset data + cell_mask <- metadata[[par$obs_cell_group]] == cell_group + counts_subset <- counts[cell_mask, , drop = FALSE] + metadata_subset <- metadata[cell_mask, , drop = FALSE] + + # Skip if insufficient samples + if (nrow(counts_subset) < 2) { + cat("Skipping cell group", cell_group, "- too few samples\n") + next + } + + # Run analysis + dds <- create_deseq2_dataset( + counts_subset, metadata_subset, design_no_celltype + ) + results <- deseq2_analysis(dds, contrast_specs) + results[[par$obs_cell_group]] <- cell_group + + # Save results + safe_name <- gsub("[/ \\(\\)]", "_", as.character(cell_group)) + safe_name <- gsub("_+", "_", safe_name) + output_file <- file.path( + par$output_dir, + paste0(par$output_prefix, "_", safe_name, ".csv") + ) + save_results_and_log_summary(results, output_file, cell_group) + } +} + +# Run overall analysis (all samples together) +run_overall_analysis <- function(counts, metadata, contrast_specs) { + dds <- create_deseq2_dataset(counts, metadata, par$design_formula) + results <- deseq2_analysis(dds, contrast_specs) + + output_file <- file.path( + par$output_dir, + paste0(par$output_prefix, ".csv") + ) + save_results_and_log_summary(results, output_file) +} + +# Run main function if script is executed directly +if (!interactive()) { + main() +} diff --git a/src/differential_expression/deseq2/test.py b/src/differential_expression/deseq2/test.py new file mode 100644 index 00000000000..63a05f0f4e6 --- /dev/null +++ b/src/differential_expression/deseq2/test.py @@ -0,0 +1,405 @@ +import subprocess +import sys +import pytest +import pandas as pd +import re + + +## VIASH START +meta = {"resources_dir": "resources_test/"} +## VIASH END + +sys.path.append(meta["resources_dir"]) + + +@pytest.fixture +def pseudobulk_test_data_path(): + """Path to the pseudobulk test data""" + return f"{meta['resources_dir']}/TS_Blood_filtered_pseudobulk.h5mu" + + +def test_simple_deseq2_execution(run_component, tmp_path, pseudobulk_test_data_path): + """Test basic DESeq2 execution with minimal parameters""" + output_dir = tmp_path / "deseq2_output" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--design_formula", + "~ treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_dir.exists(), "Output directory does not exist" + + # Assert one CSV file was found + csv_files = list(output_dir.glob("deseq2_analysis*.csv")) # Default prefix + assert len(csv_files) == 1, "Should find exactly one CSV file in output folder" + output_file = output_dir / "deseq2_analysis.csv" # Default prefix + assert output_file.exists(), "Output CSV file does not exist" + + # Check the output file + results = pd.read_csv(output_file) + expected_columns = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "significant", + "gene_id", + "contrast", + "comparison_group", + "control_group", + "abs_log2FoldChange", + ] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found" + ) + + expected_float_cols = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "abs_log2FoldChange", + ] + float_cols = results.select_dtypes(include=["float"]).columns.tolist() + assert all(col in float_cols for col in expected_float_cols), ( + f"Expected float columns {expected_float_cols} not found" + ) + + expected_obj_cols = ["gene_id", "contrast", "comparison_group", "control_group"] + obj_cols = results.select_dtypes(include=["object"]).columns.tolist() + assert all(col in obj_cols for col in expected_obj_cols), ( + f"Expected object columns {expected_obj_cols} not found" + ) + + +def test_simple_deseq2_with_cell_group( + run_component, tmp_path, pseudobulk_test_data_path +): + """Test DESeq2 execution with cell groups - should create separate files""" + output_dir = tmp_path / "deseq2_output" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--obs_cell_group", + "cell_type", + "--design_formula", + "~ treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_dir.exists(), "Output directory does not exist" + + # Check that multiple CSV files exist (one per cell group) + csv_files = list(output_dir.glob("deseq2_analysis_*.csv")) # Default prefix + assert len(csv_files) > 0, "No cell group-specific CSV files found" + + # Check each file has expected structure + for csv_file in csv_files: + results = pd.read_csv(csv_file) + expected_columns = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "significant", + "gene_id", + "contrast", + "comparison_group", + "control_group", + "abs_log2FoldChange", + ] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found in {csv_file}" + ) + assert len(results) > 0, f"No results found in {csv_file}" + + # Check that all rows have the same cell_type value + assert results["cell_type"].nunique() == 1, ( + f"Multiple cell types found in {csv_file}" + ) + + expected_float_cols = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "abs_log2FoldChange", + ] + float_cols = results.select_dtypes(include=["float"]).columns.tolist() + assert all(col in float_cols for col in expected_float_cols), ( + f"Expected float columns {expected_float_cols} not found" + ) + + expected_obj_cols = ["gene_id", "contrast", "comparison_group", "control_group"] + obj_cols = results.select_dtypes(include=["object"]).columns.tolist() + assert all(col in obj_cols for col in expected_obj_cols), ( + f"Expected object columns {expected_obj_cols} not found" + ) + + +def test_complex_design_formula(run_component, tmp_path, pseudobulk_test_data_path): + """Test DESeq2 with complex design formula accounting for multiple factors""" + output_dir = tmp_path / "deseq2_output" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--design_formula", + "~ disease + treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + "--padj_threshold", + "0.1", + "--log2fc_threshold", + "0.5", + ] + ) + + assert output_dir.exists(), "Output directory does not exist" + output_file = output_dir / "deseq2_analysis.csv" # Default prefix + assert output_file.exists(), "Output CSV file does not exist" + + results = pd.read_csv(output_file) + assert "significant" in results.columns, "Significance column not found" + assert len(results) > 0, "No results found for complex design" + + +def test_complex_design_formula_with_cell_groups( + run_component, tmp_path, pseudobulk_test_data_path +): + """Test that without cell group specified, a single CSV is created in output directory""" + output_dir = tmp_path / "deseq2_output" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--design_formula", + "~ treatment + disease", + "--obs_cell_group", + "cell_type", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_dir.exists(), "Output directory does not exist" + + # Check that cell group specific files exist + csv_files = list(output_dir.glob("deseq2_analysis_*.csv")) # Default prefix + assert len(csv_files) >= 1, "Could not find cell group-specific files" + + # Check the main file structure + for csv_file in csv_files: + results = pd.read_csv(csv_file) + expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found in {csv_file}" + ) + assert len(results) > 0, f"No results found in {csv_file}" + assert results["cell_type"].nunique() == 1, ( + f"Multiple cell types found in {csv_file} - should be one per file" + ) + + +def test_invalid_contrast_column(run_component, tmp_path, pseudobulk_test_data_path): + """Test that invalid contrast column raises appropriate error""" + output_dir = tmp_path / "deseq2_output" + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--design_formula", + "~ treatment", + "--contrast_column", + "nonexistent_column", + "--contrast_values", + "group1", + "--contrast_values", + "group2", + ] + ) + + # Updated regex to match actual R error format (no square brackets) + assert re.search( + r"Missing required columns in metadata: nonexistent_column", + err.value.stdout.decode("utf-8"), + ), f"Expected error message not found: {err.value.stdout.decode('utf-8')}" + + +def test_invalid_design_column(run_component, tmp_path, pseudobulk_test_data_path): + """Test that invalid contrast column raises appropriate error""" + output_dir = tmp_path / "deseq2_output" + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--design_formula", + "malformed formula", + "--contrast_column", + "treatment", + "--contrast_values", + "ctrl", + "--contrast_values", + "stim", + ] + ) + + # Updated regex to match actual R error format (no square brackets) + assert re.search( + r"Invalid design formula: 'malformed formula'", + err.value.stdout.decode("utf-8"), + ), f"Expected error message not found: {err.value.stdout.decode('utf-8')}" + + +def test_custom_output_prefix(run_component, tmp_path, pseudobulk_test_data_path): + """Test custom output prefix functionality""" + output_dir = tmp_path / "deseq2_output" + custom_prefix = "my_custom_analysis" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--output_prefix", + custom_prefix, + "--design_formula", + "~ treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_dir.exists(), "Output directory does not exist" + + # Should have exactly one CSV file with custom prefix + output_file = output_dir / f"{custom_prefix}.csv" + assert output_file.exists(), ( + f"Custom prefix output CSV file {output_file} does not exist" + ) + + # Check the main file structure + results = pd.read_csv(output_file) + expected_columns = ["log2FoldChange", "pvalue", "padj", "significant"] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found" + ) + assert len(results) > 0, "No results found in output" + + +def test_custom_output_prefix_with_cell_groups( + run_component, tmp_path, pseudobulk_test_data_path +): + """Test custom output prefix with cell groups""" + output_dir = tmp_path / "deseq2_output" + custom_prefix = "celltype_analysis" + + run_component( + [ + "--input", + pseudobulk_test_data_path, + "--output_dir", + str(output_dir), + "--output_prefix", + custom_prefix, + "--obs_cell_group", + "cell_type", + "--design_formula", + "~ treatment", + "--contrast_column", + "treatment", + "--contrast_values", + "stim", + "--contrast_values", + "ctrl", + ] + ) + + assert output_dir.exists(), "Output directory does not exist" + + # Check that multiple CSV files exist with custom prefix + csv_files = list(output_dir.glob(f"{custom_prefix}_*.csv")) + assert len(csv_files) > 0, ( + f"No cell group-specific CSV files found with prefix {custom_prefix}" + ) + + # Check each file has expected structure + for csv_file in csv_files: + results = pd.read_csv(csv_file) + expected_columns = [ + "log2FoldChange", + "pvalue", + "padj", + "significant", + "cell_type", + ] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found in {csv_file}" + ) + assert len(results) > 0, f"No results found in {csv_file}" + + # Check that all rows have the same cell_type value + assert results["cell_type"].nunique() == 1, ( + f"Multiple cell types found in {csv_file}" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml new file mode 100644 index 00000000000..256824bdadd --- /dev/null +++ b/src/workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -0,0 +1,157 @@ +name: pseudobulk_deseq2 +namespace: workflows/differential_expression +scope: public +description: | + Performs pseudobulk generation and subsequent differential expression analysis using DESeq2. +authors: + - __merge__: /src/authors/jakub_majercik.yaml + roles: [ author ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + required: true + description: Input h5mu file. + example: /path/to/input_file.h5mu + - name: --modality + type: string + default: rna + description: Which modality from the input MuData file to process. + - name: --input_layer + type: string + required: false + description: Input layer to use. If None, X is used. This layer must contain raw counts. + - name: --obs_cell_group + type: string + required: true + description: | + .obs field containing the variable to group on. Typically this field contains cell groups such as annotated cell types or clusters. + If provided, will generate DESeq2 analysis per cell group. + example: cell_type + - name: --obs_groups + type: string + multiple: true + description: .obs fields containing the experimental condition(s) to create pseudobulk samples for. + example: ["treatment", "disease"] + - name: "--var_gene_names" + type: string + description: | + Name of the .var field that contains gene symbols. If not provided, .var.index will be used. + required: false + example: "gene_symbol" + + - name: Pseudobulk Options + arguments: + - name: --aggregation_method + type: string + choices: ["sum", "mean"] + default: sum + description: Method to aggregate the raw counts for pseudoreplicates. Either sum or mean. + - name: --random_state + type: integer + min: 0 + default: 0 + description: The random seed for sampling. + + - name: Filtering options + arguments: + - name: --min_obs_per_sample + type: integer + min: 1 + default: 30 + description: Minimum number of observations per pseudobulk sample. + - name: --filter_genes_min_samples + type: integer + min: 1 + description: | + Minimum number of samples a gene must be expressed in to be included in the analysis. + If None, no filtering is applied. + - name: --filter_gene_patterns + type: string + multiple: true + example: ["MIR\\d+", "AL\\d+", "LINC\\d+", "AC\\d+", "AP\\d+"] + description: | + List of regex patterns to filter out genes. + Genes matching any of these patterns will be excluded from the analysis. + required: false + + - name: DGEA options + arguments: + - name: --design_formula + type: string + required: true + description: | + Design formula for DESeq2 analysis in R-style format. + Specifies the statistical model to account for various factors. + example: "~ cell_type + disease + treatment" + - name: --contrast_column + type: string + required: true + description: | + Column in the metadata to use for the contrast. + This column should contain the conditions to compare. + example: "treatment" + - name: --contrast_values + type: string + multiple: true + required: true + description: | + Values to compare in the contrast column. + First value is baseline, following values are different treatments. + example: ["ctrl", "stim"] + - name: --p_adj_threshold + type: double + default: 0.05 + description: | + Adjusted p-value threshold for significance. + Genes with adjusted p-values below this threshold will be considered significant. + required: false + - name: --log2fc_threshold + type: double + default: 0.0 + description: | + Log2 fold change threshold for significance. + Genes with absolute log2 fold change above this threshold will be considered significant. + required: false + + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + description: | + Output directory for DESeq2 results, containing one CSV file per cell group (as defined by `--obs_cell_group`) + direction: output + required: true + - name: "--output_prefix" + type: string + description: | + Prefix for output CSV files: files will be named "{prefix}_{cell_group}.csv" + default: "deseq2_analysis" + required: false + +dependencies: + - name: differential_expression/create_pseudobulk + - name: differential_expression/deseq2 + - name: filter/filter_with_counts + - name: filter/filter_with_pattern + - name: filter/delimit_counts + - name: filter/do_filter + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + +runners: + - type: nextflow \ No newline at end of file diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/integration_test.sh b/src/workflows/differential_expression/pseudobulk_deseq2/integration_test.sh new file mode 100755 index 00000000000..c712b294512 --- /dev/null +++ b/src/workflows/differential_expression/pseudobulk_deseq2/integration_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/differential_expression/pseudobulk_deseq2/test.nf \ + -entry test_wf \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config \ diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/main.nf b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf new file mode 100644 index 00000000000..a9fec8dc6d9 --- /dev/null +++ b/src/workflows/differential_expression/pseudobulk_deseq2/main.nf @@ -0,0 +1,94 @@ +workflow run_wf { + take: + input_ch + + main: + + output_ch = input_ch + | create_pseudobulk.run( + fromState: [ + input: "input", + modality: "modality", + input_layer: "input_layer", + obs_label: "obs_cell_group", + obs_groups: "obs_groups", + aggregation_method: "aggregation_method", + random_state: "random_state" + ], + args:[ + obs_cell_count: "n_cells" + ], + toState: [ "input": "output" ] + ) + + | delimit_counts.run( + fromState: [ + input: "input", + modality: "modality", + min_counts: "min_obs_per_sample", + + ], + args: [ + do_subset: "True", + obs_count_column: "n_cells", + obs_name_filter: "delimit_samples_per_pseudobulk", + ], + toState: [ "input": "output" ] + ) + + | filter_with_pattern.run( + runIf: { id, state -> + state.filter_gene_patterns + }, + fromState: [ + input: "input", + modality: "modality", + var_gene_names: "var_gene_names", + pattern: "filter_gene_patterns", + min_counts: "min_obs_per_sample", + + ], + args: [ + do_subset: "True", + var_name_filter: "filter_with_gene_pattern" + ], + toState: [ "input": "output" ] + ) + + | filter_with_counts.run( + fromState: [ + input: "input", + modality: "modality", + layer: "input_layer", + min_cells_per_gene: "filter_genes_min_samples", + ], + args: [ + do_subset: "True", + var_name_filter: "filter_with_counts" + ], + toState: [ "input": "output" ] + ) + + | deseq2.run( + fromState: [ + input: "input", + modality: "modality", + input_layer: "input_layer", + var_gene_names: "var_gene_names", + obs_cell_group: "obs_cell_group", + design_formula: "design_formula", + contrast_column: "contrast_column", + contrast_values: "contrast_values", + p_adj_threshold: "p_adj_threshold", + log2fc_threshold: "log2fc_threshold", + output_dir: "output", + output_prefix: "output_prefix" + ], + toState: [ "output": "output_dir" ] + ) + + | setState([ "output" ]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/nextflow.config b/src/workflows/differential_expression/pseudobulk_deseq2/nextflow.config new file mode 100644 index 00000000000..059100c489c --- /dev/null +++ b/src/workflows/differential_expression/pseudobulk_deseq2/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/differential_expression/pseudobulk_deseq2/test.nf b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf new file mode 100644 index 00000000000..565682f58fd --- /dev/null +++ b/src/workflows/differential_expression/pseudobulk_deseq2/test.nf @@ -0,0 +1,86 @@ +nextflow.enable.dsl=2 + +include { pseudobulk_deseq2 } from params.rootDir + "/target/nextflow/workflows/differential_expression/pseudobulk_deseq2/main.nf" +include { pseudobulk_deseq2_test } from params.rootDir + "/target/_test/nextflow/test_workflows/differential_expression/pseudobulk_deseq2_test/main.nf" + +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + // allow changing the resources_test dir + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "simple_execution_test", + input: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + obs_cell_group: "cell_type", + obs_groups: ["treatment", "disease"], + min_obs_per_sample: 5, + design_formula: "~ treatment", + contrast_column: "treatment", + contrast_values: ["ctrl", "stim"], + output: "simple_execution_test_output" + ] + ]) + | map{ state -> [state.id, state] } + | pseudobulk_deseq2 + + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") : "Output ID should be same as input ID" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'. found: ${state}" + assert state.output.isDirectory() : "'output' should be a directory." + def deseqFiles = state.output.listFiles().findAll { file -> + file.name.matches(/deseq2_analysis_.*\.csv/) + } + assert deseqFiles.size() == 4, "Output directory should contain exactly 4 deseq2_analysis_*.csv files. Found: ${deseqFiles.size()} files: ${deseqFiles.collect { it.name }}" + + "Output: $output" + } + + | flatMap { output-> + def id = output[0] + def state = output[1] + + // Find all CSV files and create separate entries + def deseqFiles = state.output.listFiles().findAll { file -> + file.name.matches(/deseq2_analysis_.*\.csv/) + } + deseqFiles.collect { csvFile -> + def cellType = csvFile.name.replaceAll(/deseq2_analysis_(.*)\.csv/, '$1') + def newId = "${id}_${cellType}" + [newId, ["input": csvFile]] + } + } + + | view { output -> "After flatMap: $output" } + | pseudobulk_deseq2_test.run( + fromState: [ + "input": "input" + ] + ) + + output_ch + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 4 : "output channel should contain 4 events" + + def sortedIds = output_list.collect{it[0]}.sort() + def expectedIds = [ + "simple_execution_test_classical_monocyte", + "simple_execution_test_erythrocyte", + "simple_execution_test_neutrophil", + "simple_execution_test_plasma_cell" + ] + + assert sortedIds == expectedIds : "IDs don't match. Expected: ${expectedIds}, found: ${sortedIds}" + } +} \ No newline at end of file diff --git a/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml b/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml new file mode 100644 index 00000000000..55be62a1379 --- /dev/null +++ b/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/config.vsh.yaml @@ -0,0 +1,25 @@ +name: "pseudobulk_deseq2_test" +namespace: "test_workflows/differential_expression" +scope: "test" +description: "This component tests the output of the pseudobulk_deseq2 differential expression analysis workflow." +authors: + - __merge__: /src/authors/dorien_roosen.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu + +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: python:3.13-slim + __merge__: /src/base/requirements/testworkflows_setup.yaml +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/script.py b/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/script.py new file mode 100644 index 00000000000..297bac204c7 --- /dev/null +++ b/src/workflows/test_workflows/differential_expression/pseudobulk_deseq2/script.py @@ -0,0 +1,59 @@ +import pandas as pd +import sys +import pytest + +## VIASH START +par = {"input": "test/deseq_analysis_classical_monocyte.csv"} +## VIASH END + + +def test_run(): + results = pd.read_csv(par["input"]) + + # Make sure expected columns are present + expected_columns = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "significant", + "gene_id", + "contrast", + "comparison_group", + "control_group", + "abs_log2FoldChange", + ] + assert all(col in results.columns for col in expected_columns), ( + f"Expected columns {expected_columns} not found" + ) + + # Make sure columns have expected dtypes + assert results["significant"].dtypes == "bool", ( + "Expected 'significant' column to be of type bool" + ) + + expected_float_cols = [ + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj", + "abs_log2FoldChange", + ] + float_cols = results.select_dtypes(include=["float"]).columns.tolist() + assert all(col in float_cols for col in expected_float_cols), ( + f"Expected float columns {expected_float_cols} not found" + ) + + expected_obj_cols = ["gene_id", "contrast", "comparison_group", "control_group"] + obj_cols = results.select_dtypes(include=["object"]).columns.tolist() + assert all(col in obj_cols for col in expected_obj_cols), ( + f"Expected object columns {expected_obj_cols} not found" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "--import-mode=importlib"]))