Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@

* `from_h5mu_or_h5ad_to_seurat`: converts an h5ad file or a single modality from an h5mu file to a seurat object (PR #1046).

* (Experimental) `differential_expression/create_pseudobulk`: Added a component to generate pseudobulk samples from single-cell transcriptomics data,
to create bulk-like expression profiles suitable for differential expression analysis with methods designed for bulk differential expression analysis.
Warning: the functionality in this component is experimental and its behavior may change in future releases (PR #1042).


## MAJOR CHANGES

* `mapping/cellranger_*`: Upgrade CellRanger to v9.0 (PR #992 and #1006).
Expand Down
28 changes: 26 additions & 2 deletions resources_test_scripts/annotation_test_data.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -34,24 +34,38 @@ wget "https://zenodo.org/record/7580707/files/pretrained_models_Blood_ts.tar.gz?
# Process Tabula Sapiens Blood reference h5ad
# (Select one individual and 100 cells per cell type)
# normalize and log1p transform data
# Add treatment and disease columns
python <<HEREDOC
import anndata as ad
import scanpy as sc
import numpy as np

# Read in data
ref_adata = ad.read_h5ad("${OUT}/tmp_TS_Blood_filtered.h5ad")
sub_ref_adata = ref_adata[ref_adata.obs["donor_assay"] == "TSP14_10x 3' v3"]
n=100
s=sub_ref_adata.obs.groupby('cell_ontology_class').cell_ontology_class.transform('count')
sub_ref_adata_final = sub_ref_adata[sub_ref_adata.obs[s>=n].groupby('cell_ontology_class').head(n).index]
# assert sub_ref_adata_final.shape == (500, 58870)

# Normalize and log1p transform data
data_for_scanpy = ad.AnnData(X=sub_ref_adata_final.X)
sc.pp.normalize_total(data_for_scanpy, target_sum=10000)
sc.pp.log1p(
data_for_scanpy,
base=None,
layer=None,
copy=False,
)
)
sub_ref_adata_final.layers["log_normalized"] = data_for_scanpy.X

# Add treatment and disease columns
n_cells = sub_ref_adata_final.n_obs
treatment = np.random.choice(["ctrl", "stim"], size=n_cells, p=[0.5, 0.5])
disease = np.random.choice(["healthy", "diseased"], size=n_cells, p=[0.5, 0.5])
sub_ref_adata_final.obs["treatment"] = treatment
sub_ref_adata_final.obs["disease"] = disease

# Write out data
sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip')
HEREDOC

Expand Down Expand Up @@ -115,3 +129,13 @@ viash run src/integrate/scanvi/config.vsh.yaml --engine docker -- \
rm "${OUT}/scanvi_output.h5mu"
rm "${OUT}/scvi_output.h5mu"
rm -r "${OUT}/Pretrained_model/"

echo "> Creating Pseudobulk Data for DGEA"
viash run src/differential_expression/create_pseudobulk/config.vsh.yaml --engine docker -- \
--input "${OUT}/TS_Blood_filtered.h5mu" \
--obs_grouping "cell_type" \
--obs_sample_conditions "donor_id" \
--obs_sample_conditions "treatment" \
--obs_sample_conditions "disease" \
--min_num_cells_per_sample 5 \
--output "${OUT}/TS_Blood_filtered_pseudobulk.h5mu"
111 changes: 111 additions & 0 deletions src/differential_expression/create_pseudobulk/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
name: differential_expression
namespace: "create_pseudobulk"
scope: "public"
description: |
Generation of pseudobulk samples from single-cell transcriptomics data,
by aggregating raw gene expression counts from individual cells to create
bulk-like expression profiles suitable for differential expression analysis
with methods designed for bulk differential expression analysis.
Note that this componentonly considers factors as explanatory variables,
and excludes covariates from the analysis.

authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ author ]
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ author ]
- __merge__: /src/authors/dries_de_maeyer.yaml
roles: [ contributor ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]

argument_groups:
- name: Inputs
arguments:
- name: "--input"
alternatives: ["-i"]
type: file
description: Input h5mu file.
direction: input
required: true
- name: "--modality"
description: |
Which modality from the input MuData file to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
required: false
description: "Input layer to use. If None, X is used. This layer must contain raw counts."
- name: "--obs_label"
type: string
required: true
description: ".obs field containing the variable to group on. Typically this field contains cell groups such as annotated cell types or clusters."
- name: "--obs_groups"
type: string
multiple: true
description: ".obs fields containing the experimental condition(s) to create pseudobulk samples for."

- name: Arguments
arguments:
- name: "--aggregation_method"
type: string
choices: ["sum", "mean"]
default: "sum"
description: "Method to aggregate the raw counts for pseudoreplicates. Either sum or mean."
- name: "--min_obs_per_sample"
type: integer
min: 1
default: 30
description: "Minimum number of cells per pseudobulk sample."
- name: --random_state
type: integer
description: |
The random seed for sampling.
min: 0
default: 0
required: false
- name: "--obs_cell_count"
type: string
default: "n_cells"
description: "The .obs field to store the number of observations per pseudobulk sample."

- name: Outputs
arguments:
- name: "--output"
type: file
description: Output h5mu file containing the aggregated pseudobulk samples.
direction: output
required: true
- name: "--output_compression"
type: string
description: |
The compression format to be used on the output h5mu object.
choices: ["gzip", "lzf"]
required: false
example: "gzip"

resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py

test_resources:
- type: python_script
path: test.py
- path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu

engines:
- type: docker
image: python:3.12
setup:
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]

runners:
- type: executable
- type: nextflow
directives:
label: [lowcpu, lowmem]
107 changes: 107 additions & 0 deletions src/differential_expression/create_pseudobulk/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import numpy as np
import mudata as mu
import pandas as pd
import sys
import scanpy as sc
import scipy.sparse as sp

## VIASH START
par = {
"input": "resources_test/annotation_test_data/TS_Blood_filtered.h5mu",
"modality": "rna",
"input_layer": None,
"obs_label": "cell_type",
"obs_groups": ["treatment", "donor_id", "disease"],
"obs_cell_count": "n_cells",
"min_obs_per_sample": 5,
"random_state": 0,
"output": "test.h5mu",
"output_compression": "gzip",
}
meta = {"resources_dir": "src/utils"}
## VIASH END

sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger

logger = setup_logger()


def is_normalized(layer):
if sp.issparse(layer):
row_sums = np.array(layer.sum(axis=1)).flatten()
else:
row_sums = layer.sum(axis=1)

return np.allclose(row_sums, 1)


def count_obs(adata, pb_adata, obs_cols):
counts = []
for i in range(pb_adata.n_obs):
values = pb_adata.obs.iloc[i][obs_cols]

mask = pd.Series([True] * adata.n_obs)
for col in obs_cols:
mask &= (adata.obs[col] == values[col]).values

count = mask.sum()
counts.append(count)

return counts


def main():
# Read in data
logger.info(f"Reading input data {par['input']}...")
adata = mu.read_h5ad(par["input"], mod=par["modality"]).copy()

# Make sure .X contains raw counts
if par["input_layer"]:
adata.X = adata.layers[par["input_layer"]]
if is_normalized(adata.X):
raise ValueError("Input layer must contain raw counts.")

# Sanitize pseudobulk aggregation fields
pseudobulk_cols = [par["obs_label"]]
if par["obs_groups"]:
pseudobulk_cols += par["obs_groups"]

for col in pseudobulk_cols:
if col not in adata.obs.columns:
raise ValueError(f"Required column '{col}' not found in .obs.")
adata.obs[col] = (
adata.obs[col]
.astype(str)
.str.replace(" ", "_")
.str.replace("+", "")
.astype("category")
)

# Aggregate pseudobulk data per cell group
logger.info("Creating pseudobulk samples...")
adata_pb = sc.get.aggregate(adata, by=pseudobulk_cols, func="sum", axis=0)
adata_pb.X = adata_pb.layers["sum"]
del adata_pb.layers["sum"]

adata_pb.obs_names = [
"_".join(values)
for values in zip(*[adata_pb.obs[col] for col in pseudobulk_cols])
]

# Filter pseudobulk samples based on minimum observation count
logger.info("Filtering pseudobulk samples based on minimum observation count...")
adata_pb.obs[par["obs_cell_count"]] = count_obs(adata, adata_pb, pseudobulk_cols)
adata_pb = adata_pb[adata_pb.obs[par["obs_cell_count"]] > par["min_obs_per_sample"]]

logger.info(
f"Final dataset: {adata_pb.n_obs} pseudobulk samples, {adata_pb.n_vars} genes"
)

logger.info("Writing output data...")
mdata_pb = mu.MuData({"rna": adata_pb})
mdata_pb.write_h5mu(par["output"], compression=par["output_compression"])


if __name__ == "__main__":
main()
Loading
Loading