Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Output 10x counts #160

Merged
merged 18 commits into from
Nov 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## v2.1.1dev -

- Added support to output 10x count files in text format.
kafkasl marked this conversation as resolved.
Show resolved Hide resolved
- Add gene symbols to count matrices

### Fixes

- Autocanceling previous CI runs when new changes are pushed.
Expand Down
34 changes: 0 additions & 34 deletions bin/cellranger_mtx_to_h5ad.py

This file was deleted.

113 changes: 98 additions & 15 deletions bin/mtx_to_h5ad.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,55 +2,138 @@
import scanpy as sc
import pandas as pd
import argparse
import os
from scipy import io
from anndata import AnnData


def mtx_to_adata(
def _10x_h5_to_adata(mtx_h5: str, sample: str):
adata = sc.read_10x_h5(mtx_h5)
adata.var["gene_symbols"] = adata.var_names
adata.var.set_index("gene_ids", inplace=True)
adata.obs["sample"] = sample

# reorder columns for 10x mtx files
adata.var = adata.var[["gene_symbols", "feature_types", "genome"]]

return adata


def _mtx_to_adata(
mtx_file: str,
barcode_file: str,
feature_file: str,
sample: str,
aligner: str,
verbose: bool = False,
):

if verbose:
print("Reading in {}".format(mtx_file))

adata = sc.read_mtx(mtx_file)
if (
aligner == "star"
): # for some reason star matrix comes transposed and doesn't fit when values are appended directly
adata = adata.transpose()

adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values
adata.var_names = pd.read_csv(feature_file, header=None, sep="\t")[0].values
adata.obs["sample"] = sample

return adata


def input_to_adata(
input_data: str,
barcode_file: str,
feature_file: str,
sample: str,
aligner: str,
txp2gene: str,
star_index: str,
verbose: bool = True,
):

if verbose and (txp2gene or star_index):
print("Reading in {}".format(input_data))

if aligner == "cellranger":
adata = _10x_h5_to_adata(input_data, sample)
else:
adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample, aligner)

if verbose and (txp2gene or star_index):
print("Reading in {}".format(txp2gene))

if txp2gene:
t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2])
elif star_index:
t2g = pd.read_table(
f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]
)

if txp2gene or star_index:
t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id")
adata.var["gene_symbol"] = t2g["gene_symbol"]

return adata


def write_counts(
adata: AnnData,
out: str,
verbose: bool = False,
):

pd.DataFrame(adata.obs.index).to_csv(os.path.join(out, "barcodes.tsv"), sep="\t", index=False, header=None)
pd.DataFrame(adata.var).to_csv(os.path.join(out, "features.tsv"), sep="\t", index=True, header=None)
io.mmwrite(os.path.join(out, "matrix.mtx"), adata.X.T, field="integer")

if verbose:
print("Wrote features.tsv, barcodes.tsv, and matrix.mtx files to {}".format(args["out"]))


def dump_versions(task_process):
import pkg_resources

with open("versions.yml", "w") as f:
f.write(f"{task_process}:\n\t")
f.write("\n\t".join([f"{pkg.key}: {pkg.version}" for pkg in pkg_resources.working_set]))


if __name__ == "__main__":

parser = argparse.ArgumentParser(description="Converts mtx output to h5ad.")

parser.add_argument("-m", "--mtx", dest="mtx", help="Path to mtx file.")
parser.add_argument("-i", "--input_data", dest="input_data", help="Path to either mtx or mtx h5 file.")
parser.add_argument("-v", "--verbose", dest="verbose", help="Toggle verbose messages", default=False)
parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.")
parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.")
parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.", nargs="?", const="")
parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.", nargs="?", const="")
parser.add_argument("-s", "--sample", dest="sample", help="Sample name")
parser.add_argument("-o", "--out", dest="out", help="Output path.")
parser.add_argument("-a", "--aligner", dest="aligner", help="Which aligner has been used?")
parser.add_argument("--task_process", dest="task_process", help="Task process name.")
parser.add_argument("--txp2gene", dest="txp2gene", help="Transcript to gene (t2g) file.", nargs="?", const="")
parser.add_argument(
"--star_index", dest="star_index", help="Star index folder containing geneInfo.tab.", nargs="?", const=""
)

args = vars(parser.parse_args())

adata = mtx_to_adata(
args["mtx"],
args["barcode"],
args["feature"],
args["sample"],
args["aligner"],
# create the directory with the sample name
os.makedirs(os.path.dirname(args["out"]), exist_ok=True)

adata = input_to_adata(
input_data=args["input_data"],
barcode_file=args["barcode"],
feature_file=args["feature"],
sample=args["sample"],
aligner=args["aligner"],
txp2gene=args["txp2gene"],
star_index=args["star_index"],
verbose=args["verbose"],
)

write_counts(adata=adata, out=args["sample"], verbose=args["verbose"])

adata.write_h5ad(args["out"], compression="gzip")
kafkasl marked this conversation as resolved.
Show resolved Hide resolved

print("Wrote h5ad file to {}".format(args["out"]))

dump_versions(task_process=args["task_process"])
2 changes: 2 additions & 0 deletions bin/mtx_to_seurat.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ if(aligner %in% c("kallisto", "alevin")) {

seurat.object <- CreateSeuratObject(counts = expression.matrix)

dir.create(basename(dirname(out.file)), showWarnings = FALSE)

saveRDS(seurat.object, file = out.file)


Expand Down
30 changes: 21 additions & 9 deletions modules/local/mtx_to_h5ad.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@ process MTX_TO_H5AD {
// inputs from cellranger nf-core module does not come in a single sample dir
// for each sample, the sub-folders and files come directly in array.
tuple val(meta), path(inputs)
path txp2gene
path star_index

output:
path "*.h5ad", emit: h5ad
path "${meta.id}/*h5ad", emit: h5ad
path "${meta.id}/*", emit: counts
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when
Expand All @@ -40,10 +44,11 @@ process MTX_TO_H5AD {
if (params.aligner == 'cellranger')
"""
# convert file types
cellranger_mtx_to_h5ad.py \\
--mtx filtered_feature_bc_matrix.h5 \\
mtx_to_h5ad.py \\
--aligner ${params.aligner} \\
--input filtered_feature_bc_matrix.h5 \\
--sample ${meta.id} \\
--out ${meta.id}_matrix.h5ad
--out ${meta.id}/${meta.id}_matrix.h5ad
"""

else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard')
Expand All @@ -53,27 +58,34 @@ process MTX_TO_H5AD {
mtx_to_h5ad.py \\
--aligner ${params.aligner} \\
--sample ${meta.id} \\
--mtx *count/counts_unfiltered/\${input_type}.mtx \\
--input *count/counts_unfiltered/\${input_type}.mtx \\
--barcode *count/counts_unfiltered/\${input_type}.barcodes.txt \\
--feature *count/counts_unfiltered/\${input_type}.genes.txt \\
--out ${meta.id}_\${input_type}_matrix.h5ad ;
--txp2gene ${txp2gene} \\
--star_index ${star_index} \\
--out ${meta.id}/${meta.id}_\${input_type}_matrix.h5ad ;
done
"""

else
"""
# convert file types
mtx_to_h5ad.py \\
--task_process ${task.process} \\
--aligner ${params.aligner} \\
--sample ${meta.id} \\
--mtx $mtx_matrix \\
--input $mtx_matrix \\
--barcode $barcodes_tsv \\
--feature $features_tsv \\
--out ${meta.id}_matrix.h5ad
--txp2gene ${txp2gene} \\
--star_index ${star_index} \\
--out ${meta.id}/${meta.id}_matrix.h5ad
"""

stub:
"""
touch ${meta.id}_matrix.h5ad
mkdir ${meta.id}
touch ${meta.id}/${meta.id}_matrix.h5ad
touch versions.yml
"""
}
12 changes: 8 additions & 4 deletions modules/local/mtx_to_seurat.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ process MTX_TO_SEURAT {
tuple val(meta), path(inputs)

output:
path "*.rds", emit: seuratObjects
path "${meta.id}/*.rds", emit: seuratObjects
path "versions.yml", emit: versions

when:
Expand All @@ -38,6 +38,9 @@ process MTX_TO_SEURAT {
barcodes = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz"
features = "*.Solo.out/Gene*/filtered/features.tsv.gz"
}
"""
mkdir ${meta.id}
"""

if (params.aligner == 'kallisto' && params.kb_workflow != 'standard')
"""
Expand All @@ -47,7 +50,7 @@ process MTX_TO_SEURAT {
*count/counts_unfiltered/\${input_type}.mtx \\
*count/counts_unfiltered/\${input_type}.barcodes.txt \\
*count/counts_unfiltered/\${input_type}.genes.txt \\
${meta.id}_\${input_type}_matrix.rds \\
${meta.id}/${meta.id}_\${input_type}_matrix.rds \\
${aligner}
done
"""
Expand All @@ -58,13 +61,14 @@ process MTX_TO_SEURAT {
$matrix \\
$barcodes \\
$features \\
${meta.id}_matrix.rds \\
${meta.id}/${meta.id}_matrix.rds \\
${aligner}
"""

stub:
"""
touch ${meta.id}_matrix.rds
mkdir ${meta.id}
touch ${meta.id}/${meta.id}_matrix.rds
touch versions.yml
"""
}
1 change: 0 additions & 1 deletion subworkflows/local/alevin.nf
Original file line number Diff line number Diff line change
Expand Up @@ -75,5 +75,4 @@ workflow SCRNASEQ_ALEVIN {
alevin_results = SIMPLEAF_QUANT.out.alevin_results
alevinqc = ALEVINQC.out.report
for_multiqc = SIMPLEAF_QUANT.out.alevin_results.collect{it[1]}.ifEmpty([])

}
2 changes: 1 addition & 1 deletion subworkflows/local/align_cellranger.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
include {CELLRANGER_MKGTF} from "../../modules/nf-core/cellranger/mkgtf/main.nf"
include {CELLRANGER_MKREF} from "../../modules/nf-core/cellranger/mkref/main.nf"
include {CELLRANGER_COUNT} from "../../modules/nf-core/cellranger/count/main.nf"
include {MTX_TO_H5AD } from "../../modules/local/mtx_to_h5ad.nf"

// Define workflow to subset and index a genome region fasta file
workflow CELLRANGER_ALIGN {
Expand Down Expand Up @@ -43,4 +42,5 @@ workflow CELLRANGER_ALIGN {
emit:
ch_versions
cellranger_out = CELLRANGER_COUNT.out.outs
star_index = cellranger_index
}
1 change: 1 addition & 0 deletions subworkflows/local/kallisto_bustools.nf
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ workflow KALLISTO_BUSTOOLS {
emit:
ch_versions
counts = KALLISTOBUSTOOLS_COUNT.out.count
txp2gene = txp2gene.collect()


}
13 changes: 10 additions & 3 deletions subworkflows/local/mtx_conversion.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,19 @@ workflow MTX_CONVERSION {
take:
mtx_matrices
samplesheet
txp2gene
star_index

main:
ch_versions = Channel.empty()
// Convert matrix do h5ad

//
// Convert matrix to h5ad
//
MTX_TO_H5AD (
mtx_matrices
mtx_matrices,
txp2gene,
star_index
)

//
Expand All @@ -33,9 +39,10 @@ workflow MTX_CONVERSION {
)

//TODO CONCAT h5ad and MTX to h5ad should also have versions.yaml output
ch_version = ch_versions.mix(MTX_TO_SEURAT.out.versions)
ch_version = ch_versions.mix(MTX_TO_H5AD.out.versions, MTX_TO_SEURAT.out.versions)

emit:
ch_versions
counts = MTX_TO_H5AD.out.counts

}
1 change: 1 addition & 0 deletions subworkflows/local/starsolo.nf
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ workflow STARSOLO {

emit:
ch_versions
star_index = star_index
star_result = STAR_ALIGN.out.tab
star_counts = STAR_ALIGN.out.counts
for_multiqc = STAR_ALIGN.out.log_final.collect{it[1]}.ifEmpty([])
Expand Down
Loading