nf-core · grst · Nov 17, 2022 · Oct 20, 2022 · Nov 7, 2022 · Nov 9, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## v2.1.1dev -
 
+- Added support to output 10x count files in text format.
+- Add gene symbols to count matrices
+
 ### Fixes
 
 - Autocanceling previous CI runs when new changes are pushed.

diff --git a/bin/cellranger_mtx_to_h5ad.py b/bin/cellranger_mtx_to_h5ad.py
diff --git a/bin/mtx_to_h5ad.py b/bin/mtx_to_h5ad.py
@@ -2,55 +2,138 @@
 import scanpy as sc
 import pandas as pd
 import argparse
+import os
+from scipy import io
+from anndata import AnnData
 
 
-def mtx_to_adata(
+def _10x_h5_to_adata(mtx_h5: str, sample: str):
+    adata = sc.read_10x_h5(mtx_h5)
+    adata.var["gene_symbols"] = adata.var_names
+    adata.var.set_index("gene_ids", inplace=True)
+    adata.obs["sample"] = sample
+
+    # reorder columns for 10x mtx files
+    adata.var = adata.var[["gene_symbols", "feature_types", "genome"]]
+
+    return adata
+
+
+def _mtx_to_adata(
     mtx_file: str,
     barcode_file: str,
     feature_file: str,
     sample: str,
     aligner: str,
-    verbose: bool = False,
 ):
-
-    if verbose:
-        print("Reading in {}".format(mtx_file))
-
     adata = sc.read_mtx(mtx_file)
     if (
         aligner == "star"
     ):  # for some reason star matrix comes transposed and doesn't fit when values are appended directly
         adata = adata.transpose()
+
     adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values
     adata.var_names = pd.read_csv(feature_file, header=None, sep="\t")[0].values
     adata.obs["sample"] = sample
 
     return adata
 
 
+def input_to_adata(
+    input_data: str,
+    barcode_file: str,
+    feature_file: str,
+    sample: str,
+    aligner: str,
+    txp2gene: str,
+    star_index: str,
+    verbose: bool = True,
+):
+
+    if verbose and (txp2gene or star_index):
+        print("Reading in {}".format(input_data))
+
+    if aligner == "cellranger":
+        adata = _10x_h5_to_adata(input_data, sample)
+    else:
+        adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample, aligner)
+
+    if verbose and (txp2gene or star_index):
+        print("Reading in {}".format(txp2gene))
+
+    if txp2gene:
+        t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2])
+    elif star_index:
+        t2g = pd.read_table(
+            f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]
+        )
+
+    if txp2gene or star_index:
+        t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id")
+        adata.var["gene_symbol"] = t2g["gene_symbol"]
+
+    return adata
+
+
+def write_counts(
+    adata: AnnData,
+    out: str,
+    verbose: bool = False,
+):
+
+    pd.DataFrame(adata.obs.index).to_csv(os.path.join(out, "barcodes.tsv"), sep="\t", index=False, header=None)
+    pd.DataFrame(adata.var).to_csv(os.path.join(out, "features.tsv"), sep="\t", index=True, header=None)
+    io.mmwrite(os.path.join(out, "matrix.mtx"), adata.X.T, field="integer")
+
+    if verbose:
+        print("Wrote features.tsv, barcodes.tsv, and matrix.mtx files to {}".format(args["out"]))
+
+
+def dump_versions(task_process):
+    import pkg_resources
+
+    with open("versions.yml", "w") as f:
+        f.write(f"{task_process}:\n\t")
+        f.write("\n\t".join([f"{pkg.key}: {pkg.version}" for pkg in pkg_resources.working_set]))
+
+
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(description="Converts mtx output to h5ad.")
 
-    parser.add_argument("-m", "--mtx", dest="mtx", help="Path to mtx file.")
+    parser.add_argument("-i", "--input_data", dest="input_data", help="Path to either mtx or mtx h5 file.")
     parser.add_argument("-v", "--verbose", dest="verbose", help="Toggle verbose messages", default=False)
-    parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.")
-    parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.")
+    parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.", nargs="?", const="")
+    parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.", nargs="?", const="")
     parser.add_argument("-s", "--sample", dest="sample", help="Sample name")
     parser.add_argument("-o", "--out", dest="out", help="Output path.")
     parser.add_argument("-a", "--aligner", dest="aligner", help="Which aligner has been used?")
+    parser.add_argument("--task_process", dest="task_process", help="Task process name.")
+    parser.add_argument("--txp2gene", dest="txp2gene", help="Transcript to gene (t2g) file.", nargs="?", const="")
+    parser.add_argument(
+        "--star_index", dest="star_index", help="Star index folder containing geneInfo.tab.", nargs="?", const=""
+    )
 
     args = vars(parser.parse_args())
 
-    adata = mtx_to_adata(
-        args["mtx"],
-        args["barcode"],
-        args["feature"],
-        args["sample"],
-        args["aligner"],
+    # create the directory with the sample name
+    os.makedirs(os.path.dirname(args["out"]), exist_ok=True)
+
+    adata = input_to_adata(
+        input_data=args["input_data"],
+        barcode_file=args["barcode"],
+        feature_file=args["feature"],
+        sample=args["sample"],
+        aligner=args["aligner"],
+        txp2gene=args["txp2gene"],
+        star_index=args["star_index"],
         verbose=args["verbose"],
     )
 
+    write_counts(adata=adata, out=args["sample"], verbose=args["verbose"])
+
     adata.write_h5ad(args["out"], compression="gzip")
 
     print("Wrote h5ad file to {}".format(args["out"]))
+
+    dump_versions(task_process=args["task_process"])
diff --git a/bin/mtx_to_seurat.R b/bin/mtx_to_seurat.R
@@ -22,6 +22,8 @@ if(aligner %in% c("kallisto", "alevin")) {
 
 seurat.object <- CreateSeuratObject(counts = expression.matrix)
 
+dir.create(basename(dirname(out.file)), showWarnings = FALSE)
+
 saveRDS(seurat.object, file = out.file)
 
 

diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf
@@ -11,9 +11,13 @@ process MTX_TO_H5AD {
     // inputs from cellranger nf-core module does not come in a single sample dir
     // for each sample, the sub-folders and files come directly in array.
     tuple val(meta), path(inputs)
+    path txp2gene
+    path star_index
 
     output:
-    path "*.h5ad", emit: h5ad
+    path "${meta.id}/*h5ad", emit: h5ad
+    path "${meta.id}/*", emit: counts
+    path  "versions.yml", emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -40,10 +44,11 @@ process MTX_TO_H5AD {
     if (params.aligner == 'cellranger')
     """
     # convert file types
-    cellranger_mtx_to_h5ad.py \\
-        --mtx filtered_feature_bc_matrix.h5 \\
+    mtx_to_h5ad.py \\
+        --aligner ${params.aligner} \\
+        --input filtered_feature_bc_matrix.h5 \\
         --sample ${meta.id} \\
-        --out ${meta.id}_matrix.h5ad
+        --out ${meta.id}/${meta.id}_matrix.h5ad
     """
 
     else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard')
@@ -53,27 +58,34 @@ process MTX_TO_H5AD {
         mtx_to_h5ad.py \\
             --aligner ${params.aligner} \\
             --sample ${meta.id} \\
-            --mtx *count/counts_unfiltered/\${input_type}.mtx \\
+            --input *count/counts_unfiltered/\${input_type}.mtx \\
             --barcode *count/counts_unfiltered/\${input_type}.barcodes.txt \\
             --feature *count/counts_unfiltered/\${input_type}.genes.txt \\
-            --out ${meta.id}_\${input_type}_matrix.h5ad ;
+            --txp2gene ${txp2gene} \\
+            --star_index ${star_index} \\
+            --out ${meta.id}/${meta.id}_\${input_type}_matrix.h5ad ;
     done
     """
 
     else
     """
     # convert file types
     mtx_to_h5ad.py \\
+        --task_process ${task.process} \\
         --aligner ${params.aligner} \\
         --sample ${meta.id} \\
-        --mtx $mtx_matrix \\
+        --input $mtx_matrix \\
         --barcode $barcodes_tsv \\
         --feature $features_tsv \\
-        --out ${meta.id}_matrix.h5ad
+        --txp2gene ${txp2gene} \\
+        --star_index ${star_index} \\
+        --out ${meta.id}/${meta.id}_matrix.h5ad
     """
 
     stub:
     """
-    touch ${meta.id}_matrix.h5ad
+    mkdir ${meta.id}
+    touch ${meta.id}/${meta.id}_matrix.h5ad
+    touch versions.yml
     """
 }
diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf
@@ -13,7 +13,7 @@ process MTX_TO_SEURAT {
     tuple val(meta), path(inputs)
 
     output:
-    path "*.rds", emit: seuratObjects
+    path "${meta.id}/*.rds", emit: seuratObjects
     path "versions.yml", emit: versions
 
     when:
@@ -38,6 +38,9 @@ process MTX_TO_SEURAT {
         barcodes = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz"
         features = "*.Solo.out/Gene*/filtered/features.tsv.gz"
     }
+    """
+    mkdir ${meta.id}
+    """
 
     if (params.aligner == 'kallisto' && params.kb_workflow != 'standard')
     """
@@ -47,7 +50,7 @@ process MTX_TO_SEURAT {
             *count/counts_unfiltered/\${input_type}.mtx \\
             *count/counts_unfiltered/\${input_type}.barcodes.txt \\
             *count/counts_unfiltered/\${input_type}.genes.txt \\
-            ${meta.id}_\${input_type}_matrix.rds \\
+            ${meta.id}/${meta.id}_\${input_type}_matrix.rds \\
             ${aligner}
     done
     """
@@ -58,13 +61,14 @@ process MTX_TO_SEURAT {
         $matrix \\
         $barcodes \\
         $features \\
-        ${meta.id}_matrix.rds \\
+        ${meta.id}/${meta.id}_matrix.rds \\
         ${aligner}
     """
 
     stub:
     """
-    touch ${meta.id}_matrix.rds
+    mkdir ${meta.id}
+    touch ${meta.id}/${meta.id}_matrix.rds
     touch versions.yml
     """
 }
diff --git a/subworkflows/local/alevin.nf b/subworkflows/local/alevin.nf
@@ -75,5 +75,4 @@ workflow SCRNASEQ_ALEVIN {
     alevin_results = SIMPLEAF_QUANT.out.alevin_results
     alevinqc = ALEVINQC.out.report
     for_multiqc = SIMPLEAF_QUANT.out.alevin_results.collect{it[1]}.ifEmpty([])
-
 }
diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf
@@ -5,7 +5,6 @@
 include {CELLRANGER_MKGTF} from "../../modules/nf-core/cellranger/mkgtf/main.nf"
 include {CELLRANGER_MKREF} from "../../modules/nf-core/cellranger/mkref/main.nf"
 include {CELLRANGER_COUNT} from "../../modules/nf-core/cellranger/count/main.nf"
-include {MTX_TO_H5AD     } from "../../modules/local/mtx_to_h5ad.nf"
 
 // Define workflow to subset and index a genome region fasta file
 workflow CELLRANGER_ALIGN {
@@ -43,4 +42,5 @@ workflow CELLRANGER_ALIGN {
     emit:
         ch_versions
         cellranger_out  = CELLRANGER_COUNT.out.outs
+        star_index = cellranger_index
 }
diff --git a/subworkflows/local/kallisto_bustools.nf b/subworkflows/local/kallisto_bustools.nf
@@ -67,6 +67,7 @@ workflow KALLISTO_BUSTOOLS {
     emit:
     ch_versions
     counts = KALLISTOBUSTOOLS_COUNT.out.count
+    txp2gene = txp2gene.collect()
 
 
 }
diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf
@@ -8,13 +8,19 @@ workflow MTX_CONVERSION {
     take:
     mtx_matrices
     samplesheet
+    txp2gene
+    star_index
 
     main:
         ch_versions = Channel.empty()
-        // Convert matrix do h5ad
+
+        //
+        // Convert matrix to h5ad
         //
         MTX_TO_H5AD (
-            mtx_matrices
+            mtx_matrices,
+            txp2gene,
+            star_index
         )
 
         //
@@ -33,9 +39,10 @@ workflow MTX_CONVERSION {
         )
 
         //TODO CONCAT h5ad and MTX to h5ad should also have versions.yaml output
-        ch_version = ch_versions.mix(MTX_TO_SEURAT.out.versions)
+        ch_version = ch_versions.mix(MTX_TO_H5AD.out.versions, MTX_TO_SEURAT.out.versions)
 
     emit:
     ch_versions
+    counts = MTX_TO_H5AD.out.counts
 
 }
diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf
@@ -53,6 +53,7 @@ workflow STARSOLO {
 
     emit:
     ch_versions
+    star_index  = star_index
     star_result = STAR_ALIGN.out.tab
     star_counts = STAR_ALIGN.out.counts
     for_multiqc = STAR_ALIGN.out.log_final.collect{it[1]}.ifEmpty([])