exporting versions in mtx_to_h5ad and unified cellranger w other alig…

…ners
nf-core · Nov 15, 2022 · d6bc6ca · d6bc6ca
1 parent ccfeba3
commit d6bc6ca
Show file tree

Hide file tree

Showing 6 changed files with 60 additions and 92 deletions.
diff --git a/bin/cellranger_mtx_to_h5ad.py b/bin/cellranger_mtx_to_h5ad.py
diff --git a/bin/mtx_to_h5ad.py b/bin/mtx_to_h5ad.py
@@ -6,19 +6,25 @@
 from scipy import io
 from anndata import AnnData
 
+def _10x_h5_to_adata(mtx_h5: str, sample: str):
+    adata = sc.read_10x_h5(mtx_h5)
+    adata.var["gene_symbols"] = adata.var_names
+    adata.var.set_index("gene_ids", inplace=True)
+    adata.obs["sample"] = sample
+
+    # reorder columns for 10x mtx files
+    adata.var = adata.var[['gene_symbols', 'feature_types', 'genome']]
+
+    return adata
+
 
-def mtx_to_adata(
+def _mtx_to_adata(
     mtx_file: str,
     barcode_file: str,
     feature_file: str,
     sample: str,
     aligner: str,
-    verbose: bool = True,
 ):
-
-    if verbose:
-        print("Reading in {}".format(mtx_file))
-
     adata = sc.read_mtx(mtx_file)
     if (
         aligner == "star"
@@ -31,44 +37,69 @@ def mtx_to_adata(
 
     return adata
 
+def input_to_adata(
+    input_data: str,
+    barcode_file: str,
+    feature_file: str,
+    sample: str,
+    aligner: str,
+    verbose: bool = True,
+):
+
+    if verbose:
+        print("Reading in {}".format(input_data))
+
+    if aligner == 'cellranger':
+        return _10x_h5_to_adata(input_data, sample)
+    else:
+        return _mtx_to_adata(input_data, barcode_file, feature_file, sample, aligner)
+
 
 def write_counts(
     adata: AnnData,
     txp2gene: str,
     star_index: str,
+    aligner: str,
     out: str,
     verbose: bool = False,
 ):
 
     if verbose:
         print("Reading in {}".format(txp2gene))
 
+    t2g = None
     if txp2gene:
         t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2])
     elif star_index:
-        t2g = pd.read_table(f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["transcript_id", "gene_symbol"])
+        t2g = pd.read_table(f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1])
+
+    if txp2gene or star_index:
+        t2g = t2g.drop_duplicates(subset='gene_id').set_index("gene_id")
+        adata.var["gene_symbol"] = t2g["gene_symbol"]
 
     # when assigning a pandas column to a dataframe, pandas already takes care of matching the index.
     # therefore, setting the index is enough.
-    t2g = t2g.drop_duplicates(subset='gene_id').set_index("gene_id")
-    adata.var["gene_symbol"] = t2g["gene_symbol"]
-
     pd.DataFrame(adata.obs.index).to_csv(os.path.join(out, "barcodes.tsv"), sep="\t", index=False, header=None)
     pd.DataFrame(adata.var).to_csv(os.path.join(out, "features.tsv"), sep="\t", index=True, header=None)
     io.mmwrite(os.path.join(out, "matrix.mtx"), adata.X.T, field="integer")
 
     if verbose:
         print("Wrote features.tsv, barcodes.tsv, and matrix.mtx files to {}".format(args["out"]))
 
+def dump_versions():
+    import pkg_resources
+    with open('versions.yml', 'w') as f:
+        f.write("\n".join([f"{pkg.key}: {pkg.version}" for pkg in pkg_resources.working_set]))
+
 
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(description="Converts mtx output to h5ad.")
 
-    parser.add_argument("-m", "--mtx", dest="mtx", help="Path to mtx file.")
+    parser.add_argument("-i", "--input_data", dest="input_data", help="Path to either mtx or mtx h5 file.")
     parser.add_argument("-v", "--verbose", dest="verbose", help="Toggle verbose messages", default=False)
-    parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.")
-    parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.")
+    parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.", nargs="?", const="")
+    parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.", nargs="?", const="")
     parser.add_argument("-s", "--sample", dest="sample", help="Sample name")
     parser.add_argument("-o", "--out", dest="out", help="Output path.")
     parser.add_argument("-a", "--aligner", dest="aligner", help="Which aligner has been used?")
@@ -82,12 +113,15 @@ def write_counts(
     # create the directory with the sample name
     os.makedirs(os.path.dirname(args["out"]), exist_ok=True)
 
-    adata = mtx_to_adata(
-        args["mtx"], args["barcode"], args["feature"], args["sample"], args["aligner"], verbose=args["verbose"]
+    print(args)
+    adata = input_to_adata(
+        args["input_data"], args["barcode"], args["feature"], args["sample"], args["aligner"], verbose=args["verbose"]
     )
 
-    write_counts(adata, args["txp2gene"], args["star_index"], args["sample"], verbose=args["verbose"])
+    write_counts(adata, args["txp2gene"], args["star_index"], args["aligner"], args["sample"], verbose=args["verbose"])
 
     adata.write_h5ad(args["out"], compression="gzip")
 
     print("Wrote h5ad file to {}".format(args["out"]))
+
+    dump_versions()
diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf
@@ -17,6 +17,7 @@ process MTX_TO_H5AD {
     output:
     path "${meta.id}/*h5ad", emit: h5ad
     path "${meta.id}/*", emit: counts
+    path  "versions.yml", emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -43,10 +44,10 @@ process MTX_TO_H5AD {
     if (params.aligner == 'cellranger')
     """
     # convert file types
-    cellranger_mtx_to_h5ad.py \\
-        --mtx filtered_feature_bc_matrix.h5 \\
+    mtx_to_h5ad.py \\
+        --aligner ${params.aligner} \\
+        --input filtered_feature_bc_matrix.h5 \\
         --sample ${meta.id} \\
-        --txp2gene ${txp2gene} \\
         --out ${meta.id}/${meta.id}_matrix.h5ad
     """
 
@@ -57,7 +58,7 @@ process MTX_TO_H5AD {
         mtx_to_h5ad.py \\
             --aligner ${params.aligner} \\
             --sample ${meta.id} \\
-            --mtx *count/counts_unfiltered/\${input_type}.mtx \\
+            --input *count/counts_unfiltered/\${input_type}.mtx \\
             --barcode *count/counts_unfiltered/\${input_type}.barcodes.txt \\
             --feature *count/counts_unfiltered/\${input_type}.genes.txt \\
             --txp2gene ${txp2gene} \\
@@ -72,7 +73,7 @@ process MTX_TO_H5AD {
     mtx_to_h5ad.py \\
         --aligner ${params.aligner} \\
         --sample ${meta.id} \\
-        --mtx $mtx_matrix \\
+        --input $mtx_matrix \\
         --barcode $barcodes_tsv \\
         --feature $features_tsv \\
         --txp2gene ${txp2gene} \\
@@ -84,5 +85,6 @@ process MTX_TO_H5AD {
     """
     mkdir ${meta.id}
     touch ${meta.id}/${meta.id}_matrix.h5ad
+    touch versions.yml
     """
 }
diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf
@@ -42,5 +42,5 @@ workflow CELLRANGER_ALIGN {
     emit:
         ch_versions
         cellranger_out  = CELLRANGER_COUNT.out.outs
-        txp2gene = cellranger_index
+        star_index = cellranger_index
 }
diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf
@@ -39,7 +39,7 @@ workflow MTX_CONVERSION {
         )
 
         //TODO CONCAT h5ad and MTX to h5ad should also have versions.yaml output
-        ch_version = ch_versions.mix(MTX_TO_SEURAT.out.versions)
+        ch_version = ch_versions.mix(MTX_TO_H5AD.out.versions, MTX_TO_SEURAT.out.versions)
 
     emit:
     ch_versions

diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf
@@ -186,7 +186,7 @@ workflow SCRNASEQ {
         )
         ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions)
         ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_out)
-        ch_txp2gene = CELLRANGER_ALIGN.out.txp2gene
+        ch_star_index = CELLRANGER_ALIGN.out.star_index
     }
 
     // Run mtx to h5ad conversion subworkflow