Merge branch 'dev' into output-10x-counts

nf-core · Nov 11, 2022 · fd5d1ae · fd5d1ae
2 parents 1415f2e + 58a6467
commit fd5d1ae
Show file tree

Hide file tree

Showing 20 changed files with 3,470 additions and 173 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -12,7 +12,7 @@ env:
   NXF_ANSI_LOG: false
 
 concurrency:
-  group: ${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
 jobs:
@@ -46,4 +46,4 @@ jobs:
         # For example: adding multiple test runs with different parameters
         # Remember that you can parallelise this by using strategy.matrix
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.parameters }} --outdir ./results
+          nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }} --outdir ./results
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixes
 
 - Autocanceling previous CI runs when new changes are pushed.
+- Fixed [#177](https://github.com/nf-core/scrnaseq/issues/177) by adjusting channel generation and usage when FastQC is skipped
+- Fixed [#173](https://github.com/nf-core/scrnaseq/issues/173) by adjusting the parameter types and adding the parameters to modules.config
 
 ## v2.1.0 - 2022-10-06 "Green Mercury Siberian Husky"
 

diff --git a/README.md b/README.md
@@ -33,6 +33,8 @@ This is a community effort in building a pipeline capable to support:
 
 The nf-core/scrnaseq pipeline comes with documentation about the pipeline [usage](https://nf-co.re/scrnaseq/usage), [parameters](https://nf-co.re/scrnaseq/parameters) and [output](https://nf-co.re/scrnaseq/output).
 
+![scrnaseq workflow](docs/images/scrnaseq_pipeline_v1.0_metro_clean.png)
+
 ## Quick Start
 
 1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.10.3`)
@@ -58,12 +60,28 @@ The nf-core/scrnaseq pipeline comes with documentation about the pipeline [usage
    nextflow run nf-core/scrnaseq --input samplesheet.csv --outdir <OUTDIR> --genome_fasta GRCm38.p6.genome.chr19.fa --gtf gencode.vM19.annotation.chr19.gtf --protocol 10XV2 --aligner <alevin/kallisto/star/cellranger> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
    ```
 
+## Decision Tree for users
+
+The nf-core/scrnaseq pipeline features several paths to analyze your single cell data. Further additions are planned, e.g. support for multi-ome analysis types. To help users analyze their data, we have added a decision tree that helps you decide which type of analysis to run and how to choose appropriate parameters for it.
+
+```mermaid
+graph TD
+    A[sc RNA] -->|alevin-fry| B(h5ad/seurat/mtx matrices)
+    A[sc RNA] -->|CellRanger| B(h5ad/seurat/mtx matrices)
+    A[sc RNA] -->|kbpython| B(h5ad/seurat/mtx matrices)
+    A[sc RNA] -->|STARsolo| B(h5ad/seurat/mtx matrices)
+    A[sc RNA] -->|Universc| B(h5ad/seurat/mtx matrices)
+```
+
+To choose between methods, see the options for each alignment method [here](https://github.com/nf-core/scrnaseq/blob/dev/docs/usage.md#aligning-options).
+
 ## Credits
 
 nf-core/scrnaseq was originally written by Bailey PJ, Botvinnik O, Marques de Almeida F, Gabernet G, Peltzer A, Sturm G.
 
 We thank the following people for their extensive assistance in the development of this pipeline:
 
+- @heylf
 - @KevinMenden
 - @FloWuenne
 - @rob-p

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -15,122 +15,6 @@
 logger = logging.getLogger()
 
 
-class RowChecker:
-    """
-    Define a service that can validate and transform each given row.
-
-    Attributes:
-        modified (list): A list of dicts, where each dict corresponds to a previously
-            validated and transformed row. The order of rows is maintained.
-
-    """
-
-    VALID_FORMATS = (
-        ".fq.gz",
-        ".fastq.gz",
-    )
-
-    def __init__(
-        self,
-        sample_col="sample",
-        first_col="fastq_1",
-        second_col="fastq_2",
-        single_col="single_end",
-        **kwargs,
-    ):
-        """
-        Initialize the row checker with the expected column names.
-
-        Args:
-            sample_col (str): The name of the column that contains the sample name
-                (default "sample").
-            first_col (str): The name of the column that contains the first (or only)
-                FASTQ file path (default "fastq_1").
-            second_col (str): The name of the column that contains the second (if any)
-                FASTQ file path (default "fastq_2").
-            single_col (str): The name of the new column that will be inserted and
-                records whether the sample contains single- or paired-end sequencing
-                reads (default "single_end").
-
-        """
-        super().__init__(**kwargs)
-        self._sample_col = sample_col
-        self._first_col = first_col
-        self._second_col = second_col
-        self._single_col = single_col
-        self._seen = set()
-        self.modified = []
-
-    def validate_and_transform(self, row):
-        """
-        Perform all validations on the given row and insert the read pairing status.
-
-        Args:
-            row (dict): A mapping from column headers (keys) to elements of that row
-                (values).
-
-        """
-        self._validate_sample(row)
-        self._validate_first(row)
-        self._validate_second(row)
-        self._validate_pair(row)
-        self._seen.add((row[self._sample_col], row[self._first_col]))
-        self.modified.append(row)
-
-    def _validate_sample(self, row):
-        """Assert that the sample name exists and convert spaces to underscores."""
-        if len(row[self._sample_col]) <= 0:
-            raise AssertionError("Sample input is required.")
-        # Sanitize samples slightly.
-        row[self._sample_col] = row[self._sample_col].replace(" ", "_")
-
-    def _validate_first(self, row):
-        """Assert that the first FASTQ entry is non-empty and has the right format."""
-        assert len(row[self._first_col]) > 0, "At least the first FASTQ file is required."
-        self._validate_fastq_format(row[self._first_col])
-
-    def _validate_second(self, row):
-        """Assert that the second FASTQ entry has the right format if it exists."""
-        if len(row[self._second_col]) > 0:
-            self._validate_fastq_format(row[self._second_col])
-
-    def _validate_pair(self, row):
-        """Assert that read pairs have the same file extension. Report pair status."""
-        if row[self._first_col] and row[self._second_col]:
-            row[self._single_col] = False
-            assert (
-                Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:]
-            ), "FASTQ pairs must have the same file extensions."
-        else:
-            row[self._single_col] = True
-
-    def _validate_fastq_format(self, filename):
-        """Assert that a given filename has one of the expected FASTQ extensions."""
-        if not any(filename.endswith(extension) for extension in self.VALID_FORMATS):
-            raise AssertionError(
-                f"The FASTQ file has an unrecognized extension: {filename}\n"
-                f"It should be one of: {', '.join(self.VALID_FORMATS)}"
-            )
-
-    def validate_unique_samples(self):
-        """
-        Assert that the combination of sample name and FASTQ filename is unique.
-
-        In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
-        number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment.
-
-        """
-        assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique."
-        if len({pair[0] for pair in self._seen}) < len(self._seen):
-            counts = Counter(pair[0] for pair in self._seen)
-            seen = Counter()
-            for row in self.modified:
-                sample = row[self._sample_col]
-                seen[sample] += 1
-                if counts[sample] > 1:
-                    row[self._sample_col] = f"{sample}_T{seen[sample]}"
-
-
 def read_head(handle, num_lines=10):
     """Read the specified number of lines from the current position in the file."""
     lines = []
@@ -206,11 +90,24 @@ def check_samplesheet(file_in, file_out):
 
         ## Check header
         MIN_COLS = 2
-        HEADER = ["sample", "fastq_1", "fastq_2"]
+        MIN_HEADER = ["sample", "fastq_1", "fastq_2"]
+        OPT_HEADER = ["expected_cells", "seq_center"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
-        if header[: len(HEADER)] != HEADER:
+
+        unknown_header = 0
+        min_header_count = 0
+        colmap = {"sample": 0, "fastq_1": 1, "fastq2": 2}
+        i = 0
+        for h in header:
+            if h not in MIN_HEADER and h not in OPT_HEADER:
+                unknown_header = 1
+            if h in MIN_HEADER:
+                min_header_count = min_header_count + 1
+            colmap[h] = i
+            i = i + 1
+        if unknown_header or min_header_count < len(MIN_HEADER):
             given = ",".join(header)
-            wanted = ",".join(HEADER)
+            wanted = ",".join(MIN_HEADER)
             print(f"ERROR: Please check samplesheet header -> {given} != {wanted}")
             sys.exit(1)
 
@@ -219,9 +116,9 @@ def check_samplesheet(file_in, file_out):
             lspl = [x.strip().strip('"') for x in line.strip().split(",")]
 
             # Check valid number of columns per row
-            if len(lspl) < len(HEADER):
+            if len(lspl) < len(header):
                 print_error(
-                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
+                    "Invalid number of columns (minimum = {})!".format(len(header)),
                     "Line",
                     line,
                 )
@@ -234,11 +131,24 @@ def check_samplesheet(file_in, file_out):
                 )
 
             ## Check sample name entries
-            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
+            sample, fastq_1, fastq_2 = lspl[: len(MIN_HEADER)]
             sample = sample.replace(" ", "_")
             if not sample:
                 print_error("Sample entry has not been specified!", "Line", line)
 
+            ## Check that expected_cells is an integer, if the column is present
+            expected_cells = ""
+            if "expected_cells" in header:
+                expected_cells = lspl[colmap["expected_cells"]]
+                if not is_integer(expected_cells):
+                    print_error("Expected cells must be an integer", "Line", line)
+
+            ## If present, replace spaces with _ in sequencing center name
+            seq_center = ""
+            if "seq_center" in header:
+                seq_center = lspl[colmap["seq_center"]]
+                seq_center = seq_center.replace(" ", "_")
+
             ## Check FastQ file extension
             for fastq in [fastq_1, fastq_2]:
                 if fastq:
@@ -254,9 +164,9 @@ def check_samplesheet(file_in, file_out):
             ## Auto-detect paired-end/single-end
             sample_info = []  ## [single_end, fastq_1, fastq_2]
             if sample and fastq_1 and fastq_2:  ## Paired-end short reads
-                sample_info = ["0", fastq_1, fastq_2]
+                sample_info = ["0", fastq_1, fastq_2, expected_cells, seq_center]
             elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
-                sample_info = ["1", fastq_1, fastq_2]
+                sample_info = ["1", fastq_1, fastq_2, expected_cells, seq_center]
             else:
                 print_error("Invalid combination of columns provided!", "Line", line)
 
@@ -273,7 +183,7 @@ def check_samplesheet(file_in, file_out):
     ## Write validated samplesheet with appropriate columns
     if len(sample_mapping_dict) > 0:
         with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
+            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2", "expected_cells", "seq_center"]) + "\n")
             for sample in sorted(sample_mapping_dict.keys()):
 
                 ## Check that multiple runs of the same sample are of the same datatype
@@ -317,6 +227,15 @@ def parse_args(argv=None):
     return parser.parse_args(argv)
 
 
+def is_integer(n):
+    try:
+        float(n)
+    except ValueError:
+        return False
+    else:
+        return float(n).is_integer()
+
+
 def main(argv=None):
     """Coordinate argument parsing and program execution."""
     args = parse_args(argv)

diff --git a/bin/mtx_to_h5ad.py b/bin/mtx_to_h5ad.py
@@ -35,7 +35,7 @@ def write_counts(
     txp2gene: str,
     star_index: str,
     out: str,
-    verbose: bool = True,):
+    verbose: bool = False,):
 
     if verbose:
         print("Reading in {}".format(txp2gene))
@@ -68,7 +68,7 @@ def write_counts(
     parser = argparse.ArgumentParser(description="Converts mtx output to h5ad.")
 
     parser.add_argument("-m", "--mtx", dest="mtx", help="Path to mtx file.")
-    parser.add_argument("-v", "--verbose", dest="verbose", help="Toggle verbose messages", default=True)
+    parser.add_argument("-v", "--verbose", dest="verbose", help="Toggle verbose messages", default=False)
     parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.")
     parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.")
     parser.add_argument("-s", "--sample", dest="sample", help="Sample name")

diff --git a/conf/modules.config b/conf/modules.config
@@ -68,6 +68,7 @@ if(params.aligner == "cellranger") {
                 path: "${params.outdir}/${params.aligner}/count",
                 mode: params.publish_dir_mode
             ]
+            ext.args = {meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : ''}
         }
     }
 }
@@ -81,7 +82,8 @@ if (params.aligner == "alevin") {
         withName: 'SIMPLEAF_INDEX' {
             publishDir = [
                 path: { "${params.outdir}/${params.aligner}" },
-                mode: params.publish_dir_mode
+                mode: params.publish_dir_mode,
+                enabled: params.save_reference
             ]
             ext.args = { "--rlen ${params.simpleaf_rlen}" }
         }
@@ -103,7 +105,8 @@ if (params.aligner == "star") {
         withName: STAR_GENOMEGENERATE {
             publishDir = [
                 path: { "${params.outdir}/${params.aligner}/genome_generate" },
-                mode: params.publish_dir_mode
+                mode: params.publish_dir_mode,
+                enabled: params.save_reference
             ]
         }
         withName: STAR_ALIGN {
@@ -120,7 +123,8 @@ if (params.aligner == 'kallisto') {
         withName: KALLISTOBUSTOOLS_REF {
             publishDir = [
                 path: { "${params.outdir}/${params.aligner}" },
-                mode: params.publish_dir_mode
+                mode: params.publish_dir_mode,
+                enabled: params.save_reference
             ]
         }
         withName: KALLISTOBUSTOOLS_COUNT {