Skip to content

Commit

Permalink
Merge pull request #55 from KevinMenden/dev
Browse files Browse the repository at this point in the history
DSL2 version with working alevin workflow
  • Loading branch information
KevinMenden committed Apr 19, 2021
2 parents 1156161 + 9c9c649 commit 5e86a46
Show file tree
Hide file tree
Showing 46 changed files with 2,609 additions and 928 deletions.
17 changes: 0 additions & 17 deletions Dockerfile

This file was deleted.

175 changes: 175 additions & 0 deletions bin/check_samplesheet.py
@@ -0,0 +1,175 @@
#!/usr/bin/env python

import os
import sys
import errno
import argparse


def parse_args(args=None):
    """Build the CLI parser and parse the two positional file arguments.

    Args:
        args: Optional argument list (defaults to sys.argv[1:] when None).

    Returns:
        argparse.Namespace with FILE_IN and FILE_OUT attributes.
    """
    parser = argparse.ArgumentParser(
        description="Reformat nf-core/cageseq samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    parser.add_argument("FILE_IN", help="Input samplesheet file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)


def make_dir(path):
    """Create *path* (including parents) if it does not already exist.

    An empty path is a no-op (file_out may live in the current directory,
    so os.path.dirname() can return ""). Errors other than "directory
    already exists" propagate to the caller.

    Args:
        path: Directory path to create; may be "".
    """
    if path:
        # exist_ok=True replaces the manual errno.EEXIST check: it is
        # race-free when several processes create the same directory, and
        # unlike the old code it still raises if `path` exists but is a
        # regular file (the old version silently swallowed that case).
        os.makedirs(path, exist_ok=True)


def print_error(error, context="Line", context_str=""):
    """Report a samplesheet validation failure and terminate.

    Prints "ERROR: Please check samplesheet -> <error>" followed by an
    optional "<context>: '<context_str>'" line when both context fields
    are non-empty strings, then exits the interpreter with status 1.

    Args:
        error: Short description of the problem.
        context: Label for the offending location (e.g. "Line", "Group").
        context_str: The offending content itself (whitespace-stripped).
    """
    message = "ERROR: Please check samplesheet -> {}".format(error)
    if context != "" and context_str != "":
        message += "\n{}: '{}'".format(context.strip(), context_str.strip())
    print(message)
    sys.exit(1)


def check_samplesheet(file_in, file_out):
    """
    Validate the input samplesheet and write a reformatted copy.

    The input must follow this structure (header plus one row per run):
    group,replicate,fastq_1,fastq_2
    WT,1,WT_LIB1_REP1_1.fastq.gz,WT_LIB1_REP1_2.fastq.gz
    WT,1,WT_LIB2_REP1_1.fastq.gz,WT_LIB2_REP1_2.fastq.gz
    WT,2,WT_LIB1_REP2_1.fastq.gz,WT_LIB1_REP2_2.fastq.gz
    KO,1,KO_LIB1_REP1_1.fastq.gz,KO_LIB1_REP1_2.fastq.gz

    Args:
        file_in: Path to the samplesheet CSV described above.
        file_out: Path for the validated output CSV with header
            sample,single_end,fastq_1,fastq_2 and one row per run.

    Note:
        Any validation failure prints an error and terminates the
        interpreter with exit status 1 (via print_error / sys.exit).
    """

    ## Nested mapping: {sample: {replicate: [[single_end, fastq_1, fastq_2], ...]}}
    sample_run_dict = {}
    with open(file_in, "r") as fin:

        ## Check header: first four columns must match exactly (extra columns tolerated)
        MIN_COLS = 3
        HEADER = ["group", "replicate", "fastq_1", "fastq_2"]
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print(
                "ERROR: Please check samplesheet header -> {} != {}".format(
                    ",".join(header), ",".join(HEADER)
                )
            )
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            ## Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error(
                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
                    "Line",
                    line,
                )

            ## At least MIN_COLS columns must be non-empty (fastq_2 may be blank)
            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    "Invalid number of populated columns (minimum = {})!".format(
                        MIN_COLS
                    ),
                    "Line",
                    line,
                )

            ## Check sample name entries
            sample, replicate, fastq_1, fastq_2 = lspl[: len(HEADER)]
            if sample:
                if sample.find(" ") != -1:
                    print_error("Group entry contains spaces!", "Line", line)
            else:
                print_error("Group entry has not been specified!", "Line", line)

            ## Check replicate entry is an integer (used below for sorting and the 1..N id check)
            if not replicate.isdigit():
                print_error("Replicate id not an integer!", "Line", line)
            replicate = int(replicate)

            ## Check FastQ file extension
            for fastq in [fastq_1, fastq_2]:
                if fastq:
                    if fastq.find(" ") != -1:
                        print_error("FastQ file contains spaces!", "Line", line)
                    if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
                        print_error(
                            "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
                            "Line",
                            line,
                        )

            ## Auto-detect paired-end/single-end
            sample_info = []  ## [single_end, fastq_1, fastq_2]
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                sample_info = ["0", fastq_1, fastq_2]
            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
                ## fastq_2 is kept (as an empty string) so every output row has the same width
                sample_info = ["1", fastq_1, fastq_2]
            else:
                print_error("Invalid combination of columns provided!", "Line", line)

            ## Create sample mapping dictionary = {sample: {replicate : [ single_end, fastq_1, fastq_2]}}
            if sample not in sample_run_dict:
                sample_run_dict[sample] = {}
            if replicate not in sample_run_dict[sample]:
                sample_run_dict[sample][replicate] = [sample_info]
            else:
                if sample_info in sample_run_dict[sample][replicate]:
                    print_error("Samplesheet contains duplicate rows!", "Line", line)
                else:
                    ## Same group/replicate seen again -> an additional run (e.g. re-sequenced library)
                    sample_run_dict[sample][replicate].append(sample_info)

    ## Write validated samplesheet with appropriate columns
    if len(sample_run_dict) > 0:
        out_dir = os.path.dirname(file_out)
        make_dir(out_dir)
        with open(file_out, "w") as fout:

            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
            for sample in sorted(sample_run_dict.keys()):

                ## Check that replicate ids are in format 1..<NUM_REPS>
                ## (N distinct positive ids whose max equals N must be exactly 1..N)
                uniq_rep_ids = set(sample_run_dict[sample].keys())
                if len(uniq_rep_ids) != max(uniq_rep_ids):
                    print_error(
                        "Replicate ids must start with 1..<num_replicates>!",
                        "Group",
                        sample,
                    )

                for replicate in sorted(sample_run_dict[sample].keys()):

                    ## Check that multiple runs of the same sample are of the same datatype
                    ## (every run's single_end flag must match the first run's)
                    if not all(
                        x[0] == sample_run_dict[sample][replicate][0][0]
                        for x in sample_run_dict[sample][replicate]
                    ):
                        print_error(
                            "Multiple runs of a sample must be of the same datatype!",
                            "Group",
                            sample,
                        )

                    ## Write one row per run, named <group>_R<replicate>_T<run-index>
                    for idx, sample_info in enumerate(
                        sample_run_dict[sample][replicate]
                    ):
                        sample_id = "{}_R{}_T{}".format(sample, replicate, idx + 1)
                        fout.write(",".join([sample_id] + sample_info) + "\n")


def main(args=None):
    """Script entry point: parse the CLI arguments, then validate the samplesheet."""
    parsed = parse_args(args)
    check_samplesheet(parsed.FILE_IN, parsed.FILE_OUT)


if __name__ == "__main__":
    # Run only when executed as a script; main() returns None, so the
    # exit status is 0 unless a validation error called sys.exit(1) earlier.
    sys.exit(main())
71 changes: 31 additions & 40 deletions conf/base.config
@@ -1,49 +1,40 @@
/*
 * -------------------------------------------------
 * nf-core/scrnaseq Nextflow base config file
 * -------------------------------------------------
 * A 'blank slate' config file, appropriate for general
 * use on most high performance compute environments.
 * Assumes that all software is installed and available
 * on the PATH. Runs in `local` mode - all jobs will be
 * run on the logged in environment.
 */

process {

    // Baseline resources for any process without a more specific selector.
    // (The source was a garbled diff that interleaved the old 7.GB defaults
    // with the new ones; only the new values are kept here.)
    cpus   = { check_max( 1 * task.attempt, 'cpus' ) }
    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
    time   = { check_max( 4.h * task.attempt, 'time' ) }

    // Retry once on exit codes that commonly indicate a killed/OOM task;
    // any other failure lets running tasks finish before stopping.
    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
    maxRetries    = 1
    maxErrors     = '-1'

    // Process-specific resource requirements
    // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
    withLabel:process_low {
        cpus   = { check_max( 2 * task.attempt, 'cpus' ) }
        memory = { check_max( 12.GB * task.attempt, 'memory' ) }
        time   = { check_max( 4.h * task.attempt, 'time' ) }
    }
    withLabel:process_medium {
        cpus   = { check_max( 6 * task.attempt, 'cpus' ) }
        memory = { check_max( 36.GB * task.attempt, 'memory' ) }
        time   = { check_max( 8.h * task.attempt, 'time' ) }
    }
    withLabel:process_high {
        cpus   = { check_max( 12 * task.attempt, 'cpus' ) }
        memory = { check_max( 72.GB * task.attempt, 'memory' ) }
        time   = { check_max( 16.h * task.attempt, 'time' ) }
    }
    withLabel:process_long {
        time = { check_max( 20.h * task.attempt, 'time' ) }
    }
    withLabel:error_ignore {
        errorStrategy = 'ignore'
    }
    withLabel:error_retry {
        errorStrategy = 'retry'
        maxRetries    = 2
    }
    // NOTE(review): there is no PLASMIDID process in this pipeline — this
    // selector appears to be copied from another nf-core pipeline
    // (viralrecon). The selector matches nothing and is harmless, but it
    // should be confirmed and removed.
    withName:PLASMIDID {
        errorStrategy = 'ignore'
    }
}
45 changes: 45 additions & 0 deletions conf/modules.config
@@ -0,0 +1,45 @@
#!/usr/bin/env nextflow
// NOTE(review): a nextflow shebang is unnecessary in a config file (configs
// are included, not executed) — harmless, but consider removing.

/*
 * ---------------------------------------------------------
 * Nextflow config file for module specific default options
 * ---------------------------------------------------------
 * Each entry is keyed by a module name and supplies per-module
 * defaults (extra command-line args, publish directory/files).
 * Presumably consumed by the DSL2 module wrappers — confirm
 * against the workflow scripts.
 */

params {
    modules {
        // Filter the GTF to the listed gene biotypes before building the reference.
        'cellranger_mkgtf' {
            publish_dir = "cellranger/mkgtf"
            args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene"
        }
        // Reference build results are not published.
        'cellranger_mkref' {
            publish_dir = "cellranger/mkref"
            publish_files = false
        }
        // Only .gz outputs are published, under 'filtered_feature_bc_matrix'.
        'cellranger_count' {
            publish_dir = "cellranger/count"
            publish_files = ['gz':'filtered_feature_bc_matrix']
        }
        // Emit a transcript-to-gene table (used as alevin's tx2gene map).
        'gffread_tx2pgene' {
            args = "--table transcript_id,gene_id"
        }
        'salmon_alevin' {
            args = ""
        }
        'salmon_index' {
            args = ""
        }
        'alevinqc' {
            args = ""
        }
        'multiqc_alevin' {
            args = ""
        }
        'star_genomegenerate' {
            args = ""
        }
        // STARsolo defaults: CB_UMI_Simple chemistry, gzipped FASTQ input,
        // two-pass mapping, coordinate-sorted BAM plus bedGraph signal output.
        'star_align' {
            args = "--soloType CB_UMI_Simple --readFilesCommand zcat --runDirPerm All_RWX --outWigType bedGraph --twopassMode Basic --outSAMtype BAM SortedByCoordinate"
        }
    }
}
26 changes: 0 additions & 26 deletions environment.yml

This file was deleted.

0 comments on commit 5e86a46

Please sign in to comment.