Skip to content

Commit

Permalink
Merge pull request #55 from KevinMenden/dev
Browse files Browse the repository at this point in the history
DSL2 version with working alevin workflow
  • Loading branch information
KevinMenden committed Apr 19, 2021
2 parents 1156161 + 9c9c649 commit 5e86a46
Show file tree
Hide file tree
Showing 46 changed files with 2,609 additions and 928 deletions.
17 changes: 0 additions & 17 deletions Dockerfile

This file was deleted.

175 changes: 175 additions & 0 deletions bin/check_samplesheet.py
@@ -0,0 +1,175 @@
#!/usr/bin/env python

import os
import sys
import errno
import argparse


def parse_args(args=None):
    """Build the CLI parser and parse the two positional file arguments.

    Args:
        args: Optional argument list (defaults to sys.argv[1:] when None).

    Returns:
        argparse.Namespace with FILE_IN and FILE_OUT attributes.
    """
    parser = argparse.ArgumentParser(
        description="Reformat nf-core/cageseq samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    parser.add_argument("FILE_IN", help="Input samplesheet file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)


def make_dir(path):
    """Create *path* (including parents) if it does not already exist.

    An empty path is a no-op (file_out may live in the current directory,
    so os.path.dirname() can return ""). Errors other than "directory
    already exists" propagate to the caller.

    Args:
        path: Directory path to create; may be "".
    """
    if path:
        # exist_ok=True replaces the manual errno.EEXIST check: it is
        # race-free when several processes create the same directory, and
        # unlike the old code it still raises if `path` exists but is a
        # regular file (the old version silently swallowed that case).
        os.makedirs(path, exist_ok=True)


def print_error(error, context="Line", context_str=""):
    """Report a samplesheet validation failure and terminate.

    Prints "ERROR: Please check samplesheet -> <error>" followed by an
    optional "<context>: '<context_str>'" line when both context fields
    are non-empty strings, then exits the interpreter with status 1.

    Args:
        error: Short description of the problem.
        context: Label for the offending location (e.g. "Line", "Group").
        context_str: The offending content itself (whitespace-stripped).
    """
    message = "ERROR: Please check samplesheet -> {}".format(error)
    if context != "" and context_str != "":
        message += "\n{}: '{}'".format(context.strip(), context_str.strip())
    print(message)
    sys.exit(1)


def check_samplesheet(file_in, file_out):
    """
    Validate the input samplesheet and write a reformatted copy.

    The input must follow this structure (header plus one row per run):
    group,replicate,fastq_1,fastq_2
    WT,1,WT_LIB1_REP1_1.fastq.gz,WT_LIB1_REP1_2.fastq.gz
    WT,1,WT_LIB2_REP1_1.fastq.gz,WT_LIB2_REP1_2.fastq.gz
    WT,2,WT_LIB1_REP2_1.fastq.gz,WT_LIB1_REP2_2.fastq.gz
    KO,1,KO_LIB1_REP1_1.fastq.gz,KO_LIB1_REP1_2.fastq.gz

    Args:
        file_in: Path to the samplesheet CSV described above.
        file_out: Path for the validated output CSV with header
            sample,single_end,fastq_1,fastq_2 and one row per run.

    Note:
        Any validation failure prints an error and terminates the
        interpreter with exit status 1 (via print_error / sys.exit).
    """

    ## Nested mapping: {sample: {replicate: [[single_end, fastq_1, fastq_2], ...]}}
    sample_run_dict = {}
    with open(file_in, "r") as fin:

        ## Check header: first four columns must match exactly (extra columns tolerated)
        MIN_COLS = 3
        HEADER = ["group", "replicate", "fastq_1", "fastq_2"]
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print(
                "ERROR: Please check samplesheet header -> {} != {}".format(
                    ",".join(header), ",".join(HEADER)
                )
            )
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            ## Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error(
                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
                    "Line",
                    line,
                )

            ## At least MIN_COLS columns must be non-empty (fastq_2 may be blank)
            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    "Invalid number of populated columns (minimum = {})!".format(
                        MIN_COLS
                    ),
                    "Line",
                    line,
                )

            ## Check sample name entries
            sample, replicate, fastq_1, fastq_2 = lspl[: len(HEADER)]
            if sample:
                if sample.find(" ") != -1:
                    print_error("Group entry contains spaces!", "Line", line)
            else:
                print_error("Group entry has not been specified!", "Line", line)

            ## Check replicate entry is an integer (used below for sorting and the 1..N id check)
            if not replicate.isdigit():
                print_error("Replicate id not an integer!", "Line", line)
            replicate = int(replicate)

            ## Check FastQ file extension
            for fastq in [fastq_1, fastq_2]:
                if fastq:
                    if fastq.find(" ") != -1:
                        print_error("FastQ file contains spaces!", "Line", line)
                    if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
                        print_error(
                            "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
                            "Line",
                            line,
                        )

            ## Auto-detect paired-end/single-end
            sample_info = []  ## [single_end, fastq_1, fastq_2]
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                sample_info = ["0", fastq_1, fastq_2]
            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
                ## fastq_2 is kept (as an empty string) so every output row has the same width
                sample_info = ["1", fastq_1, fastq_2]
            else:
                print_error("Invalid combination of columns provided!", "Line", line)

            ## Create sample mapping dictionary = {sample: {replicate : [ single_end, fastq_1, fastq_2]}}
            if sample not in sample_run_dict:
                sample_run_dict[sample] = {}
            if replicate not in sample_run_dict[sample]:
                sample_run_dict[sample][replicate] = [sample_info]
            else:
                if sample_info in sample_run_dict[sample][replicate]:
                    print_error("Samplesheet contains duplicate rows!", "Line", line)
                else:
                    ## Same group/replicate seen again -> an additional run (e.g. re-sequenced library)
                    sample_run_dict[sample][replicate].append(sample_info)

    ## Write validated samplesheet with appropriate columns
    if len(sample_run_dict) > 0:
        out_dir = os.path.dirname(file_out)
        make_dir(out_dir)
        with open(file_out, "w") as fout:

            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
            for sample in sorted(sample_run_dict.keys()):

                ## Check that replicate ids are in format 1..<NUM_REPS>
                ## (N distinct positive ids whose max equals N must be exactly 1..N)
                uniq_rep_ids = set(sample_run_dict[sample].keys())
                if len(uniq_rep_ids) != max(uniq_rep_ids):
                    print_error(
                        "Replicate ids must start with 1..<num_replicates>!",
                        "Group",
                        sample,
                    )

                for replicate in sorted(sample_run_dict[sample].keys()):

                    ## Check that multiple runs of the same sample are of the same datatype
                    ## (every run's single_end flag must match the first run's)
                    if not all(
                        x[0] == sample_run_dict[sample][replicate][0][0]
                        for x in sample_run_dict[sample][replicate]
                    ):
                        print_error(
                            "Multiple runs of a sample must be of the same datatype!",
                            "Group",
                            sample,
                        )

                    ## Write one row per run, named <group>_R<replicate>_T<run-index>
                    for idx, sample_info in enumerate(
                        sample_run_dict[sample][replicate]
                    ):
                        sample_id = "{}_R{}_T{}".format(sample, replicate, idx + 1)
                        fout.write(",".join([sample_id] + sample_info) + "\n")


def main(args=None):
    """Script entry point: parse the CLI arguments, then validate the samplesheet."""
    parsed = parse_args(args)
    check_samplesheet(parsed.FILE_IN, parsed.FILE_OUT)


if __name__ == "__main__":
    # Run only when executed as a script; main() returns None, so the
    # exit status is 0 unless a validation error called sys.exit(1) earlier.
    sys.exit(main())
71 changes: 31 additions & 40 deletions conf/base.config
@@ -1,49 +1,40 @@
/*
 * -------------------------------------------------
 * nf-core/scrnaseq Nextflow base config file
 * -------------------------------------------------
 * A 'blank slate' config file, appropriate for general
 * use on most high performance compute environments.
 * Assumes that all software is installed and available
 * on the PATH. Runs in `local` mode - all jobs will be
 * run on the logged in environment.
 */

process {

    // Baseline resources for any process without a more specific selector.
    // (The source was a garbled diff that interleaved the old 7.GB defaults
    // with the new ones; only the new values are kept here.)
    cpus   = { check_max( 1 * task.attempt, 'cpus' ) }
    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
    time   = { check_max( 4.h * task.attempt, 'time' ) }

    // Retry once on exit codes that commonly indicate a killed/OOM task;
    // any other failure lets running tasks finish before stopping.
    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
    maxRetries    = 1
    maxErrors     = '-1'

    // Process-specific resource requirements
    // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
    withLabel:process_low {
        cpus   = { check_max( 2 * task.attempt, 'cpus' ) }
        memory = { check_max( 12.GB * task.attempt, 'memory' ) }
        time   = { check_max( 4.h * task.attempt, 'time' ) }
    }
    withLabel:process_medium {
        cpus   = { check_max( 6 * task.attempt, 'cpus' ) }
        memory = { check_max( 36.GB * task.attempt, 'memory' ) }
        time   = { check_max( 8.h * task.attempt, 'time' ) }
    }
    withLabel:process_high {
        cpus   = { check_max( 12 * task.attempt, 'cpus' ) }
        memory = { check_max( 72.GB * task.attempt, 'memory' ) }
        time   = { check_max( 16.h * task.attempt, 'time' ) }
    }
    withLabel:process_long {
        time = { check_max( 20.h * task.attempt, 'time' ) }
    }
    withLabel:error_ignore {
        errorStrategy = 'ignore'
    }
    withLabel:error_retry {
        errorStrategy = 'retry'
        maxRetries    = 2
    }
    // NOTE(review): there is no PLASMIDID process in this pipeline — this
    // selector appears to be copied from another nf-core pipeline
    // (viralrecon). The selector matches nothing and is harmless, but it
    // should be confirmed and removed.
    withName:PLASMIDID {
        errorStrategy = 'ignore'
    }
}
45 changes: 45 additions & 0 deletions conf/modules.config
@@ -0,0 +1,45 @@
#!/usr/bin/env nextflow
// NOTE(review): a nextflow shebang is unnecessary in a config file (configs
// are included, not executed) — harmless, but consider removing.

/*
 * ---------------------------------------------------------
 * Nextflow config file for module specific default options
 * ---------------------------------------------------------
 * Each entry is keyed by a module name and supplies per-module
 * defaults (extra command-line args, publish directory/files).
 * Presumably consumed by the DSL2 module wrappers — confirm
 * against the workflow scripts.
 */

params {
    modules {
        // Filter the GTF to the listed gene biotypes before building the reference.
        'cellranger_mkgtf' {
            publish_dir = "cellranger/mkgtf"
            args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene"
        }
        // Reference build results are not published.
        'cellranger_mkref' {
            publish_dir = "cellranger/mkref"
            publish_files = false
        }
        // Only .gz outputs are published, under 'filtered_feature_bc_matrix'.
        'cellranger_count' {
            publish_dir = "cellranger/count"
            publish_files = ['gz':'filtered_feature_bc_matrix']
        }
        // Emit a transcript-to-gene table (used as alevin's tx2gene map).
        'gffread_tx2pgene' {
            args = "--table transcript_id,gene_id"
        }
        'salmon_alevin' {
            args = ""
        }
        'salmon_index' {
            args = ""
        }
        'alevinqc' {
            args = ""
        }
        'multiqc_alevin' {
            args = ""
        }
        'star_genomegenerate' {
            args = ""
        }
        // STARsolo defaults: CB_UMI_Simple chemistry, gzipped FASTQ input,
        // two-pass mapping, coordinate-sorted BAM plus bedGraph signal output.
        'star_align' {
            args = "--soloType CB_UMI_Simple --readFilesCommand zcat --runDirPerm All_RWX --outWigType bedGraph --twopassMode Basic --outSAMtype BAM SortedByCoordinate"
        }
    }
}
26 changes: 0 additions & 26 deletions environment.yml

This file was deleted.

0 comments on commit 5e86a46

Please sign in to comment.