DSL2 version with working alevin workflow #55

Merged
27 commits, merged Apr 19, 2021

Commits (27)
7822d1f
init dsl2 branch with cellranger modules and subworkflow
alexthiery1 Feb 18, 2021
ba4f5b1
add test data for cellranger subworkflow
alexthiery1 Mar 16, 2021
1c640c3
add config for module params and workflow testing
alexthiery1 Mar 16, 2021
a5f6e98
init cellranger subworkflow tests
alexthiery1 Mar 16, 2021
bbe48dd
edit fastq input into cellranger count
alexthiery1 Mar 16, 2021
e3eec29
init csv metadata module
alexthiery1 Mar 16, 2021
528977a
add versions and emit statements to modules
alexthiery1 Mar 19, 2021
552e79c
initial refactoring
KevinMenden Apr 14, 2021
9e8eea1
initial refactoring
KevinMenden Apr 14, 2021
4cf0282
started adding modules and building pipeline
KevinMenden Apr 14, 2021
107acff
added more modules
KevinMenden Apr 14, 2021
4793ddb
move star to local modules
KevinMenden Apr 14, 2021
a8f33ba
refactored into different workflows
KevinMenden Apr 15, 2021
cd0a61b
Merge remote-tracking branch 'origin/dev' into feat-dsl2
KevinMenden Apr 15, 2021
e125b91
Merge pull request #1 from KevinMenden/feat-dsl2
KevinMenden Apr 15, 2021
e0a5b07
added local alevin modules
KevinMenden Apr 15, 2021
271d07d
half-working alevin dsl2 version
KevinMenden Apr 15, 2021
404b1a9
more updates
KevinMenden Apr 16, 2021
2ff52d1
added multiqc, get software versions
KevinMenden Apr 19, 2021
4924218
added workflow groovy class
KevinMenden Apr 19, 2021
d14c7a7
working alevin version
KevinMenden Apr 19, 2021
d017531
removed 'type' parameter
KevinMenden Apr 19, 2021
92f045f
added whitelist to salmon_alevin
KevinMenden Apr 19, 2021
19cc78f
added completion and utils classes
KevinMenden Apr 19, 2021
fa8009f
updated schema; started with star
KevinMenden Apr 19, 2021
01bb273
working STARsolo pipeline
KevinMenden Apr 19, 2021
9c9c649
added multiqc to starsolo; fixed bug
KevinMenden Apr 19, 2021
17 changes: 0 additions & 17 deletions Dockerfile

This file was deleted.

175 changes: 175 additions & 0 deletions bin/check_samplesheet.py
@@ -0,0 +1,175 @@
#!/usr/bin/env python

import os
import sys
import errno
import argparse


def parse_args(args=None):
    Description = "Reformat nf-core/cageseq samplesheet file and check its contents."
    Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"

    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
    parser.add_argument("FILE_IN", help="Input samplesheet file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)


def make_dir(path):
    if len(path) > 0:
        try:
            os.makedirs(path)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise exception


def print_error(error, context="Line", context_str=""):
    error_str = "ERROR: Please check samplesheet -> {}".format(error)
    if context != "" and context_str != "":
        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
            error, context.strip(), context_str.strip()
        )
    print(error_str)
    sys.exit(1)


def check_samplesheet(file_in, file_out):
    """
    This function checks that the samplesheet follows the following structure:
    group,replicate,fastq_1,fastq_2
    WT,1,WT_LIB1_REP1_1.fastq.gz,WT_LIB1_REP1_2.fastq.gz
    WT,1,WT_LIB2_REP1_1.fastq.gz,WT_LIB2_REP1_2.fastq.gz
    WT,2,WT_LIB1_REP2_1.fastq.gz,WT_LIB1_REP2_2.fastq.gz
    KO,1,KO_LIB1_REP1_1.fastq.gz,KO_LIB1_REP1_2.fastq.gz
    """

    sample_run_dict = {}
    with open(file_in, "r") as fin:

        ## Check header
        MIN_COLS = 3
        HEADER = ["group", "replicate", "fastq_1", "fastq_2"]
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print(
                "ERROR: Please check samplesheet header -> {} != {}".format(
                    ",".join(header), ",".join(HEADER)
                )
            )
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            ## Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error(
                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
                    "Line",
                    line,
                )

            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    "Invalid number of populated columns (minimum = {})!".format(
                        MIN_COLS
                    ),
                    "Line",
                    line,
                )

            ## Check sample name entries
            sample, replicate, fastq_1, fastq_2 = lspl[: len(HEADER)]
            if sample:
                if sample.find(" ") != -1:
                    print_error("Group entry contains spaces!", "Line", line)
            else:
                print_error("Group entry has not been specified!", "Line", line)

            ## Check replicate entry is integer
            if not replicate.isdigit():
                print_error("Replicate id not an integer!", "Line", line)
            replicate = int(replicate)

            ## Check FastQ file extension
            for fastq in [fastq_1, fastq_2]:
                if fastq:
                    if fastq.find(" ") != -1:
                        print_error("FastQ file contains spaces!", "Line", line)
                    if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
                        print_error(
                            "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
                            "Line",
                            line,
                        )

            ## Auto-detect paired-end/single-end
            sample_info = []  ## [single_end, fastq_1, fastq_2]
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                sample_info = ["0", fastq_1, fastq_2]
            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
                sample_info = ["1", fastq_1, fastq_2]
            else:
                print_error("Invalid combination of columns provided!", "Line", line)

            ## Create sample mapping dictionary = {sample: {replicate : [ single_end, fastq_1, fastq_2]}}
            if sample not in sample_run_dict:
                sample_run_dict[sample] = {}
            if replicate not in sample_run_dict[sample]:
                sample_run_dict[sample][replicate] = [sample_info]
            else:
                if sample_info in sample_run_dict[sample][replicate]:
                    print_error("Samplesheet contains duplicate rows!", "Line", line)
                else:
                    sample_run_dict[sample][replicate].append(sample_info)

    ## Write validated samplesheet with appropriate columns
    if len(sample_run_dict) > 0:
        out_dir = os.path.dirname(file_out)
        make_dir(out_dir)
        with open(file_out, "w") as fout:

            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
            for sample in sorted(sample_run_dict.keys()):

                ## Check that replicate ids are in format 1..<NUM_REPS>
                uniq_rep_ids = set(sample_run_dict[sample].keys())
                if len(uniq_rep_ids) != max(uniq_rep_ids):
                    print_error(
                        "Replicate ids must start with 1..<num_replicates>!",
                        "Group",
                        sample,
                    )

                for replicate in sorted(sample_run_dict[sample].keys()):

                    ## Check that multiple runs of the same sample are of the same datatype
                    if not all(
                        x[0] == sample_run_dict[sample][replicate][0][0]
                        for x in sample_run_dict[sample][replicate]
                    ):
                        print_error(
                            "Multiple runs of a sample must be of the same datatype!",
                            "Group",
                            sample,
                        )

                    ## Write to file
                    for idx, sample_info in enumerate(
                        sample_run_dict[sample][replicate]
                    ):
                        sample_id = "{}_R{}_T{}".format(sample, replicate, idx + 1)
                        fout.write(",".join([sample_id] + sample_info) + "\n")


def main(args=None):
    args = parse_args(args)
    check_samplesheet(args.FILE_IN, args.FILE_OUT)


if __name__ == "__main__":
    sys.exit(main())
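
The module that calls this script is not shown in this excerpt. As a rough sketch only (process name, output filename and emit name are illustrative, not taken from this PR), a DSL2 wrapper could look like the following, relying on Nextflow adding the pipeline's bin/ directory to the task PATH:

    // Hypothetical wrapper module for bin/check_samplesheet.py (illustrative, not part of this diff)
    process SAMPLESHEET_CHECK {
        tag "$samplesheet"

        input:
        path samplesheet

        output:
        path 'samplesheet.valid.csv', emit: csv   // reformatted sheet: sample,single_end,fastq_1,fastq_2

        script:
        """
        check_samplesheet.py $samplesheet samplesheet.valid.csv
        """
    }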
71 changes: 31 additions & 40 deletions conf/base.config
@@ -1,49 +1,40 @@
 /*
  * -------------------------------------------------
  * nf-core/scrnaseq Nextflow base config file
  * -------------------------------------------------
  * A 'blank slate' config file, appropriate for general
  * use on most high performace compute environments.
  * Assumes that all software is installed and available
  * on the PATH. Runs in `local` mode - all jobs will be
  * run on the logged in environment.
  */

 process {

-  cpus = { check_max( 1 * task.attempt, 'cpus' ) }
-  memory = { check_max( 7.GB * task.attempt, 'memory' ) }
-  time = { check_max( 4.h * task.attempt, 'time' ) }
+    cpus = { check_max( 1 * task.attempt, 'cpus' ) }
+    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
+    time = { check_max( 4.h * task.attempt, 'time' ) }

     errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
-  maxRetries = 1
-  maxErrors = '-1'
-
-  // Process-specific resource requirements
-  // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
+    maxRetries = 1
+    maxErrors = '-1'
+
+    // Process-specific resource requirements
-  withLabel: low_memory {
-    cpus = { check_max (4, 'cpus')}
-    memory = { check_max( 16.GB * task.attempt, 'memory' ) }
+    withLabel:process_low {
+        cpus = { check_max( 2 * task.attempt, 'cpus' ) }
+        memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+        time = { check_max( 4.h * task.attempt, 'time' ) }
     }
-  withLabel: mid_memory {
-    cpus = { check_max (8, 'cpus')}
-    memory = { check_max( 32.GB * task.attempt, 'memory' ) }
-    time = { check_max( 8.h * task.attempt, 'time' ) }
+    withLabel:process_medium {
+        cpus = { check_max( 6 * task.attempt, 'cpus' ) }
+        memory = { check_max( 36.GB * task.attempt, 'memory' ) }
+        time = { check_max( 8.h * task.attempt, 'time' ) }
     }
-  withLabel: high_memory {
-    cpus = { check_max (10, 'cpus')}
-    memory = { check_max( 80.GB * task.attempt, 'memory' ) }
-    time = { check_max( 8.h * task.attempt, 'time' ) }
+    withLabel:process_high {
+        cpus = { check_max( 12 * task.attempt, 'cpus' ) }
+        memory = { check_max( 72.GB * task.attempt, 'memory' ) }
+        time = { check_max( 16.h * task.attempt, 'time' ) }
     }

-}
-
-params {
-  // Defaults only, expecting to be overwritten
-  max_memory = 128.GB
-  max_cpus = 16
-  max_time = 240.h
-  igenomes_base = 's3://ngi-igenomes/igenomes/'
-}
+    withLabel:process_long {
+        time = { check_max( 20.h * task.attempt, 'time' ) }
+    }
+    withLabel:error_ignore {
+        errorStrategy = 'ignore'
+    }
+    withLabel:error_retry {
+        errorStrategy = 'retry'
+        maxRetries = 2
+    }
+    withName:PLASMIDID {
+        errorStrategy = 'ignore'
+    }
 }
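
For context, these withLabel selectors only apply to modules that declare the matching label. The sketch below is illustrative (the process body is not from this PR) and assumes the standard nf-core check_max() helper in nextflow.config, which caps each request at params.max_cpus / params.max_memory / params.max_time:

    // Illustrative module using the 'process_medium' tier from conf/base.config above:
    // first attempt requests 6 CPUs / 36.GB / 8.h; a retry multiplies by task.attempt, capped by check_max().
    process STAR_GENOMEGENERATE {
        label 'process_medium'

        input:
        path fasta
        path gtf

        output:
        path 'star', emit: index

        script:
        """
        mkdir star
        STAR --runMode genomeGenerate \\
            --genomeDir star \\
            --genomeFastaFiles $fasta \\
            --sjdbGTFfile $gtf \\
            --runThreadN $task.cpus
        """
    }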
45 changes: 45 additions & 0 deletions conf/modules.config
@@ -0,0 +1,45 @@
#!/usr/bin/env nextflow

/*
 * ---------------------------------------------------------
 * Nextflow config file for module specific default options
 * ---------------------------------------------------------
 */

params {
    modules {
        'cellranger_mkgtf' {
            publish_dir = "cellranger/mkgtf"
            args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene"
        }
        'cellranger_mkref' {
            publish_dir = "cellranger/mkref"
            publish_files = false
        }
        'cellranger_count' {
            publish_dir = "cellranger/count"
            publish_files = ['gz':'filtered_feature_bc_matrix']
        }
        'gffread_tx2pgene' {
            args = "--table transcript_id,gene_id"
        }
        'salmon_alevin' {
            args = ""
        }
        'salmon_index' {
            args = ""
        }
        'alevinqc' {
            args = ""
        }
        'multiqc_alevin' {
            args = ""
        }
        'star_genomegenerate' {
            args = ""
        }
        'star_align' {
            args = "--soloType CB_UMI_Simple --readFilesCommand zcat --runDirPerm All_RWX --outWigType bedGraph --twopassMode Basic --outSAMtype BAM SortedByCoordinate"
        }
    }
}
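
These per-module option maps are consumed at include time. A minimal sketch following the nf-core DSL2 layout of this period (the module paths are assumptions; the PR's workflow files are not shown in this excerpt):

    // Sketch: passing the option maps above into modules via addParams() (paths assumed)
    def modules = params.modules.clone()

    include { SALMON_ALEVIN } from '../modules/local/salmon_alevin' addParams( options: modules['salmon_alevin'] )
    include { STAR_ALIGN    } from '../modules/local/star_align'    addParams( options: modules['star_align'] )

In this pattern, each module typically appends options.args to its tool command line, while publish_dir and publish_files control where (and whether) its outputs are copied into the results directory.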
26 changes: 0 additions & 26 deletions environment.yml

This file was deleted.