diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9a004f3d3..0286ff700 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -186,3 +186,6 @@ jobs:
- name: MTNUCRATIO Run basic pipeline with bam input profile, but don't convert BAM, skip everything but nmtnucratio
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio
+ - name: RESCALING Run basic pipeline with mapDamage rescaling of BAM files. Note this will be slow
+ run: |
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index bff232d59..fa9399491 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,11 +7,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### `Added`
+- [#583](https://github.com/nf-core/eager/issues/583) - mapDamage2 rescaling of BAM files to remove damage
+
### `Fixed`
- Removed leftover old DockerHub push CI commands.
-- [#627](https://github.com/nf-core/eager/issues/627) Added de Barros Damgaard citation to README
-- [#630](https://github.com/nf-core/eager/pull/630) Better handling of Qualimap memory requirements and error strategy.
+- [#627](https://github.com/nf-core/eager/issues/627) - Added de Barros Damgaard citation to README
+- [#630](https://github.com/nf-core/eager/pull/630) - Better handling of Qualimap memory requirements and error strategy.
+- Fixed some incomplete schema options to ensure users supply valid input values
### `Dependencies`
diff --git a/README.md b/README.md
index a85b30644..74a05d06a 100644
--- a/README.md
+++ b/README.md
@@ -236,6 +236,7 @@ In addition, references of tools and data used in this pipeline are as follows:
* **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923).
* **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools)
* **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git)
+* **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics, 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193)
## Data References
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
index 201df4a58..0ebef35c4 100755
--- a/bin/scrape_software_versions.py
+++ b/bin/scrape_software_versions.py
@@ -35,7 +35,8 @@
'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "],
'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"],
'kraken':['v_kraken.txt', r"Kraken version (\S+)"],
- 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"]
+ 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"],
+ 'mapDamage2':['v_mapdamage.txt',r"(\S+)"],
}
results = OrderedDict()
@@ -55,7 +56,7 @@
results['Qualimap'] = 'N/A'
results['Preseq'] = 'N/A'
results['GATK HaplotypeCaller'] = 'N/A'
-#results['GATK UnifiedGenotyper'] = 'N/A'
+results['GATK UnifiedGenotyper'] = 'N/A'
results['freebayes'] = 'N/A'
results['sequenceTools'] = 'N/A'
results['VCF2genome'] = 'N/A'
@@ -71,6 +72,7 @@
results['kraken'] = 'N/A'
results['maltextract'] = 'N/A'
results['eigenstrat_snp_coverage'] = 'N/A'
+results['mapDamage2'] = 'N/A'
# Search each file using its regex
for k, v in regexes.items():
diff --git a/conf/test_resources.config b/conf/test_resources.config
index 109e93e4c..74bb4ce2a 100644
--- a/conf/test_resources.config
+++ b/conf/test_resources.config
@@ -51,4 +51,8 @@ process {
time = { check_max( 10.m * task.attempt, 'time' ) }
}
+ withName:'mapdamage_rescaling'{
+ time = { check_max( 20.m * task.attempt, 'time' ) }
+ }
+
}
\ No newline at end of file
diff --git a/docs/output.md b/docs/output.md
index 7c316e7db..0b8da2f6c 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -658,6 +658,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
- `damageprofiler/` - this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files.
- `pmdtools/` - this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`.
- `trimmed_bam/` - this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read).
+- `damage_rescaling/` - this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping.
- `genotyping/` - this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics.
- `multivcfanalyzer/` - this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files.
- `sex_determination/` - this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer.
diff --git a/docs/usage.md b/docs/usage.md
index b5094f606..fe0279137 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -391,203 +391,6 @@ hard drive footprint of the run, so be sure to do this!
## Troubleshooting and FAQs
-### My pipeline update doesn't seem to do anything
-
-To download a new version of a pipeline, you can use the following, replacing
-`` to the corresponding version.
-
-```bash
-nextflow pull nf-core/eager -r
-```
-
-However, in very rare cases, minor fixes to a version will be pushed out without
-a version number bump. This can confuse nextflow slightly, as it thinks you
-already have the 'broken' version from your original pipeline download.
-
-If when running the pipeline you don't see any changes in the fixed version when
-running it, you can try removing your nextflow EAGER cache typically stored in
-your home directory with
-
-```bash
-rm -r ~/.nextflow/assets/nf-core/eager
-```
-
-And re-pull the pipeline with the command above. This will install a fresh
-version of the version with the fixes.
-
-### Input files not found
-
-When using the [direct input](#direct-input-method) method: if no file, only one
-input file, or only 'read one' and not 'read two' is picked up then something is
-likely wrong with your input file declaration ([`--input`](#--input)):
-
-1. The path must be enclosed in quotes (`'` or `"`)
-2. The path must have at least one `*` wildcard character. This is even if you
- are only running one paired end sample.
-3. When using the pipeline with paired end data, the path must use `{1,2}` or
- `{R1,R2}` notation to specify read pairs.
-4. If you are running single-end data make sure to specify `--single_end`
-
-**Important**: The pipeline can't take a list of multiple input files when using
-the direct input method - it takes a 'glob' expression. If your input files are
-scattered in different paths then we recommend that you generate a directory
-with symlinked files. If running in paired-end mode please make sure that your
-files are sensibly named so that they can be properly paired. See the previous
-point.
-
-If the pipeline can't find your files then you will get the following error
-
-```bash
-ERROR ~ Cannot find any reads matching: *{1,2}.fastq.gz
-```
-
-If your sample name is "messy" then you have to be very particular with your
-glob specification. A file name like `L1-1-D-2h_S1_L002_R1_001.fastq.gz` can be
-difficult enough for a human to read. Specifying `*{1,2}*.gz` won't work give
-you what you want whilst `*{R1,R2}*.gz` (i.e. the addition of the `R`s) will.
-
-If using the [TSV input](#tsv-input-method) method, this likely means there is a
-mistake or typo in the path in a given column. Often this is a trailing space at
-the end of the path.
-
-### I am only getting output for a single sample although I specified multiple with wildcards
-
-You must specify paths to files in quotes, otherwise your shell will evaluate
-any wildcards (\*) rather than Nextflow.
-
-For example
-
-```bash
-nextflow run nf-core/eager --input /path/to/sample_*/*.fq.gz
-```
-
-Would be evaluated by your shell as
-
-```bash
-nextflow run nf-core/eager --input /path/to/sample_1/sample_1.fq.gz /path/to/sample_1/sample_1.fq.gz /path/to/sample_1/sample_1.fq.gz
-```
-
-And Nextflow will only take the first path after `--input`, ignoring the others.
-
-On the other hand, encapsulating the path in quotes will allow Nextflow to
-evaluate the paths.
-
-```bash
-nextflow run nf-core/eager --input "/path/to/sample_*/*.fq.gz"
-```
-
-### The pipeline crashes almost immediately with an early pipeline step
-
-Sometimes a newly downloaded and set up nf-core/eager pipeline will encounter an
-issue where a run almost immediately crashes (e.g. at `fastqc`,
-`output_documentation` etc.) saying the tool could not be found or similar.
-
-#### I am running Docker
-
-You may have an outdated container. This happens more often when running on the
-`dev` branch of nf-core/eager, because Docker will _not_ update the container on
-each new commit, and thus may not get new tools called within the pipeline code.
-
-To fix, just re-pull the nf-core/eager Docker container manually with:
-
-```bash
-docker pull nfcore/eager:dev
-```
-
-#### I am running Singularity
-
-If you're running Singularity, it could be that Nextflow cannot access your
-Singularity image properly - often due to missing bind paths.
-
-See
-[here](https://nf-co.re/usage/troubleshooting#cannot-find-input-files-when-using-singularity)
-for more information.
-
-### The pipeline has crashed with an error but Nextflow is still running
-
-If this happens, you can either wait until all other already running jobs to
-safely finish, or if Nextflow _still_ does not stop press `ctrl + c` on your
-keyboard (or equivalent) to stop the Nextflow run.
-
-> :warning: if you do this, and do not plan to fix the run make sure to delete
-the output folder. Otherwise you may end up a lot of large intermediate files
-being left! You can clean a Nextflow run of all intermediate files with
-`nextflow clean -f -k` or delete the `work/` directory.
-
-### I get a exceeded job memory limit error
-
-While Nextflow tries to make your life easier by automatically retrying jobs
-that run out of memory with more resources (until your specified max-limit),
-sometimes you may have such large data you run out even after the default 3
-retries.
-
-To fix this you need to change the default memory requirements for the process
-that is breaking. We can do this by making a custom profile, which we then
-provide to the Nextflow run command.
-
-For example, lets say it's the `markduplicates` process that is running out of
-memory.
-
-First we need to check to see what default memory value we have. We can do this
-by going to the main [nf-core/eager code](https://github.com/nf-core/) and
-opening the `main.nf` file. We can then use your browser's find functionality
-for: `process markduplicates`.
-
-Once found, we then need to check the line called `label`. In this case the
-label is `mc_small` (for multi-core small).
-
-Next we need to go back to the main github repository, and open
-`conf/base.config`. Again using our find functionality, we search for:
-`withLabel:'mc_small'`.
-
-We see that the `memory` is set to `4.GB` (`memory = { check_max( 4.GB *
-task.attempt, 'memory' )})`).
-
-Now back on your computer, we need to make a new file called
-`custom_resources.conf`. You should save it somewhere centrally so you can
-reuse it.
-
-> If you think this would be useful for multiple people in your lab/institute,
-> we highly recommend you make an institutional profile at
-> [nf-core/configs](https://github.com/nf-core/configs). This will simplify this
-> process in the future.
-
-Within this file, you will need to add the following:
-
-```txt
-profiles {
- big_data {
- process {
- withName: markduplicates {
- memory = 16.GB
- }
- }
- }
-}
-```
-
-Where we have increased the default `4.GB` to `16.GB`. Make sure that you keep
-the `check_max` function, as this prevents your run asking for too much memory
-during retries.
-
-> Note that with this you will _not_ have the automatic retry mechanism. If
-> you want this, re-add the `check_max()` function on the `memory` line, and
-> add to the bottom of the entire file (outside the profiles block), the
-> block starting `def check_max(obj, type) {`, which is at the end of the
-> [nextflow.config file](https://github.com/nf-core/eager/blob/master/nextflow.config)
-
-Once saved, we can then modify your original Nextflow run command:
-
-```bash
-nextflow run nf-core/eager -r 2.2.0 -c ///custom_resources.conf -profile big_data,, <...>
-```
-
-Where we have added `-c` to specify which file to use for the custom profiles,
-and then added the `big_data` profile to the original profiles you were using.
-
-:warning: it's important that big_data comes first, to ensure it overwrites any
-parameters set in the subsequent profiles!
-
### I get a file name collision error during merging
When using TSV input, nf-core/eager will attempt to merge all `Lanes` of a
@@ -608,37 +411,6 @@ they are unique (e.g. if one library was sequenced on Lane 8 of two HiSeq runs,
specify lanes as 8 and 16 for each FASTQ file respectively). For library merging
errors, you must modify your `Library_ID`s accordingly, to make them unique.
-### I specified a module and it didn't produce the expected output
-
-Possible options:
-
-1. Check there if you have a typo in the parameter name. Nextflow _does not_
- check for this
-2. Check that an upstream module was turned on (if a module requires the output
- of a previous module, it will not be activated unless it receives the output)
-
-### I get a unable to acquire lock
-
-Errors like the following
-
-```bash
-Unable to acquire lock on session with ID 84333844-66e3-4846-a664-b446d070f775
-```
-
-normally suggest a previous Nextflow run (on the same folder) was not cleanly
-killed by a user (e.g. using ctrl + z to hard kill a crashed run).
-
-To fix this, you must clean the entirety of the output directory (including
-output files) e.g. with `rm -r /* /.*` and re-running
-from scratch.
-
-`ctrl +z` is **not** a recommended way of killing a Nextflow job. Runs that take
-a long time to fail are often still running because other job submissions are
-still running. Nextflow will normally wait for those processes to complete
-before cleaning shutting down the run (to allow rerunning of a run with
-`-resume`). `ctrl + c` is much safer as it will tell Nextflow to stop earlier
-but cleanly.
-
## Tutorials
### Tutorial - How to investigate a failed run
diff --git a/environment.yml b/environment.yml
index 7204f8de7..0fd3f7440 100644
--- a/environment.yml
+++ b/environment.yml
@@ -47,3 +47,4 @@ dependencies:
- conda-forge::xopen=0.9.0
- bioconda::bowtie2=2.4.1
- bioconda::eigenstratdatabasetools=1.0.2
+ - bioconda::mapdamage2=2.2.0
diff --git a/main.nf b/main.nf
index daad89f87..6631aba2f 100644
--- a/main.nf
+++ b/main.nf
@@ -116,12 +116,15 @@ def helpMessage() {
--damageprofiler_length [num] Specify length filter for DamageProfiler. Default: ${params.damageprofiler_length}
--damageprofiler_threshold [num] Specify number of bases of each read to consider for DamageProfiler calculations. Default: ${params.damageprofiler_threshold}
--damageprofiler_yaxis [float] Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'. Default: ${params.damageprofiler_yaxis}
+ --run_mapdamage_rescaling Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.
+ --rescale_length_5p Length of read for mapDamage2 to rescale from 5p end. Default: ${params.rescale_length_5p}
+ --rescale_length_3p Length of read for mapDamage2 to rescale from 3p end. Default: ${params.rescale_length_3p}
--run_pmdtools [bool] Turn on PMDtools
--pmdtools_range [num] Specify range of bases for PMDTools. Default: ${params.pmdtools_range}
--pmdtools_threshold [num] Specify PMDScore threshold for PMDTools. Default: ${params.pmdtools_threshold}
--pmdtools_reference_mask [file] Specify a path to reference mask for PMDTools.
--pmdtools_max_reads [num] Specify the maximum number of reads to consider for metrics generation. Default: ${params.pmdtools_max_reads}
-
+
Annotation Statistics
--run_bedtools_coverage [bool] Turn on ability to calculate no. reads, depth and breadth coverage of features in reference.
--anno_file [file] Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.
@@ -137,7 +140,7 @@ def helpMessage() {
Genotyping
--run_genotyping [bool] Turn on genotyping of BAM files.
--genotyping_tool [str] Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'.
- --genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed' or 'pmd'. Default: '${params.genotyping_source}'
+ --genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd', 'rescaled'. Default: '${params.genotyping_source}'
--gatk_call_conf [num] Specify GATK phred-scaled confidence threshold. Default: ${params.gatk_call_conf}
--gatk_ploidy [num] Specify GATK organism ploidy. Default: ${params.gatk_ploidy}
--gatk_downsample [num] Maximum depth coverage allowed for genotyping before down-sampling is turned on. Default: ${params.gatk_downsample}
@@ -285,7 +288,7 @@ if("${params.fasta}".endsWith(".gz")){
path zipped_fasta from file(params.fasta) // path doesn't like it if a string of an object is not prefaced with a root dir (/), so use file() to resolve string before parsing to `path`
output:
- path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd
+ path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling
script:
unzip = zipped_fasta.toString() - '.gz'
@@ -296,7 +299,7 @@ if("${params.fasta}".endsWith(".gz")){
} else {
fasta_for_indexing = Channel
.fromPath("${params.fasta}", checkIfExists: true)
- .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd }
+ .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling }
}
// Check that fasta index file path ends in '.fai'
@@ -413,8 +416,13 @@ if (params.sexdeterrmine_bedfile == '') {
// Genotyping validation
if (params.run_genotyping){
+
+ if (params.genotyping_source != 'raw' && params.genotyping_source != 'pmd' && params.genotyping_source != 'trimmed' && params.genotyping_source != 'rescaled' ) {
+ exit 1, "[nf-core/eager] error: please specify a valid genotyping source. Options: 'raw', 'pmd', 'trimmed', 'rescaled'. Found parameter: --genotyping_source '${params.genotyping_source}'."
+ }
+
if (params.genotyping_tool != 'ug' && params.genotyping_tool != 'hc' && params.genotyping_tool != 'freebayes' && params.genotyping_tool != 'pileupcaller' && params.genotyping_tool != 'angsd' ) {
- exit 1, "[nf-core/eager] error: please specify a genotyper. Options: 'ug', 'hc', 'freebayes', 'pileupcaller'. Found parameter: --genotyping_tool '${params.genotyping_tool}'."
+ exit 1, "[nf-core/eager] error: please specify a valid genotyper. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'. Found parameter: --genotyping_tool '${params.genotyping_tool}'."
}
if (params.gatk_ug_out_mode != 'EMIT_VARIANTS_ONLY' && params.gatk_ug_out_mode != 'EMIT_ALL_CONFIDENT_SITES' && params.gatk_ug_out_mode != 'EMIT_ALL_SITES') {
@@ -2172,11 +2180,11 @@ process library_merge {
if (!params.skip_deduplication) {
ch_input_for_skiplibrarymerging.mix(ch_output_from_librarymerging)
.filter { it =~/.*_rmdup.bam/ }
- .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools }
+ .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools; ch_rmdup_for_damagerescaling }
} else {
ch_input_for_skiplibrarymerging.mix(ch_output_from_librarymerging)
- .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools }
+ .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools; ch_rmdup_for_damagerescaling }
}
//////////////////////////////////////////////////
@@ -2282,7 +2290,37 @@ process damageprofiler {
"""
}
-// Optionally perform further aDNA evaluation or filtering for just reads with damage etc.
+// Damage rescaling with mapDamage
+
+process mapdamage_rescaling {
+
+ label 'sc_small'
+ tag "${libraryid}"
+
+ publishDir "${params.outdir}/damage_rescaling", mode: params.publish_dir_mode
+
+ when:
+ params.run_mapdamage_rescaling
+
+ input:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_damagerescaling
+ file fasta from ch_fasta_for_damagerescaling.collect()
+
+ output:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_rescaled.bam"), path("*rescaled.bam.{bai,csi}") into ch_output_from_damagerescaling
+
+ script:
+ def base = "${bam.baseName}"
+ def singlestranded = strandedness == "single" ? '--single-stranded' : ''
+ def size = params.large_ref ? '-c' : ''
+ """
+ mapDamage -i ${bam} -r ${fasta} --rescale --rescale-out ${base}_rescaled.bam --rescale-length-5p ${params.rescale_length_5p} --rescale-length-3p ${params.rescale_length_3p} ${singlestranded}
+ samtools index ${size} ${base}_rescaled.bam
+ """
+
+}
+
+// Optionally perform further aDNA evaluation or filtering for just reads with damage etc.
process pmdtools {
label 'mc_small'
@@ -2369,7 +2407,7 @@ process bam_trim {
"""
}
-// Post trimming merging of libraries to single samples, except for SS/DS
+// Post-trimming merging of libraries to single samples, except for SS/DS
// libraries as they should be genotyped separately, because we will assume
// that if trimming is turned on, 'lab-removed' libraries can be combined with
// merged with 'in-silico damage removed' libraries to improve genotyping
@@ -2464,12 +2502,19 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) {
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
} else if ( params.run_genotyping && params.genotyping_source == "pmd" && !params.run_pmdtools ) {
- exit 1, "[nf-core/eager] error: Cannot run genotyping with 'pmd' source without running pmtools (--run_pmdtools)! Please check input parameters."
+ exit 1, "[nf-core/eager] error: Cannot run genotyping with 'pmd' source without running pmdtools (--run_pmdtools)! Please check input parameters."
} else if ( params.run_genotyping && params.genotyping_source == "pmd" && params.run_pmdtools ) {
ch_output_from_pmdtools
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
+} else if ( params.run_genotyping && params.genotyping_source == "rescaled" && params.run_mapdamage_rescaling) {
+ ch_output_from_damagerescaling
+ .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
+
+} else if ( params.run_genotyping && params.genotyping_source == "rescaled" && !params.run_mapdamage_rescaling) {
+ exit 1, "[nf-core/eager] error: Cannot run genotyping with 'rescaled' source without running damage rescaling (--run_mapdamage_rescaling)! Please check input parameters."
+
} else if ( !params.run_genotyping && !params.run_trim_bam && !params.run_pmdtools ) {
ch_rmdup_for_skipdamagemanipulation
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
@@ -3165,6 +3210,7 @@ process get_software_versions {
pileupCaller --version &> v_sequencetools.txt 2>&1 || true
bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true
eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true
+ mapDamage --version > v_mapdamage.txt || true
scrape_software_versions.py &> software_versions_mqc.yaml
"""
diff --git a/nextflow.config b/nextflow.config
index 684356b7e..8bd90a020 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -110,6 +110,10 @@ params {
pmdtools_reference_mask = ''
pmdtools_max_reads = 10000
+ // mapDamage
+ run_mapdamage_rescaling = false
+ rescale_length_5p = 12
+ rescale_length_3p = 12
//Bedtools settings
run_bedtools_coverage = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 6c14dc9c9..9f58c8cf4 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -790,6 +790,26 @@
"description": "Specify the maximum number of reads to consider for metrics generation.",
"fa_icon": "fas fa-greater-than-equal",
"help_text": "The maximum number of reads used for damage assessment in PMDtools. Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results.\n\n> Modifies PMDTools parameter: `-n`"
+ },
+ "run_mapdamage_rescaling": {
+ "type": "boolean",
+ "fa_icon": "fas fa-map",
+ "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.",
+ "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single stranded, this will automatically use the `--single-stranded` mode.\n\nThis functionality does not have any MultiQC output.\n\n:warning: rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input. \n\n> Modifies the `--rescale` parameter of mapDamage2"
+ },
+ "rescale_length_5p": {
+ "type": "integer",
+ "default": 12,
+ "fa_icon": "fas fa-balance-scale-right",
+ "description": "Length of read for mapDamage2 to rescale from 5p end.",
+ "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2."
+ },
+ "rescale_length_3p": {
+ "type": "integer",
+ "default": 12,
+ "fa_icon": "fas fa-balance-scale-left",
+ "description": "Length of read for mapDamage2 to rescale from 3p end.",
+ "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2."
}
},
"fa_icon": "fas fa-chart-line",
@@ -895,9 +915,15 @@
"genotyping_source": {
"type": "string",
"default": "raw",
- "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed' or 'pmd'.",
+ "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd' or 'rescaled'.",
"fa_icon": "fas fa-faucet",
- "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output). Default is: `'raw'`.\n"
+ "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output); `'rescaled'` (for mapDamage2 rescaling output). Default is: `'raw'`.\n",
+ "enum": [
+ "raw",
+ "pmd",
+ "trimmed",
+ "rescaled"
+ ]
},
"gatk_call_conf": {
"type": "integer",