From 0311d4b01eebe438bb71d671144927172d2e7181 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Mon, 28 Sep 2020 14:12:51 +0200 Subject: [PATCH 01/14] Get DeDup version with reporting bugfix --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 1af700f87..7dccfbc2d 100644 --- a/environment.yml +++ b/environment.yml @@ -16,7 +16,7 @@ dependencies: - bioconda::bwa=0.7.17 - bioconda::picard=2.22.9 - bioconda::samtools=1.9 - - bioconda::dedup=0.12.6 + - bioconda::dedup=0.12.7 - bioconda::angsd=0.933 - bioconda::circularmapper=1.93.5 - bioconda::gatk4=4.1.7.0 From b6549b22b4c1e808c5d3cc90d749510be804835e Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Mon, 28 Sep 2020 14:15:54 +0200 Subject: [PATCH 02/14] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e11c86bc0..4da3d6ce7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * Added Sequencetools (1.4.0.6) that adds the ability to do genotyping with the 'pileupCaller' * Latest version of DeDup (0.12.6) which now reports mapped reads after deduplication +* [#560] Latest version of Dedup (0.12.7), which now correctly reports deduplication statistics based on calculations of mapped reads only (prior denominator was total reads of BAM file) * Latest version of ANGSD (0.933) which doesn't seg fault when running contamination on BAMs with insufficient reads * Latest version of MultiQC (1.9) with support for lots of extra tools in the pipeline (MALT, SexDetERRmine, DamageProfiler, MultiVCFAnalyzer) * Latest versions of Pygments (7.1), Pymdown-Extensions (2.6.1) and Markdown (3.2.2) for documentation output From 4025b3a2a8a162496c2993a8e7cd119732e75ce7 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Mon, 28 Sep 2020 14:17:32 +0200 Subject: [PATCH 03/14] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 7dccfbc2d..a0dcebd76 100644 --- a/environment.yml +++ b/environment.yml @@ -22,7 +22,7 @@ dependencies: - bioconda::gatk4=4.1.7.0 - bioconda::qualimap=2.2.2d - bioconda::vcf2genome=0.91 - - bioconda::damageprofiler=0.4.9 + - bioconda::damageprofiler=0.5.0 - bioconda::multiqc=1.9 - bioconda::pmdtools=0.60 - bioconda::bedtools=2.29.2 From d8d46c40a1035bbd12a1db98a3d86b2823bed690 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Tue, 29 Sep 2020 10:25:59 +0200 Subject: [PATCH 04/14] Reverting DamageProfiler version due to Java version incompatibility --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index a0dcebd76..7dccfbc2d 100644 --- a/environment.yml +++ b/environment.yml @@ -22,7 +22,7 @@ dependencies: - bioconda::gatk4=4.1.7.0 - bioconda::qualimap=2.2.2d - bioconda::vcf2genome=0.91 - - bioconda::damageprofiler=0.5.0 + - bioconda::damageprofiler=0.4.9 - bioconda::multiqc=1.9 - bioconda::pmdtools=0.60 - bioconda::bedtools=2.29.2 From 1445062bb85859f4896d545478e02fabdb280c5d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 5 Oct 2020 16:35:08 +0200 Subject: [PATCH 05/14] Added eigenstrat_snp_coverage process --- main.nf | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 18fe22676..0c216da38 100644 --- a/main.nf +++ b/main.nf @@ -2599,7 +2599,7 @@ if (params.pileupcaller_snpfile.isEmpty ()) { path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "Pileupcaller SNP file") output: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*") + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*") into ch_for_eigenstrat_snp_coverage script: def use_bed = bed.getName() != 'nf-core_eager_dummy.txt' ? "-l ${bed}" : '' @@ -2614,7 +2614,32 @@ if (params.pileupcaller_snpfile.isEmpty ()) { samtools mpileup -B -q 30 -Q 30 ${use_bed} -f ${fasta} ${bam_list} | pileupCaller ${caller} ${ssmode} ${transitions_mode} --sampleNames ${sample_names} ${use_snp} -e pileupcaller.${strandedness} """ } - + + process eigenstrat_snp_coverage { + label 'mc_tiny' + tag "${strandedness}" + publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode + + when: + params.run_genotyping && params.genotyping_tool == 'pileupcaller' + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump() + + output: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc + path("*_eigenstrat_coverage.txt") + + script: +/* """ + eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json + """*/ + """ + eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt + parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt + """ + } + process genotyping_angsd { label 'mc_small' tag "${samplename}" @@ -3139,6 +3164,7 @@ process get_software_versions { endorS.py --version &> v_endorSpy.txt || true pileupCaller --version &> v_sequencetools.txt 2>&1 || true bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true + eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true scrape_software_versions.py &> software_versions_mqc.yaml """ @@ -3176,6 +3202,7 @@ process multiqc { file ('kraken/*') from ch_kraken_for_multiqc.collect().ifEmpty([]) file ('hops/*') from ch_hops_for_multiqc.collect().ifEmpty([]) file ('nuclear_contamination/*') from ch_nuclear_contamination_for_multiqc.collect().ifEmpty([]) + file ('genotyping/*') from ch_eigenstrat_snp_cov_for_multiqc file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") From b04a61934ebfb94da34d3e4942ccd2823a2ee9ef Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 5 Oct 2020 16:36:14 +0200 Subject: [PATCH 06/14] Parsing version of eigenstrat_snp_coverage --- bin/scrape_software_versions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 2a6d880ab..201df4a58 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -34,7 +34,8 @@ 'MTNucRatioCalculator':['v_mtnucratiocalculator.txt',r"Version: (\S+)"], 'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "], 'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"], - 'kraken':['v_kraken.txt', r"Kraken version (\S+)"] + 'kraken':['v_kraken.txt', r"Kraken version (\S+)"], + 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"] } results = OrderedDict() @@ -69,6 +70,7 @@ results['malt'] = 'N/A' results['kraken'] = 'N/A' results['maltextract'] = 'N/A' +results['eigenstrat_snp_coverage'] = 'N/A' # Search each file using its regex for k, v in regexes.items(): From efb84223d4392acebd1691f644ca9e456c478e82 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 5 Oct 2020 16:37:14 +0200 Subject: [PATCH 07/14] Small script to make MQC custom-content JSON until eigenstratdatabasetools mqc module is released. --- bin/parse_snp_cov.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100755 bin/parse_snp_cov.py diff --git a/bin/parse_snp_cov.py b/bin/parse_snp_cov.py new file mode 100755 index 000000000..e29f43268 --- /dev/null +++ b/bin/parse_snp_cov.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +import sys, json +from collections import OrderedDict + +jsonOut = OrderedDict() +data = OrderedDict() + + +input = open(sys.argv[1], 'r') +for line in input: + fields = line.strip().split() + sample_id = fields[0] + covered_snps = fields[1] + total_snps = fields[2] + if sample_id[0] == "#": + continue + + data[sample_id] = {"Covered_Snps":covered_snps, "Total_Snps":total_snps} + +jsonOut = {"plot_type": "generalstats", "id": "snp_coverage", + "pconfig": { + "Covered_Snps" : {"title" : "#SNPs Covered"}, + "Total_Snps" : {"title": "#SNPs Total"} + }, + "data" : data +} + +with open(sys.argv[1].rstrip('.txt')+'_mqc.json', 'w') as outfile: + json.dump(jsonOut, outfile) From 30ef5212639eb0072dd1ffec2c9bf6d21253dd94 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 5 Oct 2020 16:37:38 +0200 Subject: [PATCH 08/14] Added eigenstrat_snp_coverage columns --- assets/multiqc_config.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index dee97e4ce..297935c34 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -177,6 +177,9 @@ table_columns_visible: Method2_MOM_SE: False Method2_ML_estimate: False Method2_ML_SE: False + snp_coverage: + Covered_Snps: True + Total_Snps: False table_columns_placement: FastQC (pre-AdapterRemoval): @@ -220,6 +223,9 @@ table_columns_placement: Method2_MOM_SE: 1160 Method2_ML_estimate: 1170 Method2_ML_SE: 1180 + snp_coverage: + Covered_Snps: 1050 + Total_Snps: 1060 DeDup: mapped_after_dedup: 620 clusterfactor: 630 From 108e447a32e27397ec965957439a091d3d1a7e10 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 5 Oct 2020 16:39:13 +0200 Subject: [PATCH 09/14] Added eigenstratdatabasetools v1.0.2 --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 7dccfbc2d..dfb95883b 100644 --- a/environment.yml +++ b/environment.yml @@ -43,4 +43,5 @@ dependencies: - conda-forge::biopython=1.76 - conda-forge::xopen=0.9.0 - bioconda::bowtie2=2.4.1 + - bioconda::eigenstratdatabasetools=1.0.2 #Missing Schmutzi,snpAD From b728a6d4b0a5abab1504894b6fbadcc780065683 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Tue, 6 Oct 2020 12:09:56 +0200 Subject: [PATCH 10/14] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index dfb95883b..abb6772c8 100644 --- a/environment.yml +++ b/environment.yml @@ -33,7 +33,7 @@ dependencies: - bioconda::fastp=0.20.1 - bioconda::bamutil=1.0.14 - bioconda::mtnucratio=0.7 - - pysam=0.15.4 #Says python3.7 or less + - bioconda::pysam=0.15.4 #Says python3.7 or less - bioconda::kraken2=2.0.9beta - conda-forge::pandas=1.0.4 #.4 is python3.8+ compatible - bioconda::freebayes=1.3.2 #should be fine with python 3.8, but says <3.7 on webpage From e4203397d50dbd2c66cd275b784eed1b71afd551 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Oct 2020 16:04:53 +0200 Subject: [PATCH 11/14] Added eigenstrat_snp_coverage --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4da3d6ce7..497e6c7df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * [#544](https://github.com/nf-core/eager/pull/544) Add script to perform bam filtering on fragment length * [#456](https://github.com/nf-core/eager/pull/546) Bumps the base (default) runtime of all processes to 4 hours, and set shorter timelimits for test profiles (1 hour) * [#552](https://github.com/nf-core/eager/issues/552) Adds optional creation of MALT SAM files alongside RMA6 files. +* Added eigenstrat snp coverage statistics to MultiQC report. Process results are published in `genotyping/*_eigenstrat_coverage.txt`. ### `Fixed` From f90e5ba5d5c4c2a528cf08ba6c1a66280d910496 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Oct 2020 16:41:00 +0200 Subject: [PATCH 12/14] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b6f8ca380..c663f69af 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,7 @@ If you've contributed and you're missing in here, please let us know and we will * **endorS.py** Aida Andrades Valtueña (Unpublished). Download: [https://github.com/aidaanva/endorS.py](https://github.com/aidaanva/endorS.py) * **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923). * **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools) +* **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git) ## Data References From 6383d3faa155210e4960b70c3e2eb9d926a04f8e Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Oct 2020 16:41:37 +0200 Subject: [PATCH 13/14] Added info on eigenstrat_snp_coverage columns of GeneralStatsTable --- docs/output.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 5532f8fbe..a9474a3d5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -160,6 +160,8 @@ The possible columns displayed by default are as follows: - **MT to Nuclear Ratio** This from MTtoNucRatio. This reports the number of reads aligned to a mitochondrial entry in your reference FASTA to all other entries. This will typically be high but will vary depending on tissue type. - **XRate** This is from Sex.DetERRmine. This is the relative depth of coverage on the X-chromosome. - **YRate** This is from Sex.DetERRmine. This is the relative depth of coverage on the Y-chromosome. +- **#SNPs Covered** This is from eigenstrat\_snp\_coverage. The number of called SNPs after genotyping with pileupcaller. +- **#SNPs Total** This is from eigenstrat\_snp\_coverage. The maximum number of covered SNPs, i.e. the number of SNPs in the .snp file provided to pileupcaller with `--pileupcaller_snpfile`. - **Number of SNPs** This is from ANGSD. The number of SNPs left after removing sites with no data in a 5 base pair surrounding region. - **Contamination Estimate (Method1_ML)** This is from the nuclear contamination function of ANGSD. The Maximum Likelihood contamination estimate according to Method 1. The estimates using Method of Moments and/or those based on Method 2 can be unhidden through the "Configure Columns" button. - **Estimate Error (Method1_ML)** This is from ANGSD. The standard error of the Method1 Maximum likelihood estimate. The errors associated with Method of Moments and/or Method2 estimates can be unhidden through the "Configure Columns" button. @@ -721,7 +723,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir - `damageprofiler/` - this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. - `pmdtools/` this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. The BAM files do not have corresponding index files. - `trimmed_bam/` this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). -- `genotyping/` this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. +- `genotyping/` this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics. - `multivcfanalyzer/` this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files. - `sex_determination/` this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the Sample Name, the Nr of Autosomal SNPs, Nr of SNPs on the X/Y chromosome, the Nr of reads mapping to the Autosomes, the Nr of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per bam. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. - `nuclear_contamination/` this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. From 2d289df8fe359c538ea8d26e339014345b30c4e2 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Oct 2020 16:50:24 +0200 Subject: [PATCH 14/14] Added explanation of the extra code block in eigenstrat_snp_coverage process --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 0c216da38..cf9013fb7 100644 --- a/main.nf +++ b/main.nf @@ -2631,6 +2631,7 @@ if (params.pileupcaller_snpfile.isEmpty ()) { path("*_eigenstrat_coverage.txt") script: + // The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available. /* """ eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json """*/