From 15e501e72df4783509e8a9c2ef35177b8e711c7d Mon Sep 17 00:00:00 2001 From: nservant Date: Tue, 30 Apr 2019 11:53:32 +0200 Subject: [PATCH 1/2] fix markdown --- CHANGELOG.md | 23 ++--- bin/scrape_software_versions.py | 10 +- conf/base.config | 2 +- docs/configuration/local.md | 1 + docs/configuration/reference_genomes.md | 3 +- docs/installation.md | 2 +- docs/output.md | 4 +- docs/troubleshooting.md | 4 +- docs/usage.md | 124 ++++++++++++------------ main.nf | 4 +- nextflow.config | 2 +- 11 files changed, 93 insertions(+), 86 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2539792..b982a57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,14 +2,15 @@ ## v1.0dev - 2019-04-09 - First version of nf-core-hic pipeline which is a Nextflow implementation of the HiC-Pro pipeline [https://github.com/nservant/HiC-Pro]. - Note that all HiC-Pro functionalities are not yet all implemented. The current version is designed for protocols based on restriction enzyme digestion. - - In summary, this version allows : - * Automatic detection and generation of annotation files based on igenomes if not provided. - * Two-steps alignment of raw sequencing reads - * Reads filtering and detection of valid interaction products - * Generation of raw contact matrices for a set of resolutions - * Normalization of the contact maps using the ICE algorithm - * Generation of cooler file for visualization on higlass [https://higlass.io/] - * Quality report based on HiC-Pro MultiQC module +First version of nf-core-hic pipeline which is a Nextflow implementation of the [HiC-Pro pipeline](https://github.com/nservant/HiC-Pro/). +Note that not all HiC-Pro functionalities are implemented yet. The current version is designed for protocols based on restriction enzyme digestion. + +In summary, this version allows : + +* Automatic detection and generation of annotation files based on igenomes if not provided. 
+* Two-steps alignment of raw sequencing reads +* Reads filtering and detection of valid interaction products +* Generation of raw contact matrices for a set of resolutions +* Normalization of the contact maps using the ICE algorithm +* Generation of cooler file for visualization on [higlass](https://higlass.io/) +* Quality report based on HiC-Pro MultiQC module diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 8cf977c..7a38fee 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -3,17 +3,21 @@ from collections import OrderedDict import re -# TODO nf-core: Add additional regexes for new tools in process get_software_versions +# Each regex must match the raw `--version` output captured by process get_software_versions regexes = { 'nf-core/hic': ['v_pipeline.txt', r"(\S+)"], 'Nextflow': ['v_nextflow.txt', r"(\S+)"], - 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], + 'Bowtie2': ['v_bowtie2.txt', r"bowtie2-align-[sl] version (\S+)"], + 'Python': ['v_python.txt', r"Python (\S+)"], + 'Samtools': ['v_samtools.txt', r"samtools (\S+)"], 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], } results = OrderedDict() results['nf-core/hic'] = 'N/A' results['Nextflow'] = 'N/A' -results['FastQC'] = 'N/A' +results['Bowtie2'] = 'N/A' +results['Python'] = 'N/A' +results['Samtools'] = 'N/A' results['MultiQC'] = 'N/A' # Search each file using its regex diff --git a/conf/base.config b/conf/base.config index 156fa28..28b4679 100644 --- a/conf/base.config +++ b/conf/base.config @@ -11,7 +11,7 @@ process { - // TODO nf-core: Check the defaults for all processes + // Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 8.GB * task.attempt, 'memory' ) } time = { check_max( 2.h * task.attempt, 'time' ) } diff --git a/docs/configuration/local.md b/docs/configuration/local.md index 9cd485e..d4530fa 100644 --- a/docs/configuration/local.md +++ b/docs/configuration/local.md @@ -10,6 +10,7 @@ Nextflow has 
[excellent integration](https://www.nextflow.io/docs/latest/docker. First, install docker on your system: [Docker Installation Instructions](https://docs.docker.com/engine/installation/) Then, simply run the analysis pipeline: + ```bash nextflow run nf-core/hic -profile docker --genome '' ``` diff --git a/docs/configuration/reference_genomes.md b/docs/configuration/reference_genomes.md index 1fafa8f..c52faf8 100644 --- a/docs/configuration/reference_genomes.md +++ b/docs/configuration/reference_genomes.md @@ -39,11 +39,12 @@ Multiple reference index types are held together with consistent structure for m We have put a copy of iGenomes up onto AWS S3 hosting and this pipeline is configured to use this by default. The hosting fees for AWS iGenomes are currently kindly funded by a grant from Amazon. The pipeline will automatically download the required reference files when you run the pipeline. -For more information about the AWS iGenomes, see https://ewels.github.io/AWS-iGenomes/ +For more information about the AWS iGenomes, see [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) Downloading the files takes time and bandwidth, so we recommend making a local copy of the iGenomes resource. Once downloaded, you can customise the variable `params.igenomes_base` in your custom configuration file to point to the reference location. For example: + ```nextflow params.igenomes_base = '/path/to/data/igenomes/' ``` diff --git a/docs/installation.md b/docs/installation.md index 70c4a6d..9ac66d5 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -74,7 +74,7 @@ Be warned of two important points about this default configuration: #### 3.1) Software deps: Docker First, install docker on your system: [Docker Installation Instructions](https://docs.docker.com/engine/installation/) -Then, running the pipeline with the option `-profile docker` tells Nextflow to enable Docker for this run. 
An image containing all of the software requirements will be automatically fetched and used from dockerhub (https://hub.docker.com/r/nfcore/hic). +Then, running the pipeline with the option `-profile docker` tells Nextflow to enable Docker for this run. An image containing all of the software requirements will be automatically fetched and used from [dockerhub](https://hub.docker.com/r/nfcore/hic). #### 3.1) Software deps: Singularity If you're not able to use Docker then [Singularity](http://singularity.lbl.gov/) is a great alternative. diff --git a/docs/output.md b/docs/output.md index f395dcd..53c9c0c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -64,7 +64,7 @@ Short range interactions that are likely to be spurious ligation products can th The validPairs are stored using a simple tab-delimited text format ; -``` +```bash read name / chr_reads1 / pos_reads1 / strand_reads1 / chr_reads2 / pos_reads2 / strand_reads2 / fragment_size / res frag name R1 / res frag R2 / mapping qual R1 / mapping qual R2 [/ allele_specific_tag] ``` @@ -102,7 +102,7 @@ A contact map is defined by : Based on the observation that a contact map is symmetric and usually sparse, only non-zero values are stored for half of the matrix. The user can specified if the 'upper', 'lower' or 'complete' matrix has to be stored. The 'asis' option allows to store the contacts as they are observed from the valid pairs files. -``` +```bash A B 10 A C 23 B C 24 diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index e6772eb..e0f2d07 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -7,11 +7,11 @@ If only no file, only one input file , or only read one and not read two is pick 1. The path must be enclosed in quotes (`'` or `"`) 2. The path must have at least one `*` wildcard character. This is even if you are only running one paired end sample. 3. When using the pipeline with paired end data, the path must use `{1,2}` or `{R1,R2}` notation to specify read pairs. -4. 
If you are running Single end data make sure to specify `--singleEnd` +4. If you are running Single end data make sure to specify `--singleEnd` If the pipeline can't find your files then you will get the following error -``` +```bash ERROR ~ Cannot find any reads matching: *{1,2}.fastq.gz ``` diff --git a/docs/usage.md b/docs/usage.md index 4f6825e..9b2bb6a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -7,69 +7,69 @@ * [Updating the pipeline](#updating-the-pipeline) * [Reproducibility](#reproducibility) * [Main arguments](#main-arguments) - * [`-profile`](#-profile-single-dash) - * [`awsbatch`](#awsbatch) - * [`conda`](#conda) - * [`docker`](#docker) - * [`singularity`](#singularity) - * [`test`](#test) - * [`--reads`](#--reads) - * [`--singleEnd`](#--singleend) + * [`-profile`](#-profile-single-dash) + * [`awsbatch`](#awsbatch) + * [`conda`](#conda) + * [`docker`](#docker) + * [`singularity`](#singularity) + * [`test`](#test) + * [`--reads`](#--reads) + * [`--singleEnd`](#--singleend) * [Reference genomes](#reference-genomes) - * [`--genome`](#--genome) - * [`--fasta`](#--fasta) - * [`--igenomesIgnore`](#--igenomesignore) - * [`--bwt2_index`](#--bwt2_index) - * [`--chromosome_size`](#--chromosome_size) - * [`--restriction_fragments`](#--restriction_fragments) + * [`--genome`](#--genome) + * [`--fasta`](#--fasta) + * [`--igenomesIgnore`](#--igenomesignore) + * [`--bwt2_index`](#--bwt2_index) + * [`--chromosome_size`](#--chromosome_size) + * [`--restriction_fragments`](#--restriction_fragments) * [Hi-C specific options](#hi-c-specific-options) - * [Reads mapping](#reads-mapping) - * [`--bwt2_opts_end2end`](#--bwt2_opts_end2end) - * [`--bwt2_opts_trimmed`](#--bwt2_opts_trimmed) - * [`--min_mapq`](#--min_mapq) - * [Digestion Hi-C](#digestion-hi-c) - * [`--restriction_site`](#--restriction_site) - * [`--ligation_site`](#--ligation_site) - * [`--min_restriction_fragment_size`](#--min_restriction_fragment_size) - * 
[`--max_restriction_fragment_size`](#--max_restriction_fragment_size) - * [`--min_insert_size`](#--min_insert_size) - * [`--max_insert_size`](#--max_insert_size) - * [DNase Hi-C](#dnase-hi-c) - * [`--dnase`](#--dnase) - * [Hi-C Processing](#hi-c-processing) - * [`--min_cis_dist`](#--min_cis_dist) - * [`--rm_singleton`](#--rm_singleton) - * [`--rm_dup`](#--rm_dup) - * [`--rm_multi`](#--rm_multi) - * [Genome-wide contact maps](#genome-wide-contact-maps) - * [`--bins_size`](#--bins_size) - * [`--ice_max_iter`](#--ice_max_iter) - * [`--ice_filer_low_count_perc`](#--ice_filer_low_count_perc) - * [`--ice_filer_high_count_perc`](#--ice_filer_high_count_perc) - * [`--ice_eps`](#--ice_eps) - * [Inputs/Outputs](#inputs-outputs) - * [`--splitFastq`](#--splitFastq) - * [`--saveReference`](#--saveReference) - * [`--saveAlignedIntermediates`](#--saveAlignedIntermediates) + * [Reads mapping](#reads-mapping) + * [`--bwt2_opts_end2end`](#--bwt2_opts_end2end) + * [`--bwt2_opts_trimmed`](#--bwt2_opts_trimmed) + * [`--min_mapq`](#--min_mapq) + * [Digestion Hi-C](#digestion-hi-c) + * [`--restriction_site`](#--restriction_site) + * [`--ligation_site`](#--ligation_site) + * [`--min_restriction_fragment_size`](#--min_restriction_fragment_size) + * [`--max_restriction_fragment_size`](#--max_restriction_fragment_size) + * [`--min_insert_size`](#--min_insert_size) + * [`--max_insert_size`](#--max_insert_size) + * [DNase Hi-C](#dnase-hi-c) + * [`--dnase`](#--dnase) + * [Hi-C Processing](#hi-c-processing) + * [`--min_cis_dist`](#--min_cis_dist) + * [`--rm_singleton`](#--rm_singleton) + * [`--rm_dup`](#--rm_dup) + * [`--rm_multi`](#--rm_multi) + * [Genome-wide contact maps](#genome-wide-contact-maps) + * [`--bins_size`](#--bins_size) + * [`--ice_max_iter`](#--ice_max_iter) + * [`--ice_filer_low_count_perc`](#--ice_filer_low_count_perc) + * [`--ice_filer_high_count_perc`](#--ice_filer_high_count_perc) + * [`--ice_eps`](#--ice_eps) + * [Inputs/Outputs](#inputs-outputs) + * 
[`--splitFastq`](#--splitfastq) + * [`--saveReference`](#--savereference) + * [`--saveAlignedIntermediates`](#--savealignedintermediates) * [Job resources](#job-resources) * [Automatic resubmission](#automatic-resubmission) * [Custom resource requests](#custom-resource-requests) * [AWS batch specific parameters](#aws-batch-specific-parameters) - * [`-awsbatch`](#-awsbatch) - * [`--awsqueue`](#--awsqueue) - * [`--awsregion`](#--awsregion) + * [`-awsbatch`](#-awsbatch) + * [`--awsqueue`](#--awsqueue) + * [`--awsregion`](#--awsregion) * [Other command line parameters](#other-command-line-parameters) - * [`--outdir`](#--outdir) - * [`--email`](#--email) - * [`-name`](#-name-single-dash) - * [`-resume`](#-resume-single-dash) - * [`-c`](#-c-single-dash) - * [`--custom_config_version`](#--custom_config_version) - * [`--max_memory`](#--max_memory) - * [`--max_time`](#--max_time) - * [`--max_cpus`](#--max_cpus) - * [`--plaintext_email`](#--plaintext_email) - * [`--multiqc_config`](#--multiqc_config) + * [`--outdir`](#--outdir) + * [`--email`](#--email) + * [`-name`](#-name-single-dash) + * [`-resume`](#-resume-single-dash) + * [`-c`](#-c-single-dash) + * [`--custom_config_version`](#--custom_config_version) + * [`--max_memory`](#--max_memory) + * [`--max_time`](#--max_time) + * [`--max_cpus`](#--max_cpus) + * [`--plaintext_email`](#--plaintext_email) + * [`--multiqc_config`](#--multiqc_config) ## General Nextflow info @@ -83,6 +83,7 @@ NXF_OPTS='-Xms1g -Xmx4g' ## Running the pipeline The typical command for running the pipeline is as follows: + ```bash nextflow run nf-core/hic --reads '*_R{1,2}.fastq.gz' -genome GRCh37 -profile docker ``` @@ -135,8 +136,6 @@ If `-profile` is not specified at all the pipeline will be run locally and expec * A profile with a complete configuration for automated testing * Includes links to test data so needs no other parameters - - ### `--reads` Use this to specify the location of your input FastQ files. 
For example: @@ -211,7 +210,8 @@ The bowtie2 indexes are required to run the Hi-C pipeline. If the `--bwt2_index` The Hi-C pipeline will also requires a two-columns text file with the chromosome name and its size (tab separated). If not specified, this file will be automatically created by the pipeline. In the latter case, the `--fasta` reference genome has to be specified. -``` + +```bash chr1 249250621 chr2 243199373 chr3 198022430 @@ -233,7 +233,7 @@ If not specified, this file will be automatically created by the pipeline. In th Finally, Hi-C experiments based on restriction enzyme digestion requires a BED file with coordinates of restriction fragments. -``` +```bash chr1 0 16007 HIC_chr1_1 0 + chr1 16007 24571 HIC_chr1_2 0 + chr1 24571 27981 HIC_chr1_3 0 + @@ -445,7 +445,7 @@ The `--splitFastq` option allows to automatically split input read pairs into ch If specified, annotation files automatically generated from the `--fasta` file are exported in the results folder. Default: false -``` +```bash --saveReference ``` @@ -453,7 +453,7 @@ If specified, annotation files automatically generated from the `--fasta` file a If specified, all intermediate mapping files are saved and exported in the results folder. 
Default: false -``` +```bash --saveReference ``` diff --git a/main.nf b/main.nf index bccbb0d..17ff4d3 100644 --- a/main.nf +++ b/main.nf @@ -11,7 +11,7 @@ def helpMessage() { - // TODO nf-core: Add to this help message with new command line parameters + // Add to this help message with new command line parameters log.info nfcoreHeader() log.info""" @@ -868,7 +868,7 @@ workflow.onComplete { email_fields['summary']['Nextflow Build'] = workflow.nextflow.build email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - // TODO nf-core: If not using MultiQC, strip out this code (including params.maxMultiqcEmailFileSize) + // If not using MultiQC, strip out this code (including params.maxMultiqcEmailFileSize) // On success try attach the multiqc report def mqc_report = null try { diff --git a/nextflow.config b/nextflow.config index a526e9e..eba5139 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,7 +9,7 @@ params { // Workflow flags - // TODO nf-core: Specify your pipeline's command line flags + // Specify your pipeline's command line flags reads = "*{1,2}.fastq.gz" outdir = './results' genome = false From 06bc08eaaa34994269e4881399e2ca4e944c9226 Mon Sep 17 00:00:00 2001 From: nservant Date: Tue, 30 Apr 2019 14:57:40 +0200 Subject: [PATCH 2/2] fix conda env --- environment.yml | 6 +++--- main.nf | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index ed47c8b..745fdbc 100644 --- a/environment.yml +++ b/environment.yml @@ -6,7 +6,7 @@ channels: - bioconda - defaults dependencies: - - python=2.7.13 + - python=2.7.16 - pip=18.1 - conda-forge::scipy=1.0.1 - conda-forge::numpy=1.9.3 @@ -15,7 +15,7 @@ dependencies: - bioconda::pysam=0.14.1 - cooler=0.8.3 - bowtie2=2.3.5 - - samtools=1.7 - - multiqc=1.6 + - samtools=1.9 + - bioconda::multiqc=1.7 - pip: - iced==0.4.2 diff --git a/main.nf b/main.nf index 17ff4d3..eeb6923 100644 --- a/main.nf +++ b/main.nf @@ -236,6 +236,7 @@ summary['Run 
Name'] = custom_runName ?: workflow.runName summary['Reads'] = params.reads summary['splitFastq'] = params.splitFastq summary['Fasta Ref'] = params.fasta +summary['Restriction Motif']= params.restriction_site summary['Ligation Motif'] = params.ligation_site summary['DNase Mode'] = params.dnase summary['Remove Dup'] = params.rm_dup @@ -311,8 +312,9 @@ process get_software_versions { echo $workflow.manifest.version > v_pipeline.txt echo $workflow.nextflow.version > v_nextflow.txt bowtie2 --version > v_bowtie2.txt - python --version > v_python.txt + python --version > v_python.txt 2>&1 samtools --version > v_samtools.txt + multiqc --version > v_multiqc.txt scrape_software_versions.py &> software_versions_mqc.yaml """ }