Skip to content

Commit

Permalink
Merge pull request #922 from maxplanck-ie/dev_KS
Browse files Browse the repository at this point in the history
updated documentation after moving SalmonIndex
  • Loading branch information
katsikora committed Aug 17, 2023
2 parents c113679 + 7942cdf commit b4772f1
Show file tree
Hide file tree
Showing 35 changed files with 488 additions and 313 deletions.
4 changes: 4 additions & 0 deletions .ci_stuff/organism.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@ bwameth_index: "/tmp/genome.fa"
bwameth2_index: "/tmp/genome.fa"
known_splicesites: ".ci_stuff/gencode/m9/HISAT2/splice_sites.txt"
star_index: ".ci_stuff/STARIndex/"
salmon_index: "/tmp/SalmonIndex/"
salmon_velocity_index: "/tmp/SalmonIndex_RNAVelocity/"
t2g_velocity: "/tmp/annotation/cDNA_introns.joint.t2g"
genes_bed: "/tmp/genes.bed"
genes_gtf: "/tmp/genes.gtf"
genes_t2g: "/tmp/genes.t2g"
extended_coding_regions_gtf: ".ci_stuff/gencode/m9/genes.slop.gtf"
blacklist_bed: ".ci_stuff/DKFZ/GRCm38_General_readAttractingRegions.UseThisOne.bed"
ignoreForNormalization: "MT X Y JH584299.1 GL456233.1 JH584301.1 GL456211.1 GL456350.1 JH584293.1 GL456221.1 JH584297.1 JH584296.1 GL456354.1 JH584294.1 JH584298.1 JH584300.1 GL456219.1 GL456210.1 JH584303.1 JH584302.1 GL456212.1 JH584304.1 GL456379.1 GL456216.1 GL456393.1 GL456366.1 GL456367.1 GL456239.1 GL456213.1 GL456383.1 GL456385.1 GL456360.1 GL456378.1 GL456389.1 GL456372.1 GL456370.1 GL456381.1 GL456387.1 GL456390.1 GL456394.1 GL456392.1 GL456382.1 GL456359.1 GL456396.1 GL456368.1 JH584292.1 JH584295.1"
Expand Down
4 changes: 4 additions & 0 deletions .ci_stuff/spikein_organism.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@ bwa_index: ".ci_stuff/BWAindex/genome.fa"
bwameth_index: "/tmp/genome.fa"
known_splicesites: ".ci_stuff/gencode/m9/HISAT2/splice_sites.txt"
star_index: ".ci_stuff/STARIndex/"
salmon_index: "/tmp/SalmonIndex/"
salmon_velocity_index: "/tmp/SalmonIndex_RNAVelocity/"
t2g_velocity: "/tmp/annotation/cDNA_introns.joint.t2g"
genes_bed: "/tmp/genes.bed"
genes_gtf: "/tmp/genes.gtf"
genes_t2g: "/tmp/genes.t2g"
spikein_genes_gtf: "/tmp/spikein_genes.gtf"
extended_coding_regions_gtf: ".ci_stuff/gencode/m9/genes.slop.gtf"
blacklist_bed: ".ci_stuff/DKFZ/GRCm38_General_readAttractingRegions.UseThisOne.bed"
Expand Down
24 changes: 14 additions & 10 deletions .ci_stuff/test_dag.sh
Original file line number Diff line number Diff line change
Expand Up @@ -111,20 +111,24 @@ mkdir -p allelic_input
mkdir -p allelic_input/Ngenome
touch allelic_input/file.vcf.gz allelic_input/snpfile.txt
cp .ci_stuff/genome.fa .ci_stuff/genome.fa.fai /tmp/
mkdir -p /tmp/SalmonIndex /tmp/annotation
touch /tmp/SalmonIndex/decoys.txt
touch /tmp/annotation/cDNA_introns.joint.t2g
touch /tmp/genes.t2g

# Ensure an empty snakePipes config doesn't muck anything up
snakePipes config

# createIndices
WC=`createIndices -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --genome ftp://ftp.ensembl.org/pub/release-93/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz --gtf ftp://ftp.ensembl.org/pub/release-93/gtf/mus_musculus/Mus_musculus.GRCm38.93.gtf.gz blah | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 184 ]; then exit 1 ; fi
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 229 ]; then exit 1 ; fi
WC=`createIndices -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --genome ftp://ftp.ensembl.org/pub/release-93/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz --gtf ftp://ftp.ensembl.org/pub/release-93/gtf/mus_musculus/Mus_musculus.GRCm38.93.gtf.gz --rmskURL http://hgdownload.soe.ucsc.edu/goldenPath/dm6/database/rmsk.txt.gz blah | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 192 ]; then exit 1 ; fi
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 237 ]; then exit 1 ; fi
WC=`createIndices -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --DAG --genome ftp://ftp.ensembl.org/pub/release-93/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz --gtf ftp://ftp.ensembl.org/pub/release-93/gtf/mus_musculus/Mus_musculus.GRCm38.93.gtf.gz --rmskURL http://hgdownload.soe.ucsc.edu/goldenPath/dm6/database/rmsk.txt.gz blah | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 192 ]; then exit 1 ; fi
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 237 ]; then exit 1 ; fi
# spikein
WC=`createIndices -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --DAG --genomeURL ftp://ftp.ensembl.org/pub/release-93/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz --gtfURL ftp://ftp.ensembl.org/pub/release-93/gtf/mus_musculus/Mus_musculus.GRCm38.93.gtf.gz --spikeinGenomeURL ftp://ftp.ensembl.org/pub/release-79/fasta/drosophila_melanogaster/dna/Drosophila_melanogaster.BDGP6.dna_sm.toplevel.fa.gz --spikeinGtfURL ftp://ftp.ensembl.org/pub/release-96/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.22.96.gtf.gz --rmskURL http://hgdownload.soe.ucsc.edu/goldenPath/dm6/database/rmsk.txt.gz blah | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 239 ]; then exit 1 ; fi
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 284 ]; then exit 1 ; fi


# DNA mapping
Expand Down Expand Up @@ -194,7 +198,7 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 841 ]; then exit 1 ; fi
WC=`mRNA-seq -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" -m "alignment,deepTools_qc" --trim .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1261 ]; then exit 1 ; fi
WC=`mRNA-seq -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" -m "alignment-free,deepTools_qc" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1333 ]; then exit 1 ; fi
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1309 ]; then exit 1 ; fi
WC=`mRNA-seq -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" -m "alignment,deepTools_qc" --bcExtract --trim .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1217 ]; then exit 1 ; fi
WC=`mRNA-seq -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" -m "alignment,deepTools_qc" --bcExtract --UMIDedup --trim .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
Expand All @@ -206,14 +210,14 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 741 ]; then exit 1 ; fi
WC=`mRNA-seq -i SE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" -m "alignment,deepTools_qc" --trim .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1150 ]; then exit 1 ; fi
WC=`mRNA-seq -i SE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" -m "alignment-free,deepTools_qc" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1222 ]; then exit 1 ; fi
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1198 ]; then exit 1 ; fi
WC=`mRNA-seq -i SE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --trim --fastqc .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1286 ]; then exit 1 ; fi
WC=`mRNA-seq -i BAM_input/filtered_bam -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --fromBAM .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 833 ]; then exit 1 ; fi
#multiple comparison groups
WC=`mRNA-seq --mode alignment,alignment-free -i PE_input -o output --rMats --sampleSheet .ci_stuff/test_sampleSheet_multiComp.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1178 ]; then exit 1 ; fi
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1154 ]; then exit 1 ; fi
# three prime sequencing
WC=`mRNA-seq -i PE_input -o output --mode three-prime-seq --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 866 ]; then exit 1 ; fi
Expand All @@ -231,7 +235,7 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1762 ]; then exit 1 ; fi
WC=`mRNA-seq -m allelic-mapping,deepTools_qc -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1783 ]; then exit 1 ; fi
WC=`mRNA-seq -m allelic-mapping,deepTools_qc,alignment-free -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2328 ]; then exit 1 ; fi
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2304 ]; then exit 1 ; fi

WC=`noncoding-RNA-seq -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 968 ]; then exit 1 ; fi
Expand All @@ -255,9 +259,9 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1277 ]; then exit 1 ; fi
WC=`scRNAseq -i PE_input -o output --mode STARsolo --skipVelocyto --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1143 ]; then exit 1 ; fi
WC=`scRNAseq -i PE_input -o output --mode Alevin --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 583 ]; then exit 1 ; fi
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 536 ]; then exit 1 ; fi
WC=`scRNAseq -i PE_input -o output --mode Alevin --skipVelocyto --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 511 ]; then exit 1 ; fi
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 446 ]; then exit 1 ; fi

# WGBS
WC=`WGBS -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
Expand Down
2 changes: 2 additions & 0 deletions docs/content/News.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ snakePipes News

snakePipes x.x.x
----------------

* Moved SalmonIndex creation from mRNAseq and scRNAseq to createIndices.
* Changed the behaviour of snakePipes createEnvs - it is no longer possible to set condaEnvDir with this function. It is required to set it with snakePipes config beforehand, instead. To ignore what's in the defaults.yaml and overwrite the condaEnvDir value with default system conda prefix, use '--autodetectCondaEnvDir'.
* Snakemake options in the defaults.yaml are now an empty string. The required arguments '--use-conda --conda-prefix' have been directly added to the command string. condaEnvDir is parsed from defaults.yaml, requiring running snakePipes config first.
* Fixes #819
Expand Down
28 changes: 15 additions & 13 deletions docs/content/workflows/createIndices.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ There is a configuration file in ``snakePipes/workflows/createIndices/defaults.y
outdir:
configFile:
clusterConfigFile:
local: false
local: False
maxJobs: 5
verbose: False
## Genome name used in snakePipes (no spaces!)
Expand All @@ -35,13 +35,23 @@ There is a configuration file in ``snakePipes/workflows/createIndices/defaults.y
tools: all
## URLs or paths for fasta and GTF files
genomeURL:
spikeinGenomeURL:
spikeinExt: '_spikein'
gtfURL:
spikeinGtfURL:
## The effective genome size
effectiveGenomeSize: 0
## Regions to blacklist in the ChIP-seq and related workflows
blacklist:
spikeinBlacklist:
## Regions to ignore during normalization (e.g., with bamCompare)
ignoreForNorm:
ignoreForNormalization:
## Repeat masker file. It's assumed that the columns are tab separated!
rmsk_file:
## Salmon Index Options
salmonIndexOptions: --type puff -k 31
eisaR_flank_length: 80


These values are most conveniently set on the command line.

Expand All @@ -57,30 +67,22 @@ The following structure will be created in the designated ``outdir``::

.
├── annotation
│   ├── blacklist.bed
│   ├── genes.bed
│   ├── genes.gtf
│   └── genes.slop.gtf
├── BowtieIndex
├── BWAIndex
├── BWAmethIndex
├── createIndices.cluster_config.yaml
├── createIndices.config.yaml
├── createIndices_run-1.log
├── genome_fasta
│   ├── effectiveSize
│   ├── genome.2bit
│   ├── genome.fa
│   └── genome.fa.fai
├── HISAT2Index
└── STARIndex
├── STARIndex
├── SalmonIndex
└── SalmonIndex_RNAVelocity

These files are used internally within snakePipes and don't require further inspection. The ``createIndices_run-1.log`` file contains a full log and will include the URLs or file paths that you specified. Whether the ``annotation/blacklist.bed`` file exists is dependent upon whether you specified one. The ``genome_fasta/effectiveSize`` file will have the effective genome size (if you didn't specify it, the number of non-N bases in the genome will be used).

In addition to these, an organism yaml file will be created. Its location can be found with ``snakePipes info``.

.. note:: The astute observer will note that no Salmon index is created. This is intentional and done to facilitate users changing which transcripts should be included on the fly.

Command line options
--------------------

Expand Down

0 comments on commit b4772f1

Please sign in to comment.