# Skip to content

# 8 Configuration file

# Nuno Fonseca edited this page Mar 13, 2018 · 13 revisions
# This is a comment (lines that start with a # are ignored by iRAP)
# =============================================================================
## Name that you want to give to the experiment/analysis (no spaces).
## All files produced by iRAP will be placed in a folder with the given name.
name=myexp

## =============================================================================
## Name of the species. It is also the name of the sub-folder used under
## $data_dir/raw_data and $data_dir/reference (see the data_dir layout below).
species=homo_sapiens

## =============================================================================
## Fasta file with the reference genome
## (expected in $data_dir/reference/$species)
reference=Homo_sapiens.GRCh37.66.dna.fa

## =============================================================================
## GTF file with the annotations
## (expected in $data_dir/reference/$species)
gtf_file=Homo_sapiens.GRCh37.66.gtf

## =============================================================================
## Fasta file with the transcripts to be used for transcript level
## quantification.
## Since version 1.0.0, if set to auto then the fasta
## file with the transcripts will be created by iRAP (check
## user_trans_biotypes).
# cdna_file=auto

## =============================================================================
## Which biotypes to include when performing transcript level quantification (since version 1.0.0)
## Only makes sense if cdna_file is set to auto.
# user_trans_biotypes=protein_coding|IG_([a-zA-Z0-9]+)_gene|TR_([a-zA-Z0-9]+)_gene

## =============================================================================
## IRAP options (may be provided/overridden in the command line)

#######################################
## Mapper
#mapper=

## Collect and process junctions files generated by the different
## mappers (star and tophat2) - since version 1.0.0
#junction_quant=y

#######################################
## Quantification method
#quant_method=

## Transcript level quantification? Yes (y)| No (n)
# transcript_quant=y

## generate relative isoform usage matrices
# gen_riu=y

## Dominant transcript fold-change
## Only applicable if a transcript quantification method is used
## 'n' disables the generation of the dominant transcript file
#dt_fc=2

## Transcript differential expression method
#transcript_de_method=ebseq

## Exon level quantification ? Yes (y)| No (n)
#exon_quant=y

## exon quantification method 
#exon_quant_method=dexseq

## normalization of counts
## tool to use (irap|kallisto|nurd|...|none)
## default: irap
#quant_norm_tool=irap 

## normalization method to use (fpkm|uq-fpkm|fpkm-uq|deseq_nlib|tpm|none)
## default: none
#quant_norm_method=fpkm 

## Comma-separated list of biotypes that defines the set of
## genes used to compute the total number of reads
## (fpkm/tpm). Default: all biotypes
# quant_norm_mass_biotypes=protein_coding,lncRNA,pseudogenes
#######################################
## Dif. expression method
## Requires: contrasts to be defined (see below)

#de_method=

## Filter for DE analysis
## keep the genes/transcripts/exons with a total number of reads above X
# gene_de_min_count=0
# transcript_de_min_count=0
# exon_de_min_count=0


## used only in the HTML reports
# de_pvalue_cutoff=0.05
# transcript_de_pvalue_cutoff=0.05
# exon_de_pvalue_cutoff=0.05
# de_num_transcripts_per_table=300
# de_num_exons_per_table=300
# de_num_genes_per_table=300
# de_num_transcripts_per_table=300

#######################################
## Gene set enrichment (GSE) analysis
## Requires: de_method to be defined and a gene annotation file (see below).
## Tool to use (none|piano)
#gse_tool=piano

## gse_method: (mean|median|sum|fisher|fisher-exact|stouffer|tailStrength|wilcoxon|reporter|page) see the Piano vignette documentation for more details
## note: fisher-exact is implemented outside Piano's code
#gse_method=fisher
#gse_pvalue=0.05
## Minimum number of genes in a gene set
#gse_minsize=3



##  TSV file with gene annotation
##  Format: "ID","Name","locus","source","lname","GO","GOterm","KEGG"
##  Description:
##  ID=gene_id (this is mandatory and should match the one given in the gtf file)
##  Name=gene name
##  locus=chr:start--end
##  source=biotype
##  lname=gene name
##  GO=GO ids (multiple values can be separated by ,)
##  GOterm=GO terms (multiple values can be separated by ,)
##  KEGG=KEGG ids (multiple values can be separated by ,)
##  If auto is defined then iRAP will *try* to generate the file
##  - this may take a considerable amount of time and will only work
##    for a reduced number of species
##  off - default value
#annot_tsv=

######################################
## Fusion gene analysis (fusionmap|none)
#fusion_method=none
## minimum number of reads that need to support a fusion
#fusion_support_reads=3

######################################
## QC
## Check data (reads) quality (on|report|off)
## on  - reads are filtered out based on their quality
## report - the quality of the reads is assessed but no filtering is done
## off - no quality control is performed
#qual_filtering=on

## Trim poly-A/T? y|n
#trim_poly_at=n

## by default, if a read has at least 10 consecutive A or T in the
## edges then it will be trimmed. This option is only used if
## trim_poly_at is set to y
#trim_poly_at_len=10

## Trim all reads to the minimum read size after quality trimming -  Yes (y)| No (n)
## only applicable if qual_filtering is on
#trim_reads=y

## Minimum read size/length after trimming
## If trimming is enabled, reads will be trimmed to have the number of bases defined by this parameter.
## By default, if trimming is enabled, reads will be trimmed to have 85% of the original length. 
# min_read_length=50

## Minimum base quality accepted (def. 10)
#min_read_quality=10

## Maximum (percentage) of uncalled bases acceptable in a read
## max_n=100 disables the filtering
## max_n=0 discard reads with at least 1 uncalled base
#max_n=0


## Contamination check (cont_index parameter).  Reads that likely
## originate from organisms other than the one under study can be
## discarded during pre-processing of the reads. This is done by
## aligning the reads to the genomes of organisms that might be a
## source of contamination and discarding those that map with a high
## degree of fidelity. By default iRAP will check if the data is
## contaminated by E. coli. An example of how to create a contamination
## "database" is provided in the examples/ex_add2contaminationDB.sh
## script. The value of the parameter should be the file name prefix of
## the bowtie index files.

## Disable contamination check
#cont_index=no
## Default value
#cont_index=$(data_dir)/contamination/e_coli

###########################################
## Misc. options

## Number of threads that may be used by IRAP
#max_threads=1

## Maximum memory (in MB)
#max_mem=5000


## =============================================================================
## Full or relative path to the directory where all the data can be found.
## It should contain the contamination, raw_data and reference sub-folders
## (see the layout described below).
data_dir=data

## the directory should be organized as follows (see directory data in IRAP toplevel directory)
##
## $data_dir
##$data_dir/
##├── contamination
##│   ├── e_coli.1.ebwt
##│   ├── e_coli.2.ebwt
##│   ├── e_coli.3.ebwt
##│   ├── e_coli.4.ebwt
##│   ├── e_coli.README
##│   ├── e_coli.rev.1.ebwt
##│   └── e_coli.rev.2.ebwt
##├──  raw_data
##│     └──  $species
##│         ├──  file1.fastq.gz
##│         ├──  file2.fastq.gz
##│         ├──  ...
##└──  reference
##     └──  $species
##          ├──  $gtf_file
##          └──  $reference
##
## Notes: 
##  1) $ denotes the value defined for the variable  
##  2) Since version 0.5.0 the raw data (.fastq/.bam) files may be
## distributed across several sub-folders.


## =============================================================================
## Only necessary if you intend to perform Differential Expression analysis

## contrasts=contrast_def [contrast_def ...]
## Every contrast name listed here must be defined below, and every group
## used by a contrast must also be defined.
contrasts=purpleVsPink purpleVsGrey

## definition of each contrast
## contrast_name=group group [group ...]
purpleVsPink=Purple Pink
purpleVsGrey=Purple Grey

## groups
## GroupName=Library_name [Library_name ...]
Purple=myLib1 myLib2
Pink=myLib3
Grey=myLib4

## optional parameter: used in the report (HTML) generation.
#groups=Purple Pink Grey

## technical replicates
#technical.replicates="myLib1,myLib2;myLib3;myLib4"

## Note: names of groups, contrasts, and libraries should start with a letter and contain only alphanumeric characters and the character _. 

## =============================================================================
## Data


## Information for each library
## LibName=Fastq file
## Note:
## 1. LibName should start with a letter and contain only alphanumeric characters and the character _. LibName should not contain _1 or _2.
## 2. LibName should be different from the name of the fastq file, for instance 
## f1=f1.fastq
## will produce an error.
 
## Single-end
## LibName=Fastq file (placed under $data_dir/raw_data/$species)
myLib1=f1.fastq
## read size (read length, in bases)
myLib1_rs=75
## quality encoding (33 or 64)
myLib1_qual=33

## strand specific protocol?
#myLib1_strand=first
#myLib1_strand=second
## Default value is both (strands)
#myLib1_strand=both

## if the libraries have been spiked then you may provide the fasta
## file with the spiked sequences (in the raw_data folder)
## if set to ERCC then iRAP will assume the standard ERCC spikeins
#spikein_fasta=ERCC

## Library files located in a different sub-folder
# myLib1_dir=somesubfolder

## See SAM/BAM specification for more details about the following two parameters
## read group id (to be included in the BAM file) - this is not supported by all mappers
#myLib1_rgid=
## sam/bam header lines to include in the BAM file
#myLib1_shl="@CO\tThis is a comment\n@CO\tand another line..."

## Second single-end library
## LibName=Fastq file
myLib2=f2.fastq
## read size (read length, in bases)
myLib2_rs=75
## quality encoding (33 or 64)
myLib2_qual=33


## Paired-end
## LibName=Fastq files (two files, one per mate, separated by a space)
myLib3=f3_1.fastq f3_2.fastq
## read size (read length, in bases)
myLib3_rs=50
## quality encoding (33 or 64)
myLib3_qual=33
## insert size
myLib3_ins=350
## standard deviation of the insert size
myLib3_sd=60

## LibName=Fastq files (two files, one per mate, separated by a space)
myLib4=f4_1.fastq f4_2.fastq
## read size (read length, in bases)
myLib4_rs=50
## quality encoding (33 or 64)
myLib4_qual=33
## insert size
myLib4_ins=350
## standard deviation of the insert size
myLib4_sd=60


## handling barcodes (UMI, cell barcode, sample)
## Check $IRAP_DIR/aux/mk/irap_sc_defs.mk for examples of predefined
## parameters

## UMI white list
# myLib4_known_umi_file=path2file

## Cell barcodes white list
# myLib4_known_cells_file=path2file

## myLib4_index1=f4_I1.fastq
## where is the UMI
#myLib4_umi_read=index1
## start base
#myLib4_umi_offset=0
## number of bases
#myLib4_umi_size=9

## where is the cell barcode
#myLib4_cell_read=index1
## start base
#myLib4_cell_offset=10
## number of bases
#myLib4_cell_size=9

## where is the sample barcode
#myLib4_sample_read=index1
## start base
#myLib4_sample_offset=10
## number of bases
#myLib4_sample_size=9


## List the names of your single-end (se) and paired-end (pe) libraries,
## separated by spaces
se=myLib1 myLib2
pe=myLib3 myLib4
## No SE data
# se=
## No PE data
# pe=

##################################################################
##
## Passing/overriding parameters
##
## It is possible to pass parameters to the mappers and quantification
## methods. Note that this should be done carefully since it may break the
## pipeline (e.g., if the location of the input and/or output files is
## changed).
##################################################################

## Overriding/changing the mappers' parameters:

## options used to align
## $(mapper)_map_options=options
## Example:
## tophat2_map_options=--min-intron-length 5 --no-coverage-search

## options used to index
## $(mapper)_index_options=options
## Example:
## star_index_options=--limitGenomeGenerateRAM=31000000000

##################################################################
## Overriding/changing the parameters of the quantification methods: 
## $(quant_method)_params=options
## Example
## htseq_params= -q

##################################################################
## Single cell RNA-seq  (since version 1.0.0)

## possible values: drop-seq, 10x_v1, 10x_v1p, 10x_v2, smart, smart-seq2, smart-seq
#sc_protocol=

## Filter (cells)
## Comments are kept on their own lines so that no comment text or trailing
## whitespace can end up as part of a value.

## Cells with $(sc_non_zero_rows) or fewer non-zero rows are discarded
#sc_non_zero_rows=1
## minimum number of features expressed as a percentage of the total number of features
#cell_filt_min_features=0.05

## maximum percentage of expression that may be attributed to ERCC spike-ins
cell_filt_max_ERCC=0.8

## pre-blacklisted cells
## file with a known list of cells that should not be used in downstream analysis
#cell_filt_controls=

## filter outliers based on the total number of counts/expr (y|n)
cell_filt_outliers=y
## Exclude outliers based on 5 * the median absolute deviation (like scater).
#cell_outliers_mad=5
## minimum expression per feature
#cell_filt_min_expression=1
## minimum number of counts per cell
#cell_filt_min_tot_expr=1000


## UMI counting
## parameters passed to bam_umi_count (umi_count quant. option)
#bam_umi_count_params=--min_reads 1 --multi_mapped --min_umis 1
## parameters passed to umis
#umis_params=--cb_cutoff 2

## Maximum number of cells
#sc_max_cells=800000
## maximum number of features quantified (if protein coding only genes are considered then this value can be reduced)
#sc_max_features=80000
## average number of features expected to be expressed per cell
#sc_feat_cell=5000

## TSNE
## filter genes based on the number of cells where they are expressed
#tsne_min_cells=1
## filter cells based on the number of genes expressed
#tsne_min_genes=1

## Clustering
## Only used in single cell
#min_clusters=2
#max_clusters=2
#clustering_method=sc3


##################################################################
## Enable the options/parameters used by Expression Atlas
## 
## sop=atlas