Merge pull request #147 from WackerO/add_proteus

Add proteus module for maxquant data analysis
nf-core · Oct 10, 2023 · c9d5328 · c9d5328
2 parents 1e75341 + f26a224
commit c9d5328
Show file tree

Hide file tree

Showing 19 changed files with 821 additions and 57 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -30,6 +30,7 @@ jobs:
           - "test"
           - "test_nogtf"
           - "test_affy"
+          - "test_maxquant"
           - "test_soft"
     steps:
       - name: Check out pipeline code

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [[#136](https://github.com/nf-core/differentialabundance/pull/136)] - Added support for non-Affymetrix arrays via automatic download of SOFT matrices in GEO ([@azedinez](https://github.com/azedinez), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#137](https://github.com/nf-core/differentialabundance/pull/137)] - Add `--sizefactors_from_controls` and `--gene_id_col` for DESeq2 module to modules.config ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#145](https://github.com/nf-core/differentialabundance/pull/145)] - Template update for nf-core/tools v2.9 ([@nf-core-bot](https://github.com/nf-core-bot), review by [@pinin4fjords](https://github.com/pinin4fjords), [@WackerO](https://github.com/WackerO))
+- [[#147](https://github.com/nf-core/differentialabundance/pull/147)] - Add Maxquant analysis module ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 
 ### `Fixed`
 
@@ -28,8 +29,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Changed`
 
 - [[#159](https://github.com/nf-core/differentialabundance/issues/159)] - CUSTOM/MATRIXFILTER module update ([@WackerO](https://github.com/WackerO), review by [@suzannejin](https://github.com/suzannejin))
-- [[#152](https://github.com/nf-core/differentialabundance/issues/152)] - RMARKDOWNNOTEBOOK env update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
+- [[#154](https://github.com/nf-core/differentialabundance/issues/154)] - RMARKDOWNNOTEBOOK env update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#151](https://github.com/nf-core/differentialabundance/issues/151)] - Module update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
+- [[#147](https://github.com/nf-core/differentialabundance/pull/147)] - RMARKDOWNNOTEBOOK env update, SHINYNGS and CUSTOM update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 
 ## v1.2.0 - 2023-04-19
 

diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd
@@ -35,11 +35,18 @@ params:
   features_metadata_cols: NULL 
   features_gtf_feature_type: NULL
   features_gtf_table_first_field: NULL
+  features_log2_assays: NULL
   raw_matrix: null                                            # e.g. 0_salmon.merged.gene_counts.tsv
   normalised_matrix: null
   variance_stabilised_matrix: null                            # e.g. test_files/3_treatment-WT-P23H.vst.tsv
   contrasts_file: null                                        # e.g. GSE156533.contrasts.csv
   differential_table: file.csv
+  proteus_measurecol_prefix: NULL
+  proteus_norm_function: NULL
+  proteus_plotsd_method: NULL
+  proteus_plotmv_loess: NULL
+  proteus_palette_name: NULL
+  proteus_round_digits: NULL
   affy_cel_files_archive: NULL
   affy_file_name_col: NULL
   affy_background: NULL
@@ -235,26 +242,27 @@ names(assay_names) = assay_names
 assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]])
 
 assay_data <- lapply(assay_files, function(x) {
-  mat <- read_matrix(
-    x,
-    sample_metadata = observations,
-    row.names = 1
+  mat <- na.omit(
+    read_matrix(
+      x,
+      sample_metadata = observations,
+      row.names = 1
+    )
   )
   colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))]
-
-  # Bit hacky, but ensure log
-  if (max(mat) > 20){
-    log2(mat+1)
-  }else{
-    mat 
-  }
+  mat 
 })
 
+if (!is.null(params$features_log2_assays)) {
+  # Remove brackets from assay list. TODO: Remove if this is added to cond_log2_transform_assays
+  features_log2_assays <- gsub('\\]$', '', gsub('^\\[', '', params$features_log2_assays))
+  assay_data <- cond_log2_transform_assays(assay_data, features_log2_assays)
+}
+
 # Now we can rename the observations rows using the title field
 rownames(observations) <- observations[[params$observations_name_col]]
 
 # Run PCA early so we can understand how important each variable is
-
 pca_datas <- lapply(names(assay_data), function(assay_type){
   compilePCAData(assay_data[[assay_type]])
 })
@@ -321,7 +329,6 @@ differential_results <- lapply(differential_files, function(diff_file){
   }
   
   # Annotate differential tables if possible
-  
   if (! is.null(params$features)){
     diff <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column)
   }
@@ -594,7 +601,6 @@ for (assay_type in rev(names(assay_data))){
     variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = params$exploratory_n_features)
 
     dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name)
-    
     p <- clusteringDendrogram(
       2^assay_data[[assay_type]][variable_genes, ],
       observations[, iv, drop = FALSE],
@@ -813,6 +819,11 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){
 
 # Methods
 
+```{r, echo=FALSE, results='asis', eval=params$study_type == 'maxquant'}
+cat(paste0("\n## Protein abundance import\n"))
+make_params_table('importing maxquant output', 'proteus_', remove_pattern = TRUE)
+```
+
 ## Filtering
 
 ```{r, echo=FALSE, results='asis'}

diff --git a/conf/maxquant.config b/conf/maxquant.config
@@ -0,0 +1,45 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running MaxQuant proteomics analysis
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines settings specific to MaxQuant proteomics analysis
+
+    Use as follows:
+        nextflow run nf-core/differentialabundance -profile maxquant,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+
+    config_profile_name         = 'MaxQuant profile'
+    config_profile_description  = 'Settings for MaxQuant analysis'
+
+    // Study
+    study_type              = 'maxquant'
+    study_abundance_type    = 'intensities'
+
+    // Features
+    features_id_col         = 'Majority protein IDs'
+    features_name_col       = 'Majority protein IDs'
+    features_metadata_cols  = 'Majority protein IDs'
+    features_type           = 'protein'
+
+    // Exploratory
+    exploratory_assay_names = "raw,normalised"
+    exploratory_final_assay = "normalised"
+
+    // Differential options
+    differential_file_suffix         = ".limma.results.tsv"
+    differential_fc_column           = "logFC"
+    differential_pval_column         = "P.Value"
+    differential_qval_column         = "adj.P.Val"
+    differential_feature_id_column   = "probe_id"
+    differential_feature_name_column = "Majority protein IDs"
+
+    // Proteus options
+    proteus_measurecol_prefix = 'LFQ intensity '
+
+    // Shiny does not work for this datatype
+    shinyngs_build_app               = false
+}
diff --git a/conf/modules.config b/conf/modules.config
@@ -100,6 +100,43 @@ process {
         ].join(' ').trim() }
     }
 
+    withName: PROTEUS {
+        publishDir = [
+            [
+                path: { "${params.outdir}/tables/proteus/${meta.id}/" },
+                mode: params.publish_dir_mode,
+                pattern: '*.tsv'
+            ],
+            [
+                path: { "${params.outdir}/plots/proteus/${meta.id}/" },
+                mode: params.publish_dir_mode,
+                pattern: '*.png'
+            ],
+            [
+                path: { "${params.outdir}/other/proteus/${meta.id}/" },
+                mode: params.publish_dir_mode,
+                pattern: '*.rds'
+
+            ],
+            [
+                path: { "${params.outdir}/other/proteus/" },
+                mode: params.publish_dir_mode,
+                pattern: '*sessionInfo.log'
+            ]
+        ]
+        ext.args = { [
+            "--contrast_variable \"${meta.id}\"",
+            "--sample_id_col \"${params.observations_id_col}\"",
+            "--protein_id_col \"${params.features_id_col}\"",
+            "--measure_col_prefix \"${params.proteus_measurecol_prefix}\"",
+            "--norm_function $params.proteus_norm_function",
+            "--plotsd_method $params.proteus_plotsd_method",
+            "--plotmv_loess $params.proteus_plotmv_loess",
+            "--palette_name $params.proteus_palette_name",
+            "--round_digits $params.proteus_round_digits"
+        ].join(' ').trim() }
+    }
+
     withName: GEOQUERY_GETGEO {
         publishDir = [
             [
@@ -291,7 +328,8 @@ process {
             "--assay_names \"${params.exploratory_assay_names}\"",
             "--final_assay \"${params.exploratory_final_assay}\"",
             "--outlier_mad_threshold ${params.exploratory_mad_threshold}",
-            "--palette_name \"${params.exploratory_palette_name}\""
+            "--palette_name \"${params.exploratory_palette_name}\"",
+            ( (params.study_type == 'maxquant') ? "--log2_assays ''" : (((params.features_log2_assays == null) ? '' : "--log2_assays \"$params.features_log2_assays\"".replace('[', '').replace(']', ''))) )
         ].join(' ').trim() }
     }
 
@@ -352,8 +390,8 @@ process {
     }
 
     withName: RMARKDOWNNOTEBOOK {
-        conda = "bioconda::r-shinyngs=1.8.1"
-        container = { "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : 'quay.io/biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" }
+        conda = "bioconda::r-shinyngs=1.8.2"
+        container = { "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : 'quay.io/biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" }
         publishDir = [
             path: { "${params.outdir}/report" },
             mode: params.publish_dir_mode,

diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config
@@ -0,0 +1,37 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple
+    pipeline test with MaxQuant Mass-spec data.
+
+    Use as follows:
+        nextflow run nf-core/differentialabundance -profile test_maxquant,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+includeConfig 'maxquant.config'
+
+params {
+    study_name = 'PXD043349'
+    config_profile_name        = 'MaxQuant test profile'
+    config_profile_description = 'MaxQuant test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input     = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv'
+    matrix    = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_proteinGroups.txt'
+    contrasts =  'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_contrasts.csv'
+
+    // Observations
+    observations_id_col = 'Experiment'
+    observations_name_col = 'Name'
+
+    // Exploratory
+    exploratory_main_variable      = 'Celltype'
+}
diff --git a/docs/output.md b/docs/output.md
@@ -37,6 +37,11 @@ Stand-alone graphical outputs are placed in this directory. They may be useful i
     - `[contrast]/png/volcano.png`: Volcano plots of -log(10) p value against log(2) fold changes
   - `gsea/`: Directory containing graphical outputs from GSEA (where enabled). Plots are stored in directories named for the associated contrast.
     - `[contrast]/png/[gsea_plot_type].png`
+  - `proteus/`: If `--study_type maxquant`: Directory containing plots produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any).
+    - `[contrast]/[norm_function].normalized_dendrogram.png`: A sample clustering dendrogram after normalization.
+    - `[contrast]/[norm_function].normalized_mean_variance_relationship.png`: Plots of log intensity variance vs mean log intensity for each contrast level, after normalization.
+    - `[contrast]/[norm_function].normalized_distributions.png`: A plot of sample distributions after normalization.
+    - `[contrast]/raw_distributions.png`: A plot of sample distributions without normalization.
 
 </details>
 
@@ -61,6 +66,9 @@ Most plots are included in the HTML report (see above), but are also included in
     - `OR [contrast_name].limma.results.tsv`: Results of Limma differential analysis (Affymetrix arrays)
   - `gsea/`: Directory containing tables of differential gene set analysis from GSEA (where enabled)
     - `[contrast]/[contrast].gsea_report_for_[condition].tsv`: A GSEA report table for each side of each contrast
+  - `proteus/`: If `--study_type maxquant`: Directory containing abundance values produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any).
+    - `[contrast]/[norm_function].normalized_proteingroups_tab.tsv`: Abundance table after normalization.
+    - `[contrast]/raw_proteingroups_tab.tsv`: Abundance table without normalization.
 
 </details>
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -11,10 +11,10 @@ Differential analysis is a common task in a variety of use cases. In essence, al
 With the above in mind, running this workflow requires:
 
 - a set of abundance values. This can be:
-  - (for RNA-seq): a matrix of quantifications with observations by column and features by row
+  - (for RNA-seq or MaxQuant proteomics measurements): a matrix of quantifications with observations by column and features by row
   - (for Affymetrix microarrays): a tar'd archive of CEL files
 - a description of the observations such as a sample sheet from RNA-seq analysis
-- a description of the features, for our initial RNA-seq application this can be simply the GTF file from which gene annotations can be derived. For Affymetrix arrays this can be derived from the array platform annotation package automatically. You can also supply your own table.
+- a description of the features. For our initial RNA-seq application this can be simply the GTF file from which gene annotations can be derived. For Affymetrix arrays this can be derived from the array platform annotation package automatically. For MaxQuant this input can be skipped, since the feature annotations are taken from the matrix itself. You can also supply your own table.
 - a specification of how the matrix should be split, and how the resulting groups should be compared
 
 ## Observations (samplesheet) input
@@ -49,6 +49,14 @@ The file can be tab or comma separated.
 
 This is a numeric square matrix file, comma or tab-separated, with a column for every observation, and features corresponding to the supplied feature set. The parameters `--observations_id_col` and `--features_id_col` define which of the associated fields should be matched in those inputs.
 
+### MaxQuant intensities
+
+```bash
+--matrix '[path to matrix file]'
+```
+
+This is the proteinGroups.txt file produced by MaxQuant. It is a tab-separated matrix file with a column for every observation (plus additional columns for other types of measurements and information); each row contains these data for a set of proteins. The parameters `--observations_id_col` and `--features_id_col` define which of the associated fields should be matched in those inputs. The parameter `--proteus_measurecol_prefix` defines which prefix is used to extract those matrix columns which contain the measurements to be used. For example, the default `LFQ intensity ` will indicate that columns like LFQ intensity S1, LFQ intensity S2, LFQ intensity S3 etc. are used (do not forget trailing whitespace in this parameter, if required!).
+
 ### Affymetrix microarrays
 
 ```bash
@@ -109,7 +117,7 @@ The file can be tab or comma separated.
 --gtf '[path to gtf file]'
 ```
 
-This is usually the easiest way to supply annotations for RNA-seq features. It should match the GTF used in nf-core/rnaseq if that workflow was used to produce the input expression matrix.
+This is usually the easiest way to supply annotations for RNA-seq features. It should match the GTF used in nf-core/rnaseq if that workflow was used to produce the input expression matrix. This option is not needed for MaxQuant data and can be skipped.
 
 ### Annotation package identifiers for Affymetrix arrays
 
@@ -123,7 +131,7 @@ To override the above options, you may also supply your own features table as a
 --features '[path to features TSV]'
 ```
 
-By default, if you don't provide features, for non-array data the workflow will fall back to attempting to use the matrix itself as a source of feature annotations. For this to work you must make sure to set the `features_id_col`, `features_name_col` and `features_metadata_cols` parameters to the appropriate values, for example by setting them to 'gene_id' if that is the identifier column on the matrix. This will cause the gene ID to be used everywhere rather than more accessible gene symbols (as can be derived from the GTF), but the workflow should run.
+By default, if you don't provide features, for non-array data the workflow will fall back to attempting to use the matrix itself as a source of feature annotations. For this to work you must make sure to set the `features_id_col`, `features_name_col` and `features_metadata_cols` parameters to the appropriate values, for example by setting them to 'gene_id' if that is the identifier column on the matrix. This will cause the gene ID to be used everywhere rather than more accessible gene symbols (as can be derived from the GTF), but the workflow should run. Please use this option for MaxQuant analysis, i.e. do not provide features.
 
 ## Shiny app generation
 
@@ -197,7 +205,7 @@ The typical command for running the pipeline is as follows:
 
 ```bash
 nextflow run nf-core/differentialabundance \
-    [--profile rnaseq OR -profile affy] \
+    [-profile rnaseq OR -profile affy] \
     --input samplesheet.csv \
     --contrasts contrasts.csv \
     [--matrix assay_matrix.tsv OR --affy_cel_files_archive cel_files.tar] \