nf-core · WackerO · Oct 10, 2023 · Jun 15, 2023 · Jun 26, 2023 · Jun 27, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -30,6 +30,7 @@ jobs:
           - "test"
           - "test_nogtf"
           - "test_affy"
+          - "test_maxquant"
           - "test_soft"
     steps:
       - name: Check out pipeline code

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [# 136](https://github.com/nf-core/differentialabundance/pull/136)] - Added support for non-Affymetrix arrays via automatic download of SOFT matrices in GEO ([@azedinez](https://github.com/azedinez), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#137](https://github.com/nf-core/differentialabundance/pull/137)] - Add `--sizefactors_from_controls` and `--gene_id_col` for DESeq2 module to modules.config ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#145](https://github.com/nf-core/differentialabundance/pull/145)] - Template update for nf-core/tools v2.9 ([@nf-core-bot](https://github.com/nf-core-bot), review by [@pinin4fjords](https://github.com/pinin4fjords), [@WackerO](https://github.com/WackerO))
+- [[#147](https://github.com/nf-core/differentialabundance/pull/147)] - Add Maxquant analysis module ([@WackerO](https://github.com/WackerO), review by)
 
 ### `Fixed`
 

diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd
@@ -35,6 +35,7 @@ params:
   features_metadata_cols: NULL 
   features_gtf_feature_type: NULL
   features_gtf_table_first_field: NULL
+  features_log2_assays: NULL
   raw_matrix: null                                            # e.g. 0_salmon.merged.gene_counts.tsv
   normalised_matrix: null
   variance_stabilised_matrix: null                            # e.g. test_files/3_treatment-WT-P23H.vst.tsv
@@ -234,22 +235,73 @@ assay_names <- simpleSplit(params$exploratory_assay_names)
 names(assay_names) = assay_names
 assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]])
 
-assay_data <- lapply(assay_files, function(x) {
-  mat <- read_matrix(
-    x,
-    sample_metadata = observations,
-    row.names = 1
+# Set up vector of unlogged assay files (if any)
+unlogged <- c()
+features_log2_assays <- params$features_log2_assays
+
+# For maxquant input override param as all assays are already logged by proteus
+if (params$study_type == 'maxquant') {
+  features_log2_assays <- ""
+}
+
+if (is.null(features_log2_assays)) {
+  # Guess unlogged assays
+  for (assay in c(1:length(assay_files))) {
+    if (max(assay_files[[assay]]) > 20) {
+      unlogged <- append(unlogged, assay)
+    }
+  }
+
+} else {
+  features_log2_assays <- gsub('\\]$', '', gsub('^\\[', '', features_log2_assays)) # Remove brackets from assay list
+
+  if (features_log2_assays != "") {
+
+    if (is_valid_positive_integer_vector(features_log2_assays)) {
+      # Convert to list of assay positions
+      unlogged <- unique(as.integer(simpleSplit(features_log2_assays)))
+      invalid_assays <- unlogged[! unlogged %in% 1:length(assay_files)]
+
+      if (length(invalid_assays) > 0){
+        stop(paste0("Invalid assay numbers: ", paste(invalid_assays, collapse=', ')))
+      }
+
+    } else {
+      # Last option is string of assay names, so get positions of those names in the assay list
+      unlogged <- unique(simpleSplit(features_log2_assays))
+
+      # Check if all names are valid
+      invalid_assays <- unlogged[!(unlogged %in% names(assay_files))]
+      if (length(invalid_assays) > 0) {
+        stop(paste0(invalid_assays, " is/are not valid assay name(s). Valid assay names are: ", paste(names(assay_files), collapse=", "), ". Please check param --features_log2_assays."))
+      }
+      unlogged <- match(unlogged, names(assay_files))
+    }
+
+  }
+}
+
+assay_data <- lapply(c(1:length(assay_files)), function(x) {
+  mat <- na.omit(
+    read_matrix(
+      assay_files[[x]],
+      sample_metadata = observations,
+      row.names = 1
+    )
   )
   colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))]
 
-  # Bit hacky, but ensure log
-  if (max(mat) > 20){
-    log2(mat+1)
-  }else{
-    mat 
+  # If assay is in unlogged list, apply log2
+  if (x %in% unlogged) {
+    mat <- log2(mat+1)
   }
+  mat
+
 })
 
+# Lapply does not go over the assay_files themselves anymore, so we need to specifically assign their names to the data
+names(assay_data) <- names(assay_files)
+
 # Now we can rename the observations rows using the title field
 rownames(observations) <- observations[[params$observations_name_col]]
 
@@ -321,7 +373,6 @@ differential_results <- lapply(differential_files, function(diff_file){
   }
 
   # Annotate differential tables if possible
-
   if (! is.null(params$features)){
     diff <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column)
   }
@@ -593,7 +644,7 @@ for (assay_type in rev(names(assay_data))){
     variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = params$exploratory_n_features)
 
     dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name)
-    
+
     p <- clusteringDendrogram(
       2^assay_data[[assay_type]][variable_genes, ],
       observations[, iv, drop = FALSE],

diff --git a/conf/maxquant.config b/conf/maxquant.config
@@ -0,0 +1,42 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running MaxQuant proteomics analysis
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines settings specific to MaxQuant proteomics analysis
+
+    Use as follows:
+        nextflow run nf-core/differentialabundance -profile maxquant,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+
+    config_profile_name         = 'MaxQuant profile'
+    config_profile_description  = 'Settings for MaxQuant analysis'
+
+    // Study
+    study_type              = 'maxquant'
+    study_abundance_type    = 'intensities'
+
+    // Features
+    features_id_col         = 'Majority protein IDs'
+    features_name_col       = 'Majority protein IDs'
+    features_metadata_cols  = 'Majority protein IDs'
+    features_type           = 'protein'
+
+    // Exploratory
+    exploratory_assay_names = "raw,normalised,variance_stabilised"
+    exploratory_final_assay = "variance_stabilised"
+
+    // Differential options
+    differential_file_suffix         = ".limma.results.tsv"
+    differential_fc_column           = "logFC"
+    differential_pval_column         = "P.Value"
+    differential_qval_column         = "adj.P.Val"
+    differential_feature_id_column   = "probe_id"
+    differential_feature_name_column = "probe_id"
+
+    // Shiny does not work for this datatype
+    shinyngs_build_app               = false
+}
diff --git a/conf/modules.config b/conf/modules.config
@@ -100,6 +100,36 @@ process {
         ].join(' ').trim() }
     }
 
+    withName: PROTEUS {
+        publishDir = [
+            [
+                path: { "${params.outdir}/tables/proteus" },
+                mode: params.publish_dir_mode,
+                pattern: '*proteingroups_tab.tsv'
+            ],
+            [
+                path: { "${params.outdir}/plots/proteus" },
+                mode: params.publish_dir_mode,
+                pattern: '*.png'
+            ],
+            [
+                path: { "${params.outdir}/other/proteus" },
+                mode: params.publish_dir_mode,
+                pattern: '*.{rds,sessionInfo.log}'
+            ]
+        ]
+        ext.args = { [
+            "--sample_id_col \"${params.observations_id_col}\"",
+            "--protein_id_col \"${params.features_id_col}\"",
+            "--measure_col_prefix \"${params.proteus_measurecol_prefix}\"",
+            "--normfuns $params.proteus_norm_functions",
+            "--plotSampleDistributions_method $params.proteus_plotSD_method",
+            "--plotMV_loess $params.proteus_plotMV_loess",
+            "--palette_name $params.proteus_palette_name",
+            "--round_digits $params.proteus_round_digits"
+        ].join(' ').trim() }
+    }
+
     withName: GEOQUERY_GETGEO {
         publishDir = [
             [
@@ -291,7 +321,8 @@ process {
             "--assay_names \"${params.exploratory_assay_names}\"",
             "--final_assay \"${params.exploratory_final_assay}\"",
             "--outlier_mad_threshold ${params.exploratory_mad_threshold}",
-            "--palette_name \"${params.exploratory_palette_name}\""
+            "--palette_name \"${params.exploratory_palette_name}\"",
+            ( (params.study_type == 'maxquant') ? "--log2_assays ''" : (((params.features_log2_assays == null) ? '' : "--log2_assays \"$params.features_log2_assays\"".replace('[', '').replace(']', ''))) )
         ].join(' ').trim() }
     }
 

diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config
@@ -0,0 +1,50 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple
+    pipeline test with MaxQuant Mass-spec data.
+
+    Use as follows:
+        nextflow run nf-core/differentialabundance -profile test_maxquant,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+includeConfig 'maxquant.config'
+
+params {
+    study_name = 'PXD043349'
+    config_profile_name        = 'MaxQuant test profile'
+    config_profile_description = 'MaxQuant test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input     = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv'
+    matrix    = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_proteinGroups.txt'
+    contrasts =  'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_contrasts.csv'
+
+    // Features
+    features_id_col = 'Majority protein IDs'
+    features_metadata_cols = 'Majority protein IDs'
+
+    // Observations
+    observations_id_col = 'Experiment'
+    observations_name_col = 'Name'
+    proteus_measurecol_prefix = 'LFQ intensity '
+
+    // Exploratory
+    exploratory_main_variable      = 'Celltype'
+    exploratory_assay_names        = 'raw,normalised'
+    exploratory_final_assay        = 'normalised'
+
+    // Differential
+    differential_feature_id_column = 'probe_id'
+    differential_feature_name_column = 'Majority protein IDs'
+    differential_fc_column         = 'logFC'
+    differential_qval_column       = 'adj.P.Val'
+}
diff --git a/docs/output.md b/docs/output.md
@@ -37,6 +37,11 @@ Stand-alone graphical outputs are placed in this directory. They may be useful i
     - `[contrast]/png/volcano.png`: Volcano plots of -log(10) p value agains log(2) fold changes
   - `gsea/`: Directory containing graphical outputs from GSEA (where enabled). Plots are stored in directories named for the associated contrast.
     - `[contrast]/png/[gsea_plot_type].png`
+  - `proteus/`: If `--study_type maxquant`: Directory containing plots produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any).
+    - `[contrast].proteus.[normfun].normalized_dendrogram.png`: A sample clustering dendrogram after normalization, if chosen.
+    - `[contrast].proteus.[normfun].normalized_mean_variance_relationship.png`: Plots of log intensity vs mean log intensity after normalization of each contrast level, if chosen.
+    - `[contrast].proteus.[normfun].normalized_distributions.png`: A plot of sample distributions after normalization, if chosen.
+    - `[contrast].proteus.raw_distributions.png`: A plot of sample distributions without normalization.
 
 </details>
 
@@ -61,6 +66,9 @@ Most plots are included in the HTML report (see above), but are also included in
     - `OR [contrast_name].limma.results.tsv`: Results of Limma differential analyis (Affymetrix arrays)
   - `gsea/`: Directory containing tables of differential gene set analyis from GSEA (where enabled)
     - `[contrast]/[contrast].gsea_report_for_[condition].tsv`: A GSEA report table for each side of each contrast
+  - `proteus/`: If `--study_type maxquant`: Directory containing abundance values produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any).
+    - `[contrast].proteus.[normfun].normalized_proteingroups_tab.tsv`: Abundance table after normalization, if chosen.
+    - `[contrast].proteus.raw_proteingroups_tab.tsv`: Abundance table without normalization.
 
 </details>
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -11,10 +11,10 @@ Differential analysis is a common task in a variety of use cases. In essence, al
 With the above in mind, running this workflow requires:
 
 - a set of abundance values. This can be:
-  - (for RNA-seq): a matrix of quantifications with observations by column and features by row
+  - (for RNA-seq or MaxQuant proteomics measurements): a matrix of quantifications with observations by column and features by row
   - (for Affymetrix microarrays): a tar'd archive of CEL files
 - a description of the observations such as a sample sheet from RNA-seq analysis
-- a description of the features, for our initial RNA-seq application this can be simply the GTF file from which gene annotations can be derived. For Affymetrix arrays this can be derived from the array platform annotation package automatically. You can also supply your own table.
+- a description of the features (skip for MaxQuant), for our initial RNA-seq application this can be simply the GTF file from which gene annotations can be derived. For Affymetrix arrays this can be derived from the array platform annotation package automatically. You can also supply your own table.
 - a specification of how the matrix should be split, and how the resulting groups should be compared
 
 ## Observations (samplesheet) input
@@ -49,6 +49,14 @@ The file can be tab or comma separated.
 
 This is a numeric square matrix file, comma or tab-separated, with a column for every observation, and features corresponding to the supplied feature set. The parameters `--observations_id_col` and `--features_id_col` define which of the associated fields should be matched in those inputs.
 
+### MaxQuant intensities
+
+```bash
+--matrix '[path to matrix file]'
+```
+
+This is the proteinGroups.txt file produced by MaxQuant. It is a tab-separated matrix file with a column for every observation (plus additional columns for other types of measurements and information); each row contains these data for a set of proteins. The parameters `--observations_id_col` and `--features_id_col` define which of the associated fields should be matched in those inputs. The parameter `--proteus_measurecol_prefix` defines which prefix is used to extract those matrix columns which contain the measurements to be used. For example, the default `LFQ intensity ` will indicate that columns like LFQ intensity S1, LFQ intensity S2, LFQ intensity S3 etc. are used (do not forget trailing whitespace in this parameter, if required!).
+
 ### Affymetrix microarrays
 
 ```bash
@@ -109,7 +117,7 @@ The file can be tab or comma separated.
 --gtf '[path to gtf file]'
 ```
 
-This is usually the easiest way to supply annotations for RNA-seq features. It should match the GTF used in nf-core/rnaseq if that workflow was used to produce the input expression matrix.
+This is usually the easiest way to supply annotations for RNA-seq features. It should match the GTF used in nf-core/rnaseq if that workflow was used to produce the input expression matrix. Skip for MaxQuant.
 
 ### Annotation package identifiers for Affymetrix arrays
 
@@ -123,11 +131,11 @@ To override the above options, you may also supply your own features table as a
 --features '[path to features TSV]'
 ```
 
-By default, if you don't provide features, for non-array data the workflow will fall back to attempting to use the matrix itself as a source of feature annotations. For this to work you must make sure to set the `features_id_col`, `features_name_col` and `features_metadata_cols` parameters to the appropriate values, for example by setting them to 'gene_id' if that is the identifier column on the matrix. This will cause the gene ID to be used everywhere rather than more accessible gene symbols (as can be derived from the GTF), but the workflow should run.
+By default, if you don't provide features, for non-array data the workflow will fall back to attempting to use the matrix itself as a source of feature annotations. For this to work you must make sure to set the `features_id_col`, `features_name_col` and `features_metadata_cols` parameters to the appropriate values, for example by setting them to 'gene_id' if that is the identifier column on the matrix. This will cause the gene ID to be used everywhere rather than more accessible gene symbols (as can be derived from the GTF), but the workflow should run. Please use this option for MaxQuant analysis, i.e. do not provide features.
 
 ## Shiny app generation
 
-The pipeline is capable of building, and even deploying (to [shinyapps.io](https://www.shinyapps.io/)) for you a Shiny app built with [ShinyNGS](https://github.com/pinin4fjords/shinyngs).
+The pipeline is capable of building, and even deploying (to [shinyapps.io](https://www.shinyapps.io/)) for you a Shiny app built with [ShinyNGS](https://github.com/pinin4fjords/shinyngs) (disabled for MaxQuant).
 
 This is enabled with:
 
@@ -197,7 +205,7 @@ The typical command for running the pipeline is as follows:
 
 ```bash
 nextflow run nf-core/differentialabundance \
-    [--profile rnaseq OR -profile affy] \
+    [-profile rnaseq OR -profile affy] \
     --input samplesheet.csv \
     --contrasts contrasts.csv \
     [--matrix assay_matrix.tsv OR --affy_cel_files_archive cel_files.tar] \

diff --git a/modules.json b/modules.json
@@ -60,6 +60,11 @@
                         "git_sha": "d0b4fc03af52a1cc8c6fb4493b921b57352b1dd8",
                         "installed_by": ["modules"]
                     },
+                    "proteus/readproteingroups": {
+                        "branch": "master",
+                        "git_sha": "007dd9c990670392d3fb6607529966a1a614e1e1",
+                        "installed_by": ["modules"]
+                    },
                     "rmarkdownnotebook": {
                         "branch": "master",
                         "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220",