-
Notifications
You must be signed in to change notification settings - Fork 89
/
results.R
1270 lines (1179 loc) · 54.5 KB
/
results.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#' Extract results from a DESeq analysis
#'
#' \code{results} extracts a result table from a DESeq analysis giving base means across samples,
#' log2 fold changes, standard errors, test statistics, p-values and adjusted p-values;
#' \code{resultsNames} returns the names of the estimated effects (coefficients) of the model;
#' \code{removeResults} returns a \code{DESeqDataSet} object with results columns removed.
#'
#' The results table when printed will provide the information about
#' the comparison, e.g. "log2 fold change (MAP): condition treated vs untreated", meaning
#' that the estimates are of log2(treated / untreated), as would be returned by
#' \code{contrast=c("condition","treated","untreated")}.
#' Multiple results can be returned for analyses beyond a simple two group comparison,
#' so \code{results} takes arguments \code{contrast} and \code{name} to help
#' the user pick out the comparisons of interest for printing a results table.
#' The use of the \code{contrast} argument is recommended for exact specification
#' of the levels which should be compared and their order.
#'
#' If \code{results} is run without specifying \code{contrast} or \code{name},
#' it will return the comparison of the last level of the last variable in the
#' design formula over the first level of this variable. For example, for a simple two-group
#' comparison, this would return the log2 fold changes of the second group over the
#' first group (the reference level). Please see examples below and in the vignette.
#'
#' The argument \code{contrast} can be used to generate results tables for
#' any comparison of interest, for example, the log2 fold change between
#' two levels of a factor, and its usage is described below. It can also
#' accommodate more complicated numeric comparisons.
#' Note that \code{contrast} will set to 0 the estimated LFC in a
#' comparison of two groups, where all of the counts in the two groups
#' are equal to 0 (while other groups have positive counts), while
#' \code{name} will not automatically set these LFC to 0.
#' The test statistic used for a contrast is:
#'
#' \deqn{ c^t \beta / \sqrt{c^t \Sigma c } }{ c' beta / sqrt( c' Sigma c ) }
#'
#' The argument \code{name} can be used to generate results tables for
#' individual effects, which must be individual elements of \code{resultsNames(object)}.
#' These individual effects could represent continuous covariates, effects
#' for individual levels, or individual interaction effects.
#'
#' Information on the comparison which was used to build the results table,
#' and the statistical test which was used for p-values (Wald test or likelihood ratio test)
#' is stored within the object returned by \code{results}. This information is in
#' the metadata columns of the results table, which is accessible by calling \code{mcols}
#' on the \code{\link{DESeqResults}} object returned by \code{results}.
#'
#' On p-values:
#'
#' By default, independent filtering is performed to select a set of genes
#' for multiple test correction which maximizes the number of adjusted
#' p-values less than a given critical value \code{alpha} (by default 0.1).
#' See the reference in this man page for details on independent filtering.
#' The filter used for maximizing the number of rejections is the mean
#' of normalized counts for all samples in the dataset.
#' Several arguments from the \code{filtered_p} function of
#' the genefilter package (used within the \code{results} function)
#' are provided here to control the independent filtering behavior.
#' (Note \code{filtered_p} R code is now copied into DESeq2
#' package to avoid gfortran requirements.)
#' In DESeq2 version >= 1.10, the threshold that is chosen is
#' the lowest quantile of the filter for which the
#' number of rejections is close to the peak of a curve fit
#' to the number of rejections over the filter quantiles.
#' 'Close to' is defined as within 1 residual standard deviation.
#' The adjusted p-values for the genes which do not pass the filter threshold
#' are set to \code{NA}.
#'
#' By default, \code{results} assigns a p-value of \code{NA}
#' to genes containing count outliers, as identified using Cook's distance.
#' See the \code{cooksCutoff} argument for control of this behavior.
#' Cook's distances for each sample are accessible as a matrix "cooks"
#' stored in the \code{assays()} list. This measure is useful for identifying rows where the
#' observed counts might not fit to a Negative Binomial distribution.
#'
#' For analyses using the likelihood ratio test (using \code{\link{nbinomLRT}}),
#' the p-values are determined solely by the difference in deviance between
#' the full and reduced model formula. A single log2 fold change is printed
#' in the results table for consistency with other results table outputs,
#' however the test statistic and p-values may nevertheless involve
#' the testing of one or more log2 fold changes.
#' Which log2 fold change is printed in the results table can be controlled
#' using the \code{name} argument, or by default this will be the estimated
#' coefficient for the last element of \code{resultsNames(object)}.
#'
#' If \code{useT=TRUE} was specified when running \code{DESeq} or \code{nbinomWaldTest},
#' then the p-value generated by \code{results} will also make use of the
#' t distribution for the Wald statistic, using the degrees of freedom
#' in \code{mcols(object)$tDegreesFreedom}.
#'
#' @references Richard Bourgon, Robert Gentleman, Wolfgang Huber: Independent
#' filtering increases detection power for high-throughput experiments.
#' PNAS (2010), \url{http://dx.doi.org/10.1073/pnas.0914005107}
#'
#' @param object a DESeqDataSet, on which one
#' of the following functions has already been called:
#' \code{\link{DESeq}}, \code{\link{nbinomWaldTest}}, or \code{\link{nbinomLRT}}
#' @param contrast this argument specifies what comparison to extract from
#' the \code{object} to build a results table. one of either:
#' \itemize{
#' \item a character vector with exactly three elements:
#' the name of a factor in the design formula,
#' the name of the numerator level for the fold change,
#' and the name of the denominator level for the fold change
#' (simplest case)
#' \item a list of 2 character vectors: the names of the fold changes
#' for the numerator, and the names of the fold changes
#' for the denominator.
#' these names should be elements of \code{resultsNames(object)}.
#' if the list is length 1, a second element is added which is the
#' empty character vector, \code{character()}.
#' (more general case, can be to combine interaction terms and main effects)
#' \item a numeric contrast vector with one element
#' for each element in \code{resultsNames(object)} (most general case)
#' }
#' If specified, the \code{name} argument is ignored.
#' @param name the name of the individual effect (coefficient) for
#' building a results table. Use this argument rather than \code{contrast}
#' for continuous variables, individual effects or for individual interaction terms.
#' The value provided to \code{name} must be an element of \code{resultsNames(object)}.
#' @param lfcThreshold a non-negative value which specifies a log2 fold change
#' threshold. The default value is 0, corresponding to a test that
#' the log2 fold changes are equal to zero. The user can
#' specify the alternative hypothesis using the \code{altHypothesis} argument,
#' which defaults to testing
#' for log2 fold changes greater in absolute value than a given threshold.
#' If \code{lfcThreshold} is specified,
#' the results are for Wald tests, and LRT p-values will be overwritten.
#' @param altHypothesis character which specifies the alternative hypothesis,
#' i.e. those values of log2 fold change which the user is interested in
#' finding. The complement of this set of values is the null hypothesis which
#' will be tested. If the log2 fold change specified by \code{name}
#' or by \code{contrast} is written as \eqn{ \beta }{ beta }, then the possible values for
#' \code{altHypothesis} represent the following alternate hypotheses:
#' \itemize{
#' \item greaterAbs: \eqn{|\beta| > \textrm{lfcThreshold} }{ |beta| > lfcThreshold },
#' and p-values are two-tailed
#' \item lessAbs: \eqn{ |\beta| < \textrm{lfcThreshold} }{ |beta| < lfcThreshold },
#' p-values are the maximum of the upper and lower tests.
#' The Wald statistic given is positive, an SE-scaled distance from the closest boundary
#' \item greater: \eqn{ \beta > \textrm{lfcThreshold} }{ beta > lfcThreshold }
#' \item less: \eqn{ \beta < -\textrm{lfcThreshold} }{ beta < -lfcThreshold }
#' \item greaterAbs2014: older implementation of greaterAbs from 2014, less power
#' }
#' @param listValues only used if a list is provided to \code{contrast}:
#' a numeric of length two: the log2 fold changes in the list are multiplied by these values.
#' the first number should be positive and the second negative.
#' by default this is \code{c(1,-1)}
#' @param cooksCutoff threshold on Cook's distance, such that if one or more
#' samples for a row have a distance higher, the p-value for the row is
#' set to NA. The default cutoff is the .99 quantile of the F(p, m-p) distribution,
#' where p is the number of coefficients being fitted and m is the number of samples.
#' Set to \code{Inf} or \code{FALSE} to disable the resetting of p-values to NA.
#' Note: this test excludes the Cook's distance of samples belonging to experimental
#' groups with only 2 samples.
#' @param independentFiltering logical, whether independent filtering should be
#' applied automatically
#' @param alpha the significance cutoff used for optimizing the independent
#' filtering (by default 0.1). If the adjusted p-value cutoff (FDR) will be a
#' value other than 0.1, \code{alpha} should be set to that value.
#' @param filter the vector of filter statistics over which the independent
#' filtering will be optimized. By default the mean of normalized counts is used.
#' @param theta the quantiles at which to assess the number of rejections
#' from independent filtering
#' @param pAdjustMethod the method to use for adjusting p-values, see \code{?p.adjust}
#' @param filterFun an optional custom function for performing independent filtering
#' and p-value adjustment, with arguments \code{res} (a DESeqResults object),
#' \code{filter} (the quantity for filtering tests),
#' \code{alpha} (the target FDR),
#' \code{pAdjustMethod}. This function should return a DESeqResults object
#' with a \code{padj} column.
#' @param format character, either \code{"DataFrame"},
#' \code{"GRanges"}, or \code{"GRangesList"},
#' whether the results should be printed as a \code{\link{DESeqResults}} DataFrame,
#' or if the results DataFrame should be attached as metadata columns to
#' the \code{GRanges} or \code{GRangesList} \code{rowRanges} of the \code{DESeqDataSet}.
#' If the \code{rowRanges} is a \code{GRangesList}, and \code{GRanges} is requested,
#' the range of each gene will be returned
#' @param saveCols character or numeric vector, the columns of
#' \code{mcols(object)} to pass into the \code{results} output
#' @param test this is automatically detected internally if not provided.
#' the one exception is after \code{nbinomLRT} has been run, \code{test="Wald"}
#' will generate Wald statistics and Wald test p-values.
#' @param addMLE if \code{betaPrior=TRUE} was used (non-default),
#' this logical argument specifies if the "unshrunken" maximum likelihood estimates (MLE)
#' of log2 fold change should be added as a column to the results table (default is FALSE).
#' This argument is preserved for backward compatibility, as now \code{betaPrior=FALSE}
#' by default and the recommended pipeline is
#' to generate shrunken MAP estimates using \code{\link{lfcShrink}}.
#' This argument functionality is only implemented for \code{contrast}
#' specified as three element character vectors.
#' @param tidy whether to output the results table with rownames as a first column 'row'.
#' the table will also be coerced to \code{data.frame}
#' @param parallel if FALSE, no parallelization. if TRUE, parallel
#' execution using \code{BiocParallel}, see next argument \code{BPPARAM}
#' @param BPPARAM an optional parameter object passed internally
#' to \code{\link{bplapply}} when \code{parallel=TRUE}.
#' If not specified, the parameters last registered with
#' \code{\link{register}} will be used.
#' @param minmu lower bound on the estimated count (used when calculating contrasts)
#'
#' @return For \code{results}: a \code{\link{DESeqResults}} object, which is
#' a simple subclass of DataFrame. This object contains the results columns:
#' \code{baseMean}, \code{log2FoldChange}, \code{lfcSE}, \code{stat},
#' \code{pvalue} and \code{padj},
#' and also includes metadata columns of variable information.
#' The \code{lfcSE} gives the standard error of the \code{log2FoldChange}.
#' For the Wald test, \code{stat} is the Wald statistic: the \code{log2FoldChange}
#' divided by \code{lfcSE}, which is compared to a standard Normal distribution
#' to generate a two-tailed \code{pvalue}. For the likelihood ratio test (LRT),
#' \code{stat} is the difference in deviance between the reduced model and the full model,
#' which is compared to a chi-squared distribution to generate a \code{pvalue}.
#'
#' For \code{resultsNames}: the names of the columns available as results,
#' usually a combination of the variable name and a level
#'
#' For \code{removeResults}: the original \code{DESeqDataSet} with results metadata columns removed
#'
#' @seealso \code{\link{DESeq}}, \code{\link{lfcShrink}}
#'
#' @examples
#'
#' ## Example 1: two-group comparison
#'
#' dds <- makeExampleDESeqDataSet(m=4)
#'
#' dds <- DESeq(dds)
#' res <- results(dds, contrast=c("condition","B","A"))
#'
#' # with more than two groups, the call would look similar, e.g.:
#' # results(dds, contrast=c("condition","C","A"))
#' # etc.
#'
#' ## Example 2: two conditions, two genotypes, with an interaction term
#'
#' dds <- makeExampleDESeqDataSet(n=100,m=12)
#' dds$genotype <- factor(rep(rep(c("I","II"),each=3),2))
#'
#' design(dds) <- ~ genotype + condition + genotype:condition
#' dds <- DESeq(dds)
#' resultsNames(dds)
#'
#' # the condition effect for genotype I (the main effect)
#' results(dds, contrast=c("condition","B","A"))
#'
#' # the condition effect for genotype II
#' # this is, by definition, the main effect *plus* the interaction term
#' # (the extra condition effect in genotype II compared to genotype I).
#' results(dds, list( c("condition_B_vs_A","genotypeII.conditionB") ))
#'
#' # the interaction term, answering: is the condition effect *different* across genotypes?
#' results(dds, name="genotypeII.conditionB")
#'
#' ## Example 3: two conditions, three genotypes
#'
#' # ~~~ Using interaction terms ~~~
#'
#' dds <- makeExampleDESeqDataSet(n=100,m=18)
#' dds$genotype <- factor(rep(rep(c("I","II","III"),each=3),2))
#' design(dds) <- ~ genotype + condition + genotype:condition
#' dds <- DESeq(dds)
#' resultsNames(dds)
#'
#' # the condition effect for genotype I (the main effect)
#' results(dds, contrast=c("condition","B","A"))
#'
#' # the condition effect for genotype III.
#' # this is the main effect *plus* the interaction term
#' # (the extra condition effect in genotype III compared to genotype I).
#' results(dds, contrast=list( c("condition_B_vs_A","genotypeIII.conditionB") ))
#'
#' # the interaction term for condition effect in genotype III vs genotype I.
#' # this tests if the condition effect is different in III compared to I
#' results(dds, name="genotypeIII.conditionB")
#'
#' # the interaction term for condition effect in genotype III vs genotype II.
#' # this tests if the condition effect is different in III compared to II
#' results(dds, contrast=list("genotypeIII.conditionB", "genotypeII.conditionB"))
#'
#' # Note that a likelihood ratio could be used to test if there are any
#' # differences in the condition effect between the three genotypes.
#'
#' # ~~~ Using a grouping variable ~~~
#'
#' # This is a useful construction when users just want to compare
#' # specific groups which are combinations of variables.
#'
#' dds$group <- factor(paste0(dds$genotype, dds$condition))
#' design(dds) <- ~ group
#' dds <- DESeq(dds)
#' resultsNames(dds)
#'
#' # the condition effect for genotypeIII
#' results(dds, contrast=c("group", "IIIB", "IIIA"))
#'
#' @rdname results
#' @aliases results resultsNames removeResults
#' @export
results <- function(object, contrast, name,
lfcThreshold=0,
altHypothesis=c("greaterAbs","lessAbs","greater","less","greaterAbs2014"),
listValues=c(1,-1),
cooksCutoff,
independentFiltering=TRUE,
alpha=0.1, filter, theta,
pAdjustMethod="BH",
filterFun,
format=c("DataFrame","GRanges","GRangesList"),
saveCols=NULL,
test=c("Wald","LRT"),
addMLE=FALSE,
tidy=FALSE,
parallel=FALSE, BPPARAM=bpparam(),
minmu=0.5) {
stopifnot(is(object, "DESeqDataSet"))
# match args
format <- match.arg(format)
altHypothesis <- match.arg(altHypothesis)
if (!missing(test)) {
test <- match.arg(test)
}
### initial argument testing ###
stopifnot(lfcThreshold >= 0)
stopifnot(length(lfcThreshold)==1)
stopifnot(length(alpha)==1)
stopifnot(alpha > 0 & alpha < 1)
stopifnot(length(pAdjustMethod)==1)
stopifnot(length(listValues)==2 & is.numeric(listValues))
stopifnot(listValues[1] > 0 & listValues[2] < 0)
# results columns in mcols(object) are tagged with type "results";
# their absence means DESeq()/nbinomWaldTest()/nbinomLRT() was never run
if (!"results" %in% mcols(mcols(object))$type) {
stop("couldn't find results. you should first run DESeq()")
}
if (missing(test)) {
# default to whichever test was used when fitting (stored as an attribute)
test <- attr(object, "test")
} else if (test == "Wald" & attr(object, "test") == "LRT") {
# initially test was LRT, now need to add Wald statistics and p-values
object <- makeWaldTest(object)
} else if (test == "LRT" & attr(object, "test") == "Wald") {
stop("the LRT requires the user run nbinomLRT or DESeq(dds,test='LRT')")
}
if (lfcThreshold == 0 & altHypothesis == "lessAbs") {
stop("when testing altHypothesis='lessAbs', set the argument lfcThreshold to a positive value")
}
if (addMLE) {
if (!attr(object,"betaPrior")) {
stop("addMLE=TRUE is only for when a beta prior was used.
otherwise, the log2 fold changes are already MLE")
}
if (!missing(name) & missing(contrast)) {
stop("addMLE=TRUE should be used by providing character vector
of length 3 to 'contrast' instead of using 'name'")
}
}
# format="GRanges" requires every gene to have at least one range
if (format == "GRanges" & is(rowRanges(object),"GRangesList")) {
if (any(elementNROWS(rowRanges(object)) == 0)) {
stop("rowRanges is GRangesList and one or more GRanges have length 0. Use format='DataFrame' or 'GRangesList'")
}
}
# validate saveCols against the metadata columns of the object
if (!is.null(saveCols)) {
if (is(saveCols, "character"))
stopifnot(all(saveCols %in% colnames(mcols(object))))
if (is(saveCols, "numeric")) {
stopifnot(saveCols == round(saveCols))
stopifnot(min(saveCols) > 0)
stopifnot(max(saveCols) <= ncol(mcols(object)))
}
}
if (!missing(contrast)) {
if (attr(object,"modelMatrixType") == "user-supplied" & is.character(contrast)) {
stop("only list- and numeric-type contrasts are supported for user-supplied model matrices")
}
}
if (is(design(object), "formula")) {
hasIntercept <- attr(terms(design(object)),"intercept") == 1
isExpanded <- attr(object, "modelMatrixType") == "expanded"
termsOrder <- attr(terms.formula(design(object)),"order")
# NOTE(review): isExpanded is only defined inside this formula branch, but
# is used later in the contrast code path — confirm behavior for matrix designs
# if no intercept was used or an expanded model matrix was used,
# and neither 'contrast' nor 'name' were specified,
# and no interactions...
# then we create the result table: last / first level for last variable
if ((test == "Wald") & (isExpanded | !hasIntercept) & missing(contrast) & missing(name) & all(termsOrder < 2)) {
designVars <- all.vars(design(object))
lastVarName <- designVars[length(designVars)]
lastVar <- colData(object)[[lastVarName]]
if (is.factor(lastVar)) {
nlvls <- nlevels(lastVar)
contrast <- c(lastVarName, levels(lastVar)[nlvls], levels(lastVar)[1])
}
}
}
# default 'name' to the last coefficient; used when 'contrast' is missing
if (missing(name)) {
name <- lastCoefName(object)
} else {
if (length(name) != 1 | !is.character(name)) {
stop("the argument 'name' should be a character vector of length 1")
}
}
### done with input argument testing ###
# flags for which statistics are present in mcols(object)
WaldResults <- paste0("WaldPvalue_",name) %in% names(mcols(object))
LRTResults <- "LRTPvalue" %in% names(mcols(object))
# this will be used in cleanContrast, and in the lfcThreshold chunks below
useT <- "tDegreesFreedom" %in% names(mcols(object))
# if performing a contrast call the function cleanContrast()
if (!missing(contrast)) {
resNames <- resultsNames(object)
# do some arg checking/cleaning
contrast <- checkContrast(contrast, resNames)
### cleanContrast call ###
# need to go back to C++ code in order to build the beta covariance matrix
# then this is multiplied by the numeric contrast to get the Wald statistic.
# with 100s of samples, this can get slow, so offer parallelization
if (!parallel) {
res <- cleanContrast(object, contrast, expanded=isExpanded, listValues=listValues,
test=test, useT=useT, minmu=minmu)
} else if (parallel) {
# parallel execution
nworkers <- getNworkers(BPPARAM)
# split rows into roughly equal chunks, one per worker, and rbind the pieces
idx <- factor(sort(rep(seq_len(nworkers),length.out=nrow(object))))
res <- do.call(rbind, bplapply(levels(idx), function(l) {
cleanContrast(object[idx == l,,drop=FALSE], contrast,
expanded=isExpanded, listValues=listValues,
test=test, useT=useT, minmu=minmu)
}, BPPARAM=BPPARAM))
}
} else {
# if not performing a contrast
# pull relevant columns from mcols(object)
log2FoldChange <- getCoef(object, name)
lfcSE <- getCoefSE(object, name)
stat <- getStat(object, test, name)
pvalue <- getPvalue(object, test, name)
res <- cbind(mcols(object)["baseMean"],log2FoldChange,lfcSE,stat,pvalue)
names(res) <- c("baseMean","log2FoldChange","lfcSE","stat","pvalue")
}
rownames(res) <- rownames(object)
# add unshrunken MLE coefficients to the results table
if (addMLE) {
if (is.numeric(contrast)) stop("addMLE only implemented for: contrast=c('condition','B','A')")
if (is.list(contrast)) stop("addMLE only implemented for: contrast=c('condition','B','A')")
res <- cbind(res, mleContrast(object, contrast))
res <- res[,c("baseMean","log2FoldChange","lfcMLE","lfcSE","stat","pvalue")]
# if an all zero contrast, also zero out the lfcMLE
res$lfcMLE[ which(res$log2FoldChange == 0 & res$stat == 0) ] <- 0
}
# only if we need to generate new p-values
# (i.e. any threshold test, or any non-default alternative hypothesis)
if ( !(lfcThreshold == 0 & altHypothesis == "greaterAbs") ) {
if (test == "LRT") {
stop("tests of log fold change above or below a theshold must be Wald tests.")
}
# check requirement if betaPrior was set to FALSE
if (altHypothesis == "lessAbs" & attr(object, "betaPrior")) {
stop("testing altHypothesis='lessAbs' requires setting the DESeq() argument betaPrior=FALSE")
}
# easier to read
LFC <- res$log2FoldChange
SE <- res$lfcSE
# NOTE: T here is the LFC threshold, locally shadowing the TRUE alias
T <- lfcThreshold
# upper-tail p-value function: t distribution if useT, else standard normal
if (useT) {
df <- mcols(object)$tDegreesFreedom
pfunc <- function(q) pt(q, df=df, lower.tail=FALSE)
} else {
pfunc <- function(q) pnorm(q, lower.tail=FALSE)
}
if (altHypothesis == "greaterAbs") {
# a new version of greaterAbs suggested by Nikos Ignatiadis
# which is more powerful and still controls Type I error
if (!useT) {
pfunc <- function(lfc_T, lfc, se) {
pnorm(-abs(lfc) + lfc_T, sd=se) + pnorm(-abs(lfc) - lfc_T, sd=se)
}
} else {
pfunc <- function(lfc_T, lfc, se) {
pt((-abs(lfc) + lfc_T)/se, df=df) + pt((-abs(lfc) - lfc_T)/se, df=df)
}
}
newStat <- LFC / SE # just output the Wald stat ...
newPvalue <- mapply(pfunc, T, LFC, SE)
} else if (altHypothesis == "greaterAbs2014") {
# this is the version of greaterAbs that was used 2014-2023
newStat <- sign(LFC) * pmax((abs(LFC) - T)/SE, 0)
newPvalue <- pmin(1, 2 * pfunc((abs(LFC) - T)/SE))
} else if (altHypothesis == "lessAbs") {
# p-value is the maximum of the two one-sided tests against each boundary
newStatAbove <- pmax((T - LFC)/SE, 0)
pvalueAbove <- pfunc((T - LFC)/SE)
newStatBelow <- pmax((LFC + T)/SE, 0)
pvalueBelow <- pfunc((LFC + T)/SE)
newStat <- pmin(newStatAbove, newStatBelow)
newPvalue <- pmax(pvalueAbove, pvalueBelow)
} else if (altHypothesis == "greater") {
newStat <- pmax((LFC - T)/SE, 0)
newPvalue <- pfunc((LFC - T)/SE)
} else if (altHypothesis == "less") {
newStat <- pmin((LFC + T)/SE, 0)
newPvalue <- pfunc((-T - LFC)/SE)
}
res$stat <- newStat
res$pvalue <- newPvalue
}
# calculate Cook's cutoff
m <- nrow(attr(object,"dispModelMatrix"))
p <- ncol(attr(object,"dispModelMatrix"))
# default: .99 quantile of the F(p, m-p) distribution
defaultCutoff <- qf(.99, p, m - p)
if (missing(cooksCutoff)) {
cooksCutoff <- defaultCutoff
}
stopifnot(length(cooksCutoff)==1)
# cooksCutoff=TRUE also means "use the default cutoff"
if (is.logical(cooksCutoff) & cooksCutoff) {
cooksCutoff <- defaultCutoff
}
# apply cutoff based on maximum Cook's distance
# (cooksCutoff=FALSE disables; any numeric value enables)
performCooksCutoff <- (is.numeric(cooksCutoff) | cooksCutoff)
if (performCooksCutoff) {
cooksOutlier <- mcols(object)$maxCooks > cooksCutoff
### BEGIN heuristic to avoid filtering genes with low count outliers
# as according to Cook's cutoff. only for two group designs.
# dont filter if three or more counts are larger
if (any(cooksOutlier,na.rm=TRUE) & is(design(object), "formula")) {
designVars <- all.vars(design(object))
if (length(designVars) == 1) {
var <- colData(object)[[designVars]]
if (is(var, "factor") && nlevels(var) == 2) {
dontFilter <- logical(sum(cooksOutlier,na.rm=TRUE))
for (i in seq_along(dontFilter)) {
# index along rows of object
ii <- which(cooksOutlier)[i]
# count for the outlier with max cooks
outCount <- counts(object)[ii,which.max(assays(object)[["cooks"]][ii,])]
# if three or more counts larger than the outlier
if (sum(counts(object)[ii,] > outCount) >= 3) {
# don't filter out the p-value for that gene
dontFilter[i] <- TRUE
}
}
# reset the outlier status for these genes
cooksOutlier[which(cooksOutlier)][dontFilter] <- FALSE
}
}
} ### END heuristic ###
res$pvalue[which(cooksOutlier)] <- NA
}
# if original baseMean was positive, but now zero due to replaced counts, fill in results
if ( sum(mcols(object)$replace, na.rm=TRUE) > 0) {
nowZero <- which(mcols(object)$replace & mcols(object)$baseMean == 0)
res$log2FoldChange[nowZero] <- 0
if (addMLE) { res$lfcMLE[nowZero] <- 0 }
res$lfcSE[nowZero] <- 0
res$stat[nowZero] <- 0
res$pvalue[nowZero] <- 1
}
# add prior information
deseq2.version <- packageVersion("DESeq2")
if (!attr(object,"betaPrior")) {
priorInfo <- list(type="none", package="DESeq2", version=deseq2.version)
} else {
betaPriorVar <- attr(object, "betaPriorVar")
priorInfo <- list(type="normal", package="DESeq2", version=deseq2.version,
betaPriorVar=betaPriorVar)
}
# make results object
deseqRes <- DESeqResults(res, priorInfo=priorInfo)
# p-value adjustment
# (either the built-in independent-filtering routine, or a user-supplied filterFun)
if (missing(filterFun)) {
deseqRes <- pvalueAdjustment(deseqRes, independentFiltering, filter,
theta, alpha, pAdjustMethod)
} else {
deseqRes <- filterFun(deseqRes, filter, alpha, pAdjustMethod)
}
# stash lfcThreshold
metadata(deseqRes)[["lfcThreshold"]] <- lfcThreshold
# remove rownames and attach as a new column, 'row'
if (tidy) {
colnms <- colnames(deseqRes)
deseqRes$row <- rownames(deseqRes)
mcols(deseqRes,use.names=TRUE)["row","type"] <- "results"
mcols(deseqRes,use.names=TRUE)["row","description"] <- "row names"
deseqRes <- deseqRes[,c("row",colnms)]
rownames(deseqRes) <- NULL
deseqRes <- as.data.frame(deseqRes)
}
# return DataFrame, GRanges or GRangesList
out <- resultsFormatSwitch(object=object, res=deseqRes, format=format, saveCols=saveCols)
return(out)
}
#' @rdname results
#' @export
resultsNames <- function(object) {
  # Coefficient columns are identified by their metadata description
  # mentioning "log2 fold change"
  descriptions <- mcols(mcols(object))$description
  coefIdx <- grep("log2 fold change", descriptions)
  names(mcols(object))[coefIdx]
}
#' @rdname results
#' @export
removeResults <- function(object) {
  # Metadata columns produced by results-generating functions carry type "results"
  isResultCol <- mcols(mcols(object))$type == "results"
  # Only subset when at least one such column exists (count NA-safely)
  if (sum(isResultCol, na.rm = TRUE) > 0) {
    keep <- -which(isResultCol)
    mcols(object) <- mcols(object)[, keep, drop = FALSE]
  }
  return(object)
}
###########################################################
# unexported functions
###########################################################
# Adjust p-values for a DESeqResults object, optionally using independent
# filtering (Bourgon et al., PNAS 2010): pick the filter-statistic quantile
# that (approximately) maximizes the number of adjusted p-values < alpha.
# res: DESeqResults with $pvalue and $baseMean columns
# independentFiltering: logical, whether to optimize over filter quantiles
# filter: filter statistic per row (defaults to res$baseMean)
# theta: grid of quantiles to scan (defaults to 50 points up to 0.95 or 1)
# alpha: target FDR used to count rejections
# pAdjustMethod: method passed to p.adjust / filtered_p
# Returns res with a $padj column and filtering diagnostics in metadata(res).
pvalueAdjustment <- function(res, independentFiltering, filter,
theta, alpha, pAdjustMethod) {
# perform independent filtering
if (independentFiltering) {
if (missing(filter)) {
# default filter statistic: mean of normalized counts
filter <- res$baseMean
}
if (missing(theta)) {
# start the quantile grid just above the fraction of all-zero rows
lowerQuantile <- mean(filter == 0)
if (lowerQuantile < .95) upperQuantile <- .95 else upperQuantile <- 1
theta <- seq(lowerQuantile, upperQuantile, length=50)
}
# do filtering using genefilter
stopifnot(length(theta) > 1)
stopifnot(length(filter) == nrow(res))
# matrix of adjusted p-values: one column per candidate filter cutoff
filtPadj <- filtered_p(filter=filter, test=res$pvalue,
theta=theta, method=pAdjustMethod)
# number of rejections at each candidate cutoff
numRej <- MatrixGenerics::colSums(filtPadj < alpha, na.rm = TRUE)
# prevent over-aggressive filtering when all genes are null,
# by requiring the max number of rejections is above a fitted curve.
# If the max number of rejection is not greater than 10, then don't
# perform independent filtering at all.
lo.fit <- lowess(numRej ~ theta, f=1/5)
if (max(numRej) <= 10) {
j <- 1
} else {
residual <- if (all(numRej==0)) {
0
} else {
numRej[numRej > 0] - lo.fit$y[numRej > 0]
}
# this usually works: find the threshold at which num rejections
# surpasses the root mean squared error around the fitted curve.
# it may not work if there is a sharp uptick in the curve at
# the end of the grid, and there is very little variation.
maxFit <- max(lo.fit$y)
rmse <- sqrt(mean(residual^2))
thresh <- maxFit - rmse
j <- if (any(numRej > thresh)) {
# backup case: if low variation and uptick at end,
# pick the first point at which num rejections reaches
# 90% of the fitted curve, or 80% of the fitted curve
which(numRej > thresh)[1]
} else if (any(numRej > .9 * maxFit)) {
which(numRej > .9 * maxFit)[1]
} else if (any(numRej > .8 * maxFit)) {
which(numRej > .8 * maxFit)[1]
} else {
1
}
}
# j <- which.max(numRej) # old method, not stable
# adopt the adjusted p-values from the chosen column
padj <- filtPadj[, j, drop=TRUE]
cutoffs <- quantile(filter, theta)
filterThreshold <- cutoffs[j]
filterNumRej <- data.frame(theta=theta, numRej=numRej)
filterTheta <- theta[j]
# store diagnostics so plots of rejections vs. quantile can be made later
metadata(res)[["filterThreshold"]] <- filterThreshold
metadata(res)[["filterTheta"]] <- filterTheta
metadata(res)[["filterNumRej"]] <- filterNumRej
metadata(res)[["lo.fit"]] <- lo.fit
metadata(res)[["alpha"]] <- alpha
} else {
# regular p-value adjustment
# does not include those rows which were removed
# by maximum Cook's distance
padj <- p.adjust(res$pvalue,method=pAdjustMethod)
}
res$padj <- padj
# add metadata to padj column
mcols(res)$type[names(res)=="padj"] <- "results"
mcols(res)$description[names(res)=="padj"] <- paste(pAdjustMethod,"adjusted p-values")
res
}
# function copied from `genefilter` package to avoid gfortran requirement
# Function copied from the `genefilter` package to avoid a gfortran requirement.
#
# Compute adjusted p-values at a grid of filter-statistic cutoffs.
#
# filter: numeric vector of filter statistics, or a function applied to `data`
# test:   numeric vector of p-values, or a function applied to the rows of
#         `data` passing each cutoff
# theta:  quantiles of `filter` used as candidate cutoffs
# data:   only used when `filter` or `test` is a function
# method: adjustment method passed to p.adjust()
#
# Returns a matrix with one row per element of `filter` and one column per
# cutoff; entries for rows failing a cutoff are NA.
filtered_p <- function(filter, test, theta, data, method = "none") {
  if (is.function(filter)) {
    U1 <- filter(data)
  } else {
    U1 <- filter
  }
  cutoffs <- quantile(U1, theta)
  result <- matrix(NA_real_, length(U1), length(cutoffs))
  colnames(result) <- names(cutoffs)
  # seq_along() rather than 1:length(): a zero-length theta then yields
  # zero iterations instead of the erroneous c(1, 0) index sequence
  for (i in seq_along(cutoffs)) {
    use <- U1 >= cutoffs[i]
    if (any(use)) {
      if (is.function(test)) {
        U2 <- test(data[use, ])
      } else {
        U2 <- test[use]
      }
      # adjust only within the set of tests passing this cutoff
      result[use, i] <- p.adjust(U2, method)
    }
  }
  return(result)
}
# two low-level functions used by results() to perform contrasts
#
# getContrast takes a DESeqDataSet object
# and a numeric vector specifying a contrast
# and returns a vector of Wald statistics
# corresponding to the contrast.
#
# cleanContrast checks for the validity of
# the specified contrast (numeric or character vector)
# and turns character vector contrast into the appropriate
# numeric vector contrast
#
# results() calls cleanContrast() which calls getContrast()
#
# the formula used is:
# c' beta / sqrt( c' sigma c)
# where beta is the coefficient vector
# and sigma is the covariance matrix for beta
# compute a Wald test for a numeric contrast c of the fitted
# coefficients, using:
#
#   c' beta / sqrt( c' sigma c )
#
# where beta is the coefficient vector and sigma its covariance matrix.
#
# object   - a DESeqDataSet with a stored model matrix attribute and
#            fitted dispersions / coefficients
# contrast - numeric vector with one entry per model coefficient
# useT     - if TRUE, p-values use a t distribution with the per-row
#            degrees of freedom in mcols(object)$tDegreesFreedom
# minmu    - lower bound on fitted means, passed through to fitBeta()
#
# returns a DataFrame with columns log2FoldChange, lfcSE, stat, pvalue,
# with NA rows re-inserted for the all-zero genes
getContrast <- function(object, contrast, useT=FALSE, minmu) {
  if (missing(contrast)) {
    stop("must provide a contrast")
  }
  if (is.null(attr(object,"modelMatrix"))) {
    stop("was expecting a model matrix stored as an attribute of the DESeqDataSet")
  }
  modelMatrix <- attr(object, "modelMatrix")
  # only continue on the rows with non-zero row mean
  objectNZ <- object[!mcols(object)$allZero,]
  normalizationFactors <- getSizeOrNormFactors(objectNZ)
  alpha_hat <- dispersions(objectNZ)
  # row subsetting does not change column metadata, so the description
  # lookup on 'object' matches the columns of 'objectNZ'
  coefColumns <- names(mcols(objectNZ))[grep("log2 fold change",mcols(mcols(object))$description)]
  # convert betas to natural log scale
  beta_mat <- log(2) * as.matrix(mcols(objectNZ)[,coefColumns,drop=FALSE])
  # convert beta prior variance to natural log scale
  lambda <- 1/(log(2)^2 * attr(object,"betaPriorVar"))
  # check if DESeq() replaced outliers
  countsMatrix <- if ("replaceCounts" %in% assayNames(object)) {
    assays(objectNZ)[["replaceCounts"]]
  } else {
    counts(objectNZ)
  }
  # use weights if they are present in the assays.
  # NOTE(review): weights are taken from the nonzero-subset object so
  # their dimensions match countsMatrix / alpha_hat / beta_mat handed to
  # fitBeta below; the full-object weights would have extra rows
  # whenever all-zero genes exist
  if ("weights" %in% assayNames(object)) {
    useWeights <- TRUE
    weights <- assays(objectNZ)[["weights"]]
    stopifnot(all(weights >= 0))
    # scale so the maximum weight in each row is 1
    weights <- weights / apply(weights, 1, max)
  } else {
    useWeights <- FALSE
    weights <- matrix(1, nrow=nrow(objectNZ), ncol=ncol(objectNZ))
  }
  # maxit=0: no refitting, just evaluate the contrast and its standard
  # error at the stored coefficients
  betaRes <- fitBeta(ySEXP = countsMatrix, xSEXP = modelMatrix,
                     nfSEXP = normalizationFactors,
                     alpha_hatSEXP = alpha_hat,
                     contrastSEXP = contrast,
                     beta_matSEXP = beta_mat,
                     lambdaSEXP = lambda,
                     weightsSEXP = weights,
                     useWeightsSEXP = useWeights,
                     tolSEXP = 1e-8, maxitSEXP = 0,
                     useQRSEXP=FALSE, # QR not relevant, fitting loop isn't entered
                     minmuSEXP=minmu)
  # convert back to log2 scale (hoist the shared constant)
  log2e <- log2(exp(1))
  contrastEstimate <- log2e * betaRes$contrast_num
  contrastSE <- log2e * betaRes$contrast_denom
  contrastStatistic <- contrastEstimate / contrastSE
  # two-sided p-value from a t or a standard normal distribution
  if (useT) {
    stopifnot("tDegreesFreedom" %in% names(mcols(object)))
    df <- mcols(objectNZ)$tDegreesFreedom
    contrastPvalue <- 2*pt(abs(contrastStatistic),df=df,lower.tail=FALSE)
  } else {
    contrastPvalue <- 2*pnorm(abs(contrastStatistic),lower.tail=FALSE)
  }
  contrastList <- list(log2FoldChange=contrastEstimate,
                       lfcSE=contrastSE,
                       stat=contrastStatistic,
                       pvalue=contrastPvalue)
  # re-insert NA rows for the all-zero genes removed above
  contrastResults <- buildDataFrameWithNARows(contrastList,
                                              mcols(object)$allZero)
  names(contrastResults) <- c("log2FoldChange","lfcSE","stat","pvalue")
  contrastResults
}
# this function takes a desired contrast as specified by results(),
# performs checks, and then either returns the already existing contrast
# or generates the contrast by calling getContrast() using a numeric vector
cleanContrast <- function(object, contrast, expanded=FALSE, listValues, test, useT, minmu) {
# get the names of columns in the beta matrix
resNames <- resultsNames(object)
# if possible, return pre-computed columns, which are
# already stored in mcols(dds). this will be the case using
# results() with 'name', or if expanded model matrices were not
# run and the contrast contains the reference level as numerator or denominator
resReady <- FALSE
if (is.character(contrast)) {
contrastFactor <- contrast[1]
if (!contrastFactor %in% names(colData(object))) {
stop(paste(contrastFactor,"should be the name of a factor in the colData of the DESeqDataSet"))
}
if (!is(colData(object)[[contrastFactor]], "factor")) {
stop(paste(contrastFactor,"is not a factor, see ?results"))
}
contrastNumLevel <- contrast[2]
contrastDenomLevel <- contrast[3]
contrastBaseLevel <- levels(colData(object)[,contrastFactor])[1]
# check for intercept
hasIntercept <- attr(terms(design(object)),"intercept") == 1
firstVar <- contrastFactor == all.vars(design(object))[1]
# tricky case: if the design has no intercept, the factor is
# not the first variable in the design, and one of the numerator or denominator
# is the reference level, then the desired contrast is simply a coefficient (or -1 times)
noInterceptPullCoef <- !hasIntercept & !firstVar &
(contrastBaseLevel %in% c(contrastNumLevel, contrastDenomLevel))
# case 1: standard model matrices: pull coef or build the appropriate contrast
# coefficients names are of the form "factor_level_vs_baselevel"
# output: contrastNumColumn and contrastDenomColumn
if (!expanded & (hasIntercept | noInterceptPullCoef)) {
# use make.names() so the column names are
# the same as created by DataFrame in mcols(object).
contrastNumColumn <- make.names(paste0(contrastFactor,"_",contrastNumLevel,"_vs_",contrastBaseLevel))
contrastDenomColumn <- make.names(paste0(contrastFactor,"_",contrastDenomLevel,"_vs_",contrastBaseLevel))
# check that the desired contrast is already
# available in mcols(object), and then we can either
# take it directly or multiply the log fold
# changes and Wald stat by -1
if ( contrastDenomLevel == contrastBaseLevel ) {
cleanName <- paste(contrastFactor,contrastNumLevel,"vs",contrastDenomLevel)
# the results can be pulled directly from mcols(object)
name <- if (!noInterceptPullCoef) {
make.names(paste0(contrastFactor,"_",contrastNumLevel,"_vs_",contrastDenomLevel))
} else {
make.names(paste0(contrastFactor,contrastNumLevel))
}
if (!name %in% resNames) {
stop(paste("as",contrastDenomLevel,"is the reference level, was expecting",name,"to be present in 'resultsNames(object)'"))
}
log2FoldChange <- getCoef(object, name)
lfcSE <- getCoefSE(object, name)
stat <- getStat(object, test, name)
pvalue <- getPvalue(object, test, name)
res <- cbind(mcols(object)["baseMean"],log2FoldChange,lfcSE,stat,pvalue)
names(res) <- c("baseMean","log2FoldChange","lfcSE","stat","pvalue")
lfcType <- if (attr(object,"betaPrior")) "MAP" else "MLE"
lfcDesc <- paste0("log2 fold change (",lfcType,"): ",cleanName)
mcols(res,use.names=TRUE)["log2FoldChange","description"] <- lfcDesc
resReady <- TRUE
} else if ( contrastNumLevel == contrastBaseLevel ) {
# fetch the results for denom vs num
            # and multiply the log fold change and stat by -1
cleanName <- paste(contrastFactor,contrastNumLevel,"vs",contrastDenomLevel)
swapName <- if (!noInterceptPullCoef) {
make.names(paste0(contrastFactor,"_",contrastDenomLevel,"_vs_",contrastNumLevel))
} else {
make.names(paste0(contrastFactor,contrastDenomLevel))
}
if (!swapName %in% resNames) {
stop(paste("as",contrastNumLevel,"is the reference level, was expecting",swapName,"to be present in 'resultsNames(object)'"))
}
log2FoldChange <- getCoef(object, swapName)
lfcSE <- getCoefSE(object, swapName)
stat <- getStat(object, test, swapName)
pvalue <- getPvalue(object, test, swapName)
res <- cbind(mcols(object)["baseMean"],log2FoldChange,lfcSE,stat,pvalue)
names(res) <- c("baseMean","log2FoldChange","lfcSE","stat","pvalue")
res$log2FoldChange <- -1 * res$log2FoldChange
if (test == "Wald") res$stat <- -1 * res$stat
lfcType <- if (attr(object,"betaPrior")) "MAP" else "MLE"
# rename some of the columns using the flipped contrast
if (test == "Wald") {
contrastDescriptions <- paste(c(paste0("log2 fold change (",lfcType,"):"),
"standard error:",
"Wald statistic:","Wald test p-value:"), cleanName)
mcols(res,use.names=TRUE)[c("log2FoldChange","lfcSE","stat","pvalue"),
"description"] <- contrastDescriptions
} else {
contrastDescriptions <- paste(c(paste0("log2 fold change (",lfcType,"):"),
"standard error:"), cleanName)
mcols(res,use.names=TRUE)[c("log2FoldChange","lfcSE"),
"description"] <- contrastDescriptions
}
resReady <- TRUE
} else {
# check for the case where neither are present
# as comparisons against reference level
if ( ! (contrastNumColumn %in% resNames &
contrastDenomColumn %in% resNames) ) {
stop(paste(contrastNumLevel,"and",contrastDenomLevel,"should be levels of",contrastFactor,"such that",contrastNumColumn,"and",contrastDenomColumn,"are contained in 'resultsNames(object)'"))
}
}
# case 2: expanded model matrices or no intercept and first variable
# need to then build the appropriate contrast.
# these coefficient names have the form "factorlevel"
# output: contrastNumColumn and contrastDenomColumn
} else {
# we only need to check validity
contrastNumColumn <- make.names(paste0(contrastFactor, contrastNumLevel))
contrastDenomColumn <- make.names(paste0(contrastFactor, contrastDenomLevel))
if ( ! (contrastNumColumn %in% resNames & contrastDenomColumn %in% resNames) ) {
stop(paste(paste0(contrastFactor,contrastNumLevel),"and",paste0(contrastFactor,contrastDenomLevel),
"are expected to be in resultsNames(object)"))
}
}
# check if both levels have all zero counts
# (this has to be down here to make use of error checking above)
contrastAllZero <- contrastAllZeroCharacter(object, contrastFactor,
contrastNumLevel, contrastDenomLevel)
}
# if the result table not already built in the above code
if (!resReady) {
# here, a numeric / list / character contrast which will be converted
# into a numeric contrast and run through getContrast()
if (is.numeric(contrast)) {
# make name for numeric contrast
signMap <- c("","","+")
contrastSigns <- signMap[sign(contrast)+2]
contrastName <- paste(paste0(contrastSigns,as.character(contrast)),collapse=",")
} else if (is.list(contrast)) {
# interpret list contrast into numeric and make a name for the contrast
lc1 <- length(contrast[[1]])
lc2 <- length(contrast[[2]])
# these just used for naming
listvalname1 <- round(listValues[1],3)
listvalname2 <- round(listValues[2],3)
if (lc1 > 0 & lc2 > 0) {
listvalname2 <- abs(listvalname2)
listvalname1 <- if (listvalname1 == 1) "" else paste0(listvalname1," ")
listvalname2 <- if (listvalname2 == 1) "" else paste0(listvalname2," ")
contrastName <- paste0(listvalname1,paste(contrast[[1]],collapse="+")," vs ",listvalname2,paste(contrast[[2]],collapse="+"))
} else if (lc1 > 0 & lc2 == 0) {
listvalname1 <- if (listvalname1 == 1) "" else paste0(listvalname1," ")
contrastName <- paste0(listvalname1,paste(contrast[[1]],collapse="+")," effect")
} else if (lc1 == 0 & lc2 > 0) {
contrastName <- paste(listvalname2,paste(contrast[[2]],collapse="+"),"effect")
}
contrastNumeric <- rep(0,length(resNames))
contrastNumeric[resNames %in% contrast[[1]]] <- listValues[1]
contrastNumeric[resNames %in% contrast[[2]]] <- listValues[2]
contrast <- contrastNumeric
} else if (is.character(contrast)) {
# interpret character contrast into numeric and make a name for the contrast
contrastNumeric <- rep(0,length(resNames))
contrastNumeric[resNames == contrastNumColumn] <- 1
contrastNumeric[resNames == contrastDenomColumn] <- -1
contrast <- contrastNumeric