-
Notifications
You must be signed in to change notification settings - Fork 19
/
calcLocEnrichment.R
188 lines (155 loc) · 7.66 KB
/
calcLocEnrichment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
######################################################################
# ENRICHMENT - Actual workhorse enrichment calculation functions
######################################################################
#' Enrichment Calculation
#'
#' Workhorse function that calculates overlaps between userSets,
#' and then uses a fisher's exact test rank them by significance
#' of the overlap.
#'
#' @param userSets Regions of interest
#' @param userUniverse Regions tested for inclusion in userSets
#' @param regionDB Region DB to check for overlap, from loadRegionDB()
#' @param minOverlap (Default:1) Minimum bases required to count an overlap
#' @param cores Number of processors
#' @param redefineUserSets run redefineUserSets() on your userSets?
#' @param direction Defaults to "enrichment", but may also accept
#' "depletion", which will swap the direction of the fisher test (use
#' 'greater' or less' value passed to the 'alternative' option of
#' fisher.test)
#' @return Data.table with enrichment results. Rows correspond to individual
#' pairwise fisher's tests comparing a single userSet with a single databaseSet.
#' The columns in this data.table are: userSet and dbSet: index into their
#' respective input region sets. pvalueLog: -log10(pvalue) from the fisher's exact
#' result; oddsRatio: result from the fisher's exact test; support: number of
#' regions in userSet overlapping databaseSet; rnkPV, rnkOR, rnkSup: rank in this
#' table of p-value, oddsRatio, and Support respectively. The --value is the
#' negative natural log of the p-value returned from a one-sided fisher's exact
#' test. maxRnk, meanRnk: max and mean of the 3 previous ranks, providing a
#' combined ranking system. b, c, d: 3 other values completing the 2x2 contingency
#' table (with support). The remaining columns describe the dbSet for the row.
#'
#' If you have the qvalue package installed from bioconductor, runLOLA will add
#' a q-value transformation to provide FDR scores automatically.
#' @export
#' @example
#' R/examples/example.R
runLOLA = function(userSets, userUniverse, regionDB, minOverlap=1, cores=1,
redefineUserSets=FALSE, direction="enrichment") {
# Silence R CMD check Notes:
support=d=b=userSet=pValueLog=rnkSup=rnkPV=rnkOR=NULL
oddsRatio=maxRnk=meanRnk=dbSet=description=NULL
annotationDT = regionDB$regionAnno
testSetsGRL = regionDB$regionGRL
if (direction == "depletion") {
fisherAlternative = "less"
} else {
fisherAlternative = "greater"
}
annotationDT[, dbSet := seq_len(nrow(annotationDT))]
setkey(annotationDT, dbSet)
### Data sanity checks ###
#Confirm we received GRangesList objects, convert from list if possible.
userSets = listToGRangesList(userSets)
testSetsGRL = listToGRangesList(testSetsGRL)
setLapplyAlias(cores)
if (any(is.null(names(testSetsGRL)))) {
names(testSetsGRL) = seq_along(testSetsGRL)
}
if (redefineUserSets) { #redefine user sets in terms of universe?
userSets = redefineUserSets(userSets, userUniverse, cores=cores)
userSets = listToGRangesList(userSets)
}
userSetsLength = unlist(lapplyAlias((userSets), length))
if (! any( isDisjoint( userSets) ) ) {
message("You have non-disjoint userSets.")
}
### Construct significance tests ###
message("Calculating unit set overlaps...")
# Returns for each userSet, a vector of length length(testSetsGRL), with total
# number of regions in that set overlapping anything in each testSetsGRL; this
# is then lapplied across each userSet.
geneSetDatabaseOverlap =
lapplyAlias( (userSets), countOverlapsRev, testSetsGRL, minoverlap=minOverlap)
# This is WRONG:
#geneSetDatabaseOverlap =
#lapplyAlias( as.list(userSets), countOverlapsAnyRev, testSetsGRL)
# This will become "support" -- the number of regions in the
# userSet (which I implicitly assume is ALSO the number of regions
# in the universe) that overlap anything in each database set.
# Turn results into an overlap matrix. It is
# dbSets (rows) by userSets (columns), counting overlap.
olmat = do.call(cbind, geneSetDatabaseOverlap)
message("Calculating universe set overlaps...")
# Now for each test set, how many items *in the universe* does
# it overlap? This will go into the calculation for c
#faster. Returns number of items in userUniverse.
testSetsOverlapUniverse = countOverlaps(testSetsGRL, userUniverse,
minoverlap=minOverlap)
# Returns number of items in test set (not used:)
#testSetsOverlapUniverse = countOverlapsAny(testSetsGRL, userUniverse)
# Total size of the universe
universeLength = length(userUniverse)
# To build the fisher matrix, support is 'a'
scoreTable = data.table(reshape2::melt(t(olmat), variable.factor=FALSE))
setnames(scoreTable, c("Var1", "Var2", "value"), c("userSet", "dbSet", "support"))
# reshape2 has an annoying habit of converting strings into factors, which
# is undesirable. If the userSets are named with strings, make sure they stay
# character. Integers are already handled appropriately.
if ("factor" %in% class(scoreTable[, userSet])) {
scoreTable$userSet = as.character(scoreTable$userSet)
}
message("Calculating Fisher scores...")
# b = the # of items *in the universe* that overlap each dbSet,
# less the support; This is the number of items in the universe
# that are in the dbSet ONLY (not in userSet)
# c = the size of userSet, less the support; This is the opposite:
# Items in the userSet ONLY (not in the dbSet)
scoreTable[,c("b", "c"):=list(b=testSetsOverlapUniverse[match(dbSet,
names(testSetsOverlapUniverse))]-support, c=userSetsLength-support)]
# This is the regions in the universe, but not in dbSet nor userSet.
scoreTable[,d:=universeLength-support-b-c]
if( scoreTable[,any(b<0)] ) { # Inappropriate universe.
warning(cleanws("Negative b entry in table. This means either: 1) Your user sets
contain items outside your universe; or 2) your universe has a region that
overlaps multiple user set regions, interfering with the universe set overlap
calculation."))
return(scoreTable)
}
if( scoreTable[,any(c<0)] ) {
warning("Negative c entry in table. Bug with userSetsLength; this should not happen.")
return(scoreTable)
}
scoreTable[,c("pValueLog", "oddsRatio") :=
fisher.test(matrix(c(support,b,c,d), 2, 2), alternative=fisherAlternative)[c("p.value",
"estimate")], by=list(userSet,dbSet)]
# Include qvalue if package exists.
if (requireNamespace("qvalue", quietly=TRUE)) {
# Wrap in try block since this is not vital.
# if you want qvalues...
tryCatch( {
scoreTable[,qValue:=qvalue::qvalue(pValueLog)$qvalue]
}, error = function(e) { warning("Problem in FDR calculation with qvalue.") })
} else {
# Another possibility for the future:
# scoreTable[,qValue:=qValues = pmin(pValues*length(pValues),1)]
}
scoreTable[, pValueLog:=-log10(pValueLog + 10^-322)]
### Finalize and Rank results ###
scoreTable[, rnkSup:=rank(-support, ties.method="min"), by=userSet]
scoreTable[, rnkPV:=rank(-pValueLog, ties.method="min"), by=userSet]
scoreTable[, rnkOR:=rank(-oddsRatio, ties.method="min"), by=userSet]
scoreTable[, maxRnk:=max(c(rnkSup, rnkPV, rnkOR)), by=list(userSet,dbSet)]
scoreTable[, meanRnk:=signif(mean(c(rnkSup, rnkPV, rnkOR)), 3), by=list(userSet,dbSet)]
# Append description column
setkeyv(scoreTable, "dbSet")
scoreTable = scoreTable[annotationDT]
# limit description to 80 characters
scoreTable[,description:=substr(description, 0, 80)]
orderedCols = c("userSet", "dbSet", "collection", "pValueLog", "oddsRatio",
"support", "rnkPV", "rnkOR", "rnkSup", "maxRnk", "meanRnk", "b", "c", "d",
"description", "cellType", "tissue", "antibody", "treatment", "dataSource", "filename")
unorderedCols = setdiff(colnames(scoreTable), orderedCols)
setcolorder(scoreTable, c(orderedCols, unorderedCols))
scoreTable[order(pValueLog, -meanRnk, decreasing=TRUE),]
}