-
Notifications
You must be signed in to change notification settings - Fork 3
/
precision_recall.R
125 lines (123 loc) · 5.38 KB
/
precision_recall.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#' Compute precision-recall
#'
#' Compute precision and recall using each \link[GenomicRanges]{GRanges}
#' object in \code{peakfiles} as the "query"
#' against each \link[GenomicRanges]{GRanges} object in \code{reference}
#' as the subject.
#'
#' @param thresholding_cols Depending on which columns are present,
#' \link[GenomicRanges]{GRanges} will
#' be filtered at each threshold according to one or more of the following:
#' \itemize{
#' \item{"total_signal" : }{Used by the peak calling software
#' \href{https://github.com/FredHutch/SEACR}{SEACR}.
#' \emph{NOTE}: Another SEACR column (e.g. "max_signal") can be used
#' together or instead of "total_signal".}
#' \item{"qValue"}{Used by the peak calling software
#' \href{https://github.com/macs3-project/MACS}{MACS2/3}.
#' Should contain the negative log of the p-values after multiple
#' testing correction.}
#' \item{"Peak Score" : }{
#' Used by the peak calling software
#' \href{http://homer.ucsd.edu/homer/ngs/peaks.html}{HOMER}.}
#' }
#' @param initial_threshold Numeric threshold that was provided to SEACR
#' (via the parameter \code{--ctrl}) when calling peaks without an IgG
#' control.
#' @param max_threshold Maximum threshold to test.
#' @param n_threshold Number of thresholds to test.
#' @param cast Cast the data into a format that's more compatible with
#' \pkg{ggplot2}.
#' @param save_path File path to save precision-recall results to.
#' @param verbose Print messages.
#' @inheritParams EpiCompare
#' @inheritParams get_bpparam
#' @inheritParams check_workers
#' @inheritDotParams bpplapply
#' @return Overlap
#'
#' @export
#' @importFrom data.table rbindlist
#' @examples
#' data("CnR_H3K27ac")
#' data("CnT_H3K27ac")
#' data("encode_H3K27ac")
#' peakfiles <- list(CnR_H3K27ac=CnR_H3K27ac, CnT_H3K27ac=CnT_H3K27ac)
#' reference <- list("encode_H3K27ac" = encode_H3K27ac)
#'
#' pr_df <- precision_recall(peakfiles = peakfiles,
#' reference = reference,
#' workers = 1)
precision_recall <- function(peakfiles,
reference,
thresholding_cols=c("total_signal",
"qValue",
"Peak Score"),
initial_threshold=0,
n_threshold=20,
max_threshold=1,
cast=TRUE,
workers=1,
verbose=TRUE,
save_path=
tempfile(fileext = "precision_recall.csv"),
...){
precision <- recall <- F1 <- NULL;
workers <- check_workers(workers = workers)
threshold_list <- seq(from=initial_threshold,
to=1-(max_threshold/n_threshold),
length.out=n_threshold)
names(threshold_list) <- threshold_list
#### Check which have necessary columns #####
peakfiles <- check_grlist_cols(grlist = peakfiles,
target_cols = thresholding_cols)
##### Iterate over peakfiles #####
pr_df <- bpplapply(X = threshold_list,
workers = workers,
FUN = function(thresh){
if(verbose) message_parallel("Threshold=",thresh,": Filtering peaks")
peakfiles_filt <- mapply(peakfiles,
FUN=function(gr){
## Compute percentiles
gr <- compute_percentiles(
gr = gr,
thresholding_cols = thresholding_cols,
initial_threshold = initial_threshold
)
## Filter
gr <- filter_percentiles(
gr = gr,
thresh = thresh,
thresholding_cols = thresholding_cols)
return(gr)
})
df <- overlap_percent(peaklist1 = peakfiles_filt,
peaklist2 = reference,
suppress_messages = FALSE,
precision_recall = TRUE)
return(df)
}, ...) |> data.table::rbindlist(use.names = TRUE, idcol = "threshold")
#### Recast data ###
if(isTRUE(cast)){
#### Post-process data ####
messager("Reformatting precision-recall data.")
pr_df <- data.table::dcast(
data = pr_df,
formula = "peaklist1 + peaklist2 + threshold ~ type",
fun.aggregate = mean,
value.var = c("Percentage","overlap","total"))
data.table::setnames(pr_df,
c("Percentage_precision","Percentage_recall"),
c("precision","recall"))
pr_df$threshold <- as.numeric(pr_df$threshold)
#### Compute F1 #####
pr_df[,F1:=(2*(precision*recall) / (precision+recall))]
pr_df[is.na(F1),]$F1 <- 0
}
#### Save ####
save_path <- save_results(dat = pr_df,
save_path = save_path,
type = "precision-recall")
#### Return ####
return(pr_df)
}