Skip to content

Commit

Permalink
first working version v.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
mdbrown committed Sep 7, 2015
0 parents commit 59dc811
Show file tree
Hide file tree
Showing 25 changed files with 1,712 additions and 0 deletions.
12 changes: 12 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Package: DecisionCurve
Type: Package
Title: Calculate and Plot Decision Curves
Version: 0.1
Date: 2015-08-04
Author: Marshall Brown
Maintainer: <mdbrown@fredhutch.org>
Description: Decision curves are a useful tool to evaluate the population impact of adopting a risk prediction instrument into clinical practice. Given one or more instruments (risk models) that predict the probability of a binary outcome, this package calculates and plots decision curves, which display estimates of the standardized net benefit by the probabilty threshold used to categorize observations as 'high risk.' Bootstrap confidence intervals are displayed as well. This package is a companion to the manuscript 'Kerr. et. al (put ref here.)'.
License: GPL 2.0
LazyData: TRUE
Depends: testthat, reshape, pander, MASS

21 changes: 21 additions & 0 deletions DecisionCurve.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Generated by roxygen2 (4.1.1): do not edit by hand

S3method(summary,DecisionCurve)
export(Add_CostBenefit_Axis)
export(DecisionCurve)
103 changes: 103 additions & 0 deletions R/Add_CostBenefit_Axis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#' Add cost benefit ratio axis to a decision curve plot.
#'
#' If you
#'
#' @param xlim
#' @param cost.benefits Character vector of the form c("c1:b1", "c2:b2", ..., "cn:bn") with integers ci, bi corresponding to specific cost:benefit ratios to print.
#' @param n.cost.benefits number of cost:benefit ratios to print if cost.benefit.axis = TRUE (default n.cost.benefit = 6).
#' @param line x-axis line to print the axis (default is 4).
#'
#' @return List with components
#'
#' @seealso \code{\link{summary.DecisionCurve}}, \code{\link{DecisionCurve}}
#' @examples
#'#helper function
#' expit <- function(xx) exp(xx)/ (1+exp(xx))
#'
#'#load simulated data
#'data(dcaData)
#'# Assume we have access to previously published models
#'# (or models built using a training set)
#'# that we can use to predict the risk of cancer.
#'
#'# Basic model using demographic variables: Age, Female, Smokes.
#'dcaData$BasicModel <- with(dcaData, expit(-7.3 + 0.18*Age - 0.08*Female + 0.80*Smokes ) )
#'
#'# Model using demographic + markers : Age, Female, Smokes, Marker1 and Marker2.
#'dcaData$FullModel <- with(dcaData, expit(-10.5 + 0.22*Age - 0.01*Female + 0.91*Smokes + 2.03*Marker1 - 1.56*Marker2))
#'
#'#use DecisionCurve defaults (set bootstraps = 25 here to reduce computation time).
#'DecisionCurve(dcaData,
#' outcome = "Cancer", predictors = c("BasicModel", "FullModel"),
#' bootstraps = 25)
#'
#'par(mar = c(7.5, 4, 3, 2) + 0.1)
#'(finish later)
#'
#' @export


Add_CostBenefit_Axis <- function(xlim,
cost.benefits,
n.cost.benefits = 6,
line = 4, ...){

if(missing(cost.benefits)){
#no cost.benefits are given, so we must choose them (wisely) from available defaults
#listed here
tmp <- c( 1/2, 1/3, 1/4, 2/3, 3/4, 1/5, 1/10, 2/5, 3/5, 4/5, 7/10, 9/10, 5/6, 1/15, 1/20, 1/25, 1/30, 1/40, 1/50, 1/60, 1/75, 1/80, 1/100)
dd <- data.frame( "name" = c("1:1", "1:2", "1:3", "1:4", "2:3", "3:4", "1:5", "1:10", "2:5", "3:5", "4:5", "7:10", "9:10", "5:6", "1:15", "1:20", "1:25", "1:30", "1:40", "1:50", "1:60", "1:75","1:80", "1:100",
"2:1", "3:1", "4:1", "3:2", "4:3", "5:1", "10:1", "5:2", "5:3", "5:4", "10:7", "10:9", "6:5", "15:1", "20:1", "25:1", "30:1", "40:1", "50:1", "60:1","75:1", "80:1", "100:1"),
"value" = c(1/1, tmp, 1/tmp))
dd$threshold <- dd$value/(1+dd$value)
dd <- dd[order(dd$threshold),]

dd.sub <- subset(dd, threshold >= min(xlim) & threshold <= max(xlim))
#check to make sure there are default choices within xlim
if(nrow(dd.sub)==0){
warning("Range of risk thresholds printed do not span available default Cost:Benefit ratios. Cost:Benefit axis will not be printed.")
}else{

#evenly spaced increments of threshold from xlim
thresh.quants <- quantile(seq(min(xlim), max(xlim), length.out = 100),
probs = seq(0, 1, length.out = n.cost.benefits),
type = 1)

#now find C:B that are near thresh.quants
match.quant <- numeric(length(thresh.quants))
for(i in seq_along(thresh.quants)){
match.quant[i] <- which.min(abs(thresh.quants[i] - dd.sub$threshold))
}

#just print the C:B that have been chosen
out <- dd.sub[match.quant, ]

axis(1, at = out$threshold, labels = out$name, line = line, ...)
invisible(out)
}
}else{
#cost benefits are given, so we plot them.
dd <- data.frame("name" = cost.benefits)

#extract data from the
CB.split <- strsplit(as.character(cost.benefits), split = ":")
C <- sapply(CB.split, function(x) as.numeric(x[1]))
B <- sapply(CB.split, function(x) as.numeric(x[2]))

dd$value = C/B
dd$threshold <- dd$value/(1+dd$value)
dd <- dd[order(dd$threshold),]
dd.sub <- subset(dd, threshold >= min(xlim) & threshold <= max(xlim))
#check to make sure there are default choices within xlim
if(nrow(dd.sub)==0){
warning("Range of risk thresholds printed do not span provided Cost:Benefit ratios. Cost:Benefit axis will not be printed.")
}else{

out <- dd.sub
axis(1, at = out$threshold, labels = out$name, line = line, ...)
invisible(out)
}
}
}


162 changes: 162 additions & 0 deletions R/DecisionCurve.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#' Plot decision curves
#'
#'Decision curves are a useful tool to evaluate the population impact of adopting a risk prediction instrument into clinical practice. Given one or more instruments (risk models) that predict the probability of a binary outcome, this package calculates and plots decision curves, which display estimates of the standardized net benefit by the probabilty threshold used to categorize observations as 'high risk.' Bootstrap confidence intervals are displayed as well. This package is a companion to the manuscript 'Kerr. et. al (put ref here.)'.
#'
#' @param data data.frame containing outcome and predictors. Missing data on any of the predictors will cause the entire observation to be removed.
#' @param outcome Name of outcome of interest found in data. Within 'data', the outcome variable must be a numeric vector of 0's and 1's.
#' @param predictors Vector of names for risk predictors to calculate decision curves. 'data' must have columns corresponding to the names provided consisting of predicted risks Pr(outcome = 1 | predictor). Risks must fall between 0 and 1.
#' @param thresholds Numeric vector of high risk thresholds to use when plotting and calculating net benefit values.
#' @param standardize logical (default TRUE) indicating whether to use the standardized net benefit (NB/disease prevalence) or not.
#' @param confidence.intervals Numeric (default 0.95 for 95\% confidence bands) level of bootstrap confidence intervals to plot. Set as NA or 'none' to remove confidence intervals. See details for more information.
#' @param bootstraps Number of bootstrap replicates to use to calculate confidence intervals (default 500).
#' @param cost.benefit.axis logical (default TRUE) indicating whether to print an additional x-axis showing relative cost:benefit ratios in addition to risk thresholds.
#' @param n.cost.benefits number of cost:benefit ratios to print if cost.benefit.axis = TRUE (default n.cost.benefit = 6).
#' @param cost.benefits Character vector of the form c("c1:b1", "c2:b2", ..., "cn:bn") with integers ci, bi corresponding to specific cost:benefit ratios to print.
#' @param legend.position One of "topright" (default), "top", "topleft", "left", "bottomleft", "bottom", "bottomright", "right", or "none" indicating where the legend should be printed.
#' @param col vector of color names to be used in plotting corresponding to the 'predictors' given. Default colors will be chosen from rainbow(..., v = .8). See details for more information on plot parameters.
#' @param lty vector of linetypes to plot corresponding to 'predictors'.
#' @param lwd vector of linewidths to plot corresponding to 'predictors'.
#' @param xlim vector giving c(min, max) of x-axis. Defaults to c(min(thresholds), max(thresholds)).
#' @param ylim vector giving c(min, max) of y-axis.
#' @param xlab label of main x-axis.
#' @param ylab label of y-axis.
#' @param cost.benefit.xlab label of cost:benefit ratio axis.
#' @param plot Logical (default TRUE) indicating whether to plot decision curves.
#' @param ... other options directly send to plot()
#'
#' @details Confidence intervals for (standardized) net benefit are calculated pointwise at each risk threshold. Bootstrap sampling is done without stratifying on outcome, so disease prevalence varies within bootstrap samples. Note that the confidence intervals calculated assume the risk prediction tool
#' @return List with components
#' \itemize{
#' \item plot.data: a list with further data.frame components 'estimates', 'ci_lower', and 'ci_upper' showing point estimates of (standardized) net benefits, and lower and upper ci's, respectively.
#' \item derived.data: A data frame in long form showing the following for each predictor and each 'threshold', 'FPF':false positive fraction, 'TPF': true positive fraction, 'NB': net benefit, 'sNB': standardized net benefit, 'rho': outcome prevalence, 'predictor': name of predictor, 'x_lower', 'x_upper': the lower and upper confidence bands for NB or sNB.
#' \item standardized: Whether standardized net benefit or net benefit is returned.
#' \item call: matched function call.
#' }
#'
#' @seealso \code{\link{summary.DecisionCurve}}, \code{\link{Add_CostBenefit_Axis}}
#' @examples
#'#helper function
#' expit <- function(xx) exp(xx)/ (1+exp(xx))
#'
#'#load simulated data
#'data(dcaData)
#'# Assume we have access to previously published models
#'# (or models built using a training set)
#'# that we can use to predict the risk of cancer.
#'
#'# Basic model using demographic variables: Age, Female, Smokes.
#'dcaData$BasicModel <- with(dcaData, expit(-7.3 + 0.18*Age - 0.08*Female + 0.80*Smokes ) )
#'
#'# Model using demographic + markers : Age, Female, Smokes, Marker1 and Marker2.
#'dcaData$FullModel <- with(dcaData, expit(-10.5 + 0.22*Age - 0.01*Female + 0.91*Smokes + 2.03*Marker1 - 1.56*Marker2))
#'
#'#use DecisionCurve defaults (set bootstraps = 25 here to reduce computation time).
#'DecisionCurve(dcaData,
#' outcome = "Cancer", predictors = c("BasicModel", "FullModel"),
#' bootstraps = 25)
#'
#' @export

DecisionCurve <- function(data,
outcome,
predictors,
thresholds = seq(0, 1, by = .01),
standardize = TRUE,
confidence.intervals = 0.95,
bootstraps = 500,
cost.benefit.axis = TRUE,
n.cost.benefits = 6,
cost.benefits,
legend.position = "topright",
col,
lty, lwd = 2,
xlim, ylim,
xlab = "Risk Threshold", ylab,
cost.benefit.xlab = "Cost:Benefit Ratio",
plot = TRUE,
...){
call <- match.call()

#check that inputs are ok, return complete cases
cc <- check_DC_Inputs(data = data,
outcome = outcome,
predictors = predictors,
thresholds = thresholds,
standardize = standardize,
confidence.intervals = confidence.intervals,
bootstraps = bootstraps,
legend.position = legend.position,
cost.benefit.axis = cost.benefit.axis,
cost.benefits = cost.benefits,
n.cost.benefits = n.cost.benefits)

data <- data[cc, c(outcome, predictors)]

legend.position <- legend.position[1]
n.cost.benefits <- n.cost.benefits[1]
standardize <- standardize[1]


#calculate curves
dc.data <- get_DecisionCurve(data, outcome, predictors,
threshold = thresholds,
standardize = standardize,
confidence.intervals = confidence.intervals,
bootstraps = bootstraps)


#set some defaults if needed
if(missing(xlim)) xlim = range(thresholds)

if(missing(lty)) lty = rep(1, length(predictors) + 2)
if(length(lty) ==1) lty = rep(lty, length(predictors) + 2)
if(length(lty) == length(predictors)) lty = c(lty, 1, 1)

if(missing(col)) col = c(rainbow(length(predictors), v = .8), "grey66", "black")
if(length(col) == length(predictors)) col <- c(col, "grey66", "black")

if(missing(lwd)) lwd = 2
if(length(lwd) ==1) lwd <- rep(lwd, length(predictors))
if(length(lwd) == length(predictors)) lwd = c(lwd, 1, 1)

if(missing(ylab)) ylab = ifelse(standardize, "Standardized Net Benefit", "Net Benefit")

#plot the curves
if(plot){
plot_DecisionCurve(xx = dc.data,
predictors = predictors,
standardize = standardize,
confidence.intervals,
cost.benefit.axis = cost.benefit.axis,
cost.benefits = cost.benefits,
n.cost.benefits = n.cost.benefits,
cost.benefit.xlab = cost.benefit.xlab,
xlab = xlab, ylab = ylab,
col = col,
lty = lty, lwd = lwd,
xlim = xlim, ylim = ylim,
legend.position = legend.position, ...)
}
#return list of elements
xx.wide <- cast(dc.data, threshold+cost.benefit.ratio~predictor, value = ifelse(standardize, "sNB", "NB"))
if(is.numeric(confidence.intervals)){
xx.lower <- cast(dc.data, threshold+cost.benefit.ratio~predictor, value = ifelse(standardize, "sNB_lower", "NB_lower"))
xx.upper <- cast(dc.data, threshold+cost.benefit.ratio~predictor, value = ifelse(standardize, "sNB_upper", "NB_upper"))
}else{
xx.lower <- NULL
xx.upper <- NULL
}
out <- list("plot.data" = list( "estimates" = xx.wide[,c("threshold", "cost.benefit.ratio", "all", predictors, "none")],
"ci_lower" = xx.lower[,c("threshold", "cost.benefit.ratio", "all", predictors, "none")],
"ci_upper" = xx.upper[,c("threshold", "cost.benefit.ratio", "all", predictors, "none")]),
"derived.data" = dc.data,
"standardized" = standardize,
"confidence.intervals" = confidence.intervals,
"call" = call)
class(out) = "DecisionCurve"
invisible(out)

}



88 changes: 88 additions & 0 deletions R/check_DC_Inputs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
check_DC_Inputs <- function(data,
outcome,
predictors,
thresholds = seq(0, 1, by = .01),
standardize = TRUE,
confidence.intervals, bootstraps,
legend.position = "topright",
cost.benefit.axis = TRUE,
cost.benefits,
n.cost.benefits = 6 ){

#function to check all inputs, return vector of complete cases.

#check data
if(!is.data.frame(data)) stop("'data' must be a data.frame.")

#check outcome
if(!is.character(outcome)) stop("'outcome' must be a character string naming the outcome variable to be found in 'data'.")
if(length(outcome) > 1) {
warning("Only one outcome is allowed at a time. Only the first element will be used")
outcome <- outcome[1]
}
if( !is.element(outcome, names(data))) stop("outcome is not found in data: outcome must be a character string of the name for the outcome variable found in 'data'.")
if(!is.numeric(data[[outcome]])) stop("The outcome variable must refer to a numeric variable of 0's and 1's in 'data'.")
if(!all(is.element(data[[outcome]], c(0,1)))) stop("The outcome variable must refer to a numeric variable of 0's and 1's in 'data'.")

#check predictors
if(!is.character(predictors)) stop("'predictors' must be a vector of character strings naming the predictor variables to be found in 'data'.")
if( !all(is.element(predictors, names(data)))) stop("One or more predictors are not in names(data): predictors must be a vector of character strings giving the names for the predictor variables found in 'data'.")
if( any(is.element(predictors, c("all", "none")))) stop("Predictors cannot be named 'all' or 'none'")

for(p in predictors){
if(min(data[[p]], na.rm = TRUE) < 0 | max(data[[p]], na.rm = TRUE) > 1){
stop(paste("predictor", p, "shows risk predictions that are not inbetween 0 and 1. Decision curves cannot be calculated for such predictors."))
}

}

#check for complete cases, and remove missing cases if necessary.
data <- data[, c(outcome, predictors)]
cc <- complete.cases(data)
if(any(!(cc))){
warning(paste(sum(!cc), "observations have missing data, and will be removed from analysis."))
}

#check thresholds
stopifnot(is.numeric(thresholds))
stopifnot(min(thresholds) >= 0 )
stopifnot(max(thresholds) <= 1 )

#check standardize
stopifnot(is.logical(standardize))

#check conf.int
if(is.numeric(confidence.intervals) ){
if(confidence.intervals <= 0 | confidence.intervals > 1) stop("confidence.intervals must be between 0 and 1 (or 'none'/NA if no confidence intervals are wanted).")
}
#check bootstraps
stopifnot(is.numeric(bootstraps))
stopifnot(bootstraps > 0 )

#check legend.position
if(!is.element(legend.position[1], c("none", "top",
"topright", "right",
"bottomright", "bottom",
"bottomleft", "left", "topleft"))){
stop('legend.position must be one of "none", "top","topright", "right", "bottomright", "bottom", "bottomleft", "left", "topleft"')
}

#check cost.benefit.axis
stopifnot(is.logical(cost.benefit.axis))

#check fits
if(!missing(cost.benefits)){
if( is.character(all.equal(grep(":", cost.benefits), 1:length(cost.benefits)))){
stop("There was an error in how cost.benefits were input. 'cost.benefits', when specified, must be a character vector with elements 'xx:yy' with xx, yy representing the relative costs and benefits-for example c('1:2', '2:3', '1:1', '4:1').")
}
}

#check nfits
if(missing(cost.benefits)){
stopifnot(is.numeric(n.cost.benefits))
}


return(cc)
}

Loading

0 comments on commit 59dc811

Please sign in to comment.