first working version v.0.1

mdbrown · Sep 7, 2015 · 59dc811 · 59dc811
commit 59dc811
Show file tree

Hide file tree

Showing 25 changed files with 1,712 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,12 @@
+Package: DecisionCurve
+Type: Package
+Title: Calculate and Plot Decision Curves
+Version: 0.1
+Date: 2015-08-04
+Author: Marshall Brown
+Maintainer:  <mdbrown@fredhutch.org>
+Description: Decision curves are a useful tool to evaluate the population impact of adopting a risk prediction instrument into clinical practice. Given one or more instruments (risk models) that predict the probability of a binary outcome, this package calculates and plots decision curves, which display estimates of the standardized net benefit by the probabilty threshold used to categorize observations as 'high risk.'  Bootstrap confidence intervals are displayed as well. This package is a companion to the manuscript 'Kerr. et. al (put ref here.)'.
+License: GPL 2.0
+LazyData: TRUE
+Depends: testthat, reshape, pander, MASS
+
diff --git a/DecisionCurve.Rproj b/DecisionCurve.Rproj
@@ -0,0 +1,21 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,5 @@
+# Generated by roxygen2 (4.1.1): do not edit by hand
+
+S3method(summary,DecisionCurve)
+export(Add_CostBenefit_Axis)
+export(DecisionCurve)
diff --git a/R/Add_CostBenefit_Axis.R b/R/Add_CostBenefit_Axis.R
@@ -0,0 +1,103 @@
+#' Add cost benefit ratio axis to a decision curve plot.
+#'
+#' If you
+#'
+#' @param xlim
+#' @param cost.benefits Character vector of the form c("c1:b1", "c2:b2", ..., "cn:bn") with integers ci, bi corresponding to specific cost:benefit ratios to print.
+#' @param n.cost.benefits number of cost:benefit ratios to print if cost.benefit.axis = TRUE (default n.cost.benefit = 6).
+#' @param line x-axis line to print the axis (default is 4).
+#'
+#' @return List with components
+#'
+#' @seealso \code{\link{summary.DecisionCurve}},  \code{\link{DecisionCurve}}
+#' @examples
+#'#helper function
+#' expit <- function(xx) exp(xx)/ (1+exp(xx))
+#'
+#'#load simulated data
+#'data(dcaData)
+#'# Assume we have access to previously published models
+#'# (or models built using a training set)
+#'# that we can use to predict the risk of cancer.
+#'
+#'# Basic model using demographic variables: Age, Female, Smokes.
+#'dcaData$BasicModel <- with(dcaData, expit(-7.3 + 0.18*Age - 0.08*Female + 0.80*Smokes ) )
+#'
+#'# Model using demographic + markers : Age, Female, Smokes, Marker1 and Marker2.
+#'dcaData$FullModel <- with(dcaData, expit(-10.5 + 0.22*Age  - 0.01*Female + 0.91*Smokes + 2.03*Marker1 - 1.56*Marker2))
+#'
+#'#use DecisionCurve defaults (set bootstraps = 25 here to reduce computation time).
+#'DecisionCurve(dcaData,
+#'             outcome = "Cancer", predictors = c("BasicModel", "FullModel"),
+#'              bootstraps = 25)
+#'
+#'par(mar = c(7.5, 4, 3, 2) + 0.1)
+#'(finish later)
+#'
+#' @export
+
+
+Add_CostBenefit_Axis <- function(xlim,
+                                 cost.benefits,
+                                 n.cost.benefits = 6,
+                                 line = 4, ...){
+
+  if(missing(cost.benefits)){
+    #no cost.benefits are given, so we must choose them (wisely) from available defaults
+    #listed here
+    tmp <- c( 1/2, 1/3, 1/4, 2/3, 3/4, 1/5, 1/10, 2/5, 3/5, 4/5, 7/10, 9/10, 5/6, 1/15, 1/20,  1/25, 1/30, 1/40, 1/50, 1/60, 1/75, 1/80, 1/100)
+    dd <-  data.frame( "name" = c("1:1", "1:2", "1:3", "1:4", "2:3", "3:4", "1:5", "1:10", "2:5", "3:5", "4:5", "7:10", "9:10", "5:6", "1:15", "1:20", "1:25", "1:30", "1:40", "1:50", "1:60", "1:75","1:80",  "1:100",
+                                  "2:1", "3:1", "4:1", "3:2", "4:3", "5:1", "10:1", "5:2", "5:3", "5:4", "10:7", "10:9", "6:5", "15:1", "20:1", "25:1", "30:1", "40:1", "50:1",  "60:1","75:1", "80:1",  "100:1"),
+                       "value" = c(1/1, tmp, 1/tmp))
+    dd$threshold <- dd$value/(1+dd$value)
+    dd <- dd[order(dd$threshold),]
+
+    dd.sub <- subset(dd, threshold >= min(xlim) & threshold <= max(xlim))
+    #check to make sure there are default choices within xlim
+    if(nrow(dd.sub)==0){
+      warning("Range of risk thresholds printed do not span available default Cost:Benefit ratios. Cost:Benefit axis will not be printed.")
+    }else{
+
+      #evenly spaced increments of threshold from xlim
+      thresh.quants <- quantile(seq(min(xlim), max(xlim), length.out = 100),
+                                probs = seq(0, 1, length.out = n.cost.benefits),
+                                type = 1)
+
+      #now find C:B that are near thresh.quants
+      match.quant <- numeric(length(thresh.quants))
+      for(i in seq_along(thresh.quants)){
+        match.quant[i] <- which.min(abs(thresh.quants[i] - dd.sub$threshold))
+      }
+
+      #just print the C:B that have been chosen
+      out <- dd.sub[match.quant, ]
+
+      axis(1, at   = out$threshold, labels = out$name, line = line, ...)
+      invisible(out)
+    }
+  }else{
+    #cost benefits are given, so we plot them.
+    dd <- data.frame("name" = cost.benefits)
+
+    #extract data from the
+    CB.split <- strsplit(as.character(cost.benefits), split = ":")
+    C <- sapply(CB.split, function(x) as.numeric(x[1]))
+    B <- sapply(CB.split, function(x) as.numeric(x[2]))
+
+    dd$value = C/B
+    dd$threshold <- dd$value/(1+dd$value)
+    dd <- dd[order(dd$threshold),]
+    dd.sub <- subset(dd, threshold >= min(xlim) & threshold <= max(xlim))
+    #check to make sure there are default choices within xlim
+    if(nrow(dd.sub)==0){
+      warning("Range of risk thresholds printed do not span provided Cost:Benefit ratios. Cost:Benefit axis will not be printed.")
+    }else{
+
+      out <- dd.sub
+      axis(1, at   = out$threshold, labels = out$name, line = line, ...)
+      invisible(out)
+    }
+  }
+}
+
+
diff --git a/R/DecisionCurve.R b/R/DecisionCurve.R
@@ -0,0 +1,162 @@
+#' Plot decision curves
+#'
+#'Decision curves are a useful tool to evaluate the population impact of adopting a risk prediction instrument into clinical practice. Given one or more instruments (risk models) that predict the probability of a binary outcome, this package calculates and plots decision curves, which display estimates of the standardized net benefit by the probabilty threshold used to categorize observations as 'high risk.'  Bootstrap confidence intervals are displayed as well. This package is a companion to the manuscript 'Kerr. et. al (put ref here.)'.
+#'
+#' @param data data.frame containing outcome and predictors. Missing data on any of the predictors will cause the entire observation to be removed.
+#' @param outcome Name of outcome of interest found in data. Within 'data', the outcome variable must be a numeric vector of 0's and 1's.
+#' @param predictors Vector of names for risk predictors to calculate decision curves. 'data' must have columns corresponding to the names provided consisting of predicted risks Pr(outcome = 1 | predictor). Risks must fall between 0 and 1.
+#' @param thresholds Numeric vector of high risk thresholds to use when plotting and calculating net benefit values.
+#' @param standardize logical (default TRUE) indicating whether to use the standardized net benefit (NB/disease prevalence) or not.
+#' @param confidence.intervals Numeric (default 0.95 for 95\% confidence bands) level of bootstrap confidence intervals to plot. Set as NA or 'none' to remove confidence intervals. See details for more information.
+#' @param bootstraps Number of bootstrap replicates to use to calculate confidence intervals (default 500).
+#' @param cost.benefit.axis logical (default TRUE) indicating whether to print an additional x-axis showing relative cost:benefit ratios in addition to risk thresholds.
+#' @param n.cost.benefits number of cost:benefit ratios to print if cost.benefit.axis = TRUE (default n.cost.benefit = 6).
+#' @param cost.benefits Character vector of the form c("c1:b1", "c2:b2", ..., "cn:bn") with integers ci, bi corresponding to specific cost:benefit ratios to print.
+#' @param legend.position One of "topright" (default), "top", "topleft", "left", "bottomleft", "bottom", "bottomright", "right", or "none" indicating where the legend should be printed.
+#' @param col vector of color names to be used in plotting corresponding to the 'predictors' given. Default colors will be chosen from rainbow(..., v = .8). See details for more information on plot parameters.
+#' @param lty vector of linetypes to plot corresponding to 'predictors'.
+#' @param lwd vector of linewidths to plot corresponding to 'predictors'.
+#' @param xlim vector giving c(min, max) of x-axis. Defaults to c(min(thresholds), max(thresholds)).
+#' @param ylim vector giving c(min, max) of y-axis.
+#' @param xlab label of main x-axis.
+#' @param ylab label of y-axis.
+#' @param cost.benefit.xlab label of cost:benefit ratio axis.
+#' @param plot Logical (default TRUE) indicating whether to plot decision curves.
+#' @param ... other options directly send to plot()
+#'
+#' @details  Confidence intervals for (standardized) net benefit are calculated pointwise at each risk threshold. Bootstrap sampling is done without stratifying on outcome, so disease prevalence varies within bootstrap samples. Note that the confidence intervals calculated assume the risk prediction tool
+#' @return List with components
+#' \itemize{
+#'   \item plot.data: a list with further data.frame components 'estimates', 'ci_lower', and 'ci_upper' showing point estimates of (standardized) net benefits, and lower and upper ci's, respectively.
+#'   \item derived.data: A data frame in long form showing the following for each predictor and each 'threshold', 'FPF':false positive fraction, 'TPF': true positive fraction, 'NB': net benefit, 'sNB': standardized net benefit, 'rho': outcome prevalence, 'predictor': name of predictor, 'x_lower', 'x_upper': the lower and upper confidence bands for NB or sNB.
+#'   \item standardized: Whether standardized net benefit or net benefit is returned.
+#'   \item call: matched function call.
+#' }
+#'
+#' @seealso \code{\link{summary.DecisionCurve}},  \code{\link{Add_CostBenefit_Axis}}
+#' @examples
+#'#helper function
+#' expit <- function(xx) exp(xx)/ (1+exp(xx))
+#'
+#'#load simulated data
+#'data(dcaData)
+#'# Assume we have access to previously published models
+#'# (or models built using a training set)
+#'# that we can use to predict the risk of cancer.
+#'
+#'# Basic model using demographic variables: Age, Female, Smokes.
+#'dcaData$BasicModel <- with(dcaData, expit(-7.3 + 0.18*Age - 0.08*Female + 0.80*Smokes ) )
+#'
+#'# Model using demographic + markers : Age, Female, Smokes, Marker1 and Marker2.
+#'dcaData$FullModel <- with(dcaData, expit(-10.5 + 0.22*Age  - 0.01*Female + 0.91*Smokes + 2.03*Marker1 - 1.56*Marker2))
+#'
+#'#use DecisionCurve defaults (set bootstraps = 25 here to reduce computation time).
+#'DecisionCurve(dcaData,
+#'             outcome = "Cancer", predictors = c("BasicModel", "FullModel"),
+#'              bootstraps = 25)
+#'
+#' @export
+
+DecisionCurve <- function(data,
+                          outcome,
+                          predictors,
+                          thresholds = seq(0, 1, by = .01),
+                          standardize = TRUE,
+                          confidence.intervals = 0.95,
+                          bootstraps = 500,
+                          cost.benefit.axis = TRUE,
+                          n.cost.benefits = 6,
+                          cost.benefits,
+                          legend.position = "topright",
+                          col,
+                          lty, lwd = 2,
+                          xlim, ylim,
+                          xlab = "Risk Threshold", ylab,
+                          cost.benefit.xlab = "Cost:Benefit Ratio",
+                          plot = TRUE,
+                          ...){
+  call <- match.call()
+
+  #check that inputs are ok, return complete cases
+  cc <- check_DC_Inputs(data = data,
+                  outcome = outcome,
+                  predictors = predictors,
+                  thresholds = thresholds,
+                  standardize = standardize,
+                  confidence.intervals = confidence.intervals,
+                  bootstraps = bootstraps,
+                  legend.position = legend.position,
+                  cost.benefit.axis = cost.benefit.axis,
+                  cost.benefits = cost.benefits,
+                  n.cost.benefits = n.cost.benefits)
+
+  data <- data[cc, c(outcome, predictors)]
+
+  legend.position <- legend.position[1]
+  n.cost.benefits <- n.cost.benefits[1]
+  standardize <- standardize[1]
+
+
+  #calculate curves
+  dc.data <- get_DecisionCurve(data, outcome, predictors,
+                               threshold = thresholds,
+                               standardize = standardize,
+                               confidence.intervals = confidence.intervals,
+                               bootstraps = bootstraps)
+
+
+  #set some defaults if needed
+  if(missing(xlim)) xlim = range(thresholds)
+
+  if(missing(lty)) lty = rep(1, length(predictors) + 2)
+  if(length(lty) ==1) lty = rep(lty, length(predictors) + 2)
+  if(length(lty) == length(predictors)) lty = c(lty, 1, 1)
+
+  if(missing(col)) col  = c(rainbow(length(predictors), v = .8), "grey66", "black")
+  if(length(col) == length(predictors)) col <- c(col, "grey66", "black")
+
+  if(missing(lwd)) lwd = 2
+  if(length(lwd) ==1) lwd <- rep(lwd, length(predictors))
+  if(length(lwd) == length(predictors)) lwd = c(lwd, 1, 1)
+
+  if(missing(ylab)) ylab = ifelse(standardize, "Standardized Net Benefit", "Net Benefit")
+
+  #plot the curves
+  if(plot){
+  plot_DecisionCurve(xx = dc.data,
+                     predictors = predictors,
+                     standardize = standardize,
+                     confidence.intervals,
+                     cost.benefit.axis = cost.benefit.axis,
+                     cost.benefits = cost.benefits,
+                     n.cost.benefits = n.cost.benefits,
+                     cost.benefit.xlab = cost.benefit.xlab,
+                     xlab = xlab, ylab = ylab,
+                     col = col,
+                     lty = lty, lwd = lwd,
+                     xlim = xlim, ylim = ylim,
+                     legend.position = legend.position, ...)
+  }
+  #return list of elements
+  xx.wide <- cast(dc.data, threshold+cost.benefit.ratio~predictor, value = ifelse(standardize, "sNB", "NB"))
+ if(is.numeric(confidence.intervals)){
+  xx.lower <- cast(dc.data, threshold+cost.benefit.ratio~predictor, value = ifelse(standardize, "sNB_lower", "NB_lower"))
+  xx.upper <- cast(dc.data, threshold+cost.benefit.ratio~predictor, value = ifelse(standardize, "sNB_upper", "NB_upper"))
+ }else{
+   xx.lower <- NULL
+   xx.upper <- NULL
+ }
+  out <- list("plot.data" = list( "estimates" = xx.wide[,c("threshold", "cost.benefit.ratio", "all", predictors, "none")],
+                                  "ci_lower" = xx.lower[,c("threshold", "cost.benefit.ratio", "all", predictors, "none")],
+                                  "ci_upper" = xx.upper[,c("threshold", "cost.benefit.ratio", "all", predictors, "none")]),
+              "derived.data"  = dc.data,
+              "standardized" = standardize,
+              "confidence.intervals" = confidence.intervals,
+              "call" = call)
+  class(out) = "DecisionCurve"
+  invisible(out)
+
+}
+
+
+
diff --git a/R/check_DC_Inputs.R b/R/check_DC_Inputs.R
@@ -0,0 +1,88 @@
+check_DC_Inputs <- function(data,
+                            outcome,
+                            predictors,
+                            thresholds = seq(0, 1, by = .01),
+                            standardize = TRUE,
+                            confidence.intervals, bootstraps,
+                            legend.position = "topright",
+                            cost.benefit.axis = TRUE,
+                            cost.benefits,
+                            n.cost.benefits = 6 ){
+
+  #function to check all inputs, return vector of complete cases.
+
+  #check data
+  if(!is.data.frame(data)) stop("'data' must be a data.frame.")
+
+  #check outcome
+  if(!is.character(outcome)) stop("'outcome' must be a character string naming the outcome variable to be found in 'data'.")
+  if(length(outcome) > 1) {
+    warning("Only one outcome is allowed at a time. Only the first element will be used")
+    outcome <- outcome[1]
+  }
+  if( !is.element(outcome, names(data))) stop("outcome is not found in data: outcome must be a character string of the name for the outcome variable found in 'data'.")
+  if(!is.numeric(data[[outcome]])) stop("The outcome variable must refer to a numeric variable of 0's and 1's in 'data'.")
+  if(!all(is.element(data[[outcome]], c(0,1)))) stop("The outcome variable must refer to a numeric variable of 0's and 1's in 'data'.")
+
+  #check predictors
+  if(!is.character(predictors)) stop("'predictors' must be a vector of character strings naming the predictor variables to be found in 'data'.")
+  if( !all(is.element(predictors, names(data)))) stop("One or more predictors are not in names(data): predictors must be a vector of character strings giving the names for the predictor variables found in 'data'.")
+  if( any(is.element(predictors, c("all", "none")))) stop("Predictors cannot be named 'all' or 'none'")
+
+  for(p in predictors){
+    if(min(data[[p]], na.rm = TRUE) < 0 | max(data[[p]], na.rm = TRUE) > 1){
+      stop(paste("predictor", p, "shows risk predictions that are not inbetween 0 and 1. Decision curves cannot be calculated for such predictors."))
+    }
+
+  }
+
+  #check for complete cases, and remove missing cases if necessary.
+  data <- data[, c(outcome, predictors)]
+  cc <- complete.cases(data)
+  if(any(!(cc))){
+    warning(paste(sum(!cc), "observations have missing data, and will be removed from analysis."))
+  }
+
+  #check thresholds
+  stopifnot(is.numeric(thresholds))
+  stopifnot(min(thresholds) >= 0 )
+  stopifnot(max(thresholds) <= 1 )
+
+  #check standardize
+  stopifnot(is.logical(standardize))
+
+  #check conf.int
+  if(is.numeric(confidence.intervals) ){
+    if(confidence.intervals <= 0 | confidence.intervals > 1) stop("confidence.intervals must be between 0 and 1 (or 'none'/NA if no confidence intervals are wanted).")
+  }
+  #check bootstraps
+  stopifnot(is.numeric(bootstraps))
+  stopifnot(bootstraps > 0 )
+
+  #check legend.position
+  if(!is.element(legend.position[1], c("none", "top",
+                                       "topright", "right",
+                                       "bottomright", "bottom",
+                                       "bottomleft", "left", "topleft"))){
+    stop('legend.position must be one of "none", "top","topright", "right", "bottomright", "bottom", "bottomleft", "left", "topleft"')
+  }
+
+  #check cost.benefit.axis
+  stopifnot(is.logical(cost.benefit.axis))
+
+  #check fits
+  if(!missing(cost.benefits)){
+    if( is.character(all.equal(grep(":", cost.benefits), 1:length(cost.benefits)))){
+      stop("There was an error in how cost.benefits were input. 'cost.benefits', when specified, must be a character vector with elements 'xx:yy' with xx, yy representing the relative costs and benefits-for example c('1:2', '2:3', '1:1', '4:1').")
+    }
+  }
+
+  #check nfits
+  if(missing(cost.benefits)){
+  stopifnot(is.numeric(n.cost.benefits))
+  }
+
+
+  return(cc)
+}
+