diff --git a/.Rbuildignore b/.Rbuildignore index 15a6e96..c6c52e2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,6 @@ ^README\.Rmd$ ^\.github$ ^.\.sas7bdat$ +^Dockerfile$ +^docker-compose.yml$ +^.dockerignore$ diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..100bedb --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +.vscode +.git \ No newline at end of file diff --git a/.gitignore b/.gitignore index b5867af..bec7be1 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,11 @@ *.sas7bdat vignettes/do_not_commit.Rmd +vignettes/ae_profile.csv +vignettes/scen.csv +vignettes/scen_old.csv +vignettes/scen_days.csv +vignettes/*.html +run_remote.md +.vscode + diff --git a/DESCRIPTION b/DESCRIPTION index 2ac6b4c..41de51a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: simaerep Title: Simulate adverse event reporting in clinical trials with the goal of detecting under-reporting sites -Version: 0.3.1 +Version: 0.3.2 Authors@R: person(given = "Bjoern", family = "Koneswarakantha", @@ -16,9 +16,9 @@ License: MIT + file LICENSE Encoding: UTF-8 LazyData: true Depends: - ggplot2, - dplyr + ggplot2 Imports: + dplyr, lintr, tidyr, magrittr, @@ -28,18 +28,16 @@ Imports: forcats, cowplot, RColorBrewer, - feather, furrr (>= 0.2.1), - future + progressr, + knitr, + tibble Suggests: testthat, devtools, pkgdown, - knitr, - tibble, spelling, - haven, - sqldf + haven Roxygen: list(markdown = TRUE) RoxygenNote: 7.1.1 Language: en-US diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c1ab957 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,4 @@ +FROM rocker/verse:4.1 +COPY . /simaerep +RUN R -e "devtools::install('/simaerep/.', upgrade = 'never', dependencies = TRUE, repos = 'http://cran.us.r-project.org')" +RUN rm /simaerep -r -f \ No newline at end of file diff --git a/NAMESPACE b/NAMESPACE index 6f87f69..0269c92 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,8 +6,10 @@ export(check_df_visit) export(eval_sites) export(eval_sites_deprecated) export(exp_implicit_missing_visits) +export(get_config) export(get_ecd_values) export(get_pat_pool_config) +export(get_portf_perf) export(lint_package) export(pat_aggr) export(pat_pool) @@ -17,30 +19,79 @@ export(plot_sim_examples) export(plot_study) export(plot_visit_med75) export(poiss_test_site_ae_vs_study_ae) +export(prep_for_sim) export(prob_lower_site_ae_vs_study_ae) +export(purrr_bar) +export(sim_after_prep) +export(sim_scenario) export(sim_sites) export(sim_studies) export(sim_test_data_patient) +export(sim_test_data_portfolio) export(sim_test_data_study) +export(sim_ur_scenarios) export(site_aggr) -import(dplyr) -import(furrr) -import(future) +export(with_progress_cnd) import(ggplot2) -import(purrr) -import(tidyr) importFrom(RColorBrewer,brewer.pal) importFrom(cowplot,draw_label) importFrom(cowplot,get_legend) importFrom(cowplot,ggdraw) importFrom(cowplot,plot_grid) -importFrom(feather,read_feather) -importFrom(feather,write_feather) +importFrom(dplyr,arrange) +importFrom(dplyr,between) +importFrom(dplyr,bind_cols) +importFrom(dplyr,bind_rows) +importFrom(dplyr,case_when) +importFrom(dplyr,dense_rank) +importFrom(dplyr,desc) +importFrom(dplyr,distinct) +importFrom(dplyr,everything) +importFrom(dplyr,filter) +importFrom(dplyr,group_by) +importFrom(dplyr,group_by_at) +importFrom(dplyr,inner_join) +importFrom(dplyr,is_grouped_df) +importFrom(dplyr,lag) +importFrom(dplyr,left_join) +importFrom(dplyr,mutate) +importFrom(dplyr,mutate_all) +importFrom(dplyr,mutate_at) +importFrom(dplyr,n) +importFrom(dplyr,n_distinct) +importFrom(dplyr,one_of) +importFrom(dplyr,pull) +importFrom(dplyr,rename) +importFrom(dplyr,right_join) +importFrom(dplyr,row_number) +importFrom(dplyr,sample_n) +importFrom(dplyr,select) +importFrom(dplyr,summarise) +importFrom(dplyr,summarise_all) +importFrom(dplyr,summarise_at) +importFrom(dplyr,ungroup) +importFrom(dplyr,vars) importFrom(forcats,fct_relevel) +importFrom(furrr,furrr_options) +importFrom(furrr,future_map) +importFrom(furrr,future_pmap) +importFrom(knitr,kable) importFrom(lintr,lint_package) importFrom(magrittr,"%>%") +importFrom(progressr,progressor) +importFrom(progressr,with_progress) +importFrom(purrr,map) +importFrom(purrr,map2) +importFrom(purrr,map2_dbl) +importFrom(purrr,map_chr) +importFrom(purrr,map_dbl) +importFrom(purrr,map_int) +importFrom(purrr,pmap) +importFrom(purrr,pmap_dbl) +importFrom(purrr,possibly) importFrom(purrr,safely) importFrom(rlang,":=") +importFrom(rlang,.data) importFrom(stats,ecdf) importFrom(stats,median) importFrom(stats,p.adjust) @@ -49,6 +100,12 @@ importFrom(stats,quantile) importFrom(stats,rnorm) importFrom(stats,rpois) importFrom(stats,runif) +importFrom(stats,sd) importFrom(stringr,str_count) importFrom(stringr,str_pad) +importFrom(tibble,tibble) +importFrom(tidyr,fill) +importFrom(tidyr,nest) +importFrom(tidyr,tibble) +importFrom(tidyr,unnest) importFrom(utils,head) diff --git a/NEWS.md b/NEWS.md index d26446a..463066f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +# simaerep 0.3.2 +- added portfolio performance assessment + # simaerep 0.3.1 - changed MIT License holder from openpharma to F. Hoffmann-La Roche Ltd and simaerep authors @@ -8,4 +11,4 @@ # simaerep 0.2.0 - use Benjamin Hochberg procedure for alpha error correction - fix warnings around parallel processing -- improved SAS files vignette \ No newline at end of file +- improved SAS files vignette diff --git a/R/0_imports.R b/R/0_imports.R new file mode 100644 index 0000000..96ec556 --- /dev/null +++ b/R/0_imports.R @@ -0,0 +1,32 @@ +# satisfy lintr +# lintr falsely flags possibly_ecdf as unused variable +# using rlang::.data here causes error with furrr in sim_test_data_portfolio +# patnum, n_ae, visit + +if (getRversion() >= "2.15.1") { + utils::globalVariables(c("possibly_ecdf", "patnum", "n_ae", "visit")) +} + +#' @importFrom progressr progressor +#' @importFrom cowplot get_legend plot_grid ggdraw draw_label plot_grid plot_grid +#' @importFrom cowplot ggdraw draw_label +#' @importFrom forcats fct_relevel +#' @importFrom RColorBrewer brewer.pal +#' @importFrom utils head +#' @importFrom stats p.adjust quantile median runif poisson.test ecdf rnorm rpois sd +#' @importFrom purrr safely possibly pmap map map2 pmap_dbl map2_dbl map_dbl +#' @importFrom purrr map_int map_chr +#' @importFrom furrr future_map future_pmap furrr_options +#' @importFrom progressr with_progress +#' @importFrom stringr str_count str_pad +#' @importFrom rlang := .data +#' @importFrom dplyr select mutate filter summarise group_by summarise_all summarise_at +#' @importFrom dplyr mutate_all mutate_at ungroup vars bind_cols bind_rows pull +#' @importFrom dplyr n_distinct distinct arrange right_join left_join inner_join +#' @importFrom dplyr rename sample_n between row_number dense_rank desc case_when +#' @importFrom dplyr group_by_at n is_grouped_df everything one_of lag +#' @importFrom tidyr tibble unnest nest fill +#' @importFrom lintr lint_package +#' @importFrom knitr kable +#' @importFrom tibble tibble +NULL diff --git a/R/lint.R b/R/lint.R index 4be7e85..196c51f 100644 --- a/R/lint.R +++ b/R/lint.R @@ -11,7 +11,6 @@ #' @seealso \code{\link[lintr]{lint_package}} #' @rdname lint_package #' @export -#' @importFrom lintr lint_package lint_package <- function(path = ".", ...) { lint_results <- lintr::lint_package(path = path, diff --git a/R/progress.R b/R/progress.R new file mode 100644 index 0000000..36b2c55 --- /dev/null +++ b/R/progress.R @@ -0,0 +1,164 @@ +#' @title execute a purrr or furrr function with a progress +#' bar +#' @description call still needs to be wrapped in with_progress() +#' @param .purrr purrr or furrr function +#' @param ... iterable arguments passed to .purrr +#' @param .f function to be executed over iterables +#' @param .f_args list of arguments passed to .f, Default: list() +#' @param .purrr_args list of arguments passed to .purrr, Default: list() +#' @param .steps integer number of iterations +#' @param .slow logical slows down execution, Default: FALSE +#' @param .progress logical, show progress bar, Default: TRUE +#' @examples +#' # purrr::map +#' progressr::with_progress( +#' purrr_bar(rep(0.25, 5), .purrr = purrr::map, .f = Sys.sleep, .steps = 5) +#' ) +#' +#'\dontrun{ +#' # purrr::walk +#' progressr::with_progress( +#' purrr_bar(rep(0.25, 5), .purrr = purrr::walk,.f = Sys.sleep, .steps = 5) +#' ) +#' +#' # progress bar off +#' progressr::with_progress( +#' purrr_bar( +#' rep(0.25, 5), .purrr = purrr::walk,.f = Sys.sleep, .steps = 5, .progress = FALSE +#' ) +#' ) +#' +#' # purrr::map2 +#' progressr::with_progress( +#' purrr_bar( +#' rep(1, 5), rep(2, 5), +#' .purrr = purrr::map2, +#' .f = `+`, +#' .steps = 5, +#' .slow = TRUE +#' ) +#') +#' +#' # purrr::pmap +#' progressr::with_progress( +#' purrr_bar( +#' list(rep(1, 5), rep(2, 5)), +#' .purrr = purrr::pmap, +#' .f = `+`, +#' .steps = 5, +#' .slow = TRUE +#' ) +#') +#' +#' # define function within purr_bar() call +#' progressr::with_progress( +#' purrr_bar( +#' list(rep(1, 5), rep(2, 5)), +#' .purrr = purrr::pmap, +#' .f = function(x, y) { +#' paste0(x, y) +#' }, +#' .steps = 5, +#' .slow = TRUE +#' ) +#') +#' +#' # with mutate +#' progressr::with_progress( +#' tibble(x = rep(0.25, 5)) %>% +#' mutate(x = purrr_bar(x, .purrr = purrr::map, .f = Sys.sleep, .steps = 5)) +#' ) +#'} +#' @rdname purrr_bar +#' @export +purrr_bar <- function(..., + .purrr, + .f, + .f_args = list(), + .purrr_args = list(), + .steps, + .slow = FALSE, + .progress = TRUE) { + + stopifnot("all .f_args list items must be named" = all(names(.f_args) != "")) + stopifnot("all .purrr_args list items must be named" = all(names(.purrr_args) != "")) + + if (.progress) { + p <- progressr::progressor(steps = .steps) + } else { + p <- NULL + } + + f <- function(..., .f_args, .p = p) { + if (.progress) p() + if (.slow) Sys.sleep(0.25) + .f_args <- c(list(...), .f_args) + do.call(.f, .f_args) + } + + do.call( + .purrr, + c( + list(...), + list(f, .f_args = .f_args, .p = p), + .purrr_args + ) + ) +} + + +#'@title conditional \code{\link[progressr]{with_progress}} +#'@description internal function. Use instead of +#' \code{\link[progressr]{with_progress}} within custom functions with progress +#' bars. +#'@param ex expression +#'@param progress logical, Default: TRUE +#'@details DETAILS +#' @examples +#' if (interactive()) { +#' +#' with_progress_cnd( +#' purrr_bar(rep(0.25, 5), .purrr = purrr::map, .f = Sys.sleep, .steps = 5), +#' progress = TRUE +#' ) +#' +#' with_progress_cnd( +#' purrr_bar(rep(0.25, 5), .purrr = purrr::map, .f = Sys.sleep, .steps = 5), +#' progress = FALSE +#' ) +#' +#' # wrap a function with progress bar with another call with progress bar +#' +#' f1 <- function(x, progress = TRUE) { +#' with_progress_cnd( +#' purrr_bar(x, .purrr = purrr::walk, .f = Sys.sleep, .steps = length(x), .progress = progress), +#' progress = progress +#' ) +#' } +#' +#' # inner progress bar blocks outer progress bar +#' progressr::with_progress( +#' purrr_bar( +#' rep(rep(1, 3),3), .purrr = purrr::walk, .f = f1, .steps = 3, +#' .f_args = list(progress = TRUE) +#' ) +#' ) +#' +#' # inner progress bar turned off +#' progressr::with_progress( +#' purrr_bar( +#' rep(list(rep(0.25, 3)), 5), .purrr = purrr::walk, .f = f1, .steps = 5, +#' .f_args = list(progress = FALSE) +#' ) +#' ) +#'} +#'@seealso \code{\link[progressr]{with_progress}} +#'@rdname with_progress_cnd +#'@export +with_progress_cnd <- function(ex, progress = TRUE) { + if (progress) { + progressr::with_progress(eval(ex)) + } else { + eval(ex) + } +} diff --git a/R/simaerep.R b/R/simaerep.R index 583b267..93fc963 100644 --- a/R/simaerep.R +++ b/R/simaerep.R @@ -22,32 +22,36 @@ if (getRversion() >= "2.15.1") { #' df_visit$study_id <- "A" #' #' df_visit_filt <- df_visit %>% -#' filter(visit != 3) +#' dplyr::filter(visit != 3) #' #' df_visit_corr <- check_df_visit(df_visit_filt) #' 3 %in% df_visit_corr$visit #' nrow(df_visit_corr) == nrow(df_visit) #' -#' df_visit_corr <- check_df_visit(bind_rows(df_visit, df_visit)) +#' df_visit_corr <- check_df_visit(dplyr::bind_rows(df_visit, df_visit)) #' nrow(df_visit_corr) == nrow(df_visit) #' #' @rdname check_df_visit #' @export check_df_visit <- function(df_visit) { + + df_visit <- ungroup(df_visit) + stopifnot( all(c("study_id", "site_number", "patnum", "n_ae", "visit") %in% names(df_visit)) ) cols_na <- df_visit %>% summarise_at( - vars( - .data$study_id, - .data$site_number, - .data$patnum, - .data$n_ae, - .data$visit), - ~ any(is.na(.)) - ) %>% + vars( + .data$study_id, + .data$site_number, + .data$patnum, + .data$n_ae, + .data$visit + ), + anyNA + ) %>% unlist() if (any(cols_na)) { @@ -67,6 +71,15 @@ check_df_visit <- function(df_visit) { all() ) + df_visit %>% + group_by(.data$study_id, .data$patnum) %>% + summarise(n_sites = n_distinct(.data$site_number), .groups = "drop") %>% + mutate(check = .data$n_sites == 1) %>% + pull(.data$check) %>% + unlist() %>% + all() %>% + stopifnot("patient ids must be site exclusive" = .) + df_visit <- exp_implicit_missing_visits(df_visit) df_visit <- aggr_duplicated_visits(df_visit) @@ -324,7 +337,6 @@ get_visit_med75 <- function(df_pat, #' @seealso \code{\link[simaerep]{site_aggr}}, #' \code{\link[simaerep]{sim_sites}}, #' \code{\link[stats]{p.adjust}} -#' @importFrom stats p.adjust #' @export eval_sites <- function(df_sim_sites, method = "BH", @@ -356,7 +368,7 @@ eval_sites <- function(df_sim_sites, if ("prob_low" %in% names(df_out)) { - if (anyNA(df_out$pval)) { + if (anyNA(df_out$prob_low)) { warning("prob_lower column contains NA") } @@ -616,11 +628,10 @@ pat_pool <- function(df_visit, df_site) { #' @seealso \code{\link[purrr]{safely}} #' @rdname prob_lower_site_ae_vs_study_ae #' @export -#' @importFrom purrr safely prob_lower_site_ae_vs_study_ae <- function(site_ae, study_ae, r = 1000, parallel = FALSE) { # if there is only one site - if (is_null(study_ae)) { + if (is.null(study_ae)) { prob_lower <- 1 return(prob_lower) } @@ -637,8 +648,6 @@ prob_lower_site_ae_vs_study_ae <- function(site_ae, study_ae, r = 1000, parallel # set-up multiprocessing # multiprocessing currently not used by sim_sites() if (parallel) { - requireNamespace("furrr") - suppressWarnings(future::plan(multiprocess)) .f_map_int <- function(...) { furrr::future_map_int(..., .options = furrr_options(seed = TRUE)) } @@ -673,13 +682,17 @@ prob_lower_site_ae_vs_study_ae <- function(site_ae, study_ae, r = 1000, parallel #' @param r integer, denotes number of simulations, default = 1000 #' @param poisson_test logical, calculates poisson.test pvalue #' @param prob_lower logical, calculates probability for getting a lower value +#' @param progress logical, display progress bar, Default = TRUE #' @return dataframe with the following columns: #' \describe{ #' \item{**study_id**}{study identification} #' \item{**site_number**}{site identification} +#' \item{**n_pat**}{number of patients at site} #' \item{**visit_med75**}{median(max(visit)) * 0.75} +#' \item{**n_pat_with_med75**}{number of patients at site with med75} #' \item{**mean_ae_site_med75**}{mean AE at visit_med75 site level} #' \item{**mean_ae_study_med75**}{mean AE at visit_med75 study level} +#' \item{**n_pat_with_med75_study**}{number of patients at study with med75 excl. site} #' \item{**pval**}{p-value as returned by \code{\link[stats]{poisson.test}}} #' \item{**prob_low**}{bootstrapped probability for having mean_ae_site_med75 or lower} #' } @@ -705,18 +718,55 @@ prob_lower_site_ae_vs_study_ae <- function(site_ae, study_ae, r = 1000, parallel #' \code{\link[simaerep]{pat_pool}}, #' \code{\link[simaerep]{prob_lower_site_ae_vs_study_ae}}, #' \code{\link[simaerep]{poiss_test_site_ae_vs_study_ae}}, +#' \code{\link[simaerep]{sim_sites}}, +#' \code{\link[simaerep]{prep_for_sim}} #' @export -#' @import dplyr -#' @import purrr -#' @import tidyr sim_sites <- function(df_site, df_visit, r = 1000, poisson_test = TRUE, - prob_lower = TRUE) { + prob_lower = TRUE, + progress = TRUE) { df_visit <- check_df_visit(df_visit) + df_sim_prep <- prep_for_sim(df_site, df_visit) + + df_sim <- sim_after_prep(df_sim_prep, + r = r, + poisson_test = poisson_test, + prob_lower = prob_lower, + progress = progress) + + return(df_sim) +} + +#'@title prepare data for simulation +#'@description Internal function called by \code{\link[simaerep]{sim_sites}}. +#' Collect AEs per patient at visit_med75 for site and study as a vector of +#' integers. +#'@param df_visit dataframe, created by \code{\link[simaerep]{sim_sites}} +#'@param df_site dataframe created by \code{\link[simaerep]{site_aggr}} +#'@return dataframe +#' @examples +#' df_visit <- sim_test_data_study( +#' n_pat = 100, +#' n_sites = 5, +#' frac_site_with_ur = 0.4, +#' ur_rate = 0.2 +#') +#' +#' df_visit$study_id <- "A" +#' +#' df_site <- site_aggr(df_visit) +#' +#' df_prep <- prep_for_sim(df_site, df_visit) +#' df_prep +#'@rdname prep_for_sim +#' @seealso \code{\link[simaerep]{sim_sites}}, \code{\link[simaerep]{sim_after_prep}} +#'@export +prep_for_sim <- function(df_site, df_visit) { + df_pat_pool <- pat_pool(df_visit, df_site) df_sim_prep <- df_visit %>% @@ -727,20 +777,67 @@ sim_sites <- function(df_site, .data$n_pat, .data$n_pat_with_med75, .data$visit_med75) %>% - summarise(patients = list(unique(.data$patnum))) + summarise(patients = list(unique(.data$patnum)), .groups = "drop") df_sim_prep <- df_sim_prep %>% left_join(df_pat_pool, "study_id") %>% mutate( - pat_pool = map2(.data$pat_pool, .data$visit_med75, function(x, y) filter(x, .data$visit == y)), - n_ae_site = map2(.data$pat_pool, .data$patients, function(x, y) filter(x, .data$patnum %in% y)), - n_ae_study = map2(.data$pat_pool, .data$patients, function(x, y) filter(x, ! .data$patnum %in% y)), + pat_pool = map2( + .data$pat_pool, .data$visit_med75, + function(x, y) filter(x, .data$visit == y) + ), + n_ae_site = map2( + .data$pat_pool, .data$patients, + function(x, y) filter(x, .data$patnum %in% y) + ), + n_ae_study = map2( + .data$pat_pool, .data$patients, + function(x, y) filter(x, ! .data$patnum %in% y) + ), n_ae_site = map(.data$n_ae_site, "n_ae"), n_ae_study = map(.data$n_ae_study, "n_ae") ) %>% select(- .data$patients, - .data$pat_pool) + return(df_sim_prep) + +} + +#'@title start simulation after preparation +#'@description Internal function called by \code{\link[simaerep]{sim_sites}} +#' after \code{\link[simaerep]{prep_for_sim}} +#'@param df_sim_prep dataframe as returned by +#' \code{\link[simaerep]{prep_for_sim}} +#'@inheritParams sim_sites +#'@return dataframe +#' @examples +#' df_visit <- sim_test_data_study( +#' n_pat = 100, +#' n_sites = 5, +#' frac_site_with_ur = 0.4, +#' ur_rate = 0.2 +#') +#' +#' df_visit$study_id <- "A" +#' +#' df_site <- site_aggr(df_visit) +#' +#' df_prep <- prep_for_sim(df_site, df_visit) +#' +#' df_sim <- sim_after_prep(df_prep) +#' +#' df_sim +#'@rdname sim_after_prep +#'@seealso \code{\link[simaerep]{sim_sites}}, +#' \code{\link[simaerep]{prep_for_sim}} +#'@export +sim_after_prep <- function(df_sim_prep, + r = 1000, + poisson_test = FALSE, + prob_lower = TRUE, + progress = FALSE) { + df_sim <- df_sim_prep if (poisson_test) { @@ -750,17 +847,29 @@ sim_sites <- function(df_site, } if (prob_lower) { - df_sim <- df_sim %>% - mutate(prob_low = map2_dbl(.data$n_ae_site, .data$n_ae_study, - prob_lower_site_ae_vs_study_ae, - r = r - )) + with_progress_cnd( + df_sim <- df_sim %>% + mutate( + prob_low = purrr_bar( + .data$n_ae_site, .data$n_ae_study, + .purrr = map2_dbl, + .f = prob_lower_site_ae_vs_study_ae, + .f_args = list(r = r), + .steps = nrow(df_sim), + .progress = progress + ) + ), + progress = progress + ) } - df_sim %>% - mutate(mean_ae_site_med75 = map_dbl(n_ae_site, mean), - mean_ae_study_med75 = map_dbl(n_ae_study, mean)) %>% - select(- n_ae_site, - n_ae_study) %>% + # clean + + df_sim <- df_sim %>% + mutate(mean_ae_site_med75 = map_dbl(.data$n_ae_site, mean), + mean_ae_study_med75 = map_dbl(.data$n_ae_study, mean), + n_pat_with_med75_study = map_int(.data$n_ae_study, length)) %>% + select(- .data$n_ae_site, - .data$n_ae_study) %>% select(.data$study_id, .data$site_number, .data$n_pat, @@ -768,9 +877,12 @@ sim_sites <- function(df_site, .data$visit_med75, .data$mean_ae_site_med75, .data$mean_ae_study_med75, - everything()) %>% - ungroup() %>% - return() + .data$n_pat_with_med75_study, + dplyr::everything()) %>% + ungroup() + + return(df_sim) + } #' @title configure study patient pool by site parameters @@ -793,7 +905,7 @@ sim_sites <- function(df_site, #' #' df_visit2$study_id <- "B" #' -#' df_visit <- bind_rows(df_visit1, df_visit2) +#' df_visit <- dplyr::bind_rows(df_visit1, df_visit2) #' #' df_site <- site_aggr(df_visit) #' @@ -869,7 +981,7 @@ get_pat_pool_config <- function(df_visit, df_site, min_n_pat_with_med75 = 1) { #' #' df_visit2$study_id <- "B" #' -#' df_visit <- bind_rows(df_visit1, df_visit2) +#' df_visit <- dplyr::bind_rows(df_visit1, df_visit2) #' #' df_site <- site_aggr(df_visit) #' @@ -885,14 +997,6 @@ get_pat_pool_config <- function(df_visit, df_site, min_n_pat_with_med75 = 1) { #' mean_ae at visit_med75 #' @rdname sim_studies #' @export -#' @importFrom feather read_feather write_feather -#' @import dplyr -#' @import purrr -#' @import tidyr -#' @import furrr -#' @import future -#' @importFrom stringr str_count str_pad -#' @importFrom rlang := sim_studies <- function(df_visit, df_site, r = 100, @@ -913,7 +1017,7 @@ sim_studies <- function(df_visit, min_n_pat_with_med75 = min_n_pat_with_med75) # filter studies -------------------------------------------- - if (!is_null(studies)) { + if (!is.null(studies)) { if (!all(studies %in% unique(df_visit$study_id))) { stop("not all passed studies can be found in input data") } @@ -925,8 +1029,6 @@ sim_studies <- function(df_visit, # set-up multiprocessing ------------------------------------- if (parallel) { - requireNamespace("furrr") - suppressWarnings(future::plan(multiprocess)) .f_map <- function(...) { furrr::future_map( ..., @@ -1027,7 +1129,6 @@ sim_studies <- function(df_visit, #' knitr::kable(digits = 2) #'@rdname site_aggr #'@export -#'@importFrom stats quantile site_aggr <- function(df_visit, method = "med75_adj", min_pat_pool = 0.2) { @@ -1091,15 +1192,13 @@ site_aggr <- function(df_visit, #' @seealso [sim_sites()][sim_sites()] #' @rdname poiss_test_site_ae_vs_study_ae #' @export -#' @importFrom purrr safely -#' @importFrom stats median runif poisson.test poiss_test_site_ae_vs_study_ae <- function(site_ae, - study_ae, - visit_med75) { + study_ae, + visit_med75) { # if there is only one site - if (is_null(study_ae)) { + if (is.null(study_ae)) { pval <- 1 return(pval) } @@ -1130,99 +1229,9 @@ poiss_test_site_ae_vs_study_ae <- function(site_ae, pval <- poisson_test$result$p.value # this controls for cases when poisson.test fails for some reason - if (is_null(pval)) { + if (is.null(pval)) { pval <- 1 } return(pval) } - - -#' @title simulate study test data -#' @description evenly distributes a number of given patients across a number of -#' given sites. Then simulates ae development of each patient reducing the -#' number of reported AEs for patients distributed to AE-under-reporting sites. -#' @param n_pat integer, number of patients, Default: 1000 -#' @param n_sites integer, number of sites, Default: 20 -#' @param frac_site_with_ur fraction of AE under-reporting sites, Default: 0 -#' @param ur_rate AE under-reporting rate, will lower mean ae per visit used to -#' simulate patients at sites flagged as AE-under-reporting., Default: 0 -#' @param max_visit_mean mean of the maximum number of visits of each patient, -#' Default: 20 -#' @param max_visit_sd standard deviation of maximum number of visits of each -#' patient, Default: 4 -#' @param ae_per_visit_mean mean ae per visit per patient, Default: 0.5 -#' @return tibble with columns site_number, patnum, is_ur, max_visit_mean, -#' max_visit_sd, ae_per_visit_mean, visit, n_ae -#' @details maximum visit number will be sampled from normal distribution with -#' characteristics derived from max_visit_mean and max_visit_sd, while the ae -#' per visit will be sampled from a poisson distribution described by -#' ae_per_visit_mean. -#' @examples -#' set.seed(1) -#' df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5) -#' df_visit[which(df_visit$patnum == "P000001"),] -#' df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5, -#' frac_site_with_ur = 0.2, ur_rate = 0.5) -#' df_visit[which(df_visit$patnum == "P000001"),] -#' @rdname sim_test_data_study -#' @export -sim_test_data_study <- function(n_pat = 1000, - n_sites = 20, - frac_site_with_ur = 0, - ur_rate = 0, - max_visit_mean = 20, - max_visit_sd = 4, - ae_per_visit_mean = 0.5 - ) { - tibble(patnum = seq(1, n_pat)) %>% - mutate(patnum = str_pad(patnum, width = 6, side = "left", pad = "0"), - patnum = paste0("P", patnum), - site_number = seq(1, n_pat), - site_number = if (n_sites > 1) cut(.data$site_number, n_sites, labels = FALSE) else 1, - is_ur = ifelse(.data$site_number <= (max(.data$site_number) * frac_site_with_ur), TRUE, FALSE), - site_number = str_pad(.data$site_number, width = 4, side = "left", pad = "0"), - site_number = paste0("S", .data$site_number), - max_visit_mean = max_visit_mean, - max_visit_sd = max_visit_sd, - ae_per_visit_mean = ifelse(.data$is_ur, ae_per_visit_mean * (1 - ur_rate), ae_per_visit_mean), - aes = pmap(list(vm = max_visit_mean, - vs = max_visit_sd, - am = ae_per_visit_mean), - function(vm, vs, am) sim_test_data_patient( - .f_sample_max_visit = function() rnorm(1, mean = vm, sd = vs), - .f_sample_ae_per_visit = function(max_visit) rpois(max_visit, am))), - aes = map(aes, ~ tibble(visit = seq(1, length(.)), n_ae = .))) %>% - unnest(aes) - -} - -#' @title simulate patient ae reporting test data -#' @description helper function for [sim_test_data_study()][sim_test_data_study()] -#' @param .f_sample_ae_per_visit function used to sample the aes for each visit, -#' Default: function(x) rpois(x, 0.5) -#' @param .f_sample_max_visit function used to sample the maximum number of aes, -#' Default: function() rnorm(1, mean = 20, sd = 4) -#' @return vector containing cumulative aes -#' @details "" -#' @examples -#' replicate(5, sim_test_data_patient()) -#' replicate(5, sim_test_data_patient( -#' .f_sample_ae_per_visit = function(x) rpois(x, 1.2)) -#' ) -#' replicate(5, sim_test_data_patient( -#' .f_sample_max_visit = function() rnorm(1, mean = 5, sd = 5)) -#' ) -#' @importFrom stats ecdf rnorm rpois -#' @rdname sim_test_data_patient -#' @export -sim_test_data_patient <- function(.f_sample_max_visit = function() rnorm(1, mean = 20, sd = 4), - .f_sample_ae_per_visit = function(max_visit) rpois(max_visit, 0.5)) { - - max_visit <- as.integer(.f_sample_max_visit()) - max_visit <- ifelse(max_visit < 1, 1, max_visit) - aes <- .f_sample_ae_per_visit(max_visit) - cum_aes <- cumsum(aes) - - return(cum_aes) -} diff --git a/R/simaerep_plot.R b/R/simaerep_plot.R index 5121bb6..42b1042 100644 --- a/R/simaerep_plot.R +++ b/R/simaerep_plot.R @@ -22,15 +22,14 @@ if (getRversion() >= "2.15.1") { #' @param color_low character, hex color value, Default: '#25A69A' #' @param size_dots integer, Default: 10 #' @return ggplot object -#' @importFrom magrittr %>% #' @details ' ' #' @examples -#' study <- tibble( +#' study <- tibble::tibble( #' site = LETTERS[1:3], #' patients = c(list(seq(1, 50, 1)), list(seq(1, 40, 1)), list(seq(1, 10, 1))) #' ) %>% #' tidyr::unnest(patients) %>% -#' mutate(n_ae = as.integer(runif(min = 0, max = 10, n = nrow(.)))) +#' dplyr::mutate(n_ae = as.integer(runif(min = 0, max = 10, n = nrow(.)))) #' #' plot_dots(study) #' @rdname plot_dots @@ -73,14 +72,14 @@ plot_dots <- function(df, df_label <- df %>% group_by_at(vars(one_of(col_group))) %>% - summarize( + summarise( x = max(.data$x), y = max(.data$y), mean_ae = mean(.data$n_ae), label = paste("\u00d8:", round(mean(.data$n_ae), 1)) ) - if (!is_null(thresh)) { + if (!is.null(thresh)) { df_label <- df_label %>% mutate(color = ifelse(.data$mean_ae >= thresh, color_high, color_low)) } else { @@ -155,7 +154,6 @@ plot_dots <- function(df, #' \code{\link[cowplot]{get_legend}},\code{\link[cowplot]{plot_grid}} #' @rdname plot_sim_example #' @export -#' @importFrom cowplot get_legend plot_grid plot_sim_example <- function(substract_ae_per_pat = 0, size_dots = 10, size_raster_label = 12, @@ -306,7 +304,6 @@ plot_sim_example <- function(substract_ae_per_pat = 0, #' \code{\link[cowplot]{ggdraw}},\code{\link[cowplot]{draw_label}},\code{\link[cowplot]{plot_grid}} #' @rdname plot_sim_examples #' @export -#' @importFrom cowplot ggdraw draw_label plot_grid plot_sim_examples <- function(substract_ae_per_pat = c(0, 1, 3), ...) { make_title <- function(x, fontface = "bold", size = 14, angle = 0) { @@ -424,10 +421,6 @@ plot_sim_examples <- function(substract_ae_per_pat = c(0, 1, 3), ...) { #' plot_study(df_visit, df_site, df_eval, study = "A") #' @rdname plot_study #' @export -#' @importFrom cowplot plot_grid ggdraw draw_label -#' @importFrom forcats fct_relevel -#' @importFrom RColorBrewer brewer.pal -#' @importFrom utils head #' @import ggplot2 plot_study <- function(df_visit, df_site, @@ -442,7 +435,7 @@ plot_study <- function(df_visit, # alert level ------------------------------------------------------------- - if (is_null(df_al)) { + if (is.null(df_al)) { df_visit <- df_visit %>% mutate(alert_level_site = NA, alert_level_study = NA) @@ -821,7 +814,7 @@ plot_visit_med75 <- function(df_visit, study_possible_max_visit <- df_pat %>% filter(.data$study_id == study_id_str) %>% - summarize(study_possible_max_visit = quantile(.data$max_visit_per_pat, 1 - min_pat_pool)) %>% + summarise(study_possible_max_visit = quantile(.data$max_visit_per_pat, 1 - min_pat_pool)) %>% pull(.data$study_possible_max_visit) %>% round(0) diff --git a/R/simulate_test_data.R b/R/simulate_test_data.R new file mode 100644 index 0000000..2f0e88d --- /dev/null +++ b/R/simulate_test_data.R @@ -0,0 +1,772 @@ +#' @title simulate study test data +#' @description evenly distributes a number of given patients across a number of +#' given sites. Then simulates ae development of each patient reducing the +#' number of reported AEs for patients distributed to AE-under-reporting sites. +#' @param n_pat integer, number of patients, Default: 1000 +#' @param n_sites integer, number of sites, Default: 20 +#' @param frac_site_with_ur fraction of AE under-reporting sites, Default: 0 +#' @param ur_rate AE under-reporting rate, will lower mean ae per visit used to +#' simulate patients at sites flagged as AE-under-reporting., Default: 0 +#' @param max_visit_mean mean of the maximum number of visits of each patient, +#' Default: 20 +#' @param max_visit_sd standard deviation of maximum number of visits of each +#' patient, Default: 4 +#' @param ae_per_visit_mean mean ae per visit per patient, Default: 0.5 +#' @return tibble with columns site_number, patnum, is_ur, max_visit_mean, +#' max_visit_sd, ae_per_visit_mean, visit, n_ae +#' @details maximum visit number will be sampled from normal distribution with +#' characteristics derived from max_visit_mean and max_visit_sd, while the ae +#' per visit will be sampled from a poisson distribution described by +#' ae_per_visit_mean. +#' @examples +#' set.seed(1) +#' df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5) +#' df_visit[which(df_visit$patnum == "P000001"),] +#' df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5, +#' frac_site_with_ur = 0.2, ur_rate = 0.5) +#' df_visit[which(df_visit$patnum == "P000001"),] +#' @rdname sim_test_data_study +#' @export +sim_test_data_study <- function(n_pat = 1000, + n_sites = 20, + frac_site_with_ur = 0, + ur_rate = 0, + max_visit_mean = 20, + max_visit_sd = 4, + ae_per_visit_mean = 0.5 +) { + tibble(patnum = seq(1, n_pat)) %>% + mutate(patnum = str_pad(patnum, width = 6, side = "left", pad = "0"), + patnum = paste0("P", patnum), + site_number = seq(1, n_pat), + site_number = if (n_sites > 1) cut(.data$site_number, n_sites, labels = FALSE) else 1, + is_ur = ifelse(.data$site_number <= (max(.data$site_number) * frac_site_with_ur), TRUE, FALSE), + site_number = str_pad(.data$site_number, width = 4, side = "left", pad = "0"), + site_number = paste0("S", .data$site_number), + max_visit_mean = max_visit_mean, + max_visit_sd = max_visit_sd, + ae_per_visit_mean = ifelse(.data$is_ur, ae_per_visit_mean * (1 - ur_rate), ae_per_visit_mean), + aes = pmap(list(vm = max_visit_mean, + vs = max_visit_sd, + am = ae_per_visit_mean), + function(vm, vs, am) sim_test_data_patient( + .f_sample_max_visit = function() rnorm(1, mean = vm, sd = vs), + .f_sample_ae_per_visit = function(max_visit) rpois(max_visit, am))), + aes = map(aes, ~ tibble(visit = seq(1, length(.)), n_ae = .))) %>% + unnest(aes) + +} + +#' @title simulate patient ae reporting test data +#' @description helper function for [sim_test_data_study()][sim_test_data_study()] +#' @param .f_sample_ae_per_visit function used to sample the aes for each visit, +#' Default: function(x) rpois(x, 0.5) +#' @param .f_sample_max_visit function used to sample the maximum number of aes, +#' Default: function() rnorm(1, mean = 20, sd = 4) +#' @return vector containing cumulative aes +#' @details "" +#' @examples +#' replicate(5, sim_test_data_patient()) +#' replicate(5, sim_test_data_patient( +#' .f_sample_ae_per_visit = function(x) rpois(x, 1.2)) +#' ) +#' replicate(5, sim_test_data_patient( +#' .f_sample_max_visit = function() rnorm(1, mean = 5, sd = 5)) +#' ) +#' @rdname sim_test_data_patient +#' @export +sim_test_data_patient <- function(.f_sample_max_visit = function() rnorm(1, mean = 20, sd = 4), + .f_sample_ae_per_visit = function(max_visit) rpois(max_visit, 0.5)) { + + max_visit <- as.integer(.f_sample_max_visit()) + max_visit <- ifelse(max_visit < 1, 1, max_visit) + aes <- .f_sample_ae_per_visit(max_visit) + cum_aes <- cumsum(aes) + + return(cum_aes) +} + +#' @title simulate single scenario +#' @description internal function called by simulate_scenarios() +#' @param n_ae_site integer vector +#' @param n_ae_study integer vector +#' @param frac_pat_with_ur double +#' @param ur_rate double +#' @return list +#' @examples +#' sim_scenario(c(5,5,5,5), c(8,8,8,8), 0.2, 0.5) +#' sim_scenario(c(5,5,5,5), c(8,8,8,8), 0.75, 0.5) +#' sim_scenario(c(5,5,5,5), c(8,8,8,8), 1, 0.5) +#' sim_scenario(c(5,5,5,5), c(8,8,8,8), 1, 1) +#' sim_scenario(c(5,5,5,5), c(8,8,8,8), 0, 0.5) +#' sim_scenario(c(5,5,5,5), c(8,8,8,8), 2, 0.5) +#' @rdname sim_scenario +#' @export +sim_scenario <- function(n_ae_site, n_ae_study, frac_pat_with_ur, ur_rate) { + + if (frac_pat_with_ur == 0 | ur_rate == 0) { + return(list(n_ae_site = n_ae_site, n_ae_study = n_ae_study)) + } + + if (frac_pat_with_ur > 1) frac_pat_with_ur <- 1 + + n_pat_site <- length(n_ae_site) + n_pat_study <- length(n_ae_study) + n_pat_tot <- n_pat_site + n_pat_study + n_pat_ur <- round(n_pat_tot * frac_pat_with_ur, 0) + + max_ix_site <- min(c(n_pat_ur, n_pat_site)) + + n_ae_site[1:max_ix_site] <- n_ae_site[1:max_ix_site] * (1 - ur_rate) + + if (n_pat_ur > n_pat_site) { + max_ix_study <- n_pat_ur - n_pat_site + + n_ae_study[1:max_ix_study] <- n_ae_study[1:max_ix_study] * (1 - ur_rate) + } + + return(list(n_ae_site = n_ae_site, n_ae_study = n_ae_study)) +} + +#' @title Simulate Under-Reporting Scenarios +#' @description Use with simulated portfolio data to generate under-reporting +#' stats for specified scenarios. +#' @param df_portf dataframe as returned by \code{\link{sim_test_data_portfolio}} +#' @param extra_ur_sites numeric, set maximum number of additional +#' under-reporting sites, see details Default: 3 +#' @param ur_rate numeric vector, set under-reporting rates for scenarios +#' Default: c(0.25, 0.5) +#' @inheritParams sim_sites +#' @param parallel logical, use parallel processing see details, Default: FALSE +#' @param progress logical, show progress bar, Default: TRUE +#' @param site_aggr_args named list of parameters passed to +#' \code{\link{site_aggr}}, Default: list() +#' @param eval_sites_args named list of parameters passed to +#' \code{\link{eval_sites}}, Default: list() +#' @return dataframe with the following columns: +#' \describe{ +#' \item{**study_id**}{study identification} +#' \item{**site_number**}{site identification} +#' \item{**n_pat**}{number of patients at site} +#' \item{**n_pat_with_med75**}{number of patients at site with visit_med75} +#' \item{**visit_med75**}{median(max(visit)) * 0.75} +#' \item{**mean_ae_site_med75**}{mean AE at visit_med75 site level} +#' \item{**mean_ae_study_med75**}{mean AE at visit_med75 study level} +#' \item{**n_pat_with_med75_study**}{number of patients at site with +#' visit_med75 at study excl site} +#' \item{**extra_ur_sites**}{additional sites +#' with under-reporting patients} +#' \item{**frac_pat_with_ur**}{ratio of +#' patients in study that are under-reporting} +#' \item{**ur_rate**}{under-reporting rate} +#' \item{**pval**}{p-value as +#' returned by \code{\link[stats]{poisson.test}}} +#' \item{**prob_low**}{bootstrapped probability for having mean_ae_site_med75 +#' or lower} \item{**pval_adj**}{adjusted p-values} +#' \item{**prob_low_adj**}{adjusted bootstrapped probability for having +#' mean_ae_site_med75 or lower} \item{**pval_prob_ur**}{probability +#' under-reporting as 1 - pval_adj, poisson.test (use as benchmark)} +#' \item{**prob_low_prob_ur**}{probability under-reporting as 1 - +#' prob_low_adj, bootstrapped (use)} +#'} +#' @details The function will apply under-reporting scenarios to each site. +#' Reducing the number of AEs by a given under-reporting (ur_rate) for all +#' patients at the site and add the corresponding under-reporting statistics. +#' Since the under-reporting probability is also affected by the number of +#' other sites that are under-reporting we additionally calculate +#' under-reporting statistics in a scenario where additional under reporting +#' sites are present. For this we use the median number of patients per site +#' at the study to calculate the final number of patients for which we lower +#' the AEs in a given under-reporting scenario. We use the furrr package to +#' implement parallel processing as these simulations can take a long time to +#' run. For this to work we need to specify the plan for how the code should +#' run, e.g. plan(multisession, workers = 18) +#' @examples +#' if (interactive()) { +#' df_visit1 <- sim_test_data_study(n_pat = 100, n_sites = 10, +#' frac_site_with_ur = 0.4, ur_rate = 0.6) +#' +#' df_visit1$study_id <- "A" +#' +#' df_visit2 <- sim_test_data_study(n_pat = 100, n_sites = 10, +#' frac_site_with_ur = 0.2, ur_rate = 0.1) +#' +#' df_visit2$study_id <- "B" +#' +#' df_visit <- dplyr::bind_rows(df_visit1, df_visit2) +#' +#' df_site_max <- df_visit %>% +#' group_by(study_id, site_number, patnum) %>% +#' summarise(max_visit = max(visit), +#' max_ae = max(n_ae), +#' .groups = "drop") +#' +#' df_config <- get_config(df_site_max) +#' +#' df_config +#' +#' df_portf <- sim_test_data_portfolio(df_config) +#' +#' df_portf +#' +#' df_scen <- sim_ur_scenarios(df_portf, +#' extra_ur_sites = 2, +#' ur_rate = c(0.5, 1)) +#' +#' +#' df_scen +#' +#' df_perf <- get_portf_perf(df_scen) +#' +#' df_perf +#' } +#' @seealso +#' \code{\link{sim_test_data_study}} +#' \code{\link{get_config}} +#' \code{\link{sim_test_data_portfolio}} +#' \code{\link{sim_ur_scenarios}} +#' \code{\link{get_portf_perf}} +#' @rdname sim_ur_scenarios +#' @export +sim_ur_scenarios <- function(df_portf, + extra_ur_sites = 3, + ur_rate = c(0.25, 0.5), + r = 1000, + poisson_test = FALSE, + prob_lower = TRUE, + parallel = FALSE, + progress = TRUE, + site_aggr_args = list(), + eval_sites_args = list()) { + # checks + + stopifnot("all site_aggr_args list items must be named" = all(names(site_aggr_args) != "")) + stopifnot("all eval_sites_args list items must be named" = all(names(eval_sites_args) != "")) + stopifnot(is.numeric(extra_ur_sites)) + stopifnot(length(extra_ur_sites) == 1) + extra_ur_sites <- as.integer(extra_ur_sites) + + + if (progress) { + message("aggregating site level") + } + + df_visit <- check_df_visit(df_portf) + + df_site <- do.call(site_aggr, c(list(df_visit = df_visit), site_aggr_args)) + + if (progress) { + message("prepping for simulation") + } + + df_sim_prep <- prep_for_sim(df_site = df_site, df_visit = df_visit) + + if (progress) { + message("generating scenarios") + } + + # create scenario grid + + df_mean_pat <- df_visit %>% + group_by(.data$study_id, .data$site_number, .data$visit) %>% + summarise(n_pat = n_distinct(.data$patnum), + .groups = "drop") %>% + group_by(.data$study_id, .data$visit) %>% + summarise(mean_n_pat = mean(.data$n_pat), + sum_n_pat = sum(.data$n_pat), + n_sites = n_distinct(.data$site_number), + .groups = "drop") + + ur_rate <- ur_rate[ur_rate > 0] + + df_grid_gr0 <- df_site %>% + select(.data$study_id, .data$site_number, .data$n_pat_with_med75, .data$visit_med75) %>% + left_join(df_mean_pat, by = c(study_id = "study_id", visit_med75 = "visit")) %>% + mutate(extra_ur_sites = list(0:extra_ur_sites)) %>% + unnest(.data$extra_ur_sites) %>% + mutate( + frac_pat_with_ur = (.data$n_pat_with_med75 + .data$extra_ur_sites * .data$mean_n_pat) / + .data$sum_n_pat, + ur_rate = list(ur_rate) + ) %>% + unnest(.data$ur_rate) %>% + select( + .data$study_id, + .data$site_number, + .data$extra_ur_sites, + .data$frac_pat_with_ur, + .data$ur_rate + ) + + df_grid_0 <- df_grid_gr0 %>% + select(.data$study_id, .data$site_number) %>% + distinct() %>% + mutate(extra_ur_sites = 0, + frac_pat_with_ur = 0, + ur_rate = 0) + + df_grid <- bind_rows(df_grid_0, df_grid_gr0) + + df_scen_prep <- df_sim_prep %>% + left_join(df_grid, by = c("study_id", "site_number")) + + # generating scenarios + + df_scen <- df_scen_prep %>% + mutate(scenarios = purrr::pmap( + list(.data$n_ae_site, .data$n_ae_study, .data$frac_pat_with_ur, .data$ur_rate), + sim_scenario + ) + ) %>% + select(- .data$n_ae_site, - .data$n_ae_study) %>% + mutate(n_ae_site = map(.data$scenarios, "n_ae_site"), + n_ae_study = map(.data$scenarios, "n_ae_study")) %>% + select(- .data$scenarios) + + if (progress) { + message("getting under-reporting stats") + } + + if (parallel) { + .purrr <- furrr::future_map + .purrr_args <- list(.options = furrr_options(seed = TRUE)) + } else { + .purrr <- purrr::map + .purrr_args <- list() + } + + df_sim_sites <- df_scen %>% + mutate(study_id_gr = .data$study_id, + site_number_gr = .data$site_number) %>% + group_by(.data$study_id_gr, .data$site_number_gr) %>% + nest() %>% + ungroup() %>% + select(- .data$study_id_gr, - .data$site_number_gr) + + with_progress_cnd( + ls_df_sim_sites <- purrr_bar( + df_sim_sites$data, + .purrr = .purrr, + .f = sim_after_prep, + .f_args = list( + r = r, + poisson_test = poisson_test, + prob_lower = prob_lower, + progress = FALSE + ), + .purrr_args = .purrr_args, + .steps = nrow(df_sim_sites), + .progress = progress + ), + progress = progress + ) + + if (progress) { + message("evaluating stats") + } + + df_sim_sites <- bind_rows(ls_df_sim_sites) + + df_eval <- do.call(eval_sites, c(list(df_sim_sites = df_sim_sites), eval_sites_args)) %>% + arrange( + .data$study_id, + .data$site_number, + .data$extra_ur_sites, + .data$frac_pat_with_ur, + .data$ur_rate + ) + +} + + +#' @title Simulate Portfolio Test Data +#' @description Simulate visit level data from a portfolio configuration. +#' @param df_config dataframe as returned by \code{\link{get_config}} +#' @param parallel logical activate parallel processing, see details, Default: FALSE +#' @param progress logical, Default: TRUE +#'@return dataframe with the following columns: \describe{ +#' \item{**study_id**}{study identification} \item{**ae_per_visit_mean**}{mean +#' AE per visit per study} \item{**site_number**}{site} +#' \item{**max_visit_sd**}{standard deviation of maximum patient visits per +#' site} \item{**max_visit_mean**}{mean of maximum patient visits per site} +#' \item{**patnum**}{number of patients} +#' \item{**visit**}{visit number} +#' \item{**n_ae**}{cumulative sum of AEs} +#'} +#' @details uses \code{\link{sim_test_data_study}}. +#' We use the `furrr` package to +#' implement parallel processing as these simulations can take a long time to +#' run. For this to work we need to specify the plan for how the code should +#' run, e.g. `plan(multisession, workers = 3) +#' @examples +#' if (interactive()) { +#' df_visit1 <- sim_test_data_study(n_pat = 100, n_sites = 10, +#' frac_site_with_ur = 0.4, ur_rate = 0.6) +#' +#' df_visit1$study_id <- "A" +#' +#' df_visit2 <- sim_test_data_study(n_pat = 100, n_sites = 10, +#' frac_site_with_ur = 0.2, ur_rate = 0.1) +#' +#' df_visit2$study_id <- "B" +#' +#' df_visit <- dplyr::bind_rows(df_visit1, df_visit2) +#' +#' df_site_max <- df_visit %>% +#' group_by(study_id, site_number, patnum) %>% +#' summarise(max_visit = max(visit), +#' max_ae = max(n_ae), +#' .groups = "drop") +#' +#' df_config <- get_config(df_site_max) +#' +#' df_config +#' +#' df_portf <- sim_test_data_portfolio(df_config) +#' +#' df_portf +#' +#' df_scen <- sim_ur_scenarios(df_portf, +#' extra_ur_sites = 2, +#' ur_rate = c(0.5, 1)) +#' +#' +#' df_scen +#' +#' df_perf <- get_portf_perf(df_scen) +#' +#' df_perf +#' } +#' @seealso +#' \code{\link{sim_test_data_study}} +#' \code{\link{get_config}} +#' \code{\link{sim_test_data_portfolio}} +#' \code{\link{sim_ur_scenarios}} +#' \code{\link{get_portf_perf}} +#' @rdname sim_test_data_portfolio +#' @export +sim_test_data_portfolio <- function(df_config, parallel = FALSE, progress = TRUE) { + + # checks -------------------------- + df_config <- ungroup(df_config) + + stopifnot( + df_config %>% + summarise_all(~ ! anyNA(.)) %>% + unlist() %>% + all() + ) + + stopifnot(is.data.frame(df_config)) + + stopifnot( + all( + c("study_id", + "ae_per_visit_mean", + "site_number", + "max_visit_sd", + "max_visit_mean", + "n_pat" + ) %in% colnames(df_config) + ) + ) + + # exec -------------------------- + + if (parallel) { + .purrr <- furrr::future_pmap + .purrr_args <- list(.options = furrr_options(seed = TRUE)) + } else { + .purrr <- purrr::pmap + .purrr_args <- list() + } + + with_progress_cnd( + df_config_sim <- df_config %>% + mutate( + sim = purrr_bar( + list( + .data$ae_per_visit_mean, + .data$max_visit_sd, + .data$max_visit_mean, + .data$n_pat + ), + .purrr = .purrr, + .f = function(ae_per_visit_mean, + max_visit_sd, + max_visit_mean, + n_pat) { + sim_test_data_study( + n_pat = n_pat, + n_sites = 1, + max_visit_mean = max_visit_mean, + max_visit_sd = max_visit_sd, + ae_per_visit_mean = ae_per_visit_mean + ) %>% + select( + # using rlang::.data here causes error with furrr + patnum, visit, n_ae + ) + }, + .progress = progress, + .purrr_args = .purrr_args, + .steps = nrow(.) + ) + ), + progress = progress + ) + + df_portf <- df_config_sim %>% + unnest(.data$sim) %>% + select(- .data$n_pat) %>% + group_by(.data$study_id) %>% + mutate( + # patnums need to be made site exclusive + patnum = str_pad( + dense_rank(paste0(.data$site_number, .data$patnum)), + width = 4, + side = "left", + pad = "0" + ) + ) %>% + ungroup() + + return(df_portf) +} + +#'@title Get Portfolio Configuration +#'@description Get Portfolio configuration from a dataframe aggregated on +#' patient level with max_ae and max_visit. Will filter studies with only a few +#' sites and patients and will anonymize IDs. Portfolio configuration can be +#' used by \code{\link{sim_test_data_portfolio}} to generate data for an +#' artificial portfolio. +#'@param df_site dataframe aggregated on patient level with max_ae and max_visit +#'@param min_pat_per_study minimum number of patients per study, Default: 100 +#'@param min_sites_per_study minimum number of sites per study, Default: 10 +#'@param anonymize logical, Default: TRUE +#'@param pad_width padding width for newly created IDs, Default: 4 +#'@return dataframe with the following columns: \describe{ +#' \item{**study_id**}{study identification} \item{**ae_per_visit_mean**}{mean +#' AE per visit per study} \item{**site_number**}{site} +#' \item{**max_visit_sd**}{standard deviation of maximum patient visits per +#' site} \item{**max_visit_mean**}{mean of maximum patient visits per site} +#' \item{**n_pat**}{number of patients} } +#' @examples +#' if (interactive()) { +#' df_visit1 <- sim_test_data_study(n_pat = 100, n_sites = 10, +#' frac_site_with_ur = 0.4, ur_rate = 0.6) +#'. +#' df_visit1$study_id <- "A" +#'. +#' df_visit2 <- sim_test_data_study(n_pat = 100, n_sites = 10, +#' frac_site_with_ur = 0.2, ur_rate = 0.1) +#'. +#' df_visit2$study_id <- "B" +#'. +#' df_visit <- dplyr::bind_rows(df_visit1, df_visit2) +#' +#' df_site_max <- df_visit %>% +#' group_by(study_id, site_number, patnum) %>% +#' summarise(max_visit = max(visit), +#' max_ae = max(n_ae), +#' .groups = "drop") +#' +#' df_config <- get_config(df_site_max) +#' +#' df_config +#' +#' df_portf <- sim_test_data_portfolio(df_config) +#' +#' df_portf +#' +#' df_scen <- sim_ur_scenarios(df_portf, +#' extra_ur_sites = 2, +#' ur_rate = c(0.5, 1)) +#' +#' +#' df_scen +#' +#' df_perf <- get_portf_perf(df_scen) +#' +#' df_perf +#' } +#' @seealso +#' \code{\link{sim_test_data_study}} +#' \code{\link{get_config}} +#' \code{\link{sim_test_data_portfolio}} +#' \code{\link{sim_ur_scenarios}} +#' \code{\link{get_portf_perf}} +#'@rdname get_config +#'@export +get_config <- function(df_site, + min_pat_per_study = 100, + min_sites_per_study = 10, + anonymize = TRUE, + pad_width = 4) { + + stopifnot(c("study_id", "site_number", "patnum", "max_visit", "max_ae") %in% colnames(df_site)) + stopifnot(nrow(df_site) == nrow(distinct(select(df_site, .data$study_id, .data$site_number, .data$patnum)))) + + df_site %>% + summarise_all(~ ! anyNA(.)) %>% + unlist() %>% + all() %>% + stopifnot("NA detected" = .) + + df_site %>% + group_by(.data$study_id, .data$patnum) %>% + summarise(n_sites = n_distinct(.data$site_number), .groups = "drop") %>% + mutate(check = .data$n_sites == 1) %>% + pull(.data$check) %>% + unlist() %>% + all() %>% + stopifnot("patient ids must be site exclusive" = .) + + df_config <- df_site %>% + filter(.data$max_visit > 0) %>% + group_by(.data$study_id) %>% + mutate(ae_per_visit_mean = sum(.data$max_ae) / sum(.data$max_visit)) %>% + filter( + n_distinct(.data$patnum) >= min_pat_per_study, + n_distinct(.data$site_number) >= min_sites_per_study + ) %>% + group_by(.data$study_id, .data$ae_per_visit_mean, .data$site_number) %>% + summarise(max_visit_sd = sd(.data$max_visit), + max_visit_mean = mean(.data$max_visit), + n_pat = n_distinct(.data$patnum), + .groups = "drop") %>% + mutate(max_visit_sd = ifelse(is.na(.data$max_visit_sd), 0, .data$max_visit_sd)) + + if (anonymize) { + df_config <- df_config %>% + mutate( + study_id = dense_rank(.data$study_id), + study_id = str_pad(.data$study_id, pad_width, side = "left", "0") + ) %>% + group_by(.data$study_id) %>% + mutate( + site_number = dense_rank(.data$site_number), + site_number = str_pad(.data$site_number, pad_width, side = "left", "0") + ) %>% + ungroup() + } + + stopifnot("nrows(df_config) > 0, relax filter settings!" = nrow(df_config) > 0) + + return(df_config) +} + +#' @title Get Portfolio Performance +#' @description Performance as true positive rate (tpr as tp/P) on the basis of +#' desired false positive rates (fpr as fp/P). +#' @param df_scen dataframe as returned by \code{\link{sim_ur_scenarios}} +#' @param stat character denoting the column name of the under-reporting +#' statistic, Default: 'prob_low_prob_ur' +#' @param fpr numeric vector specifying false positive rates, Default: c(0.001, +#' 0.01, 0.05) +#' @return dataframe +#' @details DETAILS +#' @examples +#' df_visit1 <- sim_test_data_study(n_pat = 100, n_sites = 10, +#' frac_site_with_ur = 0.4, ur_rate = 0.6) +#' +#' df_visit1$study_id <- "A" +#' +#' df_visit2 <- sim_test_data_study(n_pat = 100, n_sites = 10, +#' frac_site_with_ur = 0.2, ur_rate = 0.1) +#' +#' df_visit2$study_id <- "B" +#' +#' df_visit <- dplyr::bind_rows(df_visit1, df_visit2) +#' +#' df_site_max <- df_visit %>% +#' dplyr::group_by(study_id, site_number, patnum) %>% +#' dplyr::summarise(max_visit = max(visit), +#' max_ae = max(n_ae), +#' .groups = "drop") +#' +#' df_config <- get_config(df_site_max) +#' +#' df_config +#' +#' df_portf <- sim_test_data_portfolio(df_config) +#' +#' df_portf +#' +#' df_scen <- sim_ur_scenarios(df_portf, +#' extra_ur_sites = 2, +#' ur_rate = c(0.5, 1)) +#' +#' +#' df_scen +#' +#' df_perf <- get_portf_perf(df_scen) +#' +#' df_perf +#' @seealso \code{\link{sim_test_data_study}} \code{\link{get_config}} +#' \code{\link{sim_test_data_portfolio}} \code{\link{sim_ur_scenarios}} +#' \code{\link{get_portf_perf}} +#' @rdname get_portf_perf +#' @export +get_portf_perf <- function(df_scen, stat = "prob_low_prob_ur", fpr = c(0.001, 0.01, 0.05)) { + + if (anyNA(df_scen[[stat]])) { + mes <- df_scen %>% + mutate(extra_ur_sites = as.factor(.data$extra_ur_sites), + ur_rate = as.factor(.data$ur_rate)) %>% + group_by(.data$extra_ur_sites, .data$ur_rate, .drop = FALSE) %>% + mutate(n_sites_total = n_distinct(.data$site_number)) %>% + group_by(.data$extra_ur_sites, .data$ur_rate, .data$n_sites_total) %>% + filter(is.na(.data[[stat]])) %>% + summarise(n = n_distinct(.data$site_number), .groups = "drop") %>% + mutate( + ratio_sites_with_na = .data$n / + ifelse(is.na(.data$n_sites_total), + 0, + .data$n_sites_total) + ) %>% + select(.data$extra_ur_sites, .data$ur_rate, .data$ratio_sites_with_na) %>% + knitr::kable() %>% + paste(collapse = "\n") + + warning( + paste("Some Simulation Scenarios have returned NA stat values.\n", mes)) + + } + + stat_at_0 <- df_scen %>% + filter(.data$ur_rate == 0, .data$frac_pat_with_ur == 0) %>% + pull(.data[[stat]]) + + df_thresh <- tibble( + fpr = fpr + ) %>% + mutate( + thresh = map_dbl( + .data$fpr, + ~ quantile(stat_at_0, probs = 1 - ., na.rm = TRUE) + ) + ) + + + df_prep <- df_scen %>% + mutate(data = list(df_thresh)) %>% + unnest(.data$data) %>% + mutate(stat = .data[[stat]]) %>% + group_by(.data$fpr, .data$thresh, .data$extra_ur_sites, .data$ur_rate) %>% + summarise( + tpr = sum(ifelse(.data$stat >= .data$thresh, 1, 0), na.rm = TRUE) / + n_distinct(paste(.data$study_id, .data$site_number)), + .groups = "drop") + + df_prep_0 <- df_prep %>% + filter(.data$ur_rate == 0) %>% + mutate(extra_ur_sites = list(unique(df_prep$extra_ur_sites))) %>% + unnest(.data$extra_ur_sites) + + df_prep_gr0 <- df_prep %>% + filter(.data$ur_rate > 0) + + bind_rows(df_prep_0, df_prep_gr0) %>% + arrange(.data$fpr, .data$ur_rate) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 78b0068..5dc7540 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -34,6 +34,7 @@ articles: - sas_files - visit_med75 - visits_or_days + - portfolio_perf reference: - title: Main Functions @@ -48,11 +49,14 @@ reference: desc: ~ contents: - sim_test_data_study + - get_config + - sim_test_data_portfolio - title: Test Functions - desc: These functions are intented to check Poisson test applicability. + desc: These functions are intented to check simaerep performance. contents: - sim_studies - - get_ecd_values + - sim_ur_scenarios + - get_portf_perf - title: Helper Functions desc: These functions are intended to be called by other functions. contents: @@ -66,11 +70,17 @@ reference: - get_site_mean_ae_dev - aggr_duplicated_visits - exp_implicit_missing_visits + - prep_for_sim + - sim_after_prep + - get_ecd_values + - purrr_bar + - with_progress_cnd + - sim_scenario - title: Deprecated contents: - eval_sites_deprecated - title: Additional Plot Functions - desc: These functions were intended for creating plots for the documentation. + desc: These functions are intended for creating plots for the documentation. contents: - plot_visit_med75 - plot_sim_examples diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..8ae9b06 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,24 @@ +version: '3.7' +services: + build_image: + build: . + image: 'simaerep' + command: /bin/bash + + shell: + image: 'simaerep' + working_dir: /app + # make container wait + command: tail -F anything + volumes: + - '.:/app' + rstudio: + image: 'simaerep' + ports: + - '8787:8787' + volumes: + - '.:/home/rstudio/app' + command: /init + environment: + PASSWORD: '123' + USER: 'rstudio' diff --git a/docs/404.html b/docs/404.html index 5fa5c52..fb089e4 100644 --- a/docs/404.html +++ b/docs/404.html @@ -89,7 +89,7 @@ simaerep - 0.3.0 + 0.3.2 @@ -97,7 +97,7 @@
  • @@ -141,7 +144,7 @@