brm_recenter_nuisance()

openpharma · May 29, 2024 · 6ad5c0b · 6ad5c0b
1 parent 2c0fbb6
commit 6ad5c0b
Show file tree

Hide file tree

Showing 10 changed files with 251 additions and 63 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -37,6 +37,7 @@ export(brm_plot_draws)
 export(brm_prior_archetype)
 export(brm_prior_label)
 export(brm_prior_simple)
+export(brm_recenter_nuisance)
 export(brm_simulate)
 export(brm_simulate_categorical)
 export(brm_simulate_continuous)

diff --git a/R/brm_recenter_nuisance.R b/R/brm_recenter_nuisance.R
@@ -0,0 +1,80 @@
+#' @title Recenter nuisance variables
+#' @export
+#' @family archetype utilities
+#' @description Change the center of a nuisance variable of an
+#'   informative prior archetype.
+#' @details By "centering vector y at scalar x", we mean taking
+#'   the difference `z = y - x`. If `x` is the mean, then `mean(z)` is
+#'   0. Informative prior archetypes center nuisance variables
+#'   at their means so the parameters can be interpreted correctly
+#'   for setting informative priors. This is appropriate most of the time,
+#'   but sometimes it is better to center a column at a pre-specified
+#'   scientifically meaningful fixed number. If you want a nuisance column
+#'   to be centered at a fixed value other than its mean,
+#'   use [brm_recenter_nuisance()] to shift the center. This function
+#'   can handle any nuisance variable
+#' @return An informative prior archetype data frame with one of the
+#'   variables re-centered.
+#' @param data An informative prior archetype data frame output from
+#'   [brm_archetype_cells()] or similar.
+#' @param nuisance Character of length 1, name of the nuisance column
+#'   in the data to shift the center.
+#' @param center Numeric of length 1, value of the center to shift
+#'   the column in `nuisance`. The affected column in the returned
+#'   archetype data frame will look as if it were centered by this
+#'   value to begin with.
+#' @examples
+#' set.seed(0L)
+#' data <- brm_simulate_outline(
+#'   n_group = 2,
+#'   n_patient = 100,
+#'   n_time = 4,
+#'   rate_dropout = 0,
+#'   rate_lapse = 0
+#' ) |>
+#'   dplyr::mutate(response = rnorm(n = dplyr::n())) |>
+#'   brm_data_change() |>
+#'   brm_simulate_continuous(names = c("biomarker1", "biomarker2")) |>
+#'   brm_simulate_categorical(
+#'     names = c("status1", "status2"),
+#'     levels = c("present", "absent")
+#'   )
+#' archetype <- brm_archetype_cells(data)
+#' mean(archetype$nuisance_biomarker1) # after original centering
+#' center <- mean(data$biomarker1)
+#' center # original center, before the centering from brm_archetype_cells()
+#' attr(archetype$nuisance_biomarker1, "brm_center") # original center
+#' max(abs((data$biomarker1 - center) - archetype$nuisance_biomarker1))
+#' # Re-center nuisance_biomarker1 at 0.75.
+#' archetype <- brm_recenter_nuisance(
+#'   data = archetype,
+#'   nuisance = "nuisance_biomarker1",
+#'   center = 0.75
+#' )
+#' attr(archetype$nuisance_biomarker1, "brm_center") # new center
+#' mean(archetype$nuisance_biomarker1) # no longer equal to the center
+#' # nuisance_biomarker1 is now as though we centered it at 0.75.
+#' max(abs((data$biomarker1 - 0.75) - archetype$nuisance_biomarker1))
+brm_recenter_nuisance <- function(data, nuisance, center) {
+  brm_data_validate(data = data)
+  assert(
+    inherits(data, "brms_mmrm_archetype"),
+    message = "data must be a brms.mmrm informative prior archetype"
+  )
+  assert_chr(nuisance, "nuisance must be a character string")
+  assert(
+    nuisance %in% attr(data, "brm_archetype_nuisance"),
+    message = c(
+      "nuisance must be the name of a nuisance column in the archetype data"
+    )
+  )
+  assert_num(center, "center must be a valid non-missing numeric scalar")
+  original <- attr(data[[nuisance]], "brm_center")
+  assert_num(
+    original,
+    paste("original center of", nuisance, "could not be found")
+  )
+  data[[nuisance]] <- data[[nuisance]] + original - center
+  attr(data[[nuisance]], "brm_center") <- center
+  data
+}
diff --git a/R/utils_archetype.R b/R/utils_archetype.R
@@ -49,7 +49,7 @@ archetype_nuisance <- function(
       nuisance = out
     )
   }
-  out
+  nuisance_center(out)
 }
 
 nuisance_covariates <- function(data) {
@@ -62,16 +62,11 @@ nuisance_covariates <- function(data) {
     colnames(categorical) <- paste0(colnames(categorical), "_")
     out <- dplyr::bind_cols(out, model.matrix(~ 0 + ., categorical))
   }
-  for (name in colnames(out)) {
-    out[[name]] <- out[[name]] - mean(out[[name]])
-  }
   out
 }
 
 nuisance_baseline <- function(data) {
-  baseline <- attr(data, "brm_baseline")
-  data[[baseline]] <- data[[baseline]] - mean(data[[baseline]])
-  data[, baseline, drop = FALSE]
+  data[, attr(data, "brm_baseline"), drop = FALSE]
 }
 
 nuisance_baseline_subgroup <- function(data) {
@@ -98,7 +93,6 @@ nuisance_baseline_time <- function(data) {
 
 nuisance_baseline_terms <- function(data, baseline, formula) {
   matrix <- model.matrix(object = formula, data = data)
-  matrix <- sweep(matrix, MARGIN = 2L, STATS = colMeans(matrix), FUN = "-")
   tibble::as_tibble(as.data.frame(matrix))
 }
 
@@ -136,3 +130,12 @@ drop_zero_columns <- function(x) {
   sums <- colSums(abs(x))
   x[, sums > .Machine$double.eps, drop = FALSE]
 }
+
+nuisance_center <- function(data) {
+  for (name in colnames(data)) {
+    mean <- mean(data[[name]])
+    attr(data[[name]], "brm_center") <- mean
+    data[[name]] <- data[[name]] - mean
+  }
+  data
+}
diff --git a/man/brm_prior_archetype.Rd b/man/brm_prior_archetype.Rd
diff --git a/man/brm_prior_label.Rd b/man/brm_prior_label.Rd
diff --git a/man/brm_recenter_nuisance.Rd b/man/brm_recenter_nuisance.Rd
diff --git a/tests/testthat/test-brm_recenter_nuisance.R b/tests/testthat/test-brm_recenter_nuisance.R
@@ -0,0 +1,37 @@
+test_that("multiplication works", {
+  set.seed(0L)
+  data <- brm_simulate_outline(
+    n_group = 2,
+    n_patient = 100,
+    n_time = 4,
+    rate_dropout = 0,
+    rate_lapse = 0
+  ) |>
+    dplyr::mutate(response = rnorm(n = dplyr::n())) |>
+    brm_data_change() |>
+    brm_simulate_continuous(names = c("biomarker1", "biomarker2")) |>
+    brm_simulate_categorical(
+      names = c("status1", "status2"),
+      levels = c("present", "absent")
+    )
+  archetype <- brm_archetype_cells(data)
+  expect_equal(mean(archetype$nuisance_biomarker1), 0)
+  expect_equal(
+    mean(data$biomarker1),
+    attr(archetype$nuisance_biomarker1, "brm_center")
+  )
+  expect_equal(
+    max(abs((data$biomarker1 - center) - archetype$nuisance_biomarker1)),
+    0
+  )
+  archetype <- brm_recenter_nuisance(
+    data = archetype,
+    nuisance = "nuisance_biomarker1",
+    center = 0.75
+  )
+  expect_equal(attr(archetype$nuisance_biomarker1, "brm_center"), 0.75)
+  expect_equal(
+    max(abs((data$biomarker1 - 0.75) - archetype$nuisance_biomarker1)),
+    0
+  )
+})
diff --git a/vignettes/archetypes.Rmd b/vignettes/archetypes.Rmd
@@ -100,7 +100,7 @@ attr(archetype, "brm_archetype_interest")
 #> [5] "x_group_2_time_3" "x_group_2_time_4"
 ```
 
-and we have nuisance variables. Some nuisance variables are continuous covariates, while others are levels of one-hot-encoded concomitant factors or interactions of those concomitant factors with baseline and/or subgroup. All nuisance variables are centered at their means so the reference level of the model is at the "center" of the data and not implicitly conditional on a subset of the data. In addition, some nuisance variables are automatically dropped in order to ensure the model matrix is full-rank. This is critically important to preserve the interpretation of the columns of interest and make sure the informative priors behave as expected.
+and we have nuisance variables. Some nuisance variables are continuous covariates, while others are levels of one-hot-encoded concomitant factors or interactions of those concomitant factors with baseline and/or subgroup. All nuisance variables are centered at their means so the reference level of the model is at the "center" of the data and not implicitly conditional on a subset of the data.^[`brm_recenter_nuisance()` can retroactively recenter a nuisance column to a fixed value other than its mean.] In addition, some nuisance variables are automatically dropped in order to ensure the model matrix is full-rank. This is critically important to preserve the interpretation of the columns of interest and make sure the informative priors behave as expected.
 
 
 ```r
@@ -293,46 +293,46 @@ model <- brm_model(
 #> Compiling Stan program...
 #> Start sampling
 brms::prior_summary(model)
-#>                     prior    class                         coef group resp
-#>                    (flat)        b                                        
-#>  student_t(4, 2.17, 4.86)        b nuisance_baseline.timetime_2           
-#>  student_t(4, 3.22, 5.25)        b nuisance_baseline.timetime_3           
-#>  student_t(4, 2.53, 5.75)        b nuisance_baseline.timetime_4           
-#>                    (flat)        b          nuisance_biomarker1           
-#>                    (flat)        b          nuisance_biomarker2           
-#>                    (flat)        b      nuisance_status1_absent           
-#>                    (flat)        b     nuisance_status2_present           
-#>  student_t(4, 0.98, 2.37)        b             x_group_1_time_2           
-#>  student_t(4, 1.82, 3.32)        b             x_group_1_time_3           
-#>  student_t(4, 2.35, 4.41)        b             x_group_1_time_4           
-#>  student_t(4, 0.31, 2.22)        b             x_group_2_time_2           
-#>  student_t(4, 1.94, 2.85)        b             x_group_2_time_3           
-#>  student_t(4, 2.33, 3.41)        b             x_group_2_time_4           
-#>                    (flat)        b                                        
-#>                    (flat)        b                   timetime_2           
-#>                    (flat)        b                   timetime_3           
-#>                    (flat)        b                   timetime_4           
-#>      lkj_corr_cholesky(1) Lcortime                                        
-#>   dpar nlpar lb ub       source
-#>                         default
-#>                            user
-#>                            user
-#>                            user
-#>                    (vectorized)
-#>                    (vectorized)
-#>                    (vectorized)
-#>                    (vectorized)
-#>                            user
-#>                            user
-#>                            user
-#>                            user
-#>                            user
-#>                            user
-#>  sigma                  default
-#>  sigma             (vectorized)
-#>  sigma             (vectorized)
-#>  sigma             (vectorized)
-#>                         default
+#>                     prior    class                         coef group resp  dpar nlpar
+#>                    (flat)        b                                                    
+#>  student_t(4, 2.17, 4.86)        b nuisance_baseline.timetime_2                       
+#>  student_t(4, 3.22, 5.25)        b nuisance_baseline.timetime_3                       
+#>  student_t(4, 2.53, 5.75)        b nuisance_baseline.timetime_4                       
+#>                    (flat)        b          nuisance_biomarker1                       
+#>                    (flat)        b          nuisance_biomarker2                       
+#>                    (flat)        b      nuisance_status1_absent                       
+#>                    (flat)        b     nuisance_status2_present                       
+#>  student_t(4, 0.98, 2.37)        b             x_group_1_time_2                       
+#>  student_t(4, 1.82, 3.32)        b             x_group_1_time_3                       
+#>  student_t(4, 2.35, 4.41)        b             x_group_1_time_4                       
+#>  student_t(4, 0.31, 2.22)        b             x_group_2_time_2                       
+#>  student_t(4, 1.94, 2.85)        b             x_group_2_time_3                       
+#>  student_t(4, 2.33, 3.41)        b             x_group_2_time_4                       
+#>                    (flat)        b                                         sigma      
+#>                    (flat)        b                   timetime_2            sigma      
+#>                    (flat)        b                   timetime_3            sigma      
+#>                    (flat)        b                   timetime_4            sigma      
+#>      lkj_corr_cholesky(1) Lcortime                                                    
+#>  lb ub       source
+#>             default
+#>                user
+#>                user
+#>                user
+#>        (vectorized)
+#>        (vectorized)
+#>        (vectorized)
+#>        (vectorized)
+#>                user
+#>                user
+#>                user
+#>                user
+#>                user
+#>                user
+#>             default
+#>        (vectorized)
+#>        (vectorized)
+#>        (vectorized)
+#>             default
 ```
 
 Marginal mean estimation, post-processing, and visualization automatically understand the archetype without any user intervention.

diff --git a/vignettes/archetypes.Rmd.upstream b/vignettes/archetypes.Rmd.upstream
@@ -70,7 +70,7 @@ Those new columns constitute a custom model matrix to describe the desired param
 attr(archetype, "brm_archetype_interest")
 ```
 
-and we have nuisance variables. Some nuisance variables are continuous covariates, while others are levels of one-hot-encoded concomitant factors or interactions of those concomitant factors with baseline and/or subgroup. All nuisance variables are centered at their means so the reference level of the model is at the "center" of the data and not implicitly conditional on a subset of the data. In addition, some nuisance variables are automatically dropped in order to ensure the model matrix is full-rank. This is critically important to preserve the interpretation of the columns of interest and make sure the informative priors behave as expected.
+and we have nuisance variables. Some nuisance variables are continuous covariates, while others are levels of one-hot-encoded concomitant factors or interactions of those concomitant factors with baseline and/or subgroup. All nuisance variables are centered at their means so the reference level of the model is at the "center" of the data and not implicitly conditional on a subset of the data.^[`brm_recenter_nuisance()` can retroactively recenter a nuisance column to a fixed value other than its mean.] In addition, some nuisance variables are automatically dropped in order to ensure the model matrix is full-rank. This is critically important to preserve the interpretation of the columns of interest and make sure the informative priors behave as expected.
 
 ```{r}
 attr(archetype, "brm_archetype_nuisance")

diff --git a/vignettes/archetypes_figures/archetype_compare_data-1.png b/vignettes/archetypes_figures/archetype_compare_data-1.png