R/nm-join.R

#' Return a single data frame with model output and input data
#'
#' For NONMEM models, when a unique row identifier (e.g an integer numbering the
#' rows) is included in the input data set (i.e. the file in `$DATA`) and
#' carried into each table output, `nm_join()` can read in all output table
#' files and join back to the input data set. By default, the input data is
#' joined to the table files so that the number of rows in the result will match
#' the number of rows in the table files (i.e. the number of rows _not_ bypassed via
#' `$IGNORE`). Use the `.superset` argument to join table outputs to the
#' (complete) input data set. This function will print the number of rows and
#' columns when each file is loaded, as well as some information about the
#' joins. This **printing can be suppressed** by setting `options(bbr.verbose =
#' FALSE)`.
#'
#' @inheritParams nm_tables
#' @param .join_col Character column name to use to join table files. Defaults to
#'   `NUM`. See Details.
#' @param .superset If `FALSE`, the default, the data will be joined to the
#'   NONMEM output and if `TRUE`, the NONMEM output will be joined to the data;
#'   that is, if you use `.superset`, you will get the same number of rows as
#'   you have in the input data and NONMEM output columns like `PRED` and
#'   `CWRES` will be filled with `NA`.
#' @param .bbi_args Named list passed to `model_summary(.bbi_args)`. See
#'   [print_bbi_args()] for valid options. Defaults to `list(no_grd_file = TRUE,
#'   no_shk_file = TRUE)` because [model_summary()] is only called internally to
#'   extract the number of records and individuals, so those files are
#'   irrelevant.
#'
#' @details
#'
#' **Join column**
#'
#' The `.join_col` is the name of a single column that should appear in both the
#' input data set and any tables you want to join. We recommend you make this
#' column a simple integer numbering the rows in the input data set (for example
#' `NUM`). When this column is carried into the output table files, there will
#' be unambiguous matching from the table file back to the input data set.
#'
#' The one exception to this are `FIRSTONLY` tables. If a table file has the
#' same number of rows as the there are individuals in the input data set
#' (accounting for any filtering of data in the NONMEM control stream), it will
#' assumed to be a `FIRSTONLY` table. In this case, the table will be joined to
#' the input data by the `ID` column. If `ID` is not present in the table, it
#' will be using `.join_col`. Note that if _neither_ `ID` or the column passed
#' to `.join_col` are present in the table, the join will fail.
#'
#' Note also that, when `.join_col` is carried into table outputs, **there is no
#' need to table any other columns from the input data** as long as the
#' `nm_join()` approach is used; any column in the input data set, regardless
#' of whether it is listed in `$INPUT` or not, will be carried through from the
#' input data and therefore available in the joined result.
#'
#' **Duplicate columns are dropped**
#'
#' If a table has columns with the same name as columns in the input data set,
#' or a table that has already been joined, those columns will be dropped from
#' the joined data. If `getOption(bbr.verbose) == TRUE` a message will be
#' printed about any columns dropped this way.
#'
#' The one exception to this is the `DV` column. If `DV` is present in the input
#' data _and_ at least one of the table files, the `DV` column from the input
#' data will be renamed to `DV.DATA` and the column from the table file kept as
#' `DV`.
#'
#' The origin of each column is attached to the return value via the
#' "nm_join_origin" attribute, a list that maps each source (as named by
#' [nm_tables()]) to the columns that came from that source.
#'
#' **Duplicate Rows Warning for Join Column**
#'
#' If there are duplicate rows found in the specified `.join_col`, a warning will be raised specifying a subset of the repeated rows.
#' Duplicates may be caused by lack of output width. `FORMAT` may be need to be stated in control stream to have sufficient
#' width to avoid truncating `.join_col`.
#'
#' **Multiple tables per file incompatibility**
#'
#' Because `nm_tables()` calls [nm_file()] internally, it is _not_ compatible
#' with multiple tables written to a single file. See "Details" in [nm_file()]
#' for alternatives.
#'
#' @importFrom dplyr left_join select
#' @importFrom checkmate assert_string assert_character assert_logical assert_list
#' @seealso [nm_tables()], [nm_table_files()], [nm_file()]
#' @export
nm_join <- function(
  .mod,
  .join_col = "NUM",
  .files = nm_table_files(.mod),
  .superset = FALSE,
  .bbi_args = list(
    no_grd_file = TRUE,
    no_shk_file = TRUE
  )
) {
  if (inherits(.mod, "bbi_nmbayes_model")) {
    stop(
      "nm_join() is not supported for nmbayes models; ",
      "use `bbr.bayes::nm_join_bayes()` instead."
    )
  }

  if (inherits(.mod, "character")) {
    checkmate::assert_string(.mod)
    .mod <- read_model(.mod)
  }
  check_model_object(.mod, c(NM_MOD_CLASS, NM_SUM_CLASS))
  assert_string(.join_col)
  assert_character(.files)
  assert_logical(.superset, len = 1)
  assert_list(.bbi_args)

  df_list <- nm_tables(.mod, .files = .files)
  .d <- df_list$data
  .tbls <- df_list[2:length(df_list)]
  if (
    "DV" %in% names(.d) &&
    "DV" %in% unlist(map(.tbls, names))
  ) {
    .d <- rename(.d, DV.DATA = "DV")
  }
  col_order <- names(.d)

  # Keep track of where each column came from.
  origin <- vector(mode = "list", length = length(df_list))
  names(origin) <- names(df_list)
  origin$data <- col_order

  .join_col <- toupper(.join_col)
  if (!(.join_col %in% names(.d))) {
    stop(glue("couldn't find `.join_col` {.join_col} in data with cols: {paste(names(.d), collapse = ', ')}"))
  }

  if(anyDuplicated(.d[.join_col]) != 0)
  {
   dup_row <- .d[.join_col][duplicated( .d[.join_col]) %>% which(),]
   stop(glue("Duplicate rows were found in {.join_col}. Please see `?nm_join` for more details"))
  }

  if (.superset) {
    join_fun <- function(x, y, ...) left_join(y, x, ...)
    join_first_only_fun <- join_fun
  } else {
    join_fun <- left_join
    # For the FIRSTONLY case, joining the data to the table is expected to match
    # multiple table rows; use left_join_all() to avoid a warning.
    join_first_only_fun <- left_join_all
  }

  # get number of ID's and records
  .s <- if (!inherits(.mod, NM_SUM_CLASS)) {
    model_summary(.mod, .bbi_args = .bbi_args)
  } else {
    .mod
  }
  nid <-  .s$run_details$number_of_subjects
  nrec <- .s$run_details$number_of_data_records

  # do the join(s)
  for (.n in names(.tbls)) {
    tab <- .tbls[[.n]]
    has_id <- "ID" %in% names(tab)

    if (!(nrow(tab) %in% c(nrec, nid))) {
      # skip table if nrow doesn't match number of records or ID's
      # because if neither is true than this is the wrong kind of file
      # (or something is wrong with NONMEM output)
      warning(glue("{.n} skipped because number of rows ({nrow(tab)}) doesn't match number of records ({nrec}) or IDs ({nid})"), call. = FALSE)
    } else if (nrow(tab) == nid) {
      # if FIRSTONLY table join on ID
      verbose_msg(glue("{.n} is FIRSTONLY table"))

      # if ID is missing, get it from the data by using .join_col
      if (!has_id) {
        tab <- tab %>%
          left_join(select(.d, "ID", !!.join_col), by = .join_col)
      }

      # toss .join_col, if present, because we're joining on ID
      tab[[.join_col]] <- NULL

      # do the join
      tab <- drop_dups(tab, .d, "ID", .n)
      col_order <- union(col_order, names(tab))
      .d <- join_first_only_fun(tab, .d, by = "ID")
    } else if (nrow(tab) == nrec) {
      # otherwise, join on .join_col
      tab <- drop_dups(tab, .d, .join_col, .n)
      col_order <- union(col_order, names(tab))
      .d <- join_fun(tab, .d, by = .join_col)
    }
    origin[[.n]] <- names(tab)
  }

  verbose_msg(c(
    glue("\nfinal join stats:"),
    glue("  rows: {nrow(.d)}"),
    glue("  cols: {ncol(.d)}")
  ))

  res <- select(.d, !!col_order)
  attr(res, "nm_join_origin") <- origin

  return(res)
}


#' Drop duplicate columns to prepare for join
#' @keywords internal
drop_dups <- function(.new_table, .dest_table, .join_col, .table_name) {
  new_cols <- setdiff(names(.new_table), names(.dest_table))
  keep <- c(.join_col, new_cols)
  drop <- setdiff(names(.new_table), keep)

  verbose_msg(glue("{.table_name} adds {length(new_cols)} new cols"))
  if (length(drop) > 0) verbose_msg(glue("  dropping {length(drop)} duplicate cols: {paste(drop, collapse = ', ')}"))
  verbose_msg("") # for newline

  if(.new_table[.join_col] %>% anyDuplicated() != 0)
  {
    dup_row <- .new_table[.join_col][duplicated( .new_table[.join_col]) %>% which(),]
    stop(glue("Duplicate rows in {.join_col}: {dup_row}"))
  }
  return(.new_table[keep])
}