# R/guess_ab_col.R

# ==================================================================== #
# TITLE:                                                               #
# AMR: An R Package for Working with Antimicrobial Resistance Data     #
#                                                                      #
# SOURCE CODE:                                                         #
# https://github.com/msberends/AMR                                     #
#                                                                      #
# PLEASE CITE THIS SOFTWARE AS:                                        #
# Berends MS, Luz CF, Friedrich AW, et al. (2022).                     #
# AMR: An R Package for Working with Antimicrobial Resistance Data.    #
# Journal of Statistical Software, 104(3), 1-31.                       #
# https://doi.org/10.18637/jss.v104.i03                                #
#                                                                      #
# Developed at the University of Groningen and the University Medical  #
# Center Groningen in The Netherlands, in collaboration with many      #
# colleagues from around the world, see our website.                   #
#                                                                      #
# This R package is free software; you can freely use and distribute   #
# it for both personal and commercial purposes under the terms of the  #
# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
# the Free Software Foundation.                                        #
# We created this package for both routine data analysis and academic  #
# research and it was publicly released in the hope that it will be    #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
#                                                                      #
# Visit our website for the full manual and a complete tutorial about  #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/   #
# ==================================================================== #

#' Guess Antibiotic Column
#'
#' This tries to find a column name in a data set based on information from the [antibiotics] data set. Also supports WHONET abbreviations.
#' @param x a [data.frame]
#' @param search_string a text to search `x` for, will be checked with [as.ab()] if this value is not a column in `x`
#' @param verbose a [logical] to indicate whether additional info should be printed
#' @param only_sir_columns a [logical] to indicate whether only antibiotic columns must be detected that were transformed to class `sir` (see [as.sir()]) beforehand (default is `FALSE`)
#' @details You can look for an antibiotic (trade) name or abbreviation and it will search `x` and the [antibiotics] data set for any column containing a name or code of that antibiotic.
#' @return A column name of `x`, or `NULL` when no result is found.
#' @export
#' @examples
#' df <- data.frame(
#'   amox = "S",
#'   tetr = "R"
#' )
#'
#' guess_ab_col(df, "amoxicillin")
#' guess_ab_col(df, "J01AA07") # ATC code of tetracycline
#'
#' guess_ab_col(df, "J01AA07", verbose = TRUE)
#' # NOTE: Using column 'tetr' as input for J01AA07 (tetracycline).
#'
#' # WHONET codes
#' df <- data.frame(
#'   AMP_ND10 = "R",
#'   AMC_ED20 = "S"
#' )
#' guess_ab_col(df, "ampicillin")
#' guess_ab_col(df, "J01CR02")
#' guess_ab_col(df, as.ab("augmentin"))
guess_ab_col <- function(x = NULL, search_string = NULL, verbose = FALSE, only_sir_columns = FALSE) {
  meet_criteria(x, allow_class = "data.frame", allow_NULL = TRUE)
  meet_criteria(search_string, allow_class = "character", has_length = 1, allow_NULL = TRUE)
  meet_criteria(verbose, allow_class = "logical", has_length = 1)
  meet_criteria(only_sir_columns, allow_class = "logical", has_length = 1)

  if (is.null(x) && is.null(search_string)) {
    # called without any input: return the function name as a symbol
    return(as.name("guess_ab_col"))
  } else {
    # from here on a search string is mandatory
    meet_criteria(search_string, allow_class = "character", has_length = 1, allow_NULL = FALSE)
  }

  all_found <- get_column_abx(x,
    info = verbose, only_sir_columns = only_sir_columns,
    verbose = verbose, fn = "guess_ab_col"
  )
  search_string.ab <- suppressWarnings(as.ab(search_string))
  # match with `%in%` instead of `==`: if the search string could not be
  # translated (NA), `==` would yield NA and NA-indexing would return a vector
  # of NAs instead of an empty result, so NULL would never be returned
  found_names <- names(all_found)
  is_hit <- !is.na(found_names) & found_names %in% as.character(search_string.ab)
  ab_result <- unname(all_found[is_hit])

  if (length(ab_result) == 0) {
    if (isTRUE(verbose)) {
      message_("No column found as input for ", search_string,
        " (", ab_name(search_string, language = NULL, tolower = TRUE), ").",
        add_fn = font_black,
        as_note = FALSE
      )
    }
    return(NULL)
  } else {
    if (isTRUE(verbose)) {
      message_(
        "Using column '", font_bold(ab_result), "' as input for ", search_string,
        " (", ab_name(search_string, language = NULL, tolower = TRUE), ")."
      )
    }
    return(ab_result)
  }
}

# Internal helper: determine which columns of `x` contain antibiotic results.
#
# Arguments:
#   x                     a data.frame to inspect
#   ...                   user-defined overrides where the argument name is an
#                         antibiotic reference and the value a column name, such
#                         as `amox = "thisone"`; a NULL value excludes that
#                         antibiotic; data.frames passed here are ignored
#   soft_dependencies     antibiotic codes that are nice to have; a message is
#                         printed (when `info` is TRUE) if any are missing
#   hard_dependencies     antibiotic codes that are required; if any are missing
#                         a warning is generated and NA is returned
#   verbose               print which column is used for which antibiotic
#   info                  print progress/status messages
#   only_sir_columns      only consider columns for which is.sir() is TRUE
#   sort                  sort the result on antibiotic code, then column name
#   reuse_previous_result reuse the result stored in `AMR_env` when the call ID
#                         is identical to the one of the previous run
#   fn                    passed on to unique_call_id() as `match_fn`
#   return_all            if FALSE, duplicated codes/columns are removed and
#                         only the first hit is kept
#
# Returns a character vector of column names, named with the antibiotic codes
# as returned by as.ab(), or NA when a hard dependency is missing.
# Side effect: stores the call ID, the checked columns and the result in
# `AMR_env` so that a subsequent run can reuse them.
get_column_abx <- function(x,
                           ...,
                           soft_dependencies = NULL,
                           hard_dependencies = NULL,
                           verbose = FALSE,
                           info = TRUE,
                           only_sir_columns = FALSE,
                           sort = TRUE,
                           reuse_previous_result = TRUE,
                           fn = NULL,
                           return_all = FALSE) {
  # check if retrieved before, then get it from package environment
  if (isTRUE(reuse_previous_result) && identical(
    unique_call_id(
      entire_session = FALSE,
      match_fn = fn
    ),
    AMR_env$get_column_abx.call
  )) {
    # so within the same call, within the same environment, we got here again.
    # but we could've come from another function within the same call, so now only check the columns that changed

    # first remove the columns that are not existing anymore
    previous <- AMR_env$get_column_abx.out
    current <- previous[previous %in% colnames(x)]

    # then compare columns in current call with columns in original call
    new_cols <- colnames(x)[!colnames(x) %in% AMR_env$get_column_abx.checked_cols]
    if (length(new_cols) > 0) {
      # these columns did not exist in the last call, so add them
      new_cols_sir <- get_column_abx(x[, new_cols, drop = FALSE], reuse_previous_result = FALSE, info = FALSE, sort = FALSE)
      current <- c(current, new_cols_sir)
      # order according to columns in current call
      current <- current[match(colnames(x)[colnames(x) %in% current], current)]
    }

    # update pkg environment to improve speed on next run
    AMR_env$get_column_abx.out <- current
    AMR_env$get_column_abx.checked_cols <- colnames(x)

    # and return right values
    return(AMR_env$get_column_abx.out)
  }

  meet_criteria(x, allow_class = "data.frame")
  meet_criteria(soft_dependencies, allow_class = "character", allow_NULL = TRUE)
  meet_criteria(hard_dependencies, allow_class = "character", allow_NULL = TRUE)
  meet_criteria(verbose, allow_class = "logical", has_length = 1)
  meet_criteria(info, allow_class = "logical", has_length = 1)
  meet_criteria(only_sir_columns, allow_class = "logical", has_length = 1)
  meet_criteria(sort, allow_class = "logical", has_length = 1)

  if (isTRUE(info)) {
    message_("Auto-guessing columns suitable for analysis", appendLF = FALSE, as_note = FALSE)
  }

  x <- as.data.frame(x, stringsAsFactors = FALSE)
  # keep the unfiltered input, its column names are stored as 'checked' later on
  x.bak <- x
  if (only_sir_columns == TRUE) {
    x <- x[, which(is.sir(x)), drop = FALSE]
  }

  if (NROW(x) > 10000) {
    # only test maximum of 10,000 values per column
    if (isTRUE(info)) {
      message_(" (using only ", font_bold("the first 10,000 rows"), ")...",
        appendLF = FALSE,
        as_note = FALSE
      )
    }
    x <- x[1:10000, , drop = FALSE]
  } else if (isTRUE(info)) {
    message_("...", appendLF = FALSE, as_note = FALSE)
  }

  # only check columns that are a valid AB code, ATC code, name, abbreviation or synonym,
  # or already have the 'sir' class (as.sir)
  # and that they have no more than 50% invalid values
  vectr_antibiotics <- unlist(AMR_env$AB_lookup$generalised_all)
  vectr_antibiotics <- vectr_antibiotics[!is.na(vectr_antibiotics) & nchar(vectr_antibiotics) >= 3]
  x_columns <- vapply(
    FUN.VALUE = character(1),
    colnames(x),
    function(col) {
      if (generalise_antibiotic_name(col) %in% vectr_antibiotics ||
        is.sir(x[, col, drop = TRUE]) ||
        is_sir_eligible(x[, col, drop = TRUE], threshold = 0.5)
      ) {
        return(col)
      } else {
        return(NA_character_)
      }
    }, USE.NAMES = FALSE
  )

  x_columns <- x_columns[!is.na(x_columns)]
  x <- x[, x_columns, drop = FALSE] # without drop = FALSE, x will become a vector when x_columns is length 1
  df_trans <- data.frame(
    colnames = colnames(x),
    abcode = suppressWarnings(as.ab(colnames(x), info = FALSE)),
    stringsAsFactors = FALSE
  )
  df_trans <- df_trans[!is.na(df_trans$abcode), , drop = FALSE]
  out <- as.character(df_trans$colnames)
  names(out) <- df_trans$abcode

  # add from self-defined dots (...):
  # such as get_column_abx(example_isolates %>% rename(thisone = AMX), amox = "thisone")
  all_okay <- TRUE
  dots <- list(...)
  # remove data.frames, since this is also used running `eucast_rules(eucast_rules_df = df)`
  dots <- dots[!vapply(FUN.VALUE = logical(1), dots, is.data.frame)]
  if (length(dots) > 0) {
    newnames <- suppressWarnings(as.ab(names(dots), info = FALSE))
    if (anyNA(newnames)) {
      if (isTRUE(info)) {
        message_(" WARNING", add_fn = list(font_yellow, font_bold), as_note = FALSE)
      }
      warning_("Invalid antibiotic reference(s): ", vector_and(names(dots)[is.na(newnames)], quotes = FALSE),
        call = FALSE,
        immediate = TRUE
      )
      all_okay <- FALSE
    }
    unexisting_cols <- which(!vapply(FUN.VALUE = logical(1), dots, function(col) all(col %in% x_columns)))
    if (length(unexisting_cols) > 0) {
      if (isTRUE(info)) {
        message_(" ERROR", add_fn = list(font_red, font_bold), as_note = FALSE)
      }
      # use `[` and not `[[`: with more than one missing column, `[[` would index
      # recursively and fail with an unrelated 'subscript out of bounds' error
      stop_("Column(s) not found: ", vector_and(unlist(dots[unexisting_cols]), quotes = FALSE),
        call = FALSE
      )
      all_okay <- FALSE
    }
    # turn all NULLs to NAs
    dots <- unlist(lapply(dots, function(dot) if (is.null(dot)) NA else dot))
    names(dots) <- newnames
    dots <- dots[!is.na(names(dots))]
    # merge, but overwrite automatically determined ones by 'dots'
    out <- c(out[!out %in% dots & !names(out) %in% names(dots)], dots)
    # delete NAs, this will make e.g. eucast_rules(... TMP = NULL) work to prevent TMP from being used
    out <- out[!is.na(out)]
  }

  if (length(out) == 0) {
    if (isTRUE(info) && all_okay == TRUE) {
      message_("No columns found.")
    }
    AMR_env$get_column_abx.call <- unique_call_id(entire_session = FALSE, match_fn = fn)
    AMR_env$get_column_abx.checked_cols <- colnames(x.bak)
    AMR_env$get_column_abx.out <- out
    return(out)
  }

  # sort on name
  if (sort == TRUE) {
    out <- out[order(names(out), out)]
  }

  if (return_all == FALSE) {
    # only keep the first hits, no duplicates
    duplicates <- c(out[duplicated(names(out))], out[duplicated(unname(out))])
    if (length(duplicates) > 0) {
      all_okay <- FALSE
    }

    if (isTRUE(info)) {
      if (all_okay == TRUE) {
        message_(" OK.", add_fn = list(font_green, font_bold), as_note = FALSE)
      } else {
        message_(" WARNING.", add_fn = list(font_yellow, font_bold), as_note = FALSE)
      }
      for (i in seq_along(out)) {
        if (isTRUE(verbose) && !names(out[i]) %in% names(duplicates)) {
          message_(
            "Using column '", font_bold(out[i]), "' as input for ", names(out)[i],
            " (", ab_name(names(out)[i], tolower = TRUE, language = NULL), ")."
          )
        }
        if (names(out[i]) %in% names(duplicates)) {
          # the first antibiotic code that this same column was assigned to
          already_set_as <- out[unname(out) == unname(out[i])][1L]
          if (names(out)[i] != names(already_set_as)) {
            warning_(
              paste0(
                "Column '", font_bold(out[i]), "' will not be used for ",
                names(out)[i], " (", ab_name(names(out)[i], tolower = TRUE, language = NULL), ")",
                ", as it is already set for ",
                names(already_set_as), " (", ab_name(names(already_set_as), tolower = TRUE, language = NULL), ")"
              ),
              add_fn = font_red,
              immediate = verbose
            )
          }
        }
      }
    }

    out <- out[!duplicated(names(out))]
    out <- out[!duplicated(unname(out))]
    if (sort == TRUE) {
      out <- out[order(names(out), out)]
    }
  }

  if (!is.null(hard_dependencies)) {
    hard_dependencies <- unique(hard_dependencies)
    if (!all(hard_dependencies %in% names(out))) {
      # missing a hard dependency will return NA and consequently the data will not be analysed
      missing <- hard_dependencies[!hard_dependencies %in% names(out)]
      generate_warning_abs_missing(missing, any = FALSE)
      return(NA)
    }
  }
  if (!is.null(soft_dependencies)) {
    soft_dependencies <- unique(soft_dependencies)
    if (isTRUE(info) && !all(soft_dependencies %in% names(out))) {
      # missing a soft dependency may lower the reliability
      missing <- soft_dependencies[!soft_dependencies %in% names(out)]
      missing_msg <- vector_and(
        paste0(
          ab_name(missing, tolower = TRUE, language = NULL),
          " (", font_bold(missing, collapse = NULL), ")"
        ),
        quotes = FALSE
      )
      message_(
        "Reliability would be improved if these antimicrobial results would be available too: ",
        missing_msg
      )
    }
  }

  # store the result in the package environment, so it can be reused in a next run
  AMR_env$get_column_abx.call <- unique_call_id(entire_session = FALSE, match_fn = fn)
  AMR_env$get_column_abx.checked_cols <- colnames(x.bak)
  AMR_env$get_column_abx.out <- out
  out
}

# Internal helper: translate a comma-separated text of antibiotic codes and/or
# antibiotic group names into the matching elements of `cols_ab`.
#
# Arguments:
#   x        character; entries are split on ",", upper-cased, de-duplicated and
#            trimmed. Each entry must be either a group for which an object
#            `AB_<entry>` exists in the AMR namespace, or a code that occurs in
#            `AMR_env$AB_lookup$ab`; anything else raises an error.
#   cols_ab  named vector that comes from get_column_abx()
#
# Returns the elements of `cols_ab` whose names match the resolved codes; codes
# without a matching element are dropped.
get_ab_from_namespace <- function(x, cols_ab) {
  # cols_ab comes from get_column_abx()

  x <- trimws2(unique(toupper(unlist(strsplit(x, ",", fixed = TRUE)))))

  # list the namespace objects only once, not for every element of `x`
  amr_namespace <- asNamespace("AMR")
  namespace_objects <- ls(envir = amr_namespace)

  x_new <- character()
  for (val in x) {
    group_object <- paste0("AB_", val)
    if (group_object %in% namespace_objects) {
      # antibiotic group names, as defined in data-raw/_pre_commit_checks.R, such as `AB_CARBAPENEMS`
      # retrieved by name with get() rather than evaluating parsed text
      val <- get(group_object, envir = amr_namespace, inherits = FALSE)
    } else if (val %in% AMR_env$AB_lookup$ab) {
      # separate drugs, such as `AMX`
      val <- as.ab(val)
    } else {
      stop_("unknown antimicrobial drug (group): ", val, call = FALSE)
    }
    x_new <- c(x_new, val)
  }
  x_new <- unique(x_new)
  out <- cols_ab[match(x_new, names(cols_ab))]
  out[!is.na(out)]
}

# Internal helper: throw an immediate warning that NAs are introduced because
# required antimicrobials are not available.
#
# Arguments:
#   missing  antimicrobial codes that are missing; each is printed together with
#            its lower-case name as given by ab_name()
#   any      if TRUE the text reads "any of these antimicrobials is required",
#            otherwise "these antimicrobials are required"
generate_warning_abs_missing <- function(missing, any = FALSE) {
  # e.g. "<code> (<name>)" for every missing antimicrobial
  labelled <- paste0(missing, " (", ab_name(missing, tolower = TRUE, language = NULL), ")")

  # element 1 is the quantifier, element 2 the verb matching it
  any_txt <- if (any == TRUE) c(" any of", "is") else c("", "are")

  warning_text <- paste0(
    "Introducing NAs since", any_txt[1], " these antimicrobials ", any_txt[2], " required: ",
    vector_and(labelled, quotes = FALSE)
  )
  warning_(warning_text, immediate = TRUE)
}