/
wrapper-anticlustering.R
462 lines (445 loc) · 21.6 KB
/
wrapper-anticlustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
#' Anticlustering
#'
#' Partition a pool of elements into groups (i.e., anticlusters) with
#' the aim of creating high within-group heterogeneity and high
#' between-group similarity. Anticlustering is accomplished by
#' maximizing instead of minimizing a clustering objective function.
#' Implements anticlustering methods as described in Papenberg and
#' Klau (2021; <doi:10.1037/met0000301>), Brusco et al.
#' (2020; <doi:10.1111/bmsp.12186>), and Papenberg (2024;
#' <doi:10.1111/bmsp.12315>).
#'
#' @param x The data input. Can be one of two structures: (1) A
#' feature matrix where rows correspond to elements and columns
#' correspond to variables (a single numeric variable can be
#' passed as a vector). (2) An N x N dissimilarity matrix;
#' can be an object of class \code{dist} (e.g., returned by
#' \code{\link{dist}} or \code{\link{as.dist}}) or a \code{matrix}
#' where the entries of the upper and lower triangular matrix
#' represent pairwise dissimilarities.
#' @param K How many anticlusters should be created. Alternatively:
#' (a) A vector describing the size of each group, or (b) a vector
#' of length \code{nrow(x)} describing how elements are assigned
#' to anticlusters before the optimization starts.
#' @param objective The objective to be maximized. The options
#' "diversity" (default; previously called "distance", which is
#' still supported), "variance", "kplus" and "dispersion" are
#' natively supported. May also be a user-defined function. See
#' Details.
#' @param method One of "exchange" (default), "local-maximum",
#' "brusco", or "ilp". See Details.
#' @param preclustering Boolean. Should a preclustering be conducted
#' before anticlusters are created? Defaults to \code{FALSE}. See
#' Details.
#' @param categories A vector, data.frame or matrix representing one
#' or several categorical variables whose distribution should be similar
#' between groups. See Details.
#' @param repetitions The number of times a search heuristic is
#' initiated when using \code{method = "exchange"}, \code{method =
#' "local-maximum"}, or \code{method = "brusco"}. In the end, the
#' best objective found across the repetitions is returned.
#' @param standardize Boolean. If \code{TRUE} and \code{x} is a
#' feature matrix, the data is standardized through a call to
#' \code{\link{scale}} before the optimization starts. This
#' argument is silently ignored if \code{x} is a distance matrix.
#'
#'
#' @return A vector of length N that assigns a group (i.e., a number
#' between 1 and \code{K}) to each input element.
#'
#' @importFrom Matrix sparseMatrix
#' @importFrom stats as.dist dist sd
#'
#' @useDynLib anticlust, .registration = TRUE
#'
#' @export
#'
#' @author
#' Martin Papenberg \email{martin.papenberg@@hhu.de}
#'
#' @details
#'
#' This function is used to solve anticlustering. That is, the data
#' input is divided into \code{K} groups in such a way that elements
#' within groups are heterogeneous and the different groups are
#' similar. Anticlustering is accomplished by maximizing instead of
#' minimizing a clustering objective function. The maximization of
#' four clustering objective functions is natively supported (other
#' functions can also be defined by the user as described below):
#'
#' \itemize{
#' \item{the `diversity`, setting
#' \code{objective = "diversity"} (this is the default objective)}
#' \item{k-means `variance` objective, setting \code{objective = "variance"}}
#' \item{`k-plus` objective, an extension of the k-means variance criterion,
#' setting \code{objective = "kplus"}}
#' \item{the `dispersion` objective is the minimum distance between
#' any two elements within the same cluster (setting
#' \code{objective = "dispersion"})}
#' }
#'
#' The k-means objective is the within-group variance---that is, the
#' sum of the squared distances between each element and its cluster
#' center (see \code{\link{variance_objective}}). K-means
#' anticlustering focuses on minimizing differences with regard to the
#' means of the input variables (that is, the columns in \code{x}), but it ignores any other distribution
#' characteristics such as the variance / standard deviation. K-plus anticlustering
#' (using \code{objective = "kplus"}) is an extension of the k-means criterion that also
#' minimizes differences with regard to the standard
#' deviations between groups (for details see \code{\link{kplus_anticlustering}}). K-plus
#' anticlustering can also be extended towards higher order moments such as skew and kurtosis;
#' to consider these additional distribution characteristics, use the function
#' \code{\link{kplus_anticlustering}}. Setting \code{objective = "kplus"} in
#' \code{anticlustering} function will only consider means
#' and standard deviations (in my experience, this is what users usually want).
#' It is strongly recommended to set the argument \code{standardize = TRUE}
#' when using the k-plus objective.
#'
#' The "diversity" objective is the sum of pairwise
#' distances of elements within the same groups (see
#' \code{\link{diversity_objective}}). Hence, anticlustering using the diversity
#' criterion maximizes between-group similarity
#' by maximizing within-group heterogeneity (represented as the sum of all pairwise distances).
#' If it is computed on the basis of the Euclidean distance (which is the default
#' behaviour if \code{x} is a feature matrix), the diversity is an all-rounder objective that tends to equalize all distribution
#' characteristics between groups (such as means, variances, ...).
#' Note that the equivalence of within-group heterogeneity and between-group similarity only
#' holds for equal-sized groups. For unequal-sized groups, it is recommended to
#' use a different objective when striving for overall between-group similarity
#' (e.g., the k-plus objective). In the publication that introduces
#' the \code{anticlust} package (Papenberg & Klau, 2021), we used the term "anticluster
#' editing" to refer to the maximization of the diversity, because the reversed
#' procedure - minimizing the diversity - is also known as "cluster editing".
#'
#' The "dispersion" is the minimum distance between any two elements
#' that are part of the same cluster; maximization of this objective
#' ensures that any two elements within the same group are as
#' dissimilar from each other as possible. Applications that require
#' high within-group heterogeneity often require to maximize the
#' dispersion. Oftentimes, it is useful to also consider the diversity
#' and not only the dispersion; to optimize both objectives at the
#' same time, see the function
#' \code{\link{bicriterion_anticlustering}}.
#'
#' If the data input \code{x} is a feature matrix (that is: each row
#' is a "case" and each column is a "variable") and the option
#' \code{objective = "diversity"} or \code{objective = "dispersion"} is used,
#' the Euclidean distance is computed as the basic unit of the objectives. If
#' a different measure of dissimilarity is preferred, you may pass a
#' self-generated dissimilarity matrix via the argument \code{x}.
#'
#' In the standard case, groups of equal size are generated. Adjust
#' the argument \code{K} to create groups of different size (see
#' Examples).
#'
#' \strong{Algorithms for anticlustering}
#'
#' By default, a heuristic method is employed for anticlustering: the
#' exchange method (\code{method = "exchange"}). First, elements are
#' randomly assigned to anticlusters (It is also possible to
#' explicitly specify the initial assignment using the argument
#' \code{K}; in this case, \code{K} has length \code{nrow(x)}.) Based
#' on the initial assignment, elements are systematically swapped
#' between anticlusters in such a way that each swap improves the
#' objective value. For an element, each possible swap with elements
#' in other clusters is simulated; then, the one swap is performed
#' that improves the objective the most, but a swap is only conducted
#' if there is an improvement at all. This swapping procedure is
#' repeated for each element. When using \code{method =
#' "local-maximum"}, the exchange method does not terminate after the
#' first iteration over all elements; instead, the swapping continues
#' until a local maximum is reached. This method corresponds to the algorithm
#' "LCW" by Weitz and Lakshminarayanan (1998). This means that after the
#' exchange process has been conducted once for each data point, the
#' algorithm restarts with the first element and proceeds to conduct
#' exchanges until the objective cannot be improved.
#'
#' When setting \code{preclustering = TRUE}, only the \code{K - 1}
#' most similar elements serve as exchange partners for each element,
#' which can speed up the optimization (more information
#' on the preclustering heuristic follows below). If the \code{categories} argument
#' is used, only elements having the same value in \code{categories} serve as exchange
#' partners.
#'
#' Using \code{method = "brusco"} implements the local bicriterion
#' iterated local search (BILS) heuristic by Brusco et al. (2020) and
#' returns the partition that best optimized either the diversity or
#' the dispersion during the optimization process. The function
#' \code{\link{bicriterion_anticlustering}} can also be used to run
#' the algorithm by Brusco et al., but it returns multiple partitions
#' that approximate the optimal pareto efficient set according to both
#' objectives (diversity and dispersion). Thus, to fully utilize the
#' BILS algorithm, use the function
#' \code{\link{bicriterion_anticlustering}}.
#'
#' \strong{Optimal anticlustering}
#'
#' Usually, heuristics are employed to tackle anticlustering problems,
#' and their performance is generally very satisfying. However,
#' heuristics do not investigate all possible group assignments and
#' therefore do not (necessarily) find the
#' "globally optimal solution", i.e., a partitioning that has the best
#' possible value with regard to the objective that is optimized. Enumerating
#' all possible partitions in order to find the best solution,
#' however, quickly becomes impossible with increasing N, and
#' therefore it is not possible to find a global optimum this
#' way. Because all anticlustering problems considered here are also
#' NP-hard, there is also no (known) clever algorithm that might
#' identify the best solution without considering all possibilities -
#' at least in the worst case. Integer linear programming (ILP) is an
#' approach for tackling NP hard problems that nevertheless tries to
#' be clever when finding optimal solutions: It does not necessarily
#' enumerate all possibilities but is still guaranteed to return the
#' optimal solution. Still, for NP hard problems such as
#' anticlustering, ILP methods will also fail at some point (i.e.,
#' when N increases).
#'
#' For the objectives \code{diversity} and \code{dispersion},
#' \code{anticlust} implements optimal solution algorithms via integer
#' linear programming. In order to use the ILP methods, set
#' \code{method = "ilp"}. The integer linear program optimizing the
#' diversity was described in Papenberg & Klau, (2021; (8) -
#' (12)). The documentation of the function
#' \code{\link{optimal_dispersion}} has more information on the
#' optimal maximization of the dispersion (this is the function that is called internally by
#' anticlustering() when using \code{objective = "dispersion"} and
#' \code{method = "ilp"}). The ILP methods either require the R
#' package \code{Rglpk} and the GNU linear programming kit
#' (<http://www.gnu.org/software/glpk/>), or the R package
#' \code{Rsymphony} and the COIN-OR SYMPHONY solver libraries
#' (<https://github.com/coin-or/SYMPHONY>). The function will try to
#' find the GLPK or SYMPHONY solver and throw an error if none is
#' available. If both are found, the GLPK solver is used. Use the functions
#' \code{\link{optimal_anticlustering}} or \code{\link{optimal_dispersion}}
#' to manually select a solver.
#'
#' Optimally maximizing the diversity only works for rather small N
#' and K; N = 20 and K = 2 is usually solved within some seconds, but
#' the run time quickly increases with increasing N (or K). The
#' maximum dispersion problem can be solved for much larger instances,
#' especially for K = 2 (which in theory is not even NP hard; note
#' that for the diversity, K = 2 is already NP hard). For K = 3, and K
#' = 4, several 100 elements can usually be processed, especially when
#' installing the SYMPHONY solver.
#'
#' \strong{Preclustering}
#'
#' A useful heuristic for anticlustering is to form small groups of
#' very similar elements and assign these to different groups. This
#' logic is used as a preprocessing when setting \code{preclustering =
#' TRUE}. That is, before the anticlustering objective is optimized, a
#' cluster analysis identifies small groups of similar elements (pairs
#' if \code{K = 2}, triplets if \code{K = 3}, and so forth). The
#' optimization of the anticlustering objective is then conducted
#' under the constraint that these matched elements cannot be assigned
#' to the same group. When using the exchange algorithm, preclustering
#' is conducted using a call to \code{\link{matching}}. When using
#' \code{method = "ilp"}, the preclustering optimally finds groups of
#' minimum pairwise distance by solving the integer linear program
#' described in Papenberg and Klau (2021; (8) - (10), (12) - (13)).
#' Note that when combining preclustering restrictions with \code{method = "ilp"},
#' the anticlustering result is no longer guaranteed to be globally optimal, but
#' only optimal given the preclustering restrictions.
#'
#' \strong{Categorical variables}
#'
#' The argument \code{categories} may induce categorical constraints,
#' i.e., can be used to distribute categorical variables evenly
#' between sets. The grouping variables indicated by
#' \code{categories} will be balanced out across anticlusters. This
#' functionality is only available for the classical exchange
#' procedures, that is, for \code{method = "exchange"} and
#' \code{method = "local-maximum"}. When \code{categories} has multiple columns
#' (i.e., there are multiple categorical variables), each combination of categories is
#' treated as a distinct category by the exchange method (i.e., the multiple columns
#' are "merged" into a single column). This behaviour may lead
#' to less than optimal results on the level of each single categorical variable.
#'
#' \strong{Optimize a custom objective function}
#'
#' It is possible to pass a \code{function} to the argument
#' \code{objective}. See \code{\link{dispersion_objective}} for an
#' example. If \code{objective} is a function, the exchange method
#' assigns elements to anticlusters in such a way that the return
#' value of the custom function is maximized (hence, the function
#' should return larger values when the between-group similarity is
#' higher). The custom function has to take two arguments: the first
#' is the data argument, the second is the clustering assignment. That
#' is, the argument \code{x} will be passed down to the user-defined
#' function as first argument. \strong{However, only after}
#' \code{\link{as.matrix}} has been called on \code{x}. This implies
#' that in the function body, columns of the data set cannot be
#' accessed using \code{data.frame} operations such as
#' \code{$}. Objects of class \code{dist} will be converted to matrix
#' as well.
#'
#'
#' @examples
#'
#' # Optimize the default diversity criterion
#' anticlusters <- anticlustering(
#' schaper2019[, 3:6],
#' K = 3,
#' categories = schaper2019$room
#' )
#' # Compare feature means by anticluster
#' by(schaper2019[, 3:6], anticlusters, function(x) round(colMeans(x), 2))
#' # Compare standard deviations by anticluster
#' by(schaper2019[, 3:6], anticlusters, function(x) round(apply(x, 2, sd), 2))
#' # check that the "room" is balanced across anticlusters:
#' table(anticlusters, schaper2019$room)
#'
#' # Use multiple starts of the algorithm to improve the objective and
#' # optimize the k-means criterion ("variance")
#' anticlusters <- anticlustering(
#' schaper2019[, 3:6],
#' objective = "variance",
#' K = 3,
#' categories = schaper2019$room,
#' method = "local-maximum",
#' repetitions = 2
#' )
#' # Compare means and standard deviations by anticluster
#' by(schaper2019[, 3:6], anticlusters, function(x) round(colMeans(x), 2))
#' by(schaper2019[, 3:6], anticlusters, function(x) round(apply(x, 2, sd), 2))
#'
#' # Use different group sizes and optimize the extended k-means
#' # criterion ("kplus")
#' anticlusters <- anticlustering(
#' schaper2019[, 3:6],
#' objective = "kplus",
#' K = c(24, 24, 48),
#' categories = schaper2019$room,
#' repetitions = 10,
#' method = "local-maximum",
#' standardize = TRUE
#' )
#'
#' table(anticlusters, schaper2019$room)
#' # Compare means and standard deviations by anticluster
#' by(schaper2019[, 3:6], anticlusters, function(x) round(colMeans(x), 2))
#' by(schaper2019[, 3:6], anticlusters, function(x) round(apply(x, 2, sd), 2))
#'
#'
#' @references
#'
#' Brusco, M. J., Cradit, J. D., & Steinley, D. (2020). Combining
#' diversity and dispersion criteria for anticlustering: A bicriterion
#' approach. British Journal of Mathematical and Statistical
#' Psychology, 73, 375-396. https://doi.org/10.1111/bmsp.12186
#'
#' Papenberg, M., & Klau, G. W. (2021). Using anticlustering to partition
#' data sets into equivalent parts. Psychological Methods, 26(2),
#' 161–174. https://doi.org/10.1037/met0000301.
#'
#' Papenberg, M. (2024). K-plus Anticlustering: An Improved k-means Criterion for
#' Maximizing Between-Group Similarity. British Journal of Mathematical and
#' Statistical Psychology, 77(1), 80-102. https://doi.org/10.1111/bmsp.12315
#'
#' Späth, H. (1986). Anticlustering: Maximizing the variance criterion.
#' Control and Cybernetics, 15, 213-218.
#'
#' Weitz, R. R., & Lakshminarayanan, S. (1998). An empirical comparison of
#' heuristic methods for creating maximally diverse groups. Journal of the
#' Operational Research Society, 49(6), 635-646. https://doi.org/10.1057/palgrave.jors.2600510
#'
# Main user-facing wrapper: validates input, preprocesses the data
# (matrix conversion, optional standardization, k-plus augmentation),
# and dispatches to the requested optimization method. See the roxygen
# documentation above for the full contract.
anticlustering <- function(x, K, objective = "diversity", method = "exchange",
                           preclustering = FALSE, categories = NULL,
                           repetitions = NULL, standardize = FALSE) {
  ## Get data into required format
  input_validation_anticlustering(x, K, objective, method, preclustering,
                                  categories, repetitions, standardize)
  # `x` is now either a feature matrix or a full (symmetric) distance matrix
  x <- to_matrix(x)
  # there is a reason why scaling happens here and below (because of ILP + kplus):
  # the ILP branch returns before the k-plus augmentation, so it needs
  # standardized data here; the k-plus branch appends extra columns below,
  # which must be standardized as well, hence the second scale() call.
  if (!is_distance_matrix(x) && standardize == TRUE) {
    x <- scale(x)
  }
  ## Exact method using ILP
  if (method == "ilp") {
    if (objective == "dispersion") {
      return(optimal_dispersion(x, K)$groups)
    }
    return(exact_anticlustering(x, K, preclustering))
  }
  # Preclustering and categorical constraints are both processed in the
  # variable `categories` after this step:
  categories <- get_categorical_constraints(x, K, preclustering, categories)
  if (!inherits(objective, "function")) {
    # k-plus is implemented as k-means ("variance") on an augmented matrix
    # that also contains the squared deviations from the column means
    if (objective == "kplus") {
      x <- cbind(x, squared_from_mean(x))
      objective <- "variance"
    }
    # "distance" is the deprecated name of the "diversity" objective
    if (objective == "distance") {
      objective <- "diversity"
    }
  }
  # Second standardization pass, covering the k-plus columns appended above
  # (re-scaling already standardized columns leaves them unchanged)
  if (!is_distance_matrix(x) && standardize == TRUE) {
    x <- scale(x)
  }
  # In some cases, `anticlustering()` has to be called repeatedly -
  # redirect to `repeat_anticlustering()` in this case, which then
  # again calls anticlustering with method "exchange" and
  # repetitions = NULL
  if (method == "local-maximum" ||
      (method == "exchange" && argument_exists(repetitions))) {
    if (!argument_exists(repetitions)) {
      repetitions <- 1
    }
    return(repeat_anticlustering(x, K, objective, categories, method, repetitions))
  }
  if (method == "brusco") {
    # BILS heuristic (Brusco et al., 2020): run the bicriterion algorithm
    # and return the partition that is best with regard to the single
    # requested objective. NOTE(review): the weight vectors are tuning
    # constants, presumably emphasizing diversity vs. dispersion,
    # respectively - confirm against Brusco et al. (2020).
    if (objective == "diversity") {
      weights <- c(0.5, 0.99, 0.999, 0.999999)
      obj_fun <- diversity_objective_
    } else if (objective == "dispersion") {
      weights <- c(0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1)
      obj_fun <- dispersion_objective_
    }
    partitions <- as.matrix(bicriterion_anticlustering(x, K, repetitions, weights))
    # get best partition wrt dispersion / diversity
    best_obj <- which.max(apply(partitions, 1, obj_fun, convert_to_distances(x)))
    return(partitions[best_obj, ])
  }
  if (inherits(objective, "function")) {
    # most generic exchange method, deals with any objective function
    return(exchange_method(x, K, objective, categories))
  }
  # Redirect to specialized fast exchange methods for diversity and
  # variance objectives
  c_anticlustering(x, K, categories, objective)
}
# Coerce the data input into the matrix representation used by the
# optimization routines. Distance input (a `dist` object or a square
# dissimilarity matrix) is expanded to a full symmetric matrix; feature
# input is returned as a plain matrix.
to_matrix <- function(data) {
  if (is_distance_matrix(data)) {
    return(as.matrix(as.dist(data)))
  }
  as.matrix(data)
}
# Determines if preclustering constraints or categorical constraints
# are present. Returns a grouping vector if one or both constraints
# have been passed, or NULL if none is required.
#
# Fix: removed the unused local `N <- nrow(data)` (dead code) and used
# isTRUE() for the scalar flag check.
get_categorical_constraints <- function(data, K, preclustering, categories) {
  if (isTRUE(preclustering)) {
    matches <- matching(data, p = K, match_within = categories, sort_output = FALSE)
    # deal with NA in matches (unmatched elements get their own group index)
    return(replace_na_by_index(matches))
  }
  if (argument_exists(categories)) {
    return(merge_into_one_variable(categories))
  }
  NULL
}
# Replace NA entries in a grouping vector by fresh, unique group indices
# (each NA element becomes its own singleton group).
#
# Fix: when ALL entries are NA, max(matches, na.rm = TRUE) warns and
# returns -Inf, which previously produced -Inf "group indices". Start
# counting from 0 in that degenerate case instead.
replace_na_by_index <- function(matches) {
  na_matches <- is.na(matches)
  NAs <- sum(na_matches)
  if (NAs == 0) {
    return(matches)
  }
  max_group <- if (all(na_matches)) 0 else max(matches, na.rm = TRUE)
  matches[na_matches] <- max_group + seq_len(NAs)
  matches
}