/
wrapper-anticlustering.R
462 lines (445 loc) · 21.6 KB
/
wrapper-anticlustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
#' Anticlustering
#'
#' Partition a pool of elements into groups (i.e., anticlusters) with
#' the aim of creating high within-group heterogeneity and high
#' between-group similarity. Anticlustering is accomplished by
#' maximizing instead of minimizing a clustering objective function.
#' Implements anticlustering methods as described in Papenberg and
#' Klau (2021; <doi:10.1037/met0000301>), Brusco et al.
#' (2020; <doi:10.1111/bmsp.12186>), and Papenberg (2024;
#' <doi:10.1111/bmsp.12315>).
#'
#' @param x The data input. Can be one of two structures: (1) A
#' feature matrix where rows correspond to elements and columns
#' correspond to variables (a single numeric variable can be
#' passed as a vector). (2) An N x N dissimilarity matrix;
#' can be an object of class \code{dist} (e.g., returned by
#' \code{\link{dist}} or \code{\link{as.dist}}) or a \code{matrix}
#' where the entries of the upper and lower triangular matrix
#' represent pairwise dissimilarities.
#' @param K How many anticlusters should be created. Alternatively:
#' (a) A vector describing the size of each group, or (b) a vector
#' of length \code{nrow(x)} describing how elements are assigned
#' to anticlusters before the optimization starts.
#' @param objective The objective to be maximized. The options
#' "diversity" (default; previously called "distance", which is
#' still supported), "variance", "kplus" and "dispersion" are
#' natively supported. May also be a user-defined function. See
#' Details.
#' @param method One of "exchange" (default), "local-maximum",
#' "brusco", or "ilp". See Details.
#' @param preclustering Boolean. Should a preclustering be conducted
#' before anticlusters are created? Defaults to \code{FALSE}. See
#' Details.
#' @param categories A vector, data.frame or matrix representing one
#' or several categorical variables whose distribution should be similar
#' between groups. See Details.
#' @param repetitions The number of times a search heuristic is
#' initiated when using \code{method = "exchange"}, \code{method =
#' "local-maximum"}, or \code{method = "brusco"}. In the end, the
#' best objective found across the repetitions is returned.
#' @param standardize Boolean. If \code{TRUE} and \code{x} is a
#' feature matrix, the data is standardized through a call to
#' \code{\link{scale}} before the optimization starts. This
#' argument is silently ignored if \code{x} is a distance matrix.
#'
#'
#' @return A vector of length N that assigns a group (i.e., a number
#' between 1 and \code{K}) to each input element.
#'
#' @importFrom Matrix sparseMatrix
#' @importFrom stats as.dist dist sd
#'
#' @useDynLib anticlust, .registration = TRUE
#'
#' @export
#'
#' @author
#' Martin Papenberg \email{martin.papenberg@@hhu.de}
#'
#' @details
#'
#' This function is used to solve anticlustering. That is, the data
#' input is divided into \code{K} groups in such a way that elements
#' within groups are heterogeneous and the different groups are
#' similar. Anticlustering is accomplished by maximizing instead of
#' minimizing a clustering objective function. The maximization of
#' four clustering objective functions is natively supported (other
#' functions can also be defined by the user as described below):
#'
#' \itemize{
#' \item{the `diversity`, setting
#' \code{objective = "diversity"} (this is the default objective)}
#' \item{k-means `variance` objective, setting \code{objective = "variance"}}
#' \item{`k-plus` objective, an extension of the k-means variance criterion,
#' setting \code{objective = "kplus"}}
#' \item{the `dispersion` objective is the minimum distance between
#' any two elements within the same cluster (setting
#' \code{objective = "dispersion"})}
#' }
#'
#' The k-means objective is the within-group variance---that is, the
#' sum of the squared distances between each element and its cluster
#' center (see \code{\link{variance_objective}}). K-means
#' anticlustering focuses on minimizing differences with regard to the
#' means of the input variables (that is, the columns in \code{x}), but it ignores any other distribution
#' characteristics such as the variance / standard deviation. K-plus anticlustering
#' (using \code{objective = "kplus"}) is an extension of the k-means criterion that also
#' minimizes differences with regard to the standard
#' deviations between groups (for details see \code{\link{kplus_anticlustering}}). K-plus
#' anticlustering can also be extended towards higher order moments such as skew and kurtosis;
#' to consider these additional distribution characteristics, use the function
#' \code{\link{kplus_anticlustering}}. Setting \code{objective = "kplus"} in
#' \code{anticlustering} function will only consider means
#' and standard deviations (in my experience, this is what users usually want).
#' It is strongly recommended to set the argument \code{standardize = TRUE}
#' when using the k-plus objective.
#'
#' The "diversity" objective is the sum of pairwise
#' distances of elements within the same groups (see
#' \code{\link{diversity_objective}}). Hence, anticlustering using the diversity
#' criterion maximizes between-group similarity
#' by maximizing within-group heterogeneity (represented as the sum of all pairwise distances).
#' If it is computed on the basis of the Euclidean distance (which is the default
#' behaviour if \code{x} is a feature matrix), the diversity is an all-rounder objective that tends to equalize all distribution
#' characteristics between groups (such as means, variances, ...).
#' Note that the equivalence of within-group heterogeneity and between-group similarity only
#' holds for equal-sized groups. For unequal-sized groups, it is recommended to
#' use a different objective when striving for overall between-group similarity
#' (e.g., the k-plus objective). In the publication that introduces
#' the \code{anticlust} package (Papenberg & Klau, 2021), we used the term "anticluster
#' editing" to refer to the maximization of the diversity, because the reversed
#' procedure - minimizing the diversity - is also known as "cluster editing".
#'
#' The "dispersion" is the minimum distance between any two elements
#' that are part of the same cluster; maximization of this objective
#' ensures that any two elements within the same group are as
#' dissimilar from each other as possible. Applications that require
#' high within-group heterogeneity often require to maximize the
#' dispersion. Oftentimes, it is useful to also consider the diversity
#' and not only the dispersion; to optimize both objectives at the
#' same time, see the function
#' \code{\link{bicriterion_anticlustering}}.
#'
#' If the data input \code{x} is a feature matrix (that is: each row
#' is a "case" and each column is a "variable") and the option
#' \code{objective = "diversity"} or \code{objective = "dispersion"} is used,
#' the Euclidean distance is computed as the basic unit of the objectives. If
#' a different measure of dissimilarity is preferred, you may pass a
#' self-generated dissimilarity matrix via the argument \code{x}.
#'
#' In the standard case, groups of equal size are generated. Adjust
#' the argument \code{K} to create groups of different size (see
#' Examples).
#'
#' \strong{Algorithms for anticlustering}
#'
#' By default, a heuristic method is employed for anticlustering: the
#' exchange method (\code{method = "exchange"}). First, elements are
#' randomly assigned to anticlusters (It is also possible to
#' explicitly specify the initial assignment using the argument
#' \code{K}; in this case, \code{K} has length \code{nrow(x)}.) Based
#' on the initial assignment, elements are systematically swapped
#' between anticlusters in such a way that each swap improves the
#' objective value. For an element, each possible swap with elements
#' in other clusters is simulated; then, the one swap is performed
#' that improves the objective the most, but a swap is only conducted
#' if there is an improvement at all. This swapping procedure is
#' repeated for each element. When using \code{method =
#' "local-maximum"}, the exchange method does not terminate after the
#' first iteration over all elements; instead, the swapping continues
#' until a local maximum is reached. This method corresponds to the algorithm
#' "LCW" by Weitz and Lakshminarayanan (1998). This means that after the
#' exchange process has been conducted once for each data point, the
#' algorithm restarts with the first element and proceeds to conduct
#' exchanges until the objective cannot be improved.
#'
#' When setting \code{preclustering = TRUE}, only the \code{K - 1}
#' most similar elements serve as exchange partners for each element,
#' which can speed up the optimization (more information
#' on the preclustering heuristic follows below). If the \code{categories} argument
#' is used, only elements having the same value in \code{categories} serve as exchange
#' partners.
#'
#' Using \code{method = "brusco"} implements the local bicriterion
#' iterated local search (BILS) heuristic by Brusco et al. (2020) and
#' returns the partition that best optimized either the diversity or
#' the dispersion during the optimization process. The function
#' \code{\link{bicriterion_anticlustering}} can also be used to run
#' the algorithm by Brusco et al., but it returns multiple partitions
#' that approximate the optimal pareto efficient set according to both
#' objectives (diversity and dispersion). Thus, to fully utilize the
#' BILS algorithm, use the function
#' \code{\link{bicriterion_anticlustering}}.
#'
#' \strong{Optimal anticlustering}
#'
#' Usually, heuristics are employed to tackle anticlustering problems,
#' and their performance is generally very satisfying. However,
#' heuristics do not investigate all possible group assignments and
#' therefore do not (necessarily) find the
#' "globally optimal solution", i.e., a partitioning that has the best
#' possible value with regard to the objective that is optimized. Enumerating
#' all possible partitions in order to find the best solution,
#' however, quickly becomes impossible with increasing N, and
#' therefore it is not possible to find a global optimum this
#' way. Because all anticlustering problems considered here are also
#' NP-hard, there is also no (known) clever algorithm that might
#' identify the best solution without considering all possibilities -
#' at least in the worst case. Integer linear programming (ILP) is an
#' approach for tackling NP hard problems that nevertheless tries to
#' be clever when finding optimal solutions: It does not necessarily
#' enumerate all possibilities but is still guaranteed to return the
#' optimal solution. Still, for NP hard problems such as
#' anticlustering, ILP methods will also fail at some point (i.e.,
#' when N increases).
#'
#' For the objectives \code{diversity} and \code{dispersion},
#' \code{anticlust} implements optimal solution algorithms via integer
#' linear programming. In order to use the ILP methods, set
#' \code{method = "ilp"}. The integer linear program optimizing the
#' diversity was described in Papenberg & Klau, (2021; (8) -
#' (12)). The documentation of the function
#' \code{\link{optimal_dispersion}} has more information on the
#' optimal maximization of the dispersion (this is the function that is called internally by
#' anticlustering() when using \code{objective = "dispersion"} and
#' \code{method = "ilp"}). The ILP methods either require the R
#' package \code{Rglpk} and the GNU linear programming kit
#' (<http://www.gnu.org/software/glpk/>), or the R package
#' \code{Rsymphony} and the COIN-OR SYMPHONY solver libraries
#' (<https://github.com/coin-or/SYMPHONY>). The function will try to
#' find the GLPK or SYMPHONY solver and throw an error if none is
#' available. If both are found, the GLPK solver is used. Use the functions
#' \code{\link{optimal_anticlustering}} or \code{\link{optimal_dispersion}}
#' to manually select a solver.
#'
#' Optimally maximizing the diversity only works for rather small N
#' and K; N = 20 and K = 2 is usually solved within some seconds, but
#' the run time quickly increases with increasing N (or K). The
#' maximum dispersion problem can be solved for much larger instances,
#' especially for K = 2 (which in theory is not even NP hard; note
#' that for the diversity, K = 2 is already NP hard). For K = 3, and K
#' = 4, several 100 elements can usually be processed, especially when
#' installing the SYMPHONY solver.
#'
#' \strong{Preclustering}
#'
#' A useful heuristic for anticlustering is to form small groups of
#' very similar elements and assign these to different groups. This
#' logic is used as a preprocessing when setting \code{preclustering =
#' TRUE}. That is, before the anticlustering objective is optimized, a
#' cluster analysis identifies small groups of similar elements (pairs
#' if \code{K = 2}, triplets if \code{K = 3}, and so forth). The
#' optimization of the anticlustering objective is then conducted
#' under the constraint that these matched elements cannot be assigned
#' to the same group. When using the exchange algorithm, preclustering
#' is conducted using a call to \code{\link{matching}}. When using
#' \code{method = "ilp"}, the preclustering optimally finds groups of
#' minimum pairwise distance by solving the integer linear program
#' described in Papenberg and Klau (2021; (8) - (10), (12) - (13)).
#' Note that when combining preclustering restrictions with \code{method = "ilp"},
#' the anticlustering result is no longer guaranteed to be globally optimal, but
#' only optimal given the preclustering restrictions.
#'
#' \strong{Categorical variables}
#'
#' The argument \code{categories} may induce categorical constraints,
#' i.e., can be used to distribute categorical variables evenly
#' between sets. The grouping variables indicated by
#' \code{categories} will be balanced out across anticlusters. This
#' functionality is only available for the classical exchange
#' procedures, that is, for \code{method = "exchange"} and
#' \code{method = "local-maximum"}. When \code{categories} has multiple columns
#' (i.e., there are multiple categorical variables), each combination of categories is
#' treated as a distinct category by the exchange method (i.e., the multiple columns
#' are "merged" into a single column). This behaviour may lead
#' to less than optimal results on the level of each single categorical variable.
#'
#' \strong{Optimize a custom objective function}
#'
#' It is possible to pass a \code{function} to the argument
#' \code{objective}. See \code{\link{dispersion_objective}} for an
#' example. If \code{objective} is a function, the exchange method
#' assigns elements to anticlusters in such a way that the return
#' value of the custom function is maximized (hence, the function
#' should return larger values when the between-group similarity is
#' higher). The custom function has to take two arguments: the first
#' is the data argument, the second is the clustering assignment. That
#' is, the argument \code{x} will be passed down to the user-defined
#' function as first argument. \strong{However, only after}
#' \code{\link{as.matrix}} has been called on \code{x}. This implies
#' that in the function body, columns of the data set cannot be
#' accessed using \code{data.frame} operations such as
#' \code{$}. Objects of class \code{dist} will be converted to matrix
#' as well.
#'
#'
#' @examples
#'
#' # Optimize the default diversity criterion
#' anticlusters <- anticlustering(
#' schaper2019[, 3:6],
#' K = 3,
#' categories = schaper2019$room
#' )
#' # Compare feature means by anticluster
#' by(schaper2019[, 3:6], anticlusters, function(x) round(colMeans(x), 2))
#' # Compare standard deviations by anticluster
#' by(schaper2019[, 3:6], anticlusters, function(x) round(apply(x, 2, sd), 2))
#' # check that the "room" is balanced across anticlusters:
#' table(anticlusters, schaper2019$room)
#'
#' # Use multiple starts of the algorithm to improve the objective and
#' # optimize the k-means criterion ("variance")
#' anticlusters <- anticlustering(
#' schaper2019[, 3:6],
#' objective = "variance",
#' K = 3,
#' categories = schaper2019$room,
#' method = "local-maximum",
#' repetitions = 2
#' )
#' # Compare means and standard deviations by anticluster
#' by(schaper2019[, 3:6], anticlusters, function(x) round(colMeans(x), 2))
#' by(schaper2019[, 3:6], anticlusters, function(x) round(apply(x, 2, sd), 2))
#'
#' # Use different group sizes and optimize the extended k-means
#' # criterion ("kplus")
#' anticlusters <- anticlustering(
#' schaper2019[, 3:6],
#' objective = "kplus",
#' K = c(24, 24, 48),
#' categories = schaper2019$room,
#' repetitions = 10,
#' method = "local-maximum",
#' standardize = TRUE
#' )
#'
#' table(anticlusters, schaper2019$room)
#' # Compare means and standard deviations by anticluster
#' by(schaper2019[, 3:6], anticlusters, function(x) round(colMeans(x), 2))
#' by(schaper2019[, 3:6], anticlusters, function(x) round(apply(x, 2, sd), 2))
#'
#'
#' @references
#'
#' Brusco, M. J., Cradit, J. D., & Steinley, D. (2020). Combining
#' diversity and dispersion criteria for anticlustering: A bicriterion
#' approach. British Journal of Mathematical and Statistical
#' Psychology, 73, 375-396. https://doi.org/10.1111/bmsp.12186
#'
#' Papenberg, M., & Klau, G. W. (2021). Using anticlustering to partition
#' data sets into equivalent parts. Psychological Methods, 26(2),
#' 161–174. https://doi.org/10.1037/met0000301.
#'
#' Papenberg, M. (2024). K-plus Anticlustering: An Improved k-means Criterion for
#' Maximizing Between-Group Similarity. British Journal of Mathematical and
#' Statistical Psychology, 77(1), 80-102. https://doi.org/10.1111/bmsp.12315
#'
#' Späth, H. (1986). Anticlustering: Maximizing the variance criterion.
#' Control and Cybernetics, 15, 213-218.
#'
#' Weitz, R. R., & Lakshminarayanan, S. (1998). An empirical comparison of
#' heuristic methods for creating maximally diverse groups. Journal of the
#' Operational Research Society, 49(6), 635-646. https://doi.org/10.1057/palgrave.jors.2600510
#'
# Main user-facing wrapper: validates input, preprocesses the data
# (matrix conversion, optional standardization, k-plus augmentation),
# and dispatches to the requested optimization method. See the roxygen
# documentation above for the full contract.
anticlustering <- function(x, K, objective = "diversity", method = "exchange",
                           preclustering = FALSE, categories = NULL,
                           repetitions = NULL, standardize = FALSE) {
  ## Get data into required format
  input_validation_anticlustering(x, K, objective, method, preclustering,
                                  categories, repetitions, standardize)
  # `x` is now either a feature matrix or a full (symmetric) distance matrix
  x <- to_matrix(x)
  # there is a reason why scaling happens here and below (because of ILP + kplus):
  # the ILP branch returns before the k-plus augmentation, so it needs
  # standardized data here; the k-plus branch appends extra columns below,
  # which must be standardized as well, hence the second scale() call.
  if (!is_distance_matrix(x) && standardize == TRUE) {
    x <- scale(x)
  }
  ## Exact method using ILP
  if (method == "ilp") {
    if (objective == "dispersion") {
      return(optimal_dispersion(x, K)$groups)
    }
    return(exact_anticlustering(x, K, preclustering))
  }
  # Preclustering and categorical constraints are both processed in the
  # variable `categories` after this step:
  categories <- get_categorical_constraints(x, K, preclustering, categories)
  if (!inherits(objective, "function")) {
    # k-plus is implemented as k-means ("variance") on an augmented matrix
    # that also contains the squared deviations from the column means
    if (objective == "kplus") {
      x <- cbind(x, squared_from_mean(x))
      objective <- "variance"
    }
    # "distance" is the deprecated name of the "diversity" objective
    if (objective == "distance") {
      objective <- "diversity"
    }
  }
  # Second standardization pass, covering the k-plus columns appended above
  # (re-scaling already standardized columns leaves them unchanged)
  if (!is_distance_matrix(x) && standardize == TRUE) {
    x <- scale(x)
  }
  # In some cases, `anticlustering()` has to be called repeatedly -
  # redirect to `repeat_anticlustering()` in this case, which then
  # again calls anticlustering with method "exchange" and
  # repetitions = NULL
  if (method == "local-maximum" ||
      (method == "exchange" && argument_exists(repetitions))) {
    if (!argument_exists(repetitions)) {
      repetitions <- 1
    }
    return(repeat_anticlustering(x, K, objective, categories, method, repetitions))
  }
  if (method == "brusco") {
    # BILS heuristic (Brusco et al., 2020): run the bicriterion algorithm
    # and return the partition that is best with regard to the single
    # requested objective. NOTE(review): the weight vectors are tuning
    # constants, presumably emphasizing diversity vs. dispersion,
    # respectively - confirm against Brusco et al. (2020).
    if (objective == "diversity") {
      weights <- c(0.5, 0.99, 0.999, 0.999999)
      obj_fun <- diversity_objective_
    } else if (objective == "dispersion") {
      weights <- c(0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1)
      obj_fun <- dispersion_objective_
    }
    partitions <- as.matrix(bicriterion_anticlustering(x, K, repetitions, weights))
    # get best partition wrt dispersion / diversity
    best_obj <- which.max(apply(partitions, 1, obj_fun, convert_to_distances(x)))
    return(partitions[best_obj, ])
  }
  if (inherits(objective, "function")) {
    # most generic exchange method, deals with any objective function
    return(exchange_method(x, K, objective, categories))
  }
  # Redirect to specialized fast exchange methods for diversity and
  # variance objectives
  c_anticlustering(x, K, categories, objective)
}
# Coerce the data input into the matrix representation used by the
# optimization routines. Distance input (a `dist` object or a square
# dissimilarity matrix) is expanded to a full symmetric matrix; feature
# input is returned as a plain matrix.
to_matrix <- function(data) {
  if (is_distance_matrix(data)) {
    return(as.matrix(as.dist(data)))
  }
  as.matrix(data)
}
# Determines if preclustering constraints or categorical constraints
# are present. Returns a grouping vector if one or both constraints
# have been passed, or NULL if none is required.
#
# Fix: removed the unused local `N <- nrow(data)` (dead code) and used
# isTRUE() for the scalar flag check.
get_categorical_constraints <- function(data, K, preclustering, categories) {
  if (isTRUE(preclustering)) {
    matches <- matching(data, p = K, match_within = categories, sort_output = FALSE)
    # deal with NA in matches (unmatched elements get their own group index)
    return(replace_na_by_index(matches))
  }
  if (argument_exists(categories)) {
    return(merge_into_one_variable(categories))
  }
  NULL
}
# Replace NA entries in a grouping vector by fresh, unique group indices
# (each NA element becomes its own singleton group).
#
# Fix: when ALL entries are NA, max(matches, na.rm = TRUE) warns and
# returns -Inf, which previously produced -Inf "group indices". Start
# counting from 0 in that degenerate case instead.
replace_na_by_index <- function(matches) {
  na_matches <- is.na(matches)
  NAs <- sum(na_matches)
  if (NAs == 0) {
    return(matches)
  }
  max_group <- if (all(na_matches)) 0 else max(matches, na.rm = TRUE)
  matches[na_matches] <- max_group + seq_len(NAs)
  matches
}