Merge pull request #2 from BenjaminLouis/include

Include
pierreroudier · Oct 4, 2018 · 2dd3f51 · 2dd3f51
2 parents c4f197d + 700f96e
commit 2dd3f51
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 17 deletions.
diff --git a/R/clhs-data.frame.R b/R/clhs-data.frame.R
@@ -7,6 +7,7 @@
 clhs.data.frame <- function(
   x, # data.frame
   size, # Number of samples you want
+  include = NULL, # row index of data that must be in the final sample
   cost = NULL, # Number or name of the attribute used as a cost
   iter = 10000, # Number of max iterations
   temp = 1, # initial temperature
@@ -62,6 +63,11 @@ clhs.data.frame <- function(
     # Remove cost attribute from attribute table
     x <- x[, -1*i_cost, drop = FALSE]
 
+    # Si include, cost is 0
+    if (!is.null(include)) {
+      cost[include, ] <- 0
+    }
+
     # Flags
     cost_mode <- TRUE
     track_mode <- FALSE # cost is taken into account, therefore computed
@@ -76,7 +82,7 @@ clhs.data.frame <- function(
     data_factor <- x[, i_factor, drop = FALSE]
     # Creating a list storing the levels of each factor
     factor_levels <- apply(data_factor, 2, function(x) {
-      ifelse(is.factor(x), res <- levels(x), res <-levels(factor(x)))
+      ifelse(is.factor(x), res <- levels(x), res <- levels(factor(x)))
       res}
     )
   } else {
@@ -102,9 +108,13 @@ clhs.data.frame <- function(
   if (n_factor == 0) data_factor_sampled <- data.frame()
   else factor_obj <- alply(data_factor, 2, function(x) table(x)/n_data)
 
+  # Mandatory data in the sample
+  sampled_size <- size - length(include)
+  not_included <- setdiff(1:n_data, include)
+
   # initialise, pick randomly
   n_remainings <- n_data - size # number of individuals remaining unsampled
-  i_sampled <- sample(1:n_data, size = size, replace = FALSE) # individuals randomly chosen
+  i_sampled <- c(sample(not_included, size = sampled_size, replace = FALSE), include) # individuals randomly chosen
   i_unsampled <- setdiff(1:n_data, i_sampled) # individuals remaining unsampled
   data_continuous_sampled <- data_continuous[i_sampled, , drop = FALSE] # sampled continuous data
 
@@ -145,11 +155,11 @@ clhs.data.frame <- function(
 
     if (runif(1) < 0.5) {
       # pick a random sampled point and random unsampled point and swap them
-      idx_removed <- sample(1:length(i_sampled), size = 1, replace = FALSE)
-      spl_removed <- i_sampled[idx_removed]
+      idx_removed <- sample(1:length(setdiff(i_sampled, include)), size = 1, replace = FALSE)
+      spl_removed <- setdiff(i_sampled, include)[idx_removed]
       idx_added <- sample(1:length(i_unsampled), size = 1, replace = FALSE)
-      i_sampled <- i_sampled[-idx_removed]
-      i_sampled <- c(i_sampled, i_unsampled[idx_added])
+      i_sampled <- setdiff(i_sampled, include)[-idx_removed]
+      i_sampled <- c(i_sampled, i_unsampled[idx_added], include)
       i_unsampled <- i_unsampled[-idx_added]
       i_unsampled <- c(i_unsampled, spl_removed)
 
@@ -163,16 +173,16 @@ clhs.data.frame <- function(
     }
     else {
       # remove the worse sampled & resample
-      worse <- max(delta_obj_continuous)
-      i_worse <- which(delta_obj_continuous == worse)
+      worse <- max(delta_obj_continuous[!i_sampled %in% include])
+      i_worse <- which(delta_obj_continuous[!i_sampled %in% include] == worse)
       # If there's more than one worse candidate, we pick one at random
       if (length(i_worse) > 1) i_worse <- sample(i_worse, size = 1)
 
       # swap with reservoir
-      spl_removed <- i_sampled[i_worse] # will be removed from the sampled set. 
+      spl_removed <- setdiff(i_sampled, include)[i_worse] # will be removed from the sampled set. 
       idx_added <- sample(1:n_remainings, size = 1, replace = FALSE) # new candidate that will take their place
-      i_sampled <- i_sampled[-i_worse]
-      i_sampled <- c(i_sampled, i_unsampled[idx_added])
+      i_sampled <- setdiff(i_sampled, include)[-i_worse]
+      i_sampled <- c(i_sampled, i_unsampled[idx_added], include)
       i_unsampled <- i_unsampled[-idx_added]
       i_unsampled <- c(i_unsampled, spl_removed)
 

diff --git a/R/clhs.R b/R/clhs.R
@@ -27,6 +27,10 @@
 #' @param x A \code{data.frame}, \code{SpatialPointsDataFrame} or \code{Raster}
 #' object.
 #' @param size A non-negative integer giving the number of samples to pick.
+#' @param include Vector of row indexes of data from \code{x} that must be 
+#' included in the final sample. For the cost-constrained cLHS method, cost of 
+#' these mandatory samples is set to 0. If NULL (default), all data are randomly 
+#' choosen according to the classic cLHS method.
 #' @param cost A character giving the name or an integer giving the index of
 #' the attribute in \code{x} that gives a cost that can be use to constrain the
 #' cLHS sampling. If NULL (default), the cost-constrained implementation is not
@@ -110,4 +114,4 @@
 #' 
 #' @include clhs-data.frame.R
 #' @export
-clhs <- function(x, size,  cost,  iter,  temp,  tdecrease,  weights, eta, obj.limit, length.cycle, simple, progress, track) UseMethod("clhs")
+clhs <- function(x, size, include,  cost,  iter,  temp,  tdecrease,  weights, eta, obj.limit, length.cycle, simple, progress, track) UseMethod("clhs")
diff --git a/man/clhs.Rd b/man/clhs.Rd