neurodata · MrAE · Jan 18, 2019 · Jan 17, 2019 · Jan 17, 2019 · Jan 17, 2019
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: rerf
 Type: Package
 Title: Randomer Forest
-Version: 2.0.2
-Date: 2018-12-03
+Version: 2.0.2.9091
+Date: 2019-01-17
 Authors@R: c(
         person("Jesse", "Patsolic", role = c("ctb", "cre"), email = "software@neurodata.io"),
         person("Benjamin", "Falk", role = "ctb", email = "falk.ben@jhu.edu"),

diff --git a/R/FeatureImportance.R b/R/FeatureImportance.R
@@ -4,32 +4,87 @@
 #'
 #' @param forest a forest trained using the RerF function with argument store.impurity = TRUE
 #' @param num.cores number of cores to use. If num.cores = 0, then 1 less than the number of cores reported by the OS are used. (num.cores = 0)
+#' @param featureCounts boolean set to true if only the table of unique
+#' feature combinations is desired.
 #'
 #' @return feature.imp
 #'
 #' @examples
 #' library(rerf)
-#' forest <- RerF(as.matrix(iris[, 1:4]), iris[[5L]], num.cores = 1L, store.impurity = TRUE)
-#' feature.imp <- FeatureImportance(forest, num.cores = 1L)
+#' num.cores <- 1L
+#' fBinary <- RerF(as.matrix(iris[, 1:4]), iris[[5L]], num.cores = 1L, store.impurity = TRUE)
+#'
+#' fBinary.imp <- FeatureImportance(forest = fBinary, num.cores = num.cores)
+#'
+#' fRF <- RerF(as.matrix(iris[, 1:4]), iris[[5L]],
+#'             FUN = RandMatRF, num.cores = 1L, store.impurity = TRUE)
+#'
+#' fRF.imp <- FeatureImportance(forest = fRF, num.cores = num.cores)
+#'
+#' fC <- RerF(as.matrix(iris[, 1:4]), iris[[5L]],
+#'             FUN = RandMatContinuous, num.cores = 1L, store.impurity = TRUE)
+#'
+#' fC.imp <- FeatureImportance(forest = fC, num.cores = num.cores, featureCounts = TRUE)
+#'
 #' @export
 #' @importFrom parallel detectCores makeCluster clusterExport parSapply stopCluster
 #' @importFrom utils object.size
 
-FeatureImportance <- function(forest, num.cores = 0L) {
+FeatureImportance <- function(forest, num.cores = 0L, featureCounts = FALSE) {
   num.trees <- length(forest$trees)
   num.splits <- sapply(forest$trees, function(tree) length(tree$CutPoint))
 
   unique.projections <- vector("list", sum(num.splits))
 
   idx.start <- 1L
-  for (t in 1:num.trees) {
-    idx.end <- idx.start + num.splits[t] - 1L
-    unique.projections[idx.start:idx.end] <- lapply(1:num.splits[t], function(nd) forest$trees[[t]]$matAstore[(forest$trees[[t]]$matAindex[nd] + 1L):forest$trees[[t]]$matAindex[nd + 1L]])
-    idx.start <- idx.end + 1L
+
+  ## Set algorithm depending on RandMat* used
+  if (featureCounts) {
+    message("Message: Computing feature importance as counts.\n")
+
+    for (t in 1:num.trees) {
+      idx.end <- idx.start + num.splits[t] - 1L
+      unique.projections[idx.start:idx.end] <-
+        lapply(1:num.splits[t], function(nd) forest$trees[[t]]$matAstore[(forest$trees[[t]]$matAindex[nd] + 1L):forest$trees[[t]]$matAindex[nd + 1L]])
+
+      idx.start <- idx.end + 1L
+    }
+
+    unique.projections <- unique(lapply(unique.projections, getFeatures))
+
+    CompImportanceCaller <- function(tree, ...) {
+      RunFeatureImportanceCounts(tree = tree, unique.projections = unique.projections)
+    }
+  } else {
+    for (t in 1:num.trees) {
+      idx.end <- idx.start + num.splits[t] - 1L
+      unique.projections[idx.start:idx.end] <- lapply(1:num.splits[t], function(nd) forest$trees[[t]]$matAstore[(forest$trees[[t]]$matAindex[nd] + 1L):forest$trees[[t]]$matAindex[nd + 1L]])
+      idx.start <- idx.end + 1L
+    }
+  }
+
+  if (identical(forest$params$fun, rerf::RandMatRF) & !featureCounts) {
+    message("Message: Computing feature importance for RandMatRF.\n")
+    unique.projections <- unique(unique.projections)
+    CompImportanceCaller <- function(tree, ...) {
+      RunFeatureImportance(tree = tree, unique.projections = unique.projections)
+    }
   }
-  unique.projections <- unique(unique.projections)
 
-  CompImportanceCaller <- function(tree, ...) RunFeatureImportance(tree = tree, unique.projections = unique.projections)
+  if (identical(forest$params$fun, rerf::RandMatBinary) & !featureCounts) {
+    message("Message: Computing feature importance for RandMatBinary.\n")
+    unique.projections <- uniqueByEquivalenceClass(
+      forest$params$paramList$p,
+      unique(unique.projections)
+    )
+
+    CompImportanceCaller <- function(tree, ...) {
+      RunFeatureImportanceBinary(
+        tree = tree,
+        unique.projections = unique.projections
+      )
+    }
+  }
 
   if (num.cores != 1L) {
     if (num.cores == 0L) {
@@ -58,5 +113,9 @@ FeatureImportance <- function(forest, num.cores = 0L) {
   sort.idx <- order(feature.imp, decreasing = TRUE)
   feature.imp <- feature.imp[sort.idx]
   unique.projections <- unique.projections[sort.idx]
-  return(feature.imp <- list(imp = feature.imp, proj = unique.projections))
+  if (!featureCounts) {
+    return(feature.imp <- list(imp = feature.imp, proj = unique.projections))
+  } else {
+    return(feature.imp <- list(impCount = feature.imp, featureCombination = unique.projections))
+  }
 }
diff --git a/R/RunFeatureImportanceBinary.R b/R/RunFeatureImportanceBinary.R
@@ -0,0 +1,37 @@
+#' Compute Feature Importance of a single RerF tree
+#'
+#' Computes feature importance of every unique feature used to make a split in a single tree.
+#'
+#' @param tree a single tree from a trained RerF model with argument store.impurity = TRUE.
+#' @param unique.projections a list of all of the unique split projections used in the RerF model.
+#'
+#' @return feature.imp
+#'
+#' @examples
+#' library(rerf)
+#' X <- iris[, -5]
+#' Y <- iris[[5]]
+#' store.impurity <- TRUE
+#' FUN <- RandMatBinary
+#' forest <- RerF(X, Y, FUN = FUN, num.cores = 1L, store.impurity = store.impurity)
+#' FeatureImportance(forest, num.cores = 1L)
+
+RunFeatureImportanceBinary <- function(tree, unique.projections) {
+
+  ## compute the 180 rotations of the projections
+  neg.up <- lapply(unique.projections, flipWeights)
+  num.proj <- length(unique.projections)
+
+  feature.imp <- double(num.proj)
+  for (nd in tree$treeMap[tree$treeMap > 0L]) {
+    index.low <- tree$matAindex[nd] + 1L
+    index.high <- tree$matAindex[nd + 1L]
+    projection.idx <-
+      which(unique.projections %in%
+        list(tree$matAstore[index.low:index.high]) |
+        neg.up %in% list(tree$matAstore[index.low:index.high]))
+    feature.imp[projection.idx] <-
+      feature.imp[projection.idx] + tree$delta.impurity[nd]
+  }
+  return(feature.imp)
+}
diff --git a/R/RunFeatureImportanceCounts.R b/R/RunFeatureImportanceCounts.R
@@ -0,0 +1,34 @@
+#' Tabulate the unique feature combinations used in a single RerF tree
+#'
+#' Computes feature importance of every unique feature used to make a split in a single tree.
+#'
+#' @param tree a single tree from a trained RerF model with argument store.impurity = TRUE.
+#' @param unique.projections a list of all of the unique split projections used in the RerF model.
+#'
+#' @return feature.counts
+#'
+#' @examples
+#' @examples
+#' library(rerf)
+#' X <- iris[, -5]
+#' Y <- iris[[5]]
+#' store.impurity <- TRUE
+#' FUN <- RandMatContinuous
+#' forest <- RerF(X, Y, FUN = FUN, num.cores = 1L, store.impurity = store.impurity)
+#' FeatureImportance(forest, num.cores = 1L, featureCounts = TRUE)
+
+RunFeatureImportanceCounts <- function(tree, unique.projections) {
+  num.proj <- length(unique.projections)
+  feature.counts <- double(num.proj)
+
+  for (nd in tree$treeMap[tree$treeMap > 0L]) {
+    index.low <- tree$matAindex[nd] + 1L
+    index.high <- tree$matAindex[nd + 1L]
+    projection.idx <-
+      which(unique.projections %in%
+        lapply(list(tree$matAstore[index.low:index.high]), getFeatures))
+    feature.counts[projection.idx] <-
+      feature.counts[projection.idx] + 1
+  }
+  return(feature.counts)
+}
diff --git a/R/helperFunctions.R b/R/helperFunctions.R
@@ -0,0 +1,102 @@
+#' Extract feature indicies from the sparse projection vector.
+#'
+#' A helper function to extract the feature indices from the projection
+#' vector stored in a tree object.
+#'
+#' @param x a list of unique.projections from the intermediate steps of
+#' the FeatureImportance function.
+#'
+#' @return list of unique feature combinations
+#'
+
+getFeatures <- function(x) {
+  s <- seq(1, length(x), by = 2)
+  return(x[s])
+}
+
+#' Extract feature weights from the sparse projection vector.
+#'
+#' A helper function to extract the feature weights from the projection
+#' vector stored in a tree object.
+#'
+#' @param x a list of unique.projections from the intermediate steps of
+#' the FeatureImportance function.
+#'
+#' @return list of unique feature weights
+#'
+
+getWeights <- function(x) {
+  s <- seq(2, length(x), by = 2)
+  return(x[s])
+}
+
+#' Change the sign of the weights
+#'
+#' A helper function to extract the feature weights from the projection
+#' vector stored in a tree object. Used in
+#' \code{RunFeatureImportanceBinary}.
+#'
+#' @param x a list of unique.projections from the intermediate steps of
+#' the FeatureImportance function.
+#'
+#' @return x with sign of weights flipped.
+#'
+
+
+flipWeights <- function(x) {
+  s <- seq(2, length(x), by = 2)
+  x[s] <- -x[s]
+  return(x)
+}
+
+
+#' Remove unique projections that are equivalent due to a rotation of 180
+#' degrees.
+#'
+#' This function finds the projections that are equivalent via a 180
+#' degree rotation and removes the duplicates.
+#'
+#' @param p the number of features in the original data.  This can be
+#' obtained from a forest object via \code{forest$params$paramList$p}.
+#' @param unique.projections a list of projections from intermediate
+#' steps of the \code{\link{FeatureImportance}} function.
+#'
+#' @return unique.projections a list which is a subset of the input.
+#'
+#' @seealso \code{\link{FeatureImportance}}
+#'
+#'
+
+uniqueByEquivalenceClass <- function(p, unique.projections) {
+
+  ## the matrix of weights (w)
+  w <- matrix(0,
+    ncol = p,
+    nrow = length(unique.projections)
+  )
+
+  for (i in 1:length(unique.projections)) {
+    for (j in seq(1, length(unique.projections[[i]]), by = 2)) {
+      w[i, unique.projections[[i]][j]] <- unique.projections[[i]][j + 1]
+    }
+  }
+
+  out <- vector("list", 1 / 2 * (length(unique.projections) - 1) *
+    length(unique.projections))
+  k <- 1
+  for (i in 1:nrow(w)) {
+    for (j in i:nrow(w)) {
+      if (all(w[i, ] == -w[j, ])) {
+        out[[k]] <- c(i, j)
+        k <- k + 1
+      }
+    }
+  }
+
+  ind <- !sapply(out, is.null)
+  out <- Reduce(rbind, out)
+
+  unique.projections[out[, 2]] <- NULL
+
+  return(unique.projections)
+}
diff --git a/man/FeatureImportance.Rd b/man/FeatureImportance.Rd
diff --git a/man/RunFeatureImportanceBinary.Rd b/man/RunFeatureImportanceBinary.Rd
diff --git a/man/RunFeatureImportanceCounts.Rd b/man/RunFeatureImportanceCounts.Rd