fix issue #91 based on discussion in the comments. (#140)

* fix issue #91 based on discussion in the comments. add some helper functions add test for new way of computing feature importance * remove need for library(Matrix) and update function parameteres. fix documentation typos [issue #91] * update test-FeatureImportance move `flipWeights` to helperFunctions * update Feature Importance to be more readable [@ben]. Merge RunFeature* into the same file. Update README with correct output names.
neurodata · Jan 18, 2019 · 0471816 · 0471816
1 parent 49ee30a
commit 0471816
Show file tree

Hide file tree

Showing 13 changed files with 573 additions and 35 deletions.
diff --git a/R/FeatureImportance.R b/R/FeatureImportance.R
@@ -4,32 +4,115 @@
 #'
 #' @param forest a forest trained using the RerF function with argument store.impurity = TRUE
 #' @param num.cores number of cores to use. If num.cores = 0, then 1 less than the number of cores reported by the OS are used. (num.cores = 0)
+#' @param type character string specifying which method to use in
+#' calculating feature importance.
+#' \describe{
+#'   \item{'C'}{specifies that unique combinations of features
+#'   should be *c*ounted across trees.}
+#'   \item{'R'}{feature importance will be calculated as in *R*andomForest.}
+#'   \item{'E'}{calculates the unique projections up to *e*quivalence if
+#'   the vector of projection weights parametrizes the same line in
+#'   \eqn{R^p}.}
+#' }
 #'
-#' @return feature.imp
+#' @return a list with 3 elements, 
+#' \describe{
+#'   \item{\code{imp}}{The vector of scores/counts, corresponding to each feature.}
+#'   \item{\code{features}}{The features/projections used.}
+#'   \item{\code{type}}{The code for the method used.}
+#'   }
 #'
 #' @examples
 #' library(rerf)
+#' num.cores <- 1L
 #' forest <- RerF(as.matrix(iris[, 1:4]), iris[[5L]], num.cores = 1L, store.impurity = TRUE)
-#' feature.imp <- FeatureImportance(forest, num.cores = 1L)
+#'
+#' imp.C <- FeatureImportance(forest, num.cores, "C")
+#' imp.R <- FeatureImportance(forest, num.cores, "R")
+#' imp.E <- FeatureImportance(forest, num.cores, "E")
+#'
+#' fRF <- RerF(as.matrix(iris[, 1:4]), iris[[5L]],
+#'             FUN = RandMatRF, num.cores = 1L, store.impurity = TRUE)
+#'
+#' fRF.imp <- FeatureImportance(forest = fRF, num.cores = num.cores)
+#'
 #' @export
 #' @importFrom parallel detectCores makeCluster clusterExport parSapply stopCluster
 #' @importFrom utils object.size
 
-FeatureImportance <- function(forest, num.cores = 0L) {
+FeatureImportance <- function(forest, num.cores = 0L, type = NULL) {
+
+  ## choose method to use for calculating feature importance
+  if(is.null(type)){
+    if(identical(forest$params$fun, rerf::RandMatRF)){
+      type <- "R"
+    } else if (identical(forest$params$fun, rerf::RandMatBinary)) {
+      type <- "E"
+    } else {
+      type <- "C"
+    }
+  }
+
   num.trees <- length(forest$trees)
   num.splits <- sapply(forest$trees, function(tree) length(tree$CutPoint))
 
-  unique.projections <- vector("list", sum(num.splits))
+  forest.projections <- vector("list")
 
-  idx.start <- 1L
+  ## Iterate over trees in the forest to save all projections used
   for (t in 1:num.trees) {
-    idx.end <- idx.start + num.splits[t] - 1L
-    unique.projections[idx.start:idx.end] <- lapply(1:num.splits[t], function(nd) forest$trees[[t]]$matAstore[(forest$trees[[t]]$matAindex[nd] + 1L):forest$trees[[t]]$matAindex[nd + 1L]])
-    idx.start <- idx.end + 1L
+    tree.projections <- 
+      lapply(1:num.splits[t], function(nd) {
+             forest$trees[[t]]$matAstore[(forest$trees[[t]]$matAindex[nd] + 1L):forest$trees[[t]]$matAindex[nd + 1L]]
+      })
+
+    forest.projections <- c(forest.projections, tree.projections)
   }
-  unique.projections <- unique(unique.projections)
 
-  CompImportanceCaller <- function(tree, ...) RunFeatureImportance(tree = tree, unique.projections = unique.projections)
+  ## Calculate the unique projections used according to the distribution
+  ## of weights
+  if (identical(type, "C")) {
+    message("Message: Computing feature importance as counts of unique feature combinations.\n")
+    ## compute the unique combinations of features used in the
+    ## projections 
+    unique.projections <- unique(lapply(forest.projections, getFeatures))
+
+    CompImportanceCaller <- function(tree, ...) {
+      RunFeatureImportanceCounts(tree = tree, unique.projections = unique.projections)
+    }
+    varlist <- c("unique.projections", "RunFeatureImportanceCounts")
+  } 
+
+  if (identical(type, "R")) {
+    message("Message: Computing feature importance for RandMatRF.\n")
+    ## Compute the unique projections without the need to account for
+    ## 180-degree rotations.
+    unique.projections <- unique(forest.projections)
+
+    CompImportanceCaller <- function(tree, ...) {
+      RunFeatureImportance(tree = tree, unique.projections = unique.projections)
+    }
+    varlist <- c("unique.projections", "RunFeatureImportance")
+  }
+
+  if (identical(type, "E")) {
+    message("Message: Computing feature importance for RandMatBinary.\n")
+    ## compute the unique projections properly accounting for
+    ## projections that differ by a 180-degree rotation.
+    unique.projections <- uniqueByEquivalenceClass(
+      forest$params$paramList$p,
+      unique(forest.projections)
+    )
+
+    CompImportanceCaller <- function(tree, ...) {
+      RunFeatureImportanceBinary(
+        tree = tree,
+        unique.projections = unique.projections
+      )
+    }
+    varlist <- c("unique.projections", "RunFeatureImportanceBinary")
+  }
+
+
 
   if (num.cores != 1L) {
     if (num.cores == 0L) {
@@ -41,7 +124,7 @@ FeatureImportance <- function(forest, num.cores = 0L) {
     if ((utils::object.size(forest) > 2e9) |
       .Platform$OS.type == "windows") {
       cl <- parallel::makeCluster(spec = num.cores, type = "PSOCK")
-      parallel::clusterExport(cl = cl, varlist = c("unique.projections", "RunFeatureImportance"), envir = environment())
+      parallel::clusterExport(cl = cl, varlist = varlist, envir = environment())
       feature.imp <- parallel::parSapply(cl = cl, forest$trees, FUN = CompImportanceCaller)
     } else {
       cl <- parallel::makeCluster(spec = num.cores, type = "FORK")
@@ -58,5 +141,6 @@ FeatureImportance <- function(forest, num.cores = 0L) {
   sort.idx <- order(feature.imp, decreasing = TRUE)
   feature.imp <- feature.imp[sort.idx]
   unique.projections <- unique.projections[sort.idx]
-  return(feature.imp <- list(imp = feature.imp, proj = unique.projections))
+
+  return(feature.imp <- list(imp = feature.imp, features = unique.projections, type = type))
 }
diff --git a/R/RunFeatureImportance.R b/R/RunFeatureImportance.R
@@ -19,3 +19,79 @@ RunFeatureImportance <- function(tree, unique.projections) {
   }
   return(feature.imp)
 }
+
+
+#' Compute Feature Importance of a single RerF tree
+#'
+#' Computes feature importance of every unique feature used to make a split in a single tree.
+#'
+#' @param tree a single tree from a trained RerF model with argument store.impurity = TRUE.
+#' @param unique.projections a list of all of the unique split projections used in the RerF model.
+#'
+#' @return feature.imp
+#'
+#' @examples
+#' library(rerf)
+#' X <- iris[, -5]
+#' Y <- iris[[5]]
+#' store.impurity <- TRUE
+#' FUN <- RandMatBinary
+#' forest <- RerF(X, Y, FUN = FUN, num.cores = 1L, store.impurity = store.impurity)
+#' FeatureImportance(forest, num.cores = 1L)
+
+
+RunFeatureImportanceBinary <- function(tree, unique.projections) {
+
+  ## compute the 180 rotations of the projections
+  neg.up <- lapply(unique.projections, flipWeights)
+  num.proj <- length(unique.projections)
+
+  feature.imp <- double(num.proj)
+  for (nd in tree$treeMap[tree$treeMap > 0L]) {
+    index.low <- tree$matAindex[nd] + 1L
+    index.high <- tree$matAindex[nd + 1L]
+    projection.idx <-
+      which(unique.projections %in%
+        list(tree$matAstore[index.low:index.high]) |
+        neg.up %in% list(tree$matAstore[index.low:index.high]))
+    feature.imp[projection.idx] <-
+      feature.imp[projection.idx] + tree$delta.impurity[nd]
+  }
+  return(feature.imp)
+}
+
+
+#' Tabulate the unique feature combinations used in a single RerF tree
+#'
+#' Computes feature importance of every unique feature used to make a split in a single tree.
+#'
+#' @param tree a single tree from a trained RerF model with argument store.impurity = TRUE.
+#' @param unique.projections a list of all of the unique split projections used in the RerF model.
+#'
+#' @return feature.counts
+#'
+#' @examples
+#' library(rerf)
+#' X <- iris[, -5]
+#' Y <- iris[[5]]
+#' store.impurity <- TRUE
+#' FUN <- RandMatContinuous
+#' forest <- RerF(X, Y, FUN = FUN, num.cores = 1L, store.impurity = store.impurity)
+#' FeatureImportance(forest, num.cores = 1L, type = "C")
+
+
+RunFeatureImportanceCounts <- function(tree, unique.projections) {
+  num.proj <- length(unique.projections)
+  feature.counts <- double(num.proj)
+
+  for (nd in tree$treeMap[tree$treeMap > 0L]) {
+    index.low <- tree$matAindex[nd] + 1L
+    index.high <- tree$matAindex[nd + 1L]
+    projection.idx <-
+      which(unique.projections %in%
+        lapply(list(tree$matAstore[index.low:index.high]), getFeatures))
+    feature.counts[projection.idx] <-
+      feature.counts[projection.idx] + 1
+  }
+  return(feature.counts)
+}
diff --git a/R/helperFunctions.R b/R/helperFunctions.R
@@ -0,0 +1,102 @@
+#' Extract feature indicies from the sparse projection vector.
+#'
+#' A helper function to extract the feature indices from the projection
+#' vector stored in a tree object.
+#'
+#' @param x a list of unique.projections from the intermediate steps of
+#' the FeatureImportance function.
+#'
+#' @return list of unique feature combinations
+#'
+
+getFeatures <- function(x) {
+  s <- seq(1, length(x), by = 2)
+  return(x[s])
+}
+
+#' Extract feature weights from the sparse projection vector.
+#'
+#' A helper function to extract the feature weights from the projection
+#' vector stored in a tree object.
+#'
+#' @param x a list of unique.projections from the intermediate steps of
+#' the FeatureImportance function.
+#'
+#' @return list of unique feature weights
+#'
+
+getWeights <- function(x) {
+  s <- seq(2, length(x), by = 2)
+  return(x[s])
+}
+
+#' Change the sign of the weights
+#'
+#' A helper function to extract the feature weights from the projection
+#' vector stored in a tree object. Used in
+#' \code{RunFeatureImportanceBinary}.
+#'
+#' @param x a list of unique.projections from the intermediate steps of
+#' the FeatureImportance function.
+#'
+#' @return x with sign of weights flipped.
+#'
+
+
+flipWeights <- function(x) {
+  s <- seq(2, length(x), by = 2)
+  x[s] <- -x[s]
+  return(x)
+}
+
+
+#' Remove unique projections that are equivalent due to a rotation of 180
+#' degrees.
+#'
+#' This function finds the projections that are equivalent via a 180
+#' degree rotation and removes the duplicates.
+#'
+#' @param p the number of features in the original data.  This can be
+#' obtained from a forest object via \code{forest$params$paramList$p}.
+#' @param unique.projections a list of projections from intermediate
+#' steps of the \code{\link{FeatureImportance}} function.
+#'
+#' @return unique.projections a list which is a subset of the input.
+#'
+#' @seealso \code{\link{FeatureImportance}}
+#'
+#'
+
+uniqueByEquivalenceClass <- function(p, unique.projections) {
+
+  ## the matrix of weights (w)
+  w <- matrix(0,
+    ncol = p,
+    nrow = length(unique.projections)
+  )
+
+  for (i in 1:length(unique.projections)) {
+    for (j in seq(1, length(unique.projections[[i]]), by = 2)) {
+      w[i, unique.projections[[i]][j]] <- unique.projections[[i]][j + 1]
+    }
+  }
+
+  out <- vector("list", 1 / 2 * (length(unique.projections) - 1) *
+    length(unique.projections))
+  k <- 1
+  for (i in 1:nrow(w)) {
+    for (j in i:nrow(w)) {
+      if (all(w[i, ] == -w[j, ])) {
+        out[[k]] <- c(i, j)
+        k <- k + 1
+      }
+    }
+  }
+
+  ind <- !sapply(out, is.null)
+  out <- Reduce(rbind, out)
+
+  unique.projections[out[, 2]] <- NULL
+
+  return(unique.projections)
+}
diff --git a/README.Rmd b/README.Rmd
@@ -214,7 +214,7 @@ scor
 ```
 
 ### Compute feature (projection) importance (DEV version only):
-Computes the Gini importance for all of the unique projections used to split the data. The returned value is a list with members imp and proj. The member imp is a numeric vector of feature importances sorted in decreasing order. The member proj is a list the same length as imp of vectors specifying the split projections corresponding to the values in imp. The projections are represented by the vector such that the odd numbered indices indicate the canonical feature indices and the even numbered indices indicate the linear coefficients. For example a vector (1,-1,4,1,5,-1) is the projection -X1 + X4 - X5. **Note**: it is highly advised to run this only when the splitting features (projections) have unweighted coefficients, such as for the default setting or for RF.
+Computes the Gini importance for all of the unique projections used to split the data. The returned value is a list with members imp and features.  The member imp is a numeric vector of feature importances sorted in decreasing order. The member features is a list the same length as imp of vectors specifying the split projections corresponding to the values in imp. The projections are represented by the vector such that the odd numbered indices indicate the canonical feature indices and the even numbered indices indicate the linear coefficients. For example a vector (1,-1,4,1,5,-1) is the projection -X1 + X4 - X5. **Note**: it is highly advised to run this only when the splitting features (projections) have unweighted coefficients, such as for the default setting or for RF.
 
 ```{r compute-feature-importance}
 X <- as.matrix(iris[, 1:4]) # feature matrix
@@ -228,7 +228,7 @@ forest <- RerF(as.matrix(iris[, 1:4]), iris[[5L]], FUN = RandMatRF,
                paramList = list(p = p, d = d), num.cores = 1L,
                store.impurity = TRUE, seed = 1L)
 
-feature.imp <- FeatureImportance(forest, num.cores = 1L)
+feature.imp <- FeatureImportance(forest, num.cores = 1L, type = "R")
 ```
 
 **Expected output**