Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix issue #91 based on discussion in the comments. #140

Merged
merged 5 commits into from Jan 18, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: rerf
Type: Package
Title: Randomer Forest
Version: 2.0.2
Date: 2018-12-03
Version: 2.0.2.9091
Date: 2019-01-17
Authors@R: c(
person("Jesse", "Patsolic", role = c("ctb", "cre"), email = "software@neurodata.io"),
person("Benjamin", "Falk", role = "ctb", email = "falk.ben@jhu.edu"),
Expand Down
79 changes: 69 additions & 10 deletions R/FeatureImportance.R
Expand Up @@ -4,32 +4,87 @@
#'
#' @param forest a forest trained using the RerF function with argument store.impurity = TRUE
#' @param num.cores number of cores to use. If num.cores = 0, then 1 less than the number of cores reported by the OS are used. (num.cores = 0)
#' @param featureCounts boolean set to true if only the table of unique
#' feature combinations is desired.
#'
#' @return feature.imp
#'
#' @examples
#' library(rerf)
#' forest <- RerF(as.matrix(iris[, 1:4]), iris[[5L]], num.cores = 1L, store.impurity = TRUE)
#' feature.imp <- FeatureImportance(forest, num.cores = 1L)
#' num.cores <- 1L
#' fBinary <- RerF(as.matrix(iris[, 1:4]), iris[[5L]], num.cores = 1L, store.impurity = TRUE)
#'
#' fBinary.imp <- FeatureImportance(forest = fBinary, num.cores = num.cores)
#'
#' fRF <- RerF(as.matrix(iris[, 1:4]), iris[[5L]],
#' FUN = RandMatRF, num.cores = 1L, store.impurity = TRUE)
#'
#' fRF.imp <- FeatureImportance(forest = fRF, num.cores = num.cores)
#'
#' fC <- RerF(as.matrix(iris[, 1:4]), iris[[5L]],
#' FUN = RandMatContinuous, num.cores = 1L, store.impurity = TRUE)
#'
#' fC.imp <- FeatureImportance(forest = fC, num.cores = num.cores, featureCounts = TRUE)
#'
#' @export
#' @importFrom parallel detectCores makeCluster clusterExport parSapply stopCluster
#' @importFrom utils object.size

FeatureImportance <- function(forest, num.cores = 0L) {
FeatureImportance <- function(forest, num.cores = 0L, featureCounts = FALSE) {
num.trees <- length(forest$trees)
num.splits <- sapply(forest$trees, function(tree) length(tree$CutPoint))

unique.projections <- vector("list", sum(num.splits))

idx.start <- 1L
for (t in 1:num.trees) {
idx.end <- idx.start + num.splits[t] - 1L
unique.projections[idx.start:idx.end] <- lapply(1:num.splits[t], function(nd) forest$trees[[t]]$matAstore[(forest$trees[[t]]$matAindex[nd] + 1L):forest$trees[[t]]$matAindex[nd + 1L]])
idx.start <- idx.end + 1L

## Set algorithm depending on RandMat* used
if (featureCounts) {
message("Message: Computing feature importance as counts.\n")

for (t in 1:num.trees) {
idx.end <- idx.start + num.splits[t] - 1L
unique.projections[idx.start:idx.end] <-
lapply(1:num.splits[t], function(nd) forest$trees[[t]]$matAstore[(forest$trees[[t]]$matAindex[nd] + 1L):forest$trees[[t]]$matAindex[nd + 1L]])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a style thing, but this piece of code would be far easier to read and maintain if we don't deal with indices and reassigning stuff and just append lists together.

pseudo code:

forest_projection = []
for (t in 1:num.trees){
  tree_projection = lapply(whatever)
  forest_projection = forest_projection + tree_projection
}


idx.start <- idx.end + 1L
}

unique.projections <- unique(lapply(unique.projections, getFeatures))

CompImportanceCaller <- function(tree, ...) {
RunFeatureImportanceCounts(tree = tree, unique.projections = unique.projections)
}
} else {
for (t in 1:num.trees) {
idx.end <- idx.start + num.splits[t] - 1L
unique.projections[idx.start:idx.end] <- lapply(1:num.splits[t], function(nd) forest$trees[[t]]$matAstore[(forest$trees[[t]]$matAindex[nd] + 1L):forest$trees[[t]]$matAindex[nd + 1L]])
idx.start <- idx.end + 1L
}
}

if (identical(forest$params$fun, rerf::RandMatRF) & !featureCounts) {
message("Message: Computing feature importance for RandMatRF.\n")
unique.projections <- unique(unique.projections)
CompImportanceCaller <- function(tree, ...) {
RunFeatureImportance(tree = tree, unique.projections = unique.projections)
}
}
unique.projections <- unique(unique.projections)

CompImportanceCaller <- function(tree, ...) RunFeatureImportance(tree = tree, unique.projections = unique.projections)
if (identical(forest$params$fun, rerf::RandMatBinary) & !featureCounts) {
message("Message: Computing feature importance for RandMatBinary.\n")
unique.projections <- uniqueByEquivalenceClass(
forest$params$paramList$p,
unique(unique.projections)
)

CompImportanceCaller <- function(tree, ...) {
RunFeatureImportanceBinary(
tree = tree,
unique.projections = unique.projections
)
}
}

if (num.cores != 1L) {
if (num.cores == 0L) {
Expand Down Expand Up @@ -58,5 +113,9 @@ FeatureImportance <- function(forest, num.cores = 0L) {
sort.idx <- order(feature.imp, decreasing = TRUE)
feature.imp <- feature.imp[sort.idx]
unique.projections <- unique.projections[sort.idx]
return(feature.imp <- list(imp = feature.imp, proj = unique.projections))
if (!featureCounts) {
return(feature.imp <- list(imp = feature.imp, proj = unique.projections))
} else {
return(feature.imp <- list(impCount = feature.imp, featureCombination = unique.projections))
}
}
37 changes: 37 additions & 0 deletions R/RunFeatureImportanceBinary.R
@@ -0,0 +1,37 @@
#' Compute Feature Importance of a single RerF tree
#'
#' Computes feature importance of every unique feature used to make a split in a single tree.
#'
#' @param tree a single tree from a trained RerF model with argument store.impurity = TRUE.
#' @param unique.projections a list of all of the unique split projections used in the RerF model.
#'
#' @return feature.imp
#'
#' @examples
#' library(rerf)
#' X <- iris[, -5]
#' Y <- iris[[5]]
#' store.impurity <- TRUE
#' FUN <- RandMatBinary
#' forest <- RerF(X, Y, FUN = FUN, num.cores = 1L, store.impurity = store.impurity)
#' FeatureImportance(forest, num.cores = 1L)

RunFeatureImportanceBinary <- function(tree, unique.projections) {

## compute the 180 rotations of the projections
neg.up <- lapply(unique.projections, flipWeights)
num.proj <- length(unique.projections)

feature.imp <- double(num.proj)
for (nd in tree$treeMap[tree$treeMap > 0L]) {
index.low <- tree$matAindex[nd] + 1L
index.high <- tree$matAindex[nd + 1L]
projection.idx <-
which(unique.projections %in%
list(tree$matAstore[index.low:index.high]) |
neg.up %in% list(tree$matAstore[index.low:index.high]))
feature.imp[projection.idx] <-
feature.imp[projection.idx] + tree$delta.impurity[nd]
}
return(feature.imp)
}
34 changes: 34 additions & 0 deletions R/RunFeatureImportanceCounts.R
@@ -0,0 +1,34 @@
#' Tabulate the unique feature combinations used in a single RerF tree
#'
#' Computes feature importance of every unique feature used to make a split in a single tree.
#'
#' @param tree a single tree from a trained RerF model with argument store.impurity = TRUE.
#' @param unique.projections a list of all of the unique split projections used in the RerF model.
#'
#' @return feature.counts
#'
#' @examples
#' @examples
#' library(rerf)
#' X <- iris[, -5]
#' Y <- iris[[5]]
#' store.impurity <- TRUE
#' FUN <- RandMatContinuous
#' forest <- RerF(X, Y, FUN = FUN, num.cores = 1L, store.impurity = store.impurity)
#' FeatureImportance(forest, num.cores = 1L, featureCounts = TRUE)

RunFeatureImportanceCounts <- function(tree, unique.projections) {
num.proj <- length(unique.projections)
feature.counts <- double(num.proj)

for (nd in tree$treeMap[tree$treeMap > 0L]) {
index.low <- tree$matAindex[nd] + 1L
index.high <- tree$matAindex[nd + 1L]
projection.idx <-
which(unique.projections %in%
lapply(list(tree$matAstore[index.low:index.high]), getFeatures))
feature.counts[projection.idx] <-
feature.counts[projection.idx] + 1
}
return(feature.counts)
}
102 changes: 102 additions & 0 deletions R/helperFunctions.R
@@ -0,0 +1,102 @@
#' Extract feature indicies from the sparse projection vector.
#'
#' A helper function to extract the feature indices from the projection
#' vector stored in a tree object.
#'
#' @param x a list of unique.projections from the intermediate steps of
#' the FeatureImportance function.
#'
#' @return list of unique feature combinations
#'

getFeatures <- function(x) {
s <- seq(1, length(x), by = 2)
return(x[s])
}

#' Extract feature weights from the sparse projection vector.
#'
#' A helper function to extract the feature weights from the projection
#' vector stored in a tree object.
#'
#' @param x a list of unique.projections from the intermediate steps of
#' the FeatureImportance function.
#'
#' @return list of unique feature weights
#'

getWeights <- function(x) {
s <- seq(2, length(x), by = 2)
return(x[s])
}

#' Change the sign of the weights
#'
#' A helper function to extract the feature weights from the projection
#' vector stored in a tree object. Used in
#' \code{RunFeatureImportanceBinary}.
#'
#' @param x a list of unique.projections from the intermediate steps of
#' the FeatureImportance function.
#'
#' @return x with sign of weights flipped.
#'


flipWeights <- function(x) {
s <- seq(2, length(x), by = 2)
x[s] <- -x[s]
return(x)
}


#' Remove unique projections that are equivalent due to a rotation of 180
#' degrees.
#'
#' This function finds the projections that are equivalent via a 180
#' degree rotation and removes the duplicates.
#'
#' @param p the number of features in the original data. This can be
#' obtained from a forest object via \code{forest$params$paramList$p}.
#' @param unique.projections a list of projections from intermediate
#' steps of the \code{\link{FeatureImportance}} function.
#'
#' @return unique.projections a list which is a subset of the input.
#'
#' @seealso \code{\link{FeatureImportance}}
#'
#'

uniqueByEquivalenceClass <- function(p, unique.projections) {

## the matrix of weights (w)
w <- matrix(0,
ncol = p,
nrow = length(unique.projections)
)

for (i in 1:length(unique.projections)) {
for (j in seq(1, length(unique.projections[[i]]), by = 2)) {
w[i, unique.projections[[i]][j]] <- unique.projections[[i]][j + 1]
}
}

out <- vector("list", 1 / 2 * (length(unique.projections) - 1) *
length(unique.projections))
k <- 1
for (i in 1:nrow(w)) {
for (j in i:nrow(w)) {
if (all(w[i, ] == -w[j, ])) {
out[[k]] <- c(i, j)
k <- k + 1
}
}
}

ind <- !sapply(out, is.null)
out <- Reduce(rbind, out)

unique.projections[out[, 2]] <- NULL

return(unique.projections)
}
22 changes: 19 additions & 3 deletions man/FeatureImportance.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions man/RunFeatureImportanceBinary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions man/RunFeatureImportanceCounts.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.