Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
196 lines (147 sloc) 5.87 KB
---
title: "Terence Parr Feat Imp"
author: "Christopher Csiszar"
date: "3/22/2018"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = F, message = F)
```
## Biases in RF importance calculations
Simple RF Classification done in R to see how biases their RF feature importance metrics are
```{r cars}
# RF Classification - feature importances
library(tidyverse)
library(randomForest)
library(cowplot)
library(gridExtra)
setwd("~/Downloads/")
rent <- read.csv('rent.csv')
class(rent$interest_level)
#interest.map <- c("low"=1, "medium"=2, "high"=3)
#ent$interest_level <- interest.map[as.character(rent$interest_level)]
summary(rent)
## plotting functions ##
create_rfplot <- function(rf, type){
imp <- importance(rf, type=type, scale = F)
featureImportance <- data.frame(Feature=row.names(imp), Importance=imp[,1])
p <- ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) +
geom_bar(stat="identity", fill="#53cfff", width = 0.65) +
coord_flip() +
theme_light(base_size=20) +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.text.x = element_text(size = 15, color = "black"),
axis.text.y = element_text(size = 15, color = "black"))
return(p)
}
create_ggplot <- function(featureImportance){
p <- ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) +
geom_bar(stat="identity", fill="#53cfff", width = 0.65) +
coord_flip() +
theme_light(base_size=20) +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.text.x = element_text(size = 15, color = "black"),
axis.text.y = element_text(size = 15, color = "black"))
return(p)
}
```
```{r}
rent$interest_level <- as.factor(rent$interest_level)
head(rent)
```
## Type = 1, mean decrease in Accuracy
```{r cars3}
####### no random column #########
set.seed(1)
rent$random <- sample(100, size = nrow(rent), replace = TRUE)
#Fit Random Forest Model
rf1 = randomForest(interest_level ~ .,
ntree = 40,
data = rent[, 1:6],
nodesize = 1, importance = TRUE)
#print(rf)
importance(rf1, type = 1)
#round(importance(rf), 2)
# Variable Importance
k = varImpPlot(rf1,
sort = T,
main="Top - Variable Importance")
#p1 <- create_rfplot(rf1, type = 1)
#ggsave('../article/images/cls_permute_R.svg',
# plot = p1, device = 'svg', height = 4, width = 6)
######## with random column ########
#Fit Random Forest Model
rf2 = randomForest(interest_level ~ .,
ntree = 40,
data = rent,
nodesize = 1, importance = TRUE)
#print(rf)
#importance(rf2, type = 1)
imp1 <- data.frame(importance(rf2, type = 2))
write.csv(imp1, file="imp_R_class_gini.csv")
#round(importance(rf), 2)
imp1 <- data.frame(importance(rf2, type = 1))
write.csv(imp1, file="imp_R_class_acc.csv")
# Variable Importance
#varImpPlot(rf,
#sort = T,
#main="Top - Variable Importance")
#p2 <- create_rfplot(rf2, type = 1)
#ggsave('../article/images/cls_permute_random_R.svg',
# plot = p2, device = 'svg', height = 4, width = 6)
```
## Type = 2, mean decrease in Gini
```{r cars4}
####### no random column #########
#p1 <- create_rfplot(rf1, type = 2)
#ggsave('../article/images/cls_dflt_R.svg',
#plot = p1, device = 'svg', height = 4, width = 6)
######## with random column ########
#imp1 <- data.frame(importance(rf2, type = 2))
#write.csv(imp1, file="imp_R_class_gini.csv")
#p2 <- create_rfplot(rf2, type = 2)
#ggsave('../article/images/cls_dflt_random_R.svg',
#plot = p2, device = 'svg', height = 4, width = 6)
```
## Cost by dropping column analysis
```{r cars5, eval=FALSE}
####### no random column #########
get_drop_imp <- function(rent, columns){
X <- rent[,c(columns, 'interest_level')] # data
rf <- randomForest(interest_level~., data = X,
ntree = 40, mtry=2, nodesize=1, importance=T)
full_rsq <- -1*mean(rf$err.rate) #
imp <- c()
for (c in columns){
X_sub <- X[, !(colnames(X) == c)]
rf <- randomForest(interest_level~., data = X_sub,
ntree = 40, mtry=2, nodesize=1, importance=T)
sub_rsq <- -1*mean(rf$err.rate) #
diff_rsq <- full_rsq - sub_rsq
imp <- c(imp, diff_rsq)
}
featureImportance <- data.frame(Feature=columns, Importance=imp)
return(featureImportance)
}
columns <- c('bathrooms', 'bedrooms', 'longitude', 'latitude', 'price')
featureImportance <- get_drop_imp(rent[, 1:6], columns)
write.csv(featureImportance, file="imp_R_class_gini.csv")
#p1 <- create_ggplot(featureImportance)
#ggsave('../article/images/cls_drop_R.svg',
#plot = p1, device = 'svg', height = 4, width = 6)
columns <- c('bathrooms', 'bedrooms', 'longitude', 'latitude', 'random', 'price')
featureImportance <- get_drop_imp(rent, columns)
write.csv(featureImportance, file="imp_R_class_drop.csv")
#p2 <- create_ggplot(featureImportance)
#ggsave('../article/images/cls_drop_random_R.svg',
#plot = p2, device = 'svg', height = 4, width = 6)
```
## Takeaways
It appears that RF feature importance in R has several different metrics when evaluating. It seems that the "decrease in accuracy" metric places the `random` column dead last, as expected, while the "decrease in Gini" metric is terribly biased due to high cardinality, placing the `random` column as second most important.
Another thing to note is, due to low cardinality, `bedrooms` is a less important feature for Gini decrease metrics.
More of RF feature importance interpretation in R:
https://cran.r-project.org/web/packages/randomForest/randomForest.pdf
https://stats.stackexchange.com/questions/197827/how-to-interpret-mean-decrease-in-accuracy-and-mean-decrease-gini-in-random-fore
https://stackoverflow.com/questions/736514/r-random-forests-variable-importance