# Random Forests in R

We're using the [randomForest](https://www.rdocumentation.org/packages/randomForest) package for R. 

The dataset is available from https://archive.ics.uci.edu/ml/datasets/Bank+Marketing.

## Set up environment and required packages

The package *tidyverse* includes *dplyr, tidyr, readr, ggplot2*

In [None]:
# Silence all warnings for the rest of the session (a negative `warn` value
# makes R ignore them).
# NOTE(review): this also hides warnings raised during model fitting and
# evaluation further down.
options(warn=-1)

# Attach the required packages without printing their startup banners.
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(ROCR))
suppressPackageStartupMessages(library(randomForest))

## Load the data
We have 3 data sets
1. The full bank data set with more than 41,000 entries, quite unbalanced
2. A smaller subset - still unbalanced
3. A balanced sample of the full data set with ~ 9200 entries

In [None]:
# Directory prefix for the CSV files. The trailing slash matters because the
# prefix is concatenated directly with the file name when a data set is loaded.
data_dir_default <- "../data/"

# Base names (without the ".csv" extension) of the available data sets.
data_sets <- c("bank-full", "bank-10percent", "bank-balanced")

A little helper function for loading the different data sets

In [None]:
# Load one of the bank marketing data sets from a CSV file.
#
# Args:
#   data_set: Base name of the data set, without directory or ".csv" suffix
#             (e.g. "bank-balanced").
#   data_dir: Directory prefix including the trailing separator; it is
#             concatenated with `data_set` as-is. Defaults to
#             `data_dir_default`.
#
# Returns:
#   A data.frame in which all string columns are factors.
read_data <- function(data_set, data_dir = data_dir_default) {
  csv_path <- paste0(data_dir, data_set, ".csv")
  # Since R 4.0.0 read.csv() keeps strings as character vectors by default.
  # The target `y` and the categorical predictors must be factors for
  # randomForest() to fit a classifier and for confusionMatrix() to compare
  # predictions, so request the conversion explicitly.
  read.csv(csv_path, stringsAsFactors = TRUE)
}

We're using the balanced data set first

In [None]:
# Load the balanced sample and report how many rows and columns it has.
bank_data <- read_data("bank-balanced")
data_dims <- dim(bank_data)
cat("# data rows: ", data_dims[1], "- # features: ", data_dims[2], "\n")

## Partition the data into training and test sets

In [None]:
# Randomly split a data frame into two disjoint parts.
#
# Args:
#   data: A data.frame to split row-wise.
#   prop: Proportion of rows (between 0 and 1) assigned to the first part.
#         Defaults to 0.8.
#
# Returns:
#   An unnamed list of two data.frames: [[1]] holds round(prop * nrow(data))
#   randomly sampled rows, [[2]] holds the remaining rows in their original
#   order.
#
# Note: the RNG seed is fixed inside the function, so the same input always
# produces the same split (and the global RNG state is reset on every call).
partition_data <- function(data, prop = 0.8) {
  stopifnot(
    is.numeric(prop), length(prop) == 1L, !is.na(prop),
    prop >= 0, prop <= 1
  )

  set.seed(4711)
  n <- nrow(data)
  # Honour the requested proportion instead of a hard-coded 0.8.
  n_train <- round(prop * n)
  # seq_len() rather than 1:n, so a zero-row input does not sample from c(1, 0).
  row_ids <- seq_len(n)
  partition <- sample(row_ids, n_train)

  # A logical mask is used for the complement because negative indexing with
  # an empty index vector would select no rows at all. drop = FALSE keeps
  # single-column inputs as data frames.
  first <- data[partition, , drop = FALSE]
  second <- data[!(row_ids %in% partition), , drop = FALSE]

  list(first, second)
}

We'll use the standard 80/20 split

In [None]:
# Split the loaded data with the default proportion; the first element of the
# returned list is the training part, the second the test part.
partitions <- partition_data(bank_data)
train.df <- partitions[[1]]
test.df  <- partitions[[2]]

# Report the size of each part.
cat("Number of training samples :", nrow(train.df), "\n")
cat("Number of test samples     :", nrow(test.df), "\n")

## Build the Model
We're using 100 trees, the default of 500 seems quite high

In [None]:
# Fit a random forest that predicts `y` from all other columns of the
# training data, growing 100 trees.
model <- randomForest(y ~ ., 
                      data = train.df,
                      ntree = 100)

Let's see how the error rate goes down with the increasing number of trees. The 'yes', 'no' curves are errors on the objects in those classes. OOB is the mean prediction error on each training sample xᵢ, using only the trees that did not have xᵢ in their bootstrap sample.

In [None]:
# Draw the error curves and their legend side by side: a wide panel for the
# curves and a narrow panel that only holds the legend.
# `widths` is spelled out in full (the argument of layout() is `widths`, so
# `width` only worked through partial argument matching).
layout(matrix(c(1, 2), nrow = 1), widths = c(4, 1))
par(mar = c(5, 4, 4, 0)) # No margin on the right side
plot(model, log = "y")
par(mar = c(5, 0, 4, 2)) # No margin on the left side
# Empty plot that serves only as a canvas for the legend. FALSE instead of F,
# because F is an ordinary variable that can be reassigned.
plot(c(0, 1), type = "n", axes = FALSE, xlab = "", ylab = "")
legend("top", colnames(model$err.rate), col = 1:4, cex = 0.8, fill = 1:4)

## Evaluate on the test set
To get the predicted classes we call *predict* on the random forest model with *type="response"*; for a matrix of class probabilities we use *type="prob"*.

In [None]:
# Predict class labels and "yes" probabilities for new observations.
#
# Args:
#   model: A fitted model object that predict() can dispatch on and that
#          supports type = "response" and type = "prob".
#   data:  The new observations to predict for.
#
# Returns:
#   An unnamed list: [[1]] is the result of predict(..., type = "response"),
#   [[2]] is the "yes" column of the matrix returned by
#   predict(..., type = "prob").
predicted <- function(model, data) {
  class_labels <- predict(model, newdata = data, type = "response")
  prob_matrix <- predict(model, newdata = data, type = "prob")

  list(class_labels, prob_matrix[, "yes"])
}

# Score the test set and unpack the two list elements returned by predicted():
# the predicted classes and the predicted probabilities for "yes".
predicted_class_probs <- predicted(model, test.df)
predict.class <- predicted_class_probs[[1]]
predict.probs.yes <- predicted_class_probs[[2]]

### Confusion Matrix

In [None]:
# Cross-tabulate the predicted classes against the true labels of the test set
# and print the resulting statistics.
# NOTE(review): no `positive` level is passed, so caret presumably treats the
# first factor level as the positive class - confirm this is the intended one.
evaluation <- confusionMatrix(data = predict.class,       
                              reference = test.df$y)
print(evaluation)

### Classification Accuracy

In [None]:
# Pull the "Accuracy" entry out of the overall statistics and print it as a
# percentage with 4 significant digits.
accuracy <- evaluation$overall["Accuracy"]
cat("Classification Accuracy : ", format(100*accuracy,digits = 4), "%\n")

### ROC Curve

In [None]:
# Combine the predicted "yes" probabilities with the true labels, then compute
# and plot the true positive rate against the false positive rate.
pred <- prediction(predict.probs.yes, test.df$y)
roc_perf <- performance(pred,"tpr","fpr")
plot(roc_perf, colorize=TRUE)

### Area under Curve (AUC)

In [None]:
# Compute the "auc" measure for the prediction object; the value is read from
# the first element of the `y.values` slot and printed with 4 significant
# digits.
auc_perf <- performance(pred,"auc")
auc <- auc_perf@y.values[[1]]
cat("AUC :", format(auc,digits = 4))

## What's the effect of growing different numbers of trees?
We're creating models on different data sets with varying numbers of trees in the random forest.
We're only evaluating the AUC on the unbalanced data sets since the accuracy doesn't really mean anything here.

In [None]:
# Train a random forest on one data set and evaluate it on the held-out part.
#
# Args:
#   data_set: Base name of the data set to load with read_data().
#   ntree:    Number of trees to grow.
#
# Returns:
#   An unnamed list: [[1]] is the "Accuracy" entry of the confusion matrix's
#   overall statistics, [[2]] is the AUC computed from the predicted "yes"
#   probabilities.
build_and_evaluate_model <- function(data_set, ntree) {
  # Load and split the data.
  parts <- partition_data(read_data(data_set))
  train_set <- parts[[1]]
  test_set <- parts[[2]]

  # Fit the forest on the first part.
  forest <- randomForest(y ~ ., data = train_set, ntree = ntree)

  # Score the second part.
  scored <- predicted(forest, test_set)
  class_labels <- scored[[1]]
  yes_probs <- scored[[2]]

  # Accuracy from the confusion matrix.
  conf_matrix <- confusionMatrix(data = class_labels, reference = test_set$y)
  accuracy <- conf_matrix$overall["Accuracy"]

  # AUC from the predicted probabilities.
  rocr_pred <- prediction(yes_probs, test_set$y)
  auc <- performance(rocr_pred, "auc")@y.values[[1]]

  list(accuracy, auc)
}

In [None]:
# For every data set, train forests of increasing size and report the results.
# Accuracy is only reported for the balanced data set; the other data sets
# report the AUC alone.
tree_counts <- c(10, 20, 50, 100, 500)
for (data_set in data_sets) {
  message(paste("=====", data_set, "====="))
  for (ntree in tree_counts) {
    result <- build_and_evaluate_model(data_set, ntree)
    msg <- if (data_set == "bank-balanced") {
      paste(
        "# trees :", ntree,
        "\taccuracy :", format(100 * result[[1]], digits = 4),
        "\tAUC :", format(result[[2]], digits = 4)
      )
    } else {
      paste(
        "# trees :", ntree,
        "\tAUC :", format(result[[2]], digits = 4)
      )
    }
    message(msg)
  }
}