# Apply Random Forest

In [1]:
library(grid)
library(rpart)
library(rpart.plot)
library(partykit)
library(lattice)
library(ggplot2)
library(caret)
library(e1071)
library(randomForest)

randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:ggplot2’:

    margin



In [2]:
# Fix the random number generator seed so later stochastic steps are repeatable.
set.seed(1)

In [3]:
# Column names regarded as unusable predictors.
# NOTE(review): no visible call references this vector; presumably it is meant as the
# `invalids` argument of createFormula() -- confirm. It also lists Curve100, Curve150 and
# TimeHeadway, which are included in `valuables` below.
invalids <- c('Time', 'Longitude', 'Latitude', 'Brake', 'Accel', 'flag', 'Curve100', 'Curve150', 'TimeHeadway')
# Alternative list that does not exclude TimeHeadway:
# invalids <- c('Time', 'Longitude', 'Latitude', 'Brake', 'Accel', 'flag', 'Curve100', 'Curve150')

In [32]:
# Predictor columns kept for modelling; the response column "flag" is appended to this
# list when the modelling data frame `df` is cut out of the raw data.
valuables <- c("RoadType","CurveAverage","Speed", "Curve100", "Curve150","MaxSpeed","RiskFactor","Curve","DistSignal","Pitch","AheadDistance","AverageVelocity","TimeToCollision","AccelerationSpeed", "Engine", "SteeringAngle", "TimeHeadway", "Jerk", "LaneCount", "DiffAvgSpeed", "EmptinessOfRoad", "RoadFactor")

In [33]:
# C-style printf: format the arguments with sprintf() and write the result to the
# console with cat(). No newline is appended; include "\n" in the format string.
printf <- function(...) {
    formatted <- sprintf(...)
    cat(formatted)
}

In [6]:
# Estimate the misclassification rate of a random forest by cross-validation.
#
# The folds come from caret's createFolds() on dfx$flag with its default settings; for
# each fold a forest is fitted on the remaining rows and evaluated on the held-out rows.
#
# Args:
#   dfx:     data frame containing the response column `flag` plus the predictors
#            (every other column is used, via `flag ~ .`).
#   verbose: if TRUE, print the per-fold "Red"/"Blue" correct/predict ratios through
#            correctVsPredict() and average them over the folds.
#   mtry:    number of variables tried at each split, forwarded to randomForest().
#   ntree:   number of trees, forwarded to randomForest().
#
# Returns:
#   Numeric vector of length 3:
#     [1] mean error rate over the folds,
#     [2] mean "Red" correct/predict ratio,
#     [3] mean "Blue" correct/predict ratio.
#   Elements 2 and 3 are NA when verbose is FALSE, because the ratios are only computed
#   by correctVsPredict(), which prints as a side effect. A fold in which a class is
#   never predicted yields NaN for that class, and that NaN propagates into the mean.
CV <- function(dfx, verbose=FALSE, mtry=3, ntree=500) {
    folds <- createFolds(dfx$flag)
    nFolds <- length(folds)

    # Preallocate the per-fold results instead of growing vectors inside the loop.
    errs <- numeric(nFolds)
    reds <- rep(NA_real_, nFolds)
    blues <- rep(NA_real_, nFolds)

    for (i in seq_along(folds)) {
        ids <- folds[[i]]
        train <- dfx[-ids, ]
        test <- dfx[ids, ]
        fit <- randomForest(flag ~ ., data=train, mtry=mtry, ntree=ntree)
        p <- predict(fit, newdata=test)

        if (verbose) {
            printf("Fold%d\n", i)
            result <- correctVsPredict(test, p)
            reds[i] <- result[1]
            blues[i] <- result[2]
            printf("\n")
        }

        # Share of held-out rows whose predicted flag differs from the true flag.
        errs[i] <- sum(p != test$flag) / nrow(test)
    }

    if (!verbose) {
        # No class ratios were collected; return NA explicitly rather than calling
        # mean() on an empty vector, which returns NA with a warning.
        return(c(mean(errs), NA_real_, NA_real_))
    }

    c(mean(errs), mean(reds), mean(blues))
}

In [7]:
# Report, for each of the classes "Red" and "Blue", how many of the rows predicted as
# that class really carry that flag (correct / predicted).
#
# Args:
#   test:           data frame of held-out rows with the true label in column `flag`.
#   predictedFlags: predicted labels, one per row of `test`.
#
# Returns:
#   Numeric vector c(redRatio, blueRatio). A ratio is NaN when the class is never
#   predicted (0/0).
#
# Side effect: prints one line per class via printf().
correctVsPredict <- function(test, predictedFlags) {
    # Overall accuracy report, currently disabled:
    # nTests <- nrow(test)
    # nCorrectAll <- sum((predictedFlags == test$flag) == TRUE)
    # printf("As for All: correct/all = %d/%d = %f\n", nCorrectAll, nTests, nCorrectAll / nTests)

    # Count, print and return the correct/predicted ratio for one class label.
    reportClass <- function(label, fmt) {
        predictedRows <- test[predictedFlags == label, ]
        nPredicted <- nrow(predictedRows)
        nCorrect <- sum(predictedRows$flag == label)
        ratio <- nCorrect / nPredicted
        printf(fmt, nCorrect, nPredicted, ratio)
        ratio
    }

    redRatio <- reportClass("Red", "As for Red: correct/predict = %d/%d = %f\n")
    blueRatio <- reportClass("Blue", "As for Blue: correct/predict = %d/%d = %f\n")

    c(redRatio, blueRatio)
}

In [8]:
# Print the number and fraction of rows in `dfx` whose `flag` is "Red", followed by the
# complementary count and fraction (all rows that are not "Red").
printRedRatios <- function(dfx) {
    redRows <- dfx[dfx$flag == "Red", ]
    total <- nrow(dfx)
    redCount <- nrow(redRows)
    redShare <- redCount / total

    printf("Red/All = %d/%d = %f\n", redCount, total, redShare)
    printf("1 - Red/All = %d/%d = %f\n", total - redCount, total, 1 - redShare)
}

In [9]:
# Build the text of a model formula with `flag` as the response.
#
# Args:
#   invalids:    names that must not appear on the right-hand side.
#   allFeatures: candidate predictor names.
#
# Returns:
#   A single string "flag ~  f1 + f2 + ..." made of the entries of `allFeatures` that are
#   not in `invalids`, in their original order. The result is a character string, not a
#   formula object.
createFormula <- function(invalids, allFeatures) {
    excluded <- allFeatures %in% invalids
    kept <- allFeatures[!excluded]
    rhs <- paste(kept, collapse=" + ")
    paste("flag ~ ", rhs)
}

# Predict Red

In [34]:
# Load the raw data. Strings are kept as character so the `flag` labels can be rewritten
# before the column is converted to a factor.
df3 <- read.csv("../data/middle/sp5.csv", stringsAsFactors=FALSE)

In [35]:
# Merge the A/B sub-labels into two classes, "Red" and "Blue", then turn the label
# column into a factor. Any other label value is left as it is.
isRed <- df3$flag %in% c("RedA", "RedB")
df3$flag[isRed] <- "Red"
isBlue <- df3$flag %in% c("BlueA", "BlueB")
df3$flag[isBlue] <- "Blue"
df3$flag <- as.factor(df3$flag)

In [36]:
# Formula text "flag ~  <all entries of valuables joined by ' + '>".
# NOTE(review): the name is misspelled ("fomula") and no visible cell reads it; the models
# below use `flag ~ .` instead. Confirm whether it is still needed.
fomula <- paste("flag ~ ", paste(valuables, collapse=" + "))

In [37]:
# Reset the seed so the fold assignment and forests in the cross-validation run below
# are repeatable.
set.seed(1)

In [38]:
# Modelling data: only the selected predictor columns plus the response `flag`.
df <- df3[, c(valuables, "flag")]

In [39]:
# Cross-validate with per-fold reporting. `result` holds, in order: the mean fold error
# rate, the mean Red correct/predict ratio and the mean Blue correct/predict ratio.
result <- CV(df, verbose=TRUE)
# Share of "Red" rows in the whole data set, and its complement.
printRedRatios(df)
printf("Red: Mean correct/predict = %f\n", result[2])
printf("Blue: Mean correct/predict = %f\n", result[3])
# Mean error rate over the folds (printed without a trailing newline).
printf("CV value: %f", result[1])

Fold1
As for Red: correct/predict = 41/57 = 0.719298
As for Blue: correct/predict = 28/47 = 0.595745

Fold2
As for Red: correct/predict = 47/72 = 0.652778
As for Blue: correct/predict = 20/33 = 0.606061

Fold3
As for Red: correct/predict = 49/79 = 0.620253
As for Blue: correct/predict = 15/25 = 0.600000

Fold4
As for Red: correct/predict = 46/65 = 0.707692
As for Blue: correct/predict = 26/39 = 0.666667

Fold5
As for Red: correct/predict = 51/77 = 0.662338
As for Blue: correct/predict = 19/27 = 0.703704

Fold6
As for Red: correct/predict = 43/62 = 0.693548
As for Blue: correct/predict = 26/42 = 0.619048

Fold7
As for Red: correct/predict = 46/67 = 0.686567
As for Blue: correct/predict = 24/38 = 0.631579

Fold8
As for Red: correct/predict = 47/67 = 0.701493
As for Blue: correct/predict = 25/37 = 0.675676

Fold9
As for Red: correct/predict = 43/67 = 0.641791
As for Blue: correct/predict = 21/37 = 0.567568

Fold10
As for Red: correct/predict = 46/72 = 0.638889
As for Blue: correct/predict

In [21]:
# Reset the seed before the next stochastic step (tuning / final forest fit).
set.seed(1)

In [None]:
# Candidate forest sizes for the grid search: 50, 80, ..., 590.
ntree.lim <- seq(50, 600, 30)

# Grid search over `ntree` with 10-fold cross-validation (e1071::tune.randomForest).
# The search runs on `df`, the same data frame used for cross-validation and for the
# final fit; `dfx` exists only as a parameter inside CV() and is not defined here.
# NOTE(review): the result is stored in `tune`, the same name as e1071's tune() function;
# function calls still resolve, but consider a distinct name.
tune <- tune.randomForest(
        flag ~ .,
        data=df,
        ntree=ntree.lim,
        tunecontrol = tune.control(sampling = "cross", cross = 10)
)

In [None]:
# Print the summary of the tuning result.
summary(tune)

In [None]:
# Plot the tuning result.
plot(tune)

In [22]:
# Fit a single forest on the complete modelling data, with the same mtry/ntree values
# that CV() uses for its per-fold fits.
fit <- randomForest(flag ~ ., data=df, mtry=3, ntree=500)

In [23]:
# Show the variable importance table stored on the fitted forest.
fit$importance

Unnamed: 0,MeanDecreaseGini
RoadType,9.290083
CurveAverage,20.239817
Speed,31.759536
Curve100,13.918153
Curve150,13.396295
MaxSpeed,29.73528
RiskFactor,28.436219
Curve,12.536519
DistSignal,31.143696
Pitch,30.293228
