# Apply Random Forest

In [1]:
library(grid)
library(rpart)
library(rpart.plot)
library(partykit)
library(lattice)
library(ggplot2)
library(caret)
library(e1071)
library(dplyr)
library(randomForest)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:dplyr’:

    combine

The following object is masked from ‘package:ggplot2’:

    margin



In [2]:
# Fix the random number generator state so later random steps are repeatable.
set.seed(seed = 1)

In [3]:
# Column names treated as not usable as predictors.
# NOTE(review): "Curve100", "Curve150" and "TimeHeadway" are also listed in
# `valuables` below -- confirm which list is meant to win.
invalids <- c(
    "Time", "Longitude", "Latitude",
    "Brake", "Accel", "flag",
    "Curve100", "Curve150", "TimeHeadway"
)
# Alternative that keeps TimeHeadway available:
# invalids <- c('Time', 'Longitude', 'Latitude', 'Brake', 'Accel', 'flag', 'Curve100', 'Curve150')

In [4]:
# Predictor columns fed to the random forest (order matters: it fixes the
# column order of every training frame built from this vector).
valuables <- c(
    "RoadType", "CurveAverage", "Speed", "Curve100", "Curve150",
    "MaxSpeed", "RiskFactor", "Curve", "DistSignal", "Pitch",
    "AheadDistance", "AverageVelocity", "TimeToCollision",
    "AccelerationSpeed", "Engine", "SteeringAngle", "TimeHeadway",
    "Jerk", "LaneCount", "DiffAvgSpeed", "EmptinessOfRoad", "RoadFactor"
)

In [5]:
# C-style formatted printing: formats the arguments with sprintf() and writes
# the resulting text to the console with cat() (no newline is appended).
printf <- function(...) {
    cat(sprintf(...))
}

In [6]:
# Cross-validate a random forest that predicts `flag`.
#
# Arguments:
#   dfx      data frame holding a `flag` column plus every column named in
#            `features`; when verbose is TRUE it is also handed to shortErr(),
#            which reads an `AccelOrBrake` column.
#   verbose  when TRUE, print per-fold diagnostics and collect the per-fold
#            ratios returned by correctVsPredict() and shortErr().
#   mtry     passed to randomForest() (default 3, the value used before).
#   ntree    passed to randomForest() (default 500, the value used before).
#   k        number of folds requested from createFolds() (default 10).
#   features predictor column names; defaults to the global `valuables`.
#
# Returns a numeric vector of length 4:
#   [1] mean misclassification rate over the folds
#   [2] mean shortErr() ratio
#   [3] mean correct/predicted ratio for "Red"
#   [4] mean correct/predicted ratio for "Blue"
# Elements 2-4 are NA when verbose is FALSE, because those ratios are only
# computed in verbose mode.
CV <- function(dfx, verbose = FALSE, mtry = 3, ntree = 500, k = 10,
               features = valuables) {
    folds <- createFolds(dfx$flag, k = k)
    nFolds <- length(folds)

    # Preallocate; the NA defaults make the non-verbose means come out as NA
    # without calling mean() on an empty vector (which warns).
    errs <- numeric(nFolds)
    reds <- rep(NA_real_, nFolds)
    blues <- rep(NA_real_, nFolds)
    shorts <- rep(NA_real_, nFolds)

    for (i in seq_along(folds)) {
        ids <- folds[[i]]
        # Train only on the predictors and the label; the test frame keeps
        # every column because shortErr() needs AccelOrBrake.
        train <- dfx[-ids, c(features, "flag")]
        test <- dfx[ids, ]
        fit <- randomForest(flag ~ ., data = train, mtry = mtry, ntree = ntree)
        p <- predict(fit, newdata = test)

        if (verbose) {
            printf("Fold%d\n", i)
            result <- correctVsPredict(test, p)
            reds[i] <- result[1]
            blues[i] <- result[2]
            shorts[i] <- shortErr(test, p)
            printf("\n")
        }

        errs[i] <- sum(p != test$flag) / nrow(test)
    }

    c(mean(errs), mean(shorts), mean(reds), mean(blues))
}

In [7]:
# Among the test rows whose true flag is "Red" and whose AccelOrBrake is
# "Accel", report the fraction that was predicted "Red".
#
# Arguments:
#   test            data frame with `flag` and `AccelOrBrake` columns.
#   predictedFlags  predicted labels, one per row of `test`.
#
# Prints one summary line and returns the ratio (NaN when there is no
# Red/Accel row, since that is 0/0).
shortErr <- function(test, predictedFlags) {
    # Count with logical masks rather than nrow(test[mask, ]): subsetting a
    # data frame with an NA in the mask yields an all-NA row, which would be
    # counted as a match.
    isRedAccel <- test$flag == "Red" & test$AccelOrBrake == "Accel"
    nRedAccel <- sum(isRedAccel, na.rm = TRUE)
    predictedRedRowsInTrue <- sum(isRedAccel & predictedFlags == "Red", na.rm = TRUE)

    ratio <- predictedRedRowsInTrue / nRedAccel
    printf("Short: predictedRedInTrueA/trueRedA = %d/%d = %f\n", predictedRedRowsInTrue, nRedAccel, ratio)
    ratio
}

In [8]:
# For each label ("Red", then "Blue"), print and return the share of rows
# predicted as that label whose true `flag` is that label.
#
# Arguments:
#   test            data frame with a `flag` column holding the true labels.
#   predictedFlags  predicted labels, one per row of `test`.
#
# Returns c(redRatio, blueRatio). Overall accuracy is not reported here.
correctVsPredict <- function(test, predictedFlags) {
    # Shared worker: the label selects the predicted rows and the format
    # string carries the label-specific message.
    ratioFor <- function(label, fmt) {
        predictedRows <- test[predictedFlags == label, ]
        nCorrect <- sum(predictedRows$flag == label)
        nPredicted <- nrow(predictedRows)
        printf(fmt, nCorrect, nPredicted, nCorrect / nPredicted)
        nCorrect / nPredicted
    }

    redRatio <- ratioFor("Red", "As for Red: correct/predict = %d/%d = %f\n")
    blueRatio <- ratioFor("Blue", "As for Blue: correct/predict = %d/%d = %f\n")

    c(redRatio, blueRatio)
}

In [9]:
# Print the share of rows whose `flag` is "Red", and its complement, as
# count/total = fraction lines.
#
# Arguments:
#   dfx  data frame with a `flag` column.
printRedRatios <- function(dfx) {
    redRows <- dfx[dfx$flag == "Red", ]
    total <- nrow(dfx)
    reds <- nrow(redRows)
    redShare <- reds / total

    printf("Red/All = %d/%d = %f\n", reds, total, redShare)
    printf("1 - Red/All = %d/%d = %f\n", total - reds, total, 1 - redShare)
}

In [10]:
# Build the text of a model formula "flag ~  f1 + f2 + ..." from every entry
# of `allFeatures` that is not listed in `invalids`.
#
# Arguments:
#   invalids     column names to leave out.
#   allFeatures  candidate column names, in the order they should appear.
#
# Returns a single character string (not a formula object).
createFormula <- function(invalids, allFeatures) {
    keep <- !(allFeatures %in% invalids)
    rhs <- paste(allFeatures[keep], collapse=" + ")
    paste("flag ~ ", rhs)
}

# Predict Red

In [11]:
# Load the raw table; text columns stay character so `flag` can be recoded
# before it is turned into a factor.
df3 <- read.csv(
    "../data/middle/sp5.csv",
    stringsAsFactors = FALSE
)

In [12]:
# Number of rows per raw flag value.
df3 %>%
    group_by(flag) %>%
    summarize(count = n())

flag,count
BlueA,77
BlueB,372
RedA,401
RedB,192


In [13]:
# Record the trailing letter of the raw flag as a separate column:
# "...A" rows are "Accel", "...B" rows are "Brake".
df3$AccelOrBrake[df3$flag %in% c("RedA", "BlueA")] <- "Accel"
df3$AccelOrBrake[df3$flag %in% c("RedB", "BlueB")] <- "Brake"

In [14]:
# Collapse the four raw flags into the two classes to predict, then make the
# column a factor so randomForest treats the task as classification.
df3$flag[df3$flag %in% c("RedA", "RedB")] <- "Red"
df3$flag[df3$flag %in% c("BlueA", "BlueB")] <- "Blue"
df3$flag <- factor(df3$flag)

In [15]:
# Formula text using every column in `valuables` (nothing excluded).
# The variable name keeps its original spelling in case it is referenced elsewhere.
fomula <- createFormula(invalids = character(0), allFeatures = valuables)

In [16]:
# Reset the RNG so the cross-validation folds and forests below are repeatable.
set.seed(seed = 1)

In [17]:
# Working frame: the predictors, the Accel/Brake marker and the class label.
# NOTE(review): the name `df` also exists as a function in the stats package;
# it is kept because the cells below refer to it.
df <- df3[c(valuables, "AccelOrBrake", "flag")]

In [18]:
# Run the verbose cross-validation, then print the class balance followed by
# the averaged metrics. CV() returns c(error, short, red, blue).
result <- CV(df, verbose = TRUE)
printRedRatios(df)
printf("Red: Mean correct/predict = %f\n", result[[3]])
printf("Blue: Mean correct/predict = %f\n", result[[4]])
printf("Short: Mean predictedRedInTrueA/trueRedA = %f\n", result[[2]])
printf("CV value: %f", result[[1]])

Fold1
As for Red: correct/predict = 42/58 = 0.724138
As for Blue: correct/predict = 28/46 = 0.608696
Short: predictedRedInTrueA/trueRedA = 29/40 = 0.725000

Fold2
As for Red: correct/predict = 48/70 = 0.685714
As for Blue: correct/predict = 23/35 = 0.657143
Short: predictedRedInTrueA/trueRedA = 37/44 = 0.840909

Fold3
As for Red: correct/predict = 48/77 = 0.623377
As for Blue: correct/predict = 16/27 = 0.592593
Short: predictedRedInTrueA/trueRedA = 29/35 = 0.828571

Fold4
As for Red: correct/predict = 49/69 = 0.710145
As for Blue: correct/predict = 25/35 = 0.714286
Short: predictedRedInTrueA/trueRedA = 34/40 = 0.850000

Fold5
As for Red: correct/predict = 50/76 = 0.657895
As for Blue: correct/predict = 19/28 = 0.678571
Short: predictedRedInTrueA/trueRedA = 30/37 = 0.810811

Fold6
As for Red: correct/predict = 45/65 = 0.692308
As for Blue: correct/predict = 25/39 = 0.641026
Short: predictedRedInTrueA/trueRedA = 36/46 = 0.782609

Fold7
As for Red: correct/predict = 49/69 = 0.710145
As fo

In [19]:
# Reset the RNG so the tuning run below is repeatable.
set.seed(seed = 1)

In [20]:
# Candidate forest sizes: 50, 80, ..., 590.
ntree.lim <- seq(from = 50, to = 600, by = 30)

# Compare the candidate sizes with 10-fold cross-validation.
# NOTE(review): mtry is not passed here, whereas CV() and the final fit use
# mtry=3 -- confirm the tuning is meant to run with the default mtry.
tune <- tune.randomForest(
    flag ~ .,
    data = df[, c(valuables, "flag")],
    ntree = ntree.lim,
    tunecontrol = tune.control(sampling = "cross", cross = 10)
)

In [21]:
# Show the tuning result: the best ntree and the error/dispersion per candidate.
summary(tune)


Parameter tuning of ‘randomForest’:

- sampling method: 10-fold cross validation 

- best parameters:
 ntree
   350

- best performance: 0.3368407 

- Detailed performance results:
   ntree     error dispersion
1     50 0.3503114 0.02859054
2     80 0.3598535 0.03128121
3    110 0.3502198 0.03206765
4    140 0.3377381 0.04350117
5    170 0.3598260 0.04217218
6    200 0.3560073 0.03099303
7    230 0.3397070 0.02940900
8    260 0.3396886 0.03078244
9    290 0.3416209 0.03138413
10   320 0.3396978 0.02162423
11   350 0.3368407 0.02828038
12   380 0.3435348 0.04252303
13   410 0.3492857 0.02984729
14   440 0.3387546 0.03370955
15   470 0.3464286 0.03437398
16   500 0.3397070 0.04557505
17   530 0.3464011 0.03120883
18   560 0.3464286 0.02884950
19   590 0.3464103 0.03229233


In [25]:
# Save the tuning plot as a 960x960 PNG at 144 dpi, then close the device.
png(
    filename = "RandomForest-ntree.png",
    width = 960,
    height = 960,
    res = 144
)
plot(tune)
dev.off()

In [26]:
# Fit one forest on the full data set, with the same mtry/ntree as CV().
fit <- randomForest(
    flag ~ .,
    data = df[, c(valuables, "flag")],
    mtry = 3,
    ntree = 500
)

In [42]:
# Pair each importance score with its feature name and list them from most to
# least important. This relies on the rows of fit$importance following the
# order of `valuables`, which is the column order the forest was trained on.
imps <- data.frame(
    importance = as.vector(fit$importance),
    names = valuables
)
imps[order(imps$importance, decreasing = TRUE), ]

Unnamed: 0,importance,names
20,34.671525,DiffAvgSpeed
17,32.854625,TimeHeadway
9,32.735325,DistSignal
3,30.99597,Speed
14,30.450949,AccelerationSpeed
10,30.408327,Pitch
7,29.300301,RiskFactor
6,29.277197,MaxSpeed
15,28.865474,Engine
16,28.012838,SteeringAngle
