# Apply Random Forest

In [1]:
library(ggplot2)
library(caret)
library(e1071)
library(dplyr)
library(randomForest)

Loading required package: lattice

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:dplyr’:

    combine

The following object is masked from ‘package:ggplot2’:

    margin



In [3]:
# Fix the RNG seed so that the random steps run afterwards
# (fold assignment, forest training) are repeatable.
set.seed(1)

In [4]:
# Names of the feature columns used as predictors (order matters: later
# cells pair this vector positionally with the fitted importances).
valuables <- c(
    "RoadType", "CurveAverage", "Speed", "Curve100", "Curve150",
    "MaxSpeed", "RiskFactor", "Curve", "DistSignal", "Pitch",
    "AheadDistance", "AverageVelocity", "TimeToCollision",
    "AccelerationSpeed", "Engine", "SteeringAngle", "TimeHeadway",
    "Jerk", "LaneCount", "DiffAvgSpeed", "EmptinessOfRoad", "RoadFactor"
)

In [5]:
# C-style printf: format the arguments with sprintf() and write the result
# to stdout. No newline is appended; callers put "\n" in the format string.
printf <- function(...) {
    formatted <- sprintf(...)
    cat(formatted)
}

In [6]:
# Run 5-fold cross-validation of a Random Forest classifier.
#
# Args:
#   dfx:     data frame containing the columns named in the global
#            `valuables`, the class label `flag`, and `AccelOrBrake`
#            (the latter is only read by shortErr on the held-out rows).
#   verbose: if TRUE, print a "FoldN" header followed by the per-fold
#            ratios reported by correctVsPredict and shortErr.
#
# Returns:
#   Numeric vector of four per-fold means, in this order:
#     [1] misclassification rate (the CV value)
#     [2] short-term ratio from shortErr
#     [3] long-term ratio for Red from correctVsPredict
#     [4] long-term ratio for Blue from correctVsPredict
CV <- function(dfx, verbose=FALSE) {
    # Build k = 5 folds; each element holds the row indices held out for testing
    folds <- createFolds(dfx$flag, k=5)
    nFolds <- length(folds)

    # Per-fold results, preallocated instead of grown inside the loop
    errs <- numeric(nFolds)     # misclassification rate
    reds <- numeric(nFolds)     # predicted Red that really was Red
    blues <- numeric(nFolds)    # predicted Blue that really was Blue
    shorts <- numeric(nFolds)   # short-term ratio

    for (i in seq_along(folds)) {
        ids <- folds[[i]]
        # Train only on the predictor columns plus the label (AccelOrBrake is left out)
        train <- dfx[-ids, c(valuables, "flag")]
        # The held-out rows keep every column because shortErr reads AccelOrBrake
        test <- dfx[ids, ]
        fit <- randomForest(flag ~ ., data=train, mtry=3, ntree=500)
        p <- predict(fit, newdata=test)

        if (verbose) {
            printf("Fold%d\n", i)
        }

        # Forward `verbose` so the helpers actually print when logging is requested
        result <- correctVsPredict(test, p, verbose)
        shorts[i] <- shortErr(test, p, verbose)
        reds[i] <- result[1]
        blues[i] <- result[2]

        if (verbose) {
            printf("\n")
        }

        errs[i] <- sum(p != test$flag) / nrow(test)
    }

    c(mean(errs), mean(shorts), mean(reds), mean(blues))
}

In [7]:
# Short-term ratio: among test rows whose true label is Red with
# AccelOrBrake == "Accel", the fraction that was predicted Red.
#
# Args:
#   test:           data frame with `flag` and `AccelOrBrake` columns.
#   predictedFlags: predicted labels, one per row of `test`.
#   verbose:        if TRUE, print the counts and the ratio.
#
# Returns:
#   The ratio as a single number; NaN when `test` has no Red/Accel row.
shortErr <- function(test, predictedFlags, verbose=FALSE) {
    # Rows that are truly Red and Accel
    isRedAccel <- test$flag == "Red" & test$AccelOrBrake == "Accel"

    # Count with sum(..., na.rm=TRUE): indexing a data frame with a logical
    # vector that contains NA would add phantom all-NA rows to the count.
    nRedAccel <- sum(isRedAccel, na.rm=TRUE)
    predictedRedRowsInTrue <- sum(isRedAccel & predictedFlags == "Red", na.rm=TRUE)

    if (verbose) {
      printf("Short: predictedRedAInTrueA/trueRedA = %d/%d = %f\n", predictedRedRowsInTrue, nRedAccel, predictedRedRowsInTrue/nRedAccel)
    }

    predictedRedRowsInTrue/nRedAccel
}

In [8]:
# Long-term ratios: for each class, the fraction of rows predicted as that
# class whose true label matches (correct / predicted).
#
# Args:
#   test:           data frame with a `flag` column holding the true labels.
#   predictedFlags: predicted labels, one per row of `test`.
#   verbose:        if TRUE, print the counts and ratios for both classes.
#
# Returns:
#   c(ratio for Red, ratio for Blue); an entry is NaN when that class was
#   never predicted.
correctVsPredict <- function(test, predictedFlags, verbose=FALSE) {
    # Count with sum(..., na.rm=TRUE): a missing prediction or label would
    # otherwise select phantom all-NA rows and turn the correct count into NA.

    # for Red
    isPredictedRed <- predictedFlags == "Red"
    nPredictedReds <- sum(isPredictedRed, na.rm=TRUE)
    nCorrectReds <- sum(isPredictedRed & test$flag == "Red", na.rm=TRUE)

    # for Blue
    isPredictedBlue <- predictedFlags == "Blue"
    nPredictedBlues <- sum(isPredictedBlue, na.rm=TRUE)
    nCorrectBlues <- sum(isPredictedBlue & test$flag == "Blue", na.rm=TRUE)

    if (verbose) {
        printf("As for Red: correct/predict = %d/%d = %f\n", nCorrectReds, nPredictedReds, nCorrectReds/nPredictedReds)
        printf("As for Blue: correct/predict = %d/%d = %f\n", nCorrectBlues, nPredictedBlues, nCorrectBlues/nPredictedBlues)
    }

    c(nCorrectReds/nPredictedReds, nCorrectBlues/nPredictedBlues)
}

In [9]:
# Print the share of rows labelled Red in `dfx`, and the complementary share.
#
# Args:
#   dfx: data frame with a `flag` column.
printRedRatios <- function(dfx) {
    total <- nrow(dfx)
    redRows <- dfx[dfx$flag == "Red", ]
    reds <- nrow(redRows)
    redShare <- reds / total
    printf("Red/All = %d/%d = %f\n", reds, total, redShare)
    printf("1 - Red/All = %d/%d = %f\n", total - reds, total, 1 - redShare)
}

# Predict Red

In [8]:
# Load the raw data. Strings stay character (not factor) so that the
# following cells can compare and overwrite `flag` values as plain strings
# before converting the column to a factor.
df3 <- read.csv("../data/middle/sp5.csv", stringsAsFactors=FALSE)

In [9]:
# Derive AccelOrBrake from the trailing A/B of the four-way flag.
# This has to run before `flag` is collapsed to Red/Blue.
df3$AccelOrBrake[df3$flag %in% c("RedA", "BlueA")] <- "Accel"
df3$AccelOrBrake[df3$flag %in% c("RedB", "BlueB")] <- "Brake"

In [10]:
# Collapse the four-way flag (RedA/RedB/BlueA/BlueB) to the two classes
# Red and Blue, then make it a factor for classification.
df3$flag[df3$flag %in% c("RedA", "RedB")] <- "Red"
df3$flag[df3$flag %in% c("BlueA", "BlueB")] <- "Blue"
df3$flag <- as.factor(df3$flag)

In [None]:
# Global logging switch. NOTE(review): no visible cell reads this variable;
# CV() below is called with an explicit verbose argument.
verbose <- FALSE

In [11]:
# Keep only the columns that are used: the predictors, AccelOrBrake
# (needed for the short-term ratio) and the class label.
df <- df3[
    ,
    c(valuables, "AccelOrBrake", "flag")
]

In [18]:
# Compute the CV value, the long-term ratios and the short-term ratio on the
# full data set. CV() returns c(CV value, short-term, Red, Blue), hence the
# indices used below. The final printf has no trailing newline.
result <- CV(df, verbose=TRUE)
printRedRatios(df)
printf("Red: Mean correct/predict = %f\n", result[3])
printf("Blue: Mean correct/predict = %f\n", result[4])
printf("Short: Mean predictedRedInTrueA/trueRedA = %f\n", result[2])
printf("CV value: %f", result[1])

Fold1
As for Red: correct/predict = 42/58 = 0.724138
As for Blue: correct/predict = 28/46 = 0.608696
Short: predictedRedInTrueA/trueRedA = 29/40 = 0.725000

Fold2
As for Red: correct/predict = 48/70 = 0.685714
As for Blue: correct/predict = 23/35 = 0.657143
Short: predictedRedInTrueA/trueRedA = 37/44 = 0.840909

Fold3
As for Red: correct/predict = 48/77 = 0.623377
As for Blue: correct/predict = 16/27 = 0.592593
Short: predictedRedInTrueA/trueRedA = 29/35 = 0.828571

Fold4
As for Red: correct/predict = 49/69 = 0.710145
As for Blue: correct/predict = 25/35 = 0.714286
Short: predictedRedInTrueA/trueRedA = 34/40 = 0.850000

Fold5
As for Red: correct/predict = 50/76 = 0.657895
As for Blue: correct/predict = 19/28 = 0.678571
Short: predictedRedInTrueA/trueRedA = 30/37 = 0.810811

Fold6
As for Red: correct/predict = 45/65 = 0.692308
As for Blue: correct/predict = 25/39 = 0.641026
Short: predictedRedInTrueA/trueRedA = 36/46 = 0.782609

Fold7
As for Red: correct/predict = 49/69 = 0.710145
As fo

In [19]:
# Reset the RNG seed so the tuning and importance runs below start from a
# known state.
set.seed(1)

In [None]:
# Hyperparameter tuning: try forest sizes 50, 80, ..., 590.
ntree.lim <- seq(from = 50, to = 600, by = 30)

# NOTE: the result is stored under the name `tune`, the same name as the
# e1071 function tune(); later cells expect this object to exist.
tune <- tune.randomForest(
    flag ~ .,
    data = df[, c(valuables, "flag")],
    ntree = ntree.lim,
    # evaluate every candidate with 10-fold cross-validation
    tunecontrol = tune.control(sampling = "cross", cross = 10)
)

In [21]:
# Display and save the results.
# Requires the tuning cell to have been run: otherwise `tune` still resolves
# to the e1071 function of the same name and summary() fails.
summary(tune)

ERROR: Error in object[[i]]:  'closure' 型のオブジェクトは部分代入可能ではありません 


In [22]:
# Save the tuning plot as a PNG file.
# The device is closed in `finally`, so a failing plot() call does not leave
# the png device open and redirect later plots into this file.
png("RandomForest-ntree.png", height=960, width=960, res=144)
invisible(tryCatch(plot(tune), finally = dev.off()))

ERROR: Error in model.frame(train.x):  引数 "train.x" がありませんし、省略時既定値もありません 


In [13]:
# Fit one Random Forest on all rows to obtain the feature importances.
fit <- randomForest(
    flag ~ .,
    data = df[, c(valuables, "flag")],
    mtry = 3,
    ntree = 500
)

In [14]:
# Build a data frame with one row per feature: its importance and its name.
# The importances are paired positionally with `valuables`, which is the
# column order the model was fitted on.
imps <- data.frame(
    importance = as.vector(fit$importance),
    names = valuables
)
# Print the features from most to least important.
print(imps[order(imps$importance, decreasing = TRUE), ])

   importance             names
20  35.080792      DiffAvgSpeed
17  32.997685       TimeHeadway
3   31.759536             Speed
9   31.143696        DistSignal
14  30.588316 AccelerationSpeed
10  30.293228             Pitch
6   29.735280          MaxSpeed
12  28.714308   AverageVelocity
15  28.679538            Engine
7   28.436219        RiskFactor
16  26.934218     SteeringAngle
18  25.806578              Jerk
11  24.557217     AheadDistance
2   20.239817      CurveAverage
21  20.115719   EmptinessOfRoad
13  17.654443   TimeToCollision
4   13.918153          Curve100
5   13.396295          Curve150
8   12.536519             Curve
22  10.753858        RoadFactor
1    9.290083          RoadType
19   7.368673         LaneCount
