In [1]:
library(ggplot2)
library(caret)
library(dplyr)
library(randomForest)

Loading required package: lattice

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:dplyr’:

    combine

The following object is masked from ‘package:ggplot2’:

    margin



In [2]:
# Fix the RNG seed so that the random operations run after this point are reproducible.
set.seed(1)

In [3]:
# Japanese display names for the road types.
# The loops in this notebook look a name up with jRoadType[RoadType + 1],
# i.e. RoadType code 0 maps to the first entry.
jRoadType <- c(
    "都市間高速",
    "都市高速",
    "有料道路",
    "国道",
    "県道",
    "主要地方道",
    "一般道1、一般道2、一般道3",
    "その他"
)

# Display names of the road types treated as high-speed roads.
jHighSpeeds <- c(
    "都市間高速",
    "都市高速",
    "有料道路"
)

In [4]:
# Names of the feature columns used as predictors.
valuables <- c(
    "RoadType", "CurveAverage", "Speed", "Curve100", "Curve150",
    "MaxSpeed", "RiskFactor", "Curve", "DistSignal", "Pitch",
    "AheadDistance", "AverageVelocity", "TimeToCollision",
    "AccelerationSpeed", "Engine", "SteeringAngle", "TimeHeadway",
    "Jerk", "LaneCount", "DiffAvgSpeed", "EmptinessOfRoad", "RoadFactor"
)

In [5]:
# C-style printf: format the arguments with sprintf() and write the result
# to standard output with cat(). No newline is appended automatically.
printf <- function(...) {
    cat(sprintf(...))
}

In [6]:
# Compute cross-validated metrics with a 5-fold split.
#
# dfx:     data to use; must contain the columns named in the global
#          `valuables`, the `flag` column, and whatever columns
#          shortErr() / correctVsPredict() read from the test rows.
# verbose: whether to print the per-fold log.
#
# Returns a numeric vector of length 4:
#   [1] mean misclassification rate over the folds (the "CV value")
#   [2] mean of shortErr() over the folds (short-term average)
#   [3] mean correct/predicted ratio for Red   (long-term average)
#   [4] mean correct/predicted ratio for Blue  (long-term average)
# Folds in which a ratio is undefined (0/0, e.g. a class that was never
# predicted) are left out of that ratio's mean instead of turning the
# whole mean into NaN. If the ratio is undefined in every fold the
# result is still NaN.
CV <- function(dfx, verbose=FALSE) {
    # Build the k = 5 folds (each element holds the test row indices)
    folds <- createFolds(dfx$flag, k=5)
    nFolds <- length(folds)

    # Per-fold results, preallocated
    errs <- numeric(nFolds)      # misclassification rate
    reds <- numeric(nFolds)      # predicted Red that really were Red
    blues <- numeric(nFolds)     # predicted Blue that really were Blue
    shorts <- numeric(nFolds)    # short-term ratio

    for (k in seq_along(folds)) {
        ids <- folds[[k]]
        # Train only on the predictor columns plus the label (this drops AccelOrBrake)
        train <- dfx[-ids, c(valuables, "flag")]
        test <- dfx[ids, ]
        fit <- randomForest(flag ~ ., data=train, mtry=3, ntree=300)   # train the Random Forest
        p <- predict(fit, newdata=test)                                # predict the held-out fold

        if (verbose) {
            printf("Fold%d\n", k)
        }

        # The helpers print their own detail lines when verbose is TRUE
        result <- correctVsPredict(test, p, verbose)
        shorts[k] <- shortErr(test, p, verbose)
        reds[k] <- result[1]
        blues[k] <- result[2]

        if (verbose) {
            printf("\n")
        }

        # Misclassification rate of this fold
        errs[k] <- sum(p != test$flag) / nrow(test)
    }

    c(
        mean(errs),
        mean(shorts, na.rm=TRUE),
        mean(reds, na.rm=TRUE),
        mean(blues, na.rm=TRUE)
    )
}

In [7]:
# Compute the short-term ratio for one test set: among the test rows that
# are truly Red and labelled Accel, the fraction that was predicted Red.
#
# test:           data frame with the columns `flag` and `AccelOrBrake`
# predictedFlags: predicted labels, one per row of `test`
# verbose:        whether to print the counts and the ratio
#
# Returns the ratio (NaN when there is no Red/Accel row in `test`).
shortErr <- function(test, predictedFlags, verbose=FALSE) {
    # Test rows that are Red and Accel
    isRedAccel <- test$flag == "Red" & test$AccelOrBrake == "Accel"
    # ... of which the prediction was Red as well
    isPredictedRed <- isRedAccel & predictedFlags == "Red"

    nRedAccel <- nrow(test[isRedAccel, ])
    predictedRedRowsInTrue <- nrow(test[isPredictedRed, ])
    ratio <- predictedRedRowsInTrue/nRedAccel

    if (verbose) {
        printf("Short: predictedRedAInTrueA/trueRedA = %d/%d = %f\n", predictedRedRowsInTrue, nRedAccel, ratio)
    }

    ratio
}

In [8]:
# For each class (Red, Blue), compute correct/predicted: of the test rows
# predicted as that class, the fraction whose true `flag` is that class.
#
# test:           data frame with a `flag` column holding the true labels
# predictedFlags: predicted labels, one per row of `test`
# verbose:        whether to print the counts and ratios
#
# Returns c(ratio for Red, ratio for Blue); a ratio is NaN when the class
# was never predicted.
correctVsPredict <- function(test, predictedFlags, verbose=FALSE) {
    # Count the rows predicted as `label` and how many of them truly are `label`
    tally <- function(label) {
        predictedRows <- test[predictedFlags == label, ]
        list(
            correct=sum(predictedRows$flag == label),
            predicted=nrow(predictedRows)
        )
    }

    red <- tally("Red")
    blue <- tally("Blue")
    redRatio <- red$correct/red$predicted
    blueRatio <- blue$correct/blue$predicted

    if (verbose) {
        printf("As for Red: correct/predict = %d/%d = %f\n", red$correct, red$predicted, redRatio)
        printf("As for Blue: correct/predict = %d/%d = %f\n", blue$correct, blue$predicted, blueRatio)
    }

    c(redRatio, blueRatio)
}

In [9]:
# Print the share of Red rows in `dfx` and its complement.
#
# dfx: data frame with a `flag` column
printRedRatios <- function(dfx) {
    isRed <- dfx$flag == "Red"
    nRed <- nrow(dfx[isRed, ])
    nAll <- nrow(dfx)
    redShare <- nRed/nAll

    printf("Red/All = %d/%d = %f\n", nRed, nAll, redShare)
    printf("1 - Red/All = %d/%d = %f\n", nAll - nRed, nAll, 1 - redShare)
}

# Predict Reds

In [10]:
# Load the input CSV; text columns are kept as character vectors rather than factors.
df <- read.csv("../data/middle/sp5.csv", stringsAsFactors=FALSE)

In [11]:
# Derive the AccelOrBrake column from the raw flag value:
# "RedA"/"BlueA" rows are labelled "Accel", "RedB"/"BlueB" rows are labelled "Brake".
# Rows whose flag is none of these four values are not assigned here.
df$AccelOrBrake[df$flag == "RedA"] <- "Accel"
df$AccelOrBrake[df$flag == "RedB"] <- "Brake"
df$AccelOrBrake[df$flag == "BlueA"] <- "Accel"
df$AccelOrBrake[df$flag == "BlueB"] <- "Brake"

In [12]:
# Collapse the four raw flag values into the two classes "Red" and "Blue",
# then turn the column into a factor so it can be used as a class label.
df$flag[df$flag %in% c("RedA", "RedB")] <- "Red"
df$flag[df$flag %in% c("BlueA", "BlueB")] <- "Blue"
df$flag <- as.factor(df$flag)

In [13]:
# Remember every column name present before the data frame is narrowed down.
allFeatures <- names(df)

In [14]:
# Distinct RoadType codes, in order of first appearance in the data.
roadTypes <- unique(df[["RoadType"]])

In [15]:
# Keep only the columns that are used: the predictors, AccelOrBrake and the label.
df <- df[c(valuables, "AccelOrBrake", "flag")]

In [16]:
# Per-fold logging is switched off for the runs below.
verbose <- FALSE

In [17]:
# Run the cross-validation separately for every road type found in the data
# and print the class balance together with the averaged metrics.
for (i in roadTypes) {
    dfx <- df[df$RoadType == i, ]

    printf("RoadType: %d (%s)\n", i, jRoadType[i + 1])
    result <- CV(dfx, verbose = verbose)

    # Class balance first, then the metrics returned by CV():
    # [1] CV value, [2] short-term mean, [3] Red mean, [4] Blue mean
    printRedRatios(dfx)
    printf("Red: Mean correct/predict = %f\n", result[3])
    printf("Blue: Mean correct/predict = %f\n", result[4])
    printf("Short: Mean accel/predictRed = %f\n", result[2])
    printf("CV value: %f", result[1])
    printf("\n\n")
}

RoadType: 7 (その他)
Red/All = 25/27 = 0.925926
1 - Red/All = 2/27 = 0.074074
Red: Mean correct/predict = 0.933333
Blue: Mean correct/predict = NaN
Short: Mean accel/predictRed = NaN
CV value: 0.066667

RoadType: 6 (一般道1、一般道2、一般道3)
Red/All = 90/189 = 0.476190
1 - Red/All = 99/189 = 0.523810
Red: Mean correct/predict = 0.674652
Blue: Mean correct/predict = 0.655475
Short: Mean accel/predictRed = 0.591935
CV value: 0.338834

RoadType: 4 (県道)
Red/All = 36/71 = 0.507042
1 - Red/All = 35/71 = 0.492958
Red: Mean correct/predict = 0.532540
Blue: Mean correct/predict = 0.491429
Short: Mean accel/predictRed = 0.680000
CV value: 0.492381

RoadType: 5 (主要地方道)
Red/All = 154/303 = 0.508251
1 - Red/All = 149/303 = 0.491749
Red: Mean correct/predict = 0.618821
Blue: Mean correct/predict = 0.635907
Short: Mean accel/predictRed = 0.693095
CV value: 0.376448

RoadType: 3 (国道)
Red/All = 202/324 = 0.623457
1 - Red/All = 122/324 = 0.376543
Red: Mean correct/predict = 0.673860
Blue: Mean correct/predict = 0.56

# Divide More ...

In [18]:
# Count the rows for each RoadType
print(
    df %>%
        group_by(RoadType) %>%
        summarize(counts = n())
)

# A tibble: 7 × 2
  RoadType counts
     <int>  <int>
1        0     81
2        2     47
3        3    324
4        4     71
5        5    303
6        6    189
7        7     27


# Threshold = 10, 30

In [19]:
# Reset the RNG seed so the threshold experiments below start from a known state.
set.seed(1)

In [21]:
# Per-fold logging is switched on for the runs below.
verbose <- TRUE

In [22]:
# Cross-validate one road type separately for three AverageVelocity bands
# (<= threshold1, (threshold1, threshold2], > threshold2).
i <- 5
threshold1 <- 10
threshold2 <- 30
printf("RoadType: %d (%s)\n", i, jRoadType[i+1])
dfx <- df[df$RoadType == i, ]
dfx1 <- dfx %>% filter(AverageVelocity <= threshold1)
dfx2 <- dfx %>% filter(AverageVelocity > threshold1, AverageVelocity <= threshold2)
dfx3 <- dfx %>% filter(AverageVelocity > threshold2)

# One heading per band, so each summary is labelled with the range it
# actually covers (previously all three were printed as "<= threshold1").
bandLabels <- c(
    sprintf("AverageVelocity <= %g", threshold1),
    sprintf("%g < AverageVelocity <= %g", threshold1, threshold2),
    sprintf("AverageVelocity > %g", threshold2)
)

# `band` (not `i`) is the loop index, so the road type held in `i` is not overwritten.
for (band in seq_along(bandLabels)) {
    # The summary is printed after CV(), i.e. below the per-fold log when verbose is TRUE
    result <- CV(switch(band, dfx1, dfx2, dfx3), verbose=verbose)

    printf("%s:\n", bandLabels[band])
    printf("Red: Mean correct/predict = %f\n", result[3])
    printf("Blue: Mean correct/predict = %f\n", result[4])
    printf("Short: Mean accel/predictRed = %f\n", result[2])
    printf("CV value: %f", result[1])
    printf("\n\n")
}

RoadType: 5 (主要地方道)
Fold1
As for Red: correct/predict = 5/9 = 0.555556
As for Blue: correct/predict = 3/5 = 0.600000
Short: predictedRedAInTrueA/trueRedA = 2/4 = 0.500000

Fold2
As for Red: correct/predict = 4/5 = 0.800000
As for Blue: correct/predict = 6/8 = 0.750000
Short: predictedRedAInTrueA/trueRedA = 1/3 = 0.333333

Fold3
As for Red: correct/predict = 2/4 = 0.500000
As for Blue: correct/predict = 5/9 = 0.555556
Short: predictedRedAInTrueA/trueRedA = 1/1 = 1.000000

Fold4
As for Red: correct/predict = 2/4 = 0.500000
As for Blue: correct/predict = 6/11 = 0.545455
Short: predictedRedAInTrueA/trueRedA = 0/4 = 0.000000

Fold5
As for Red: correct/predict = 5/8 = 0.625000
As for Blue: correct/predict = 4/6 = 0.666667
Short: predictedRedAInTrueA/trueRedA = 2/3 = 0.666667

AverageVelocity <= 10:
Red: Mean correct/predict = 0.596111
Blue: Mean correct/predict = 0.623535
Short: Mean accel/predictRed = 0.500000
CV value: 0.388938

Fold1
As for Red: correct/predict = 13/19 = 0.684211
As for B

In [23]:
# Cross-validate one road type separately for three AverageVelocity bands
# (<= threshold1, (threshold1, threshold2], > threshold2).
i <- 3
threshold1 <- 10
threshold2 <- 30
printf("RoadType: %d (%s)\n", i, jRoadType[i+1])
dfx <- df[df$RoadType == i, ]
dfx1 <- dfx %>% filter(AverageVelocity <= threshold1)
dfx2 <- dfx %>% filter(AverageVelocity > threshold1, AverageVelocity <= threshold2)
dfx3 <- dfx %>% filter(AverageVelocity > threshold2)

# One heading per band, so each summary is labelled with the range it
# actually covers (previously all three were printed as "<= threshold1").
bandLabels <- c(
    sprintf("AverageVelocity <= %g", threshold1),
    sprintf("%g < AverageVelocity <= %g", threshold1, threshold2),
    sprintf("AverageVelocity > %g", threshold2)
)

# `band` (not `i`) is the loop index, so the road type held in `i` is not overwritten.
for (band in seq_along(bandLabels)) {
    # The summary is printed after CV(), i.e. below the per-fold log when verbose is TRUE
    result <- CV(switch(band, dfx1, dfx2, dfx3), verbose=verbose)

    printf("%s:\n", bandLabels[band])
    printf("Red: Mean correct/predict = %f\n", result[3])
    printf("Blue: Mean correct/predict = %f\n", result[4])
    printf("Short: Mean accel/predictRed = %f\n", result[2])
    printf("CV value: %f", result[1])
    printf("\n\n")
}

RoadType: 3 (国道)
Fold1
As for Red: correct/predict = 6/10 = 0.600000
As for Blue: correct/predict = 0/1 = 0.000000
Short: predictedRedAInTrueA/trueRedA = 4/5 = 0.800000

Fold2
As for Red: correct/predict = 6/7 = 0.857143
As for Blue: correct/predict = 2/2 = 1.000000
Short: predictedRedAInTrueA/trueRedA = 6/6 = 1.000000

Fold3
As for Red: correct/predict = 4/4 = 1.000000
As for Blue: correct/predict = 3/6 = 0.500000
Short: predictedRedAInTrueA/trueRedA = 4/7 = 0.571429

Fold4
As for Red: correct/predict = 7/10 = 0.700000
As for Blue: correct/predict = 1/1 = 1.000000
Short: predictedRedAInTrueA/trueRedA = 6/6 = 1.000000

Fold5
As for Red: correct/predict = 6/7 = 0.857143
As for Blue: correct/predict = 3/4 = 0.750000
Short: predictedRedAInTrueA/trueRedA = 5/6 = 0.833333

AverageVelocity <= 10:
Red: Mean correct/predict = 0.802857
Blue: Mean correct/predict = 0.650000
Short: Mean accel/predictRed = 0.840952
CV value: 0.264040

Fold1
As for Red: correct/predict = 15/22 = 0.681818
As for Blu

# Feature Importance

In [24]:
# Fit one Random Forest per road type on all of its rows and print the
# features sorted by decreasing importance.
for (i in c(0, 2, 3, 4, 5, 6, 7)) {
    dfx <- df[df$RoadType == i, ]
    train <- dfx[, c(valuables, "flag")]
    fit <- randomForest(flag ~ ., data = train, mtry = 3, ntree = 300)

    # fit$importance is flattened with c() and paired with `valuables` by position
    # NOTE(review): this relies on the importance rows following the column
    # order of `train` - confirm against the randomForest documentation.
    imps <- data.frame(importance = c(fit$importance), names = valuables)
    ranking <- order(imps$importance, decreasing = TRUE)

    printf("RoadType: %d\n", i)
    print(imps[ranking, ])
    printf("\n")
}

RoadType: 0
   importance             names
7   3.1691697        RiskFactor
14  2.7362158 AccelerationSpeed
17  2.5608716       TimeHeadway
20  2.4036585      DiffAvgSpeed
10  2.3910390             Pitch
16  2.2965735     SteeringAngle
3   2.1020384             Speed
18  2.0829468              Jerk
15  1.9998877            Engine
11  1.9938175     AheadDistance
12  1.9364477   AverageVelocity
13  1.7428196   TimeToCollision
6   1.3549366          MaxSpeed
21  1.0037285   EmptinessOfRoad
9   0.9614612        DistSignal
4   0.8298273          Curve100
5   0.8275719          Curve150
2   0.7671527      CurveAverage
8   0.7130743             Curve
22  0.5331850        RoadFactor
19  0.4799424         LaneCount
1   0.0000000          RoadType

RoadType: 2
   importance             names
9   2.7344496        DistSignal
10  1.8916143             Pitch
3   1.8906718             Speed
20  1.4181672      DiffAvgSpeed
15  1.1882412            Engine
12  1.1688655   AverageVelocity
14  0.9958207 A

# Feature Importance divided by RoadType & AverageVelocity

In [25]:
# Reset the RNG seed so the forests trained below start from a known state.
set.seed(1)

In [26]:
# AverageVelocity cut points: rows at or below threshold1, rows above threshold1
# up to threshold2, and rows above threshold2 are handled as separate groups.
threshold1 <- 10
threshold2 <- 30

In [28]:
# Feature importance per road type and per AverageVelocity band.
velocities <- c("slow", "middle", "fast")

# Keep only the predictors and the label, so that `flag ~ .` uses exactly the
# columns in `valuables`. Note that this overwrites the global `df`.
df <- df[, c(valuables, "flag")]

for (i in c(5, 3)) {
    printf("RoadType: %d (%s)\n", i, jRoadType[i + 1])
    dfx <- df[df$RoadType == i, ]

    # Split the road type's rows into the three velocity bands
    dfx1 <- dfx %>% filter(AverageVelocity <= threshold1)
    dfx2 <- dfx %>% filter(AverageVelocity > threshold1, AverageVelocity <= threshold2)
    dfx3 <- dfx %>% filter(AverageVelocity > threshold2)

    for (j in c(1, 2, 3)) {
        # Pick the j-th band: 1 = slow, 2 = middle, 3 = fast
        fit <- randomForest(flag ~ ., data = switch(j, dfx1, dfx2, dfx3), mtry = 3, ntree = 300)

        # Importance values are paired with `valuables` by position
        imps <- data.frame(importance = c(fit$importance), names = valuables)
        ranking <- order(imps$importance, decreasing = TRUE)

        printf("RoadType: %d, Velocity: %s\n", i, velocities[j])
        print(imps[ranking, ])
        printf("\n")
    }
}

RoadType: 5 (主要地方道)
RoadType: 5, Velocity: slow
   importance             names
9   2.9234737        DistSignal
17  2.8157682       TimeHeadway
16  2.2516111     SteeringAngle
12  2.0825782   AverageVelocity
7   2.0811564        RiskFactor
18  2.0367175              Jerk
15  1.9922644            Engine
6   1.9717569          MaxSpeed
14  1.8702329 AccelerationSpeed
2   1.8490868      CurveAverage
3   1.6975300             Speed
13  1.6920829   TimeToCollision
20  1.6146841      DiffAvgSpeed
10  1.5881088             Pitch
21  1.3448731   EmptinessOfRoad
11  1.2585747     AheadDistance
8   0.6699901             Curve
22  0.6575988        RoadFactor
19  0.5140442         LaneCount
5   0.4911440          Curve150
4   0.4180078          Curve100
1   0.0000000          RoadType

RoadType: 5, Velocity: middle
   importance             names
17   6.954017       TimeHeadway
7    6.374480        RiskFactor
3    6.301493             Speed
15   5.952699            Engine
9    5.718815        Dist

In [29]:
# For every road type, fit a Random Forest on all of its rows and list the
# features that the forest actually used.
set.seed(1)
for (i in c(0, 2, 3, 4, 5, 6, 7)) {
    dfx <- df[df$RoadType == i, ]
    train <- dfx[, c(valuables, "flag")]
    fit <- randomForest(flag ~ ., data = train, mtry = 3, ntree = 300)

    # The result of varUsed() is used to index `valuables`, i.e. it is
    # treated as positions of the predictors that were used in the forest
    usedFeatures <- valuables[varUsed(fit, by.tree = FALSE, count = FALSE)]

    printf("RoadType: %d\n", i)
    printf("Used features count: %d\n", length(usedFeatures))
    print(usedFeatures)
    printf("\n")
}

RoadType: 0
Used features count: 21
 [1] "CurveAverage"      "Speed"             "Curve100"         
 [4] "Curve150"          "MaxSpeed"          "RiskFactor"       
 [7] "Curve"             "DistSignal"        "Pitch"            
[10] "AheadDistance"     "AverageVelocity"   "TimeToCollision"  
[13] "AccelerationSpeed" "Engine"            "SteeringAngle"    
[16] "TimeHeadway"       "Jerk"              "LaneCount"        
[19] "DiffAvgSpeed"      "EmptinessOfRoad"   "RoadFactor"       

RoadType: 2
Used features count: 21
 [1] "CurveAverage"      "Speed"             "Curve100"         
 [4] "Curve150"          "MaxSpeed"          "RiskFactor"       
 [7] "Curve"             "DistSignal"        "Pitch"            
[10] "AheadDistance"     "AverageVelocity"   "TimeToCollision"  
[13] "AccelerationSpeed" "Engine"            "SteeringAngle"    
[16] "TimeHeadway"       "Jerk"              "LaneCount"        
[19] "DiffAvgSpeed"      "EmptinessOfRoad"   "RoadFactor"       

RoadType: 3
Used