In [1]:
library(grid)
library(rpart)
library(rpart.plot)
library(partykit)
library(lattice)
library(ggplot2)
library(caret)
library(dplyr)
library(randomForest)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:dplyr’:

    combine

The following object is masked from ‘package:ggplot2’:

    margin



In [2]:
set.seed(1)

In [3]:
jRoadType <- c("都市間高速", "都市高速", "有料道路", "国道", "県道", "主要地方道", "一般道1、一般道2、一般道3", "その他")
jHighSpeeds <- c("都市間高速", "都市高速", "有料道路")

In [4]:
valuables <- c("RoadType","CurveAverage","Speed", "Curve100", "Curve150","MaxSpeed","RiskFactor","Curve","DistSignal","Pitch","AheadDistance","AverageVelocity","TimeToCollision","AccelerationSpeed", "Engine", "SteeringAngle", "TimeHeadway", "Jerk", "LaneCount", "DiffAvgSpeed", "EmptinessOfRoad", "RoadFactor")

In [5]:
printf <- function(...) cat(sprintf(...))

In [6]:
CV <- function(dfx, verbose=FALSE) {        
    folds <- createFolds(dfx$flag, k=5)
    
    count <- 1
    errs <- c()
    reds <- c()
    blues <- c()
    shorts <- c()
    for (ids in folds) {
        train <- dfx[-ids, c(valuables, "flag")]
        test <- dfx[ids, ]
        fit <- randomForest(flag ~ ., data=train, mtry=3, ntree=300) 
        p <- predict(fit, newdata=test)
        
        if (verbose) {
            printf("Fold%d\n", count)
            result <- correctVsPredict(test, p, verbose)
            shorts <- c(shorts, shortErr(test, p))
            reds <- c(reds, result[1])
            blues <- c(blues, result[2])
            count <- count + 1
            printf("\n")
        } else {
            result <- correctVsPredict(test, p, verbose)
            reds <- c(reds, result[1])
            blues <- c(blues, result[2])
        }
        
        nerr <- sum(p != test$flag)
        errs <- c(errs, nerr / nrow(test))
    }
    
    return(c(mean(errs), mean(shorts), mean(reds), mean(blues)))
}

In [7]:
shortErr <- function(test, predictedFlags) {
    predictedRedRowsInTrue <- nrow(test[test$flag == "Red" & test$AccelOrBrake == "Accel" & predictedFlags == "Red", ])
    nRedAccel <- nrow(test[test$flag == "Red" & test$AccelOrBrake == "Accel", ])
    printf("Short: predictedRedAInTrueA/trueRedA = %d/%d = %f\n", predictedRedRowsInTrue, nRedAccel, predictedRedRowsInTrue / nRedAccel)
    # print(predictedRedRowsInTrue %>% group_by(flag, AccelOrBrake) %>% summarize(count=n()))
    predictedRedRowsInTrue / nRedAccel
}

In [8]:
correctVsPredict <- function(test, predictedFlags, verbose=FALSE) {    
    # for All
    # nTests <- nrow(test)
    # nCorrectAll <- sum((predictedFlags == test$flag) == TRUE)
    # printf("As fo All: correct/all = %d/%d = %f\n", nCorrectAll, nTests, nCorrectAll / nTests)
    
    # for Red
    predictedRedRows <- test[predictedFlags == "Red", ]
    nCorrectReds <- sum((predictedRedRows$flag == 'Red') == TRUE)
    nPredictedReds <- nrow(predictedRedRows)
    
    # for Blue
    predictedBlueRows <- test[predictedFlags == "Blue", ]
    nCorrectBlues <- sum((predictedBlueRows$flag == 'Blue') == TRUE)
    nPredictedBlues <- nrow(predictedBlueRows)
        
    if (verbose) {
        printf("As for Red: correct/predict = %d/%d = %f\n", nCorrectReds, nPredictedReds, nCorrectReds / nPredictedReds)
        printf("As for Blue: correct/predict = %d/%d = %f\n", nCorrectBlues, nPredictedBlues, nCorrectBlues / nPredictedBlues)  
    }    
    
    c(nCorrectReds/nPredictedReds, nCorrectBlues/nPredictedBlues)
}

In [9]:
printRedRatios <- function(dfx) {
    nRed <- nrow(dfx[dfx$flag == "Red", ])
    nAll <- nrow(dfx)
    printf("Red/All = %d/%d = %f\n", nRed, nAll, nRed/nAll)
    printf("1 - Red/All = %d/%d = %f\n", nAll - nRed, nAll, 1 - nRed/nAll)
}

In [10]:
createFormula <- function(invalids, allFeatures) {
    isValidColumn <- function(x) ! x %in% invalids
    
    features <- Filter(isValidColumn, allFeatures)
    return(paste("flag ~ ", paste(features, collapse=" + ")))
}

# Predict Reds

In [11]:
df3 <- read.csv("../data/middle/sp5.csv", stringsAsFactors=FALSE)

In [12]:
df3$AccelOrBrake[df3$flag == "RedA"] <- "Accel"
df3$AccelOrBrake[df3$flag == "RedB"] <- "Brake"
df3$AccelOrBrake[df3$flag == "BlueA"] <- "Accel"
df3$AccelOrBrake[df3$flag == "BlueB"] <- "Brake"

In [13]:
df3$flag[df3$flag == "RedA"] <- "Red"
df3$flag[df3$flag == "RedB"] <- "Red"
df3$flag[df3$flag == "BlueA"] <- "Blue"
df3$flag[df3$flag == "BlueB"] <- "Blue"
df3$flag <- as.factor(df3$flag)

In [14]:
allFeatures <- c(colnames(df3))

In [15]:
roadTypes <- unique(df3$RoadType)

In [16]:
df <- df3[, c(valuables, "AccelOrBrake", "flag")]

In [17]:
for (i in roadTypes) {
    printf("RoadType: %d (%s)\n", i, jRoadType[i+1])
    dfx <- df[df$RoadType == i, ]
    result <- CV(dfx, verbose=TRUE)
    printRedRatios(dfx)
    printf("Red: Mean correct/predict = %f\n", result[3])
    printf("Blue: Mean correct/predict = %f\n", result[4])
    printf("Short: Mean accel/predictRed = %f\n", result[2]) 
    printf("CV value: %f", result[1])
    printf("\n\n")
}

RoadType: 7 (その他)
Fold1
As for Red: correct/predict = 5/5 = 1.000000
As for Blue: correct/predict = 0/0 = NaN
Short: predictedRedAInTrueA/trueRedA = 1/1 = 1.000000

Fold2
As for Red: correct/predict = 5/6 = 0.833333
As for Blue: correct/predict = 0/0 = NaN
Short: predictedRedAInTrueA/trueRedA = 0/0 = NaN

Fold3
As for Red: correct/predict = 5/5 = 1.000000
As for Blue: correct/predict = 0/0 = NaN
Short: predictedRedAInTrueA/trueRedA = 1/1 = 1.000000

Fold4
As for Red: correct/predict = 5/5 = 1.000000
As for Blue: correct/predict = 0/0 = NaN
Short: predictedRedAInTrueA/trueRedA = 2/2 = 1.000000

Fold5
As for Red: correct/predict = 5/6 = 0.833333
As for Blue: correct/predict = 0/0 = NaN
Short: predictedRedAInTrueA/trueRedA = 1/1 = 1.000000

Red/All = 25/27 = 0.925926
1 - Red/All = 2/27 = 0.074074
Red: Mean correct/predict = 0.933333
Blue: Mean correct/predict = NaN
Short: Mean accel/predictRed = NaN
CV value: 0.066667

RoadType: 6 (一般道1、一般道2、一般道3)
Fold1
As for Red: correct/predict = 11/18

# Divide More ...

In [18]:
df %>% group_by(RoadType) %>% summarize(counts=n())

RoadType,counts
0,81
2,47
3,324
4,71
5,303
6,189
7,27


# Threshold = 10, 30

In [19]:
set.seed(1)

In [20]:
# Not Use
dfx <- df[df$RoadType == i, ]
folds <- createFolds(dfx$flag, k=5)

In [21]:
verbose = TRUE

In [22]:
i = 6
dfx <- df[df$RoadType == i, ]
folds <- createFolds(dfx$flag, k=5)

In [23]:
i = 4
dfx <- df[df$RoadType == i, ]
folds <- createFolds(dfx$flag, k=5)

In [24]:
i = 5
threshold1 <- 10
threshold2 <- 30
printf("RoadType: %d (%s)\n", i, jRoadType[i+1])
dfx <- df[df$RoadType == i, ]
dfx1 <- dfx %>% filter(AverageVelocity <= threshold1)
dfx2 <- dfx %>% filter(AverageVelocity > threshold1, AverageVelocity <= threshold2)
dfx3 <- dfx %>% filter(AverageVelocity > threshold2)

result1 <- CV(dfx1, verbose=verbose)        
printf("AverageVelocity <= %d:\n", threshold1)
printRedRatios(dfx1)        
printf("Red: Mean correct/predict = %f\n", result[3])
printf("Blue: Mean correct/predict = %f\n", result[4])
printf("Short: Mean accel/predictRed = %f\n", result[2]) 
printf("CV value: %f", result1[1])        
printf("\n\n")
        
result2 <- CV(dfx2, verbose=verbose)
printf("%d < AverageVelocity <= %d:\n", threshold1, threshold2)
printRedRatios(dfx2)
printf("Red: Mean correct/predict = %f\n", result[3])
printf("Blue: Mean correct/predict = %f\n", result[4])
printf("Short: Mean accel/predictRed = %f\n", result[2]) 
printf("CV value: %f", result2[1])       
printf("\n\n")

result3 <- CV(dfx3, verbose=verbose)        
printf("AverageVelocity > %d:\n", threshold2)
printRedRatios(dfx3)
printf("Red: Mean correct/predict = %f\n", result[3])
printf("Blue: Mean correct/predict = %f\n", result[4])
printf("Short: Mean accel/predictRed = %f\n", result[2]) 
printf("CV value: %f", result2[1])       
printf("\n\n")

RoadType: 5 (主要地方道)
Fold1
As for Red: correct/predict = 4/7 = 0.571429
As for Blue: correct/predict = 4/7 = 0.571429
Short: predictedRedAInTrueA/trueRedA = 2/4 = 0.500000

Fold2
As for Red: correct/predict = 4/7 = 0.571429
As for Blue: correct/predict = 5/8 = 0.625000
Short: predictedRedAInTrueA/trueRedA = 2/3 = 0.666667

Fold3
As for Red: correct/predict = 2/3 = 0.666667
As for Blue: correct/predict = 6/10 = 0.600000
Short: predictedRedAInTrueA/trueRedA = 1/3 = 0.333333

Fold4
As for Red: correct/predict = 1/3 = 0.333333
As for Blue: correct/predict = 5/11 = 0.454545
Short: predictedRedAInTrueA/trueRedA = 0/2 = 0.000000

Fold5
As for Red: correct/predict = 4/5 = 0.800000
As for Blue: correct/predict = 6/8 = 0.750000
Short: predictedRedAInTrueA/trueRedA = 2/3 = 0.666667

AverageVelocity <= 10:
Red/All = 33/69 = 0.478261
1 - Red/All = 36/69 = 0.521739
Red: Mean correct/predict = 0.724762
Blue: Mean correct/predict = 0.690000
Short: Mean accel/predictRed = 0.716667
CV value: 0.403077

Fo

In [25]:
i = 3
threshold1 <- 10
threshold2 <- 30
printf("RoadType: %d (%s)\n", i, jRoadType[i+1])
dfx <- df[df$RoadType == i, ]
dfx1 <- dfx %>% filter(AverageVelocity <= threshold1)
dfx2 <- dfx %>% filter(AverageVelocity > threshold1, AverageVelocity <= threshold2)
dfx3 <- dfx %>% filter(AverageVelocity > threshold2)

result1 <- CV(dfx1, verbose=verbose)        
printf("AverageVelocity <= %d:\n", threshold1)
printRedRatios(dfx1)        
printf("Red: Mean correct/predict = %f\n", result[3])
printf("Blue: Mean correct/predict = %f\n", result[4])
printf("Short: Mean accel/predictRed = %f\n", result[2]) 
printf("CV value: %f", result1[1])        
printf("\n\n")
        
result2 <- CV(dfx2, verbose=verbose)
printf("%d < AverageVelocity <= %d:\n", threshold1, threshold2)
printRedRatios(dfx2)
printf("Red: Mean correct/predict = %f\n", result[3])
printf("Blue: Mean correct/predict = %f\n", result[4])
printf("Short: Mean accel/predictRed = %f\n", result[2]) 
printf("CV value: %f", result2[1])       
printf("\n\n")

result3 <- CV(dfx3, verbose=verbose)        
printf("AverageVelocity > %d:\n", threshold2)
printRedRatios(dfx3)
printf("Red: Mean correct/predict = %f\n", result[3])
printf("Blue: Mean correct/predict = %f\n", result[4])
printf("Short: Mean accel/predictRed = %f\n", result[2]) 
printf("CV value: %f", result2[1])       
printf("\n\n")

RoadType: 3 (国道)
Fold1
As for Red: correct/predict = 5/6 = 0.833333
As for Blue: correct/predict = 2/3 = 0.666667
Short: predictedRedAInTrueA/trueRedA = 4/5 = 0.800000

Fold2
As for Red: correct/predict = 7/10 = 0.700000
As for Blue: correct/predict = 0/0 = NaN
Short: predictedRedAInTrueA/trueRedA = 6/6 = 1.000000

Fold3
As for Red: correct/predict = 4/5 = 0.800000
As for Blue: correct/predict = 3/6 = 0.500000
Short: predictedRedAInTrueA/trueRedA = 4/6 = 0.666667

Fold4
As for Red: correct/predict = 6/10 = 0.600000
As for Blue: correct/predict = 0/1 = 0.000000
Short: predictedRedAInTrueA/trueRedA = 6/7 = 0.857143

Fold5
As for Red: correct/predict = 7/10 = 0.700000
As for Blue: correct/predict = 1/1 = 1.000000
Short: predictedRedAInTrueA/trueRedA = 6/6 = 1.000000

AverageVelocity <= 10:
Red/All = 34/52 = 0.653846
1 - Red/All = 18/52 = 0.346154
Red: Mean correct/predict = 0.724762
Blue: Mean correct/predict = 0.690000
Short: Mean accel/predictRed = 0.716667
CV value: 0.322626

Fold1
As 

# Feature Importance

In [32]:
for (i in c(0, 2, 3, 4, 5, 6, 7)) {
    dfx <- df[df$RoadType == i, ]
    train <- dfx[, c(valuables, "flag")]
    fit <- randomForest(flag ~ ., data=train, mtry=3, ntree=300) 

    imps <- data.frame(importance=c(fit$importance), names=valuables)
    printf("RoadType: %d\n", i)
    print(imps[order(imps$importance, decreasing=TRUE), ])
    printf("\n")
}

RoadType: 0
   importance             names
7   2.7992790        RiskFactor
14  2.6774875 AccelerationSpeed
20  2.3904664      DiffAvgSpeed
15  2.3861884            Engine
10  2.3778746             Pitch
17  2.3359975       TimeHeadway
16  2.3013105     SteeringAngle
3   2.1879790             Speed
11  2.1491667     AheadDistance
12  1.9938741   AverageVelocity
18  1.8834201              Jerk
13  1.7259675   TimeToCollision
6   1.3327716          MaxSpeed
9   1.0750342        DistSignal
21  0.9639663   EmptinessOfRoad
4   0.8433305          Curve100
5   0.8083668          Curve150
2   0.7796089      CurveAverage
22  0.5913020        RoadFactor
8   0.5690892             Curve
19  0.2966602         LaneCount
1   0.0000000          RoadType

RoadType: 2
   importance             names
9   3.1521290        DistSignal
3   1.9499192             Speed
10  1.7332526             Pitch
20  1.4833191      DiffAvgSpeed
15  1.1529746            Engine
18  1.0802113              Jerk
12  1.0325269  