In [1]:
library(grid)
library(rpart)
library(rpart.plot)
library(partykit)
library(lattice)
library(ggplot2)
library(caret)

In [2]:
# Fix the RNG seed so that randomised steps later in the notebook
# (such as the fold split made inside CV) are repeatable between runs.
set.seed(1)

In [3]:
# Names of the feature columns used as predictors.
# (The label column "flag" is appended separately where the data is subset.)
valuables <- c(
    "RoadType", "CurveAverage", "Speed", "Curve100", "Curve150",
    "MaxSpeed", "RiskFactor", "Curve", "DistSignal", "Pitch",
    "AheadDistance", "AverageVelocity", "TimeToCollision",
    "AccelerationSpeed", "Engine", "SteeringAngle", "TimeHeadway",
    "Jerk", "LaneCount", "DiffAvgSpeed", "EmptinessOfRoad", "RoadFactor"
)

In [4]:
# C-style printf: format the arguments with sprintf() and write the
# resulting text to standard output with cat() (no newline is appended).
printf <- function(...) {
    formatted <- sprintf(...)
    cat(formatted)
}

In [5]:
# Estimate the k-fold cross-validated error of an rpart classification tree.
#
# dfx:     data frame holding the predictors plus the class label in `flag`
# verbose: if TRUE, print a "FoldN" header and the per-fold counts
#          (the counts are printed by correctVsPredict)
# k:       number of folds requested from createFolds (default 10)
# cp:      complexity parameter handed to rpart (default 0.017)
#
# Returns a numeric vector of length 3:
#   [1] mean misclassification rate over the folds (the "CV value")
#   [2] mean over the folds of correct/predicted for "Red"
#   [3] mean over the folds of correct/predicted for "Blue"
# A fold in which a class is never predicted has an undefined (0/0) ratio;
# such folds are left out of the averages in [2] and [3] so that a single
# degenerate fold does not turn the whole mean into NaN.
CV <- function(dfx, verbose=FALSE, k=10, cp=0.017) {
    # Split the row indices into folds based on the label column
    folds <- createFolds(dfx$flag, k=k)
    nFolds <- length(folds)

    # Per-fold results, preallocated instead of grown inside the loop
    errs <- numeric(nFolds)     # misclassification rate
    reds <- numeric(nFolds)     # correct/predicted for "Red"
    blues <- numeric(nFolds)    # correct/predicted for "Blue"

    for (i in seq_along(folds)) {
        ids <- folds[[i]]
        train <- dfx[-ids, ]
        test <- dfx[ids, ]

        # Fit a classification tree on the training part
        fit <- rpart(flag ~ ., data=train, method="class", cp=cp)

        # Predict on the held-out fold and pick, per row, the column (class)
        # with the highest value; ties go to the first column
        p <- predict(fit, newdata=test)
        predictedFlags <- colnames(p)[max.col(p, ties.method = "first")]

        if (verbose) {
            printf("Fold%d\n", i)
        }
        result <- correctVsPredict(test, predictedFlags, verbose)
        if (verbose) {
            printf("\n")
        }

        reds[i] <- result[1]
        blues[i] <- result[2]
        errs[i] <- sum(predictedFlags != test$flag) / nrow(test)
    }

    c(mean(errs), mean(reds, na.rm=TRUE), mean(blues, na.rm=TRUE))
}

In [6]:
# For each of the classes "Red" and "Blue", compute which fraction of the
# rows predicted as that class really carry that label (correct / predicted).
#
# test:           data frame of evaluated rows; the true label is read from
#                 its `flag` column
# predictedFlags: vector of predicted labels, one entry per row of `test`
# verbose:        if TRUE, print the counts and the ratio for both classes
#
# Returns c(<ratio for Red>, <ratio for Blue>).
# A ratio is NaN when the class was never predicted (0/0).
correctVsPredict <- function(test, predictedFlags, verbose=FALSE) {
    # Index the label vector directly instead of subsetting rows of `test`:
    # test[rows, ] collapses a one-column data frame into a plain vector,
    # after which `$flag` cannot be used.
    actualFlags <- test$flag

    # for Red
    actualWhenRed <- actualFlags[predictedFlags == "Red"]
    nCorrectReds <- sum(actualWhenRed == "Red")
    nPredictedReds <- length(actualWhenRed)
    redRatio <- nCorrectReds / nPredictedReds

    # for Blue
    actualWhenBlue <- actualFlags[predictedFlags == "Blue"]
    nCorrectBlues <- sum(actualWhenBlue == "Blue")
    nPredictedBlues <- length(actualWhenBlue)
    blueRatio <- nCorrectBlues / nPredictedBlues

    if (verbose) {
        printf("As for Red: correct/predict = %d/%d = %f\n", nCorrectReds, nPredictedReds, redRatio)
        printf("As for Blue: correct/predict = %d/%d = %f\n", nCorrectBlues, nPredictedBlues, blueRatio)
    }

    c(redRatio, blueRatio)
}

In [7]:
# Print how large a share of all rows is labelled "Red", and the
# complementary share (everything that is not "Red").
#
# dfx: data frame whose `flag` column holds the label
#
# Writes two lines to standard output; the return value is not meant
# to be used.
printRedRatios <- function(dfx) {
    # Count matches on the column itself. Selecting rows with
    # dfx[dfx$flag == "Red", ] would count rows with a missing flag as Red
    # and would collapse a one-column data frame into a vector.
    nRed <- sum(dfx$flag == "Red", na.rm=TRUE)
    nAll <- nrow(dfx)
    printf("Red/All = %d/%d = %f\n", nRed, nAll, nRed/nAll)
    printf("1 - Red/All = %d/%d = %f\n", nAll - nRed, nAll, 1 - nRed/nAll)
}

# Predict Red

In [8]:
# Load the data set. Strings are kept as character (not factor) so that the
# `flag` labels can be rewritten before the column is converted to a factor.
df3 <- read.csv("../data/middle/sp5.csv", stringsAsFactors=FALSE)

In [9]:
# Merge the A/B sub-labels into the two classes "Red" and "Blue",
# then store the label column as a factor.
df3$flag[df3$flag %in% c("RedA", "RedB")] <- "Red"
df3$flag[df3$flag %in% c("BlueA", "BlueB")] <- "Blue"
df3$flag <- as.factor(df3$flag)

In [10]:
# Keep only the feature columns listed in `valuables`, plus the label column
df <- df3[, c(valuables, "flag")]

In [11]:
# Run the cross-validation on the full data set and report the results.
# `result` is the vector returned by CV:
#   result[1] = mean error over the folds (CV value)
#   result[2] = mean correct/predicted ratio for Red
#   result[3] = mean correct/predicted ratio for Blue
result <- CV(df, verbose=TRUE)
# Class shares in the data, printed for comparison with the ratios below
printRedRatios(df3)
printf("Red: Mean correct/predict = %f\n", result[2])
printf("Blue: Mean correct/predict = %f\n", result[3])
printf("CV value: %f", result[1])

Fold1
As for Red: correct/predict = 43/68 = 0.632353
As for Blue: correct/predict = 19/36 = 0.527778

Fold2
As for Red: correct/predict = 47/72 = 0.652778
As for Blue: correct/predict = 20/33 = 0.606061

Fold3
As for Red: correct/predict = 46/78 = 0.589744
As for Blue: correct/predict = 13/26 = 0.500000

Fold4
As for Red: correct/predict = 47/69 = 0.681159
As for Blue: correct/predict = 23/35 = 0.657143

Fold5
As for Red: correct/predict = 48/76 = 0.631579
As for Blue: correct/predict = 17/28 = 0.607143

Fold6
As for Red: correct/predict = 39/66 = 0.590909
As for Blue: correct/predict = 18/38 = 0.473684

Fold7
As for Red: correct/predict = 50/75 = 0.666667
As for Blue: correct/predict = 20/30 = 0.666667

Fold8
As for Red: correct/predict = 50/74 = 0.675676
As for Blue: correct/predict = 21/30 = 0.700000

Fold9
As for Red: correct/predict = 48/73 = 0.657534
As for Blue: correct/predict = 20/31 = 0.645161

Fold10
As for Red: correct/predict = 49/76 = 0.644737
As for Blue: correct/predict

In [12]:
# Display and save the results: fit one classification tree on the full
# data set; the cells below print and plot this fit.
fit <- rpart(flag ~ ., data=df, method="class", cp=0.017)

In [13]:
# Print the complexity-parameter (CP) table of the fitted tree
printcp(fit)


Classification tree:
rpart(formula = flag ~ ., data = df, method = "class", cp = 0.017)

Variables actually used in tree construction:
[1] AccelerationSpeed CurveAverage      DiffAvgSpeed      Engine           
[5] RoadFactor        TimeHeadway       TimeToCollision  

Root node error: 449/1042 = 0.4309

n= 1042 

        CP nsplit rel error  xerror     xstd
1 0.074610      0   1.00000 1.00000 0.035602
2 0.028953      2   0.85078 0.91537 0.035136
3 0.024499      3   0.82183 0.92873 0.035223
4 0.017817      5   0.77283 0.91759 0.035151
5 0.017075      6   0.75501 0.89755 0.035012
6 0.017000      9   0.70379 0.89978 0.035028


In [14]:
# Write the plotcp() figure for the fitted tree to DTree-cp.png
png("DTree-cp.png", height=960, width=960, res=144)
plotcp(fit)
# Close the PNG device so the file is written out
dev.off()

In [15]:
# Show the summary of the tree, which includes the variable importance
summary(fit)

Call:
rpart(formula = flag ~ ., data = df, method = "class", cp = 0.017)
  n= 1042 

          CP nsplit rel error    xerror       xstd
1 0.07461024      0 1.0000000 1.0000000 0.03560167
2 0.02895323      2 0.8507795 0.9153675 0.03513625
3 0.02449889      3 0.8218263 0.9287305 0.03522312
4 0.01781737      5 0.7728285 0.9175947 0.03515108
5 0.01707498      6 0.7550111 0.8975501 0.03501247
6 0.01700000      9 0.7037862 0.8997773 0.03502844

Variable importance
      TimeHeadway      DiffAvgSpeed        RiskFactor             Speed 
               13                11                 9                 8 
  EmptinessOfRoad          MaxSpeed          RoadType            Engine 
                7                 6                 6                 5 
    AheadDistance AccelerationSpeed          Curve150   TimeToCollision 
                4                 4                 4                 4 
         Curve100      CurveAverage             Curve        RoadFactor 
                4         

In [16]:
# Draw the tree and write the figure to DTree.png
png("DTree.png", height=960, width=960, res=144)
plot(fit, uniform=TRUE, main="Classification Tree")
# Add the node labels to the plotted tree
text(fit, use.n=TRUE, all=TRUE, cex=.6)
# Close the PNG device so the file is written out
dev.off()