In [1]:
library(grid)
library(rpart)
library(rpart.plot)
library(partykit)
library(lattice)
library(ggplot2)
library(caret)

In [2]:
# Pin the RNG seed for the whole session.
# NOTE(review): createFolds() is called later and presumably samples at
# random, so this is what makes the fold assignment repeatable - confirm.
set.seed(seed = 1)

In [3]:
# Column names that must never be used as predictors: identifiers/coordinates,
# raw pedal signals, and the response column itself.
invalids <- c(
    'Time',
    'Longitude',
    'Latitude',
    'Brake',
    'Accel',
    'flag'
)

# TRUE for every element of `x` that is NOT listed in `invalids`.
# Vectorised: returns a logical vector the same length as `x`.
isValidColumn <- function(x) {
    # match() yields NA exactly where a name is absent from `invalids`.
    is.na(match(x, invalids))
}

In [4]:
# C-style printf: format the arguments with sprintf() and write the result to
# stdout via cat(). No newline is appended; put "\n" in the format string.
printf <- function(...) {
    formatted <- sprintf(...)
    cat(formatted)
}

In [5]:
CV <- function(dfx, verbose=FALSE, modelFormula=expr) {
    # Estimate the misclassification rate of an rpart classification tree by
    # cross-validation over the folds produced by createFolds().
    #
    # Args:
    #   dfx:          data frame holding a `flag` column (the class label) and
    #                 the predictor columns referenced by `modelFormula`.
    #   verbose:      if TRUE, print a "FoldN" header and the per-class report
    #                 from correctVsPredict() for every fold.
    #   modelFormula: model specification handed to rpart(). Defaults to the
    #                 global `expr`, which this function previously read
    #                 implicitly, so existing calls behave as before.
    #
    # Returns:
    #   The mean over folds of (misclassified test rows / test rows).
    folds <- createFolds(dfx$flag)

    # Preallocate one slot per fold instead of growing a vector with c().
    errs <- numeric(length(folds))
    for (count in seq_along(folds)) {
        ids <- folds[[count]]
        train <- dfx[-ids, ]
        test <- dfx[ids, ]

        fit <- rpart(modelFormula, data=train, method="class")
        # predict() is used here as a matrix with one named column per class;
        # pick the column with the largest value per row (first one on ties).
        p <- predict(fit, newdata=test)
        predictedFlags <- colnames(p)[max.col(p, ties.method = "first")]

        if (verbose) {
            printf("Fold%d\n", count)
            correctVsPredict(test, predictedFlags)
            printf("\n")
        }

        nerr <- sum(predictedFlags != test$flag)
        errs[count] <- nerr / nrow(test)
    }

    return(mean(errs))
}

In [6]:
correctVsPredict <- function(test, predictedFlags) {
    # Print, for the "Red" and "Blue" classes in turn, how many of the rows
    # predicted as that class really carry that flag:
    #   correct / predicted = ratio
    #
    # Args:
    #   test:           data frame with a `flag` column holding the true class.
    #   predictedFlags: predicted class per row of `test`, in the same order.

    # Overall accuracy report, kept disabled as in the original:
    # nTests <- nrow(test)
    # nCorrectAll <- sum(predictedFlags == test$flag)
    # printf("As for All: correct/all = %d/%d = %f\n", nCorrectAll, nTests, nCorrectAll / nTests)

    # Shared routine for one class: select the rows predicted as `label`,
    # count how many truly are `label`, and print using format string `fmt`.
    reportClass <- function(label, fmt) {
        predictedRows <- test[predictedFlags == label, ]
        nPredicted <- nrow(predictedRows)
        nCorrect <- sum(predictedRows$flag == label)
        printf(fmt, nCorrect, nPredicted, nCorrect / nPredicted)
    }

    # The output labels the Red class "A" and the Blue class "B".
    reportClass("Red", "As for A: correct/predict = %d/%d = %f\n")
    reportClass("Blue", "As for B: correct/predict = %d/%d = %f\n")
}

In [7]:
printRedRatios <- function(dfx) {
    # Print the share of rows whose `flag` is "Red" and the complementary
    # share, i.e. the baseline rates to compare a classifier against.
    #
    # Args:
    #   dfx: data frame with a `flag` column.
    nAll <- nrow(dfx)
    # Count matches directly rather than subsetting the data frame: subsetting
    # with a logical index that contains NA produces NA rows, so rows with a
    # missing flag used to be counted as Red. sum() of a logical is an integer,
    # which the %d conversions below require.
    nRed <- sum(dfx$flag == "Red", na.rm=TRUE)
    printf("Red/All = %d/%d = %f\n", nRed, nAll, nRed/nAll)
    printf("1 - Red/All = %d/%d = %f\n", nAll - nRed, nAll, 1 - nRed/nAll)
}

# Predict Red

In [8]:
# Load the data set. Strings stay plain character so that the `flag` labels
# can be rewritten in place before being converted to a factor.
df3 <- read.csv(
    file = "../data/middle/sp1.csv",
    stringsAsFactors = FALSE
)

In [9]:
# Collapse the A/B sub-labels into the two classes being predicted, then turn
# the label column into a factor.
df3$flag[df3$flag %in% c("RedA", "RedB")] <- "Red"
df3$flag[df3$flag %in% c("BlueA", "BlueB")] <- "Blue"
df3$flag <- factor(df3$flag)

In [10]:
# Every column of the data set, then only those accepted by isValidColumn()
# as predictors.
allFeatures <- colnames(df3)
features <- allFeatures[isValidColumn(allFeatures)]

In [11]:
# Model specification as text: `flag` explained by all retained features,
# joined with " + ". It is later passed to rpart() as the formula.
expr <- paste(
    "flag ~ ",
    paste(features, collapse = " + "),
    sep = " "
)

In [12]:
# Fold indices for df3, with createFolds()'s defaults spelled out.
# NOTE(review): CV() builds its own folds and nothing in the visible code
# reads folds3; this call is kept because it presumably advances the RNG
# before CV() runs - confirm before removing.
folds3 <- createFolds(y = df3$flag, k = 10, list = TRUE, returnTrain = FALSE)

In [13]:
# Cross-validate on df3 with per-fold reporting, then print the class balance
# and the mean fold error for comparison.
cv <- CV(dfx = df3, verbose = TRUE)
printRedRatios(dfx = df3)
printf("CV value: %f", cv)

Fold1
As for A: correct/predict = 40/60 = 0.666667
As for B: correct/predict = 22/39 = 0.564103

Fold2
As for A: correct/predict = 42/63 = 0.666667
As for B: correct/predict = 21/36 = 0.583333

Fold3
As for A: correct/predict = 39/64 = 0.609375
As for B: correct/predict = 17/35 = 0.485714

Fold4
As for A: correct/predict = 42/64 = 0.656250
As for B: correct/predict = 20/35 = 0.571429

Fold5
As for A: correct/predict = 43/68 = 0.632353
As for B: correct/predict = 16/29 = 0.551724

Fold6
As for A: correct/predict = 38/52 = 0.730769
As for B: correct/predict = 28/47 = 0.595745

Fold7
As for A: correct/predict = 38/61 = 0.622951
As for B: correct/predict = 19/37 = 0.513514

Fold8
As for A: correct/predict = 40/66 = 0.606061
As for B: correct/predict = 16/33 = 0.484848

Fold9
As for A: correct/predict = 44/73 = 0.602740
As for B: correct/predict = 13/25 = 0.520000

Fold10
As for A: correct/predict = 38/63 = 0.603175
As for B: correct/predict = 17/35 = 0.485714

Red/All = 566/985 = 0.574619


In [14]:
# Fit the classification tree on the full data set (no hold-out).
fit3 <- rpart(
    formula = expr,
    data = df3,
    method = "class"
)

In [15]:
# Print the summary of the full-data tree.
summary(object = fit3)

Call:
rpart(formula = expr, data = df3, method = "class")
  n= 985 

          CP nsplit rel error    xerror       xstd
1 0.03436754      0 1.0000000 1.0000000 0.03703250
2 0.02386635      5 0.8281623 0.9761337 0.03690972
3 0.01670644      7 0.7804296 0.9713604 0.03688323
4 0.01431981      9 0.7470167 0.9976134 0.03702095
5 0.01312649     10 0.7326969 0.9976134 0.03702095
6 0.01193317     12 0.7064439 1.0143198 0.03709850
7 0.01000000     13 0.6945107 1.0071599 0.03706622

Variable importance
       RiskFactor        DistSignal   TimeToCollision       TimeHeadway 
               14                13                13                12 
    AheadDistance             Speed AccelerationSpeed          RoadType 
                9                 8                 7                 7 
            Curve    DistManBicycle     SteeringAngle            Engine 
                7                 3                 2                 2 
        LaneCount        ManBicycle   ManBicycleCount 
         