# Apply Generalized Boosted Regression Modeling

In [1]:
library(grid)
library(rpart)
library(rpart.plot)
library(partykit)
library(lattice)
library(ggplot2)
library(caret)
library(gbm)

Loading required package: survival

Attaching package: ‘survival’

The following object is masked from ‘package:caret’:

    cluster

Loading required package: splines
Loading required package: parallel
Loaded gbm 2.1.1


In [2]:
# Fix the random number generator seed so that later random steps in this
# notebook give the same results on every run.
set.seed(1)

In [3]:
# Column names that should not be used as predictors.
# NOTE(review): presumably meant as the `invalids` argument of createFormula();
# no visible cell uses it - confirm.
#
# Earlier variant of the list, kept for reference:
# invalids <- c('Time', 'Longitude', 'Latitude', 'Brake', 'Accel', 'flag', 'Curve100', 'Curve150', 'TimeHeadway')
invalids <- c(
    "Time", "Longitude", "Latitude",
    "Brake", "Accel", "flag",
    "Curve100", "Curve150",
    "RightLamp", "LeftLamp", "UpLamp"
)

In [35]:
# Predictor columns: selected from the raw data when building dfx and passed
# to CV() as the feature set.
valuables <- c(
    "RoadType", "CurveAverage", "Speed", "MaxSpeed", "RiskFactor",
    "DistSignal", "Pitch", "AheadDistance", "AverageVelocity",
    "TimeToCollision", "AccelerationSpeed", "LaneCount", "Engine",
    "SteeringAngle"
)

In [5]:
# C-style printf: format the arguments with sprintf() and write the resulting
# text with cat(). No newline is appended; put "\n" in the format string.
printf <- function(...) {
    formatted <- sprintf(...)
    cat(formatted)
}

In [22]:
# Cross-validate a gradient boosted model (gbm.fit) that predicts `flag`.
#
# Arguments:
#   dfx               data frame containing the predictor columns and a
#                     `flag` column (TRUE = Red, FALSE = Blue).
#   valuables         character vector naming the predictor columns of dfx.
#   verbose           if TRUE, print a per-fold report through
#                     correctVsPredict() and collect its per-class ratios.
#   nTrees            number of boosting iterations, used for both fitting
#                     and prediction.
#   interactionDepth  passed to gbm.fit() as interaction.depth.
#   shrinkage         passed to gbm.fit() as shrinkage.
#   threshold         predictions strictly above this value count as Red.
#
# Returns a numeric vector of length 3:
#   [1] mean misclassification rate over the folds,
#   [2] mean correct/predicted ratio for Red,
#   [3] mean correct/predicted ratio for Blue.
# Elements 2 and 3 are only computed by the verbose report and are NA when
# verbose is FALSE.
CV <- function(dfx, valuables, verbose=FALSE, nTrees=1000, interactionDepth=6, shrinkage=0.001, threshold=0.5) {
    folds <- createFolds(dfx$flag)
    nFolds <- length(folds)

    # Preallocate one slot per fold instead of growing vectors in the loop.
    # reds/blues start as NA so a non-verbose run averages to NA without the
    # "argument is not numeric or logical" warning that mean(c()) raises.
    errs <- numeric(nFolds)
    reds <- rep(NA_real_, nFolds)
    blues <- rep(NA_real_, nFolds)

    for (i in seq_len(nFolds)) {
        ids <- folds[[i]]
        train <- dfx[-ids, ]
        test <- dfx[ids, ]

        # drop=FALSE keeps a data frame even when there is a single predictor.
        fit <- gbm.fit(x=train[, valuables, drop=FALSE], y=train[, "flag"], distribution="gaussian", interaction.depth=interactionDepth, n.trees=nTrees, shrinkage=shrinkage, verbose=FALSE)

        # Predict on the same predictor columns that were used for fitting
        # rather than on the whole test frame (which also carries `flag`), so
        # the result does not depend on the column layout of dfx.
        p <- predict(fit, newdata=test[, valuables, drop=FALSE], n.trees=nTrees, verbose=FALSE)
        p <- p > threshold

        if (verbose) {
            printf("Fold%d\n", i)
            result <- correctVsPredict(test, p)
            reds[i] <- result[1]
            blues[i] <- result[2]
            printf("\n")
        }

        # Fraction of test rows whose predicted class differs from the truth.
        errs[i] <- sum(p != test$flag) / nrow(test)
    }

    c(mean(errs), mean(reds), mean(blues))
}

In [7]:
# Translate a single logical flag into its class label:
# "Red" when x is TRUE, "Blue" otherwise.
toRB <- function(x) {
    if (x) "Red" else "Blue"
}

In [8]:
# Print and return, for one test set, how often each predicted class was right.
#
# Arguments:
#   test            data frame with a `flag` column (TRUE = Red).
#   predictedFlags  logical row selector for `test`; TRUE = predicted Red.
#
# Prints one "correct/predict" line for Red and one for Blue, and returns
# c(Red ratio, Blue ratio). A ratio is NaN when no row was predicted as that
# class (0/0).
correctVsPredict <- function(test, predictedFlags) {
    # Overall accuracy report, currently disabled:
    # nTests <- nrow(test)
    # nCorrectAll <- sum((predictedFlags == test$flag) == TRUE)
    # printf("As for All: correct/all = %d/%d = %f\n", nCorrectAll, nTests, nCorrectAll / nTests)

    # Red: true flags of the rows that were predicted Red.
    actualWhenRed <- test[predictedFlags, ]$flag
    redHits <- sum(actualWhenRed)
    redCalls <- length(actualWhenRed)
    redRatio <- redHits / redCalls
    printf("As for Red: correct/predict = %d/%d = %f\n", redHits, redCalls, redRatio)

    # Blue: true flags of the rows that were predicted Blue; a hit is FALSE.
    actualWhenBlue <- test[!predictedFlags, ]$flag
    blueHits <- sum(!actualWhenBlue)
    blueCalls <- length(actualWhenBlue)
    blueRatio <- blueHits / blueCalls
    printf("As for Blue: correct/predict = %d/%d = %f\n", blueHits, blueCalls, blueRatio)

    c(redRatio, blueRatio)
}

In [9]:
# Print the class balance of a data set: the share of rows flagged Red and
# its complement.
#
# Arguments:
#   dfx  data frame with a `flag` column (TRUE = Red).
#
# Rows whose flag is NA are not counted as Red.
printRedRatios <- function(dfx) {
    nAll <- nrow(dfx)
    # Count TRUE flags directly. Subsetting rows with `dfx$flag == TRUE`
    # would also pick up NA flags and collapses to a vector (no nrow) when
    # dfx has a single column.
    nRed <- sum(dfx$flag == TRUE, na.rm=TRUE)
    printf("Red/All = %d/%d = %f\n", nRed, nAll, nRed/nAll)
    printf("1 - Red/All = %d/%d = %f\n", nAll - nRed, nAll, 1 - nRed/nAll)
}

In [10]:
# Build the text of a model formula with `flag` as the response.
#
# Arguments:
#   invalids     names that must not appear on the right-hand side.
#   allFeatures  candidate predictor names.
#
# Returns a character string (not a formula object) of the form
# "flag ~  a + b + ..." containing every element of allFeatures that is not
# listed in invalids, in the original order.
createFormula <- function(invalids, allFeatures) {
    keep <- !(allFeatures %in% invalids)
    rhs <- paste(allFeatures[keep], collapse=" + ")
    paste("flag ~ ", rhs)
}

# Predict Red

In [36]:
# Load the data set; stringsAsFactors=FALSE keeps text columns (including
# `flag`) as character so the labels can be rewritten before conversion to a
# factor.
df3 <- read.csv("../data/middle/sp4.csv", stringsAsFactors=FALSE)

In [37]:
# Collapse the A/B sub-labels into the two classes "Red" and "Blue", then
# store the label as a factor. Any other value is left unchanged.
df3$flag[df3$flag %in% c("RedA", "RedB")] <- "Red"
df3$flag[df3$flag %in% c("BlueA", "BlueB")] <- "Blue"
df3$flag <- as.factor(df3$flag)

In [38]:
# Re-seed right before the cross-validation run so that its random steps are
# repeatable regardless of which earlier cells were executed.
set.seed(1)

In [39]:
# Modelling frame: the predictor columns plus the label, with the label
# turned into a logical (TRUE = "Red", FALSE = anything else).
dfx <- df3[, c(valuables, "flag")]
dfx[["flag"]] <- dfx[["flag"]] == "Red"

In [40]:
# Run the cross-validation with per-fold reporting, then summarise.
# CV() returns c(mean error rate, mean Red ratio, mean Blue ratio).
result <- CV(dfx, valuables, verbose=TRUE)
# Class balance of the full data set, for comparison with the ratios below.
printRedRatios(dfx)
printf("Red: Mean correct/predict = %f\n", result[2])
printf("Blue: Mean correct/predict = %f\n", result[3])
# result[1] is the mean misclassification rate over the folds.
printf("CV value: %f", result[1])

Fold1
As for Red: correct/predict = 53/80 = 0.662500
As for Blue: correct/predict = 17/24 = 0.708333

Fold2
As for Red: correct/predict = 51/81 = 0.629630
As for Blue: correct/predict = 15/24 = 0.625000

Fold3
As for Red: correct/predict = 52/83 = 0.626506
As for Blue: correct/predict = 14/21 = 0.666667

Fold4
As for Red: correct/predict = 52/87 = 0.597701
As for Blue: correct/predict = 10/17 = 0.588235

Fold5
As for Red: correct/predict = 54/87 = 0.620690
As for Blue: correct/predict = 12/17 = 0.705882

Fold6
As for Red: correct/predict = 49/81 = 0.604938
As for Blue: correct/predict = 13/23 = 0.565217

Fold7
As for Red: correct/predict = 50/74 = 0.675676
As for Blue: correct/predict = 21/31 = 0.677419

Fold8
As for Red: correct/predict = 52/78 = 0.666667
As for Blue: correct/predict = 19/26 = 0.730769

Fold9
As for Red: correct/predict = 46/77 = 0.597403
As for Blue: correct/predict = 14/27 = 0.518519

Fold10
As for Red: correct/predict = 50/79 = 0.632911
As for Blue: correct/predict