## Classifying handwritten digits

In [1]:
# Install the feather package only when it cannot be loaded.
# requireNamespace() checks this one package directly, whereas
# installed.packages() scans every installed package just to test membership.
if (!requireNamespace("feather", quietly = TRUE)) {
    install.packages("feather")
}

In [2]:
# Install the caret package only when it cannot be loaded.
# requireNamespace() checks this one package directly, whereas
# installed.packages() scans every installed package just to test membership.
if (!requireNamespace("caret", quietly = TRUE)) {
    install.packages("caret")
}

In [3]:
# Install the kernlab package only when it cannot be loaded.
# requireNamespace() checks this one package directly, whereas
# installed.packages() scans every installed package just to test membership.
if (!requireNamespace("kernlab", quietly = TRUE)) {
    install.packages("kernlab")
}

In [4]:
# Install the e1071 package only when it cannot be loaded.
# requireNamespace() checks this one package directly, whereas
# installed.packages() scans every installed package just to test membership.
if (!requireNamespace("e1071", quietly = TRUE)) {
    install.packages("e1071")
}

In [5]:
# Install the MLmetrics package only when it cannot be loaded.
# requireNamespace() checks this one package directly, whereas
# installed.packages() scans every installed package just to test membership.
if (!requireNamespace("MLmetrics", quietly = TRUE)) {
    install.packages("MLmetrics")
}

In [6]:
# Install the doParallel package only when it cannot be loaded.
# requireNamespace() checks this one package directly, whereas
# installed.packages() scans every installed package just to test membership.
if (!requireNamespace("doParallel", quietly = TRUE)) {
    install.packages("doParallel")
}

In [7]:
library(feather)

# Local file name the dataset is saved under, and the remote location it is
# fetched from.
destfile <- "mnist.feather"
url <- "https://github.com/lmassaron/datasets/releases/download/1.0/mnist.feather"

# Feather is a binary format, so the file is written in binary mode ("wb").
download.file(url = url, destfile = destfile, mode = "wb")

# Read the downloaded file into `digits`.
digits <- read_feather(path = destfile)

In this example we leave the train/test partition to the caret library, using its createDataPartition function (https://www.rdocumentation.org/packages/caret/versions/6.0-86/topics/createDataPartition). It offers more options, which makes it more similar to the data-splitting functions in Python's Scikit-learn.

In [8]:
library(caret)

# Convert the target to a factor ONCE, before splitting, so that the train
# and test targets are guaranteed to share the same set of levels (factoring
# each subset separately would drop any class absent from that subset and
# make later factor comparisons fail).
target <- as.factor(digits$target)

# Partitioning the data into train and test sets: `train` holds the row
# indices of the training set (p = 0.8 of the rows).
set.seed(42)
train <- as.numeric(createDataPartition(y = target, p = 0.8, list = FALSE))

# Columns 2..no_vars are used as predictors (column 1 is presumably the
# target -- confirm against the dataset layout).
no_vars <- ncol(digits)
predictors <- digits[, 2:no_vars]

# Keep only the predictors whose variance exceeds 0.001. vapply() guarantees
# a logical vector with one entry per column.
# NOTE(review): the variance is computed on all rows, test rows included;
# computing it on the training rows only would avoid any information leak.
non_zero_var <- vapply(
    predictors,
    function(v) var(v, na.rm = TRUE) > 0.001,
    logical(1)
)

# Train and test targets (same factor levels by construction)
y <- target[train]
yt <- target[-train]

# Train and test predictors as plain data.frames
X <- as.data.frame(predictors[train, non_zero_var])
Xt <- as.data.frame(predictors[-train, non_zero_var])

Loading required package: lattice

Loading required package: ggplot2



We also try to speed up the examples by using parallelism, since we expect most of our readers' computers to have multiple cores.

In [9]:
# Run the cross-validation in parallel on a local socket cluster.
library(doParallel)

# The number of parallel workers is chosen with respect to the number of
# CV folds.
cl <- makePSOCKcluster(3)

# Register the cluster as the foreach backend.
# Call stopCluster(cl) once all model fitting is finished to release the
# worker processes.
registerDoParallel(cl)

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



This time we use caret to its full extent, by defining a trainControl object and a tuning grid (tuneGrid) of parameter values to be checked.

In [10]:
set.seed(42)

# Resampling setup: 3-fold cross-validation summarised with caret's
# multi-class metrics, without class probabilities.
# `allowParallel` is an option of trainControl(), so it is set here; when
# passed to train() it is only forwarded through `...` to the underlying
# model-fitting routine instead of controlling the resampling loop.
ctrl <- trainControl(method = "cv",
                     number = 3,
                     summaryFunction = multiClassSummary,
                     classProbs = FALSE,
                     allowParallel = TRUE)

# Define ranges for the two parameters on a log10 scale, in half-decade
# steps: C from 10^-1.5 to 10^2 and sigma from 10^-3 to 10^-2.
C_range <- 10^seq(-1.5, 2, 0.5)
sigma_range <- 10^seq(-3, -2, 0.5)

# Every (sigma, C) combination is evaluated.
grid <- expand.grid(sigma = sigma_range,
                    C = C_range)

# Fit the radial-kernel SVM over the grid, selecting on accuracy; predictors
# are centered and scaled as part of the fit.
svm_model <- train(X, y,
                   method = "svmRadial",
                   metric = "Accuracy",
                   preProcess = c("center", "scale"),
                   trControl = ctrl,
                   tuneGrid = grid)

"There were missing values in resampled performance measures."


In [11]:
# Show the fitted model summary, including the cross-validated resampling
# results for each (sigma, C) pair that was tried.
print(svm_model)

Support Vector Machines with Radial Basis Function Kernel 

1442 samples
  60 predictor
  10 classes: 'digit_0', 'digit_1', 'digit_2', 'digit_3', 'digit_4', 'digit_5', 'digit_6', 'digit_7', 'digit_8', 'digit_9' 

Pre-processing: centered (60), scaled (60) 
Resampling: Cross-Validated (3 fold) 
Summary of sample sizes: 960, 962, 962 
Resampling results across tuning parameters:

  sigma        C             Accuracy   Kappa       Mean_F1    Mean_Sensitivity
  0.001000000    0.03162278  0.1602755  0.06565004        NaN  0.1584184       
  0.001000000    0.10000000  0.3204645  0.24410594        NaN  0.3187883       
  0.001000000    0.31622777  0.8799764  0.86662613  0.8802308  0.8793822       
  0.001000000    1.00000000  0.9334083  0.92600760  0.9336127  0.9331204       
  0.001000000    3.16227766  0.9659982  0.96221919  0.9659256  0.9658152       
  0.001000000   10.00000000  0.9757146  0.97301502  0.9756922  0.9755534       
  0.001000000   31.62277660  0.9819646  0.97995947  0.98178

In [12]:
# Predicted class labels for the held-out test predictors
preds <- predict(svm_model, newdata = Xt, type = "raw")

# Accuracy = share of test rows whose predicted label equals the true label
n_correct <- sum(preds == yt)
test_accuracy <- n_correct / length(yt)

In [13]:
# Report the accuracy measured on the test set
cat(paste0("Test accuracy: ", test_accuracy))

Test accuracy: 0.983098591549296