#42 break up tuning into multiple sections WIP (#43)

* #42 break up tuning into multiple sections * Add train performance examples * Add train to hyperpar tuning effect * Fix example and use holdout where cv not necessary for explanation * Style spelling and formatting feedback
mlr-archive · Aug 18, 2016 · 89ca106 · 89ca106
1 parent 1f0a3c8
commit 89ca106
Show file tree

Hide file tree

Showing 5 changed files with 526 additions and 185 deletions.
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -18,6 +18,7 @@ pages:
     - 'Predict': 'predict.md'
     - 'Performance': 'performance.md'
     - 'Resampling': 'resample.md'
+    - 'Tuning': 'tune.md'
     - 'Benchmark Experiments': 'benchmark_experiments.md'
     - 'Parallelization': 'parallelization.md'
     - 'Visualization': 'visualization.md'
@@ -27,7 +28,7 @@ pages:
     - 'Preprocessing': 'preproc.md'
     - 'Imputation': 'impute.md'
     - 'Bagging': 'bagging.md'
-    - 'Tuning': 'tune.md'
+    - 'Advanced Tuning': 'advanced_tune.md'
     - 'Feature Selection': 'feature_selection.md'
     - 'Nested Resampling': 'nested_resampling.md'
     - 'Cost-Sensitive Classification': 'cost_sensitive_classif.md'
@@ -37,6 +38,7 @@ pages:
     - 'Learning Curves': 'learning_curve.md'
     - 'Partial Dependence Plots': 'partial_dependence.md'
     - 'Classifier Calibration Plots': 'classifier_calibration.md'
+    - 'Hyperparameter Tuning Effects': 'hyperpar_tuning_effects.md'
 - Extend:
     - 'Create Custom Learners': 'create_learner.md'
     - 'Create Custom Measures': 'create_measure.md'

diff --git a/src/advanced_tune.Rmd b/src/advanced_tune.Rmd
@@ -0,0 +1,93 @@
+# Advanced Tuning
+
+## Iterated F-Racing for mixed spaces and dependencies
+
+The package supports a large number of tuning algorithms, which can all be looked up and
+selected via [&TuneControl]. One of the cooler algorithms is iterated F-racing from the 
+[%irace] package (technical description [here](http://iridia.ulb.ac.be/IridiaTrSeries/link/IridiaTr2011-004.pdf)). This not only works for arbitrary parameter types (numeric, integer,
+discrete, logical), but also for so-called dependent / hierarchical parameters:
+
+```{r}
+ps = makeParamSet(
+  makeNumericParam("C", lower = -12, upper = 12, trafo = function(x) 2^x),  # trafo maps x to 2^x
+  makeDiscreteParam("kernel", values = c("vanilladot", "polydot", "rbfdot")),
+  makeNumericParam("sigma", lower = -12, upper = 12, trafo = function(x) 2^x,
+    requires = quote(kernel == "rbfdot")),  # sigma depends on kernel == "rbfdot"
+  makeIntegerParam("degree", lower = 2L, upper = 5L,
+    requires = quote(kernel == "polydot"))  # degree depends on kernel == "polydot"
+)
+ctrl = makeTuneControlIrace(maxExperiments = 200L)  # iterated F-racing control object
+rdesc = makeResampleDesc("Holdout")
+res = tuneParams("classif.ksvm", iris.task, rdesc, par.set = ps, control = ctrl, show.info = FALSE)
+print(head(as.data.frame(res$opt.path)))  # first rows of the optimization path
+```
+
+See how we made the kernel parameters like `sigma` and `degree` dependent on the `kernel`
+selection parameter? This approach allows you to tune parameters of multiple kernels at once, 
+efficiently concentrating on the ones which work best for your given data set.
+
+
+## Tuning across whole model spaces with ModelMultiplexer
+
+We can now take the previous example even one step further. If we use the
+[ModelMultiplexer](&makeModelMultiplexer) we can tune over different model classes at once,
+just as we did with the SVM kernels above.
+
+```{r}
+base.learners = list(  # model classes to tune over at once
+  makeLearner("classif.ksvm"),
+  makeLearner("classif.randomForest")
+)
+lrn = makeModelMultiplexer(base.learners)  # multiplexed learner built from the base learners
+```
+
+Function [&makeModelMultiplexerParamSet] offers a simple way to construct a parameter set for tuning:
+The parameter names are prefixed automatically and the `requires` element is set, too,
+to make all parameters subordinate to `selected.learner`.
+
+```{r}
+ps = makeModelMultiplexerParamSet(lrn,
+  makeNumericParam("sigma", lower = -12, upper = 12, trafo = function(x) 2^x),
+  makeIntegerParam("ntree", lower = 1L, upper = 500L)
+)
+print(ps)  # shows the automatically prefixed parameter names
+rdesc = makeResampleDesc("CV", iters = 2L)  # 2-fold cross-validation
+ctrl = makeTuneControlIrace(maxExperiments = 200L)
+res = tuneParams(lrn, iris.task, rdesc, par.set = ps, control = ctrl, show.info = FALSE)
+print(head(as.data.frame(res$opt.path)))  # first rows of the optimization path
+```
+
+
+## Multi-criteria evaluation and optimization
+
+During tuning you might want to optimize multiple, potentially conflicting, performance measures
+simultaneously.
+
+In the following example we aim to minimize both the false positive and the false negative rates
+([fpr](measures.md) and [fnr](measures.md)).
+We again tune the hyperparameters of an SVM (function [ksvm](&kernlab::ksvm)) with a radial
+basis kernel and use the [sonar classification task](&sonar.task) for illustration.
+As search strategy we choose a random search.
+
+For all available multi-criteria tuning algorithms see [&TuneMultiCritControl].
+
+```{r}
+ps = makeParamSet(  # C and sigma both range over [-12, 12] with trafo 2^x
+  makeNumericParam("C", lower = -12, upper = 12, trafo = function(x) 2^x),
+  makeNumericParam("sigma", lower = -12, upper = 12, trafo = function(x) 2^x)
+)
+ctrl = makeTuneMultiCritControlRandom(maxit = 30L)  # random search, maxit = 30
+rdesc = makeResampleDesc("Holdout")
+res = tuneParamsMultiCrit("classif.ksvm", task = sonar.task, resampling = rdesc, par.set = ps,
+  measures = list(fpr, fnr), control = ctrl, show.info = FALSE)  # two measures: fpr and fnr
+res
+head(as.data.frame(trafoOptPath(res$opt.path)))  # opt path after trafoOptPath()
+```
+
+The results can be visualized with function [&plotTuneMultiCritResult].
+The plot shows the false positive and false negative rates for all parameter settings evaluated
+during tuning. Points on the Pareto front are drawn slightly larger.
+
+```{r}
+plotTuneMultiCritResult(res)
+```
diff --git a/src/hyperpar_tuning_effects.Rmd b/src/hyperpar_tuning_effects.Rmd
@@ -0,0 +1,245 @@
+---
+title: "tutorial_draft"
+output: github_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(mlr)
+```
+
+# Evaluating Hyperparameter Tuning
+
+As mentioned in the [Tuning](tune.md) tutorial, tuning a machine learning algorithm typically involves:
+
+* the hyperparameter search space:
+
+```{r search_space, message = FALSE, warning = FALSE}
+# ex: create a search space for the C hyperparameter from 0.01 to 0.1
+ps = makeParamSet(
+  makeNumericParam("C", lower = 0.01, upper = 0.1)
+)
+```
+
+* the optimization algorithm (aka tuning method):
+
+```{r opt_algo, message = FALSE, warning = FALSE}
+# ex: random search with 100 iterations
+ctrl = makeTuneControlRandom(maxit = 100L)
+```
+
+* an evaluation method, i.e., a resampling strategy and a performance measure:
+
+```{r eval_method, message = FALSE, warning = FALSE}
+# ex: 2-fold CV
+rdesc = makeResampleDesc("CV", iters = 2L)
+```
+
+After tuning, you may want to evaluate the tuning process in order to answer questions such as:
+
+* How does varying the value of a hyperparameter change the performance of the machine learning algorithm?
+* What's the relative importance of each hyperparameter?
+* How did the optimization algorithm (prematurely) converge?
+
+mlr provides methods to generate and plot the data in order to evaluate the effect of hyperparameter tuning.
+
+## Generating hyperparameter tuning data
+
+mlr separates the generation of the data from the plotting of the data in case the user wishes to use the data in a custom way downstream.
+
+The `generateHyperParsEffectData` method takes the tuning result along with 2 additional arguments: `trafo` and `include.diagnostics`. The `trafo` argument will convert the hyperparameter data to be on the transformed scale in case a transformation was used when creating the parameter (as in the case below). The `include.diagnostics` argument will tell mlr whether to include the eol and any error messages from the learner.
+
+Below we perform random search on the C parameter for SVM on the famous Pima Indians dataset. We generate the hyperparameter effect data so that the C parameter is on the transformed scale and we do not include diagnostic data:
+
+```{r gen_data, message = FALSE, warning = FALSE}
+ps = makeParamSet(
+  makeNumericParam("C", lower = -5, upper = 5, trafo = function(x) 2^x)  # trafo maps x to 2^x
+)
+ctrl = makeTuneControlRandom(maxit = 100L)  # random search with 100 iterations
+rdesc = makeResampleDesc("CV", iters = 2L)  # 2-fold CV
+res = tuneParams("classif.ksvm", task = pid.task, control = ctrl,
+  measures = list(acc, mmce), resampling = rdesc, par.set = ps, show.info = FALSE)
+generateHyperParsEffectData(res, trafo = TRUE, include.diagnostics = FALSE)  # C on the transformed scale, no diagnostics
+```
+
+As a reminder from the [resampling](resample.md) tutorial, if we wanted to generate data on the training set as well as the validation set, we only need to make a few minor changes:
+
+```{r gen_data_train, message = FALSE, warning = FALSE}
+ps = makeParamSet(
+  makeNumericParam("C", lower = -5, upper = 5, trafo = function(x) 2^x)  # trafo maps x to 2^x
+)
+ctrl = makeTuneControlRandom(maxit = 100L)
+rdesc = makeResampleDesc("CV", iters = 2L, predict = "both")  # predict = "both": train and test sets
+res = tuneParams("classif.ksvm", task = pid.task, control = ctrl,
+  measures = list(acc, setAggregation(acc, train.mean), mmce, setAggregation(mmce,
+    train.mean)), resampling = rdesc, par.set = ps, show.info = FALSE)  # train.mean aggregates give the training-set columns
+generateHyperParsEffectData(res, trafo = TRUE, include.diagnostics = FALSE)
+```
+
+In the example below, we perform grid search on the C parameter for SVM on the Pima Indians dataset using nested cross validation. We generate the hyperparameter effect data so that the C parameter is on the untransformed scale and we do not include diagnostic data. As you can see below, nested cross validation is supported without any extra work by the user, allowing the user to obtain an unbiased estimator for the performance.
+
+```{r gen_nested, message = FALSE, warning = FALSE}
+ps = makeParamSet(
+  makeNumericParam("C", lower = -5, upper = 5, trafo = function(x) 2^x)
+)
+ctrl = makeTuneControlGrid()  # grid search
+rdesc = makeResampleDesc("CV", iters = 2L)  # resampling used inside the tune wrapper
+lrn = makeTuneWrapper("classif.ksvm", control = ctrl,
+  measures = list(acc, mmce), resampling = rdesc, par.set = ps, show.info = FALSE)
+res = resample(lrn, task = pid.task, resampling = cv2, extract = getTuneResult)  # cv2 is the outer resampling
+generateHyperParsEffectData(res)  # trafo not set, so C stays on the untransformed scale
+```
+
+After generating the hyperparameter effect data, the next step is to visualize it. mlr has several methods built-in to visualize the data, meant to support the needs of the researcher and the engineer in industry. The next few sections will walk through the visualization support for several use cases.
+
+## Visualizing the effect of a single hyperparameter
+
+In a situation when the user is tuning a single hyperparameter for a learner, the user may wish to plot the performance of the learner against the values of the hyperparameter.
+
+In the example below, we tune the number of clusters against the silhouette
+score on the mtcars dataset. We specify the x-axis with the `x` argument and the y-axis with the `y` argument. If the `plot.type` argument is not specified, mlr will attempt to plot a scatterplot by default. Since `plotHyperParsEffect` returns a `ggplot2` object, we can easily customize it to our liking!
+
+```{r cluster, message = FALSE, warning = FALSE}
+ps = makeParamSet(
+  makeDiscreteParam("centers", values = 3:10)  # candidate numbers of cluster centers
+)
+ctrl = makeTuneControlGrid()
+rdesc = makeResampleDesc("Holdout")
+res = tuneParams("cluster.kmeans", task = mtcars.task, control = ctrl,
+  measures = silhouette, resampling = rdesc, par.set = ps, show.info = FALSE)
+data = generateHyperParsEffectData(res)
+plt = plotHyperParsEffect(data, x = "centers", y = "silhouette.test.mean")  # plot.type left unset
+# add our own touches to the plot
+plt + geom_point(colour = "red") +
+  ggtitle("Evaluating Number of Cluster Centers on mtcars") +
+  scale_x_continuous(breaks = 3:10) +
+  theme_bw()
+```
+
+In the example below, we tune SVM with the C hyperparameter on the Pima dataset. We will use a simulated annealing optimizer, so we are interested in seeing if the optimization algorithm actually improves with iterations. By default, mlr only plots improvements to the global optimum.
+
+```{r sa_single, message = FALSE, warning = FALSE}
+ps = makeParamSet(
+  makeNumericParam("C", lower = -5, upper = 5, trafo = function(x) 2^x)
+)
+ctrl = makeTuneControlGenSA(budget = 100L)  # simulated annealing with budget = 100
+rdesc = makeResampleDesc("Holdout")
+res = tuneParams("classif.ksvm", task = pid.task, control = ctrl,
+  resampling = rdesc, par.set = ps, show.info = FALSE)
+data = generateHyperParsEffectData(res)
+plt = plotHyperParsEffect(data, x = "iteration", y = "mmce.test.mean",
+  plot.type = "line")  # performance over tuning iterations
+plt + ggtitle("Analyzing convergence of simulated annealing") +
+  theme_minimal()
+```
+
+In the case of a learner crash, mlr will impute the crash with the worst value graphically and indicate the point. In the example below, we give the C parameter negative values, which will result in a learner crash for SVM.
+
+```{r single_crash, message = FALSE, warning = FALSE}
+ps = makeParamSet(
+  makeDiscreteParam("C", values = c(-1, -0.5, 0.5, 1, 1.5))  # negative values are included on purpose
+)
+ctrl = makeTuneControlGrid()
+rdesc = makeResampleDesc("CV", iters = 2L)
+res = tuneParams("classif.ksvm", task = pid.task, control = ctrl,
+  measures = list(acc, mmce), resampling = rdesc, par.set = ps, show.info = FALSE)
+data = generateHyperParsEffectData(res)
+plt = plotHyperParsEffect(data, x = "C", y = "acc.test.mean")
+plt + ggtitle("SVM learner crashes with negative C") +
+  theme_bw()
+```
+
+The example below uses nested cross validation with an outer loop of 2 runs. mlr indicates each run within the visualization.
+
+```{r single_nested, message = FALSE, warning = FALSE}
+ps = makeParamSet(
+  makeNumericParam("C", lower = -5, upper = 5, trafo = function(x) 2^x)
+)
+ctrl = makeTuneControlGrid()
+rdesc = makeResampleDesc("Holdout")  # resampling used inside the tune wrapper
+lrn = makeTuneWrapper("classif.ksvm", control = ctrl,
+  measures = list(acc, mmce), resampling = rdesc, par.set = ps, show.info = FALSE)
+res = resample(lrn, task = pid.task, resampling = cv2, extract = getTuneResult)  # cv2 is the outer resampling
+data = generateHyperParsEffectData(res)
+plotHyperParsEffect(data, x = "C", y = "acc.test.mean", plot.type = "line")
+```
+
+## Visualizing the effect of 2 hyperparameters
+
+In the case of tuning 2 hyperparameters simultaneously, mlr provides the ability to plot a heatmap and contour plot in addition to a scatterplot or line.
+
+In the example below, we tune the C and sigma parameters for SVM on the Pima dataset. We use interpolation to produce a regular grid for plotting the heatmap. The `interpolate` argument accepts any regression learner from mlr to perform the interpolation. The `z` argument will be used to fill the heatmap or color lines, depending on the `plot.type` used.
+
+```{r two, message = FALSE, warning = FALSE}
+ps = makeParamSet(
+  makeNumericParam("C", lower = -5, upper = 5, trafo = function(x) 2^x),
+  makeNumericParam("sigma", lower = -5, upper = 5, trafo = function(x) 2^x))
+ctrl = makeTuneControlRandom(maxit = 100L)
+rdesc = makeResampleDesc("Holdout")
+learn = makeLearner("classif.ksvm", par.vals = list(kernel = "rbfdot"))
+res = tuneParams(learn, task = pid.task, control = ctrl, measures = acc,
+  resampling = rdesc, par.set = ps, show.info = FALSE)
+data = generateHyperParsEffectData(res)
+plt = plotHyperParsEffect(data, x = "C", y = "sigma", z = "acc.test.mean",
+  plot.type = "heatmap", interpolate = "regr.earth")  # regr.earth interpolates onto a regular grid
+min_plt = min(data$data$acc.test.mean, na.rm = TRUE)
+max_plt = max(data$data$acc.test.mean, na.rm = TRUE)
+med_plt = mean(c(min_plt, max_plt))  # midpoint of the observed range, used to center the fill scale
+plt + scale_fill_gradient2(breaks = seq(min_plt, max_plt, length.out = 5),
+  low = "blue", mid = "white", high = "red", midpoint = med_plt)
+```
+
+We can use the `show.experiments` argument in order to visualize which points were specifically passed to the learner in the original experiment and which points were interpolated by mlr:
+
+```{r two_showargs, message = FALSE, warning = FALSE}
+plt = plotHyperParsEffect(data, x = "C", y = "sigma", z = "acc.test.mean", 
+  plot.type = "heatmap", interpolate = "regr.earth", show.experiments = TRUE)  # also mark the points passed to the learner
+plt + scale_fill_gradient2(breaks = seq(min_plt, max_plt, length.out = 5), 
+  low = "blue", mid = "white", high = "red", midpoint = med_plt)  # min_plt, max_plt, med_plt come from the preceding chunk
+```
+
+We can also visualize how long the optimizer takes to reach an optimum for the same example:
+
+```{r two_optima, message = FALSE, warning = FALSE}
+plotHyperParsEffect(data, x = "iteration", y = "acc.test.mean", 
+  plot.type = "line")  # accuracy over tuning iterations, reusing `data` from the chunk above
+```
+
+In the case where we are tuning 2 hyperparameters and we have a learner crash, mlr will indicate the respective points and impute them with the worst value. In the example below, we tune C and sigma, forcing C to be negative for some instances which will crash SVM. We perform interpolation to get a regular grid in order to plot a heatmap. We can see that the interpolation creates axis-parallel lines resulting from the learner crashes.
+
+```{r two_crash, message = FALSE, warning = FALSE}
+ps = makeParamSet(
+  makeDiscreteParam("C", values = c(-1, 0.5, 1.5, 1, 0.2, 0.3, 0.4, 5)),  # includes a negative value on purpose
+  makeDiscreteParam("sigma", values = c(-1, 0.5, 1.5, 1, 0.2, 0.3, 0.4, 5)))
+ctrl = makeTuneControlGrid()
+rdesc = makeResampleDesc("Holdout")
+learn = makeLearner("classif.ksvm", par.vals = list(kernel = "rbfdot"))
+res = tuneParams(learn, task = pid.task, control = ctrl, measures = acc,
+  resampling = rdesc, par.set = ps, show.info = FALSE)
+data = generateHyperParsEffectData(res)
+plotHyperParsEffect(data, x = "C", y = "sigma", z = "acc.test.mean",
+  plot.type = "heatmap", interpolate = "regr.earth")
+```
+
+A slightly more complicated example is using nested cross validation while simultaneously tuning 2 hyperparameters. In order to plot a heatmap in this case, mlr will aggregate each of the nested runs by a user-specified function. The default function is `mean`. As expected, we can still take advantage of interpolation.
+
+```{r two_nested, message = FALSE, warning = FALSE}
+ps = makeParamSet(
+  makeNumericParam("C", lower = -5, upper = 5, trafo = function(x) 2^x),
+  makeNumericParam("sigma", lower = -5, upper = 5, trafo = function(x) 2^x))
+ctrl = makeTuneControlRandom(maxit = 100L)
+rdesc = makeResampleDesc("Holdout")  # resampling used inside the tune wrapper
+learn = makeLearner("classif.ksvm", par.vals = list(kernel = "rbfdot"))
+lrn = makeTuneWrapper(learn, control = ctrl, measures = list(acc, mmce),
+  resampling = rdesc, par.set = ps, show.info = FALSE)
+res = resample(lrn, task = pid.task, resampling = cv2, extract = getTuneResult)  # cv2 is the outer resampling
+data = generateHyperParsEffectData(res)
+plt = plotHyperParsEffect(data, x = "C", y = "sigma", z = "acc.test.mean",
+  plot.type = "heatmap", interpolate = "regr.earth", show.experiments = TRUE,
+  nested.agg = mean)  # nested runs are aggregated with mean
+min_plt = min(plt$data$acc.test.mean, na.rm = TRUE)  # range taken from the plot's own data
+max_plt = max(plt$data$acc.test.mean, na.rm = TRUE)
+med_plt = mean(c(min_plt, max_plt))
+plt + scale_fill_gradient2(breaks = seq(min_plt, max_plt, length.out = 5),
+  low = "red", mid = "white", high = "blue", midpoint = med_plt)
+```
diff --git a/src/learning_curve.Rmd b/src/learning_curve.Rmd
@@ -60,11 +60,23 @@ lrns = list(
 )
 rin = makeResampleDesc(method = "CV", iters = 5)
 lc = generateLearningCurveData(learners = lrns, task = sonar.task,
-                               percs = seq(0.1, 1, by = 0.1), measures = acc,
-                               resampling = rin, show.info = FALSE)
+  percs = seq(0.1, 1, by = 0.1), measures = acc,
+  resampling = rin, show.info = FALSE)
 plotLearningCurve(lc)
 ```
 
+We can display performance on the train set as well as the test set:
+
+```{r}
+rin2 = makeResampleDesc(method = "CV", iters = 5, predict = "both")  # predict = "both": train and test sets
+lc2 = generateLearningCurveData(learners = lrns, task = sonar.task,
+  percs = seq(0.1, 1, by = 0.1),
+  measures = list(acc, setAggregation(acc, train.mean)), resampling = rin2,
+  show.info = FALSE)
+plotLearningCurve(lc2, facet = "learner")  # one panel per learner
+```
+
+
 There is also an experimental [%ggvis] plotting function, [&plotLearningCurveGGVIS]. Instead of the `facet`
 argument to [&plotLearningCurve] there is an argument `interactive` which plays a similar role. As subplots
 are not available in [%ggvis], measures or learners are mapped to an interactive sidebar which allows selection