Merge 7a59061 into a1dd696

mlr-org · May 29, 2017 · d42bb15 · d42bb15
2 parents a1dd696 + 7a59061
commit d42bb15
Show file tree

Hide file tree

Showing 53 changed files with 814 additions and 85 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -97,6 +97,8 @@ S3method(getTaskClassLevels,ClassifTask)
 S3method(getTaskClassLevels,ClassifTaskDesc)
 S3method(getTaskClassLevels,MultilabelTask)
 S3method(getTaskClassLevels,MultilabelTaskDesc)
+S3method(getTaskClassLevels,OneClassTask)
+S3method(getTaskClassLevels,OneClassTaskDesc)
 S3method(getTaskCosts,Task)
 S3method(getTaskDesc,TaskDesc)
 S3method(getTaskDesc,default)
@@ -126,6 +128,7 @@ S3method(makePrediction,ClassifTaskDesc)
 S3method(makePrediction,ClusterTaskDesc)
 S3method(makePrediction,CostSensTaskDesc)
 S3method(makePrediction,MultilabelTaskDesc)
+S3method(makePrediction,OneClassTaskDesc)
 S3method(makePrediction,RegrTaskDesc)
 S3method(makePrediction,SurvTaskDesc)
 S3method(makeRLearner,classif.C50)
@@ -221,6 +224,7 @@ S3method(makeRLearner,cluster.kmeans)
 S3method(makeRLearner,multilabel.cforest)
 S3method(makeRLearner,multilabel.rFerns)
 S3method(makeRLearner,multilabel.randomForestSRC)
+S3method(makeRLearner,oneclass.svm)
 S3method(makeRLearner,regr.GPfit)
 S3method(makeRLearner,regr.IBk)
 S3method(makeRLearner,regr.LiblineaRL2L1SVR)
@@ -420,6 +424,7 @@ S3method(predictLearner,cluster.kmeans)
 S3method(predictLearner,multilabel.cforest)
 S3method(predictLearner,multilabel.rFerns)
 S3method(predictLearner,multilabel.randomForestSRC)
+S3method(predictLearner,oneclass.svm)
 S3method(predictLearner,regr.GPfit)
 S3method(predictLearner,regr.IBk)
 S3method(predictLearner,regr.LiblineaRL2L1SVR)
@@ -517,6 +522,7 @@ S3method(print,LearningCurveData)
 S3method(print,ListLearners)
 S3method(print,Measure)
 S3method(print,MultilabelTask)
+S3method(print,OneClassTask)
 S3method(print,OptModel)
 S3method(print,PartialDependenceData)
 S3method(print,Prediction)
@@ -673,6 +679,7 @@ S3method(trainLearner,cluster.kmeans)
 S3method(trainLearner,multilabel.cforest)
 S3method(trainLearner,multilabel.rFerns)
 S3method(trainLearner,multilabel.randomForestSRC)
+S3method(trainLearner,oneclass.svm)
 S3method(trainLearner,regr.GPfit)
 S3method(trainLearner,regr.IBk)
 S3method(trainLearner,regr.LiblineaRL2L1SVR)
@@ -944,6 +951,7 @@ export(makeMultilabelDBRWrapper)
 export(makeMultilabelNestedStackingWrapper)
 export(makeMultilabelStackingWrapper)
 export(makeMultilabelTask)
+export(makeOneClassTask)
 export(makeOverBaggingWrapper)
 export(makeOversampleWrapper)
 export(makePrediction)
@@ -954,6 +962,7 @@ export(makeRLearnerClassif)
 export(makeRLearnerCluster)
 export(makeRLearnerCostSens)
 export(makeRLearnerMultilabel)
+export(makeRLearnerOneClass)
 export(makeRLearnerRegr)
 export(makeRLearnerSurv)
 export(makeRegrTask)

diff --git a/R/Measure.R b/R/Measure.R
@@ -38,6 +38,7 @@
 #'     \item{surv}{Is the measure applicable for survival?}
 #'     \item{cluster}{Is the measure applicable for cluster?}
 #'     \item{costsens}{Is the measure applicable for cost-sensitive learning?}
+#'     \item{oneclass}{Is the measure applicable for one-classification (anomaly detection) learning?}
 #'     \item{req.pred}{Is prediction object required in calculation? Usually the case.}
 #'     \item{req.truth}{Is truth column required in calculation? Usually the case.}
 #'     \item{req.task}{Is task object required in calculation? Usually not the case}
@@ -123,14 +124,15 @@ makeMeasure = function(id, minimize, properties = character(0L),
 #' Get the default measure for a task type, task, task description or a learner.
 #' Currently these are:
 #'  \tabular{ll}{
+#'    oneclass    \tab f1 \cr
 #'    classif     \tab mmce\cr
 #'    regr        \tab mse\cr
 #'    cluster     \tab db\cr
 #'    surv        \tab cindex\cr
 #'    costsens    \tab mcp\cr
 #'    multilabel  \tab multilabel.hamloss\cr
 #' }
-#'
+#' Note: the default measure f1 for oneclass is an ad hoc solution and not the optimal measure for unsupervised learning.
 #' @param x [\code{character(1)} | \code{\link{Task}} | \code{\link{TaskDesc}} | \code{\link{Learner}}]\cr
 #'  Task type, task, task description, learner name, a learner, or a type of learner (e.g. "classif").
 #' @return [\code{\link{Measure}}].
@@ -147,6 +149,7 @@ getDefaultMeasure = function(x) {
   else
     x
   switch(type,
+    oneclass = f1,
     classif = mmce,
     cluster = db,
     regr = mse,

diff --git a/R/OneClassTask.R b/R/OneClassTask.R
@@ -0,0 +1,124 @@
+#' @export
+#' @rdname Task
+
+makeOneClassTask = function(id = deparse(substitute(data)), data, target,
+  weights = NULL, blocking = NULL, fixup.data = "warn", positive = NA_character_, negative = NA_character_,
+    check.data = TRUE) {
+  assertString(id)
+  # 'positive' and 'negative' name the two class labels; NA means "not given" (it is resolved in makeOneClassTaskDesc)
+  # each label must be a string or a number; a number is converted into its string form
+  assert(
+    checkString(positive, na.ok = TRUE),
+    checkNumber(positive, na.ok = TRUE)
+  )
+  if (isScalarNumeric(positive))
+    positive = as.character(positive)
+
+  assert(
+    checkString(negative, na.ok = TRUE),
+    checkNumber(negative, na.ok = TRUE)
+  )
+  if (isScalarNumeric(negative))
+    negative = as.character(negative)
+
+  assertDataFrame(data)
+  assertString(target) # that this is a valid colname will be checked later in makeSupervisedTask
+
+  assertChoice(fixup.data, choices = c("no", "quiet", "warn"))
+  assertFlag(check.data)
+  # unless fixup is switched off, a character, logical or integer target is converted to a factor
+  if (fixup.data != "no") {
+    x = data[[target]]
+    if (is.character(x) || is.logical(x) || is.integer(x)) {
+      data[[target]] = as.factor(x)
+    }
+    # we probably don't want to autodrop empty target levels here (as in classif), as the anomaly class could be empty
+  }
+  # check that the class column is a factor without missing values and with at most 2 levels
+  if (check.data) {
+    assertFactor(data[[target]], any.missing = FALSE, empty.levels.ok = TRUE, max.levels = 2L, .var.name = target)
+  }
+
+  # check that the given positive and negative labels fit the class levels
+  levs = levels(data[[target]])
+  # two levels: every given label must be one of them
+  if (length(levs) == 2) {
+    if (!is.na(positive) && !is.na(negative) && !setequal(c(positive, negative), levs)) {
+      stopf("'positive' or 'negative' not equal to class levels")
+    }
+    if (!is.na(positive)) {
+      if (positive %nin% levs)
+        stopf("'positive' not element of the two class levels,")
+    }
+    if (!is.na(negative)) {
+      if (negative %nin% levs)
+        stopf("'negative' not element of the two class levels,")
+    }
+  } else if (length(levs) == 1) { # one level: with both labels given, at least one must match it
+    if (!is.na(positive) && !is.na(negative) && sum(c(positive, negative) %in% levs) == 0)
+      stopf("Neither 'positive' nor 'negative' are subset of class levels")
+  }
+  # the task is built from the data as it stands here; missing class levels are only added afterwards
+  task = makeSupervisedTask("oneclass", data, target, weights, blocking,
+    fixup.data = fixup.data, check.data = check.data)
+
+  if (fixup.data != "no") {
+    # only one level in the data: add the given positive/negative label(s) as further factor levels
+    if (length(levs) == 1) {
+      if (!is.na(positive) && !is.na(negative)) {
+        levels(data[[target]]) = union(levs, c(positive, negative))
+      } else {
+        if (!is.na(positive)) {
+          if (positive %nin% levs) levels(data[[target]]) = c(levs, positive)
+          else stopf("Cannot add second class level when 'positive' is equal to the only class level and no 'negative' is specified!")
+        }
+        if (!is.na(negative)) {
+          if (negative %nin% levs) levels(data[[target]]) = c(levs, negative)
+          else stopf("Cannot add second class level when 'negative' is equal to the only class level and no 'positive' is specified!")
+        }
+      }
+    }
+    # write the (possibly level-extended) data back into task$env
+    task$env$data = data
+  }
+  # set the one-class task description and tag the task with the "OneClassTask" class
+  task$task.desc = makeOneClassTaskDesc(id, data, target, weights, blocking, positive, negative)
+  addClasses(task, "OneClassTask")
+}
+
+makeOneClassTaskDesc = function(id, data, target, weights, blocking, positive, negative) {
+  # Builds the one-class task description; a label passed as NA is derived from the target's factor levels.
+  td = makeTaskDescInternal("oneclass", id, data, target, weights, blocking)
+  levs = levels(data[[target]])
+  single = length(levs) < 2L
+  if (is.na(positive) && is.na(negative)) {
+    # no label given: the first level becomes positive, the second one negative
+    if (single) stopf("Cannot auto-set negative class when there are < 2 class levels!")
+    positive = levs[1L]
+    negative = levs[2L]
+  } else if (is.na(positive)) {
+    if (single && negative %in% levs) stopf("Cannot auto-set positive class when there are < 2 class levels and negative is the only class level!")
+    positive = setdiff(levs, negative)
+  } else if (is.na(negative)) {
+    if (single && positive %in% levs) stopf("Cannot auto-set negative class when there are < 2 class levels and positive is the only class level!")
+    negative = setdiff(levs, positive)
+  }
+  # the resolved labels must be exactly the levels of the target
+  assertSetEqual(levs, c(positive, negative))
+  td$class.levels = c(positive, negative)
+  td$positive = positive
+  td$negative = negative
+  return(addClasses(td, c("OneClassTaskDesc", "SupervisedTaskDesc")))
+}
+
+#' @export
+print.OneClassTask = function(x, ...) {
+  # class counts of the target as text (presumably one string per printed line); the first element is dropped
+  di = printToChar(table(getTaskTargets(x)), collapse = NULL)[-1L]
+  print.SupervisedTask(x)
+  catf("Classes: %i", length(x$task.desc$class.levels))
+  catf("%s", collapse(di, "\n")) # constant format string: a "%" in a class label must not be parsed by sprintf
+  catf("Positive/Normal class: %s", x$task.desc$positive)
+  catf("Negative/Anomaly class: %s", x$task.desc$negative)
+}
+
diff --git a/R/Prediction.R b/R/Prediction.R
@@ -177,6 +177,48 @@ makePrediction.ClusterTaskDesc = function(task.desc, row.names, id, truth, predi
   return(p)
 }
 
+#' @export
+makePrediction.OneClassTaskDesc = function(task.desc, row.names, id, truth, predict.type, predict.threshold = NULL, y, time, error = NA_character_, dump = NULL) {
+  # Assembles the one-class prediction object from id, truth and the predicted values y.
+  # predict.threshold and dump are accepted but not used; the stored threshold is always NA.
+  data = namedList(c("id", "truth", "response", "prob"))
+  data$id = id
+  # truth can come from a simple "newdata" df. then there might not be all factor levels present
+  if (!is.null(truth)) {
+    levels(truth) = union(levels(truth), task.desc$class.levels)
+    data$truth = truth
+  }
+  if (predict.type == "response") {
+    data$response = y
+    data = as.data.frame(filterNull(data))
+  } else {
+    # any other predict.type: y is stored as the prob column(s)
+    data$prob = y
+    data = as.data.frame(filterNull(data))
+
+    # HACK need to create colnames with prob.TRUE for the normal class
+    # otherwise getPredictionProbabilities() will throw an error
+    # "Trying to get probabilities for nonexistant classes: %s", collapse(cl) (line 56)
+    # columns whose name contains a column name of y are renamed to "prob.<column name of y>"
+    indices = stri_detect_fixed(names(data), colnames(y))
+
+    if (sum(indices) > 0) #?
+      names(data)[indices] = stri_paste("prob.", colnames(y))
+  }
+
+  # TODO: handling specific to predict.type "prob" is to be added in branch h2o and branch convertScoretoProb
+  p = makeS3Obj(c("PredictionOneClass", "Prediction"),
+    predict.type = predict.type,
+    data = setRowNames(data, row.names),
+    threshold = NA_real_,
+    task.desc = task.desc,
+    time = time,
+    error = error
+  )
+  return(p)
+}
+
+
 #' @export
 makePrediction.CostSensTaskDesc = function(task.desc, row.names, id, truth, predict.type, predict.threshold = NULL, y, time, error = NA_character_, dump = NULL) {
   data = namedList(c("id", "response"))

diff --git a/R/RLearner.R b/R/RLearner.R
@@ -67,7 +67,7 @@ makeRLearnerInternal = function(id, type, package, par.set, par.vals, properties
   requirePackages(package, why = stri_paste("learner", id, sep = " "), default.method = "load")
 
   assertString(id)
-  assertChoice(type, choices = c("classif", "regr", "multilabel", "surv", "cluster", "costsens"))
+  assertChoice(type, choices = c("oneclass", "classif", "regr", "multilabel", "surv", "cluster", "costsens"))
   assertSubset(properties, listLearnerProperties(type))
   assertClass(par.set, classes = "ParamSet")
   checkListElementClass(par.set$pars, "LearnerParam")
@@ -165,3 +165,12 @@ makeRLearnerCostSens = function(cl, package, par.set, par.vals = list(), propert
 
   return(lrn)
 }
+
+#' @export
+#' @rdname RLearner
+makeRLearnerOneClass = function(cl, package, par.set, par.vals = list(), properties = character(0L), name = cl, short.name = cl, note = "", callees = character(0L)) {
+  # build the generic learner object with the learner type fixed to "oneclass"
+  lrn = makeRLearnerInternal(cl, "oneclass", package, par.set, par.vals, properties, name, short.name, note, callees)
+  # tag it with the learner's own class name and the "RLearnerOneClass" class
+  return(addClasses(lrn, c(cl, "RLearnerOneClass")))
+}
diff --git a/R/RLearner_oneclass_svm.R b/R/RLearner_oneclass_svm.R
@@ -0,0 +1,46 @@
+#' @export
+makeRLearner.oneclass.svm = function() {
+  # One-class SVM learner backed by e1071::svm; the "type" parameter is fixed to "one-classification".
+  makeRLearnerOneClass(
+    cl = "oneclass.svm", package = "e1071",
+    par.set = makeParamSet(
+      makeDiscreteLearnerParam(id = "type", default = "one-classification", values = "one-classification"),
+      makeNumericLearnerParam(id = "nu", default = 0.5, requires = quote(type == "nu-classification" || type == "one-classification" || type == "nu-regression")),
+      makeDiscreteLearnerParam(id = "kernel", default = "radial", values = c("linear", "polynomial", "radial", "sigmoid")),
+      makeIntegerLearnerParam(id = "degree", default = 3L, lower = 1L, requires = quote(kernel == "polynomial")),
+      makeNumericLearnerParam(id = "coef0", default = 0, requires = quote(kernel == "polynomial" || kernel == "sigmoid")),
+      makeNumericLearnerParam(id = "gamma", lower = 0, requires = quote(kernel != "linear")),
+      makeNumericLearnerParam(id = "cachesize", default = 40L),
+      makeNumericLearnerParam(id = "tolerance", default = 0.001, lower = 0),
+      makeLogicalLearnerParam(id = "shrinking", default = TRUE),
+      makeIntegerLearnerParam(id = "cross", default = 0L, lower = 0L, tunable = FALSE),
+      makeLogicalLearnerParam(id = "fitted", default = TRUE, tunable = FALSE),
+      makeLogicalVectorLearnerParam(id = "scale", default = TRUE, tunable = TRUE)
+    ),
+    par.vals = list(type = "one-classification"),
+    # no "weights" property: trainLearner.oneclass.svm never hands .weights on to e1071::svm
+    properties = c("oneclass", "numerics", "factors"),
+    name = "one-class Support Vector Machines (libsvm)", short.name = "one-class svm",
+    callees = "svm"
+  )
+}
+
+#' @export
+trainLearner.oneclass.svm = function(.learner, .task, .subset, .weights = NULL,  ...) {
+  # features only, no target (y = NULL); drop = FALSE keeps a data.frame for a single feature; .weights is not used
+  d = getTaskData(.task, .subset)[, getTaskFeatureNames(.task), drop = FALSE]
+  e1071::svm(d, y = NULL, ...)
+}
+
+#' @export
+predictLearner.oneclass.svm = function(.learner, .model, .newdata, ...) {
+  # svm currently can only predict the response, not probabilities
+  pred = predict(.model$learner.model, newdata = .newdata, ...)
+  if (.learner$predict.type != "response") return(pred)
+  # response: turn the prediction into a factor and append any class levels of the task it lacks
+  pred = as.factor(pred)
+  levels(pred) = union(levels(pred), .model$task.desc$class.levels)
+  pred
+}
+
+