Skip to content

Commit

Permalink
Merge 7a59061 into a1dd696
Browse files Browse the repository at this point in the history
  • Loading branch information
MinhAnhL committed May 29, 2017
2 parents a1dd696 + 7a59061 commit d42bb15
Show file tree
Hide file tree
Showing 53 changed files with 814 additions and 85 deletions.
9 changes: 9 additions & 0 deletions NAMESPACE
Expand Up @@ -97,6 +97,8 @@ S3method(getTaskClassLevels,ClassifTask)
S3method(getTaskClassLevels,ClassifTaskDesc)
S3method(getTaskClassLevels,MultilabelTask)
S3method(getTaskClassLevels,MultilabelTaskDesc)
S3method(getTaskClassLevels,OneClassTask)
S3method(getTaskClassLevels,OneClassTaskDesc)
S3method(getTaskCosts,Task)
S3method(getTaskDesc,TaskDesc)
S3method(getTaskDesc,default)
Expand Down Expand Up @@ -126,6 +128,7 @@ S3method(makePrediction,ClassifTaskDesc)
S3method(makePrediction,ClusterTaskDesc)
S3method(makePrediction,CostSensTaskDesc)
S3method(makePrediction,MultilabelTaskDesc)
S3method(makePrediction,OneClassTaskDesc)
S3method(makePrediction,RegrTaskDesc)
S3method(makePrediction,SurvTaskDesc)
S3method(makeRLearner,classif.C50)
Expand Down Expand Up @@ -221,6 +224,7 @@ S3method(makeRLearner,cluster.kmeans)
S3method(makeRLearner,multilabel.cforest)
S3method(makeRLearner,multilabel.rFerns)
S3method(makeRLearner,multilabel.randomForestSRC)
S3method(makeRLearner,oneclass.svm)
S3method(makeRLearner,regr.GPfit)
S3method(makeRLearner,regr.IBk)
S3method(makeRLearner,regr.LiblineaRL2L1SVR)
Expand Down Expand Up @@ -420,6 +424,7 @@ S3method(predictLearner,cluster.kmeans)
S3method(predictLearner,multilabel.cforest)
S3method(predictLearner,multilabel.rFerns)
S3method(predictLearner,multilabel.randomForestSRC)
S3method(predictLearner,oneclass.svm)
S3method(predictLearner,regr.GPfit)
S3method(predictLearner,regr.IBk)
S3method(predictLearner,regr.LiblineaRL2L1SVR)
Expand Down Expand Up @@ -517,6 +522,7 @@ S3method(print,LearningCurveData)
S3method(print,ListLearners)
S3method(print,Measure)
S3method(print,MultilabelTask)
S3method(print,OneClassTask)
S3method(print,OptModel)
S3method(print,PartialDependenceData)
S3method(print,Prediction)
Expand Down Expand Up @@ -673,6 +679,7 @@ S3method(trainLearner,cluster.kmeans)
S3method(trainLearner,multilabel.cforest)
S3method(trainLearner,multilabel.rFerns)
S3method(trainLearner,multilabel.randomForestSRC)
S3method(trainLearner,oneclass.svm)
S3method(trainLearner,regr.GPfit)
S3method(trainLearner,regr.IBk)
S3method(trainLearner,regr.LiblineaRL2L1SVR)
Expand Down Expand Up @@ -944,6 +951,7 @@ export(makeMultilabelDBRWrapper)
export(makeMultilabelNestedStackingWrapper)
export(makeMultilabelStackingWrapper)
export(makeMultilabelTask)
export(makeOneClassTask)
export(makeOverBaggingWrapper)
export(makeOversampleWrapper)
export(makePrediction)
Expand All @@ -954,6 +962,7 @@ export(makeRLearnerClassif)
export(makeRLearnerCluster)
export(makeRLearnerCostSens)
export(makeRLearnerMultilabel)
export(makeRLearnerOneClass)
export(makeRLearnerRegr)
export(makeRLearnerSurv)
export(makeRegrTask)
Expand Down
5 changes: 4 additions & 1 deletion R/Measure.R
Expand Up @@ -38,6 +38,7 @@
#' \item{surv}{Is the measure applicable for survival?}
#' \item{cluster}{Is the measure applicable for cluster?}
#' \item{costsens}{Is the measure applicable for cost-sensitive learning?}
#' \item{oneclass}{Is the measure applicable for one-class classification (anomaly detection)?}
#' \item{req.pred}{Is prediction object required in calculation? Usually the case.}
#' \item{req.truth}{Is truth column required in calculation? Usually the case.}
#' \item{req.task}{Is task object required in calculation? Usually not the case}
Expand Down Expand Up @@ -123,14 +124,15 @@ makeMeasure = function(id, minimize, properties = character(0L),
#' Get the default measure for a task type, task, task description or a learner.
#' Currently these are:
#' \tabular{ll}{
#' oneclass \tab f1 \cr
#' classif \tab mmce\cr
#' regr \tab mse\cr
#' cluster \tab db\cr
#' surv \tab cindex\cr
#' costsens \tab mcp\cr
#' multilabel \tab multilabel.hamloss\cr
#' }
#'
#' Note: the default measure f1 for oneclass is an ad hoc solution and not an optimal measure for unsupervised learning.
#' @param x [\code{character(1)} | \code{\link{Task}} | \code{\link{TaskDesc}} | \code{\link{Learner}}]\cr
#' Task type, task, task description, learner name, a learner, or a type of learner (e.g. "classif").
#' @return [\code{\link{Measure}}].
Expand All @@ -147,6 +149,7 @@ getDefaultMeasure = function(x) {
else
x
switch(type,
oneclass = f1,
classif = mmce,
cluster = db,
regr = mse,
Expand Down
124 changes: 124 additions & 0 deletions R/OneClassTask.R
@@ -0,0 +1,124 @@
#' @export
#' @rdname Task
makeOneClassTask = function(id = deparse(substitute(data)), data, target,
  weights = NULL, blocking = NULL, fixup.data = "warn", positive = NA_character_, negative = NA_character_,
  check.data = TRUE) {
  # Creates a one-class (anomaly detection) task on top of makeSupervisedTask().
  # The target column may contain one or two class levels; 'positive' is
  # reported as the normal class and 'negative' as the anomaly class.
  #
  # 'id' has to be checked first: its default deparse(substitute(data)) must be
  # evaluated before 'data' is modified further below.
  assertString(id)

  # positive / negative need to be strings; numbers are converted to strings
  assert(
    checkString(positive, na.ok = TRUE),
    checkNumber(positive, na.ok = TRUE)
  )
  if (isScalarNumeric(positive))
    positive = as.character(positive)

  assert(
    checkString(negative, na.ok = TRUE),
    checkNumber(negative, na.ok = TRUE)
  )
  if (isScalarNumeric(negative))
    negative = as.character(negative)

  assertDataFrame(data)
  assertString(target) # that this is a valid colname will be checked later in makeSupervisedTask

  assertChoice(fixup.data, choices = c("no", "quiet", "warn"))
  assertFlag(check.data)

  both.given = !is.na(positive) && !is.na(negative)

  # the same label for both classes would lead to duplicated class levels in
  # the task description, so reject it right away
  if (both.given && positive == negative)
    stopf("'positive' and 'negative' must be different, but both are '%s'", positive)

  if (fixup.data != "no") {
    x = data[[target]]
    if (is.character(x) || is.logical(x) || is.integer(x)) {
      data[[target]] = as.factor(x)
    }
    # we probably dont want to autodrop empty target levels here (as in classif), as the anomaly class could be empty
  }

  # check that class column is factor and has max 2 class levels
  if (check.data) {
    assertFactor(data[[target]], any.missing = FALSE, empty.levels.ok = TRUE, max.levels = 2L, .var.name = target)
  }

  # check if positive and negative are elements of the class levels
  levs = levels(data[[target]])

  if (length(levs) == 2L) {
    if (both.given && !setequal(c(positive, negative), levs))
      stopf("'positive' or 'negative' not equal to class levels")
    if (!is.na(positive) && positive %nin% levs)
      stopf("'positive' not element of the two class levels")
    if (!is.na(negative) && negative %nin% levs)
      stopf("'negative' not element of the two class levels")
  } else if (length(levs) == 1L) {
    # with a single observed level at least one of the two labels has to match it
    if (both.given && sum(c(positive, negative) %in% levs) == 0)
      stopf("Neither 'positive' nor 'negative' are subset of class levels")
  }

  task = makeSupervisedTask("oneclass", data, target, weights, blocking,
    fixup.data = fixup.data, check.data = check.data)

  if (fixup.data != "no") {
    # add pos and neg as levels if they are missing
    if (length(levs) == 1L) {
      if (both.given) {
        levels(data[[target]]) = union(levs, c(positive, negative))
      } else {
        # at most one of the two labels is set here; it must name the class
        # that is NOT observed, otherwise the second level cannot be derived
        if (!is.na(positive)) {
          if (positive %nin% levs)
            levels(data[[target]]) = c(levs, positive)
          else
            stopf("Cannot add second class level when 'positive' is equal to the only class level and no 'negative' is specified!")
        }
        if (!is.na(negative)) {
          if (negative %nin% levs)
            levels(data[[target]]) = c(levs, negative)
          else
            stopf("Cannot add second class level when 'negative' is equal to the only class level and no 'positive' is specified!")
        }
      }
    }

    # store the data with the (possibly extended) target levels in the task
    task$env$data = data
  }

  task$task.desc = makeOneClassTaskDesc(id, data, target, weights, blocking, positive, negative)
  addClasses(task, "OneClassTask")
}

# Builds the task description of a one-class task.
#
# Starts from makeTaskDescInternal("oneclass", ...) and adds the fields
# 'class.levels', 'positive' and 'negative'. Labels passed as NA are derived
# from the factor levels of the target column:
#   - both NA: first level becomes positive, second level becomes negative
#   - one NA:  the missing label is the level(s) left after removing the given one
# The resulting pair must be set-equal to the target levels.
makeOneClassTaskDesc = function(id, data, target, weights, blocking, positive, negative) {
  td = makeTaskDescInternal("oneclass", id, data, target, weights, blocking)
  levs = levels(data[[target]])
  m = length(levs)
  if (is.na(positive) && is.na(negative)) {
    # both labels are taken from the levels, so two levels are required
    if (m < 2L)
      stopf("Cannot auto-set negative class when there are < 2 class levels!")
    positive = levs[1L]
    negative = levs[2L]
  } else if (is.na(positive)) {
    if (m < 2L && negative %in% levs)
      stopf("Cannot auto-set positive class when there are < 2 class levels and negative is the only class level!")
    positive = setdiff(levs, negative)
  } else if (is.na(negative)) {
    if (m < 2L && positive %in% levs)
      stopf("Cannot auto-set negative class when there are < 2 class levels and positive is the only class level!")
    negative = setdiff(levs, positive)
  }

  # class levels are always stored in the order (positive, negative)
  posneg = c(positive, negative)
  assertSetEqual(levs, posneg)
  td$class.levels = posneg
  td$positive = positive
  td$negative = negative
  return(addClasses(td, c("OneClassTaskDesc", "SupervisedTaskDesc")))
}

#' @export
print.OneClassTask = function(x, ...) {
  # Prints the generic supervised task summary followed by the number of
  # classes, the class frequency table and the positive / negative labels.

  # captured print-out of the target frequency table, without its first line
  di = printToChar(table(getTaskTargets(x)), collapse = NULL)[-1L]
  m = length(x$task.desc$class.levels)
  print.SupervisedTask(x)
  catf("Classes: %i", m)
  # the table text is data, not a format string: pass it through "%s" so that
  # class levels containing '%' cannot be misread as format specifications
  catf("%s", collapse(di, "\n"))
  catf("Positive/Normal class: %s", x$task.desc$positive)
  catf("Negative/Anomaly class: %s", x$task.desc$negative)
}

42 changes: 42 additions & 0 deletions R/Prediction.R
Expand Up @@ -177,6 +177,48 @@ makePrediction.ClusterTaskDesc = function(task.desc, row.names, id, truth, predi
return(p)
}

#' @export
makePrediction.OneClassTaskDesc = function(task.desc, row.names, id, truth, predict.type, predict.threshold = NULL, y, time, error = NA_character_, dump = NULL) {
  # Assembles a "PredictionOneClass" object from the raw learner output 'y'.
  # The data slot holds the columns id / truth / response or prob, whichever
  # are available. 'predict.threshold' and 'dump' are accepted but not used by
  # this method; the stored threshold is always NA_real_.
  data = namedList(c("id", "truth", "response", "prob"))
  data$id = id
  # truth can come from a simple "newdata" df. then there might not be all factor levels present
  if (!is.null(truth)) {
    levels(truth) = union(levels(truth), task.desc$class.levels)
    data$truth = truth
  }
  if (predict.type == "response") {
    data$response = y
    data = as.data.frame(filterNull(data))
  } else {
    data$prob = y
    data = as.data.frame(filterNull(data))

    # HACK need to create colnames with prob.TRUE for the normal class
    # otherwise getPredictionProbabilities() will throw an error
    # "Trying to get probabilities for nonexistant classes: %s", collapse(cl)
    # Columns whose name contains a column name of 'y' are renamed to
    # "prob.<colname>".
    # NOTE(review): the pattern is recycled against names(data); this looks
    # like it is only meant for a single-column 'y' - confirm.
    indices = stri_detect_fixed(names(data), colnames(y))

    if (sum(indices) > 0)
      names(data)[indices] = stri_paste("prob.", colnames(y))
  }

  # TODO: handling of predict.type "prob" (thresholding) is to be added in
  # branch h2o and branch convertScoretoProb
  makeS3Obj(c("PredictionOneClass", "Prediction"),
    predict.type = predict.type,
    data = setRowNames(data, row.names),
    threshold = NA_real_,
    task.desc = task.desc,
    time = time,
    error = error
  )
}


#' @export
makePrediction.CostSensTaskDesc = function(task.desc, row.names, id, truth, predict.type, predict.threshold = NULL, y, time, error = NA_character_, dump = NULL) {
data = namedList(c("id", "response"))
Expand Down
11 changes: 10 additions & 1 deletion R/RLearner.R
Expand Up @@ -67,7 +67,7 @@ makeRLearnerInternal = function(id, type, package, par.set, par.vals, properties
requirePackages(package, why = stri_paste("learner", id, sep = " "), default.method = "load")

assertString(id)
assertChoice(type, choices = c("classif", "regr", "multilabel", "surv", "cluster", "costsens"))
assertChoice(type, choices = c("oneclass", "classif", "regr", "multilabel", "surv", "cluster", "costsens"))
assertSubset(properties, listLearnerProperties(type))
assertClass(par.set, classes = "ParamSet")
checkListElementClass(par.set$pars, "LearnerParam")
Expand Down Expand Up @@ -165,3 +165,12 @@ makeRLearnerCostSens = function(cl, package, par.set, par.vals = list(), propert

return(lrn)
}

#' @export
#' @rdname RLearner
makeRLearnerOneClass = function(cl, package, par.set, par.vals = list(), properties = character(0L), name = cl, short.name = cl, note = "", callees = character(0L)) {
  # construct the generic learner object of type "oneclass" ...
  lrn = makeRLearnerInternal(cl, "oneclass", package, par.set, par.vals, properties, name, short.name, note, callees)
  # ... and tag it with the concrete learner class and the one-class base class
  lrn = addClasses(lrn, c(cl, "RLearnerOneClass"))
  return(lrn)
}
46 changes: 46 additions & 0 deletions R/RLearner_oneclass_svm.R
@@ -0,0 +1,46 @@
#' @export
makeRLearner.oneclass.svm = function() {
  # Learner definition for the one-class SVM of package e1071 (function svm).
  # 'type' is restricted to "one-classification" and is also set as a fixed
  # parameter value, so it is always passed on to the training call.
  makeRLearnerOneClass(
    cl = "oneclass.svm",
    package = "e1071",
    par.set = makeParamSet(
      makeDiscreteLearnerParam(id = "type", default = "one-classification", values = "one-classification"),
      makeNumericLearnerParam(id = "nu", default = 0.5, requires = quote(type == "nu-classification" || type == "one-classification" || type == "nu-regression")),
      makeDiscreteLearnerParam(id = "kernel", default = "radial", values = c("linear", "polynomial", "radial", "sigmoid")),
      makeIntegerLearnerParam(id = "degree", default = 3L, lower = 1L, requires = quote(kernel == "polynomial")),
      makeNumericLearnerParam(id = "coef0", default = 0, requires = quote(kernel == "polynomial" || kernel == "sigmoid")),
      makeNumericLearnerParam(id = "gamma", lower = 0, requires = quote(kernel != "linear")),
      makeNumericLearnerParam(id = "cachesize", default = 40L),
      makeNumericLearnerParam(id = "tolerance", default = 0.001, lower = 0),
      makeLogicalLearnerParam(id = "shrinking", default = TRUE),
      makeIntegerLearnerParam(id = "cross", default = 0L, lower = 0L, tunable = FALSE),
      makeLogicalLearnerParam(id = "fitted", default = TRUE, tunable = FALSE),
      makeLogicalVectorLearnerParam(id = "scale", default = TRUE, tunable = TRUE)
    ),
    par.vals = list(type = "one-classification"),
    # "weights" is deliberately not declared: the train method does not pass
    # observation weights on to e1071::svm, so advertising the property would
    # make weights be ignored silently
    properties = c("oneclass", "numerics", "factors"),
    name = "one-class Support Vector Machines (libsvm)",
    short.name = "one-class svm",
    callees = "svm"
  )
}

#' @export
trainLearner.oneclass.svm = function(.learner, .task, .subset, .weights = NULL, ...) {
  # Fits e1071::svm on the feature columns only; no response is passed
  # (y = NULL). Hyperparameters arrive through '...'. '.weights' is not used.
  feats = getTaskFeatureNames(.task)
  # drop = FALSE keeps a data.frame (with its column name) even when the task
  # has a single feature, instead of collapsing to a bare vector
  d = getTaskData(.task, .subset)[, feats, drop = FALSE]
  e1071::svm(d, y = NULL, ...)
}

#' @export
predictLearner.oneclass.svm = function(.learner, .model, .newdata, ...) {
  # Predicts with the fitted one-class SVM.
  # svm currently can't predict probabilities, only response
  p = predict(.model$learner.model, newdata = .newdata, ...)
  if (.learner$predict.type == "response") {
    # The model answers TRUE / FALSE per observation. Translate that into the
    # task's class labels: TRUE -> positive (normal), FALSE -> negative
    # (anomaly), so the response uses exactly the task's class levels in the
    # order (positive, negative).
    td = .model$task.desc
    p = factor(p, levels = c("TRUE", "FALSE"), labels = c(td$positive, td$negative))
  }
  return(p)
}


0 comments on commit d42bb15

Please sign in to comment.