diff --git a/DESCRIPTION b/DESCRIPTION index bfb08b9e11..79bc9139bf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,6 +19,7 @@ Authors@R: c( person("Giuseppe", "Casalicchio", email = "giuseppe.casalicchio@stat.uni-muenchen.de", role = "aut"), person("Mason", "Gallo", email = "masonagallo@gmail.com", role = "aut"), person("Jakob", "Bossek", email = "jakob.bossek@tu-dortmund.de", role = "ctb"), + person("Bronder", "Stephen", email = "sbronder@stevebronder.com", role = "ctb"), person("Erich", "Studerus", email = "erich.studerus@upkbs.ch", role = "ctb"), person("Leonard","Judt", email = "leonard.judt@tu-dortmund.de", role = "ctb"), person("Tobias", "Kuehn", email = "tobi.kuehn@gmx.de", role = "ctb"), @@ -44,13 +45,13 @@ Depends: Imports: BBmisc (>= 1.11), backports, - ggplot2, - stats, - stringi, checkmate (>= 1.8.2), data.table, + ggplot2, methods, parallelMap (>= 1.3), + stats, + stringi, survival, utils, XML @@ -87,6 +88,7 @@ Suggests: flare, fields, FNN, + forecast, fpc, frbs, FSelector, diff --git a/NAMESPACE b/NAMESPACE index f465d43202..2cea6ed1be 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -125,6 +125,7 @@ S3method(listMeasures,default) S3method(makePrediction,ClassifTaskDesc) S3method(makePrediction,ClusterTaskDesc) S3method(makePrediction,CostSensTaskDesc) +S3method(makePrediction,ForecastRegrTaskDesc) S3method(makePrediction,MultilabelTaskDesc) S3method(makePrediction,RegrTaskDesc) S3method(makePrediction,SurvTaskDesc) @@ -217,6 +218,7 @@ S3method(makeRLearner,cluster.cmeans) S3method(makeRLearner,cluster.dbscan) S3method(makeRLearner,cluster.kkmeans) S3method(makeRLearner,cluster.kmeans) +S3method(makeRLearner,fcregr.Arima) S3method(makeRLearner,multilabel.cforest) S3method(makeRLearner,multilabel.rFerns) S3method(makeRLearner,multilabel.randomForestSRC) @@ -415,6 +417,7 @@ S3method(predictLearner,cluster.cmeans) S3method(predictLearner,cluster.dbscan) S3method(predictLearner,cluster.kkmeans) S3method(predictLearner,cluster.kmeans) +S3method(predictLearner,fcregr.Arima) S3method(predictLearner,multilabel.cforest) S3method(predictLearner,multilabel.rFerns) S3method(predictLearner,multilabel.randomForestSRC) @@ -506,7 +509,10 @@ S3method(print,FeatureImportance) S3method(print,Filter) S3method(print,FilterMethodsList) S3method(print,FilterValues) +S3method(print,FixedCVDesc) +S3method(print,ForecastRegrTask) S3method(print,FunctionalANOVAData) +S3method(print,GrowingCVDesc) S3method(print,HoldoutDesc) S3method(print,HyperParsEffectData) S3method(print,ImputationDesc) @@ -667,6 +673,7 @@ S3method(trainLearner,cluster.cmeans) S3method(trainLearner,cluster.dbscan) S3method(trainLearner,cluster.kkmeans) S3method(trainLearner,cluster.kmeans) +S3method(trainLearner,fcregr.Arima) S3method(trainLearner,multilabel.cforest) S3method(trainLearner,multilabel.rFerns) S3method(trainLearner,multilabel.randomForestSRC) @@ -785,6 +792,7 @@ export(f1) export(fdr) export(featperc) export(filterFeatures) +export(fixedcv) export(fn) export(fnr) export(fp) @@ -860,6 +868,7 @@ export(getStackedBaseLearnerPredictions) export(getTaskClassLevels) export(getTaskCosts) export(getTaskData) +export(getTaskDates) export(getTaskDesc) export(getTaskDescription) export(getTaskFeatureNames) @@ -873,6 +882,7 @@ export(getTaskType) export(getTuneResult) export(gmean) export(gpr) +export(growingcv) export(hasLearnerProperties) export(hasMeasureProperties) export(hasProperties) @@ -927,6 +937,7 @@ export(makeFeatSelWrapper) export(makeFilter) export(makeFilterWrapper) export(makeFixedHoldoutInstance) 
+export(makeForecastRegrTask) export(makeImputeMethod) export(makeImputeWrapper) export(makeLearner) @@ -950,6 +961,7 @@ export(makeRLearner) export(makeRLearnerClassif) export(makeRLearnerCluster) export(makeRLearnerCostSens) +export(makeRLearnerForecastRegr) export(makeRLearnerMultilabel) export(makeRLearnerRegr) export(makeRLearnerSurv) @@ -975,6 +987,7 @@ export(makeUndersampleWrapper) export(makeWeightedClassesWrapper) export(makeWrappedModel) export(mape) +export(mase) export(mcc) export(mcp) export(meancosts) @@ -1001,6 +1014,7 @@ export(measureLSR) export(measureLogloss) export(measureMAE) export(measureMAPE) +export(measureMASE) export(measureMCC) export(measureMEDAE) export(measureMEDSE) diff --git a/R/FailureModel.R b/R/FailureModel.R index 791acc9f38..bb60f992bc 100644 --- a/R/FailureModel.R +++ b/R/FailureModel.R @@ -49,6 +49,11 @@ predictFailureModel = function(model, newdata) { rep(NA_real_, n) else matrix(NA_real_, nrow = n, ncol = 2L, dimnames = list(NULL, c("response", "se"))) + } else if (type == "fcregr") { + res = if (ptype == "response") + rep(NA_real_, n) + else + matrix(NA_real_, nrow = n, ncol = 2L, dimnames = list(NULL, c("response", "quantile"))) } else if (type == "surv") { if (ptype == "response") res = rep.int(NA_real_, n) diff --git a/R/ForecastRegrTask.R b/R/ForecastRegrTask.R new file mode 100644 index 0000000000..e86aa92ae0 --- /dev/null +++ b/R/ForecastRegrTask.R @@ -0,0 +1,74 @@ +#' @title Create a task for univariate forecasting +#' +#' @rdname Task +#' +#' @description Creates a task for univariate forecasting learners +#' +#' @export +makeForecastRegrTask = function(id = deparse(substitute(data)), data, target, + weights = NULL, blocking = NULL, frequency = 1L, date.col = "dates", fixup.data = "warn", + check.data = TRUE) { + assertString(id) + assertClass(data, "data.frame") + assertString(target) + assertString(date.col) + frequency = asCount(frequency) + assertChoice(fixup.data, choices = c("no", "quiet", "warn")) + assertFlag(check.data) + + # Need to check that dates + # 1. Exist + # 2. Are unique + # 3. Follow POSIXct convention + dates = data[, date.col, drop = FALSE] + if (check.data) { + assertNumeric(data[[target]], any.missing = FALSE, finite = TRUE, .var.name = target) + if (any(duplicated(dates))) + stop(catf("Multiple observations for %s. 
Dates must be unique.", dates[any(duplicated(dates)), ])) + if (!is.POSIXt(dates[, 1])) + stop(catf("Dates are of type %s, but must be in a POSIXt format", class(dates[, 1]))) + } + if (fixup.data != "no") { + if (is.integer(data[[target]])) + data[[target]] = as.double(data[[target]]) + if (is.unsorted(dates[, 1])) { + if (fixup.data == "warn") + warning("Dates and data will be sorted in ascending order") + date.order = order(dates) + data = data[date.order, , drop = FALSE] + dates = dates[date.order, , drop = FALSE] + } + } + # Remove the date column and add it as the rownames + data = data[, date.col != colnames(data), drop = FALSE] + + task = makeSupervisedTask("fcregr", data, target, weights, blocking, fixup.data = fixup.data, check.data = check.data) + task$task.desc = makeForecastRegrTaskDesc(id, data, target, weights, blocking, frequency, dates) + addClasses(task, c("ForecastRegrTask", "TimeTask")) +} + +makeForecastRegrTaskDesc = function(id, data, target, weights, blocking, frequency, dates) { + td = makeTaskDescInternal("fcregr", id, data, target, weights, blocking) + td$dates = dates + td$frequency = frequency + addClasses(td, c("ForecastRegrTaskDesc", "SupervisedTaskDesc")) +} + + +#' @export +print.ForecastRegrTask = function(x, print.weights = TRUE, ...) { + td = getTaskDesc(x) + catf("Task: %s", td$id) + catf("Type: %s", td$type) + catf("Target: %s", td$target) + catf("Observations: %i", td$size) + catf("Dates:\n Start: %s \n End: %s", td$dates[1], td$dates[length(td$dates)]) + catf("Frequency: %i", td$frequency) + catf("Features:") + catf(printToChar(td$n.feat, collapse = "\n")) + catf("Missings: %s", td$has.missings) + if (print.weights) + catf("Has weights: %s", td$has.weights) + catf("Has blocking: %s", td$has.blocking) +} + diff --git a/R/Measure.R b/R/Measure.R index c1cd930bad..497a73ca94 100644 --- a/R/Measure.R +++ b/R/Measure.R @@ -129,6 +129,7 @@ makeMeasure = function(id, minimize, properties = character(0L), #' surv \tab cindex\cr #' costsens \tab mcp\cr #' multilabel \tab multilabel.hamloss\cr +#' fcregr \tab mse #' } #' #' @param x [\code{character(1)} | \code{\link{Task}} | \code{\link{TaskDesc}} | \code{\link{Learner}}]\cr @@ -152,7 +153,8 @@ getDefaultMeasure = function(x) { regr = mse, surv = cindex, costsens = mcp, - multilabel = multilabel.hamloss + multilabel = multilabel.hamloss, + fcregr = mse ) } diff --git a/R/Prediction.R b/R/Prediction.R index da40350e64..49b9082b39 100644 --- a/R/Prediction.R +++ b/R/Prediction.R @@ -194,6 +194,57 @@ makePrediction.CostSensTaskDesc = function(task.desc, row.names, id, truth, pred ) } +#' @export +makePrediction.ForecastRegrTaskDesc = function(task.desc, row.names, id, truth, predict.type, predict.threshold = NULL, y, time, error = NA_character_, dump = NULL) { + data = namedList(c("id", "truth", "response", "se")) + if (any(inherits(y, "matrix"))) { + size.y = nrow(y) + } else { + size.y = length(y) + } + + # This will only happen when there is a task with no subset + # aka, we predict future values and have to get their times + if (length(truth) > size.y) { + row.dates = as.POSIXct(task.desc$dates) + diff.time = difftime(row.dates[2], row.dates[1], units = "auto") + start = row.dates[length(row.dates)] + diff.time + end = start + diff.time * size.y + row.dates = seq.POSIXt(start, end, by = diff.time) + data$id = NULL + data$truth = NULL + } else { + row.dates = row.names[seq_len(length(truth))] + if (inherits(y, "matrix")) { + y = y[seq_len(length(truth)), , drop = FALSE] + } else { + y = 
y[seq_len(length(truth))] + } + data$id = id + data$truth = truth + } + if (predict.type == "response") { + data$response = y + data = filterNull(data) + } else { + y = as.data.frame(y) + data$response = y[, 1L, drop = FALSE] + colnames(data$response) = NULL + data$se = y[, -1, drop = FALSE] + data = filterNull(data) + } + + makeS3Obj(c("PredictionForecastRegr", "Prediction"), + predict.type = predict.type, + data = setRowNames(as.data.frame(data, row.names = NULL), row.dates), + threshold = NA_real_, + task.desc = task.desc, + time = time, + error = error, + dump = dump + ) +} + #' @export print.Prediction = function(x, ...) { catf("Prediction: %i observations", nrow(x$data)) diff --git a/R/Prediction_operators.R b/R/Prediction_operators.R index 5775527b2f..da11e50043 100644 --- a/R/Prediction_operators.R +++ b/R/Prediction_operators.R @@ -106,6 +106,7 @@ getProbabilities = function(pred, cl) { #' cluster \tab integer\cr #' surv \tab numeric\cr #' multilabel \tab logical matrix, columns named with labels\cr +#' fcregr \tab numeric\cr #' } #' #' @template arg_pred diff --git a/R/RLearner.R b/R/RLearner.R index 5d4ee09b96..34469e98ef 100644 --- a/R/RLearner.R +++ b/R/RLearner.R @@ -67,7 +67,7 @@ makeRLearnerInternal = function(id, type, package, par.set, par.vals, properties requirePackages(package, why = stri_paste("learner", id, sep = " "), default.method = "load") assertString(id) - assertChoice(type, choices = c("classif", "regr", "multilabel", "surv", "cluster", "costsens")) + assertChoice(type, choices = c("classif", "regr", "multilabel", "surv", "cluster", "costsens", "fcregr")) assertSubset(properties, listLearnerProperties(type)) assertClass(par.set, classes = "ParamSet") checkListElementClass(par.set$pars, "LearnerParam") @@ -165,3 +165,13 @@ makeRLearnerCostSens = function(cl, package, par.set, par.vals = list(), propert return(lrn) } + + +#' @export +#' @rdname RLearner +makeRLearnerForecastRegr = function(cl, package, par.set, par.vals = list(), properties = character(0L), name = cl, short.name = cl, note = "", callees = character(0L)) { + addClasses( + makeRLearnerInternal(cl, "fcregr", package, par.set, par.vals, properties, name, short.name, note, callees), + c(cl, "RLearnerForecastRegr") + ) +} diff --git a/R/RLearner_fcregr_Arima.R b/R/RLearner_fcregr_Arima.R new file mode 100644 index 0000000000..f407bce7c8 --- /dev/null +++ b/R/RLearner_fcregr_Arima.R @@ -0,0 +1,86 @@ +#'@export +makeRLearner.fcregr.Arima = function() { + makeRLearnerForecastRegr( + cl = "fcregr.Arima", + package = "forecast", + par.set = makeParamSet( + makeIntegerVectorLearnerParam(id = "order", len = 3L, lower = 0L, upper = Inf, default = c(0L, 0L, 0L)), + makeIntegerVectorLearnerParam(id = "seasonal", len = 3L, lower = 0L, upper = Inf, default = c(0L, 0L, 0L)), + makeLogicalLearnerParam(id = "include.mean", default = TRUE), + makeLogicalLearnerParam(id = "include.drift", default = FALSE), + makeNumericLearnerParam(id = "lambda", default = NULL, special.vals = list(NULL), when = "both"), + makeLogicalLearnerParam(id = "biasadj", default = FALSE, when = "both"), + makeDiscreteLearnerParam(id = "method", values = c("CSS-ML", "ML", "CSS"), default = "CSS-ML"), + makeUntypedLearnerParam(id = "model", default = NULL), + # arima params + makeLogicalLearnerParam(id = "transform.pars", default = TRUE), + makeNumericVectorLearnerParam(id = "fixed", len = NA, default = NULL, special.vals = list(NULL)), + makeNumericVectorLearnerParam(id = "init", len = NA, default = NULL, special.vals = list(NULL)), + # No 
default + makeIntegerLearnerParam("n.cond", lower = 0L), + makeDiscreteLearnerParam("SSinit", values = c("Gardner1980", "Rossignol2011"), default = "Gardner1980", tunable = FALSE), + makeDiscreteLearnerParam("optim.method", default = "BFGS", values = c("Nelder-Mead", "BFGS", "CG", "L-BFGS-B", "SANN", "Brent"), tunable = TRUE), + makeUntypedLearnerParam("optim.controls", default = list(), tunable = FALSE), + makeNumericLearnerParam("kappa", lower = 1e6, upper = Inf, tunable = FALSE), + # prediction params + makeIntegerLearnerParam(id = "h", lower = 0L, upper = Inf, + # NOTE: object$arma[5] is the frequency of the data + default = expression(ifelse(object$arma[5] > 1L, 2L * object$arma[5], 10L)), + tunable = TRUE, + when = "predict"), + makeLogicalLearnerParam(id = "bootstrap", default = FALSE, when = "predict", tunable = FALSE), + makeNumericVectorLearnerParam(id = "level", len = NA, default = c(80, 95), when = "predict", tunable = FALSE), + makeLogicalLearnerParam(id = "fan", default = FALSE, when = "predict", tunable = FALSE), + makeIntegerLearnerParam(id = "npaths", default = 5000L, when = "predict"), + # simulate params + makeIntegerLearnerParam(id = "nsim", lower = 0L, default = expression(length(object$x)), when = "predict"), + makeIntegerLearnerParam(id = "seed", default = NULL, special.vals = list(NULL), tunable = FALSE, when = "predict"), + makeLogicalLearnerParam(id = "future", default = TRUE, when = "predict"), + keys = c("x", "object", "arma") + ), + properties = c("numerics", "quantile"), + name = "AutoRegressive Integrated Moving Average", + short.name = "Arima", + note = "All variables besides the target will be passed to the xreg argument.", + callees = c("Arima", "forecast.Arima") + ) +} + +#'@export +trainLearner.fcregr.Arima = function(.learner, .task, .subset, .weights = NULL, ...) { + data = getTaskData(.task, .subset, target.extra = TRUE) + td = getTaskDesc(.task) + data$target = ts(data$target, start = 1, frequency = td$frequency) + if (ncol(data$data) != 0) { + data$data = ts(data$data, start = 1, frequency = td$frequency) + forecast::Arima(y = data$target, xreg = data$data, ...) + } else { + forecast::Arima(y = data$target, ...) + } +} + +#'@export +predictLearner.fcregr.Arima = function(.learner, .model, .newdata, ...) { + se.fit = getLearnerPredictType(.learner) == "quantile" + model.td = getTaskDesc(.model) + mod = getLearnerModel(.model) + if (all(model.td$n.feat == 0)) { + p = forecast::forecast(mod, ...) + } else { + .newdata = ts(.newdata, start = 1, frequency = model.td$frequency) + p = forecast::forecast(mod, xreg = .newdata, ...) + } + if (!se.fit) { + p = as.numeric(p$mean) + } else { + p.mean = as.matrix(p$mean) + p.lower = p$lower + p.upper = p$upper + colnames(p.mean) = "point_forecast" + colnames(p.lower) = stri_paste("lower_", p$level) + colnames(p.upper) = stri_paste("upper_", p$level) + p = cbind(p.mean, p.lower, p.upper) + } + return(p) +} + diff --git a/R/ResampleDesc.R b/R/ResampleDesc.R index 1523342a4e..e56da012f9 100644 --- a/R/ResampleDesc.R +++ b/R/ResampleDesc.R @@ -32,7 +32,8 @@ #' @param method [\code{character(1)}]\cr #' \dQuote{CV} for cross-validation, \dQuote{LOO} for leave-one-out, \dQuote{RepCV} for #' repeated cross-validation, \dQuote{Bootstrap} for out-of-bag bootstrap, \dQuote{Subsample} for -#' subsampling, \dQuote{Holdout} for holdout. +#' subsampling, \dQuote{Holdout} for holdout, \dQuote{GrowingCV} for growing window, and +#' \dQuote{FixedCV} for fixed windowing. 
Note that \dQuote{GrowingCV} and \dQuote{FixedCV} are for forecasting. #' @param predict [\code{character(1)}]\cr #' What to predict during resampling: \dQuote{train}, \dQuote{test} or \dQuote{both} sets. #' Default is \dQuote{test}. @@ -45,8 +46,13 @@ #' \dQuote{Subsample} between 0 and 1. Default is 2 / 3.} #' \item{reps [\code{integer(1)}]}{Repeats for \dQuote{RepCV}. Here \code{iters = folds * reps}. #' Default is 10.} -#' \item{folds [\code{integer(1)]}}{Folds in the repeated CV for \code{RepCV}. +#' \item{folds [\code{integer(1)}]}{Folds in the repeated CV for \code{RepCV}. #' Here \code{iters = folds * reps}. Default is 10.} +#' \item{horizon [\code{integer(1)}]}{Number of observations to forecast for \code{GrowingCV} +#' and \code{FixedCV}. Default is 1.} +#' \item{initial.window [\code{numeric(1)}]}{Fraction of observations to start with +#' in \code{GrowingCV} and \code{FixedCV}. Default is 0.5.} +#' \item{skip [\code{numeric(1)}]}{Fraction of windows to skip in \code{GrowingCV} and \code{FixedCV}. Default is 0.} #' } #' @param stratify [\code{logical(1)}]\cr #' Should stratification be done for the target variable? @@ -76,12 +82,12 @@ #' # Holdout a.k.a. test sample estimation #' makeResampleDesc("Holdout") makeResampleDesc = function(method, predict = "test", ..., stratify = FALSE, stratify.cols = NULL) { - assertChoice(method, choices = c("Holdout", "CV", "LOO", "RepCV", "Subsample", "Bootstrap")) + assertChoice(method, choices = c("Holdout", "CV", "LOO", "RepCV", "Subsample", "Bootstrap", "GrowingCV", "FixedCV")) assertChoice(predict, choices = c("train", "test", "both")) assertFlag(stratify) if (stratify && method == "LOO") stop("Stratification cannot be done for LOO!") - if (stratify && ! is.null(stratify.cols)) + if (stratify && !is.null(stratify.cols)) stop("Arguments 'stratify' and 'stratify.cols' are mutually exclusive!") d = do.call(stri_paste("makeResampleDesc", method), list(...)) d$predict = predict @@ -141,6 +147,23 @@ makeResampleDescRepCV = function(reps = 10L, folds = 10L) { makeResampleDescInternal("repeated cross-validation", iters = folds * reps, folds = folds, reps = reps) } +makeResampleDescFixedCV = function(horizon = 1L, initial.window = .5, skip = 0) { + horizon = asInteger(horizon, lower = 1L, upper = Inf) + assertNumeric(initial.window, lower = 0, upper = 1) + assertNumeric(skip, lower = 0, upper = 1) + makeResampleDescInternal("Fixed", iters = NA_integer_, horizon = horizon, + initial.window = initial.window, skip = skip) +} + +makeResampleDescGrowingCV = function(horizon = 1L, initial.window = .5, skip = 0) { + horizon = asInteger(horizon, lower = 1L, upper = Inf) + assertNumeric(initial.window, lower = 0, upper = 1) + assertNumeric(skip, lower = 0, upper = 1) + makeResampleDescInternal("Growing", iters = NA_integer_, horizon = horizon, + initial.window = initial.window, skip = skip) +} + + ############################################################################################## #' @export @@ -167,6 +190,22 @@ print.RepCVDesc = function(x, ...) { catf("Stratification: %s", x$stratify) } +#' @export +print.GrowingCVDesc = function(x, ...) { + catf("Window description:\n %s: %.2f %% of observations in initial window and a horizon of %i.", + x$id, x$initial.window * 100, x$horizon) + catf("Predict: %s", x$predict) + catf("Stratification: %s", x$stratify) +} + +#' @export +print.FixedCVDesc = function(x, ...) 
{ + catf("Window description:\n %s: %.2f %% of observations in initial window and a horizon of %i.", + x$id, x$initial.window * 100, x$horizon) + catf("Predict: %s", x$predict) + catf("Stratification: %s", x$stratify) +} + ############################################################################################## # Resample Convenience Objects, like cv10 ############################################################################################## diff --git a/R/ResampleInstances.R b/R/ResampleInstances.R index c1b38672de..0f9be616d3 100644 --- a/R/ResampleInstances.R +++ b/R/ResampleInstances.R @@ -38,3 +38,59 @@ instantiateResampleInstance.RepCVDesc = function(desc, size, task = NULL) { g = as.factor(rep(seq_len(desc$reps), each = folds)) makeResampleInstanceInternal(desc, size, train.inds = train.inds, test.inds = test.inds, group = g) } + +instantiateResampleInstance.FixedCVDesc = function(desc, size, task = NULL) { + initial.window.abs = floor(desc$initial.window * size) + assertInt(initial.window.abs, lower = 1) + if (size - initial.window.abs < desc$horizon) + stop(catf("The initial window is %i observations while the data is %i observations. \n + There is not enough data left (%i observations) to create a test set for a %i size horizon", + initial.window.abs, size, initial.window.abs - size, desc$horizon)) + skip = floor(desc$skip * size) + stops = (seq(size))[initial.window.abs:(size - desc$horizon)] + starts = stops - initial.window.abs + 1 + train.inds = mapply(seq, starts, stops, SIMPLIFY = FALSE) + test.inds = mapply(seq, stops + 1, stops + desc$horizon, SIMPLIFY = FALSE) + + thin = function(x, skip = 0) { + n = length(x) + x[seq(1, n, by = skip)] + } + + if (skip > 0) { + train.inds = thin(train.inds, skip = skip + 1) + test.inds = thin(test.inds, skip = skip + 1) + } + if (length(test.inds) == 0) + stop("Skip is too large and has removed all resampling instances. Please lower the value of skip.") + desc$iters = length(test.inds) + makeResampleInstanceInternal(desc, size, train.inds = train.inds, test.inds = test.inds ) +} + +instantiateResampleInstance.GrowingCVDesc = function(desc, size, task = NULL) { + initial.window.abs = floor(desc$initial.window * size) + assertInt(initial.window.abs, lower = 1) + if (size - initial.window.abs < desc$horizon) + stop(catf("The initial window is %i observations while the data is %i observations. \n + There is not enough data left (%i observations) to create a test set for a %i size horizon", + initial.window.abs, size, initial.window.abs - size, desc$horizon)) + skip = floor(desc$skip * size) + stops = (seq(from = 1, to = size))[initial.window.abs:(size - desc$horizon)] + starts = rep(1, length(stops)) + train.inds = mapply(seq, starts, stops, SIMPLIFY = FALSE) + test.inds = mapply(seq, stops + 1, stops + desc$horizon, SIMPLIFY = FALSE) + + thin = function(x, skip = 0) { + n = length(x) + x[seq(1, n, by = skip)] + } + + if (skip > 0) { + train.inds = thin(train.inds, skip = skip + 1) + test.inds = thin(test.inds, skip = skip + 1) + } + if (length(test.inds) == 0) + stop("Skip is too large and has removed all resampling instances. 
Please lower the value of skip.") + desc$iters = length(test.inds) + makeResampleInstanceInternal(desc, size, train.inds = train.inds, test.inds = test.inds ) +} diff --git a/R/StackedLearner.R b/R/StackedLearner.R index 500c0a4a33..e49278e828 100644 --- a/R/StackedLearner.R +++ b/R/StackedLearner.R @@ -127,7 +127,6 @@ makeStackedLearner = function(base.learners, super.learner = NULL, predict.type stop("The original features can not be used for this method") if (!inherits(resampling, "CVDesc")) stop("Currently only CV is allowed for resampling!") - # lrn$predict.type is "response" by default change it using setPredictType lrn = makeBaseEnsemble( id = "stack", @@ -171,7 +170,7 @@ getStackedBaseLearnerPredictions = function(model, newdata = NULL) { bms = model$learner.model$base.models method = model$learner.model$method - if (is.null(newdata)) { + if (is.null(newdata) || ncol(newdata) == 0) { probs = model$learner.model$pred.train } else { # if (model == "stack.cv") warning("Crossvalidated predictions for new data is not possible for this method.") @@ -216,8 +215,7 @@ predictLearner.StackedLearner = function(.learner, .model, .newdata, ...) { # get task information (classif) td = .model$task.desc - type = ifelse(td$type == "regr", "regr", - ifelse(length(td$class.levels) == 2L, "classif", "multiclassif")) + type = checkStackSupport(td) # predict prob vectors with each base model if (.learner$method != "compress") { @@ -327,8 +325,7 @@ averageBaseLearners = function(learner, task) { # stacking where we predict the training set in-sample, then super-learn on that stackNoCV = function(learner, task) { td = getTaskDesc(task) - type = ifelse(td$type == "regr", "regr", - ifelse(length(td$class.levels) == 2L, "classif", "multiclassif")) + type = checkStackSupport(td) bls = learner$base.learners use.feat = learner$use.feat base.models = probs = vector("list", length(bls)) @@ -369,8 +366,7 @@ stackNoCV = function(learner, task) { # stacking where we crossval the training set with the base learners, then super-learn on that stackCV = function(learner, task) { td = getTaskDesc(task) - type = ifelse(td$type == "regr", "regr", - ifelse(length(td$class.levels) == 2L, "classif", "multiclassif")) + type = checkStackSupport(td) bls = learner$base.learners use.feat = learner$use.feat # cross-validate all base learners and get a prob vector for the whole dataset for each learner @@ -424,8 +420,7 @@ hillclimbBaseLearners = function(learner, task, replace = TRUE, init = 0, bagpro assertInt(bagtime, lower = 1) td = getTaskDesc(task) - type = ifelse(td$type == "regr", "regr", - ifelse(length(td$class.levels) == 2L, "classif", "multiclassif")) + type = checkStackSupport(td) if (is.null(metric)) { if (type == "regr") { metric = function(pred, true) mean((pred - true)^2) @@ -440,7 +435,7 @@ hillclimbBaseLearners = function(learner, task, replace = TRUE, init = 0, bagpro assertFunction(metric) bls = learner$base.learners - if (type != "regr") { + if (type != "regr" && type != "fcregr" && type != "mfcregr") { for (i in seq_along(bls)) { if (bls[[i]]$predict.type == "response") stop("Hill climbing algorithm only takes probability predict type for classification.") @@ -547,13 +542,12 @@ compressBaseLearners = function(learner, task, parset = list()) { pseudo.data = data.frame(pseudo.data, target = pseudo.target$data$response) td = ensemble.model$task.desc - type = ifelse(td$type == "regr", "regr", - ifelse(length(td$class.levels) == 2L, "classif", "multiclassif")) + type = checkStackSupport(td) if (type == "regr") { 
new.task = makeRegrTask(data = pseudo.data, target = "target") if (is.null(learner$super.learner)) { - m = makeLearner("regr.nnet", predict.type = ) + m = makeLearner("regr.nnet", predict.type = "response") } else { m = learner$super.learner } @@ -737,3 +731,20 @@ getPseudoData = function(.data, k = 3, prob = 0.1, s = NULL, ...) { # - DONE: super learner can also return predicted probabilites # - DONE: allow regression as well +# check the learner type to see if it is supported +checkStackSupport = function(td) { + if (td$type == "regr") { + type = "regr" + } else if (td$type == "fcregr") { + type = "fcregr" + } else if (td$type == "mfcregr") { + type = "mfcregr" + } else if (length(td$class.levels) == 2L) { + type = "classif" + } else if (length(td$class.levels) > 2L) { + type = "multiclassif" + } else { + stopf("Learners of type %s are not supported", td$type) + } + type +} diff --git a/R/Task.R b/R/Task.R index fda990bfb2..b1950ba225 100644 --- a/R/Task.R +++ b/R/Task.R @@ -70,10 +70,17 @@ #' Should sanity of data be checked initially at task creation? #' You should have good reasons to turn this off (one might be speed). #' Default is \code{TRUE}. +#' @param frequency [\code{integer(1)}]\cr +#' The seasonality of the data. A frequency of 7L for daily data means a weekly seasonality, +#' 52L is weekly data with a yearly seasonality, 365L is daily data with a yearly seasonality, etc. +#' Default is 1L for no seasonality. +#' @param date.col [\code{character(1)}]\cr +#' The column which contains the dates for your data. These dates should be unique, in a POSIXt format, and in ascending order. +#' If \code{check.data} is \code{TRUE}, the first two conditions will be checked. If \code{fixup.data} is not 'no', then unordered data will be sorted in ascending order. #' @return [\code{\link{Task}}]. #' @name Task #' @rdname Task -#' @aliases ClassifTask RegrTask SurvTask CostSensTask ClusterTask MultilabelTask +#' @aliases ClassifTask RegrTask SurvTask CostSensTask ClusterTask MultilabelTask ForecastRegrTask #' @examples #' if (requireNamespace("mlbench")) { #' library(mlbench) @@ -145,6 +152,9 @@ checkTaskData = function(data, cols = names(data)) { } else if (is.factor(x)) { if (hasEmptyLevels(x)) stopf("Column '%s' contains empty factor levels.", cn) + } else if (is.POSIXt(x)) { + if (any(duplicated(x))) + warningf("Column '%s' contains duplicated dates.", cn) } else { stopf("Unsupported feature type (%s) in column '%s'.", class(x)[1L], cn) } diff --git a/R/Task_operators.R b/R/Task_operators.R index 53147a3f24..bd37446993 100644 --- a/R/Task_operators.R +++ b/R/Task_operators.R @@ -451,7 +451,8 @@ changeData = function(task, data, costs, weights) { "cluster" = makeClusterTaskDesc(td$id, data, task$weights, task$blocking), "surv" = makeSurvTaskDesc(td$id, data, td$target, task$weights, task$blocking, td$censoring), "costsens" = makeCostSensTaskDesc(td$id, data, td$target, task$blocking, costs), - "multilabel" = makeMultilabelTaskDesc(td$id, data, td$target, td$weights, task$blocking) + "multilabel" = makeMultilabelTaskDesc(td$id, data, td$target, td$weights, task$blocking), + "fcregr" = makeForecastRegrTaskDesc(td$id, data, td$target, td$weights, td$blocking, td$frequency, td$dates) ) return(task) @@ -468,3 +469,14 @@ getTaskFactorLevels = function(task) { getTaskWeights = function(task) { task$weights } + +#' @title Get the dates of the task. +#' +#' @description Returns the dates from a task if they exist. 
+#' @template arg_task_or_desc +#' @return [\code{character(1)}] +#' @export +#' @family task +getTaskDates = function(x) { + getTaskDesc(x)$dates +} diff --git a/R/helpers.R b/R/helpers.R index 94c2bf43e4..9ef2a44773 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -60,7 +60,7 @@ propVectorToMatrix = function(p, levs) { #' @return [\code{character}]. #' @export listTaskTypes = function() { - c("classif", "regr", "surv", "costsens", "cluster", "multilabel") + c("classif", "regr", "surv", "costsens", "cluster", "multilabel", "fcregr") } # Maybe move to BBmisc at some point @@ -98,3 +98,6 @@ suppressWarning = function(expr, str) { hasEmptyLevels = function(x) { !all(levels(x) %chin% as.character(unique(x))) } + +# Check if something is POSIXt, added so we don't need to import lubridate +is.POSIXt = function(x) inherits(x, "POSIXt") diff --git a/R/measures.R b/R/measures.R index e7ce586ee8..6b83a6d43f 100644 --- a/R/measures.R +++ b/R/measures.R @@ -41,7 +41,7 @@ NULL #' @rdname measures #' @format none featperc = makeMeasure(id = "featperc", minimize = TRUE, best = 0, worst = 1, - properties = c("classif", "classif.multi", "multilabel", "regr", "surv", "costsens", "cluster", "req.model", "req.pred"), + properties = c("classif", "classif.multi", "multilabel", "regr", "fcregr", "surv", "costsens", "cluster", "req.model", "req.pred"), name = "Percentage of original features used for model", note = "Useful for feature selection.", fun = function(task, model, pred, feats, extra.args) { @@ -53,7 +53,7 @@ featperc = makeMeasure(id = "featperc", minimize = TRUE, best = 0, worst = 1, #' @rdname measures #' @format none timetrain = makeMeasure(id = "timetrain", minimize = TRUE, best = 0, worst = Inf, - properties = c("classif", "classif.multi", "multilabel", "regr", "surv", "costsens", "cluster", "req.model"), + properties = c("classif", "classif.multi", "multilabel", "regr", "fcregr", "surv", "costsens", "cluster", "req.model"), name = "Time of fitting the model", fun = function(task, model, pred, feats, extra.args) { model$time @@ -64,7 +64,7 @@ timetrain = makeMeasure(id = "timetrain", minimize = TRUE, best = 0, worst = Inf #' @rdname measures #' @format none timepredict = makeMeasure(id = "timepredict", minimize = TRUE, best = 0, worst = Inf, - properties = c("classif", "classif.multi", "multilabel", "regr", "surv", "costsens", "cluster", "req.pred"), + properties = c("classif", "classif.multi", "multilabel", "regr", "fcregr", "surv", "costsens", "cluster", "req.pred"), name = "Time of predicting test set", fun = function(task, model, pred, feats, extra.args) { pred$time @@ -75,7 +75,7 @@ timepredict = makeMeasure(id = "timepredict", minimize = TRUE, best = 0, worst = #' @rdname measures #' @format none timeboth = makeMeasure(id = "timeboth", minimize = TRUE, best = 0, worst = Inf, - properties = c("classif", "classif.multi", "multilabel", "regr", "surv", "costsens", "cluster", "req.model", "req.pred"), + properties = c("classif", "classif.multi", "multilabel", "regr", "fcregr", "surv", "costsens", "cluster", "req.model", "req.pred"), name = "timetrain + timepredict", fun = function(task, model, pred, feats, extra.args) { model$time + pred$time @@ -90,7 +90,7 @@ timeboth = makeMeasure(id = "timeboth", minimize = TRUE, best = 0, worst = Inf, #' @rdname measures #' @format none sse = makeMeasure(id = "sse", minimize = TRUE, best = 0, worst = Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Sum of squared errors", note = 
"Defined as: sum((response - truth)^2)", fun = function(task, model, pred, feats, extra.args) { @@ -98,6 +98,44 @@ sse = makeMeasure(id = "sse", minimize = TRUE, best = 0, worst = Inf, } ) +#' @export mase +#' @rdname measures +#' @usage none +#' @format none +mase = makeMeasure( + id = "mase", + minimize = TRUE, + name = "Mean Absolute Scaled Error", + properties = c("regr", "fcregr", "req.pred", "req.truth", "req.task"), + best = 0, + worst = Inf, + fun = function(task, model, pred, feats, extra.args){ + truth = getPredictionTruth(pred) + response = getPredictionResponse(pred) + target = getTaskTargets(task) + frequency = getTaskDesc(task)$frequency + measureMASE(truth, response, target, frequency) + } +) + +#' @export measureMASE +#' @rdname measures +#' @param target [character(1)] +#' The target variable from the training data +#' @param frequency +#' The seasonality of the data +#' @format none +measureMASE = function(truth, response, target, frequency) { + error = truth - response + if (is.null(frequency)) { + naive.forecast = diff(target) + } else { + naive.forecast = diff(target, lag = frequency) + } + scale = mean(abs(naive.forecast)) + mean(abs(error / scale)) +} + #' @export measureSSE #' @rdname measures #' @format none @@ -109,7 +147,7 @@ measureSSE = function(truth, response) { #' @rdname measures #' @format none mse = makeMeasure(id = "mse", minimize = TRUE, best = 0, worst = Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Mean of squared errors", note = "Defined as: mean((response - truth)^2)", fun = function(task, model, pred, feats, extra.args) { @@ -129,7 +167,7 @@ measureMSE = function(truth, response) { #' @rdname measures #' @format none rmse = makeMeasure(id = "rmse", minimize = TRUE, best = 0, worst = Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Root mean squared error", note = "The RMSE is aggregated as sqrt(mean(rmse.vals.on.test.sets^2)). 
If you don't want that, you could also use `test.mean`.", fun = function(task, model, pred, feats, extra.args) { @@ -149,7 +187,7 @@ measureRMSE = function(truth, response) { #' @rdname measures #' @format none medse = makeMeasure(id = "medse", minimize = TRUE, best = 0, worst = Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Median of squared errors", note = "Defined as: median((response - truth)^2).", fun = function(task, model, pred, feats, extra.args) { @@ -168,7 +206,7 @@ measureMEDSE = function(truth, response) { #' @rdname measures #' @format none sae = makeMeasure(id = "sae", minimize = TRUE, best = 0, worst = Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Sum of absolute errors", note = "Defined as: sum(abs(response - truth))", fun = function(task, model, pred, feats, extra.args) { @@ -187,7 +225,7 @@ measureSAE = function(truth, response) { #' @rdname measures #' @format none mae = makeMeasure(id = "mae", minimize = TRUE, best = 0, worst = Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Mean of absolute errors", note = "Defined as: mean(abs(response - truth))", fun = function(task, model, pred, feats, extra.args) { @@ -206,7 +244,7 @@ measureMAE = function(truth, response) { #' @rdname measures #' @format none medae = makeMeasure(id = "medae", minimize = TRUE, best = 0, worst = Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Median of absolute errors", note = "Defined as: median(abs(response - truth)).", fun = function(task, model, pred, feats, extra.args) { @@ -225,7 +263,7 @@ measureMEDAE = function(truth, response) { #' @rdname measures #' @format none rsq = makeMeasure(id = "rsq", minimize = FALSE, best = 1, worst = -Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Coefficient of determination", note = "Also called R-squared, which is 1 - residual_sum_of_squares / total_sum_of_squares.", fun = function(task, model, pred, feats, extra.args) { @@ -239,7 +277,7 @@ rsq = makeMeasure(id = "rsq", minimize = FALSE, best = 1, worst = -Inf, measureRSQ = function(truth, response) { rss = measureSSE(truth, response) ess = sum((truth - mean(truth))^2L) - if (ess == 0){ + if (ess == 0) { warning("Measure is undefined if all truth values are equal.") return(NA_real_) } @@ -250,7 +288,7 @@ measureRSQ = function(truth, response) { #' @rdname measures #' @format none expvar = makeMeasure(id = "expvar", minimize = FALSE, best = 1, worst = 0, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Explained variance", note = "Similar to measure rsq (R-squared). 
Defined as explained_sum_of_squares / total_sum_of_squares.", fun = function(task, model, pred, feats, extra.args) { @@ -264,7 +302,7 @@ expvar = makeMeasure(id = "expvar", minimize = FALSE, best = 1, worst = 0, measureEXPVAR = function(truth, response) { regss = sum((response - mean(truth))^2L) ess = sum((truth - mean(truth))^2L) - if (ess == 0){ + if (ess == 0) { warning("Measure is undefined if all truth values are equal.") return(NA_real_) } @@ -275,13 +313,13 @@ measureEXPVAR = function(truth, response) { #' @rdname measures #' @format none arsq = makeMeasure(id = "adjrsq", minimize = FALSE, best = 1, worst = 0, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Adjusted coefficient of determination", note = "Defined as: 1 - (1 - rsq) * (p / (n - p - 1L)). Adjusted R-squared is only defined for normal linear regression.", fun = function(task, model, pred, feats, extra.args) { n = length(pred$data$truth) p = length(model$features) - if (n == p + 1){ + if (n == p + 1) { warning("Adjusted R-squared is undefined if the number observations is equal to the number of independent variables plus one.") return(NA_real_) } @@ -293,7 +331,7 @@ arsq = makeMeasure(id = "adjrsq", minimize = FALSE, best = 1, worst = 0, #' @rdname measures #' @format none rrse = makeMeasure(id = "rrse", minimize = TRUE, best = 0, worst = Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Root relative squared error", note = "Defined as sqrt (sum_of_squared_errors / total_sum_of_squares). Undefined for single instances and when every truth value is identical. In this case the output will be NA.", fun = function(task, model, pred, feats, extra.args) { @@ -317,7 +355,7 @@ measureRRSE = function(truth, response){ #' @rdname measures #' @format none rae = makeMeasure(id = "rae", minimize = TRUE, best = 0, worst = Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Relative absolute error", note = "Defined as sum_of_absolute_errors / mean_absolute_deviation. Undefined for single instances and when every truth value is identical. In this case the output will be NA.", fun = function(task, model, pred, feats, extra.args) { @@ -341,7 +379,7 @@ measureRAE = function(truth, response){ #' @rdname measures #' @format none mape = makeMeasure(id = "mape", minimize = TRUE, best = 0, worst = Inf, - properties = c("regr", "req.pred", "req.truth"), + properties = c("regr", "fcregr", "req.pred", "req.truth"), name = "Mean absolute percentage error", note = "Defined as the abs(truth_i - response_i) / truth_i. Won't work if any truth value is equal to zero. 
In this case the output will be NA.", fun = function(task, model, pred, feats, extra.args) { @@ -352,8 +390,8 @@ mape = makeMeasure(id = "mape", minimize = TRUE, best = 0, worst = Inf, #' @export measureMAPE #' @rdname measures #' @format none -measureMAPE = function(truth, response){ - if (any(truth == 0)){ +measureMAPE = function(truth, response) { + if (any(truth == 0)) { warning("Measure is undefined if any truth value is equal to 0.") return(NA_real_) } @@ -624,7 +662,7 @@ logloss = makeMeasure(id = "logloss", minimize = TRUE, best = 0, worst = Inf, #' @export measureLogloss #' @rdname measures #' @format none -measureLogloss = function(probabilities, truth){ +measureLogloss = function(probabilities, truth) { eps = 1e-15 #let's confine the predicted probabilities to [eps,1 - eps], so logLoss doesn't reach infinity under any circumstance probabilities[probabilities > 1 - eps] = 1 - eps @@ -650,7 +688,7 @@ ssr = makeMeasure(id = "ssr", minimize = FALSE, best = 1, worst = 0, #' @export measureSSR #' @rdname measures #' @format none -measureSSR = function(probabilities, truth){ +measureSSR = function(probabilities, truth) { truth = match(as.character(truth), colnames(probabilities)) p = getRowEls(probabilities, truth) mean(p / sqrt(rowSums(probabilities^2))) @@ -673,7 +711,7 @@ qsr = makeMeasure(id = "qsr", minimize = FALSE, best = 1, worst = -1, #' @export measureQSR #' @rdname measures #' @format none -measureQSR = function(probabilities, truth){ +measureQSR = function(probabilities, truth) { #We add this line because binary tasks only output one probability column if (is.null(dim(probabilities))) probabilities = cbind(probabilities, 1 - probabilities) truth = factor(truth, levels = colnames(probabilities)) diff --git a/R/predict.R b/R/predict.R index bc352fbbbb..1423a2bd22 100644 --- a/R/predict.R +++ b/R/predict.R @@ -51,11 +51,11 @@ predict.WrappedModel = function(object, task, newdata, subset = NULL, ...) { assertClass(task, classes = "Task") size = getTaskSize(task) } else { - assertDataFrame(newdata, min.rows = 1L) if (class(newdata)[1] != "data.frame") { - warningf("Provided data for prediction is not a pure data.frame but from class %s, hence it will be converted.", class(newdata)[1]) - newdata = as.data.frame(newdata) - } + warningf("Provided data for prediction is not a pure data.frame but from class %s, hence it will be converted.", class(newdata)[1]) + newdata = as.data.frame(newdata) + } + assertDataFrame(newdata, min.rows = 1L) size = nrow(newdata) } subset = checkTaskSubset(subset, size) diff --git a/R/predictLearner.R b/R/predictLearner.R index 3d2d9a9a73..9c243d5d2d 100644 --- a/R/predictLearner.R +++ b/R/predictLearner.R @@ -41,7 +41,8 @@ predictLearner = function(.learner, .model, .newdata, ...) 
{ if (inherits(lmod, "NoFeaturesModel")) { predictNofeatures(.model, .newdata) } else { - assertDataFrame(.newdata, min.rows = 1L, min.cols = 1L) + if (.learner$type != "fcregr") + assertDataFrame(.newdata, min.rows = 1L, min.cols = 1L) UseMethod("predictLearner") } } @@ -96,6 +97,15 @@ checkPredictLearnerOutput = function(learner, model, p) { if (ncol(p) != 2L) stopf("predictLearner for %s has not returned a numeric matrix with 2 columns!", learner$id) } + } else if (learner$type == "fcregr") { + if (learner$predict.type == "response" && cl != "numeric" && cl != "ts") { + stopf("predictLearner for %s has returned a class %s instead of a numeric!", learner$id, cl) + } else if (learner$predict.type == "quantile") { + if (!is.matrix(p)) + stopf("predictLearner for %s has returned a class %s instead of a matrix!", learner$id, cl) + if (ncol(p) < 2L) + stopf("predictLearner for %s has not returned a numeric matrix with at least 2 columns!", learner$id) + } } else if (learner$type == "surv") { if (learner$predict.type == "prob") stop("Survival does not support prediction of probabilites yet.") diff --git a/R/resample.R b/R/resample.R index 74dc48167d..61070efdbc 100644 --- a/R/resample.R +++ b/R/resample.R @@ -29,6 +29,12 @@ #' See \code{\link{ResampleDesc}}. #' @param stratify [\code{logical(1)}]\cr #' See \code{\link{ResampleDesc}}. +#' @param horizon [\code{integer(1)}]\cr +#' See \code{\link{ResampleDesc}}. +#' @param initial.window [\code{numeric(1)}]\cr +#' See \code{\link{ResampleDesc}}. +#' @param skip [\code{numeric(1)}]\cr +#' See \code{\link{ResampleDesc}}. #' @template arg_measures #' @param weights [\code{numeric}]\cr #' Optional, non-negative case weight vector to be used during fitting. diff --git a/R/resample_convenience.R b/R/resample_convenience.R index 546997e013..8c7870b43d 100644 --- a/R/resample_convenience.R +++ b/R/resample_convenience.R @@ -66,3 +66,24 @@ bootstrapB632plus = function(learner, task, iters = 30, stratify = FALSE, measur measures = checkMeasures(measures, task, aggr = b632plus) resample(learner, task, rdesc, measures = measures, models = models, keep.pred = keep.pred, show.info = show.info) } + +#' @rdname resample +#' @export +growingcv = function(learner, task, horizon = 1L, initial.window = .5, skip = 0, measures, models = FALSE, keep.pred = TRUE, ..., show.info = getMlrOption("show.info")) { + learner = checkLearner(learner, ...) 
+ assertClass(task, classes = "ForecastRegrTask") + rdesc = makeResampleDesc("GrowingCV", horizon = horizon, initial.window = initial.window, skip = skip) + measures = checkMeasures(measures, task) + resample(learner, task, rdesc, measures = measures, models = models, keep.pred = keep.pred, show.info = show.info) +} + +#' @rdname resample +#' @export +fixedcv = function(learner, task, horizon = 1L, initial.window = .5, skip = 0, measures, models = FALSE, keep.pred = TRUE, ..., show.info = getMlrOption("show.info")) { + learner = checkLearner(learner, ...) + assertClass(task, classes = "ForecastRegrTask") + rdesc = makeResampleDesc("FixedCV", horizon = horizon, initial.window = initial.window, skip = skip) + measures = checkMeasures(measures, task) + resample(learner, task, rdesc, measures = measures, models = models, keep.pred = keep.pred, show.info = show.info) +} + diff --git a/R/setPredictType.R b/R/setPredictType.R index e6994ba48b..d52114621f 100644 --- a/R/setPredictType.R +++ b/R/setPredictType.R @@ -34,12 +34,15 @@ setPredictType.Learner = function(learner, predict.type) { regr = c("response", "se"), surv = c("response", "prob"), costsens = "response", - cluster = c("response", "prob") + cluster = c("response", "prob"), + fcregr = c("response", "quantile") )) if (predict.type == "prob" && !hasLearnerProperties(learner, "prob")) stopf("Trying to predict probs, but %s does not support that!", learner$id) if (predict.type == "se" && !hasLearnerProperties(learner, "se")) stopf("Trying to predict standard errors, but %s does not support that!", learner$id) + if (predict.type == "quantile" && !hasLearnerProperties(learner, "quantile")) + stopf("Trying to predict quantiles, but %s does not support that!", learner$id) learner$predict.type = predict.type return(learner) } diff --git a/R/train.R b/R/train.R index 11f1400107..8a6a74d484 100644 --- a/R/train.R +++ b/R/train.R @@ -64,8 +64,9 @@ train = function(learner, task, subset, weights = NULL) { vars = getTaskFeatureNames(task) # no vars? then use no vars model - if (length(vars) == 0L) { - learner.model = makeNoFeaturesModel(targets = task$env$data[subset, tn], task.desc = getTaskDesc(task)) + # NOTE: Most forecasting tasks are univariate (only using y), so this check does not apply here + if (length(vars) == 0L && getLearnerType(learner) != "fcregr") { + learner.model = makeNoFeaturesModel(targets = getTaskData(task)[subset, tn], task.desc = getTaskDesc(task)) time.train = 0 } else { opts = getLearnerOptions(learner, c("show.learner.output", "on.learner.error", "on.learner.warning", "on.error.dump")) diff --git a/R/zzz.R b/R/zzz.R index 4baa250608..b47cfe67d2 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -26,9 +26,10 @@ mlr$learner.properties = list( regr = c("numerics", "factors", "ordered", "missings", "weights", "se", "featimp", "oobpreds"), cluster = c("numerics", "factors", "ordered", "missings", "weights", "prob"), surv = c("numerics", "factors", "ordered", "missings", "weights", "prob", "lcens", "rcens", "icens", "featimp", "oobpreds"), - costsens = c("numerics", "factors", "ordered", "missings", "weights", "prob", "twoclass", "multiclass") + costsens = c("numerics", "factors", "ordered", "missings", "weights", "prob", "twoclass", "multiclass"), + fcregr = c("numerics", "quantile", "weights") ) mlr$learner.properties$any = unique(unlist(mlr$learner.properties)) ### Measure properties -mlr$measure.properties = c("classif", "classif.multi", "multilabel", "regr", "surv", "cluster", "costsens", "req.pred", "req.truth", "req.task", "req.feats", "req.model", "req.prob") +mlr$measure.properties = c("classif", "classif.multi", "multilabel", "regr", "surv", "cluster", "costsens", "fcregr", "req.pred", "req.truth", "req.task", "req.feats", "req.model", "req.prob") diff --git a/man/RLearner.Rd b/man/RLearner.Rd index 972a433f43..88f848c22e 100644 --- a/man/RLearner.Rd +++ b/man/RLearner.Rd @@ -14,6 +14,7 @@ \alias{makeRLearnerSurv} \alias{makeRLearnerCluster} \alias{makeRLearnerCostSens} +\alias{makeRLearnerForecastRegr} \title{Internal construction / wrapping of learner object.} 
\usage{ makeRLearner() @@ -41,6 +42,10 @@ makeRLearnerCluster(cl, package, par.set, par.vals = list(), makeRLearnerCostSens(cl, package, par.set, par.vals = list(), properties = character(0L), name = cl, short.name = cl, note = "", callees = character(0L)) + +makeRLearnerForecastRegr(cl, package, par.set, par.vals = list(), + properties = character(0L), name = cl, short.name = cl, note = "", + callees = character(0L)) } \arguments{ \item{cl}{[\code{character(1)}]\cr diff --git a/man/Task.Rd b/man/Task.Rd index 11d3b9edec..c4d7f44e95 100644 --- a/man/Task.Rd +++ b/man/Task.Rd @@ -1,10 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ClassifTask.R, R/ClusterTask.R, -% R/CostSensTask.R, R/MultilabelTask.R, R/RegrTask.R, R/SurvTask.R, R/Task.R +% R/CostSensTask.R, R/ForecastRegrTask.R, R/MultilabelTask.R, R/RegrTask.R, +% R/SurvTask.R, R/Task.R \name{makeClassifTask} \alias{makeClassifTask} \alias{makeClusterTask} \alias{makeCostSensTask} +\alias{makeForecastRegrTask} \alias{makeMultilabelTask} \alias{makeRegrTask} \alias{makeSurvTask} @@ -15,8 +17,8 @@ \alias{CostSensTask} \alias{ClusterTask} \alias{MultilabelTask} -\title{Create a classification, regression, survival, cluster, cost-sensitive classification or -multilabel task.} +\alias{ForecastRegrTask} +\title{Create a task for univariate forecasting} \usage{ makeClassifTask(id = deparse(substitute(data)), data, target, weights = NULL, blocking = NULL, positive = NA_character_, @@ -28,6 +30,10 @@ makeClusterTask(id = deparse(substitute(data)), data, weights = NULL, makeCostSensTask(id = deparse(substitute(data)), data, costs, blocking = NULL, fixup.data = "warn", check.data = TRUE) +makeForecastRegrTask(id = deparse(substitute(data)), data, target, + weights = NULL, blocking = NULL, frequency = 1L, date.col = "dates", + fixup.data = "warn", check.data = TRUE) + makeMultilabelTask(id = deparse(substitute(data)), data, target, weights = NULL, blocking = NULL, positive = NA_character_, fixup.data = "warn", check.data = TRUE) @@ -93,6 +99,15 @@ The columns correspond to classes and their names are the class labels Each entry (i,j) of the matrix specifies the cost of predicting class j for observation i.} +\item{frequency}{[\code{Integer(1)}]\cr +The seasonality of the data. A frequency of 7L for daily data means a weekly seasonality, +52L is weekly data with a yearly seasonality, 365L is daily data with a yearly seasonality, etc. +Default is 1L for no seasonality.} + +\item{date.col}{[\code{character(1)}]\cr +The column which contains the dates for your data. These dates should be unique, in a POSIXt format, and in ascending order. +If \code{check.data} is TRUE, the first two conditions will be checked. If \code{fixup.data} is not 'no', then unorded data will be placed in ascending order.} + \item{censoring}{[\code{character(1)}]\cr Censoring type. Allowed choices are \dQuote{rcens} for right censored data (default), \dQuote{lcens} for left censored and \dQuote{icens} for interval censored data using @@ -103,6 +118,8 @@ See \code{\link[survival]{Surv}} for details.} [\code{\link{Task}}]. } \description{ +Creates a task for univariate forecasting learners + The task encapsulates the data and specifies - through its subclasses - the type of the task. It also contains a description object detailing further aspects of the data. 
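The following is a minimal usage sketch, not part of the patch, showing how the pieces introduced above are intended to fit together: a univariate forecasting task, the fcregr.Arima learner, growing-window resampling, and the mase measure. The AirPassengers-based data frame, the column names "dates" and "target", and all parameter values are illustrative assumptions rather than anything taken from the patch itself.

library(mlr)
# Monthly series with a POSIXct date column, as expected by makeForecastRegrTask()
dat = data.frame(
  dates = seq(as.POSIXct("2005-01-01"), by = "month", length.out = 144),
  target = as.numeric(AirPassengers)
)
ts.task = makeForecastRegrTask(id = "ap", data = dat, target = "target",
  frequency = 12L, date.col = "dates")
# Arima learner added in this patch; h is the forecast horizon used at predict time
lrn = makeLearner("fcregr.Arima", order = c(2L, 1L, 1L), h = 12L)
mod = train(lrn, ts.task)
pred = predict(mod, ts.task)
# Growing-window resampling scored with the new MASE measure
rdesc = makeResampleDesc("GrowingCV", horizon = 12L, initial.window = 0.5)
res = resample(lrn, ts.task, rdesc, measures = mase)

Switching the learner to predict.type = "quantile" via setPredictType() would return the point forecast together with the lower/upper interval columns produced by forecast::forecast(), as handled by makePrediction.ForecastRegrTaskDesc above.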
diff --git a/man/getDefaultMeasure.Rd b/man/getDefaultMeasure.Rd index 441914d755..ea868cc264 100644 --- a/man/getDefaultMeasure.Rd +++ b/man/getDefaultMeasure.Rd @@ -23,5 +23,6 @@ Currently these are: surv \tab cindex\cr costsens \tab mcp\cr multilabel \tab multilabel.hamloss\cr + fcregr \tab mse } } diff --git a/man/getPredictionResponse.Rd b/man/getPredictionResponse.Rd index dccbce1f06..3086130ebc 100644 --- a/man/getPredictionResponse.Rd +++ b/man/getPredictionResponse.Rd @@ -28,6 +28,7 @@ The following types are returned, depending on task type: cluster \tab integer\cr surv \tab numeric\cr multilabel \tab logical matrix, columns named with labels\cr + fcregr \tab numeric\cr } } \seealso{ diff --git a/man/getTaskClassLevels.Rd b/man/getTaskClassLevels.Rd index ec34523015..d7b89060ed 100644 --- a/man/getTaskClassLevels.Rd +++ b/man/getTaskClassLevels.Rd @@ -19,7 +19,8 @@ actually return the same thing. } \seealso{ Other task: \code{\link{getTaskCosts}}, - \code{\link{getTaskData}}, \code{\link{getTaskDesc}}, + \code{\link{getTaskData}}, \code{\link{getTaskDates}}, + \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, diff --git a/man/getTaskCosts.Rd b/man/getTaskCosts.Rd index dbff0e12ab..00ec45def7 100644 --- a/man/getTaskCosts.Rd +++ b/man/getTaskCosts.Rd @@ -22,7 +22,8 @@ Returns \dQuote{NULL} if the task is not of type \dQuote{costsens}. } \seealso{ Other task: \code{\link{getTaskClassLevels}}, - \code{\link{getTaskData}}, \code{\link{getTaskDesc}}, + \code{\link{getTaskData}}, \code{\link{getTaskDates}}, + \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, diff --git a/man/getTaskData.Rd b/man/getTaskData.Rd index b6e1a4e5d4..cee9f7c6b8 100644 --- a/man/getTaskData.Rd +++ b/man/getTaskData.Rd @@ -61,7 +61,8 @@ head(getTaskData(task, subset = 1:100, recode.target = "01")) } \seealso{ Other task: \code{\link{getTaskClassLevels}}, - \code{\link{getTaskCosts}}, \code{\link{getTaskDesc}}, + \code{\link{getTaskCosts}}, \code{\link{getTaskDates}}, + \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, diff --git a/man/getTaskDates.Rd b/man/getTaskDates.Rd new file mode 100644 index 0000000000..afdb788168 --- /dev/null +++ b/man/getTaskDates.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Task_operators.R +\name{getTaskDates} +\alias{getTaskDates} +\title{Get the dates of the task.} +\usage{ +getTaskDates(x) +} +\arguments{ +\item{x}{[\code{\link{Task}} | \code{\link{TaskDesc}}]\cr +Task or its description object.} +} +\value{ +[\code{character(1)}] +} +\description{ +Returns the dates from a task if they exist. 
+} +\seealso{ +Other task: \code{\link{getTaskClassLevels}}, + \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, + \code{\link{getTaskDesc}}, + \code{\link{getTaskFeatureNames}}, + \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, + \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, + \code{\link{getTaskTargetNames}}, + \code{\link{getTaskTargets}}, \code{\link{getTaskType}}, + \code{\link{subsetTask}} +} diff --git a/man/getTaskDesc.Rd b/man/getTaskDesc.Rd index 4be01a15a8..4d20bb1738 100644 --- a/man/getTaskDesc.Rd +++ b/man/getTaskDesc.Rd @@ -19,6 +19,7 @@ See title. \seealso{ Other task: \code{\link{getTaskClassLevels}}, \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, + \code{\link{getTaskDates}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, diff --git a/man/getTaskFeatureNames.Rd b/man/getTaskFeatureNames.Rd index 9fe0321e3a..e8a72c8794 100644 --- a/man/getTaskFeatureNames.Rd +++ b/man/getTaskFeatureNames.Rd @@ -19,9 +19,9 @@ Target column name is not included. \seealso{ Other task: \code{\link{getTaskClassLevels}}, \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, - \code{\link{getTaskDesc}}, \code{\link{getTaskFormula}}, - \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, - \code{\link{getTaskSize}}, + \code{\link{getTaskDates}}, \code{\link{getTaskDesc}}, + \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, + \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, \code{\link{getTaskTargetNames}}, \code{\link{getTaskTargets}}, \code{\link{getTaskType}}, \code{\link{subsetTask}} diff --git a/man/getTaskFormula.Rd b/man/getTaskFormula.Rd index f0f0938e4f..6b769ff700 100644 --- a/man/getTaskFormula.Rd +++ b/man/getTaskFormula.Rd @@ -33,7 +33,7 @@ For multilabel it is \dQuote{ + ... + ~ .}. \seealso{ Other task: \code{\link{getTaskClassLevels}}, \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, - \code{\link{getTaskDesc}}, + \code{\link{getTaskDates}}, \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, diff --git a/man/getTaskId.Rd b/man/getTaskId.Rd index 632e92936e..1b782d2ce8 100644 --- a/man/getTaskId.Rd +++ b/man/getTaskId.Rd @@ -19,7 +19,7 @@ See title. \seealso{ Other task: \code{\link{getTaskClassLevels}}, \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, - \code{\link{getTaskDesc}}, + \code{\link{getTaskDates}}, \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, diff --git a/man/getTaskNFeats.Rd b/man/getTaskNFeats.Rd index c5181c4c7a..d607d910cc 100644 --- a/man/getTaskNFeats.Rd +++ b/man/getTaskNFeats.Rd @@ -19,7 +19,7 @@ See title. \seealso{ Other task: \code{\link{getTaskClassLevels}}, \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, - \code{\link{getTaskDesc}}, + \code{\link{getTaskDates}}, \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, \code{\link{getTaskSize}}, diff --git a/man/getTaskSize.Rd b/man/getTaskSize.Rd index d08ab88845..9dd61b92e8 100644 --- a/man/getTaskSize.Rd +++ b/man/getTaskSize.Rd @@ -19,7 +19,7 @@ See title. 
\seealso{ Other task: \code{\link{getTaskClassLevels}}, \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, - \code{\link{getTaskDesc}}, + \code{\link{getTaskDates}}, \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, diff --git a/man/getTaskTargetNames.Rd b/man/getTaskTargetNames.Rd index 4ba5a7b242..04e3c33465 100644 --- a/man/getTaskTargetNames.Rd +++ b/man/getTaskTargetNames.Rd @@ -20,7 +20,7 @@ actually return the same thing. \seealso{ Other task: \code{\link{getTaskClassLevels}}, \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, - \code{\link{getTaskDesc}}, + \code{\link{getTaskDates}}, \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, diff --git a/man/getTaskTargets.Rd b/man/getTaskTargets.Rd index e1fb9f5534..c8653e2a86 100644 --- a/man/getTaskTargets.Rd +++ b/man/getTaskTargets.Rd @@ -36,7 +36,7 @@ getTaskTargets(task) \seealso{ Other task: \code{\link{getTaskClassLevels}}, \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, - \code{\link{getTaskDesc}}, + \code{\link{getTaskDates}}, \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, diff --git a/man/getTaskType.Rd b/man/getTaskType.Rd index d3ed626659..919ad9adc2 100644 --- a/man/getTaskType.Rd +++ b/man/getTaskType.Rd @@ -19,7 +19,7 @@ See title. \seealso{ Other task: \code{\link{getTaskClassLevels}}, \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, - \code{\link{getTaskDesc}}, + \code{\link{getTaskDates}}, \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, diff --git a/man/makeResampleDesc.Rd b/man/makeResampleDesc.Rd index 97aa63b880..c99d45741a 100644 --- a/man/makeResampleDesc.Rd +++ b/man/makeResampleDesc.Rd @@ -17,7 +17,8 @@ makeResampleDesc(method, predict = "test", ..., stratify = FALSE, \item{method}{[\code{character(1)}]\cr \dQuote{CV} for cross-validation, \dQuote{LOO} for leave-one-out, \dQuote{RepCV} for repeated cross-validation, \dQuote{Bootstrap} for out-of-bag bootstrap, \dQuote{Subsample} for -subsampling, \dQuote{Holdout} for holdout.} +subsampling, \dQuote{Holdout} for holdout, \dQuote{GrowingCV} for growing-window cross-validation, and + \dQuote{FixedCV} for fixed-window cross-validation. Note that \dQuote{GrowingCV} and \dQuote{FixedCV} are intended for forecasting tasks.} \item{predict}{[\code{character(1)}]\cr What to predict during resampling: \dQuote{train}, \dQuote{test} or \dQuote{both} sets. @@ -32,8 +33,13 @@ Further parameters for strategies.\cr \dQuote{Subsample} between 0 and 1. Default is 2 / 3.} \item{reps [\code{integer(1)}]}{Repeats for \dQuote{RepCV}. Here \code{iters = folds * reps}. Default is 10.} -\item{folds [\code{integer(1)]}}{Folds in the repeated CV for \code{RepCV}. +\item{folds [\code{integer(1)}]}{Folds in the repeated CV for \code{RepCV}. Here \code{iters = folds * reps}. Default is 10.} +\item{horizon [\code{integer(1)}]}{Number of observations to forecast for \code{GrowingCV} + and \code{FixedCV}. Default is 1.} +\item{initial.window [\code{numeric(1)}]}{Fraction of observations to start with + in \code{GrowingCV} and \code{FixedCV}. Default is 0.5.} +\item{skip [\code{numeric(1)}]}{Fraction of windows to skip in \code{GrowingCV} and \code{FixedCV}.
Default is 0.} }} \item{stratify}{[\code{logical(1)}]\cr diff --git a/man/measures.Rd b/man/measures.Rd index 17ea9d1f05..19ec0336b8 100644 --- a/man/measures.Rd +++ b/man/measures.Rd @@ -8,6 +8,8 @@ \alias{timepredict} \alias{timeboth} \alias{sse} +\alias{mase} +\alias{measureMASE} \alias{measureSSE} \alias{mse} \alias{measureMSE} @@ -136,6 +138,10 @@ timeboth sse +mase + +measureMASE(truth, response, target, frequency) + measureSSE(truth, response) mse @@ -373,6 +379,11 @@ Vector of the true class.} \item{response}{[\code{factor}]\cr Vector of the predicted class.} +\item{target}{[\code{character(1)}]\cr +The name of the target variable from the training data.} + +\item{frequency}{[\code{integer(1)}]\cr The seasonality of the data.} + \item{probabilities}{[\code{numeric} | \code{matrix}]\cr a) For purely binary classification measures: The predicted probabilities for the positive class as a numeric vector. b) For multiclass classification measures: The predicted probabilities for all classes, always as a numeric matrix, where diff --git a/man/resample.Rd b/man/resample.Rd index 757eafda17..b2368c4365 100644 --- a/man/resample.Rd +++ b/man/resample.Rd @@ -9,6 +9,8 @@ \alias{bootstrapOOB} \alias{bootstrapB632} \alias{bootstrapB632plus} +\alias{growingcv} +\alias{fixedcv} \title{Fit models according to a resampling strategy.} \usage{ resample(learner, task, resampling, measures, weights = NULL, @@ -42,6 +44,14 @@ bootstrapB632(learner, task, iters = 30, stratify = FALSE, measures, bootstrapB632plus(learner, task, iters = 30, stratify = FALSE, measures, models = FALSE, keep.pred = TRUE, ..., show.info = getMlrOption("show.info")) + +growingcv(learner, task, horizon = 1, initial.window = 0.5, skip = 0, + measures, models = FALSE, keep.pred = TRUE, ..., + show.info = getMlrOption("show.info")) + +fixedcv(learner, task, horizon = 1L, initial.window = 0.5, skip = 0, + measures, models = FALSE, keep.pred = TRUE, ..., + show.info = getMlrOption("show.info")) } \arguments{ \item{learner}{[\code{\link{Learner}} | \code{character(1)}]\cr @@ -103,6 +113,15 @@ See \code{\link{ResampleDesc}}.} \item{split}{[\code{numeric(1)}]\cr See \code{\link{ResampleDesc}}.} + +\item{horizon}{[\code{integer(1)}]\cr +See \code{\link{ResampleDesc}}.} + +\item{initial.window}{[\code{numeric(1)}]\cr +See \code{\link{ResampleDesc}}.} + +\item{skip}{[\code{numeric(1)}]\cr +See \code{\link{ResampleDesc}}.} } \value{ [\code{\link{ResampleResult}}].
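Reviewer note (not part of the patch): a small sketch of how the new forecasting resampling strategies and the mase measure documented above could be used together, assuming the patch is applied. The learner name and its order/h parameters mirror the Arima tests further down; that mase picks up target and frequency from the task automatically is an assumption here, and the toy data is invented.

library(mlr)
set.seed(1)
dat = data.frame(dates = as.POSIXct("2016-01-01", tz = "UTC") + 0:99 * 86400,
  y = cumsum(rnorm(100)))
fc.task = makeForecastRegrTask(id = "toy.fc", data = dat, target = "y", date.col = "dates")
# h is the forecast horizon parameter of the fcregr.Arima learner added in this PR
lrn = makeLearner("fcregr.Arima", order = c(2, 0, 1), h = 1L)
# growing-window CV: train on an expanding window, always predict `horizon` steps ahead
rdesc = makeResampleDesc("GrowingCV", horizon = 1L, initial.window = 0.5, skip = 0)
r = resample(lrn, fc.task, rdesc, measures = mase)
# the exported wrapper does the same with a fixed-length rolling window
r2 = fixedcv(lrn, fc.task, horizon = 1L, initial.window = 0.5, skip = 0, measures = mase)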
diff --git a/man/subsetTask.Rd b/man/subsetTask.Rd index 327e078f4b..389dfcee0d 100644 --- a/man/subsetTask.Rd +++ b/man/subsetTask.Rd @@ -36,7 +36,7 @@ subsetTask(task, subset = 1:100) } \seealso{ Other task: \code{\link{getTaskClassLevels}}, \code{\link{getTaskCosts}}, \code{\link{getTaskData}}, - \code{\link{getTaskDesc}}, + \code{\link{getTaskDates}}, \code{\link{getTaskDesc}}, \code{\link{getTaskFeatureNames}}, \code{\link{getTaskFormula}}, \code{\link{getTaskId}}, \code{\link{getTaskNFeats}}, \code{\link{getTaskSize}}, diff --git a/tests/run-fcregr.R b/tests/run-fcregr.R new file mode 100644 index 0000000000..3ce16660a3 --- /dev/null +++ b/tests/run-fcregr.R @@ -0,0 +1,4 @@ +library(testthat) +if (identical(Sys.getenv("TRAVIS"), "true") || identical(Sys.getenv("R_EXPENSIVE_TEST_OK"), "true")) { + test_check("mlr", "_fcregr_") +} diff --git a/tests/testthat/helper_helpers.R b/tests/testthat/helper_helpers.R index 6bf3188643..d341745f35 100644 --- a/tests/testthat/helper_helpers.R +++ b/tests/testthat/helper_helpers.R @@ -51,6 +51,9 @@ testSimple = function(t.name, df, target, train.inds, old.predicts, parset = lis # FIXME this heuristic will backfire eventually if (length(target) == 0) task = makeClusterTask(data = df) + # This is almost guaranteed to break + else if (is.POSIXt(df$dates)) + task = makeForecastRegrTask(data = df, target = target, date.col = "dates") else if (is.numeric(df[, target])) task = makeRegrTask(data = df, target = target) else if (is.factor(df[, target])) @@ -61,24 +64,27 @@ testSimple = function(t.name, df, target, train.inds, old.predicts, parset = lis task = makeMultilabelTask(data = df, target = target) else stop("Should not happen!") + set.seed(getOption("mlr.debug.seed")) m = try(train(lrn, task, subset = inds)) - if (inherits(m, "FailureModel")){ + if (inherits(m, "FailureModel")) { expect_is(old.predicts, "try-error") } else { + set.seed(getOption("mlr.debug.seed")) cp = predict(m, newdata = test) # Multilabel has a special data structure if (class(task)[1] == "MultilabelTask") { rownames(cp$data) = NULL expect_equal(unname(cp$data[, substr(colnames(cp$data), 1, 8) == "response"]), unname(old.predicts)) - } else { - # to avoid issues with dropped levels in the class factor we only check the elements as chars - if (is.numeric(cp$data$response) && is.numeric(old.predicts)) + } else if (inherits(task, "MultiForecastRegrTask")) { + expect_equal(unname(as.matrix(cp$data[, substr(colnames(cp$data), 1, 8) == "response"]), force = TRUE), + unname(as.matrix(old.predicts))) + } else if (is.numeric(cp$data$response) && is.numeric(old.predicts)) { expect_equal(unname(cp$data$response), unname(old.predicts), tol = 1e-5) - else + } else { expect_equal(as.character(cp$data$response), as.character(old.predicts)) } - } + } } testSimpleParsets = function(t.name, df, target, train.inds, old.predicts.list, parset.list) { @@ -265,3 +272,58 @@ quiet = function(expr) { capture.output({ret = expr}) ret } + +testSimpleUpdate = function(t.name, target, train.df, update.df, + test.df, old.predicts, parset = list()) { + + lrn = do.call("makeLearner", c(list(t.name), parset)) + # FIXME this heuristic will backfire eventually + if (length(target) == 0) + task = makeClusterTask(data = train.df) + else if (is.POSIXt(train.df$dates)) + task = makeForecastRegrTask(data = train.df, target = target, date.col = "dates") + else if (is.numeric(train.df[, target])) + task = makeRegrTask(data = train.df, target = target) + else if (is.factor(train.df[, target])) + task = makeClassifTask(data = 
train.df, target = target) + else if (is.data.frame(train.df[, target]) && is.numeric(train.df[, target[1L]]) && is.logical(train.df[, target[2L]])) + task = makeSurvTask(data = train.df, target = target) + else if (is.data.frame(train.df[, target]) && is.logical(train.df[, target[1L]])) + task = makeMultilabelTask(data = train.df, target = target) + else + stop("Should not happen!") + set.seed(getOption("mlr.debug.seed")) + m = try(train(lrn, task)) + m = try(updateModel(m, task, update.df)) + if (inherits(m, "FailureModel")) { + expect_is(old.predicts, "try-error") + } else { + set.seed(getOption("mlr.debug.seed")) + cp = predict(m, newdata = test.df) + # Multilabel has a special data structure + if (class(task)[1] == "MultilabelTask") { + rownames(cp$data) = NULL + expect_equal(unname(cp$data[, substr(colnames(cp$data), 1, 8) == "response"]), unname(old.predicts)) + } else { + # to avoid issues with dropped levels in the class factor we only check the elements as chars + if (is.numeric(cp$data$response) && is.numeric(old.predicts)) + expect_equal(unname(cp$data$response), unname(old.predicts), tol = .2) + else + expect_equal(as.character(cp$data$response), as.character(old.predicts)) + } + } +} + +testSimpleParsetsUpdate = function(t.name, df, target, update.inds, train.inds, + test.inds, old.predicts.list, parset.list) { + + train.df = df[train.inds, ] + update.df = df[update.inds, ] + test.df = df[test.inds, ] + + for (i in seq_len(length(parset.list))) { + parset = parset.list[[i]] + old.predicts = old.predicts.list[[i]] + testSimpleUpdate(t.name, target, train.df, update.df, test.df, old.predicts, parset) + } +} diff --git a/tests/testthat/helper_learners_all.R b/tests/testthat/helper_learners_all.R index becb30f613..cab363309b 100644 --- a/tests/testthat/helper_learners_all.R +++ b/tests/testthat/helper_learners_all.R @@ -79,6 +79,14 @@ testBasicLearnerProperties = function(lrn, task, hyperpars, pred.type = "respons s = p$data$se expect_numeric(info = info, s, lower = 0, finite = TRUE, any.missing = FALSE, len = getTaskSize(task)) } + # check that quantiles work and are >= 0 + if (pred.type == "quantile") { + quantiles = p$data[, -1, drop = FALSE] + for (i in seq_len(ncol(quantiles))) { + expect_numeric(info = info, quantiles[, i], lower = 0, finite = TRUE, + any.missing = FALSE, len = getTaskSize(task)) + } + } # check that probs works, and are in [0,1] and sum to 1 if (pred.type == "prob") { diff --git a/tests/testthat/helper_objects.R b/tests/testthat/helper_objects.R index c2db493401..258e0e1ad3 100644 --- a/tests/testthat/helper_objects.R +++ b/tests/testthat/helper_objects.R @@ -108,6 +108,47 @@ costsens.feat = iris costsens.costs = matrix(runif(150L * 3L, min = 0, max = 1), 150L, 3L) costsens.task = makeCostSensTask("costsens", data = costsens.feat, costs = costsens.costs) +### forecasting +set.seed(getOption("mlr.debug.seed")) +fcregr.df = arima.sim(model = list(ar = c(.5, .2), ma = .4, order = c(2, 0, 1)), n = 300) +times = as.POSIXct("1992-01-14") + 1:300 * 86400 +fcregr.df = data.frame(test_data = fcregr.df, dates = times) +fcregr.target = "test_data" +fcregr.train.inds = seq_len(299) +fcregr.test.inds = setdiff(seq_len(nrow(fcregr.df)), fcregr.train.inds) +fcregr.train = fcregr.df[fcregr.train.inds, ] +fcregr.test = fcregr.df[fcregr.test.inds, ] +fcregr.task = makeForecastRegrTask("fcregrtask", data = fcregr.df, target = fcregr.target, date.col = "dates") + +fcregr.update.df = fcregr.df[1:111, ] +fcregr.update.target = "test_data" +fcregr.update.train.inds = 1:100
+fcregr.update.update.inds = 101:110 +fcregr.update.test.inds = 111 +fcregr.update.train = fcregr.update.df[fcregr.update.train.inds, ] +fcregr.update.update = fcregr.update.df[fcregr.update.update.inds, ] +fcregr.update.test = fcregr.update.df[fcregr.update.test.inds, ] +fcregr.update.task = makeForecastRegrTask("fcregrtask", data = fcregr.update.train, target = fcregr.update.target, date.col = "dates") + +fcregr.small.df = fcregr.df[1:10, , drop = FALSE] +fcregr.small.target = "test_data" +fcregr.small.train.inds = seq_len(9) +fcregr.small.test.inds = setdiff(seq_len(nrow(fcregr.small.df)), fcregr.small.train.inds) +fcregr.small.train = fcregr.small.df[fcregr.small.train.inds, ] +fcregr.small.test = fcregr.small.df[fcregr.small.test.inds, ] +fcregr.small.task = makeForecastRegrTask("fcregrtask", data = fcregr.small.df, target = fcregr.small.target, date.col = "dates") +# NOTE: using BBmisc::vlapply() to subset the numeric columns here was failing for some reason, so it is left commented out +fcregr.num.df = fcregr.df#[, BBmisc::vlapply(fcregr.df, is.numeric), drop = FALSE] +fcregr.num.target = fcregr.target +fcregr.num.train.inds = fcregr.train.inds +fcregr.num.test.inds = fcregr.test.inds +fcregr.num.train = fcregr.num.df[fcregr.num.train.inds, ] +fcregr.num.test = fcregr.num.df[fcregr.num.test.inds, ] +fcregr.num.task = makeForecastRegrTask("fcregrnumtask", data = fcregr.num.df, target = fcregr.num.target, date.col = "dates") +########### + + + ns.svg = c(svg = "http://www.w3.org/2000/svg") black.circle.xpath = "/svg:svg//svg:circle[contains(@style, 'fill: #000000')]" grey.rect.xpath = "/svg:svg//svg:rect[contains(@style, 'fill: #EBEBEB;')]" diff --git a/tests/testthat/test_base_generateCalibration.R b/tests/testthat/test_base_generateCalibration.R index c7e6214d81..bf2474f6ef 100644 --- a/tests/testthat/test_base_generateCalibration.R +++ b/tests/testthat/test_base_generateCalibration.R @@ -52,7 +52,7 @@ test_that("generateCalibrationData", { # facetting works: q = q = plotCalibration(cd, facet.wrap.nrow = 2L) - testFacetting(q, 2L) + testFacetting(q, nrow = 2L) q = q = plotCalibration(cd, facet.wrap.ncol = 2L) testFacetting(q, ncol = 2L) }) diff --git a/tests/testthat/test_base_helpers.R b/tests/testthat/test_base_helpers.R index 5aaea78249..90afb0953c 100644 --- a/tests/testthat/test_base_helpers.R +++ b/tests/testthat/test_base_helpers.R @@ -23,12 +23,12 @@ test_that("propVectorToMatrix", { }) test_that("listTaskTypes", { - expected = c("classif", "regr", "surv", "costsens", "cluster", "multilabel") + expected = c("classif", "regr", "surv", "costsens", "cluster", "multilabel", "fcregr") expect_equal(expected, listTaskTypes()) }) test_that("listLearnerProperties", { - expected = c("classif", "regr", "surv", "costsens", "cluster", "multilabel") + expected = c("classif", "regr", "surv", "costsens", "cluster", "multilabel", "fcregr") expect_equal(expected, listTaskTypes()) }) diff --git a/tests/testthat/test_base_listLearners.R b/tests/testthat/test_base_listLearners.R index eb12830a3b..7c3b0dd113 100644 --- a/tests/testthat/test_base_listLearners.R +++ b/tests/testthat/test_base_listLearners.R @@ -3,7 +3,7 @@ test_that("listLearners", { x1 = listLearners(create = FALSE, warn.missing.packages = FALSE) expect_data_frame(x1, min.rows = 1L, min.cols = 10) - expect_set_equal(x1$type, c("classif", "regr", "cluster", "surv", "multilabel")) + expect_set_equal(x1$type, c("classif", "regr", "cluster", "surv", "multilabel", "fcregr")) expect_subset(listLearnerProperties(), names(x1)) x1a = listLearners("classif", create = FALSE, 
properties = "missings", warn.missing.packages = FALSE) diff --git a/tests/testthat/test_base_resample_fixedcv.R b/tests/testthat/test_base_resample_fixedcv.R new file mode 100644 index 0000000000..eae28da98b --- /dev/null +++ b/tests/testthat/test_base_resample_fixedcv.R @@ -0,0 +1,17 @@ +context("resample_fixedcv") + +test_that("fixed instance works", { + rin = makeResampleInstance(makeResampleDesc("FixedCV"), size = 25) + + for (i in seq_len(length(rin$train.inds))) { + i1 = rin$train.inds[[i]] + i2 = rin$test.inds[[i]] + expect_true(min(i1) >= 1) + expect_true(max(i1) <= 25) + expect_true(min(i2) >= 1) + expect_true(max(i2) <= 25) + expect_true(max(i1) < min(i2)) + } +}) + + diff --git a/tests/testthat/test_base_resample_growingcv.R b/tests/testthat/test_base_resample_growingcv.R new file mode 100644 index 0000000000..fecfe76671 --- /dev/null +++ b/tests/testthat/test_base_resample_growingcv.R @@ -0,0 +1,18 @@ +context("resample_growingcv") + + +test_that("growing instance works", { + rin = makeResampleInstance(makeResampleDesc("GrowingCV"), size = 25) + + for (i in seq_len(length(rin$train.inds))) { + i1 = rin$train.inds[[i]] + i2 = rin$test.inds[[i]] + expect_true(min(i1) >= 1) + expect_true(max(i1) <= 25) + expect_true(min(i2) >= 1) + expect_true(max(i2) <= 25) + expect_true(max(i1) < min(i2)) + } +}) + + diff --git a/tests/testthat/test_fcregr_Arima.R b/tests/testthat/test_fcregr_Arima.R new file mode 100644 index 0000000000..8dfa891c9f --- /dev/null +++ b/tests/testthat/test_fcregr_Arima.R @@ -0,0 +1,40 @@ +context("fcregr_Arima") + + + +test_that("fcregr_Arima", { + + parset.list = list( + list(), + list(order = c(2, 0, 1)), + list(order = c(2, 0, 1), include.mean = TRUE), + list(order = c(2, 0, 2), include.mean = TRUE, include.drift = TRUE), + list(order = c(2, 0, 1), method = "ML") + + ) + old.predicts.list = list() + + for (i in seq_len(length(parset.list))) { + parset = parset.list[[i]] + pars = list(y = fcregr.train$test_data) + pars = c(pars, parset) + set.seed(getOption("mlr.debug.seed")) + capture.output({ + m = do.call(forecast::Arima, pars) + }) + set.seed(getOption("mlr.debug.seed")) + p = as.numeric(forecast::forecast(m, h = 1L)$mean) + old.predicts.list[[i]] = p + } + + parset.list[[1]]$h = 1L + parset.list[[2]]$h = 1L + parset.list[[3]]$h = 1L + parset.list[[4]]$h = 1L + parset.list[[5]]$h = 1L + testSimpleParsets("fcregr.Arima", fcregr.df, fcregr.target, + fcregr.train.inds, old.predicts.list, parset.list) +}) + + +
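Reviewer note (not part of the patch): for orientation, this is roughly the mlr-side call sequence that the Arima test above checks against a direct forecast::Arima fit, assuming the patch is applied. The fcregr.* objects are the ones defined in helper_objects.R in this diff; passing the task to performance() so that mase can rescale by the training series is an assumption.

lrn = makeLearner("fcregr.Arima", order = c(2, 0, 1), include.mean = TRUE, h = 1L)
mod = train(lrn, fcregr.task, subset = fcregr.train.inds)  # fit on the first 299 observations
pred = predict(mod, newdata = fcregr.test)                 # one-step-ahead forecast for the held-out point
performance(pred, measures = mase, task = fcregr.task)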