-
-
Notifications
You must be signed in to change notification settings - Fork 4
/
PipeOpDensitySplit.R
115 lines (102 loc) · 4.23 KB
/
PipeOpDensitySplit.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#' @title Split Regression Task into two Density Tasks
#'
#' @usage NULL
#' @name mlr_pipeops_densitysplit
#' @format [`R6Class`] inheriting from [`PipeOp`].
#'
#' @description
#' Create two [`TaskDensity`] from a [`TaskRegr`][mlr3::TaskRegr]: one comprising the upper `alpha` fraction, the other the 1 - `alpha` lower fraction
#' (but at least `min_size`).
#'
#' @section Construction:
#' ```
#' PipeOpDensitySplit$new(id = "densitysplit", param_vals = list())
#' ```
#'
#' * `id` :: `character(1)`\cr
#' Identifier of the resulting object, default `"densitysplit"`.
#' * `param_vals` :: named `list`\cr
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
#'
#' @section Input and Output Channels:
#' [`PipeOpDensitySplit`] has one input channel named `"input"`, taking a [`TaskRegr`][mlr3::TaskRegr] during both training and prediction.
#'
#' [`PipeOpDensitySplit`] has two output channels, `"top"` and `"bottom"`, both [`TaskDensity`] during both training and prediction.
#'
#' @section State:
#' The `$state` is left empty (`list()`).
#'
#' @section Parameters:
#' The parameters are the parameters inherited from the [`PipeOp`], as well as:
#' * `alpha` :: `numeric(1)`\cr
#' What proportion of values to consider 'good'. BOHB has this at `0.15`. Initialized to `0.15`.
#' * `min_size` :: `integer(1)`\cr
#' Minimum size of both [`TaskDensity`] to create. Initialized to 1.
#'
#' @section Internals:
#' Can be used with [`PipeOpDensityRatio`].
#'
#' @section Fields:
#' Only fields inherited from [`PipeOp`].
#'
#' @section Methods:
#' Only methods inherited from [`PipeOp`].
#' @family PipeOps
#' @family BOHB implementing operations
#' @export
PipeOpDensitySplit = R6Class("PipeOpDensitySplit",
  inherit = mlr3pipelines::PipeOp,
  public = list(
    # Constructor.
    #   id         :: identifier of the PipeOp, default "densitysplit".
    #   param_vals :: named list of hyperparameter values that override the
    #                 initial values set here (alpha = 0.15, min_size = 1).
    # Declares one input channel ("input", TaskRegr) and two output channels
    # ("top" and "bottom", both TaskDensity) for training and prediction.
    initialize = function(id = "densitysplit", param_vals = list()) {
      param_set = ps(
        alpha = p_dbl(tags = c("train", "required")),
        min_size = p_int(1, tags = c("train", "required"))
      )
      param_set$values = list(alpha = 0.15, min_size = 1)
      super$initialize(id, param_set = param_set, param_vals = param_vals,
        input = data.table(name = "input", train = "TaskRegr", predict = "TaskRegr"),
        output = data.table(name = c("top", "bottom"), train = "TaskDensity", predict = "TaskDensity"),
        tags = "ensemble"
      )
    }
  ),
  private = list(
    # Split the input task by target value into a "top" and a "bottom"
    # density task.
    #   inputs :: list whose first element is the task to split.
    # Returns list(top, bottom). "top" holds the `n_top` rows with the largest
    # target values, "bottom" the `n_bottom` rows with the smallest ones. Both
    # sizes are raised to at least `min_size`, so the two tasks can share rows.
    # Stops if the task does not have more than `min_size` rows.
    .train = function(inputs) {
      self$state = list()
      pv = self$param_set$get_values(tags = "train")
      task = inputs[[1]]
      if (task$nrow <= pv$min_size) stopf("Task must have more than min_size (%s) samples, but has %s.", pv$min_size, task$nrow)
      # `task$data()` returns a table, not a vector. Calling order() on a
      # data-frame-like object goes through xtfrm(), which is not supported
      # for data frames, so pull out the target column as a plain vector.
      # NOTE(review): takes the first target column; the input channel is
      # declared as TaskRegr, which presumably has exactly one -- confirm.
      target = task$data(cols = task$target_names)[[1L]]
      rows = task$row_ids
      n_top = max(round(pv$alpha * task$nrow), pv$min_size)
      n_bottom = max(task$nrow - n_top, pv$min_size)
      # Positions sorted by increasing target; reversed for the top part.
      order_target = order(target)
      rows_top = rows[rev(order_target)[seq_len(n_top)]]
      rows_bottom = rows[order_target[seq_len(n_bottom)]]
      # Keep only those column roles that are registered for density tasks.
      new_col_roles = task$col_roles[intersect(names(task$col_roles), mlr3::mlr_reflections$task_col_roles$density)]
      top = todensity(task, ".top")
      top$filter(rows = rows_top)
      top$col_roles = new_col_roles
      bottom = todensity(task, ".bottom")
      bottom$filter(rows = rows_bottom)
      bottom$col_roles = new_col_roles
      list(top, bottom)
    },
    # Convert the input task to a density task without splitting it.
    #   inputs :: list whose first element is the task to convert.
    # Returns a list that contains the same converted task twice, once for
    # each output channel. Column roles (restricted to those registered for
    # density tasks) and row roles are carried over from the input task.
    .predict = function(inputs) {
      task = inputs[[1]]
      new_col_roles = task$col_roles[intersect(names(task$col_roles), mlr3::mlr_reflections$task_col_roles$density)]
      new_row_roles = task$row_roles
      task = todensity(task, ".density")
      task$col_roles = new_col_roles
      task$row_roles = new_row_roles
      list(task, task)
    },
    # Empty placeholder; nothing in this class calls it.
    .splittask = function(task, pv) {
    }
  )
)
# mlr_pipeops$add("densitysplit", PipeOpDensitySplit)
## ugh, this is broken :-/
# Convert `task` into a TaskDensity with id `paste0(task$id, postfix)`.
#
# The data is copied into a new backend that holds the rows in `task$row_ids`
# and every column that has a role, plus the primary key. This is necessary
# because it is not possible to create a task with empty levels; otherwise we
# could just create TaskDensity$new(id, task$backend).
#
#   task    :: task to convert.
#   postfix :: string appended to the id of `task` to form the new task id.
# Returns the newly created TaskDensity.
todensity = function(task, postfix) {
  # The same column can be listed under several roles, which would repeat its
  # name here; request every column only once.
  keepcols = unique(c(unname(unlist(task$col_roles)), task$backend$primary_key))
  backend = mlr3::as_data_backend(
    task$backend$data(cols = keepcols, rows = task$row_ids),
    primary_key = task$backend$primary_key
  )
  TaskDensity$new(paste0(task$id, postfix), backend)
}