-
Notifications
You must be signed in to change notification settings - Fork 0
/
outliers.R
206 lines (197 loc) · 6.91 KB
/
outliers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#' Outlier detection for price relatives
#'
#' Standard cutoff-based methods for detecting outliers with price relatives.
#'
#' Each of these functions constructs an interval of the form \eqn{[b_l(x) -
#' c_l \times l(x), b_u(x) + c_u \times u(x)]}{[bl(x) - cl * l(x), bu(x) + cu *
#' u(x)]} and assigns a value in `x` as `TRUE` if that value does not
#' belong to the interval, `FALSE` otherwise. The methods differ in how
#' they construct the values \eqn{b_l(x)}{bl(x)}, \eqn{b_u(x)}{bu(x)},
#' \eqn{l(x)}, and \eqn{u(x)}. Any missing values in `x` are ignored when
#' calculating the cutoffs, but will return `NA`.
#'
#' The fixed cutoff method is the simplest, and just uses the interval
#' \eqn{[c_l, c_u]}{[cl, cu]}.
#'
#' The quartile method and Tukey algorithm are described in paragraphs 5.113 to
#' 5.135 of the CPI manual (2020), as well as by Rais (2008) and Hutton (2008).
#' The resistant fences method is an alternative to the quartile method, and is
#' described by Rais (2008) and Hutton (2008). Quantile-based methods often
#' identify price relatives as outliers because the distribution is
#' concentrated around 1; setting `a > 0` puts a floor on the minimum
#' dispersion between quantiles as a fraction of the median. See the references
#' for more details.
#'
#' The robust Z-score is the usual method to identify relatives in the
#' (asymmetric) tails of the distribution, simply replacing the mean with the
#' median, and the standard deviation with the median absolute deviation.
#'
#' These methods often assume that price relatives are symmetrically
#' distributed (if not Gaussian). As the distribution of price relatives often
#' has a long right tail, the natural logarithm can be used to transform price
#' relative before identifying outliers (sometimes under the assumption that
#' price relatives are distributed log-normal). The Hidiroglou-Berthelot
#' transformation is another approach, described in the CPI manual (par.
#' 5.124). (Sometimes the transformed price relatives are multiplied by
#' \eqn{\max(p_1, p_0)^u}{max(p1, p0)^u}, for some
#' \eqn{0 \le u \le 1}{0 <= u <= 1}, so that products with a larger price
#' get flagged as outliers (par. 5.128).)
#'
#' @param x A strictly positive numeric vector of price relatives. These can be
#' made with, e.g., [back_period()].
#' @param cu,cl A numeric vector, or something that can be coerced into one,
#' giving the upper and lower cutoffs for each element of `x`. Recycled to the
#' same length as `x`.
#' @param a A numeric vector, or something that can be coerced into one,
#' between 0 and 1 giving the scale factor for the median to establish the
#' minimum dispersion between quartiles for each element of `x`. The default
#' does not set a minimum dispersion. Recycled to the same length as `x`.
#' @param type See [quantile()].
#'
#' @returns
#' A logical vector, the same length as `x`, that is `TRUE` if the
#' corresponding element of `x` is identified as an outlier,
#' `FALSE` otherwise.
#'
#' @seealso
#' [grouped()] to make each of these functions operate on grouped data.
#'
#' [back_period()]/[base_period()] for a simple utility function to turn prices
#' in a table into price relatives.
#'
#' The `HBmethod()` function in the \pkg{univOutl} package for the
#' Hidiroglou-Berthelot method for identifying outliers.
#'
#' @references
#' Hutton, H. (2008). Dynamic outlier detection in price index surveys.
#' *Proceedings of the Survey Methods Section: Statistical Society of Canada Annual Meeting*.
#'
#' ILO, IMF, OECD, Eurostat, UN, and World Bank. (2020).
#' *Consumer Price Index Manual: Theory and Practice*.
#' International Monetary Fund.
#'
#' Rais, S. (2008). Outlier detection for the Consumer Price Index.
#' *Proceedings of the Survey Methods Section: Statistical Society of Canada Annual Meeting*.
#'
#' @examples
#' set.seed(1234)
#'
#' x <- rlnorm(10)
#'
#' fixed_cutoff(x)
#' robust_z(x)
#' quartile_method(x)
#' resistant_fences(x) # always identifies fewer outliers than above
#' tukey_algorithm(x)
#'
#' log(x)
#' hb_transform(x)
#'
#' # Works the same for grouped data
#'
#' f <- c("a", "b", "a", "a", "b", "b", "b", "a", "a", "b")
#' grouped(quartile_method)(x, group = f)
#'
#' @name outliers
#' @export
quartile_method <- function(x, cu = 2.5, cl = cu, a = 0, type = 7) {
x <- as.numeric(x)
cu <- as.numeric(cu)
# It's faster to not recycle cu, cl, or a when they're length 1.
if (length(cu) != 1L) cu <- rep_len(cu, length(x))
cl <- as.numeric(cl)
if (length(cl) != 1L) cl <- rep_len(cl, length(x))
a <- as.numeric(a)
if (length(a) != 1L) a <- rep_len(a, length(x))
q <- stats::quantile(
x, c(0.25, 0.5, 0.75),
names = FALSE, na.rm = TRUE, type = type
)
x <- x - q[2L]
u <- cu * pmax.int(q[3L] - q[2L], abs(a * q[2L]))
l <- -cl * pmax.int(q[2L] - q[1L], abs(a * q[2L]))
x > u | x < l
}
#' Resistant fences
#' @rdname outliers
#' @export
resistant_fences <- function(x, cu = 2.5, cl = cu, a = 0, type = 7) {
x <- as.numeric(x)
cu <- as.numeric(cu)
if (length(cu) != 1L) cu <- rep_len(cu, length(x))
cl <- as.numeric(cl)
if (length(cl) != 1L) cl <- rep_len(cl, length(x))
a <- as.numeric(a)
if (length(a) != 1L) a <- rep_len(a, length(x))
q <- stats::quantile(
x, c(0.25, 0.5, 0.75),
names = FALSE, na.rm = TRUE, type = type
)
iqr <- pmax.int(q[3L] - q[1L], abs(a * q[2L]))
u <- q[3L] + cu * iqr
l <- q[1L] - cl * iqr
x > u | x < l
}
#' Robust z-score
#' @rdname outliers
#' @export
robust_z <- function(x, cu = 2.5, cl = cu) {
x <- as.numeric(x)
cu <- as.numeric(cu)
if (length(cu) != 1L) cu <- rep_len(cu, length(x))
cl <- as.numeric(cl)
if (length(cl) != 1L) cl <- rep_len(cl, length(x))
med <- stats::median(x, na.rm = TRUE)
s <- stats::mad(x, na.rm = TRUE)
x <- x - med
u <- cu * s
l <- -cl * s
x > u | x < l
}
#' Fixed cutoff
#' @rdname outliers
#' @export
fixed_cutoff <- function(x, cu = 2.5, cl = 1 / cu) {
x <- as.numeric(x)
cu <- as.numeric(cu)
if (length(cu) != 1L) cu <- rep_len(cu, length(x))
cl <- as.numeric(cl)
if (length(cl) != 1L) cl <- rep_len(cl, length(x))
x > cu | x < cl
}
#' Tukey's algorithm
#' @rdname outliers
#' @export
tukey_algorithm <- function(x, cu = 2.5, cl = cu, type = 7) {
x <- as.numeric(x)
cu <- as.numeric(cu)
if (length(cu) != 1L) cu <- rep_len(cu, length(x))
cl <- as.numeric(cl)
if (length(cl) != 1L) cl <- rep_len(cl, length(x))
q <- stats::quantile(
x, c(0.05, 0.95),
names = FALSE, na.rm = TRUE, type = type
)
tail <- x < q[1L] | x > q[2L]
ts <- x[x != 1 & !tail]
if (length(ts) == 0L) {
return(tail)
}
# In some versions m is the median.
m <- mean(ts, na.rm = TRUE)
x <- x - m
u <- cu * (mean(ts[ts >= m], na.rm = TRUE) - m)
l <- -cl * (m - mean(ts[ts <= m], na.rm = TRUE))
x > u | x < l | tail
}
#' HB transform
#' @rdname outliers
#' @export
hb_transform <- function(x) {
x <- as.numeric(x)
med <- stats::median(x, na.rm = TRUE)
res <- 1 - med / x
gemed <- x >= med
res[gemed] <- x[gemed] / med - 1
res
}