/
mpdist.R
212 lines (187 loc) · 6.5 KB
/
mpdist.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#' MPdist - Distance between Time Series using Matrix Profile
#'
#' MPdist is a recently introduced distance measure which considers two time series to be similar
#' if they share many similar subsequences, regardless of the order of matching subsequences.
#' It was demonstrated in that MPdist is robust to spikes, warping, linear trends, dropouts,
#' wandering baseline and missing values, issues that are common outside of benchmark datasets.
#'
#' @details MPdist returns the distance of two time series or a vector containing the distance
#' between all sliding windows. If argument `type` is set to `vector`, the vector is returned.
#'
#' @param ref_data a `matrix` or a `vector`. The reference data
#' @param query_data a `matrix` or a `vector`. The query data
#' @param window_size an int. Size of the sliding window.
#' @param type the type of result. (Default is `simple`). See details.
#' @param thr threshold for MPdist. (Default is `0.05`). Don't change this unless you know what you are doing.
#'
#' @return Returns the distance of two time series or a vector containing the distance
#' between all sliding windows.
#' @export
#'
#' @references * Gharghabi S, Imani S, Bagnall A, Darvishzadeh A, Keogh E. Matrix Profile XII:
#' MPdist: A Novel Time Series Distance Measure to Allow Data Mining in More Challenging Scenarios.
#' In: 2018 IEEE International Conference on Data Mining (ICDM). 2018.
#'
#' @references Website: <https://sites.google.com/site/mpdistinfo/>
#'
#' @family distance measure
#'
#' @examples
#' ref_data <- mp_toy_data$data[, 1]
#' qe_data <- mp_toy_data$data[, 2]
#' qd_data <- mp_toy_data$data[150:200, 1]
#' w <- mp_toy_data$sub_len
#'
#' # distance between data of same size
#' deq <- mpdist(ref_data, qe_data, w)
#'
#' # distance between data of different sizes
#' ddiff <- mpdist(ref_data, qd_data, w)
#'
#' # distance vector between data of different sizes
#' ddvect <- mpdist(ref_data, qd_data, w, type = "vector")
mpdist <- function(ref_data, query_data, window_size, type = c("simple", "vector"), thr = 0.05) {
type <- match.arg(type)
# transform data into matrix
if (is.vector(ref_data)) {
ref_data <- as.matrix(ref_data)
}
else if (is.matrix(ref_data)) {
if (ncol(ref_data) > nrow(ref_data)) {
ref_data <- t(ref_data)
}
} else {
stop("Unknown type of data. Must be: a column matrix or a vector.", call. = FALSE)
}
if (is.vector(query_data)) {
query_data <- as.matrix(query_data)
} else if (is.matrix(query_data)) {
if (ncol(query_data) > nrow(query_data)) {
query_data <- t(query_data)
}
} else {
stop("Unknown type of query. Must be: a column matrix or a vector.", call. = FALSE)
}
if (window_size < 4) {
stop("`window_size` must be at least 4.", call. = FALSE)
}
if (nrow(ref_data) < nrow(query_data)) {
temp <- ref_data
ref_data <- query_data
query_data <- temp
}
if (type == "simple" || nrow(ref_data) == nrow(query_data)) {
if (nrow(query_data) == window_size) {
warning("Distance profile is being used since window_size equals to query size")
dist <- dist_profile(ref_data, query_data)$distance_profile
dist <- sqrt(cal_mp_dist(dist, thr, nrow(ref_data)))
} else {
dist <- mpdist_simple(ref_data, query_data, window_size, thr)
}
} else {
obj <- list()
if (nrow(query_data) == window_size) {
warning("Distance profile is being used since window_size equals to query size")
obj$mpdist <- sqrt(dist_profile(ref_data, query_data)$distance_profile)
} else {
obj$mpdist <- mpdist_vect(ref_data, query_data, window_size, thr)
}
obj$w <- window_size
obj$data <- list(ref_data, query_data)
class(obj) <- "MPdistProfile"
attr(obj, "origin") <- list(
data_size = nrow(ref_data),
query_size = nrow(query_data),
window_size = window_size,
mp_size = nrow(obj$mpdist),
algorithm = "MPdist",
class = class(obj),
version = 1.1
)
return(obj)
}
return(dist)
}
#' MP Distance
#'
#' @param data reference data
#' @param query query data
#' @param window_size window size
#' @param thr threshold for mpdist
#'
#' @return Returns the distance vector
#'
#' @keywords internal
#' @noRd
mpdist_simple <- function(ref_data, query_data, window_size, thr = 0.05) {
mp <- mpx(data = ref_data, query = query_data, window_size = window_size, idx = FALSE)
dist <- cal_mp_dist(c(mp$mpa, mp$mpb), thr, nrow(ref_data) + nrow(query_data))
return(dist)
}
#' MP Distance vector
#'
#' @param data reference data
#' @param query query data
#' @param window_size window size
#' @param thr threshold for mpdist
#'
#' @return Returns the distance vector
#'
#' @keywords internal
#' @noRd
mpdist_vect <- function(data, query, window_size, thr = 0.05) {
nn <- NULL
query_size <- nrow(query)
data_size <- nrow(data)
num_subseqs <- query_size - window_size + 1
dist_profile_size <- data_size - window_size + 1
mat <- matrix(nrow = num_subseqs, ncol = dist_profile_size)
for (i in seq_len(num_subseqs)) {
nn <- dist_profile(data, query, nn, window_size = window_size, index = i)
mat[i, ] <- nn$distance_profile
}
all_right_histogram <- do.call(pmin, as.data.frame(t(mat))) # col min
mass_dist_slid_min <- matrix(nrow = num_subseqs, ncol = dist_profile_size - num_subseqs + 1)
# apply is not faster
for (i in seq_len(num_subseqs)) {
mass_dist_slid_min[i, ] <- movmin(mat[i, ], num_subseqs)
}
mp_dist_length <- data_size - query_size + 1
right_hist_length <- query_size - window_size + 1 # num_subseqs
mpdist_array <- vector(mode = "numeric", length = mp_dist_length)
for (i in seq_len(mp_dist_length)) {
right_hist <- all_right_histogram[i:(right_hist_length + i - 1)]
left_hist <- mass_dist_slid_min[, i]
recreated_mp <- c(left_hist, right_hist)
mpdist_array[i] <- cal_mp_dist(recreated_mp, thr, 2 * query_size)
}
mpdist_array[mpdist_array < vars()$eps] <- 0
mpdist_array <- sqrt(mpdist_array)
obj <- as.matrix(mpdist_array)
return(obj)
}
#' Calculates the distance
#'
#' @param mp a matrix profile
#' @param thr threshold for mpdist
#' @param data_size size of the data
#'
#' @return Returns the distance
#'
#' @keywords internal
#' @noRd
cal_mp_dist <- function(mp, thr, data_size, partial = TRUE) {
k <- ceiling(thr * data_size)
if (k > length(mp)) {
# last element of a sorted list is the max()
return(max(mp))
}
# keep classic sorting for debug reasons
if (partial) {
mp_sorted <- sort.int(mp, partial = k)
} else {
mp_sorted <- sort.int(mp)
}
dist <- mp_sorted[k]
return(dist)
}