-
-
Notifications
You must be signed in to change notification settings - Fork 398
/
complexity_dimension.py
328 lines (263 loc) · 12.2 KB
/
complexity_dimension.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import scipy.spatial
from .complexity_embedding import complexity_embedding
def complexity_dimension(signal, delay=1, dimension_max=20, method="afnn", show=False, R=10.0, A=2.0, **kwargs):
    """Estimate the optimal embedding Dimension (m) for time-delay embedding.

    Parameters
    ----------
    signal : list or array or Series
        The signal (i.e., a time series) in the form of a vector of values.
    delay : int
        Time delay (often denoted 'Tau', sometimes referred to as 'lag').
        In practice, it is common to have a fixed time lag (corresponding for instance to the
        sampling rate; Gautama, 2003), or to find a suitable value using some algorithmic
        heuristics (see ``complexity_delay()``).
    dimension_max : int
        The maximum embedding dimension (often denoted 'm' or 'd', sometimes referred to as
        'order') to test. A sequence of candidate dimensions can also be passed directly.
    method : str
        Method can either be afnn (average false nearest neighbour) or fnn (false nearest
        neighbour).
    show : bool
        Visualize the result.
    R : float
        Relative tolerance (for fnn method).
    A : float
        Absolute tolerance (for fnn method).
    **kwargs : optional
        Other arguments.

    Returns
    -------
    int
        Optimal dimension.

    See Also
    --------
    complexity_delay, complexity_embedding

    Examples
    ---------
    >>> import neurokit2 as nk
    >>>
    >>> # Artifical example
    >>> signal = nk.signal_simulate(duration=10, frequency=1, noise=0.01)
    >>> delay = nk.complexity_delay(signal, delay_max=500)
    >>>
    >>> values = nk.complexity_dimension(signal, delay=delay, dimension_max=20, show=True)

    References
    -----------
    - Cao, L. (1997). Practical method for determining the minimum embedding dimension of a scalar
      time series. Physica D: Nonlinear Phenomena, 110(1-2), 43-50.
    """
    # Candidate dimensions: either 1..dimension_max, or a user-supplied sequence.
    if isinstance(dimension_max, int):
        dimension_seq = np.arange(1, dimension_max + 1)
    else:
        dimension_seq = np.array(dimension_max)

    method = method.lower()
    if method == "afnn":
        E, Es = _embedding_dimension_afn(signal, dimension_seq=dimension_seq, delay=delay, **kwargs)
        E1 = E[1:] / E[:-1]
        E2 = Es[1:] / Es[:-1]
        # E1 saturates once the attractor is unfolded; the first dimension where E1
        # reaches 85% of its maximum is taken as the saturation point.
        min_dimension = int(np.flatnonzero(E1 >= 0.85 * np.max(E1))[0]) + 1
        if show is True:
            _embedding_dimension_plot(
                method=method, dimension_seq=dimension_seq, min_dimension=min_dimension, E1=E1, E2=E2
            )
    elif method == "fnn":
        f1, f2, f3 = _embedding_dimension_ffn(signal, dimension_seq=dimension_seq, delay=delay, R=R, A=A, **kwargs)
        # First dimension where the combined FNN fraction drops close to its
        # smallest non-zero value.
        min_dimension = int(np.flatnonzero(f3 <= 1.85 * np.min(f3[np.nonzero(f3)]))[0])
        if show is True:
            _embedding_dimension_plot(
                method=method, dimension_seq=dimension_seq, min_dimension=min_dimension, f1=f1, f2=f2, f3=f3
            )
    else:
        raise ValueError("NeuroKit error: complexity_dimension(): 'method' not recognized.")

    return min_dimension
# =============================================================================
# Methods
# =============================================================================
def _embedding_dimension_afn(signal, dimension_seq, delay=1, **kwargs):
    """Return E(d) and E^*(d) for every dimension d in ``dimension_seq``.

    These quantities (Cao, 1997) are the building blocks of E1(d) = E(d + 1) / E(d), which
    stops changing once d exceeds the minimum embedding dimension (so that d0 + 1 is the
    dimension we look for), and of E2(d) = E^*(d + 1) / E^*(d), which helps distinguish
    deterministic signals (E2 depends on d) from stochastic ones (E2 is approximately 1
    for every d, since future values are independent of past values).
    """
    per_dimension = [
        _embedding_dimension_afn_d(signal, dimension, delay, **kwargs) for dimension in dimension_seq
    ]
    values = np.asarray(per_dimension).T
    return values[0, :], values[1, :]
def _embedding_dimension_afn_d(signal, dimension, delay=1, metric="chebyshev", window=10, maxnum=None, **kwargs):
    """Return E(d) and E^*(d) of the AFN method for a single embedding dimension d."""
    # Trim the signal by `delay` samples for dimension d so that the reconstructions
    # at dimension d and at dimension d + 1 contain the same number of points.
    embedded_d = complexity_embedding(signal[:-delay], delay=delay, dimension=dimension)
    embedded_d1 = complexity_embedding(signal, delay=delay, dimension=dimension + 1)

    # Nearest neighbor of every point, searched in dimension d.
    index, dist = _embedding_dimension_neighbors(embedded_d, metric=metric, window=window, maxnum=maxnum)

    # Distance between each point and its (former) nearest neighbor, re-measured in
    # dimension d + 1 (Chebyshev norm: maximum absolute coordinate difference).
    stretched = np.abs(embedded_d1 - embedded_d1[index]).max(axis=1)

    # E(d) is the average stretching of near-neighbor distances from d to d + 1.
    E = np.mean(stretched / dist)
    # E^*(d) is the average absolute change in the newly added coordinate.
    Es = np.mean(np.abs(embedded_d1[:, -1] - embedded_d1[index, -1]))
    return E, Es
def _embedding_dimension_ffn(signal, dimension_seq, delay=1, **kwargs):
    """Compute the fraction of false nearest neighbors for each candidate dimension.

    Implements the false nearest neighbors (FNN) method described by Kennel et al. (1992)
    to calculate the minimum embedding dimension required to embed a scalar time series.

    Parameters
    ----------
    signal : list or array or Series
        The signal (i.e., a time series) in the form of a vector of values.
    dimension_seq : int
        The embedding dimension.
    delay : int
        Time delay (often denoted 'Tau', sometimes referred to as 'lag').
    **kwargs : optional
        Other arguments.

    Returns
    -------
    f1 : array
        Fraction of neighbors classified as false by Test I.
    f2 : array
        Fraction of neighbors classified as false by Test II.
    f3 : array
        Fraction of neighbors classified as false by either Test I
        or Test II.
    """
    per_dimension = [
        _embedding_dimension_ffn_d(signal, dimension, delay, **kwargs) for dimension in dimension_seq
    ]
    values = np.asarray(per_dimension).T
    return values[0, :], values[1, :], values[2, :]
def _embedding_dimension_ffn_d(signal, dimension, delay=1, R=10.0, A=2.0, metric="euclidean", window=10, maxnum=None):
    """Return the fractions of false nearest neighbors for a single embedding dimension d."""
    # Trim the signal by `delay` samples for dimension d so that the reconstructions
    # at dimension d and at dimension d + 1 contain the same number of points.
    embedded_d = complexity_embedding(signal[:-delay], delay=delay, dimension=dimension)
    embedded_d1 = complexity_embedding(signal, delay=delay, dimension=dimension + 1)

    # Nearest neighbor of every point, searched in dimension d.
    index, dist = _embedding_dimension_neighbors(embedded_d, metric=metric, window=window, maxnum=maxnum)

    # Distance to that neighbor re-measured in dimension d + 1 (Chebyshev norm).
    stretched = np.abs(embedded_d1 - embedded_d1[index]).max(axis=1)

    # Kennel et al.'s two criteria for flagging a neighbor as "false".
    f1 = np.abs(embedded_d1[:, -1] - embedded_d1[index, -1]) / dist > R  # Test I: relative stretch
    f2 = stretched / np.std(signal) > A  # Test II: absolute stretch vs. signal spread
    f3 = f1 | f2

    return np.mean(f1), np.mean(f2), np.mean(f3)
# =============================================================================
# Internals
# =============================================================================
def _embedding_dimension_plot(
    method, dimension_seq, min_dimension, E1=None, E2=None, f1=None, f2=None, f3=None, ax=None
):
    """Plot the dimension-optimization curves and mark the selected dimension.

    For "afnn", plots E1(d) and E2(d); for "fnn", plots the percentages of false
    neighbors flagged by Test I, Test II, and their union. A vertical line marks
    ``min_dimension``. Returns the created figure, or None when plotting onto a
    caller-supplied ``ax``.
    """
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = None

    ax.set_title("Optimization of Dimension (d)")
    ax.set_xlabel("Embedding dimension $d$")
    ax.set_ylabel("$E_1(d)$ and $E_2(d)$")

    if method in ["afnn"]:
        # Fix: the fmt strings used to carry a color letter ('b', 'g') that the explicit
        # color= kwarg silently overrode; specify the color only once.
        # E1/E2 are ratios between consecutive dimensions, hence one element shorter.
        ax.plot(dimension_seq[:-1], E1, "o-", label="$E_1(d)$", color="#FF5722")
        ax.plot(dimension_seq[:-1], E2, "o-", label="$E_2(d)$", color="#f44336")
    if method in ["fnn"]:
        # Fractions are converted to percentages for readability.
        ax.plot(dimension_seq, 100 * f1, "o--", label="Test I", color="#FF5722")
        ax.plot(dimension_seq, 100 * f2, "^--", label="Test II", color="#f44336")
        ax.plot(dimension_seq, 100 * f3, "s-", label="Test I + II", color="#852b01")

    ax.axvline(x=min_dimension, color="#E91E63", label="Optimal dimension: " + str(min_dimension))
    ax.legend(loc="upper right")
    return fig
def _embedding_dimension_neighbors(
    signal, dimension_max=20, delay=1, metric="chebyshev", window=0, maxnum=None, show=False
):
    """Find nearest neighbors of all points in the given array using SciPy's KDTree search.

    Parameters
    ----------
    signal : ndarray or array or list or Series
        embedded signal: N-dimensional array containing time-delayed vectors, or
        signal: 1-D array (e.g.time series) of signal in the form of a vector of values.
        If signal is input, embedded signal will be created using the input dimension and delay.
    delay : int
        Time delay (often denoted 'Tau', sometimes referred to as 'lag'). In practice, it is common
        to have a fixed time lag (corresponding for instance to the sampling rate; Gautama, 2003),
        or to find a suitable value using some algorithmic heuristics (see ``delay_optimal()``).
    dimension_max : int
        The maximum embedding dimension (often denoted 'm' or 'd', sometimes referred to as 'order')
        to test.
    metric : str
        Metric to use for distance computation. Must be one of "cityblock" (aka the Manhattan metric),
        "chebyshev" (aka the maximum norm metric), or "euclidean". Defaults to 'chebyshev'.
    window : int
        Minimum temporal separation (Theiler window) that should exist between near neighbors.
        This is crucial while computing Lyapunov exponents and the correlation dimension. Defaults to 0.
    maxnum : int
        Maximum number of near neighbors that should be found for each point.
        In rare cases, when there are no neighbors that are at a nonzero distance, this will have to
        be increased (i.e., beyond 2 * window + 3). Defaults to None (optimum).
    show : bool
        Defaults to False.

    Returns
    -------
    index : array
        Array containing indices of near neighbors.
    dist : array
        Array containing near neighbor distances.
    """
    # Accept lists/Series as documented: normalize to an ndarray before using `.shape`.
    signal = np.asarray(signal)

    # A 1-D input is a raw time series and must be embedded first; a 2-D input is
    # assumed to already be an array of time-delayed vectors.
    if len(signal.shape) == 1:
        y = complexity_embedding(signal, delay=delay, dimension=dimension_max)
    else:
        y = signal

    # Map the metric name to the Minkowski order used by the KD-tree query.
    if metric == "chebyshev":
        p = np.inf
    elif metric == "cityblock":
        p = 1
    elif metric == "euclidean":
        p = 2
    else:
        raise ValueError('Unknown metric. Should be one of "cityblock", ' '"euclidean", or "chebyshev".')

    tree = scipy.spatial.cKDTree(y)  # pylint: disable=E1102
    n = len(y)

    if not maxnum:
        # Worst case: self-match plus `window` temporally adjacent points on each side
        # must all be skipped before a valid neighbor can be returned.
        maxnum = (window + 1) + 1 + (window + 1)
    else:
        maxnum = max(1, maxnum)
    if maxnum >= n:
        raise ValueError("maxnum is bigger than array length.")

    dists = np.empty(n)
    indices = np.empty(n, dtype=int)
    for i, x in enumerate(y):
        # Query for an increasing number of neighbors until one satisfies both the
        # Theiler window and the nonzero-distance requirement.
        for k in range(2, maxnum + 2):
            dist, index = tree.query(x, k=k, p=p)
            # Reject temporally close points (|index - i| <= window) and
            # self-matches / exact duplicates (distance 0).
            valid = (np.abs(index - i) > window) & (dist > 0)
            if np.count_nonzero(valid):
                dists[i] = dist[valid][0]
                indices[i] = index[valid][0]
                break
        else:
            # Fix: the two literals previously concatenated without a separating space.
            raise Exception(
                "Could not find any near neighbor with a nonzero distance. "
                "Try increasing the value of maxnum."
            )

    indices, values = np.squeeze(indices), np.squeeze(dists)

    if show is True:
        plt.plot(indices, values)

    return indices, values