/
NXapm_paraprobe_config_clusterer.nxdl.xml
477 lines (460 loc) · 24.1 KB
/
NXapm_paraprobe_config_clusterer.nxdl.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="nxdlformat.xsl"?>
<!--
# NeXus - Neutron and X-ray Common Data Format
#
# Copyright (C) 2014-2024 NeXus International Advisory Committee (NIAC)
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# For further information, see http://www.nexusformat.org
-->
<definition xmlns="http://definition.nexusformat.org/nxdl/3.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" category="application" name="NXapm_paraprobe_config_clusterer" extends="NXobject" type="group" xsi:schemaLocation="http://definition.nexusformat.org/nxdl/3.1 ../nxdl.xsd">
<symbols>
    <doc>
        The symbols used in the schema to specify e.g. dimensions of arrays.
    </doc>
    <!--Symbols which are currently disabled:
    n_positions: Number of position values.
    n_disjoint_clusters: Number of disjoint cluster.-->
    <symbol name="n_ivecmax">
        <doc>
            Maximum number of atoms per molecular ion. Should be 32 for paraprobe.
        </doc>
    </symbol>
    <symbol name="n_clust_algos">
        <doc>
            Number of clustering algorithms used.
        </doc>
    </symbol>
    <symbol name="n_ions">
        <doc>
            Number of different iontypes to distinguish during clustering.
        </doc>
    </symbol>
</symbols>
<doc>
    Configuration of a paraprobe-clusterer tool run in atom probe microscopy.
</doc>
<group type="NXentry">
<!--by default for appdefs the value of the exists keyword is required
unless it is explicitly specified differently-->
<attribute name="version">
    <doc>
        Version specifier of this application definition.
    </doc>
</attribute>
<field name="definition">
    <doc>
        Official NeXus NXDL schema with which this file was written.
    </doc>
    <!--The only admissible value is the name of this application definition.-->
    <enumeration>
        <item value="NXapm_paraprobe_config_clusterer"/>
    </enumeration>
</field>
<field name="program">
    <doc>
        Given name of the program/software/tool with which this NeXus
        (configuration) file was generated.
    </doc>
    <attribute name="version">
        <doc>
            Ideally the program version plus build number, a commit hash, or a
            description of persistent resources where the source code of the
            program and its build instructions can be found, so that the program
            can be configured in such a manner that the result of this
            computational process is recreatable in the same deterministic manner.
        </doc>
    </attribute>
</field>
<field name="time_stamp" type="NX_DATE_TIME">
    <doc>
        ISO 8601 formatted time code with local time zone offset to UTC
        information included when this configuration file was created.
    </doc>
</field>
<field name="analysis_identifier">
    <doc>
        Ideally, a (globally persistent) unique identifier for referring
        to this analysis.
    </doc>
</field>
<field name="analysis_description" optional="true">
    <doc>
        Possibility for leaving a free-text description about this analysis.
    </doc>
</field>
<field name="number_of_processes" type="NX_UINT" units="NX_UNITLESS">
    <doc>
        How many tasks to perform?
    </doc>
</field>
<group name="cameca_to_nexus" type="NXprocess" optional="true">
    <doc>
        This process maps results from cluster analyses performed with IVAS/APSuite
        into an interoperable representation. Specifically in this process
        paraprobe-clusterer takes results from clustering methods from other tools
        of the APM community, like IVAS/APSuite. These results are usually reported
        in one of the following ways.

        Either as an explicit list of reconstructed ion positions.
        In the case of IVAS these positions are reported through a text file
        with a cluster label for each position.

        Alternatively, the list of positions is reported, as it is the case for
        AMETEK (IVAS/AP Suite), but the cluster labels are specified only
        implicitly in the following way: The mass-to-charge-state ratio column of
        what is effectively a file formatted like POS is used to assign a
        hypothetical mass-to-charge value which resolves a floating point
        representation of the cluster ID.

        Another case can occur where all disjoint floating point values,
        i.e. here cluster labels, are reported and then a dictionary is created
        how each value matches to a cluster ID.

        In general the cluster ID zero is reserved for marking positions
        as not assigned to any cluster. Therefore, indices of disjoint
        clusters start at 1.
    </doc>
    <group name="dataset" type="NXapm_input_reconstruction">
        <field name="filename">
            <attribute name="version"/>
        </field>
        <field name="dataset_name_reconstruction"/>
        <field name="dataset_name_mass_to_charge">
            <doc>
                AMETEK/Cameca results of cluster analyses, like with the
                maximum-separation (MS) method clustering algorithm
                `J. Hyde et al. &lt;https://doi.org/10.1557/PROC-650-R6.6&gt;`_
                are stored as an improper POS file: This is a matrix of floating
                point quadruplets, one for each ion and as many quadruplets as
                ions were investigated. The first three values encode the position
                of the ion. The fourth value is an improper mass-to-charge-state-ratio
                value which encodes the integer identifier of the cluster as a floating
                point number.
            </doc>
        </field>
    </group>
    <!--Currently disabled fields:
    mapping_method:
      doc: |
        How should cluster labels be created from the cluster_labels information
        especially when these are floating point values.
      enumeration: [take_as_is, use_dictionary]
    mapping_dictionary_keyword(NX_NUMBER):
      doc: |
        The list of all keywords of a dictionary which maps implicit cluster
        indices like those from IVAS/APSuite which were mass-to-charge-state ratios.
      unit: NX_UNITLESS
      dimensions:
        rank: 1
        dim: [[1, n_disjoint_clusters]]
    mapping_dictionary_value(NX_UINT):
      doc: |
        The list of all values of a dictionary which maps implicit cluster
        indices like those from IVAS/APSuite which were mass-to-charge-state ratios.
        The sequences of mapping_dictionary_keyword and mapping_dictionary_value
        have to match.
      unit: NX_UNITLESS
      dimensions:
        rank: 1
        dim: [[1, n_disjoint_clusters]]-->
    <field name="recover_evaporation_id" type="NX_BOOLEAN">
        <doc>
            Specifies if the tool should try to recover for each position the closest
            matching position from dataset/dataset_name_reconstruction (within
            floating point accuracy). This can be useful for instance when users
            wish to recover the original evaporation ID, which IVAS/AP Suite drops
            for instance when writing their ``*.indexed.*`` cluster results POS files.
        </doc>
    </field>
</group>
<!--recover_bitmask(NX_BOOLEAN):
doc: |
Specifies if the tool should try to recover, after a recovery of the
evaporation IDs, a bitmask which identifies which of the positions
in dataset/dataset_name_reconstruction were covered
by the IVAS/APSuite cluster analysis. This can be useful to recover
the region of interest.-->
<group name="cluster_analysis" type="NXprocess" minOccurs="0" maxOccurs="1">
<doc>
This process performs a cluster analysis on a reconstructed dataset
or a portion of the reconstruction.
</doc>
<!--Input reconstruction: the file and the dataset names inside it.-->
<group name="dataset" type="NXapm_input_reconstruction">
    <field name="filename">
        <attribute name="version"/>
    </field>
    <field name="dataset_name_reconstruction"/>
    <field name="dataset_name_mass_to_charge"/>
</group>
<!--Input ranging: the file and the group name of the iontypes inside it.-->
<group name="iontypes" type="NXapm_input_ranging">
    <field name="filename">
        <attribute name="version"/>
    </field>
    <field name="group_name_iontypes"/>
</group>
<group name="ion_to_edge_distances" type="NXprocess" optional="true">
    <doc>
        The tool allows injecting precomputed distance information for each
        point/ion which can be used for further post-processing and analysis.
    </doc>
    <field name="filename">
        <doc>
            Name of an HDF5 file which contains the ion distances.
        </doc>
        <attribute name="version">
            <doc>
                Version identifier of the file such as a secure hash which documents
                the binary state of the file to add an additional layer of
                reproducibility by documenting which file specifically contains
                these data.
            </doc>
        </attribute>
    </field>
    <field name="dataset_name">
        <doc>
            Absolute HDF5 path to the dataset with distance values for each ion.
        </doc>
    </field>
</group>
<group name="spatial_filter" type="NXspatial_filter" optional="true">
    <!--entire_dataset is the only value enumerated for windowing_method.-->
    <field name="windowing_method">
        <enumeration>
            <item value="entire_dataset"/>
        </enumeration>
    </field>
    <!--Optional geometric primitive sets.-->
    <group type="NXcg_ellipsoid_set" optional="true">
        <field name="dimensionality" type="NX_POSINT"/>
        <field name="cardinality" type="NX_POSINT"/>
        <field name="identifier_offset" type="NX_INT"/>
        <field name="center" type="NX_NUMBER"/>
        <field name="half_axes_radii" type="NX_NUMBER"/>
        <field name="orientation" type="NX_NUMBER"/>
    </group>
    <group type="NXcg_cylinder_set" optional="true">
        <field name="dimensionality" type="NX_POSINT"/>
        <field name="cardinality" type="NX_POSINT"/>
        <field name="identifier_offset" type="NX_INT"/>
        <field name="center" type="NX_NUMBER"/>
        <field name="height" type="NX_NUMBER"/>
        <field name="radii" type="NX_NUMBER"/>
    </group>
    <group type="NXcg_hexahedron_set" optional="true">
        <field name="dimensionality" type="NX_POSINT"/>
        <field name="cardinality" type="NX_POSINT"/>
        <field name="identifier_offset" type="NX_INT"/>
        <group name="hexahedra" type="NXcg_face_list_data_structure"/>
    </group>
    <!--Optional boolean mask.-->
    <group type="NXcs_filter_boolean_mask" optional="true">
        <field name="number_of_objects" type="NX_UINT"/>
        <field name="bitdepth" type="NX_UINT"/>
        <field name="mask" type="NX_UINT"/>
        <field name="identifier" type="NX_UINT"/>
    </group>
</group>
<!--Optional filter groups; no child elements are specified for them here.-->
<group name="evaporation_id_filter" type="NXsubsampling_filter" optional="true"/>
<group name="iontype_filter" type="NXmatch_filter" optional="true"/>
<group name="hit_multiplicity_filter" type="NXmatch_filter" optional="true"/>
<!--clustering_algorithm:
doc: |
Name of the clustering algorithm used.
enumeration: [dbscan] # , optics, hpdbscan]
dimensions:
rank: 1
dim: [[1, n_clust_algos]]-->
<field name="ion_type_filter">
    <doc>
        How should iontypes be interpreted/considered during the cluster analysis.
        Different options exist how iontypes are interpreted (if considered at all)
        given an iontype represents in general a (molecular) ion with different isotopes
        that have individually different multiplicity.

        The value resolve_all will set an ion active in the analysis
        regardless of which iontype it is.
        The value resolve_unknown will set an ion active when it is of the
        UNKNOWNTYPE.
        The value resolve_ion will set an ion active if it is of the
        specific iontype, regardless of its elemental or isotopic details.
        The value resolve_element will set an ion active, and most importantly,
        account as many times for it, as the (molecular) ion contains
        atoms of elements in the whitelist ion_query_isotope_vector.
        The value resolve_isotope will set an ion active, and most importantly,
        account as many times for it, as the (molecular) ion contains isotopes
        in the whitelist ion_query_isotope_vector.

        In effect, ion_query_isotope_vector acts as a whitelist to filter
        which ions are considered as source ions of the correlation statistics
        and how the multiplicity of each ion will be factorized.

        This is relevant as in atom probe we have the situation that a
        molecular ion with more than one nuclide, say TiO for example,
        is counted such that, although there is a single TiO molecular ion
        at a position, the cluster has two members. This multiplicity
        affects the size of the feature and chemical composition.
    </doc>
    <!--Only resolve_element is enumerated at present; the full set of values
    described in the doc above is kept here for reference:
    enumeration: [resolve_all, resolve_unknown, resolve_ion, resolve_element, resolve_isotope]-->
    <enumeration>
        <item value="resolve_element"/>
    </enumeration>
</field>
<field name="ion_query_isotope_vector" type="NX_UINT" units="NX_UNITLESS">
    <doc>
        Matrix of isotope vectors, as many rows as different candidates
        for iontypes should be distinguished as possible source iontypes.
        In the simplest case, the matrix contains only the proton number
        of the element in the row, all other values set to zero.
        Combined with ion_type_filter set to resolve_element this will
        recover usual spatial correlation statistics like the 1NN C-C
        spatial statistics.
    </doc>
    <!--One row per iontype (n_ions), one column per isotope vector entry (n_ivecmax).-->
    <dimensions rank="2">
        <dim index="1" value="n_ions"/>
        <dim index="2" value="n_ivecmax"/>
    </dimensions>
</field>
<group name="dbscan" type="NXprocess">
    <doc>
        Settings for DBScan clustering algorithm. For original details about the
        algorithms and (performance-relevant) details consider:

        * `M. Ester et al. &lt;https://dx.doi.org/10.5555/3001460.3001507&gt;`_
        * `M. Götz et al. &lt;https://dx.doi.org/10.1145/2834892.2834894&gt;`_

        For details about how the DBScan algorithm is the key behind the
        specific modification known as the maximum-separation method in the
        atom probe community consider
        `E. Jägle et al. &lt;https://dx.doi.org/10.1017/S1431927614013294&gt;`_
    </doc>
    <field name="high_throughput_method">
        <doc>
            Strategy how runs are performed with different parameters:

            * For tuple as many runs are performed as parameter values.
            * For combinatorics individual parameter arrays are looped over.

            As an example we may define eps with ten entries and min_pts with
            three entries. If high_throughput_method is tuple the analysis is
            invalid as we have an insufficient number of min_pts for the ten
            eps values.
            By contrast, for combinatorics paraprobe-clusterer will run three
            individual min_pts runs for each eps value, resulting in a total
            of 30 analyses.
            As an example the DBScan analysis reported in
            `M. Kühbach et al. &lt;https://dx.doi.org/10.1038/s41524-020-00486-1&gt;`_
            would have defined an array of eps values
            ``np.linspace(0.2, 5.0, num=241, endpoint=True)``, min_pts one,
            and high_throughput_method set to combinatorics.
        </doc>
        <enumeration>
            <item value="tuple"/>
            <item value="combinatorics"/>
        </enumeration>
    </field>
    <field name="eps" type="NX_FLOAT" units="NX_LENGTH">
        <doc>
            Array of epsilon (eps) parameter values.
        </doc>
        <dimensions rank="1">
            <dim index="1" value="i"/>
        </dimensions>
    </field>
    <field name="min_pts" type="NX_UINT" units="NX_UNITLESS">
        <doc>
            Array of minimum points (min_pts) parameter values.
        </doc>
        <dimensions rank="1">
            <dim index="1" value="j"/>
        </dimensions>
    </field>
</group>
<!--THE IDEA BEHIND paraprobe-clusterer is that users can run a number of
cluster analyses on their dataset on exactly the same point cloud and under
the same conditions-->
<group name="optics" type="NXprocess">
    <doc>
        Settings for the OPTICS clustering algorithm.

        * `M. Ankerst et al. &lt;https://dx.doi.org/10.1145/304181.304187&gt;`_
    </doc>
    <field name="high_throughput_method">
        <doc>
            Strategy how runs are performed with different parameters:

            * For tuple as many runs are performed as parameter values.
            * For combinatorics individual parameter arrays are looped over.

            See the explanation for the corresponding parameter for dbscan
            processes above-mentioned for further details.
        </doc>
        <enumeration>
            <item value="tuple"/>
            <item value="combinatorics"/>
        </enumeration>
    </field>
    <field name="min_pts" type="NX_UINT" units="NX_UNITLESS">
        <doc>
            Array of minimum points (min_pts) parameter values.
        </doc>
        <dimensions rank="1">
            <dim index="1" value="i"/>
        </dimensions>
    </field>
    <field name="max_eps" type="NX_FLOAT" units="NX_LENGTH">
        <doc>
            Array of maximum epsilon (eps) parameter values.
        </doc>
        <dimensions rank="1">
            <dim index="1" value="j"/>
        </dimensions>
    </field>
</group>
<group name="hdbscan" type="NXprocess">
    <doc>
        Settings for the HDBSCAN clustering algorithm.

        * `L. McInnes et al. &lt;https://dx.doi.org/10.21105/joss.00205&gt;`_
        * scikit-learn hdbscan library `&lt;https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html&gt;`_

        See also this documentation for details about the parameters.
        Here we use the terminology of the hdbscan documentation.
    </doc>
    <field name="high_throughput_method">
        <doc>
            Strategy how runs are performed with different parameters:

            * For tuple as many runs are performed as parameter values.
            * For combinatorics individual parameter arrays are looped over.

            See the explanation for the corresponding parameter for dbscan
            processes above-mentioned for further details.
        </doc>
        <enumeration>
            <item value="tuple"/>
            <item value="combinatorics"/>
        </enumeration>
    </field>
    <field name="min_cluster_size" type="NX_NUMBER" units="NX_ANY">
        <doc>
            Array of min_cluster_size parameter values.
        </doc>
        <dimensions rank="1">
            <dim index="1" value="i"/>
        </dimensions>
    </field>
    <field name="min_samples" type="NX_NUMBER" units="NX_ANY">
        <doc>
            Array of min_samples parameter values.
        </doc>
        <dimensions rank="1">
            <dim index="1" value="j"/>
        </dimensions>
    </field>
    <field name="cluster_selection_epsilon" type="NX_NUMBER" units="NX_ANY">
        <doc>
            Array of cluster_selection_epsilon parameter values.
        </doc>
        <dimensions rank="1">
            <dim index="1" value="k"/>
        </dimensions>
    </field>
    <field name="alpha" type="NX_NUMBER" units="NX_ANY">
        <doc>
            Array of alpha parameter values.
        </doc>
        <dimensions rank="1">
            <dim index="1" value="m"/>
        </dimensions>
    </field>
</group>
</group>
</group>
<!--ADD FURTHER ALGORITHMS, see L. Stephenson for further details
e.g. https://doi.org/10.1017/S1431927607070900-->
</definition>