/
CanonicalNodeNameOrder.java
339 lines (305 loc) · 13.4 KB
/
CanonicalNodeNameOrder.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
/* Copyright (c) 2012-2016 Boundless and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/org/documents/edl-v10.html
*
* Contributors:
* Gabriel Roldan (Boundless) - initial implementation
*/
package org.locationtech.geogig.model;
import java.io.Serializable;
import java.util.Comparator;
import com.google.common.base.Preconditions;
import com.google.common.primitives.UnsignedLong;
/**
* Defines storage order of {@link RevTree tree} {@link Node nodes} based on the non cryptographic
* 64-bit
* <a href="http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function">FNV-1a</a>
* variation of the "Fowler/Noll/Vo" hash algorithm.
* <p>
* This class mandates in which order {@link Node nodes} are stored inside {@link RevTree trees},
* hence defining the canonical order in which tree elements are traversed, regardless of in how
* many subtrees a given tree is split into.
* <p>
* The resulting structure where a given node (identified by its name) always falls on the same
* bucket (subtree) for a given subtree depth makes it possible to compute diffs between two trees
* very quickly, by traversing both trees in parallel and finding both bucket and node differences
* and skipping equal bucket trees, as two buckets at the same depth with the same contents will
* always hash out to the same {@link ObjectId}.
* <p>
* The FNV-1 hash for a node name is computed as in the following pseudo-code:
*
* <pre>
* <code>
* hash = FNV_offset_basis
* for each octet_of_data to be hashed
* hash = hash × FNV_prime
* hash = hash XOR octet_of_data
* return hash
* </code>
* </pre>
*
* Where {@code FNV_offset_basis} is the 64-bit literal {@code 0xcbf29ce484222325}, and
* {@code FNV_prime} is the 64-bit literal {@code 0x100000001b3}.
* <p>
* To compute the node name hash, each two-byte char in the node name produces two
* {@code octet_of_data}, in big-endian order.
* <p>
* This hash function proved to be extremely fast while maintaining a good distribution and low
* collision rate, and is widely used by the computer science industry as explained
* <a href="http://www.isthe.com/chongo/tech/comp/fnv/index.html#FNV-param">here</a> when speed is
* needed in contrast to cryptographic security.
*
* <p>
* The table bellow shows the {@link RevTree} structure and capacity in its canonical form as
* defined by this class, for the first eight levels of depth, after which {@code buckets per node}
* is always {@code 2} and {@code #features per leaf tree} always {@code 256}.
* <p>
* This structure is fixed and intends to balance out the growth of the trees as feature nodes are
* added to provide for a good maximum capacity without too many levels of nesting, nor too few at
* the cost of an exponential growth on the number of internal tree nodes.
*
* <pre>
* <code>
* Depth #buckets Leaf trees Total # of #features Max #features
* per node tree objects per leaf
* tree
* --------------------------------------------------------------------------
* 0 32 32 33 512 16,384
* 1 32 1,024 1,056 512 524,288
* 2 32 32,768 33,792 512 16,777,216
* 3 8 262,144 294,912 256 67,108,864
* 4 8 2,097,152 2,359,296 256 536,870,912
* 5 4 8,388,608 10,485,760 256 2,147,483,648
* 6 4 33,554,432 41,943,040 256 8,589,934,592
* 7 2 67,108,864 100,663,296 256 17,179,869,184
* </code>
* </pre>
*
* The {@code "#buckets per node"} column (which matches the {@link #maxBucketsForLevel} method)
* represents in how many subtrees a leaf tree that reached it's maximum capacity is split into at
* every depth index (where the leaf tree maximum capacity is given by the
* {@code "#features per leaf tree"} column, matching the method {@link #normalizedSizeLimit}).
* <p>
* The {@code "Leaf trees"} column represents how many leaf trees result from splitting the tree at
* that level of nesting.
* <p>
* The {@code "Max #features"} column shows what's the maximum number of feature nodes a tree split
* at that level of nesting can contain.
*
* @since 1.0
*/
public final class CanonicalNodeNameOrder implements Comparator<String>, Serializable {
private static final long serialVersionUID = -685759544293388523L;
private static final FNV1a64bitHash hashOrder = new FNV1a64bitHash();
public static final CanonicalNodeNameOrder INSTANCE = new CanonicalNodeNameOrder();
@Override
public int compare(String p1, String p2) {
return hashOrder.compare(p1, p2);
}
public int compare(final long longBits1, final String p1, final long longBits2, String p2) {
return hashOrder.compare(longBits1, p1, longBits2, p2);
}
/**
* Returns the canonical max size of a leaf tree for the given depth index; hard limit, can't be
* changed or would affect the hash of trees.
* <p>
* The combination of {@link #maxBucketsForLevel(int) maxBucketsForLevel(int depthIndex)} and
* {@code normalizedSizeLimit(int depthIndex)} defines the maximum capacity for a tree of a
* given depth.
*
* @param depthIndex the depth index (starting at zero) of a leaf for which to return the
* allowed maximum number of feature nodes before needing to split it into the number of
* buckets given by {@link #maxBucketsForLevel} for the same depth.
*
* @return {@code 512} for depth indexes {@code 0} to {@code 2}, and {@code 256} for depth
* indexes bigger than {@code 2}
* @since 1.0
*/
public static int normalizedSizeLimit(final int depthIndex) {
Preconditions.checkArgument(depthIndex > -1,
"depthIndex must be a positive integer or zero");
switch (depthIndex) {
case 0:
case 1:
case 2:
return 512;
default:
return 256;
}
}
/**
* Returns a positive integer defining in how many <i>bucket trees</i> a <i>leaf tree</i> that
* has reached its {@link #getNormalizedSizeLimit maximum capacity} is split into, at the given
* depth index.
* <p>
* E.g., if a feature tree is to be split into buckets, then this method must be called with
* {@code depthIndex == 0}; a bucket tree that's a direct child of a feature tree with
* {@code depthIndex == 1}, and so forth.
* <p>
* The combination of {@code maxBucketsForLevel(int depthIndex)} and
* {@link #normalizedSizeLimit(int) normalizedSizeLimit(int depthIndex)} defines the maximum
* capacity of a tree of a given depth.
*
* @param depthIndex the depth index (starting at zero) of a leaf tree that needs to be split
* into buckets.
* @return {@code 32} for depth indexes {@code 0} to {@code 2}; {@code 8} for depth indexes
* {@code 3} and {@code 4}; {@code 4} for depth indexes {@code 5} and {@code 6}; and
* {@code 2} for any depth index bigger than 6.
* @since 1.0
*/
public static int maxBucketsForLevel(final int depthIndex) {
Preconditions.checkArgument(depthIndex > -1,
"depthIndex must be a positive integer or zero (%s)", depthIndex);
switch (depthIndex) {
case 0:
case 1:
case 2:
return 32;
case 3:
case 4:
return 8;
case 5:
case 6:
return 4;
default:
return 2;
}
}
/**
* Computes the bucket index (zero based) that corresponds to the given node name at the given
* (zero based) depth.
*
* @return and Integer between zero and {@link #maxBucketsForLevel
* maxBucketsForLevel(depthIndex)} minus one
*/
public static Integer bucket(final String nodeName, final int depthIndex) {
final long longBits = hashOrder.fnvBits(nodeName);
final int bucket = hashOrder.bucket(longBits, depthIndex);
return Integer.valueOf(bucket);
}
public static int[] allBuckets(final String nodeName) {
final long longBits = hashOrder.fnvBits(nodeName);
final int maxDepth = 8;
int[] indices = new int[maxDepth];
for (int i = 0; i < maxDepth; i++) {
indices[i] = hashOrder.bucket(longBits, i);
}
return indices;
}
/**
* Given a feature name's {@link #hashCodeLong(String) long hashcode}, computes the bucket index
* that corresponds to the node name at the given depth.
*
* @return and Integer between zero and {@link #maxBucketsForLevel
* maxBucketsForLevel(depthIndex)} minus one
*/
public static int bucket(final long hashCodeLong, final int depthIndex) {
final int bucket = hashOrder.bucket(hashCodeLong, depthIndex);
return bucket;
}
/**
* Computes the
* <a href="http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function">FNV-1a
* </a> hash code for the given feature identifier, as an unsigned long.
*
* @param featureIdentifier
* @return an unsigned long representing the FNV-1a hash code for the given feature identifier.
*/
public UnsignedLong hashCodeLong(String featureIdentifier) {
UnsignedLong fnv = hashOrder.fnv(featureIdentifier);
return fnv;
}
/**
* The FNV-1a hash function used as {@link Node} storage order.
* <p>
* Should match the following formula:
*
* <pre>
* <code>
* hash = FNV_offset_basis
* for each octet_of_data to be hashed
* hash = hash × FNV_prime
* hash = hash XOR octet_of_data
* return hash
* </code>
* </pre>
*/
private static class FNV1a64bitHash implements Serializable {
private static final long serialVersionUID = -1931193743208260766L;
private static final UnsignedLong FNV64_OFFSET_BASIS = UnsignedLong
.valueOf("14695981039346656037");
private static final UnsignedLong FNV64_PRIME = UnsignedLong.valueOf("1099511628211");
public int compare(final String p1, final String p2) {
final long longBits1 = fnvBits(p1);
final long longBits2 = fnvBits(p2);
return compare(longBits1, p1, longBits2, p2);
}
private int compare(final long longBits1, final String p1, final long longBits2,
final String p2) {
for (int i = 0; i < 8; i++) {
int bucket1 = bucket(longBits1, i);
int bucket2 = bucket(longBits2, i);
if (bucket1 > bucket2) {
return 1;
} else if (bucket2 > bucket1) {
return -1;
}
}
if (!p1.equals(p2)) {
// They fall on the same bucket all the way down to the last level. Fall back to
// canonical string sorting
return p1.compareTo(p2);
}
return 0;
}
private int bucket(long longBits, int depthIndex) {
final int byteN = byteN(longBits, depthIndex);
final int maxBuckets = CanonicalNodeNameOrder.maxBucketsForLevel(depthIndex);
final int bucket = (byteN * maxBuckets) / 256;
return bucket;
}
private UnsignedLong fnv(CharSequence chars) {
long hash = fnvBits(chars);
return UnsignedLong.fromLongBits(hash);
}
private long fnvBits(CharSequence chars) {
final int length = chars.length();
long hash = FNV64_OFFSET_BASIS.longValue();
for (int i = 0; i < length; i++) {
char c = chars.charAt(i);
byte b1 = (byte) (c >> 8);
byte b2 = (byte) c;
hash = update(hash, b1);
hash = update(hash, b2);
}
return hash;
}
private static long update(long hash, final byte octet) {
// it's ok to use the signed long value here, its a bitwise operation anyways, and its
// on the lower byte of the long value
final long longValue = hash;
final long bits = longValue ^ octet;
// convert back to unsigned long
// hash = UnsignedLong.fromLongBits(bits);
// multiply by prime
// /hash = hash.times(FNV64_PRIME);
// this does the same than the above commented out block without an extra object
// allocation per call
return bits * FNV64_PRIME.longValue();
// return hash;
}
/**
* Returns the Nth unsigned byte in the hash of a given node name where N is given by
* {@code depth}
*/
private int byteN(final long longBits, final int depth) {
Preconditions.checkArgument(depth < 8, "depth too deep: %s", Integer.valueOf(depth));
final int displaceBits = 8 * (7 - depth);// how many bits to right shift longBits to get
// the byte N
final int byteN = ((byte) (longBits >> displaceBits)) & 0xFF;
return byteN;
}
}
}