diff --git a/HyperLogLog.pdf b/HyperLogLog.pdf new file mode 100644 index 0000000..40a86a5 Binary files /dev/null and b/HyperLogLog.pdf differ diff --git a/lib/hashes.js b/lib/hashes.js new file mode 100644 index 0000000..ea94d6a --- /dev/null +++ b/lib/hashes.js @@ -0,0 +1,29 @@ +var BitArray = require('bit-array'); +var murmurhash3 = require('murmurhash3'); + +module.exports = { + 'jenkins32': { + 'fn': function (str) { + var hash = 0; + + for (var i = 0; i < str.length; i++) { + hash += str.charCodeAt(i); + hash += hash << 10; + hash ^= hash >> 6; + } + + hash += hash << 3; + hash ^= hash >> 6; + hash += hash << 16; + + return (new BitArray([hash])).toArray().reverse(); + }, + 'b': 5, + }, + 'murmur128': { + 'fn': function (str) { + return (new BitArray(murmurhash3.murmur128Sync(str))).toArray().reverse(); + }, + 'b': 7, + }, +}; \ No newline at end of file diff --git a/lib/hyperloglog.js b/lib/hyperloglog.js deleted file mode 100644 index 6c3f3a4..0000000 --- a/lib/hyperloglog.js +++ /dev/null @@ -1,86 +0,0 @@ -module.exports.HyperLogLog = function (arr) { - var HASH_LENGTH = 32, // bites - HASH_K = 5; // HASH_LENGTH = 2 ^ HASH_K - - /** - * Jenkins hash function - * - * @url http://en.wikipedia.org/wiki/Jenkins_hash_function - * - * @param {String} str - * @return {Number} Hash - */ - function hash(str) { - var hash = 0; - - for (var i = 0, l = str.length; i < l; i++) { - hash += str.charCodeAt(i); - hash += hash << 10; - hash ^= hash >> 6; - } - - hash += hash << 3; - hash ^= hash >> 6; - hash += hash << 16; - - return hash; - } - - /** - * Offset of first 1-bit - * - * @example 00010 => 4 - * - * @param {Number} bites - * @return {Number} - */ - function scan1(bites) { - if (bites == 0) { - return HASH_LENGTH - HASH_K; - } - var offset = parseInt(Math.log(bites) / Math.log(2)); - offset = HASH_LENGTH - HASH_K - offset; - return offset; - } - - /** - * @param {String} $bites - * @param {Number} $start >=1 - * @param {Number} $end <= HASH_LENGTH - * - * @return {Number} slice of $bites - */ - function getBites(bites, start, end) { - var r = bites >> (HASH_LENGTH - end); - r = r & (Math.pow(2, end - start + 1) - 1); - - return r; - } - - var M = []; - for (var i = 0, l = arr.length; i < l; i++) { - var h = hash(arr[i]), - j = getBites(h, 1, HASH_K) + 1, - w = getBites(h, HASH_K + 1, HASH_LENGTH); - - w = scan1(w); - - if (typeof M[j] == 'undefined' || M[j] < w) { - M[j] = w; - } - } - - var alpha = 0.697122946; // 1 / (32 * integral(0,inf)( (log2(1+1/(1+x)))^32 dx)) - - var Z = 0; - for (var i = 1; i <= HASH_LENGTH; i++) { - if (typeof M[i] != 'undefined' && M[i] != 0) { - Z += 1 / Math.pow(2, M[i]); - } else { - Z += 1; - } - } - Z = alpha * HASH_LENGTH * HASH_LENGTH / Z; - - return parseInt(Z); -}; \ No newline at end of file diff --git a/lib/index.js b/lib/index.js index cf0d538..40ada4e 100644 --- a/lib/index.js +++ b/lib/index.js @@ -1,4 +1,108 @@ +var hashes = require('./hashes'); + +var configuration = { + 'hash': 'murmur128', +}; +var configure = function (settings) { + // TODO: Allow changing configuration before set construction +}; + +var convert = function (set) { + // TODO: Allow converting a naive set to a HyperLogLog set +}; + +/** + * Constructs a new cardinality set or reconstructs one from a serialization. + * Accepts one of the following: + * - an array of values to insert + * - an object whose keys should be inserted + * - a serialized representation of a set + * + * Note that while we quote the hyper log log algorithm extensively in this + * file, we use zero-indexed arrays so the logic may not look identical. We + * also use 0 instead of negative infinity for the purposes of easier JSON + * serialization. + */ +var set = function () { + if (!(this instanceof set)) { + // forces "new" + return set.apply(new set(), arguments); + } + var serialization = {}; + var array = []; + if (arguments.length === 1 && arguments[0] instanceof Array) { + array = arguments[0]; + } else if (arguments.length === 1 && arguments[0] instanceof Object) { + array = Object.keys(arguments[0]); + } else if (arguments.length === 1 && arguments[0] instanceof String) { + serialization = JSON.parse(arguments[0]); + } else if (arguments.length > 0) { + throw new Error('new set([array|object|serialized set])'); + } + // TODO: if the set size looks likely to remain small, use naive set until untenable. + this.hash = hashes[serialization['hash'] || configuration['hash']]; + if (!this.hash) { + throw new Error('Hash function "' + (serialization['hash'] || configuration['hash']) + '" not found'); + } + // select b from the set of positive integers, and set m as 2 to the power of b + // initialize a collection of m registers to negative infinity (here we use zero + // for serialization purposes, which is algorithmically indistinguishable) + this.M = serialization['table'] || []; + for (var j = 0; j < Math.pow(2, this.hash['b']); j++) { + if (this.M[j] === undefined) { + this.M[j] = 0; + } + } + + for (var i = 0; i < array.length; i++) { + // for value in array do + this.push(array[i]); + } + return this; +}; +set.prototype.push = function (v) { + // set x as hash of v + var x = this.hash['fn'](v); + // set j as 1 + the binary address determined by the first b bits of x + var j = ''; + for (var i = 0; i < this.hash['b']; i++) { + j += (x[i] ? '1' : '0'); + } + j = parseInt(j, 2); + // set w as the the binary value of the rest of the bits of x (omitted) + // set the register of M with index j to the maximum of the current value of + // said register or the value of rho(w), where rho(w) is the logical position of the + // leftmost non-zero bit in w. Note that although we are using "zero indexed" bits + // here, the value of rho(w) is used as an actual numerical value, NOT an index, + // so we add 1 to correspond with the algorithm. + var rhoW = x.indexOf(true, this.hash['b']); + if (rhoW === -1) { + rhoW = x.length - this.hash['b']; + } else { + rhoW -= this.hash['b']; + } + this.M[j] = Math.max(this.M[j], rhoW + 1); +}; +set.prototype.size = function () { + var Z = 0; + for (var j = 0; j < this.M.length; j++) { + Z += 1 / Math.pow(2, this.M[j]); + } + var alphas = { + 4: 0.673, + 5: 0.697, + 6: 0.709, + }; + return parseInt((alphas[this.hash['b']] || (0.7213 / (1 + 1.079 / this.M.length))) * this.M.length * this.M.length / Z); +}; +set.prototype.serialize = function () { + return JSON.stringify({ + 'hash': this.hash, + 'table': this.M, + }); +}; + module.exports = { - 'LogLog': require('./loglog').LogLog, - 'HyperLogLog': require('./hyperloglog').HyperLogLog, + 'configure': configure, + 'set': set, }; \ No newline at end of file diff --git a/lib/loglog.js b/lib/loglog.js deleted file mode 100644 index 234c1a3..0000000 --- a/lib/loglog.js +++ /dev/null @@ -1,84 +0,0 @@ -module.exports.LogLog = function (arr) { - var HASH_LENGTH = 32, // bites - HASH_K = 5; // HASH_LENGTH = 2 ^ HASH_K - - /** - * Jenkins hash function - * - * @url http://en.wikipedia.org/wiki/Jenkins_hash_function - * - * @param {String} str - * @return {Number} Hash - */ - function hash(str) { - var hash = 0; - - for (var i = 0, l = str.length; i < l; i++) { - hash += str.charCodeAt(i); - hash += hash << 10; - hash ^= hash >> 6; - } - - hash += hash << 3; - hash ^= hash >> 6; - hash += hash << 16; - - return hash; - } - - /** - * Offset of first 1-bit - * - * @example 00010 => 4 - * - * @param {Number} bites - * @return {Number} - */ - function scan1(bites) { - if (bites == 0) { - return HASH_LENGTH - HASH_K; - } - var offset = parseInt(Math.log(bites) / Math.log(2)); - offset = HASH_LENGTH - HASH_K - offset; - return offset; - } - - /** - * @param {String} $bites - * @param {Number} $start >=1 - * @param {Number} $end <= HASH_LENGTH - * - * @return {Number} slice of $bites - */ - function getBites(bites, start, end) { - var r = bites >> (HASH_LENGTH - end); - r = r & (Math.pow(2, end - start + 1) - 1); - - return r; - } - - var M = []; - for (var i = 0, l = arr.length; i < l; i++) { - var h = hash(arr[i]), - j = getBites(h, 1, HASH_K) + 1, - k = getBites(h, HASH_K + 1, HASH_LENGTH); - k = scan1(k); - - if (typeof M[j] == 'undefined' || M[j] < k) { - M[j] = k; - } - } - - var alpha = 0.77308249784697296; // (Gamma(-1/32) * (2^(-1/32) - 1) / ln2)^(-32) - - var E = 0; - for (var i = 1; i <= HASH_LENGTH; i++) { - if (typeof M[i] != 'undefined') { - E += M[i]; - } - } - E /= HASH_LENGTH; - E = alpha * HASH_LENGTH * Math.pow(2, E); - - return parseInt(E); -}; \ No newline at end of file diff --git a/package.json b/package.json index 6a1ea8f..b46d360 100644 --- a/package.json +++ b/package.json @@ -25,6 +25,10 @@ "fusy", "meunier" ], + "dependencies": { + "bit-array": "0.1.2", + "murmurhash3": "0.0.9" + }, "repository": { "type": "git", "url": "https://github.com/mattbornski/cardinality.git" diff --git a/test/basic.js b/test/basic.js index 1b48966..6786cc3 100644 --- a/test/basic.js +++ b/test/basic.js @@ -10,32 +10,52 @@ var assert = require('assert'); var harness = require('./harness'); var cardinality = require('../lib/index'); -describe('Log Log algorithm', function () { - it('should estimate the cardinality of variously sized sets within 2% accuracy, faster than naive counting', function (done) { +describe('Hyper Log Log algorithm', function () { + it('with short words', function (done) { var sizes = [ - 100, - 10000, 1000000, - 10000000, ]; - var results = harness.compare(harness.naiveCardinality, cardinality.LogLog, sizes); - console.log('LogLog'); - console.log(results); - return done(); + var results = harness.compare(harness.heavilyOverlappingShortWords, harness.naiveCardinality, cardinality.set, sizes); + var acceptable = true; + for (var index in sizes) { + console.log('for set of size ~' + sizes[index] + '...'); + console.log(' HyperLogLog estimated ' + results[index]['counts'][1] + ' items in ' + results[index]['times'][1] + ' ms'); + console.log(' Naive counting netted ' + results[index]['counts'][0] + ' items in ' + results[index]['times'][0] + ' ms'); + var deviation = (Math.abs(results[index]['counts'][0] - results[index]['counts'][1]) / results[index]['counts'][0]); + console.log(deviation); + if (deviation > 0.02) { + acceptable = false + } + } + if (!acceptable) { + return done(new Error('Results were unacceptably off of real values')); + } else { + return done(); + } }); }); describe('Hyper Log Log algorithm', function () { - it('should estimate the cardinality of variously sized sets within 2% accuracy, faster than naive counting', function (done) { + it('with BSON object id strings', function (done) { var sizes = [ - 100, - 10000, 1000000, - 10000000, ]; - var results = harness.compare(harness.naiveCardinality, cardinality.HyperLogLog, sizes); - console.log('HyperLogLog'); - console.log(results); - return done(); + var results = harness.compare(harness.lightlyOverlappingObjectIds, harness.naiveCardinality, cardinality.set, sizes); + var acceptable = true; + for (var index in sizes) { + console.log('for set of size ~' + sizes[index] + '...'); + console.log(' HyperLogLog estimated ' + results[index]['counts'][1] + ' items in ' + results[index]['times'][1] + ' ms'); + console.log(' Naive counting netted ' + results[index]['counts'][0] + ' items in ' + results[index]['times'][0] + ' ms'); + var deviation = (Math.abs(results[index]['counts'][0] - results[index]['counts'][1]) / results[index]['counts'][0]); + console.log(deviation); + if (deviation > 0.02) { + acceptable = false + } + } + if (!acceptable) { + return done(new Error('Results were unacceptably off of real values')); + } else { + return done(); + } }); }); \ No newline at end of file diff --git a/test/harness.js b/test/harness.js index 1dd3197..e3bc939 100644 --- a/test/harness.js +++ b/test/harness.js @@ -1,21 +1,37 @@ -var generateWords = function (count) { - var result = []; +module.exports = { + 'heavilyOverlappingShortWords': function (count) { + var result = []; - while (count > 0) { - var word = ''; - for (var j = 0; j < (parseInt(Math.random() * (8 - 1)) + 1); j++) { // from 1char to 8chars - word += String.fromCharCode(parseInt(Math.random() * (122 - 97)) + 97); // a-z - } + while (count > 0) { + var word = ''; + for (var j = 0; j < (parseInt(Math.random() * (8 - 1)) + 1); j++) { // from 1char to 8chars + word += String.fromCharCode(parseInt(Math.random() * (122 - 97)) + 97); // a-z + } - for (var i = 0; i < Math.random() * 100; i++) { - result.push(word); - count--; + for (var i = 0; i < Math.random() * 100; i++) { + result.push(word); + count--; + } } - } - return result; -}; + return result; + }, + 'lightlyOverlappingObjectIds': function (count) { + var result = []; -module.exports = { + while (count > 0) { + result.push(); + var word = ''; + for (var j = 0; j < 24; j++) { + word += '012456789abcdef'[Math.floor(Math.random() * 16)]; + } + + for (var i = 0; i < Math.random() * 2; i++) { + result.push(word); + count--; + } + } + return result; + }, 'naiveCardinality': function (arr) { var t = {}, r = 0; for (var i = 0, l = arr.length; i < l; i++) { @@ -24,19 +40,19 @@ module.exports = { r++; } } - return r; + return {'size': function () { return r; }}; }, - 'compare': function (f1, f2, sizes) { + 'compare': function (inputs, f1, f2, sizes) { var results = []; for (var index in sizes) { - var words = generateWords(sizes[index]); + var data = inputs(sizes[index]); var f1Start = Date.now(); - var f1Count = f1(words); + var f1Count = f1(data).size(); var f1End = Date.now(); var f2Start = Date.now(); - var f2Count = f2(words); + var f2Count = f2(data).size(); var f2End = Date.now(); results.push({ 'counts': [f1Count, f2Count],