Skip to content

Commit

Permalink
First draft node module which uses Hyper Log Log to estimate set card…
Browse files Browse the repository at this point in the history
…inality
  • Loading branch information
mattbornski committed May 10, 2012
1 parent d7b207b commit 008c16a
Show file tree
Hide file tree
Showing 8 changed files with 211 additions and 208 deletions.
Binary file added HyperLogLog.pdf
Binary file not shown.
29 changes: 29 additions & 0 deletions lib/hashes.js
@@ -0,0 +1,29 @@
var BitArray = require('bit-array');
var murmurhash3 = require('murmurhash3');

module.exports = {
'jenkins32': {
'fn': function (str) {
var hash = 0;

for (var i = 0; i < str.length; i++) {
hash += str.charCodeAt(i);
hash += hash << 10;
hash ^= hash >> 6;
}

hash += hash << 3;
hash ^= hash >> 6;
hash += hash << 16;

return (new BitArray([hash])).toArray().reverse();
},
'b': 5,
},
'murmur128': {
'fn': function (str) {
return (new BitArray(murmurhash3.murmur128Sync(str))).toArray().reverse();
},
'b': 7,
},
};
86 changes: 0 additions & 86 deletions lib/hyperloglog.js

This file was deleted.

108 changes: 106 additions & 2 deletions lib/index.js
@@ -1,4 +1,108 @@
var hashes = require('./hashes');

// Module-wide defaults, consulted whenever a set is constructed without a
// serialization that names its own hash.
var configuration = {
  'hash': 'murmur128',
};

/**
 * Changes the module-wide defaults used by sets constructed afterwards.
 * Sets that already exist are not affected.
 *
 * Recognised settings:
 *  - 'hash': name of an entry in ./hashes to use as the default hash.
 *
 * Calling with no argument (or with no recognised keys) is a no-op.
 *
 * @param {Object} [settings] settings to apply
 * @throws {Error} if settings.hash does not name a known hash function
 */
var configure = function (settings) {
  if (settings === undefined || settings === null) {
    return;
  }
  var hashName = settings['hash'];
  if (hashName !== undefined) {
    // Validate up front so a typo fails here rather than at set construction.
    if (!Object.prototype.hasOwnProperty.call(hashes, hashName)) {
      throw new Error('Hash function "' + hashName + '" not found');
    }
    configuration['hash'] = hashName;
  }
};

/**
 * Placeholder for turning a naive set into a HyperLogLog set.
 * Not implemented yet: does nothing and returns undefined.
 */
function convert(set) {
  // TODO: Allow converting a naive set to a HyperLogLog set
}

/**
 * Constructs a new cardinality set or reconstructs one from a serialization.
 * Accepts at most one argument, which is one of the following:
 * - an array of values to insert
 * - an object whose keys should be inserted
 * - a serialized representation of a set (a JSON string, as produced by
 *   serialize())
 *
 * May be called with or without "new".
 *
 * Throws an Error if the argument is of any other kind, if more than one
 * argument is given, or if the selected hash function is unknown.
 *
 * Note that while we quote the hyper log log algorithm extensively in this
 * file, we use zero-indexed arrays so the logic may not look identical. We
 * also use 0 instead of negative infinity for the purposes of easier JSON
 * serialization.
 */
var set = function () {
  if (!(this instanceof set)) {
    // forces "new"
    return set.apply(new set(), arguments);
  }
  var serialization = {};
  var array = [];
  if (arguments.length === 1) {
    var source = arguments[0];
    // Strings must be tested first: a primitive string is not
    // "instanceof String", and a String object is also "instanceof Object".
    if (typeof source === 'string' || source instanceof String) {
      serialization = JSON.parse(String(source));
      if (serialization === null || typeof serialization !== 'object' || serialization instanceof Array) {
        throw new Error('new set([array|object|serialized set])');
      }
    } else if (source instanceof Array) {
      array = source;
    } else if (source instanceof Object) {
      array = Object.keys(source);
    } else {
      throw new Error('new set([array|object|serialized set])');
    }
  } else if (arguments.length > 0) {
    throw new Error('new set([array|object|serialized set])');
  }
  // TODO: if the set size looks likely to remain small, use naive set until untenable.
  // Remember the hash by name as well as by descriptor: the descriptor holds
  // a function, which cannot survive JSON serialization.
  var hashName = serialization['hash'] || configuration['hash'];
  this.hash = hashes[hashName];
  if (!this.hash) {
    throw new Error('Hash function "' + hashName + '" not found');
  }
  this.hashName = hashName;
  // select b from the set of positive integers, and set m as 2 to the power of b
  // initialize a collection of m registers to negative infinity (here we use zero
  // for serialization purposes, which is algorithmically indistinguishable)
  this.M = serialization['table'] || [];
  var m = Math.pow(2, this.hash['b']);
  for (var j = 0; j < m; j++) {
    if (this.M[j] === undefined) {
      this.M[j] = 0;
    }
  }

  for (var i = 0; i < array.length; i++) {
    // for value in array do
    this.push(array[i]);
  }
  return this;
};

/**
 * Adds a value to the set.
 *
 * The value is passed to the configured hash function, which must return an
 * array of bits. The first b bits choose a register; the register is raised
 * to rho(w), the 1-based position of the first set bit among the remaining
 * bits (or their count plus one when none is set).
 */
set.prototype.push = function (v) {
  var b = this.hash['b'];
  // set x as hash of v
  var x = this.hash['fn'](v);
  // set j as the binary address determined by the first b bits of x
  // (zero-indexed here, so no "1 +" as in the paper)
  var j = 0;
  for (var i = 0; i < b; i++) {
    j = j * 2 + (x[i] ? 1 : 0);
  }
  // set w as the the binary value of the rest of the bits of x (omitted)
  // set the register of M with index j to the maximum of the current value of
  // said register or the value of rho(w), where rho(w) is the logical position of the
  // leftmost non-zero bit in w. Note that although we are using "zero indexed" bits
  // here, the value of rho(w) is used as an actual numerical value, NOT an index,
  // so we add 1 to correspond with the algorithm.
  var firstSet = x.indexOf(true, b);
  var rhoW = (firstSet === -1) ? (x.length - b) : (firstSet - b);
  this.M[j] = Math.max(this.M[j], rhoW + 1);
};

/**
 * Estimates the number of distinct values pushed so far.
 *
 * Computes the raw HyperLogLog estimate alpha * m^2 / sum(2^-M[j]) and, when
 * that estimate is small (<= 2.5 * m) and some registers are still empty,
 * switches to linear counting, m * ln(m / V), as the algorithm prescribes.
 * Without that correction an empty set would report roughly alpha * m.
 *
 * Returns a non-negative integer.
 */
set.prototype.size = function () {
  var m = this.M.length;
  var Z = 0;
  var V = 0; // number of registers never written to
  for (var j = 0; j < m; j++) {
    Z += 1 / Math.pow(2, this.M[j]);
    if (this.M[j] === 0) {
      V++;
    }
  }
  // Bias-correction constants for m = 16, 32 and 64 (keyed by b); larger m
  // uses the closed-form approximation.
  var alphas = {
    4: 0.673,
    5: 0.697,
    6: 0.709,
  };
  var alpha = alphas[this.hash['b']] || (0.7213 / (1 + 1.079 / m));
  var E = alpha * m * m / Z;
  if (E <= 2.5 * m && V > 0) {
    // small range correction
    E = m * Math.log(m / V);
  }
  return Math.floor(E);
};

/**
 * Serializes the set to a JSON string holding the name of its hash function
 * and its register table. The result can be passed back to set() to
 * reconstruct an equivalent set.
 */
set.prototype.serialize = function () {
  return JSON.stringify({
    'hash': this.hashName,
    'table': this.M,
  });
};

module.exports = {
'LogLog': require('./loglog').LogLog,
'HyperLogLog': require('./hyperloglog').HyperLogLog,
'configure': configure,
'set': set,
};
84 changes: 0 additions & 84 deletions lib/loglog.js

This file was deleted.

4 changes: 4 additions & 0 deletions package.json
Expand Up @@ -25,6 +25,10 @@
"fusy",
"meunier"
],
"dependencies": {
"bit-array": "0.1.2",
"murmurhash3": "0.0.9"
},
"repository": {
"type": "git",
"url": "https://github.com/mattbornski/cardinality.git"
Expand Down
54 changes: 37 additions & 17 deletions test/basic.js
Expand Up @@ -10,32 +10,52 @@ var assert = require('assert');
var harness = require('./harness');
var cardinality = require('../lib/index');

/**
 * Compares the HyperLogLog estimate against naive counting for each requested
 * set size and reports to mocha through `done`.
 *
 * For every size it logs both counts and timings plus the relative deviation,
 * and fails the test if any deviation exceeds 2%.
 *
 * generator - harness data generator handed to harness.compare
 * sizes     - array of approximate set sizes to try
 * done      - mocha completion callback
 */
var checkAccuracy = function (generator, sizes, done) {
  // In each result, index 0 is the naive counter and index 1 is cardinality.set,
  // matching the order they are passed to harness.compare.
  var results = harness.compare(generator, harness.naiveCardinality, cardinality.set, sizes);
  var acceptable = true;
  for (var index = 0; index < sizes.length; index++) {
    var counts = results[index]['counts'];
    var times = results[index]['times'];
    console.log('for set of size ~' + sizes[index] + '...');
    console.log(' HyperLogLog estimated ' + counts[1] + ' items in ' + times[1] + ' ms');
    console.log(' Naive counting netted ' + counts[0] + ' items in ' + times[0] + ' ms');
    var deviation = Math.abs(counts[0] - counts[1]) / counts[0];
    console.log(deviation);
    if (deviation > 0.02) {
      acceptable = false;
    }
  }
  if (!acceptable) {
    return done(new Error('Results were unacceptably off of real values'));
  }
  return done();
};

describe('Hyper Log Log algorithm', function () {
  it('with short words', function (done) {
    var sizes = [
      100,
      10000,
      1000000,
      10000000,
    ];
    return checkAccuracy(harness.heavilyOverlappingShortWords, sizes, done);
  });
});

describe('Hyper Log Log algorithm', function () {
  it('with BSON object id strings', function (done) {
    var sizes = [
      100,
      10000,
      1000000,
      10000000,
    ];
    return checkAccuracy(harness.lightlyOverlappingObjectIds, sizes, done);
  });
});

0 comments on commit 008c16a

Please sign in to comment.