From ea82f5b5be6f73634c73ce031434ae32c4c66c59 Mon Sep 17 00:00:00 2001 From: Kevin Matulef Date: Thu, 3 May 2012 17:37:28 -0400 Subject: [PATCH] SERVER-2001 part 1: hashing BSONElements --- jstests/hashtest1.js | 78 +++++++++++ src/mongo/SConscript | 2 + src/mongo/bson/bsonelement.h | 63 +++++++-- src/mongo/db/commands/hashcmd.cpp | 70 ++++++++++ src/mongo/db/hasher.cpp | 94 +++++++++++++ src/mongo/db/hasher.h | 96 ++++++++++++++ src/mongo/dbtests/jsobjhashingtests.cpp | 169 ++++++++++++++++++++++++ 7 files changed, 564 insertions(+), 8 deletions(-) create mode 100644 jstests/hashtest1.js create mode 100644 src/mongo/db/commands/hashcmd.cpp create mode 100644 src/mongo/db/hasher.cpp create mode 100644 src/mongo/db/hasher.h create mode 100644 src/mongo/dbtests/jsobjhashingtests.cpp diff --git a/jstests/hashtest1.js b/jstests/hashtest1.js new file mode 100644 index 0000000000000..981a0c3687746 --- /dev/null +++ b/jstests/hashtest1.js @@ -0,0 +1,78 @@ +//hashtest1.js +//Simple tests to check hashing of various types +//make sure that different numeric types hash to same thing, and other sanity checks + +var hash = function( v , seed ){ + if (seed) + return db.runCommand({"_hashBSONElement" : v , "seed" : seed})["out"]; + else + return db.runCommand({"_hashBSONElement" : v})["out"]; +}; + +var oidHash = hash( ObjectId() ); +var oidHash2 = hash( ObjectId() ); +var oidHash3 = hash( ObjectId() ); +assert(! friendlyEqual( oidHash, oidHash2) , "ObjectIDs should hash to different things"); +assert(! friendlyEqual( oidHash, oidHash3) , "ObjectIDs should hash to different things"); +assert(! 
friendlyEqual( oidHash2, oidHash3) , "ObjectIDs should hash to different things"); + +var intHash = hash( NumberInt(3) ); +var doubHash = hash( 3 ); +var doubHash2 = hash( 3.0 ); +var longHash = hash( NumberLong(3) ); +var fracHash = hash( NumberInt(3.5) ); +assert.eq( intHash , doubHash ); +assert.eq( intHash , doubHash2 ); +assert.eq( intHash , longHash ); +assert.eq( intHash , fracHash ); + +var trueHash = hash( true ); +var falseHash = hash( false ); +assert(! friendlyEqual( trueHash, falseHash) , "true and false should hash to different things"); + +var nullHash = hash( null ); +assert(! friendlyEqual( falseHash , nullHash ) , "false and null should hash to different things"); + +var dateHash = hash( new Date() ); +sleep(1); +var isodateHash = hash( ISODate() ); +assert(! friendlyEqual( dateHash, isodateHash) , "different dates should hash to different things"); + +var stringHash = hash( "3" ); +assert(! friendlyEqual( intHash , stringHash ), "3 and \"3\" should hash to different things"); + +var regExpHash = hash( RegExp("3") ); +assert(! friendlyEqual( stringHash , regExpHash) , "\"3\" and RegExp(3) should hash to different things"); + +var intHash4 = hash( 4 ); +assert(! friendlyEqual( intHash , intHash4 ), "3 and 4 should hash to different things"); + +var intHashSeeded = hash( 4 , 3 ); +assert(! friendlyEqual(intHash4 , intHashSeeded ), "different seeds should make different hashes"); + +var minkeyHash = hash( MinKey ); +var maxkeyHash = hash( MaxKey ); +assert(! friendlyEqual(minkeyHash , maxkeyHash ), "minkey and maxkey should hash to different things"); + +var arrayHash = hash( [0,1.0,NumberLong(2)] ); +var arrayHash2 = hash( [0,NumberInt(1),2] ); +assert.eq( arrayHash , arrayHash2 , "didn't squash numeric types in array"); + +var objectHash = hash( {"0":0, "1" : NumberInt(1), "2" : 2} ); +assert(! 
friendlyEqual(objectHash , arrayHash2) , "arrays and sub-objects should hash to different things"); + +var c = hash( {a : {}, b : 1} ); +var d = hash( {a : {b : 1}} ); +assert(! friendlyEqual( c , d ) , "hashing doesn't group sub-docs and fields correctly"); + +var e = hash( {a : 3 , b : [NumberLong(3), {c : NumberInt(3)}]} ); +var f = hash( {a : NumberLong(3) , b : [NumberInt(3), {c : 3.0}]} ); +assert.eq( e , f , "recursive number squashing doesn't work"); + +var nanHash = hash( 0/0 ); +var zeroHash = hash( 0 ); +assert.eq( nanHash , zeroHash , "NaN and Zero should hash to the same thing"); + + +//should also test that CodeWScope hashes correctly +//but waiting for SERVER-3391 (CodeWScope support in shell) \ No newline at end of file diff --git a/src/mongo/SConscript b/src/mongo/SConscript index c6697824e1770..fb7ff17b9d467 100644 --- a/src/mongo/SConscript +++ b/src/mongo/SConscript @@ -28,6 +28,7 @@ commonFiles = [ "pch.cpp", "db/jsobj.cpp", "bson/oid.cpp", "db/json.cpp", + "db/hasher.cpp", "db/lasterror.cpp", "db/namespace.cpp", "db/nonce.cpp", @@ -94,6 +95,7 @@ coreServerFiles = [ "util/version.cpp", "db/dbcommands_generic.cpp", "db/commands/cloud.cpp", "db/dbmessage.cpp", + "db/commands/hashcmd.cpp", "db/commands/pipeline.cpp", "db/indexkey.cpp", "db/pipeline/accumulator.cpp", diff --git a/src/mongo/bson/bsonelement.h b/src/mongo/bson/bsonelement.h index 3edaa1ed7a9b9..e7ea3d8dc775c 100644 --- a/src/mongo/bson/bsonelement.h +++ b/src/mongo/bson/bsonelement.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include "util/builder.h" #include "bsontypes.h" @@ -144,6 +145,13 @@ namespace mongo { return data + 1; } + + int fieldNameSize() const { + if ( fieldNameSize_ == -1 ) + fieldNameSize_ = (int)strlen( fieldName() ) + 1; + return fieldNameSize_; + } + /** raw data of the element's value (so be careful). 
*/ const char * value() const { return (data + fieldNameSize() + 1); @@ -192,8 +200,18 @@ namespace mongo { /** Retrieve int value for the element safely. Zero returned if not a number. */ int numberInt() const; - /** Retrieve long value for the element safely. Zero returned if not a number. */ + /** Retrieve long value for the element safely. Zero returned if not a number. + * Behavior is not defined for double values that are NaNs, or too large/small + * to be represented by long longs */ long long numberLong() const; + + /** Like numberLong() but with well-defined behavior for doubles that + * are NaNs, or too large/small to be represented as long longs. + * NaNs -> 0 + * very large doubles -> LLONG_MAX + * very small doubles -> LLONG_MIN */ + long long safeNumberLong() const; + /** Retrieve the numeric value of the element. If not of a numeric type, returns 0. Note: casts to double, data loss may occur with large (>52 bit) NumberLong values. */ @@ -243,11 +261,20 @@ namespace mongo { /** Get javascript code of a CodeWScope data element. */ const char * codeWScopeCode() const { - return value() + 8; + massert( 16177 , "not codeWScope" , type() == CodeWScope ); + return value() + 4 + 4; //two ints precede code (see BSON spec) + } + + /** Get length of the code part of the CodeWScope object + * This INCLUDES the null char at the end */ + int codeWScopeCodeLen() const { + massert( 16178 , "not codeWScope" , type() == CodeWScope ); + return *(int *)( value() + 4 ); } + /** Get the scope SavedContext of a CodeWScope data element. 
*/ const char * codeWScopeScopeData() const { - // TODO fix + //This can error if there are null chars in the codeWScopeCode return codeWScopeCode() + strlen( codeWScopeCode() ) + 1; } @@ -413,11 +440,7 @@ namespace mongo { private: const char *data; mutable int fieldNameSize_; // cached value - int fieldNameSize() const { - if ( fieldNameSize_ == -1 ) - fieldNameSize_ = (int)strlen( fieldName() ) + 1; - return fieldNameSize_; - } + mutable int totalSize; /* caches the computed size */ friend class BSONObjIterator; @@ -574,6 +597,30 @@ namespace mongo { } } + /** Like numberLong() but with well-defined behavior for doubles that + * are NaNs, or too large/small to be represented as long longs. + * NaNs -> 0 + * very large doubles (>= 2^63) -> LLONG_MAX + * very small doubles (< -2^63) -> LLONG_MIN */ + inline long long BSONElement::safeNumberLong() const { + double d; + switch( type() ) { + case NumberDouble: + d = numberDouble(); + if ( std::isnan( d ) ){ + return 0; + } + if ( d >= (double) std::numeric_limits<long long>::max() ){ //(double)LLONG_MAX rounds up to 2^63, which is itself out of range, hence >= + return std::numeric_limits<long long>::max(); + } + if ( d < std::numeric_limits<long long>::min() ){ + return std::numeric_limits<long long>::min(); + } //in-range doubles intentionally fall through to numberLong() + default: + return numberLong(); + } + } + inline BSONElement::BSONElement() { static char z = 0; data = &z; diff --git a/src/mongo/db/commands/hashcmd.cpp b/src/mongo/db/commands/hashcmd.cpp new file mode 100644 index 0000000000000..f4f85c0e02b4a --- /dev/null +++ b/src/mongo/db/commands/hashcmd.cpp @@ -0,0 +1,70 @@ +/* hashcmd.cpp + * + * Defines a shell command for hashing a BSONElement value + */ + + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "mongo/db/commands.h" +#include "mongo/db/hasher.h" + +namespace mongo { + + class CmdHashElt : public Command { + public: + CmdHashElt() : Command("_hashBSONElement") {}; + virtual LockType locktype() const { return NONE; } + virtual bool slaveOk() const { return true; } + virtual void help( stringstream& help ) const { + help << "returns the hash of the first BSONElement val in a BSONObj"; + } + + /* CmdObj has the form {"_hashBSONElement" : <bsonElt>} + * or {"_hashBSONElement" : <bsonElt>, "seed" : <number>} + * Result has the form + * {"key" : <bsonElt>, "seed" : <int>, "out": NumberLong(<hash>)} + * + * Example use in the shell: + *> db.runCommand({_hashBSONElement: "hashthis", seed: 1}) + *> {"key" : "hashthis", + *> "seed" : 1, + *> "out" : NumberLong(6271151123721111923), + *> "ok" : 1 } + **/ + bool run( const string& db, + BSONObj& cmdObj, + int options, string& errmsg, + BSONObjBuilder& result, + bool fromRepl = false ){ + result.appendAs(cmdObj.firstElement(),"key"); + + int seed = 0; + if (cmdObj.hasField("seed")){ + if (! cmdObj["seed"].isNumber()) { + errmsg += "seed must be a number"; + return false; + } + seed = cmdObj["seed"].numberInt(); + } + result.append( "seed" , seed ); + + result.append( "out" , BSONElementHasher::hash64( cmdObj.firstElement() , seed ) ); + return true; + } + } cmdHashElt; +} diff --git a/src/mongo/db/hasher.cpp b/src/mongo/db/hasher.cpp new file mode 100644 index 0000000000000..b6603523f5ffc --- /dev/null +++ b/src/mongo/db/hasher.cpp @@ -0,0 +1,94 @@ +/* hasher.cpp + * + * Defines a simple hash function class + */ + + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "mongo/db/hasher.h" +#include "mongo/db/jsobj.h" + +namespace mongo { + + Hasher::Hasher( HashSeed seed ) : _seed( seed ) { + md5_init( &_md5State ); + md5_append( &_md5State , reinterpret_cast< const md5_byte_t * >( & _seed ) , sizeof( _seed ) ); + } + + void Hasher::addData( const void * keyData , size_t numBytes ) { + md5_append( &_md5State , static_cast< const md5_byte_t * >( keyData ), numBytes ); + } + + void Hasher::finish( HashDigest out ) { + md5_finish( &_md5State , out ); + } + + long long int BSONElementHasher::hash64( const BSONElement& e , HashSeed seed ){ + scoped_ptr<Hasher> h( HasherFactory::createHasher( seed ) ); + recursiveHash( h.get() , e , false ); + HashDigest d; + h->finish(d); + //HashDigest is actually 16 bytes, but we just get 8 via truncation + // NOTE: assumes little-endian + return *reinterpret_cast< long long int * >( d ); + } + + void BSONElementHasher::recursiveHash( Hasher* h , + const BSONElement& e , + bool includeFieldName ) { + + int canonicalType = e.canonicalType(); + h->addData( &canonicalType , sizeof( canonicalType ) ); + + if ( includeFieldName ){ + h->addData( e.fieldName() , e.fieldNameSize() ); + } + + if ( !e.mayEncapsulate() ){ + //if there are no embedded objects (subobjects or arrays), + //compute the hash, squashing numeric types to 64-bit ints + if ( e.isNumber() ){ + long long int i = e.safeNumberLong(); //well-defined for troublesome doubles + h->addData( &i , sizeof( i ) ); + } + else { + h->addData( e.value() , e.valuesize() ); + } + } + else { + //else identify the subobject. 
+ //hash any preceding stuff (in the case of codeWscope) + //then each sub-element + //then finish with the EOO element. + BSONObj b; + if ( e.type() == CodeWScope ) { + h->addData( e.codeWScopeCode() , e.codeWScopeCodeLen() ); + b = e.codeWScopeObject(); + } + else { + b = e.embeddedObject(); + } + BSONObjIterator i(b); + while( i.moreWithEOO() ) { + BSONElement el = i.next(); + recursiveHash( h , el , true ); + } + } + } + +} diff --git a/src/mongo/db/hasher.h b/src/mongo/db/hasher.h new file mode 100644 index 0000000000000..a303460d442b5 --- /dev/null +++ b/src/mongo/db/hasher.h @@ -0,0 +1,96 @@ +/* hasher.h + * + * Defines a simple hash function class + */ + + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#pragma once + +#include "pch.h" +#include "mongo/bson/bsonelement.h" +#include "mongo/util/md5.hpp" + +namespace mongo { + + typedef int HashSeed; + typedef unsigned char HashDigest[16]; + + class Hasher : private boost::noncopyable { + public: + + explicit Hasher( HashSeed seed ); + ~Hasher() { }; + + //pointer to next part of input key, length in bytes to read + void addData( const void * keyData , size_t numBytes ); + + //finish computing the hash, put the result in the digest + //only call this once per Hasher + void finish( HashDigest out ); + + private: + md5_state_t _md5State; + HashSeed _seed; + }; + + class HasherFactory : private boost::noncopyable { + public: + /* Eventually this may be a more sophisticated factory + * for creating other hashers, but for now use MD5. + */ + static Hasher* createHasher( HashSeed seed ) { + return new Hasher( seed ); + } + + private: + HasherFactory(); + }; + + class BSONElementHasher : private boost::noncopyable { + public: + /* This computes a 64-bit hash of the value part of BSONElement "e", + * preceded by the seed "seed". Squashes element (and any sub-elements) + * of the same canonical type, so hash({a:{b:4}}) will be the same + * as hash({a:{b:4.1}}). In particular, this squashes doubles to 64-bit long + * ints via truncation, so floating point values round towards 0 to the + * nearest int representable as a 64-bit long. + * + * This function is used in the computation of hashed indexes + * and hashed shard keys, and thus should not be changed unless + * the associated "getKeys" and "makeSingleKey" method in the + * hashindex type is changed accordingly. + */ + static long long int hash64( const BSONElement& e , HashSeed seed ); + + private: + BSONElementHasher(); + + /* This incrementally computes the hash of BSONElement "e" + * using hash function "h". If "includeFieldName" is true, + * then the name of the field is hashed in between the type of + * the element and the element value. 
The hash function "h" + * is applied recursively to any sub-elements (arrays/sub-documents), + * squashing elements of the same canonical type. + * Used as a helper for hash64 above. + */ + static void recursiveHash( Hasher* h , const BSONElement& e , bool includeFieldName ); + + }; + +} diff --git a/src/mongo/dbtests/jsobjhashingtests.cpp b/src/mongo/dbtests/jsobjhashingtests.cpp new file mode 100644 index 0000000000000..8481965f42c45 --- /dev/null +++ b/src/mongo/dbtests/jsobjhashingtests.cpp @@ -0,0 +1,169 @@ +// jsobjhashingtests.cpp - Tests for hasher.{h,cpp} code +// + +/** + * Copyright (C) 2012 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + + +#include "mongo/dbtests/dbtests.h" +#include "mongo/db/hasher.h" + +namespace JsobjHashingTests { + + class BSONElementHashingTest { + public: + void run() { + int seed = 0; + + //test different oids hash to different things + long long int oidHash = BSONElementHasher::hash64( + BSONObjBuilder().genOID().obj().firstElement() , seed ); + long long int oidHash2 = BSONElementHasher::hash64( + BSONObjBuilder().genOID().obj().firstElement() , seed ); + long long int oidHash3 = BSONElementHasher::hash64( + BSONObjBuilder().genOID().obj().firstElement() , seed ); + + ASSERT_NOT_EQUALS( oidHash , oidHash2 ); + ASSERT_NOT_EQUALS( oidHash , oidHash3 ); + ASSERT_NOT_EQUALS( oidHash3 , oidHash2 ); + + //test 32-bit ints, 64-bit ints, doubles hash to same thing + int i = 3; + BSONObj p1 = BSON("a" << i); + long long int intHash = BSONElementHasher::hash64( p1.firstElement() , seed ); + + long long int ilong = 3; + BSONObj p2 = BSON("a" << ilong); + long long int longHash = BSONElementHasher::hash64( p2.firstElement() , seed ); + + double d = 3.1; + BSONObj p3 = BSON("a" << d); + long long int doubleHash = BSONElementHasher::hash64( p3.firstElement() , seed ); + + ASSERT_EQUALS( intHash, longHash ); + ASSERT_EQUALS( doubleHash, longHash ); + + //test different ints don't hash to same thing + BSONObj p4 = BSON("a" << 4); + long long int intHash4 = BSONElementHasher::hash64( p4.firstElement() , seed ); + ASSERT_NOT_EQUALS( intHash , intHash4 ); + + //test seed makes a difference + long long int intHash4Seed = BSONElementHasher::hash64( p4.firstElement() , 1 ); + ASSERT_NOT_EQUALS( intHash4 , intHash4Seed ); + + //test strings hash to different things + BSONObj p5 = BSON("a" << "3"); + long long int stringHash = BSONElementHasher::hash64( p5.firstElement() , seed ); + ASSERT_NOT_EQUALS( intHash , stringHash ); + + //test regexps and strings hash to different things + BSONObjBuilder b; + b.appendRegex("a","3"); + long long int regexHash = BSONElementHasher::hash64( 
b.obj().firstElement() , seed ); + ASSERT_NOT_EQUALS( stringHash , regexHash ); + + //test arrays and subobject hash to different things + BSONObj p6 = fromjson("{a : {'0' : 0 , '1' : 1}}"); + BSONObj p7 = fromjson("{a : [0,1]}"); + ASSERT_NOT_EQUALS( + BSONElementHasher::hash64( p6.firstElement() , seed ) , + BSONElementHasher::hash64( p7.firstElement() , seed ) + ); + + //testing sub-document grouping + BSONObj p8 = fromjson("{x : {a : {}, b : 1}}"); + BSONObj p9 = fromjson("{x : {a : {b : 1}}}"); + ASSERT_NOT_EQUALS( + BSONElementHasher::hash64( p8.firstElement() , seed ) , + BSONElementHasher::hash64( p9.firstElement() , seed ) + ); + + //testing codeWscope scope squashing + BSONObjBuilder b1; + b1.appendCodeWScope("a","print('this is some stupid code')", BSON("a" << 3)); + BSONObj p10 = b1.obj(); + + BSONObjBuilder b2; + b2.appendCodeWScope("a","print('this is some stupid code')", BSON("a" << 3.1)); + + BSONObjBuilder b3; + b3.appendCodeWScope("a","print('this is \nsome stupider code')", BSON("a" << 3)); + ASSERT_EQUALS( + BSONElementHasher::hash64( p10.firstElement() , seed ) , + BSONElementHasher::hash64( b2.obj().firstElement() , seed ) + ); + ASSERT_NOT_EQUALS( + BSONElementHasher::hash64( p10.firstElement() , seed ) , + BSONElementHasher::hash64( b3.obj().firstElement() , seed ) + ); + + //test some recursive squashing + BSONObj p11 = fromjson("{x : {a : 3 , b : [ 3.1, {c : 3}]}}"); + BSONObj p12 = fromjson("{x : {a : 3.1 , b : [3, {c : 3.0}]}}"); + ASSERT_EQUALS( + BSONElementHasher::hash64( p11.firstElement() , seed ) , + BSONElementHasher::hash64( p12.firstElement() , seed ) + ); + + //test minkey and maxkey don't hash to same thing + BSONObj p13 = BSON("a" << MAXKEY); + BSONObj p14 = BSON("a" << MINKEY); + ASSERT_NOT_EQUALS( + BSONElementHasher::hash64( p13.firstElement() , seed ) , + BSONElementHasher::hash64( p14.firstElement() , seed ) + ); + + //test squashing very large doubles and very small doubles + long long maxInt = 
std::numeric_limits<long long>::max(); + double smallerDouble = maxInt/2; + double biggerDouble = ( (double)maxInt )*( (double)maxInt ); + BSONObj p15 = BSON("a" << maxInt ); + BSONObj p16 = BSON("a" << smallerDouble ); + BSONObj p17 = BSON("a" << biggerDouble ); + ASSERT_NOT_EQUALS( + BSONElementHasher::hash64( p15.firstElement() , seed ) , + BSONElementHasher::hash64( p16.firstElement() , seed ) + ); + ASSERT_EQUALS( + BSONElementHasher::hash64( p15.firstElement() , seed ) , + BSONElementHasher::hash64( p17.firstElement() , seed ) + ); + + long long minInt = std::numeric_limits<long long>::min(); + double negativeDouble = -( (double)maxInt )*( (double)maxInt ); + BSONObj p18 = BSON("a" << minInt ); + BSONObj p19 = BSON("a" << negativeDouble ); + ASSERT_EQUALS( + BSONElementHasher::hash64( p18.firstElement() , seed ) , + BSONElementHasher::hash64( p19.firstElement() , seed ) + ); + + } + }; + + class All : public Suite { + public: + All() : Suite( "jsobjhashing" ) { + } + + void setupTests() { + add< BSONElementHashingTest >(); + } + } myall; + +} // namespace JsobjHashingTests +