Permalink
Browse files

SERVER-2001 part 1: hashing BSONElements

  • Loading branch information...
matulef committed May 3, 2012
1 parent d073865 commit ea82f5b5be6f73634c73ce031434ae32c4c66c59
View
@@ -0,0 +1,78 @@
+//hashtest1.js
+//Simple tests to check hashing of various types
+//make sure that different numeric types hash to same thing, and other sanity checks
+
+var hash = function( v , seed ){
+ if (seed)
+ return db.runCommand({"_hashBSONElement" : v , "seed" : seed})["out"];
+ else
+ return db.runCommand({"_hashBSONElement" : v})["out"];
+};
+
+// Three separately generated ObjectIds must produce three distinct hashes
+var oidHash = hash( ObjectId() );
+var oidHash2 = hash( ObjectId() );
+var oidHash3 = hash( ObjectId() );
+assert(! friendlyEqual( oidHash, oidHash2) , "ObjectIDs should hash to different things");
+assert(! friendlyEqual( oidHash, oidHash3) , "ObjectIDs should hash to different things");
+assert(! friendlyEqual( oidHash2, oidHash3) , "ObjectIDs should hash to different things");
+
+// Numeric types carrying the same value must hash identically
+// NOTE(review): fracHash presumably relies on NumberInt(3.5) being truncated to 3 by the shell - confirm
+var intHash = hash( NumberInt(3) );
+var doubHash = hash( 3 );
+var doubHash2 = hash( 3.0 );
+var longHash = hash( NumberLong(3) );
+var fracHash = hash( NumberInt(3.5) );
+assert.eq( intHash , doubHash );
+assert.eq( intHash , doubHash2 );
+assert.eq( intHash , longHash );
+assert.eq( intHash , fracHash );
+
+// Booleans must differ from each other, and false must differ from null
+var trueHash = hash( true );
+var falseHash = hash( false );
+assert(! friendlyEqual( trueHash, falseHash) , "true and false should hash to different things");
+
+var nullHash = hash( null );
+assert(! friendlyEqual( falseHash , nullHash ) , "false and null should hash to different things");
+
+// Two dates taken at different times (the sleep separates them) must differ
+var dateHash = hash( new Date() );
+sleep(1);
+var isodateHash = hash( ISODate() );
+assert(! friendlyEqual( dateHash, isodateHash) , "different dates should hash to different things");
+
+// Values of different types must not collide even when they "look" alike
+var stringHash = hash( "3" );
+assert(! friendlyEqual( intHash , stringHash ), "3 and \"3\" should hash to different things");
+
+var regExpHash = hash( RegExp("3") );
+assert(! friendlyEqual( stringHash , regExpHash) , "\"3\" and RegExp(3) should hash to different things");
+
+// Different numeric values must differ
+var intHash4 = hash( 4 );
+assert(! friendlyEqual( intHash , intHash4 ), "3 and 4 should hash to different things");
+
+// Same value hashed with an explicit seed (3) must differ from the unseeded hash
+var intHashSeeded = hash( 4 , 3 );
+assert(! friendlyEqual(intHash4 , intHashSeeded ), "different seeds should make different hashes");
+
+var minkeyHash = hash( MinKey );
+var maxkeyHash = hash( MaxKey );
+assert(! friendlyEqual(minkeyHash , maxkeyHash ), "minkey and maxkey should hash to different things");
+
+// Numeric squashing must also apply to the elements of an array
+var arrayHash = hash( [0,1.0,NumberLong(2)] );
+var arrayHash2 = hash( [0,NumberInt(1),2] );
+assert.eq( arrayHash , arrayHash2 , "didn't squash numeric types in array");
+
+// An object with keys "0","1","2" must not hash like the equivalent array
+var objectHash = hash( {"0":0, "1" : NumberInt(1), "2" : 2} );
+assert(! friendlyEqual(objectHash , arrayHash2) , "arrays and sub-objects should hash to different things");
+
+// Nesting structure matters: {a:{}, b:1} must differ from {a:{b:1}}
+var c = hash( {a : {}, b : 1} );
+var d = hash( {a : {b : 1}} );
+assert(! friendlyEqual( c , d ) , "hashing doesn't group sub-docs and fields correctly");
+
+// Numeric squashing must apply at every nesting depth
+var e = hash( {a : 3 , b : [NumberLong(3), {c : NumberInt(3)}]} );
+var f = hash( {a : NumberLong(3) , b : [NumberInt(3), {c : 3.0}]} );
+assert.eq( e , f , "recursive number squashing doesn't work");
+
+// 0/0 evaluates to NaN, which is expected to hash like 0
+var nanHash = hash( 0/0 );
+var zeroHash = hash( 0 );
+assert.eq( nanHash , zeroHash , "NaN and Zero should hash to the same thing");
+
+
+//should also test that CodeWScope hashes correctly
+//but waiting for SERVER-3391 (CodeWScope support in shell)
View
@@ -28,6 +28,7 @@ commonFiles = [ "pch.cpp",
"db/jsobj.cpp",
"bson/oid.cpp",
"db/json.cpp",
+ "db/hasher.cpp",
"db/lasterror.cpp",
"db/namespace.cpp",
"db/nonce.cpp",
@@ -94,6 +95,7 @@ coreServerFiles = [ "util/version.cpp",
"db/dbcommands_generic.cpp",
"db/commands/cloud.cpp",
"db/dbmessage.cpp",
+ "db/commands/hashcmd.cpp",
"db/commands/pipeline.cpp",
"db/indexkey.cpp",
"db/pipeline/accumulator.cpp",
@@ -18,6 +18,7 @@
#pragma once
#include <vector>
+#include <cmath>
#include <string.h>
#include "util/builder.h"
#include "bsontypes.h"
@@ -144,6 +145,13 @@ namespace mongo {
return data + 1;
}
+
+ /** Size of the field name in bytes, INCLUDING the terminating null char.
+  *  The value is computed on first use and cached in fieldNameSize_
+  *  (-1 means "not computed yet"). */
+ int fieldNameSize() const {
+     if ( fieldNameSize_ != -1 )
+         return fieldNameSize_;
+     const int withTerminator = (int)strlen( fieldName() ) + 1;
+     fieldNameSize_ = withTerminator;
+     return fieldNameSize_;
+ }
+
/** raw data of the element's value (so be careful). */
const char * value() const {
return (data + fieldNameSize() + 1);
@@ -192,8 +200,18 @@ namespace mongo {
/** Retrieve int value for the element safely. Zero returned if not a number. */
int numberInt() const;
- /** Retrieve long value for the element safely. Zero returned if not a number. */
+ /** Retrieve long value for the element safely. Zero returned if not a number.
+ * Behavior is not defined for double values that are NaNs, or too large/small
+ * to be represented by long longs */
long long numberLong() const;
+
+ /** Like numberLong() but with well-defined behavior for doubles that
+ * are NaNs, or too large/small to be represented as long longs.
+ * NaNs -> 0
+ * doubles above the long long range -> LLONG_MAX
+ * doubles below the long long range (very negative) -> LLONG_MIN */
+ long long safeNumberLong() const;
+
/** Retrieve the numeric value of the element. If not of a numeric type, returns 0.
Note: casts to double, data loss may occur with large (>52 bit) NumberLong values.
*/
@@ -243,11 +261,20 @@ namespace mongo {
/** Get javascript code of a CodeWScope data element. */
const char * codeWScopeCode() const {
- return value() + 8;
+ massert( 16177 , "not codeWScope" , type() == CodeWScope );
+ return value() + 4 + 4; //two ints precede code (see BSON spec)
+ }
+
+ /** Get length of the code part of the CodeWScope object
+  * This INCLUDES the null char at the end.
+  * Asserts (massert 16178) that this element is of type CodeWScope. */
+ int codeWScopeCodeLen() const {
+     massert( 16178 , "not codeWScope" , type() == CodeWScope );
+     // The code length is the second of the two ints that precede the code.
+     // value() sits right after the variable-length field name, so it is not
+     // guaranteed to be int-aligned: copy the bytes instead of dereferencing
+     // a cast pointer. Like the original, this reads the int in host byte order.
+     int len;
+     memcpy( &len , value() + 4 , sizeof( len ) );
+     return len;
+ }
+
/** Get the scope SavedContext of a CodeWScope data element. */
const char * codeWScopeScopeData() const {
- // TODO fix
+ //This returns the wrong address if the code string contains embedded null chars, because strlen() stops at the first one
return codeWScopeCode() + strlen( codeWScopeCode() ) + 1;
}
@@ -413,11 +440,7 @@ namespace mongo {
private:
const char *data;
mutable int fieldNameSize_; // cached value
- int fieldNameSize() const {
- if ( fieldNameSize_ == -1 )
- fieldNameSize_ = (int)strlen( fieldName() ) + 1;
- return fieldNameSize_;
- }
+
mutable int totalSize; /* caches the computed size */
friend class BSONObjIterator;
@@ -574,6 +597,30 @@ namespace mongo {
}
}
+ /** Like numberLong() but with well-defined behavior for doubles that
+  * are NaNs, or too large/small to be represented as long longs.
+  * NaNs -> 0
+  * doubles >= 2^63 (above the long long range) -> LLONG_MAX
+  * doubles < -2^63 (below the long long range) -> LLONG_MIN
+  * Everything else (in-range doubles and all other types) -> numberLong() */
+ inline long long BSONElement::safeNumberLong() const {
+     if ( type() != NumberDouble ) {
+         // only doubles need the range checks below
+         return numberLong();
+     }
+
+     const double d = numberDouble();
+     if ( std::isnan( d ) ){
+         return 0;
+     }
+     // LLONG_MAX (2^63 - 1) is not representable as a double; the conversion
+     // rounds up to 2^63, which is itself out of range. So the comparison must
+     // be >=, otherwise d == 2^63 slips through to the undefined cast.
+     if ( d >= static_cast<double>( std::numeric_limits<long long>::max() ) ){
+         return std::numeric_limits<long long>::max();
+     }
+     // LLONG_MIN (-2^63) is exactly representable, so a strict < is correct here.
+     if ( d < static_cast<double>( std::numeric_limits<long long>::min() ) ){
+         return std::numeric_limits<long long>::min();
+     }
+     // d is now known to be in range, so the plain conversion is well-defined
+     return numberLong();
+ }
+
inline BSONElement::BSONElement() {
static char z = 0;
data = &z;
@@ -0,0 +1,70 @@
+/* hashcmd.cpp
+ *
+ * Defines a database command (runnable from the shell via db.runCommand) for hashing a BSONElement value
+ */
+
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/db/commands.h"
+#include "mongo/db/hasher.h"
+
+namespace mongo {
+
+ /** Database command "_hashBSONElement": hashes the value of the first
+  *  element of the command object and reports the 64-bit result. */
+ class CmdHashElt : public Command {
+ public:
+     CmdHashElt() : Command("_hashBSONElement") {}
+     virtual LockType locktype() const { return NONE; }
+     virtual bool slaveOk() const { return true; }
+     virtual void help( stringstream& help ) const {
+         help << "returns the hash of the first BSONElement val in a BSONObj";
+     }
+
+     /* CmdObj has the form {"_hashBSONElement" : <thingToHash>}
+      * or {"_hashBSONElement" : <thingToHash>, "seed" : <number> }
+      * The value hashed is always that of the FIRST element of CmdObj.
+      * "seed" defaults to 0 and is converted with numberInt().
+      *
+      * Result has the form
+      * {"key" : <thingTohash>, "seed" : <int>, "out": NumberLong(<hash>)}
+      *
+      * Returns false with errmsg "seed must be a number" when "seed" is
+      * present but not numeric ("key" has already been appended by then).
+      *
+      * Example use in the shell:
+      *> db.runCommand({_hashBSONElement: "hashthis", seed: 1})
+      *> {"key" : "hashthis",
+      *> "seed" : 1,
+      *> "out" : NumberLong(6271151123721111923),
+      *> "ok" : 1 }
+      **/
+     bool run( const string& db,
+               BSONObj& cmdObj,
+               int options, string& errmsg,
+               BSONObjBuilder& result,
+               bool fromRepl = false ){
+         // fetch the element to hash once; it is echoed back and then hashed
+         const BSONElement toHash = cmdObj.firstElement();
+         result.appendAs( toHash , "key" );
+
+         int seed = 0;
+         if ( cmdObj.hasField("seed") ){
+             const BSONElement seedElt = cmdObj["seed"];
+             if ( ! seedElt.isNumber() ) {
+                 errmsg += "seed must be a number";
+                 return false;
+             }
+             seed = seedElt.numberInt();
+         }
+         result.append( "seed" , seed );
+
+         result.append( "out" , BSONElementHasher::hash64( toHash , seed ) );
+         return true;
+     }
+ } cmdHashElt;
+}
View
@@ -0,0 +1,94 @@
+/* hasher.cpp
+ *
+ * Defines a simple hash function class
+ */
+
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/db/hasher.h"
+
+#include <cstring>
+
+#include "mongo/db/jsobj.h"
+
+namespace mongo {
+
+ /** Starts a fresh MD5 computation and feeds the raw bytes of the seed in
+  *  first, so the seed takes part in every digest this hasher produces. */
+ Hasher::Hasher( HashSeed seed ) : _seed( seed ) {
+     md5_init( &_md5State );
+     const md5_byte_t * seedBytes = reinterpret_cast< const md5_byte_t * >( & _seed );
+     md5_append( &_md5State , seedBytes , sizeof( _seed ) );
+ }
+
+ /** Appends numBytes raw bytes starting at keyData to the running MD5 state. */
+ void Hasher::addData( const void * keyData , size_t numBytes ) {
+     const md5_byte_t * bytes = static_cast< const md5_byte_t * >( keyData );
+     md5_append( &_md5State , bytes , numBytes );
+ }
+
+ /** Finalizes the MD5 computation and writes the resulting digest into 'out'. */
+ void Hasher::finish( HashDigest out ) {
+ md5_finish( &_md5State , out );
+ }
+
+ /** Hashes the value of 'e' (its field name is NOT included) with the given
+  *  seed and returns the first 8 bytes of the digest as a long long. */
+ long long int BSONElementHasher::hash64( const BSONElement& e , HashSeed seed ){
+     scoped_ptr<Hasher> h( HasherFactory::createHasher( seed ) );
+     recursiveHash( h.get() , e , false );
+     HashDigest d;
+     h->finish(d);
+     //HashDigest is actually 16 bytes, but we just get 8 via truncation
+     // NOTE: the result depends on host byte order (assumes little-endian)
+     // Copy the bytes out rather than dereferencing a cast pointer: the digest
+     // is a byte buffer, so reading it through a long long* violates aliasing
+     // rules and may be misaligned.
+     long long int result;
+     std::memcpy( &result , d , sizeof( result ) );
+     return result;
+ }
+
+ /** Feeds element 'e' into hasher 'h'.
+  *  Order of the data fed: canonical type, then the field name (only when
+  *  includeFieldName is set), then the value. Numbers are squashed to 64-bit
+  *  ints; objects, arrays and CodeWScope are hashed element by element. */
+ void BSONElementHasher::recursiveHash( Hasher* h ,
+                                        const BSONElement& e ,
+                                        bool includeFieldName ) {
+
+     const int typeTag = e.canonicalType();
+     h->addData( &typeTag , sizeof( typeTag ) );
+
+     if ( includeFieldName ){
+         // fieldNameSize() counts the terminating null char, so it is hashed too
+         h->addData( e.fieldName() , e.fieldNameSize() );
+     }
+
+     if ( e.mayEncapsulate() ){
+         //identify the subobject.
+         //hash any preceding stuff (in the case of codeWscope)
+         //then each sub-element
+         //then finish with the EOO element.
+         BSONObj sub;
+         if ( e.type() == CodeWScope ) {
+             h->addData( e.codeWScopeCode() , e.codeWScopeCodeLen() );
+             sub = e.codeWScopeObject();
+         }
+         else {
+             sub = e.embeddedObject();
+         }
+         for ( BSONObjIterator it( sub ) ; it.moreWithEOO() ; ) {
+             recursiveHash( h , it.next() , true );
+         }
+         return;
+     }
+
+     //no embedded objects (subobjects or arrays) from here on
+     if ( e.isNumber() ){
+         //squash numeric types to 64-bit ints; well-defined for troublesome doubles
+         const long long int squashed = e.safeNumberLong();
+         h->addData( &squashed , sizeof( squashed ) );
+         return;
+     }
+
+     h->addData( e.value() , e.valuesize() );
+ }
+
+}
Oops, something went wrong.

0 comments on commit ea82f5b

Please sign in to comment.