Skip to content

Commit

Permalink
SERVER-2001 part 1: hashing BSONElements
Browse files Browse the repository at this point in the history
  • Loading branch information
matulef committed May 9, 2012
1 parent d073865 commit ea82f5b
Show file tree
Hide file tree
Showing 7 changed files with 564 additions and 8 deletions.
78 changes: 78 additions & 0 deletions jstests/hashtest1.js
@@ -0,0 +1,78 @@
//hashtest1.js
//Simple tests to check hashing of various types
//make sure that different numeric types hash to same thing, and other sanity checks

// Runs the _hashBSONElement command on v and returns the "out" field of the reply.
// A truthy seed is forwarded as the "seed" option; otherwise no seed is sent.
var hash = function( v , seed ){
    var cmd = {"_hashBSONElement" : v};
    if (seed) {
        cmd["seed"] = seed;
    }
    return db.runCommand(cmd)["out"];
};

// Fails with msg when the two hash results compare equal.
var assertHashesDiffer = function( a , b , msg ){
    assert( ! friendlyEqual( a , b ) , msg );
};

// Three freshly generated ObjectIds should be pairwise distinct after hashing.
var oidHash = hash( ObjectId() );
var oidHash2 = hash( ObjectId() );
var oidHash3 = hash( ObjectId() );
assertHashesDiffer( oidHash , oidHash2 , "ObjectIDs should hash to different things" );
assertHashesDiffer( oidHash , oidHash3 , "ObjectIDs should hash to different things" );
assertHashesDiffer( oidHash2 , oidHash3 , "ObjectIDs should hash to different things" );

// The same numeric value must hash identically regardless of its numeric type.
// NOTE(review): NumberInt(3.5) presumably already truncates to 3 in the shell, so
// fracHash may not exercise a fractional double -- confirm whether hash(3.5) was intended.
var intHash = hash( NumberInt(3) );
var doubHash = hash( 3 );
var doubHash2 = hash( 3.0 );
var longHash = hash( NumberLong(3) );
var fracHash = hash( NumberInt(3.5) );
[ doubHash , doubHash2 , longHash , fracHash ].forEach( function( other ){
    assert.eq( intHash , other );
} );

// Booleans and null are all distinguishable.
var trueHash = hash( true );
var falseHash = hash( false );
assertHashesDiffer( trueHash , falseHash , "true and false should hash to different things" );

var nullHash = hash( null );
assertHashesDiffer( falseHash , nullHash , "false and null should hash to different things" );

// Two dates taken at least a millisecond apart.
var dateHash = hash( new Date() );
sleep(1);
var isodateHash = hash( ISODate() );
assertHashesDiffer( dateHash , isodateHash , "different dates should hash to different things" );

// The type participates in the hash: 3, "3" and /3/ are all different.
var stringHash = hash( "3" );
assertHashesDiffer( intHash , stringHash , "3 and \"3\" should hash to different things" );

var regExpHash = hash( RegExp("3") );
assertHashesDiffer( stringHash , regExpHash , "\"3\" and RegExp(3) should hash to different things" );

// Different values, and the same value under a different seed, differ.
var intHash4 = hash( 4 );
assertHashesDiffer( intHash , intHash4 , "3 and 4 should hash to different things" );

var intHashSeeded = hash( 4 , 3 );
assertHashesDiffer( intHash4 , intHashSeeded , "different seeds should make different hashes" );

var minkeyHash = hash( MinKey );
var maxkeyHash = hash( MaxKey );
assertHashesDiffer( minkeyHash , maxkeyHash , "minkey and maxkey should hash to different things" );

// Numeric squashing also applies to array members.
var arrayHash = hash( [0,1.0,NumberLong(2)] );
var arrayHash2 = hash( [0,NumberInt(1),2] );
assert.eq( arrayHash , arrayHash2 , "didn't squash numeric types in array" );

// An object with keys "0","1","2" is not the same thing as the equivalent array.
var objectHash = hash( {"0":0, "1" : NumberInt(1), "2" : 2} );
assertHashesDiffer( objectHash , arrayHash2 , "arrays and sub-objects should hash to different things" );

// Nesting matters: {a:{}, b:1} versus {a:{b:1}}.
var c = hash( {a : {}, b : 1} );
var d = hash( {a : {b : 1}} );
assertHashesDiffer( c , d , "hashing doesn't group sub-docs and fields correctly" );

// Numeric squashing is applied recursively through arrays and sub-documents.
var e = hash( {a : 3 , b : [NumberLong(3), {c : NumberInt(3)}]} );
var f = hash( {a : NumberLong(3) , b : [NumberInt(3), {c : 3.0}]} );
assert.eq( e , f , "recursive number squashing doesn't work" );

// NaN is expected to be squashed to zero before hashing.
var nanHash = hash( 0/0 );
var zeroHash = hash( 0 );
assert.eq( nanHash , zeroHash , "NaN and Zero should hash to the same thing" );


//should also test that CodeWScope hashes correctly
//but waiting for SERVER-3391 (CodeWScope support in shell)
2 changes: 2 additions & 0 deletions src/mongo/SConscript
Expand Up @@ -28,6 +28,7 @@ commonFiles = [ "pch.cpp",
"db/jsobj.cpp",
"bson/oid.cpp",
"db/json.cpp",
"db/hasher.cpp",
"db/lasterror.cpp",
"db/namespace.cpp",
"db/nonce.cpp",
Expand Down Expand Up @@ -94,6 +95,7 @@ coreServerFiles = [ "util/version.cpp",
"db/dbcommands_generic.cpp",
"db/commands/cloud.cpp",
"db/dbmessage.cpp",
"db/commands/hashcmd.cpp",
"db/commands/pipeline.cpp",
"db/indexkey.cpp",
"db/pipeline/accumulator.cpp",
Expand Down
63 changes: 55 additions & 8 deletions src/mongo/bson/bsonelement.h
Expand Up @@ -18,6 +18,7 @@
#pragma once

#include <vector>
#include <cmath>
#include <string.h>
#include "util/builder.h"
#include "bsontypes.h"
Expand Down Expand Up @@ -144,6 +145,13 @@ namespace mongo {
return data + 1;
}


/** Size of the field name in bytes, counting the terminating null.
 *  Computed on first use and cached in fieldNameSize_ (-1 means not yet computed). */
int fieldNameSize() const {
    if ( fieldNameSize_ != -1 )
        return fieldNameSize_;
    fieldNameSize_ = static_cast<int>( strlen( fieldName() ) ) + 1;
    return fieldNameSize_;
}

/** raw data of the element's value (so be careful). */
const char * value() const {
return (data + fieldNameSize() + 1);
Expand Down Expand Up @@ -192,8 +200,18 @@ namespace mongo {

/** Retrieve int value for the element safely. Zero returned if not a number. */
int numberInt() const;
/** Retrieve long value for the element safely. Zero returned if not a number. */
/** Retrieve long value for the element safely. Zero returned if not a number.
* Behavior is not defined for double values that are NaNs, or too large/small
* to be represented by long longs */
long long numberLong() const;

/** Like numberLong() but with well-defined behavior for doubles that
* are NaNs, or that lie outside the range representable as long longs.
* NaNs -> 0
* doubles above the long long range -> LLONG_MAX
* doubles below the long long range (large negative) -> LLONG_MIN */
long long safeNumberLong() const;

/** Retrieve the numeric value of the element. If not of a numeric type, returns 0.
Note: casts to double, data loss may occur with large (>52 bit) NumberLong values.
*/
Expand Down Expand Up @@ -243,11 +261,20 @@ namespace mongo {

/** Get javascript code of a CodeWScope data element.
 *  Guarded by massert 16177: the element must be of type CodeWScope. */
const char * codeWScopeCode() const {
    // The type check has to run before any pointer into the value is handed
    // out; a stale early return ahead of it would make the check unreachable.
    massert( 16177 , "not codeWScope" , type() == CodeWScope );
    return value() + 4 + 4; //two ints precede code (see BSON spec)
}

/** Length of the code string of a CodeWScope element.
 *  The returned count INCLUDES the terminating null character. */
int codeWScopeCodeLen() const {
    massert( 16178 , "not codeWScope" , type() == CodeWScope );
    const char * lenField = value() + 4; // the length is stored 4 bytes into the value
    return *reinterpret_cast<const int *>( lenField );
}

/** Get the scope SavedContext of a CodeWScope data element. */
const char * codeWScopeScopeData() const {
// TODO fix
//This can error if there are null chars in the codeWScopeCode
return codeWScopeCode() + strlen( codeWScopeCode() ) + 1;
}

Expand Down Expand Up @@ -413,11 +440,7 @@ namespace mongo {
private:
const char *data;
mutable int fieldNameSize_; // cached value
// Returns the field name length including the terminating null (strlen + 1),
// computing it on first use and caching it in fieldNameSize_ (-1 == not cached).
// NOTE(review): an identical definition of fieldNameSize() appears earlier in this
// listing; this private copy looks like the removed side of the diff hunk -- confirm.
int fieldNameSize() const {
if ( fieldNameSize_ == -1 )
fieldNameSize_ = (int)strlen( fieldName() ) + 1;
return fieldNameSize_;
}

mutable int totalSize; /* caches the computed size */

friend class BSONObjIterator;
Expand Down Expand Up @@ -574,6 +597,30 @@ namespace mongo {
}
}

/** Like numberLong() but with well-defined behavior for doubles that
 *  are NaNs, or that lie outside the range representable as long longs.
 *  NaNs -> 0
 *  doubles at or above (double)LLONG_MAX -> LLONG_MAX
 *  doubles below LLONG_MIN -> LLONG_MIN
 *  Every other value (and every non-double type) is delegated to numberLong(). */
inline long long BSONElement::safeNumberLong() const {
    if ( type() == NumberDouble ) {
        const double d = numberDouble();
        if ( std::isnan( d ) ) {
            return 0;
        }
        // With 64-bit long long and IEEE doubles, LLONG_MAX is not exactly
        // representable: (double)LLONG_MAX rounds up to 2^63, which is itself
        // out of range. Compare with >= so that value saturates here instead of
        // reaching numberLong(), whose behavior is not defined for such doubles.
        if ( d >= (double) std::numeric_limits<long long>::max() ) {
            return std::numeric_limits<long long>::max();
        }
        // LLONG_MIN converts to double exactly, so a strict comparison suffices.
        if ( d < (double) std::numeric_limits<long long>::min() ) {
            return std::numeric_limits<long long>::min();
        }
    }
    // In-range doubles and all other types take the ordinary conversion path.
    return numberLong();
}

inline BSONElement::BSONElement() {
static char z = 0;
data = &z;
Expand Down
70 changes: 70 additions & 0 deletions src/mongo/db/commands/hashcmd.cpp
@@ -0,0 +1,70 @@
/* hashcmd.cpp
*
* Defines a shell command for hashing a BSONElement value
*/


/**
* Copyright (C) 2012 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include "mongo/db/commands.h"
#include "mongo/db/hasher.h"

namespace mongo {

class CmdHashElt : public Command {
public:
CmdHashElt() : Command("_hashBSONElement") {};
virtual LockType locktype() const { return NONE; }
virtual bool slaveOk() const { return true; }
virtual void help( stringstream& help ) const {
help << "returns the hash of the first BSONElement val in a BSONObj";
}

/* CmdObj has the form {"hash" : <thingToHash>}
* or {"hash" : <thingToHash>, "seed" : <number> }
* Result has the form
* {"key" : <thingTohash>, "seed" : <int>, "out": NumberLong(<hash>)}
*
* Example use in the shell:
*> db.runCommand({hash: "hashthis", seed: 1})
*> {"key" : "hashthis",
*> "seed" : 1,
*> "out" : NumberLong(6271151123721111923),
*> "ok" : 1 }
**/
bool run( const string& db,
BSONObj& cmdObj,
int options, string& errmsg,
BSONObjBuilder& result,
bool fromRepl = false ){
result.appendAs(cmdObj.firstElement(),"key");

int seed = 0;
if (cmdObj.hasField("seed")){
if (! cmdObj["seed"].isNumber()) {
errmsg += "seed must be a number";
return false;
}
seed = cmdObj["seed"].numberInt();
}
result.append( "seed" , seed );

result.append( "out" , BSONElementHasher::hash64( cmdObj.firstElement() , seed ) );
return true;
}
} cmdHashElt;
}
94 changes: 94 additions & 0 deletions src/mongo/db/hasher.cpp
@@ -0,0 +1,94 @@
/* hasher.cpp
*
* Defines a simple hash function class
*/


/**
* Copyright (C) 2012 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include "mongo/db/hasher.h"

#include <cstring>

#include "mongo/db/jsobj.h"

namespace mongo {

/** Starts an MD5 computation and feeds the raw bytes of the seed in as the
 *  first piece of input, so different seeds yield different digests. */
Hasher::Hasher( HashSeed seed ) : _seed( seed ) {
    md5_init( &_md5State );
    const md5_byte_t * seedBytes = reinterpret_cast< const md5_byte_t * >( &_seed );
    md5_append( &_md5State , seedBytes , sizeof( _seed ) );
}

void Hasher::addData( const void * keyData , size_t numBytes ) {
md5_append( &_md5State , static_cast< const md5_byte_t * >( keyData ), numBytes );
}

/** Completes the MD5 computation and writes the resulting digest into out. */
void Hasher::finish( HashDigest out ) {
    md5_finish( &this->_md5State , out );
}

/** Hashes the value of e (its field name is not included) with the given
 *  seed and returns the first 8 bytes of the resulting digest as a 64-bit integer. */
long long int BSONElementHasher::hash64( const BSONElement& e , HashSeed seed ){
    scoped_ptr<Hasher> h( HasherFactory::createHasher( seed ) );
    recursiveHash( h.get() , e , false );
    HashDigest d;
    h->finish(d);
    //HashDigest is actually 16 bytes, but we just get 8 via truncation
    // NOTE: assumes little-endian
    // Copy the bytes out instead of dereferencing a casted pointer: the digest
    // is a byte buffer, so reading it through a long long* would break the
    // aliasing rules and is not guaranteed to be suitably aligned.
    long long int out;
    std::memcpy( &out , d , sizeof( out ) );
    return out;
}

/** Feeds element e into hasher h.
 *  Input is added in this order: the canonical type, then (optionally) the
 *  field name including its terminating null, then the value. Numeric values
 *  are squashed to a 64-bit integer first; embedded objects, arrays and
 *  CodeWScope scopes are hashed element by element, field names included. */
void BSONElementHasher::recursiveHash( Hasher* h ,
                                       const BSONElement& e ,
                                       bool includeFieldName ) {

    const int typeTag = e.canonicalType();
    h->addData( &typeTag , sizeof( typeTag ) );

    if ( includeFieldName ){
        h->addData( e.fieldName() , e.fieldNameSize() );
    }

    // Leaf case: nothing embedded, so hash the value directly and stop.
    if ( !e.mayEncapsulate() ){
        if ( !e.isNumber() ){
            h->addData( e.value() , e.valuesize() );
            return;
        }
        //squash every numeric type to a 64-bit int
        //(safeNumberLong is well-defined for troublesome doubles)
        const long long int squashed = e.safeNumberLong();
        h->addData( &squashed , sizeof( squashed ) );
        return;
    }

    // Container case: pick out the sub-object. For CodeWScope the code
    // string that precedes the scope object is hashed first.
    BSONObj sub;
    if ( e.type() != CodeWScope ) {
        sub = e.embeddedObject();
    }
    else {
        h->addData( e.codeWScopeCode() , e.codeWScopeCodeLen() );
        sub = e.codeWScopeObject();
    }

    // Hash each sub-element in order; moreWithEOO() means the closing EOO
    // element is visited as well.
    for ( BSONObjIterator it( sub ) ; it.moreWithEOO() ; ) {
        BSONElement child = it.next();
        recursiveHash( h , child , true );
    }
}

}

0 comments on commit ea82f5b

Please sign in to comment.