Skip to content

Commit

Permalink
SERVER-322 implement $in with regex
Browse files Browse the repository at this point in the history
  • Loading branch information
astaple committed Mar 9, 2010
1 parent 53235fd commit 6d095cf
Show file tree
Hide file tree
Showing 7 changed files with 165 additions and 55 deletions.
108 changes: 64 additions & 44 deletions db/matcher.cpp
Expand Up @@ -27,6 +27,25 @@
#include "db.h"
#include "client.h"

#include "pdfile.h"

namespace {
// Translate a regex flag string into pcrecpp options.
// UTF-8 is always switched on; 'i', 'm' and 'x' enable caseless, multiline
// and extended matching respectively. Any other character is ignored, and
// a null pointer is handled like an empty flag string.
inline pcrecpp::RE_Options flags2options(const char* flags){
    pcrecpp::RE_Options opts;
    opts.set_utf8(true);
    if ( !flags )
        return opts;
    for ( const char *f = flags; *f; ++f ) {
        switch ( *f ) {
        case 'i':
            opts.set_caseless(true);
            break;
        case 'm':
            opts.set_multiline(true);
            break;
        case 'x':
            opts.set_extended(true);
            break;
        default:
            break; // unrecognised flag characters are skipped
        }
    }
    return opts;
}
}

namespace mongo {

//#define DEBUGMATCHER(x) cout << x << endl;
Expand Down Expand Up @@ -96,8 +115,24 @@ namespace mongo {
shared_ptr<Matcher> s;
s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) );
allMatchers.push_back( s );
}
else {
} else if ( ie.type() == RegEx ) {
if ( !myregex.get() ) {
myregex.reset( new vector< RegexMatcher >() );
}
myregex->push_back( RegexMatcher() );
RegexMatcher &rm = myregex->back();
rm.re.reset( new pcrecpp::RE( ie.regex(), flags2options( ie.regexFlags() ) ) );
rm.fieldName = 0; // no need for field name
rm.regex = ie.regex();
rm.flags = ie.regexFlags();
rm.isNot = false; // what about $nin?
if (!false){ //TODO something smarter
bool purePrefix;
string prefix = simpleRegex(rm.regex, rm.flags, &purePrefix);
if (purePrefix)
rm.prefix = prefix;
}
} else {
myset->insert(ie);
}
}
Expand All @@ -108,30 +143,6 @@ namespace mongo {

}


} // namespace mongo

#include "pdfile.h"

namespace {
// Build the pcrecpp option set for a regex flag string.
// UTF-8 handling is always enabled. Each 'i' / 'm' / 'x' character turns on
// caseless / multiline / extended mode; other characters have no effect.
// A null flags pointer yields the UTF-8-only defaults.
inline pcrecpp::RE_Options flags2options(const char* flags){
    pcrecpp::RE_Options opts;
    opts.set_utf8(true);
    if ( !flags )
        return opts;
    for ( const char *f = flags; *f; ++f ) {
        switch ( *f ) {
        case 'i':
            opts.set_caseless(true);
            break;
        case 'm':
            opts.set_multiline(true);
            break;
        case 'x':
            opts.set_extended(true);
            break;
        default:
            break; // unrecognised flag characters are skipped
        }
    }
    return opts;
}
}

namespace mongo {

CoveredIndexMatcher::CoveredIndexMatcher(const BSONObj &jsobj, const BSONObj &indexKeyPattern) :
_keyMatcher(jsobj.filterFieldsUndotted(indexKeyPattern, true),
indexKeyPattern),
Expand Down Expand Up @@ -173,7 +184,7 @@ namespace mongo {
}
else {
RegexMatcher& rm = regexs[nRegex];
rm.re = new pcrecpp::RE(regex, flags2options(flags));
rm.re.reset( new pcrecpp::RE(regex, flags2options(flags)) );
rm.fieldName = fieldName;
rm.regex = regex;
rm.flags = flags;
Expand Down Expand Up @@ -375,7 +386,22 @@ namespace mongo {

constrainIndexKey_ = constrainIndexKey;
}


// Returns true when element e satisfies the regex matcher rm.
//  - String / Symbol elements: when rm.prefix is empty the compiled regex is
//    applied with PartialMatch; otherwise only the leading rm.prefix.size()
//    characters of the value are compared against the prefix.
//  - RegEx elements: match only when both the pattern and the flags strings
//    are identical to the matcher's.
//  - Every other element type never matches.
inline bool regexMatches(const RegexMatcher& rm, const BSONElement& e) {
    const int kind = e.type();
    if ( kind == String || kind == Symbol ) {
        if ( !rm.prefix.empty() )
            return strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size()) == 0;
        return rm.re->PartialMatch(e.valuestr());
    }
    if ( kind == RegEx )
        return strcmp(rm.regex, e.regex()) == 0 && strcmp(rm.flags, e.regexFlags()) == 0;
    return false;
}

inline int Matcher::valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) {
assert( op != BSONObj::NE && op != BSONObj::NIN );

Expand All @@ -385,7 +411,16 @@ namespace mongo {

if ( op == BSONObj::opIN ) {
// { $in : [1,2,3] }
return bm.myset->count(l);
int count = bm.myset->count(l);
if ( count )
return count;
if ( bm.myregex.get() ) {
for( vector<RegexMatcher>::const_iterator i = bm.myregex->begin(); i != bm.myregex->end(); ++i ) {
if ( regexMatches( *i, l ) ) {
return true;
}
}
}
}

if ( op == BSONObj::opSIZE ) {
Expand Down Expand Up @@ -627,21 +662,6 @@ namespace mongo {

extern int dump;

// Decide whether element e is matched by the regex matcher rm.
//  - String / Symbol: with an empty rm.prefix the compiled regex is run via
//    PartialMatch; with a non-empty prefix only a leading-prefix comparison
//    of the value is performed.
//  - RegEx: true only if pattern and flags strings both compare equal.
//  - Any other type: false.
inline bool regexMatches(RegexMatcher& rm, const BSONElement& e) {
    const int kind = e.type();
    if ( kind == String || kind == Symbol ) {
        if ( !rm.prefix.empty() )
            return strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size()) == 0;
        return rm.re->PartialMatch(e.valuestr());
    }
    if ( kind == RegEx )
        return strcmp(rm.regex, e.regex()) == 0 && strcmp(rm.flags, e.regexFlags()) == 0;
    return false;
}

/* See if an object matches the query.
*/
bool Matcher::matches(const BSONObj& jsobj , MatchDetails * details ) {
Expand Down
8 changes: 3 additions & 5 deletions db/matcher.h
Expand Up @@ -34,12 +34,9 @@ namespace mongo {
const char *regex;
const char *flags;
string prefix;
pcrecpp::RE *re;
shared_ptr< pcrecpp::RE > re;
bool isNot;
RegexMatcher() : re( 0 ), isNot() {}
~RegexMatcher() {
delete re;
}
RegexMatcher() : isNot() {}
};

struct element_lt
Expand Down Expand Up @@ -70,6 +67,7 @@ namespace mongo {
int compareOp;
bool isNot;
shared_ptr< set<BSONElement,element_lt> > myset;
shared_ptr< vector<RegexMatcher> > myregex;

// these are for specific operators
int mod;
Expand Down
68 changes: 65 additions & 3 deletions db/queryutil.cpp
Expand Up @@ -118,14 +118,24 @@ namespace mongo {
// NOTE with $not, we could potentially form a complementary set of intervals.
if ( !isNot && !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) {
set< BSONElement, element_lt > vals;
vector< FieldRange > regexes;
uassert( 12580 , "invalid query" , e.isABSONObj() );
BSONObjIterator i( e.embeddedObject() );
while( i.more() )
vals.insert( i.next() );
while( i.more() ) {
BSONElement ie = i.next();
if ( ie.type() == RegEx ) {
regexes.push_back( FieldRange( ie, false, optimize ) );
} else {
vals.insert( ie );
}
}

for( set< BSONElement, element_lt >::const_iterator i = vals.begin(); i != vals.end(); ++i )
intervals_.push_back( FieldInterval(*i) );

for( vector< FieldRange >::const_iterator i = regexes.begin(); i != regexes.end(); ++i )
*this |= *i;

return;
}

Expand Down Expand Up @@ -177,7 +187,7 @@ namespace mongo {
upperInclusive = false; //MaxForType String is an empty Object
}

// regex matches self
// regex matches self - regex type > string type
if (e.type() == RegEx){
BSONElement re = addObj( BSON( "" << e ) ).firstElement();
intervals_.push_back( FieldInterval(re) );
Expand Down Expand Up @@ -351,6 +361,58 @@ namespace mongo {
return *this;
}

// Folds one more interval into the union being accumulated by
// FieldRange::operator|=.
//
//   lower        - the next interval to merge; operator|= hands intervals
//                  over ordered by their lower bound.
//   low / high   - bounds of the pending (not yet emitted) merged interval.
//                  An EOO low.bound_ means nothing is pending yet.
//   newIntervals - receives the pending interval once a gap proves it is
//                  complete.
//
// Fix over the previous version: on overlap the pending upper bound is only
// moved when the incoming interval actually reaches further. Previously it
// was overwritten unconditionally, so an interval nested inside the pending
// one (e.g. the point "bb" inside ["b","c")) shrank the union.
void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector< FieldInterval > &newIntervals ) {
    if ( low.bound_.eoo() ) {
        // first interval seen: it becomes the pending interval
        low = lower.lower_; high = lower.upper_;
        return;
    }
    if ( high.bound_.woCompare( lower.lower_.bound_, false ) < 0 ) { // when equal but neither inclusive, just assume they overlap, since current btree scanning code just as efficient either way
        // gap between pending interval and the new one: emit pending, start anew
        FieldInterval tmp;
        tmp.lower_ = low;
        tmp.upper_ = high;
        newIntervals.push_back( tmp );
        low = lower.lower_; high = lower.upper_;
        return;
    }
    // Overlapping (or touching) intervals: extend the pending upper bound only
    // if the new interval ends later, or ends at the same value but inclusively.
    int upperCmp = high.bound_.woCompare( lower.upper_.bound_, false );
    if ( upperCmp < 0 || ( upperCmp == 0 && lower.upper_.inclusive_ ) )
        high = lower.upper_;
}

// Replaces this range with the union of this range and other, and returns
// *this.
//
// The two interval lists are walked in step, always taking the interval with
// the smaller lower bound next (on a tie, this range's interval goes first
// when its lower bound is inclusive), and handleInterval() coalesces
// overlapping neighbours.
// NOTE(review): assumes both interval lists are already ordered by lower
// bound - confirm against the code that fills intervals_.
//
// other's objData_ entries are appended to this range's objData_, and
// _special is taken from other only when this range has none.
//
// Fix over the previous version: the trailing pending interval is emitted
// only if one exists. Before, the union of two empty ranges produced a single
// interval whose bounds were both EOO.
const FieldRange &FieldRange::operator|=( const FieldRange &other ) {
    vector< FieldInterval > newIntervals;
    FieldBound low;
    FieldBound high;
    vector< FieldInterval >::const_iterator i = intervals_.begin();
    vector< FieldInterval >::const_iterator j = other.intervals_.begin();
    while( i != intervals_.end() && j != other.intervals_.end() ) {
        int cmp = i->lower_.bound_.woCompare( j->lower_.bound_, false );
        if ( ( cmp == 0 && i->lower_.inclusive_ ) || cmp < 0 ) {
            handleInterval( *i, low, high, newIntervals );
            ++i;
        } else {
            handleInterval( *j, low, high, newIntervals );
            ++j;
        }
    }
    // drain whichever list still has intervals left
    while( i != intervals_.end() ) {
        handleInterval( *i, low, high, newIntervals );
        ++i;
    }
    while( j != other.intervals_.end() ) {
        handleInterval( *j, low, high, newIntervals );
        ++j;
    }
    // Flush the last pending interval. low stays EOO only when neither range
    // contributed any interval, in which case the union is empty too.
    if ( !low.bound_.eoo() ) {
        FieldInterval tmp;
        tmp.lower_ = low;
        tmp.upper_ = high;
        newIntervals.push_back( tmp );
    }
    intervals_ = newIntervals;
    for( vector< BSONObj >::const_iterator i = other.objData_.begin(); i != other.objData_.end(); ++i )
        objData_.push_back( *i );
    if ( _special.size() == 0 && other._special.size() )
        _special = other._special;
    return *this;
}

BSONObj FieldRange::addObj( const BSONObj &o ) {
objData_.push_back( o );
return o;
Expand Down
1 change: 1 addition & 0 deletions db/queryutil.h
Expand Up @@ -50,6 +50,7 @@ namespace mongo {
public:
FieldRange( const BSONElement &e = BSONObj().firstElement() , bool isNot=false , bool optimize=true );
const FieldRange &operator&=( const FieldRange &other );
const FieldRange &operator|=( const FieldRange &other );
BSONElement min() const { assert( !empty() ); return intervals_[ 0 ].lower_.bound_; }
BSONElement max() const { assert( !empty() ); return intervals_[ intervals_.size() - 1 ].upper_.bound_; }
bool minInclusive() const { assert( !empty() ); return intervals_[ 0 ].lower_.inclusive_; }
Expand Down
30 changes: 27 additions & 3 deletions jstests/regex5.js
Expand Up @@ -2,12 +2,36 @@
t = db.regex5
t.drop()

t.save( { x : [ "abc" , "xyz" ] } )
t.save( { x : [ "ac" , "xyz" ] } )
t.save( { x : [ "abc" , "xyz1" ] } )
t.save( { x : [ "ac" , "xyz2" ] } )

a = /.*b.*c/
x = /.*y.*/

// Runs the regex query assertions against collection t using the patterns
// a and x defined above. Each assert.eq compares an expected document count
// with the count returned by the query; the trailing string labels the
// assertion. The script calls doit() once before and once after
// t.ensureIndex( {x:1} ), so the same expectations are checked in both states.
doit = function() {

assert.eq( 1 , t.find( { x : a } ).count() , "A" )
assert.eq( 2 , t.find( { x : x } ).count() , "B" )
// assert.eq( 1 , t.find( { x : { $all : [ a , x ] } } ).count() , "C" ) // SERVER-505
assert.eq( 2 , t.find( { x : { $in: [ x ] } } ).count() , "C" ) // SERVER-322
assert.eq( 1 , t.find( { x : { $in: [ a, "xyz1" ] } } ).count() , "D" ) // SERVER-322
assert.eq( 2 , t.find( { x : { $in: [ a, "xyz2" ] } } ).count() , "E" ) // SERVER-322
// assert.eq( 1 , t.find( { x : { $all : [ a , x ] } } ).count() , "F" ) // SERVER-505

}

doit();
t.ensureIndex( {x:1} );
print( "now indexed" );
doit();

// check bound unions SERVER-322
assert.eq( [
[ {x:1},{x:1} ],
[ {x:2.5},{x:2.5} ],
[ {x:"a"},{x:"a"} ],
[ {x:"b"},{x:"e"} ],
[ {x:/^b/},{x:/^b/} ],
[ {x:/^c/},{x:/^c/} ],
[ {x:/^d/},{x:/^d/} ]
],
t.find( { x : { $in: [ 1, 2.5, "a", "b", /^b/, /^c/, /^d/ ] } } ).explain().indexBounds );
3 changes: 3 additions & 0 deletions jstests/regex7.js
Expand Up @@ -21,3 +21,6 @@ assert.eq( /^a/i, t.findOne({r:/^a/i}).r, '2 1 b')
assert.eq( 1, t.count({r:/^a/i}), '2 2 b')
assert.eq( /^b/, t.findOne({r:/^b/}).r, '3 1 b')
assert.eq( 1, t.count({r:/^b/}), '3 2 b')

t.insert( {r:"a"} );
assert.eq( 2, t.count({r:/^a/}), 'c' );
2 changes: 2 additions & 0 deletions mongo.xcodeproj/project.pbxproj
Expand Up @@ -497,6 +497,7 @@
93BFA0E311330A8C0045D084 /* not2.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = not2.js; sourceTree = "<group>"; };
93C38E940FA66622007D6E4A /* basictests.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = basictests.cpp; sourceTree = "<group>"; };
93C8E6FE11457D9000F28017 /* master1.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = master1.js; sourceTree = "<group>"; };
93C8E81C1145BCCA00F28017 /* regex7.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = regex7.js; sourceTree = "<group>"; };
93CC40C2113C407A00734218 /* insert1.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = insert1.js; sourceTree = "<group>"; };
93CC441A113DE6BA00734218 /* indexg.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = indexg.js; sourceTree = "<group>"; };
93CC4484113E602400734218 /* in3.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = in3.js; sourceTree = "<group>"; };
Expand Down Expand Up @@ -748,6 +749,7 @@
934BEB9A10DFFA9600178102 /* jstests */ = {
isa = PBXGroup;
children = (
93C8E81C1145BCCA00F28017 /* regex7.js */,
93CC4484113E602400734218 /* in3.js */,
93CC441A113DE6BA00734218 /* indexg.js */,
93CC40C2113C407A00734218 /* insert1.js */,
Expand Down

0 comments on commit 6d095cf

Please sign in to comment.