Permalink
Browse files

SERVER-322 implement $in with regex

  • Loading branch information...
1 parent 53235fd commit 6d095cf3574b7791155b22c5c9f4f13c464df9c8 @astaple astaple committed Mar 9, 2010
Showing with 165 additions and 55 deletions.
  1. +64 −44 db/matcher.cpp
  2. +3 −5 db/matcher.h
  3. +65 −3 db/queryutil.cpp
  4. +1 −0 db/queryutil.h
  5. +27 −3 jstests/regex5.js
  6. +3 −0 jstests/regex7.js
  7. +2 −0 mongo.xcodeproj/project.pbxproj
View
@@ -27,6 +27,25 @@
#include "db.h"
#include "client.h"
+#include "pdfile.h"
+
+namespace {
+ /* Build the pcrecpp options for a regex flag string.
+    UTF-8 mode is always enabled.  Each character of flags is examined in
+    turn: 'i' turns on caseless matching, 'm' multiline, 'x' extended
+    syntax; any other character is ignored.  A null flags pointer is
+    treated like an empty string.
+ */
+ inline pcrecpp::RE_Options flags2options(const char* flags){
+     pcrecpp::RE_Options opts;
+     opts.set_utf8(true);
+     if ( !flags )
+         return opts;
+     for ( const char *f = flags; *f; ++f ) {
+         switch ( *f ) {
+         case 'i':
+             opts.set_caseless(true);
+             break;
+         case 'm':
+             opts.set_multiline(true);
+             break;
+         case 'x':
+             opts.set_extended(true);
+             break;
+         default:
+             // unrecognized flag characters are skipped
+             break;
+         }
+     }
+     return opts;
+ }
+}
+
namespace mongo {
//#define DEBUGMATCHER(x) cout << x << endl;
@@ -96,8 +115,24 @@ namespace mongo {
shared_ptr<Matcher> s;
s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) );
allMatchers.push_back( s );
- }
- else {
+ } else if ( ie.type() == RegEx ) {
+ if ( !myregex.get() ) {
+ myregex.reset( new vector< RegexMatcher >() );
+ }
+ myregex->push_back( RegexMatcher() );
+ RegexMatcher &rm = myregex->back();
+ rm.re.reset( new pcrecpp::RE( ie.regex(), flags2options( ie.regexFlags() ) ) );
+ rm.fieldName = 0; // no need for field name
+ rm.regex = ie.regex();
+ rm.flags = ie.regexFlags();
+ rm.isNot = false; // what about $nin?
+ if (!false){ //TODO something smarter
+ bool purePrefix;
+ string prefix = simpleRegex(rm.regex, rm.flags, &purePrefix);
+ if (purePrefix)
+ rm.prefix = prefix;
+ }
+ } else {
myset->insert(ie);
}
}
@@ -108,30 +143,6 @@ namespace mongo {
}
-
-} // namespace mongo
-
-#include "pdfile.h"
-
-namespace {
- inline pcrecpp::RE_Options flags2options(const char* flags){
- pcrecpp::RE_Options options;
- options.set_utf8(true);
- while ( flags && *flags ) {
- if ( *flags == 'i' )
- options.set_caseless(true);
- else if ( *flags == 'm' )
- options.set_multiline(true);
- else if ( *flags == 'x' )
- options.set_extended(true);
- flags++;
- }
- return options;
- }
-}
-
-namespace mongo {
-
CoveredIndexMatcher::CoveredIndexMatcher(const BSONObj &jsobj, const BSONObj &indexKeyPattern) :
_keyMatcher(jsobj.filterFieldsUndotted(indexKeyPattern, true),
indexKeyPattern),
@@ -173,7 +184,7 @@ namespace mongo {
}
else {
RegexMatcher& rm = regexs[nRegex];
- rm.re = new pcrecpp::RE(regex, flags2options(flags));
+ rm.re.reset( new pcrecpp::RE(regex, flags2options(flags)) );
rm.fieldName = fieldName;
rm.regex = regex;
rm.flags = flags;
@@ -375,7 +386,22 @@ namespace mongo {
constrainIndexKey_ = constrainIndexKey;
}
-
+
+ /* Decide whether element e satisfies the regex matcher rm.
+    - RegEx elements match only when both pattern text and flags are
+      identical to the matcher's.
+    - String and Symbol elements are tested against the compiled
+      expression, unless rm.prefix is non-empty, in which case only the
+      leading rm.prefix.size() bytes of the value are compared to it.
+    - Every other element type is a non-match.
+ */
+ inline bool regexMatches(const RegexMatcher& rm, const BSONElement& e) {
+     const int elementType = e.type();
+     if ( elementType == RegEx ) {
+         const bool samePattern = strcmp(rm.regex, e.regex()) == 0;
+         return samePattern && strcmp(rm.flags, e.regexFlags()) == 0;
+     }
+     if ( elementType != String && elementType != Symbol )
+         return false;
+     if ( !rm.prefix.empty() ) {
+         // prefix shortcut: a plain byte comparison replaces the regex engine
+         return strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size()) == 0;
+     }
+     return rm.re->PartialMatch(e.valuestr());
+ }
+
inline int Matcher::valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) {
assert( op != BSONObj::NE && op != BSONObj::NIN );
@@ -385,7 +411,16 @@ namespace mongo {
if ( op == BSONObj::opIN ) {
// { $in : [1,2,3] }
- return bm.myset->count(l);
+ int count = bm.myset->count(l);
+ if ( count )
+ return count;
+ if ( bm.myregex.get() ) {
+ for( vector<RegexMatcher>::const_iterator i = bm.myregex->begin(); i != bm.myregex->end(); ++i ) {
+ if ( regexMatches( *i, l ) ) {
+ return true;
+ }
+ }
+ }
}
if ( op == BSONObj::opSIZE ) {
@@ -627,21 +662,6 @@ namespace mongo {
extern int dump;
- inline bool regexMatches(RegexMatcher& rm, const BSONElement& e) {
- switch (e.type()){
- case String:
- case Symbol:
- if (rm.prefix.empty())
- return rm.re->PartialMatch(e.valuestr());
- else
- return !strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size());
- case RegEx:
- return !strcmp(rm.regex, e.regex()) && !strcmp(rm.flags, e.regexFlags());
- default:
- return false;
- }
- }
-
/* See if an object matches the query.
*/
bool Matcher::matches(const BSONObj& jsobj , MatchDetails * details ) {
View
@@ -34,12 +34,9 @@ namespace mongo {
const char *regex;
const char *flags;
string prefix;
- pcrecpp::RE *re;
+ shared_ptr< pcrecpp::RE > re;
bool isNot;
- RegexMatcher() : re( 0 ), isNot() {}
- ~RegexMatcher() {
- delete re;
- }
+ RegexMatcher() : isNot() {}
};
struct element_lt
@@ -70,6 +67,7 @@ namespace mongo {
int compareOp;
bool isNot;
shared_ptr< set<BSONElement,element_lt> > myset;
+ shared_ptr< vector<RegexMatcher> > myregex;
// these are for specific operators
int mod;
View
@@ -118,14 +118,24 @@ namespace mongo {
// NOTE with $not, we could potentially form a complementary set of intervals.
if ( !isNot && !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) {
set< BSONElement, element_lt > vals;
+ vector< FieldRange > regexes;
uassert( 12580 , "invalid query" , e.isABSONObj() );
BSONObjIterator i( e.embeddedObject() );
- while( i.more() )
- vals.insert( i.next() );
+ while( i.more() ) {
+ BSONElement ie = i.next();
+ if ( ie.type() == RegEx ) {
+ regexes.push_back( FieldRange( ie, false, optimize ) );
+ } else {
+ vals.insert( ie );
+ }
+ }
for( set< BSONElement, element_lt >::const_iterator i = vals.begin(); i != vals.end(); ++i )
intervals_.push_back( FieldInterval(*i) );
+ for( vector< FieldRange >::const_iterator i = regexes.begin(); i != regexes.end(); ++i )
+ *this |= *i;
+
return;
}
@@ -177,7 +187,7 @@ namespace mongo {
upperInclusive = false; //MaxForType String is an empty Object
}
- // regex matches self
+ // regex matches self - regex type > string type
if (e.type() == RegEx){
BSONElement re = addObj( BSON( "" << e ) ).firstElement();
intervals_.push_back( FieldInterval(re) );
@@ -351,6 +361,58 @@ namespace mongo {
return *this;
}
+ /* Fold one more interval into a union that is being built up.
+    low/high are the bounds of the interval currently being accumulated;
+    an EOO low.bound_ means nothing has been accumulated yet.  Finished
+    (disjoint) intervals are appended to newIntervals.
+    The caller is expected to pass intervals in non-decreasing order of
+    their lower bound, as FieldRange::operator|= does while merging.
+    @param lower         the next interval to fold in
+    @param low           in/out: lower bound of the interval being accumulated
+    @param high          in/out: upper bound of the interval being accumulated
+    @param newIntervals  receives each accumulated interval once a gap is found
+ */
+ void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector< FieldInterval > &newIntervals ) {
+     if ( low.bound_.eoo() ) {
+         // first interval seen: start accumulating from it
+         low = lower.lower_; high = lower.upper_;
+         return;
+     }
+     // when equal but neither inclusive, just assume they overlap, since
+     // current btree scanning code is just as efficient either way
+     if ( high.bound_.woCompare( lower.lower_.bound_, false ) < 0 ) {
+         // gap before the next interval: emit what we have and start over
+         FieldInterval tmp;
+         tmp.lower_ = low;
+         tmp.upper_ = high;
+         newIntervals.push_back( tmp );
+         low = lower.lower_; high = lower.upper_;
+         return;
+     }
+     // Overlap.  Only move the upper bound outward: the next interval may be
+     // nested inside the accumulated one (e.g. the point "bz" inside the
+     // prefix range for /^b/), and blindly taking its upper bound would
+     // shrink the union.  On a tie prefer the inclusive bound.
+     int cmp = high.bound_.woCompare( lower.upper_.bound_, false );
+     if ( cmp < 0 || ( cmp == 0 && lower.upper_.inclusive_ ) )
+         high = lower.upper_;
+ }
+
+ /* Replace this range with the union of this range and other.
+    Both interval lists are walked together in order of lower bound (an
+    inclusive lower bound of this range goes first on a tie) and each
+    interval is handed to handleInterval, which coalesces overlapping
+    intervals.  NOTE(review): this relies on each range's intervals_
+    already being ordered by lower bound - confirm against the constructor.
+    other's objData_ entries are appended to ours, and other's _special is
+    adopted when ours is empty.
+    @return *this
+ */
+ const FieldRange &FieldRange::operator|=( const FieldRange &other ) {
+     // Remember whether there is anything to merge before intervals_ is
+     // overwritten: with no input intervals there is no trailing interval
+     // to flush, and pushing one would add a bogus EOO-bounded interval.
+     const bool anyIntervals = !intervals_.empty() || !other.intervals_.empty();
+     vector< FieldInterval > newIntervals;
+     FieldBound low;
+     FieldBound high;
+     vector< FieldInterval >::const_iterator i = intervals_.begin();
+     vector< FieldInterval >::const_iterator j = other.intervals_.begin();
+     while( i != intervals_.end() && j != other.intervals_.end() ) {
+         int cmp = i->lower_.bound_.woCompare( j->lower_.bound_, false );
+         if ( ( cmp == 0 && i->lower_.inclusive_ ) || cmp < 0 ) {
+             handleInterval( *i, low, high, newIntervals );
+             ++i;
+         } else {
+             handleInterval( *j, low, high, newIntervals );
+             ++j;
+         }
+     }
+     // drain whichever list still has intervals left
+     while( i != intervals_.end() ) {
+         handleInterval( *i, low, high, newIntervals );
+         ++i;
+     }
+     while( j != other.intervals_.end() ) {
+         handleInterval( *j, low, high, newIntervals );
+         ++j;
+     }
+     if ( anyIntervals ) {
+         // flush the interval still being accumulated
+         FieldInterval tmp;
+         tmp.lower_ = low;
+         tmp.upper_ = high;
+         newIntervals.push_back( tmp );
+     }
+     intervals_ = newIntervals;
+     // append other's backing objects to ours
+     for( vector< BSONObj >::const_iterator i = other.objData_.begin(); i != other.objData_.end(); ++i )
+         objData_.push_back( *i );
+     if ( _special.size() == 0 && other._special.size() )
+         _special = other._special;
+     return *this;
+ }
+
BSONObj FieldRange::addObj( const BSONObj &o ) {
objData_.push_back( o );
return o;
View
@@ -50,6 +50,7 @@ namespace mongo {
public:
FieldRange( const BSONElement &e = BSONObj().firstElement() , bool isNot=false , bool optimize=true );
const FieldRange &operator&=( const FieldRange &other );
+ const FieldRange &operator|=( const FieldRange &other );
BSONElement min() const { assert( !empty() ); return intervals_[ 0 ].lower_.bound_; }
BSONElement max() const { assert( !empty() ); return intervals_[ intervals_.size() - 1 ].upper_.bound_; }
bool minInclusive() const { assert( !empty() ); return intervals_[ 0 ].lower_.inclusive_; }
View
@@ -2,12 +2,36 @@
t = db.regex5
t.drop()
-t.save( { x : [ "abc" , "xyz" ] } )
-t.save( { x : [ "ac" , "xyz" ] } )
+t.save( { x : [ "abc" , "xyz1" ] } )
+t.save( { x : [ "ac" , "xyz2" ] } )
a = /.*b.*c/
x = /.*y.*/
+doit = function() {
+
assert.eq( 1 , t.find( { x : a } ).count() , "A" )
assert.eq( 2 , t.find( { x : x } ).count() , "B" )
-// assert.eq( 1 , t.find( { x : { $all : [ a , x ] } } ).count() , "C" ) // SERVER-505
+assert.eq( 2 , t.find( { x : { $in: [ x ] } } ).count() , "C" ) // SERVER-322
+assert.eq( 1 , t.find( { x : { $in: [ a, "xyz1" ] } } ).count() , "D" ) // SERVER-322
+assert.eq( 2 , t.find( { x : { $in: [ a, "xyz2" ] } } ).count() , "E" ) // SERVER-322
+// assert.eq( 1 , t.find( { x : { $all : [ a , x ] } } ).count() , "F" ) // SERVER-505
+
+}
+
+doit();
+t.ensureIndex( {x:1} );
+print( "now indexed" );
+doit();
+
+// check bound unions SERVER-322
+assert.eq( [
+ [ {x:1},{x:1} ],
+ [ {x:2.5},{x:2.5} ],
+ [ {x:"a"},{x:"a"} ],
+ [ {x:"b"},{x:"e"} ],
+ [ {x:/^b/},{x:/^b/} ],
+ [ {x:/^c/},{x:/^c/} ],
+ [ {x:/^d/},{x:/^d/} ]
+ ],
+ t.find( { x : { $in: [ 1, 2.5, "a", "b", /^b/, /^c/, /^d/ ] } } ).explain().indexBounds );
View
@@ -21,3 +21,6 @@ assert.eq( /^a/i, t.findOne({r:/^a/i}).r, '2 1 b')
assert.eq( 1, t.count({r:/^a/i}), '2 2 b')
assert.eq( /^b/, t.findOne({r:/^b/}).r, '3 1 b')
assert.eq( 1, t.count({r:/^b/}), '3 2 b')
+
+t.insert( {r:"a"} );
+assert.eq( 2, t.count({r:/^a/}), 'c' );
@@ -497,6 +497,7 @@
93BFA0E311330A8C0045D084 /* not2.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = not2.js; sourceTree = "<group>"; };
93C38E940FA66622007D6E4A /* basictests.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = basictests.cpp; sourceTree = "<group>"; };
93C8E6FE11457D9000F28017 /* master1.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = master1.js; sourceTree = "<group>"; };
+ 93C8E81C1145BCCA00F28017 /* regex7.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = regex7.js; sourceTree = "<group>"; };
93CC40C2113C407A00734218 /* insert1.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = insert1.js; sourceTree = "<group>"; };
93CC441A113DE6BA00734218 /* indexg.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = indexg.js; sourceTree = "<group>"; };
93CC4484113E602400734218 /* in3.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = in3.js; sourceTree = "<group>"; };
@@ -748,6 +749,7 @@
934BEB9A10DFFA9600178102 /* jstests */ = {
isa = PBXGroup;
children = (
+ 93C8E81C1145BCCA00F28017 /* regex7.js */,
93CC4484113E602400734218 /* in3.js */,
93CC441A113DE6BA00734218 /* indexg.js */,
93CC40C2113C407A00734218 /* insert1.js */,

0 comments on commit 6d095cf

Please sign in to comment.