Browse files

SERVER-380: Experimental text search indexing

  • Loading branch information...
1 parent d2df300 commit f201972ecc87f099777e1c61f269998f4399caf4 @erh erh committed Dec 25, 2012
Showing with 8,070 additions and 0 deletions.
  1. +18 −0 jstests/fts1.js
  2. +23 −0 jstests/fts2.js
  3. +23 −0 jstests/fts3.js
  4. +23 −0 jstests/fts4.js
  5. +23 −0 jstests/fts5.js
  6. +27 −0 jstests/fts_blog.js
  7. +41 −0 jstests/fts_blogwild.js
  8. +152 −0 jstests/fts_mix.js
  9. +18 −0 jstests/fts_partition1.js
  10. +25 −0 jstests/fts_phrase.js
  11. +22 −0 jstests/fts_proj.js
  12. +32 −0 jstests/fts_spanish.js
  13. +21 −0 jstests/libs/fts.js
  14. +88 −0 src/mongo/db/fts/SConscript
  15. +93 −0 src/mongo/db/fts/fts_command.cpp
  16. +68 −0 src/mongo/db/fts/fts_command.h
  17. +159 −0 src/mongo/db/fts/fts_command_mongod.cpp
  18. +129 −0 src/mongo/db/fts/fts_command_mongos.cpp
  19. +28 −0 src/mongo/db/fts/fts_enabled.cpp
  20. +25 −0 src/mongo/db/fts/fts_enabled.h
  21. +96 −0 src/mongo/db/fts/fts_index.cpp
  22. +67 −0 src/mongo/db/fts/fts_index.h
  23. +119 −0 src/mongo/db/fts/fts_index_format.cpp
  24. +55 −0 src/mongo/db/fts/fts_index_format.h
  25. +96 −0 src/mongo/db/fts/fts_index_format_test.cpp
  26. +247 −0 src/mongo/db/fts/fts_matcher.cpp
  27. +67 −0 src/mongo/db/fts/fts_matcher.h
  28. +63 −0 src/mongo/db/fts/fts_matcher_test.cpp
  29. +173 −0 src/mongo/db/fts/fts_query.cpp
  30. +80 −0 src/mongo/db/fts/fts_query.h
  31. +73 −0 src/mongo/db/fts/fts_query_test.cpp
  32. +175 −0 src/mongo/db/fts/fts_search.cpp
  33. +103 −0 src/mongo/db/fts/fts_search.h
  34. +395 −0 src/mongo/db/fts/fts_spec.cpp
  35. +108 −0 src/mongo/db/fts/fts_spec.h
  36. +139 −0 src/mongo/db/fts/fts_spec_test.cpp
  37. +30 −0 src/mongo/db/fts/fts_util.cpp
  38. +112 −0 src/mongo/db/fts/fts_util.h
  39. +36 −0 src/mongo/db/fts/fts_util_test.cpp
  40. +56 −0 src/mongo/db/fts/generate_stop_words.py
  41. +58 −0 src/mongo/db/fts/stemmer.cpp
  42. +48 −0 src/mongo/db/fts/stemmer.h
  43. +42 −0 src/mongo/db/fts/stemmer_test.cpp
  44. +73 −0 src/mongo/db/fts/stop_words.cpp
  45. +50 −0 src/mongo/db/fts/stop_words.h
  46. +100 −0 src/mongo/db/fts/stop_words_danish.txt
  47. +48 −0 src/mongo/db/fts/stop_words_dutch.txt
  48. +174 −0 src/mongo/db/fts/stop_words_english.txt
  49. +747 −0 src/mongo/db/fts/stop_words_finnish.txt
  50. +126 −0 src/mongo/db/fts/stop_words_french.txt
  51. +992 −0 src/mongo/db/fts/stop_words_german.txt
  52. +35 −0 src/mongo/db/fts/stop_words_hungarian.txt
  53. +279 −0 src/mongo/db/fts/stop_words_italian.txt
  54. +119 −0 src/mongo/db/fts/stop_words_norwegian.txt
  55. +147 −0 src/mongo/db/fts/stop_words_portuguese.txt
  56. +258 −0 src/mongo/db/fts/stop_words_romanian.txt
  57. +421 −0 src/mongo/db/fts/stop_words_russian.txt
  58. +177 −0 src/mongo/db/fts/stop_words_spanish.txt
  59. +386 −0 src/mongo/db/fts/stop_words_swedish.txt
  60. +32 −0 src/mongo/db/fts/stop_words_test.cpp
  61. +114 −0 src/mongo/db/fts/stop_words_turkish.txt
  62. +129 −0 src/mongo/db/fts/tokenizer.cpp
  63. +68 −0 src/mongo/db/fts/tokenizer.h
  64. +119 −0 src/mongo/db/fts/tokenizer_test.cpp
View
18 jstests/fts1.js
@@ -0,0 +1,18 @@
+
+load( "jstests/libs/fts.js" );
+
+t = db.text1;
+t.drop();
+
+t.save( { _id : 1 , x : "az b c" } );
+t.save( { _id : 2 , x : "az b" } );
+t.save( { _id : 3 , x : "b c" } );
+t.save( { _id : 4 , x : "b c d" } );
+
+t.ensureIndex( { x : "text" } );
+
+assert.eq( [1,2,3,4] , queryIDS( t , "c az" ) , "A1" );
+assert.eq( [4] , queryIDS( t , "d" ) , "A2" );
+
+
+
View
23 jstests/fts2.js
@@ -0,0 +1,23 @@
+
+load( "jstests/libs/fts.js" );
+
+t = db.text2;
+t.drop();
+
+t.save( { _id : 1 , x : "az b x" , y : "c d m" , z : 1 } );
+t.save( { _id : 2 , x : "c d y" , y : "az b n" , z : 2 } );
+
+t.ensureIndex( { x : "text" } , { weights : { x : 10 , y : 1 } } );
+
+assert.eq( [1,2] , queryIDS( t , "az" ) , "A1" );
+assert.eq( [2,1] , queryIDS( t , "d" ) , "A2" );
+
+assert.eq( [1] , queryIDS( t , "x" ) , "A3" );
+assert.eq( [2] , queryIDS( t , "y" ) , "A4" );
+
+assert.eq( [1] , queryIDS( t , "az" , { z : 1 } ) , "B1" );
+assert.eq( [1] , queryIDS( t , "d" , { z : 1 } ) , "B2" );
+
+assert.eq( 2 , lastCommadResult.stats.nscannedObjects , "B3" );
+assert.eq( 2 , lastCommadResult.stats.nscanned , "B4" );
+
View
23 jstests/fts3.js
@@ -0,0 +1,23 @@
+
+load( "jstests/libs/fts.js" );
+
+t = db.text3;
+t.drop();
+
+t.save( { _id : 1 , x : "az b x" , y : "c d m" , z : 1 } );
+t.save( { _id : 2 , x : "c d y" , y : "az b n" , z : 2 } );
+
+t.ensureIndex( { x : "text" , z : 1 } , { weights : { x : 10 , y : 1 } } );
+
+assert.eq( [1,2] , queryIDS( t , "az" ) , "A1" );
+assert.eq( [2,1] , queryIDS( t , "d" ) , "A2" );
+
+assert.eq( [1] , queryIDS( t , "x" ) , "A3" );
+assert.eq( [2] , queryIDS( t , "y" ) , "A4" );
+
+assert.eq( [1] , queryIDS( t , "az" , { z : 1 } ) , "B1" );
+assert.eq( [1] , queryIDS( t , "d" , { z : 1 } ) , "B2" );
+
+assert.eq( 0 , lastCommadResult.stats.nscannedObjects , "B3" );
+assert.eq( 2 , lastCommadResult.stats.nscanned , "B4" );
+
View
23 jstests/fts4.js
@@ -0,0 +1,23 @@
+
+load( "jstests/libs/fts.js" );
+
+t = db.text4;
+t.drop();
+
+t.save( { _id : 1 , x : [ "az" , "b" , "x" ] , y : [ "c" , "d" , "m" ] , z : 1 } );
+t.save( { _id : 2 , x : [ "c" , "d" , "y" ] , y : [ "az" , "b" , "n" ] , z : 2 } );
+
+t.ensureIndex( { y : "text" , z : 1 } , { weights : { x : 10 } } );
+
+assert.eq( [1,2] , queryIDS( t , "az" ) , "A1" );
+assert.eq( [2,1] , queryIDS( t , "d" ) , "A2" );
+
+assert.eq( [1] , queryIDS( t , "x" ) , "A3" );
+assert.eq( [2] , queryIDS( t , "y" ) , "A4" );
+
+assert.eq( [1] , queryIDS( t , "az" , { z : 1 } ) , "B1" );
+assert.eq( [1] , queryIDS( t , "d" , { z : 1 } ) , "B2" );
+
+assert.eq( 0 , lastCommadResult.stats.nscannedObjects , "B3" );
+assert.eq( 2 , lastCommadResult.stats.nscanned , "B4" );
+
View
23 jstests/fts5.js
@@ -0,0 +1,23 @@
+
+load( "jstests/libs/fts.js" );
+
+t = db.text5;
+t.drop();
+
+t.save( { _id: 1 , x: [ { a: "az" } , { a: "b" } , { a: "x" } ] , y: [ "c" , "d" , "m" ] , z: 1 } );
+t.save( { _id: 2 , x: [ { a: "c" } , { a: "d" } , { a: "y" } ] , y: [ "az" , "b" , "n" ] , z: 2 } );
+
+t.ensureIndex( { y: "text" , z: 1 } , { weights: { "x.a": 10 } } );
+
+assert.eq( [1,2] , queryIDS( t , "az" ) , "A1" );
+assert.eq( [2,1] , queryIDS( t , "d" ) , "A2" );
+
+assert.eq( [1] , queryIDS( t , "x" ) , "A3" );
+assert.eq( [2] , queryIDS( t , "y" ) , "A4" );
+
+assert.eq( [1] , queryIDS( t , "az" , { z: 1 } ) , "B1" );
+assert.eq( [1] , queryIDS( t , "d" , { z: 1 } ) , "B2" );
+
+assert.eq( 0 , lastCommadResult.stats.nscannedObjects , "B3" );
+assert.eq( 2 , lastCommadResult.stats.nscanned , "B4" );
+
View
27 jstests/fts_blog.js
@@ -0,0 +1,27 @@
+
+t = db.text_blog;
+t.drop();
+
+t.save( { _id : 1 , title : "my blog post" , text : "this is a new blog i am writing. yay" } );
+t.save( { _id : 2 , title : "my 2nd post" , text : "this is a new blog i am writing. yay" } );
+t.save( { _id : 3 , title : "knives are Fun" , text : "this is a new blog i am writing. yay" } );
+
+// default weight is 1
+// specify weights if you want a field to be more meaningful
+t.ensureIndex( { "title" : "text" , text : "text" } , { weights : { title : 10 } } );
+
+res = t.runCommand( "text" , { search : "blog" } )
+assert.eq( 3, res.results.length );
+assert.eq( 1, res.results[0].obj._id );
+
+res = t.runCommand( "text" , { search : "write" } )
+assert.eq( 3, res.results.length );
+assert.eq( res.results[0].score, res.results[1].score );
+assert.eq( res.results[0].score, res.results[2].score );
+
+
+
+
+
+
+
View
41 jstests/fts_blogwild.js
@@ -0,0 +1,41 @@
+
+t = db.text_blog;
+t.drop();
+
+t.save( { _id: 1 , title: "my blog post" , text: "this is a new blog i am writing. yay eliot" } );
+t.save( { _id: 2 , title: "my 2nd post" , text: "this is a new blog i am writing. yay" } );
+t.save( { _id: 3 , title: "knives are Fun for writing eliot" , text: "this is a new blog i am writing. yay" } );
+
+// default weight is 1
+// specify weights if you want a field to be more meaningful
+t.ensureIndex( { dummy: "text" } , { weights: "$**" } );
+
+res = t.runCommand( "text" , { search: "blog" } );
+assert.eq( 3 , res.stats.n , "A1" );
+
+res = t.runCommand( "text" , { search: "write" } );
+assert.eq( 3 , res.stats.n , "B1" );
+
+// mixing
+t.dropIndex( "dummy_text" );
+assert.eq( 1 , t.getIndexKeys().length , "C1" );
+t.ensureIndex( { dummy: "text" } , { weights: { "$**": 1 , title: 2 } } );
+
+
+res = t.runCommand( "text" , { search: "write" } );
+assert.eq( 3 , res.stats.n , "C2" );
+assert.eq( 3 , res.results[0].obj._id , "C3" );
+
+res = t.runCommand( "text" , { search: "blog" } );
+assert.eq( 3 , res.stats.n , "D1" );
+assert.eq( 1 , res.results[0].obj._id , "D2" );
+
+res = t.runCommand( "text" , { search: "eliot" } );
+assert.eq( 2 , res.stats.n , "E1" );
+assert.eq( 3 , res.results[0].obj._id , "E2" );
+
+
+
+
+
+
View
152 jstests/fts_mix.js
@@ -0,0 +1,152 @@
+
+load( "jstests/libs/fts.js" );
+
+// test collection
+tc = db.text_mix;
+tc.drop();
+
+// creation of collection documents
+// content generated using wikipedia random article
+tc.save( { _id: 1, title: "Olivia Shakespear",text: "Olivia Shakespear (born Olivia Tucker; 17 March 1863 – 3 October 1938) was a British novelist, playwright, and patron of the arts. She wrote six books that are described as \"marriage problem\" novels. Her works sold poorly, sometimes only a few hundred copies. Her last novel, Uncle Hilary, is considered her best. She wrote two plays in collaboration with Florence Farr." } );
+tc.save( { _id: 2, title: "Mahim Bora", text: "Mahim Bora (born 1926) is an Indian writer and educationist from Assam state. He was born at a tea estate of Sonitpur district. He is an M.A. in Assamese literature from Gauhati University and had been a teacher in the Nowgong College for most of his teaching career. He has now retired and lives at Nagaon. Bora spent a good part of his childhood in the culture-rich surroundings of rural Nagaon, where the river Kalong was the life-blood of a community. His impressionable mind was to capture a myriad memories of that childhood, later to find expression in his poems, short stories and novels with humour, irony and pathos woven into their texture. When this river was dammed up, its disturbing effect was on the entire community dependant on nature's bounty." } );
+tc.save( { _id: 3, title: "A break away!", text: "A break away! is an 1891 painting by Australian artist Tom Roberts. The painting depicts a mob of thirsty sheep stampeding towards a dam. A drover on horseback is attempting to turn the mob before they drown or crush each other in their desire to drink. The painting, an \"icon of Australian art\", is part of a series of works by Roberts that \"captures what was an emerging spirit of national identity.\" Roberts painted the work at Corowa. The painting depicts a time of drought, with little grass and the soil kicked up as dust. The work itself is a reflection on the pioneering days of the pastoral industry, which were coming to an end by the 1890s." } );
+tc.save( { _id: 4, title: "Linn-Kristin Riegelhuth Koren", text: "Linn-Kristin Riegelhuth Koren (born 1 August 1984, in Ski) is a Norwegian handballer playing for Larvik HK and the Norwegian national team. She is commonly known as Linka. Outside handball she is a qualified nurse." } );
+tc.save( { _id: 5, title: "Morten Jensen", text: "Morten Jensen (born December 2, 1982 in Lynge) is a Danish athlete. He primarily participates in long jump, 100 metres and 200 metres. He competed at the World Championships in 2005 and 2007, the 2006 World Indoor Championships, the 2006 European Championships, the 2007 World Championships and the 2008 Olympic Games without qualifying for the final round. He was runner-up in the 2010 Finnish Elite Games rankings, just missing out to Levern Spencer for that year's jackpot. He holds the Danish record in both long jump and 100 metres. He also holds the Danish indoor record in the 200 metres. He has been a part of the Sparta teamsine 2005, before then he was a part of FIF Hillerd. His coach was Leif Dahlberg after the 2010 European Championships he change to Lars Nielsen and Anders Miller." } );
+tc.save( { _id: 6, title: "Janet Laurence", text: "Janet Laurence (born 1947) is a Sydney based Australian artist who works in mixed media and installation. Her work has been included in major survey exhibitions, nationally and internationally and is regularly exhibited in Sydney, Melbourne and Japan. Her work explores a relationship to the natural world, often from an architectural context. It extends from the gallery space into the urban fabric, and has been realized in many site specific projects, often involving collaborations with architects, landscape architects and environmental scientists. She has received many grants and awards including a Rockefeller Residency in 1997. Laurence was a Trustee of the Art Gallery of NSW from 1995 to 2005. Laurence was the subject of John Beard's winning entry for the 2007 Archibald Prize." } );
+tc.save( { _id: 7, title: "Glen-Coats Baronets", text: "The Glen-Coats Baronetcy, of Ferguslie Park in the Parish of Abbey in the County of Renfrew, was a title in the Baronetage of the United Kingdom. It was created on 25 June 1894 for Thomas Glen-Coats, Director of the thread-making firm of J. & P. Coats, Ltd, and later Liberal Member of Parliament for Renfrewshire West. Born Thomas Coats, he assumed the additional surname of Glen, which was that of his maternal grandfather. He was succeeded by his son, the second Baronet. He won a gold medal in sailing at the 1908 Summer Olympics. The title became extinct on his death in 1954. Two other members of the Coats family also gained distinction. George Coats, 1st Baron Glentanar, was the younger brother of the first Baronet, while Sir James Coats, 1st Baronet (see Coats Baronets), was the first cousin of the first Baronet." } );
+tc.save( { _id: 8, title: "Grapeleaf Skeletonizer", text: "The Grapeleaf Skeletonizer, Harrisina americana is a moth in the family Zygaenidae. It is widespread in the eastern half of the United States, and commonly noticed defoliating grapes, especially of the Virginia creeper (Parthenocissus quinquefolia). The western grapeleaf skeletonizer, Harrisina brillians is very similar to and slightly larger than H. americana, but their distributions are different. Members of this family all produce hydrogen cyanide, a potent antipredator toxin." } );
+tc.save( { _id: 9, title: "Physics World", text: "Physics World is the membership magazine of the Institute of Physics, one of the largest physical societies in the world. It is an international monthly magazine covering all areas of physics, both pure and applied, and is aimed at physicists in research, industry and education worldwide. It was launched in 1988 by IOP Publishing Ltd and has established itself as one of the world's leading physics magazines. The magazine is sent free to members of the Institute of Physics, who can also access a digital edition of the magazine, although selected articles can be read by anyone for free online. It was redesigned in September 2005 and has an audited circulation of just under 35000. The current editor is Matin Durrani. Also on the team are Dens Milne (associate editor), Michael Banks (news editor), Louise Mayor (features editor) and Margaret Harris (reviews and careers editor). Hamish Johnston is the editor of the magazine's website physicsworld.com and James Dacey is its reporter." } );
+tc.save( { _id: 10, title: "Mallacoota, Victoria", text: "Mallacoota is a small town in the East Gippsland region of Victoria, Australia. At the 2006 census, Mallacoota had a population of 972. At holiday times, particularly Easter and Christmas, the population increases by about 8,000. It is one of the most isolated towns in the state of Victoria, 25 kilometres off the Princes Highway and 523 kilometres (325 mi) from Melbourne. It is 526 kilometres (327 mi) from Sydney, New South Wales. It is halfway between Melbourne and Sydney when travelling via Princes Highway, though that is a long route between Australia's two main cities. It is the last official township on Victoria's east coast before the border with New South Wales. Mallacoota has a regional airport (Mallacoota Airport) YMCO (XMC) consisting of a grassed field for private light planes. It is known for its wild flowers, abalone industry, the inlet estuary consisting of Top Lake and Bottom Lake, and Croajingolong National Park that surround it. It is a popular and beautiful holiday spot for boating, fishing, walking the wilderness coast, swimming, birdwatching, and surfing. The Mallacoota Arts Council runs events throughout each year. Mallacoota Inlet is one of the main villages along the wilderness coast walk from NSW to Victoria, Australia." } );
+
+// begin tests
+
+// -------------------------------------------- INDEXING & WEIGHTING -------------------------------
+
+// start with basic index, one item with default weight
+tc.ensureIndex( { "title": "text" } );
+
+// test the single result case..
+res = tc.runCommand( "text", { search: "Victoria" } );
+assert.eq( 1, res.results.length );
+assert.eq( 10, res.results[0].obj._id );
+
+tc.dropIndexes();
+
+// now let's see about multiple fields, with specific weighting
+tc.ensureIndex( { "title": "text", "text": "text" }, { weights: { "title": 10 } } );
+assert.eq( [9,7,8], queryIDS( tc, "members physics" ) );
+
+tc.dropIndexes();
+
+// test all-1 weighting with "$**"
+tc.ensureIndex( { "$**": "text" } );
+assert.eq( [2,8,7], queryIDS( tc, "family tea estate" ) );
+
+tc.dropIndexes();
+
+// non-1 weight on "$**" + other weight specified for some field
+tc.ensureIndex( { "$**": "text" }, { weights: { "$**": 10, "text": 2 } } );
+assert.eq( [7,5], queryIDS( tc, "Olympic Games gold medal" ) );
+
+tc.dropIndexes();
+
+// -------------------------------------------- SEARCHING ------------------------------------------
+
+// go back to "$**": 1, "title": 10.. and test more specific search functionality!
+tc.ensureIndex( { "$**": "text" }, { weights: { "title": 10 } } );
+
+// -------------------------------------------- STEMMING -------------------------------------------
+
+// tests stemming for basic plural case
+res = tc.runCommand( "text", { search: "member" } );
+res2 = tc.runCommand( "text", { search: "members" } );
+assert.eq( getIDS( res ), getIDS( res2 ) );
+
+// search for something with potential 's bug.
+res = tc.runCommand( "text", { search: "magazine's" } );
+res2 = tc.runCommand( "text", { search: "magazine" } );
+assert.eq( getIDS( res ), getIDS( res2 ) );
+
+// -------------------------------------------- LIMIT RESULTS --------------------------------------
+
+// ensure limit limits results
+assert.eq( [2], queryIDS( tc, "rural river dam", null , { limit : 1 } ) );
+
+// ensure top results are the same regardless of limit
+// make sure that this uses a case where it wouldn't be otherwise..
+res = tc.runCommand( "text", { search: "united kingdom british princes", limit: 1 } );
+res2 = tc.runCommand( "text", { search: "united kingdom british princes" } );
+assert.eq( 1, res.results.length );
+assert.eq( 4, res2.results.length );
+assert.eq( res.results[0].obj._id, res2.results[0].obj._id );
+
+// -------------------------------------------- PROJECTION -----------------------------------------
+
+// test projection.. show just title and id
+res = tc.runCommand( "text", { search: "Morten Jensen", projection: { title: 1 } } );
+assert.eq( 1, res.results.length );
+assert.eq( 5, res.results[0].obj._id );
+assert.eq( null, res.results[0].obj.text );
+assert.neq( null, res.results[0].obj.title );
+assert.neq( null, res.results[0].obj._id );
+
+// test negative projection, ie. show everything but text
+res = tc.runCommand( "text", { search: "handball", projection: { text: 0 } } );
+assert.eq( 1, res.results.length );
+assert.eq( 4, res.results[0].obj._id );
+assert.eq( null, res.results[0].obj.text );
+assert.neq( null, res.results[0].obj.title );
+assert.neq( null, res.results[0].obj._id );
+
+// test projection only title, no id
+res = tc.runCommand( "text", { search: "Mahim Bora", projection: { _id: 0, title: 1 } } );
+assert.eq( 1, res.results.length );
+assert.eq( "Mahim Bora", res.results[0].obj.title );
+assert.eq( null, res.results[0].obj.text );
+assert.neq( null, res.results[0].obj.title );
+assert.eq( null, res.results[0].obj._id );
+
+// -------------------------------------------- NEGATION -------------------------------------------
+
+// test negation
+assert.eq( [8], queryIDS( tc, "United -Kingdom" ) );
+assert.eq( -1, tc.findOne( { _id : 8 } ).text.search(/Kingdom/i) );
+
+// test negation edge cases... hyphens, double dash, etc.
+assert.eq( [4], queryIDS( tc, "Linn-Kristin" ) );
+
+// -------------------------------------------- PHRASE MATCHING ------------------------------------
+
+// test exact phrase matching
+assert.eq( [7], queryIDS( tc, "\"Summer Olympics\"" ) );
+assert.neq( -1, tc.findOne( { _id: 7 } ).text.indexOf("Summer Olympics") );
+
+// phrasematch with other stuff.. negation, other terms, etc.
+assert.eq( [10], queryIDS( tc, "\"wild flowers\" Sydney" ) );
+
+assert.eq( [3], queryIDS( tc, "\"industry\" -Melbourne -Physics" ) );
+
+// -------------------------------------------- EDGE CASES -----------------------------------------
+
+// test empty string
+res = tc.runCommand( "text", { search: "" } );
+assert.eq( 0, res.ok )
+
+// test string with a space in it
+res = tc.runCommand( "text", { search: " " } );
+assert.eq( 0, res.results.length );
+
+// -------------------------------------------- FILTERING ------------------------------------------
+
+assert.eq( [2], queryIDS( tc, "Mahim" ) );
+assert.eq( [2], queryIDS( tc, "Mahim", { _id: 2 } ) );
+assert.eq( [], queryIDS( tc, "Mahim", { _id: 1 } ) );
+assert.eq( [], queryIDS( tc, "Mahim", { _id: { $gte: 4 } } ) );
+assert.eq( [2], queryIDS( tc, "Mahim", { _id: { $lte: 4 } } ) );
+
+// using regex conditional filtering
+assert.eq( [9], queryIDS( tc, "members", { title: { $regex: /Phy.*/i } } ) );
+
+// -------------------------------------------------------------------------------------------------
+
+assert( tc.validate().valid );
View
18 jstests/fts_partition1.js
@@ -0,0 +1,18 @@
+load( "jstests/libs/fts.js" )
+
+t = db.text_parition1;
+t.drop();
+
+t.insert( { _id : 1 , x : 1 , y : "foo" } );
+t.insert( { _id : 2 , x : 1 , y : "bar" } );
+t.insert( { _id : 3 , x : 2 , y : "foo" } );
+t.insert( { _id : 4 , x : 2 , y : "bar" } );
+
+t.ensureIndex( { x : 1, y : "text" } );
+
+res = t.runCommand( "text", { search : "foo" } );
+assert.eq( 0, res.ok, tojson(res) );
+
+assert.eq( [ 1 ], queryIDS( t, "foo" , { x : 1 } ) );
+
+
View
25 jstests/fts_phrase.js
@@ -0,0 +1,25 @@
+
+t = db.text_phrase;
+t.drop()
+
+t.save( { _id : 1 , title : "my blog post" , text : "i am writing a blog. yay" } );
+t.save( { _id : 2 , title : "my 2nd post" , text : "this is a new blog i am typing. yay" } );
+t.save( { _id : 3 , title : "knives are Fun" , text : "this is a new blog i am writing. yay" } );
+
+t.ensureIndex( { "title" : "text" , text : "text" } , { weights : { title : 10 } } );
+
+res = t.runCommand( "text" , { search : "blog write" } );
+assert.eq( 3, res.results.length );
+assert.eq( 1, res.results[0].obj._id );
+assert( res.results[0].score > (res.results[1].score*2), tojson(res) );
+
+res = t.runCommand( "text" , { search : "write blog" } );
+assert.eq( 3, res.results.length );
+assert.eq( 1, res.results[0].obj._id );
+assert( res.results[0].score > (res.results[1].score*2), tojson(res) );
+
+
+
+
+
+
View
22 jstests/fts_proj.js
@@ -0,0 +1,22 @@
+load( "jstests/libs/fts.js" );
+
+t = db.text_proj;
+t.drop();
+
+t.save( { _id : 1 , x : "a", y: "b", z : "c"});
+t.save( { _id : 2 , x : "d", y: "e", z : "f"});
+t.save( { _id : 3 , x : "a", y: "g", z : "h"});
+
+t.ensureIndex( { x : "text"} , { default_language : "none" } );
+
+res = t.runCommand("text", {search : "a"});
+assert.eq( 2, res.results.length );
+assert( res.results[0].obj.y, tojson(res) );
+
+res = t.runCommand("text", {search : "a", projection: {x: 1}});
+assert.eq( 2, res.results.length );
+assert( !res.results[0].obj.y, tojson(res) );
+
+
+
+
View
32 jstests/fts_spanish.js
@@ -0,0 +1,32 @@
+
+load( "jstests/libs/fts.js" );
+
+t = db.text_spanish;
+t.drop();
+
+t.save( { _id: 1, title: "mi blog", text: "Este es un blog de prueba" } );
+t.save( { _id: 2, title: "mi segundo post", text: "Este es un blog de prueba" } );
+t.save( { _id: 3, title: "cuchillos son divertidos", text: "este es mi tercer blog stemmed" } );
+t.save( { _id: 4, language: "english", title: "My fourth blog", text: "This stemmed blog is in english" } );
+
+// default weight is 1
+// specify weights if you want a field to be more meaningful
+t.ensureIndex( { "title": "text", text: "text" }, { weights: { title: 10 },
+ default_language: "spanish" } );
+
+res = t.runCommand( "text", { search: "blog" } );
+assert.eq( 4, res.results.length );
+
+assert.eq( [4], queryIDS( t, "stem" ) );
+assert.eq( [3], queryIDS( t, "stemmed" ) );
+assert.eq( [4], queryIDS( t, "stemmed", null, { language : "english" } ) );
+
+assert.eq( [1,2], queryIDS( t, "prueba" ) );
+
+
+
+
+
+
+
+
View
21 jstests/libs/fts.js
@@ -0,0 +1,21 @@
+
+// make sure we're enabled
+db.adminCommand( { setParameter : "*", textSearchEnabled : true } );
+
+function queryIDS( coll, search, filter, extra ){
+ var cmd = { search : search }
+ if ( filter )
+ cmd.filter = filter;
+ if ( extra )
+ Object.extend( cmd, extra );
+ lastCommadResult = coll.runCommand( "text" , cmd);
+
+ return getIDS( lastCommadResult );
+}
+
+function getIDS( commandResult ){
+ if ( ! ( commandResult && commandResult.results ) )
+ return []
+
+ return commandResult.results.map( function(z){ return z.obj._id; } )
+}
View
88 src/mongo/db/fts/SConscript
@@ -0,0 +1,88 @@
+# -*- mode: python -*-
+
+Import("env")
+
+stop_word_lanages = [
+ 'danish',
+ 'dutch',
+ 'english',
+ 'finnish',
+ 'french',
+ 'german',
+ 'hungarian',
+ 'italian',
+ 'norwegian',
+ 'portuguese',
+ 'romanian',
+ 'russian',
+ 'spanish',
+ 'swedish',
+ 'turkish',
+]
+
+env.Command( [ "stop_words_list.h", "stop_words_list.cpp"],
+ [ "generate_stop_words.py"] + [ 'stop_words_%s.txt' % x for x in stop_word_lanages ],
+ "$PYTHON $SOURCES $TARGETS" )
+
+# this is not awesome
+hack = env.Clone()
+hack.StaticLibrary( "stopwords", [ "stop_words_list.cpp" ] )
+if "-O3" in hack["CCFLAGS"]:
+ hack["CCFLAGS"] = hack["CCFLAGS"].remove( "-O3" )
+
+env.StaticLibrary('base', [
+ 'fts_index_format.cpp',
+ 'fts_matcher.cpp',
+ 'fts_query.cpp',
+ 'fts_spec.cpp',
+ 'fts_util.cpp',
+ 'stemmer.cpp',
+ 'stop_words.cpp',
+ 'tokenizer.cpp',
+ ], LIBDEPS=["stopwords",
+ "$BUILD_DIR/mongo/base/base",
+ "$BUILD_DIR/mongo/bson",
+ "$BUILD_DIR/mongo/platform/platform",
+ "$BUILD_DIR/third_party/libstemmer_c/stemmer"
+ ])
+
+env.StaticLibrary( 'server_common', [
+ 'fts_command.cpp',
+ 'fts_enabled.cpp'
+ ] )
+
+env.StaticLibrary('ftsmongod', [
+ 'fts_command_mongod.cpp',
+ 'fts_index.cpp',
+ 'fts_search.cpp',
+ ], LIBDEPS=["base","server_common"])
+
+
+env.StaticLibrary('ftsmongos', [
+ 'fts_command_mongos.cpp',
+ ], LIBDEPS=["server_common"])
+
+
+env.CppUnitTest( "fts_index_format_test", "fts_index_format_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_stop_words_test", "stop_words_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_stemmer_test", "stemmer_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_tokenizer_test", "tokenizer_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_query_test", "fts_query_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_spec_test", "fts_spec_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_util_test", "fts_util_test.cpp",
+ LIBDEPS=["base","$BUILD_DIR/mongo/mongohasher"] )
View
93 src/mongo/db/fts/fts_command.cpp
@@ -0,0 +1,93 @@
+// fts_command.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <string>
+#include <vector>
+
+#include "mongo/db/fts/fts_command.h"
+#include "mongo/db/fts/fts_enabled.h"
+#include "mongo/db/fts/fts_search.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/timer.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ using namespace mongoutils;
+
+ FTSCommand ftsCommand;
+
+ // Passes the command name "text" to the Command base class.
+ FTSCommand::FTSCommand() : Command( "text" ) {}
+
+ // Appends to out the single privilege this command asks for: the find action on
+ // the namespace that parseNs() returns for dbname and cmdObj.
+ void FTSCommand::addRequiredPrivileges(const std::string& dbname,
+                                        const BSONObj& cmdObj,
+                                        std::vector<Privilege>* out) {
+     ActionSet requiredActions;
+     requiredActions.addAction(ActionType::find);
+
+     const Privilege findPrivilege(parseNs(dbname, cmdObj), requiredActions);
+     out->push_back(findPrivilege);
+ }
+
+
+ // Entry point for the "text" command: checks that text search is enabled, reads
+ // the command's fields out of cmdObj and hands them to _run().
+ //   "search"     - a search string of size 0 fails with "no search specified"
+ //   "language"   - passed on as a string
+ //   "limit"      - a value of 0 is replaced by 100
+ //   "filter"     - used only when the element is an object, otherwise left empty
+ //   "projection" - used only when the element is an object, otherwise left empty
+ // Returns false with errmsg set when one of the checks fails, otherwise whatever
+ // _run() returns.
+ bool FTSCommand::run(const string& dbname,
+                      BSONObj& cmdObj,
+                      int options,
+                      string& errmsg,
+                      BSONObjBuilder& result,
+                      bool fromRepl) {
+
+     if ( !isTextSearchEnabled() ) {
+         errmsg = "text search not enabled";
+         return false;
+     }
+
+     // the string value of the command's first element is the collection part of
+     // the namespace
+     const string collection = cmdObj.firstElement().String();
+     const string ns = dbname + "." + collection;
+
+     const string search = cmdObj["search"].valuestrsafe();
+     if ( search.empty() ) {
+         errmsg = "no search specified";
+         return false;
+     }
+
+     const string language = cmdObj["language"].valuestrsafe();
+
+     const int requestedLimit = cmdObj["limit"].numberInt();
+     const int limit = ( requestedLimit == 0 ) ? 100 : requestedLimit;
+
+     // _run() takes these two by non-const reference, so they stay plain locals
+     BSONObj filter;
+     BSONObj projection;
+
+     if ( cmdObj["filter"].isABSONObj() ) {
+         filter = cmdObj["filter"].Obj();
+     }
+     if ( cmdObj["projection"].isABSONObj() ) {
+         projection = cmdObj["projection"].Obj();
+     }
+
+     return _run( dbname, cmdObj, options,
+                  ns, search, language, limit, filter, projection,
+                  errmsg, result );
+ }
+
+
+ }
+
+
+}
View
68 src/mongo/db/fts/fts_command.h
@@ -0,0 +1,68 @@
+// fts_command.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "mongo/db/commands.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ // Command object for the "text" command.  The constructor, addRequiredPrivileges()
+ // and run() are defined in fts_command.cpp; locktype() and _run() are only
+ // declared here (fts_command_mongod.cpp defines them for mongod).
+ class FTSCommand : public Command {
+ public:
+     FTSCommand();
+
+     // both of these always answer true
+     bool slaveOk() const { return true; }
+     bool slaveOverrideOk() const { return true; }
+
+     LockType locktype() const;
+
+     // Appends the privileges needed to run cmdObj against dbname to out.
+     void addRequiredPrivileges(const std::string& dbname,
+                                const BSONObj& cmdObj,
+                                std::vector<Privilege>* out);
+
+     // Validates the command, extracts its fields and forwards them to _run().
+     // Returns false with errmsg set when validation fails.
+     bool run(const string& dbname,
+              BSONObj& cmdObj,
+              int options,
+              string& errmsg,
+              BSONObjBuilder& result,
+              bool fromRepl);
+
+ protected:
+     // Receives the fields that run() extracted from the command, along with the
+     // original command object.
+     bool _run( const string& dbName,
+                BSONObj& cmdObj,
+                int cmdOptions,
+                const string& ns,
+                const string& searchString,
+                string language, // "" for not-set
+                int limit,
+                BSONObj& filter,
+                BSONObj& projection,
+                string& errmsg,
+                BSONObjBuilder& result );
+ };
+
+ }
+
+}
+
View
159 src/mongo/db/fts/fts_command_mongod.cpp
@@ -0,0 +1,159 @@
+// fts_command_mongod.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "mongo/db/fts/fts_command.h"
+#include "mongo/db/fts/fts_search.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/db/pdfile.h"
+#include "mongo/db/projection.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/timer.h"
+
+namespace mongo {
+
+ namespace fts {
+ // mongod implementation: the text command declares a READ lock.
+ Command::LockType FTSCommand::locktype() const {
+ return READ;
+ }
+
+ /*
+  * mongod implementation: runs a text search against the single text index
+  * of ns and appends the scored matches plus statistics to result.
+  * @param dbname, name of db
+  * @param cmdObj, object that contains entire command
+  * @param cmdOptions, command options
+  * @param ns, namespace whose text index is searched
+  * @param searchString, raw search string, parsed by FTSQuery
+  * @param language, search language; "" means use the index's default language
+  * @param limit, passed through to FTSSearch::go
+  * @param filter, handed to FTSSearch and used to derive the index prefix
+  * @param projection, if non-empty, applied to every returned document
+  * @param errmsg, reference to error message
+  * @param result, reference to builder for result
+  * @return true if successful, false otherwise
+  */
+ bool FTSCommand::_run(const string& dbname,
+                       BSONObj& cmdObj,
+                       int cmdOptions,
+                       const string& ns,
+                       const string& searchString,
+                       string language, // "" for not-set
+                       int limit,
+                       BSONObj& filter,
+                       BSONObj& projection,
+                       string& errmsg,
+                       BSONObjBuilder& result ) {
+
+     Timer comm;
+
+     scoped_ptr<Projection> pr;
+     if ( !projection.isEmpty() ) {
+         pr.reset( new Projection() );
+         pr->init( projection );
+     }
+
+     // priority queue for results
+     Results results;
+
+     NamespaceDetails * d = nsdetails( ns.c_str() );
+     if ( !d ) {
+         errmsg = "can't find ns";
+         return false;
+     }
+
+     // exactly one text index must exist on the namespace
+     vector<int> idxMatches;
+     d->findIndexByType( INDEX_NAME, idxMatches );
+     if ( idxMatches.size() == 0 ) {
+         errmsg = str::stream() << "no text index for: " << ns;
+         return false;
+     }
+     if ( idxMatches.size() > 1 ) {
+         errmsg = str::stream() << "too many text index for: " << ns;
+         return false;
+     }
+
+     const IndexDetails& id = d->idx( idxMatches[0] );
+     FTSIndex* ftsIndex = static_cast<FTSIndex*>(id.getSpec().getType());
+
+     if ( language == "" ) {
+         language = ftsIndex->getFtsSpec().defaultLanguage();
+     }
+
+     // The index prefix is derived from the filter and the index spec only, so
+     // it has to be computed whether or not the caller supplied a language.
+     BSONObj indexPrefix;
+     Status s = ftsIndex->getFtsSpec().getIndexPrefix( filter, &indexPrefix );
+     if ( !s.isOK() ) {
+         errmsg = s.toString();
+         return false;
+     }
+
+     FTSQuery query;
+     if ( !query.parse( searchString, language ).isOK() ) {
+         errmsg = "can't parse search";
+         return false;
+     }
+     result.append( "queryDebugString", query.debugString() );
+     result.append( "language", language );
+
+     FTSSearch search( d, id, indexPrefix, query, filter );
+     search.go( &results, limit );
+
+     // grab underlying container inside priority queue
+     vector<ScoredLocation> r( results.dangerous() );
+
+     // sort results by score (not always in correct order, especially w.r.t. multiterm)
+     sort( r.begin(), r.end() );
+
+     // build the results bson array shown to user
+     BSONArrayBuilder a( result.subarrayStart( "results" ) );
+
+     // running estimate of the reply size; starts at 1024 to leave headroom
+     int BSONResultSize = 1024;
+
+     // number of documents actually appended (may be < r.size() if the size cap is hit)
+     long long numReturned = 0;
+
+     for ( unsigned n = 0; n < r.size(); n++ ) {
+         BSONObj obj = BSONObj::make(r[n].rec);
+         BSONObj toSendBack = obj;
+
+         if ( pr ) {
+             toSendBack = pr->transform(obj);
+         }
+
+         // stop before the reply would reach the maximum user BSON size
+         if ( ( BSONResultSize + toSendBack.objsize() ) >= BSONObjMaxUserSize ) {
+             break;
+         }
+
+         BSONObjBuilder x( a.subobjStart() );
+         x.append( "score" , r[n].score );
+         x.append( "obj", toSendBack );
+
+         BSONObj xobj = x.done();
+         BSONResultSize += xobj.objsize();
+         numReturned++;
+     }
+
+     a.done();
+
+     // returns some stats to the user
+     BSONObjBuilder bb( result.subobjStart( "stats" ) );
+     bb.appendNumber( "nscanned" , search.getKeysLookedAt() );
+     bb.appendNumber( "nscannedObjects" , search.getObjLookedAt() );
+     bb.appendNumber( "n" , numReturned );
+     bb.append( "timeMicros", (int)comm.micros() );
+     bb.done();
+
+     return true;
+ }
+ }
+
+}
View
129 src/mongo/db/fts/fts_command_mongos.cpp
@@ -0,0 +1,129 @@
+// fts_command_mongos.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_command.h"
+#include "mongo/s/strategy.h"
+
+
+namespace mongo {
+ namespace fts {
+
+ /**
+  * One result document from a shard, paired with the numeric value of its
+  * "score" field so results from all shards can be merged and sorted.
+  */
+ struct Scored {
+     Scored( BSONObj full )
+         : full( full ), score( full["score"].numberDouble() ) {
+     }
+     // Inverted on purpose: higher scores order first, so sort() is descending.
+     bool operator<( const Scored& other ) const {
+         return score > other.score;
+     }
+     BSONObj full;
+     double score;
+ };
+
+
+ // all grid commands are designed not to lock, so the mongos build reports NONE
+ Command::LockType FTSCommand::locktype() const { return NONE; }
+
+ /*
+  * mongos implementation: sends the command to the shards via
+  * SHARDED->commandOp, merges the per-shard "results" arrays by score and
+  * returns at most limit of them together with aggregated stats.
+  * Fails (returns false) as soon as any shard reply is not ok.
+  */
+ bool FTSCommand::_run(const string& dbName,
+                       BSONObj& cmdObj,
+                       int cmdOptions,
+                       const string& ns,
+                       const string& searchString,
+                       string language, // "" for not-set
+                       int limit,
+                       BSONObj& filter,
+                       BSONObj& projection,
+                       string& errmsg,
+                       BSONObjBuilder& result ) {
+
+     Timer timer;
+
+     map<Shard, BSONObj> shardReplies;
+     SHARDED->commandOp( dbName, cmdObj, cmdOptions, ns, filter, shardReplies );
+
+     vector<Scored> merged;
+     long long totalScanned = 0;
+     long long totalScannedObjects = 0;
+
+     BSONObjBuilder shardStats;
+
+     map<Shard,BSONObj>::const_iterator it = shardReplies.begin();
+     for ( ; it != shardReplies.end(); ++it ) {
+         const Shard& shard = it->first;
+         BSONObj reply = it->second;
+
+         LOG(2) << "fts result for shard: " << shard << "\n" << reply << endl;
+
+         // a single failing shard fails the whole command
+         if ( !reply["ok"].trueValue() ) {
+             errmsg = str::stream() << "failure on shard: " << shard.toString()
+                                    << ": " << reply["errmsg"];
+             result.append( "rawresult", reply );
+             return false;
+         }
+
+         // accumulate counters and keep the raw per-shard stats
+         if ( reply["stats"].isABSONObj() ) {
+             BSONObj stats = reply["stats"].Obj();
+             totalScanned += stats["nscanned"].numberLong();
+             totalScannedObjects += stats["nscannedObjects"].numberLong();
+
+             shardStats.append( shard.getName(), stats );
+         }
+
+         // collect every scored document from this shard
+         if ( reply["results"].isABSONObj() ) {
+             for ( BSONObjIterator docs( reply["results"].Obj() ); docs.more(); ) {
+                 BSONElement doc = docs.next();
+                 merged.push_back( Scored(doc.Obj()) );
+             }
+         }
+     }
+
+     // Scored::operator< puts higher scores first
+     sort( merged.begin(), merged.end() );
+
+     long long n = 0;
+     {
+         BSONArrayBuilder arr( result.subarrayStart( "results" ) );
+         unsigned idx = 0;
+         while ( idx < merged.size() ) {
+             arr.append( merged[idx].full );
+             ++n;
+             if ( n >= limit )
+                 break;
+             ++idx;
+         }
+         arr.done();
+     }
+
+     {
+         BSONObjBuilder stats( result.subobjStart( "stats" ) );
+         stats.appendNumber( "nscanned", totalScanned );
+         stats.appendNumber( "nscannedObjects", totalScannedObjects );
+         stats.appendNumber( "n", n );
+         stats.append( "timeMicros", (int)timer.micros() );
+
+         stats.append( "shards", shardStats.obj() );
+
+         stats.done();
+     }
+
+     return true;
+ }
+ // Namespace-scope FTSCommand instance for the mongos build (fts_command_mongos.cpp).
+ FTSCommand ftsCommandSharded;
+ }
+}
View
28 src/mongo/db/fts/fts_enabled.cpp
@@ -0,0 +1,28 @@
+// fts_enabled.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/db/server_parameters.h"
+
+namespace mongo {
+ namespace fts {
+ MONGO_EXPORT_SERVER_PARAMETER( textSearchEnabled, bool, false ); // NOTE(review): `false` is presumably the default value - confirm against server_parameters.h
+ bool isTextSearchEnabled() { // returns the current value of textSearchEnabled
+ return textSearchEnabled;
+ }
+ }
+}
View
25 src/mongo/db/fts/fts_enabled.h
@@ -0,0 +1,25 @@
+// fts_enabled.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+ namespace fts {
+ bool isTextSearchEnabled(); // defined in fts_enabled.cpp; returns the textSearchEnabled server parameter
+ }
+}
View
96 src/mongo/db/fts/fts_index.cpp
@@ -0,0 +1,96 @@
+// fts_index.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/base/init.h"
+#include "mongo/db/client.h"
+#include "mongo/db/fts/fts_enabled.h"
+#include "mongo/db/fts/fts_index.h"
+#include "mongo/db/fts/fts_index_format.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/stringutils.h"
+#include "mongo/util/timer.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ using namespace mongoutils;
+
+ /*
+ * Constructs the index type for a text index. plugin and spec are forwarded
+ * to the IndexType base; the FTSSpec member is built from spec->info.
+ * @param plugin the index plugin for FTS
+ * @param spec the index specification
+ */
+ FTSIndex::FTSIndex( const IndexPlugin* plugin, const IndexSpec* spec )
+ : IndexType( plugin, spec ), _ftsSpec( spec->info ) {
+ }
+ // Delegates key generation for obj to FTSIndexFormat::getKeys, using this index's FTSSpec.
+ void FTSIndex::getKeys( const BSONObj& obj, BSONObjSet& keys) const {
+ FTSIndexFormat::getKeys( _ftsSpec, obj, &keys );
+ }
+
+ /*
+  * Cursor creation is not implemented for text indexes: this always trips
+  * verify(0). The empty pointer is only there to satisfy the return type.
+  */
+ shared_ptr<Cursor> FTSIndex::newCursor( const BSONObj& query,
+                                         const BSONObj& order,
+                                         int numWanted ) const {
+     verify(0);
+     return shared_ptr<Cursor>();
+ }
+
+ // Passes INDEX_NAME to the IndexPlugin base constructor.
+ FTSIndexPlugin::FTSIndexPlugin() : IndexPlugin( INDEX_NAME ) {}
+
+
+ /*
+  * Checks that text search is enabled (for user connections only) and then
+  * lets FTSSpec::fixSpec rewrite the index specification.
+  * @param spec, specification object
+  * @return the specification returned by FTSSpec::fixSpec
+  */
+ BSONObj FTSIndexPlugin::adjustIndexSpec( const BSONObj& spec ) const {
+     // Only complain to users, i.e. clients whose description starts with
+     // "conn". If a text index was created on a primary we still want it to
+     // be built on the secondary as well.
+     StringData clientDesc = cc().desc();
+     const bool isUserConnection = ( clientDesc.find( "conn" ) == 0 );
+     if ( isUserConnection ) {
+         massert( 16633, "text search not enabled", isTextSearchEnabled() );
+     }
+
+     return FTSSpec::fixSpec( spec );
+ }
+
+ /*
+ * Generates an FTSIndex with a spec and this plugin
+ * @param spec, specification to be used
+ * @return a heap-allocated FTSIndex (created with new); who deletes it is
+ * not visible in this file
+ */
+ IndexType* FTSIndexPlugin::generate( const IndexSpec* spec ) const {
+ return new FTSIndex( this, spec );
+ }
+
+ // Plugin instance, allocated once by the initializer below and never deleted in this file.
+ FTSIndexPlugin* ftsPlugin;
+ MONGO_INITIALIZER(FTSIndexPlugin)(InitializerContext* context) {
+ ftsPlugin = new FTSIndexPlugin();
+ return Status::OK();
+ }
+
+ }
+
+}
View
67 src/mongo/db/fts/fts_index.h
@@ -0,0 +1,67 @@
+// fts_index.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/db/index.h"
+
+namespace mongo {
+
+ namespace fts {
+ // IndexType implementation for text indexes; holds the FTSSpec built from the index spec.
+ class FTSIndex : public IndexType {
+ public:
+
+ // index constructor, called when user enters ensureIndex command with fts flag
+ FTSIndex(const IndexPlugin *plugin, const IndexSpec* spec);
+ // Fills keys with the index keys for obj (forwards to FTSIndexFormat::getKeys).
+ void getKeys( const BSONObj& obj, BSONObjSet& keys) const;
+
+ /* newCursor is pure Virtual in IndexType so it has to be redefined in FTSIndex.
+ * The definition in fts_index.cpp always trips verify(0). */
+ shared_ptr<Cursor> newCursor( const BSONObj& query,
+ const BSONObj& order,
+ int numWanted ) const;
+ // Read-only access to the parsed text-index specification.
+ const FTSSpec& getFtsSpec() const { return _ftsSpec; }
+
+ private:
+ // Constructed from spec->info in the constructor.
+ FTSSpec _ftsSpec;
+ };
+
+ // IndexPlugin for text indexes; constructed with INDEX_NAME in fts_index.cpp.
+ class FTSIndexPlugin : public IndexPlugin {
+ public:
+ FTSIndexPlugin();
+ // Returns a new FTSIndex bound to this plugin and spec.
+ IndexType* generate( const IndexSpec* spec ) const;
+ // Checks that text search is enabled for user connections, then returns FTSSpec::fixSpec( spec ).
+ BSONObj adjustIndexSpec( const BSONObj& spec ) const;
+
+ };
+
+ } //namespace fts
+} //namespace mongo
View
119 src/mongo/db/fts/fts_index_format.cpp
@@ -0,0 +1,119 @@
+// fts_index_format.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/base/init.h"
+#include "mongo/db/fts/fts_index_format.h"
+
+namespace mongo {
+
+ namespace fts {
+ // File-local null placeholders, filled in by the FTSIndexFormat initializer below.
+ namespace {
+ BSONObj nullObj;
+ BSONElement nullElt;
+ }
+ // Builds nullObj as an object with one null-valued field named "" and points nullElt at that field.
+ MONGO_INITIALIZER( FTSIndexFormat )( InitializerContext* context ) {
+ BSONObjBuilder b;
+ b.appendNull( "" );
+ nullObj = b.obj();
+ nullElt = nullObj.firstElement();
+ return Status::OK();
+ }
+
+ /*
+  * Builds the index keys for one document: one key per scored term, laid out
+  * as [extra-before fields..., term, weight, extra-after fields...].
+  * @param spec, the text index specification
+  * @param obj, the document being indexed
+  * @param keys, output set receiving the generated keys
+  */
+ void FTSIndexFormat::getKeys( const FTSSpec& spec,
+                               const BSONObj& obj,
+                               BSONObjSet* keys ) {
+
+     // Values of the non-text key fields; a missing field is indexed as null.
+     // extraSize accumulates their sizes for the buffer-size guess below.
+     vector<BSONElement> leading;
+     vector<BSONElement> trailing;
+     int extraSize = 0;
+
+     for ( unsigned n = 0; n < spec.numExtraBefore(); ++n ) {
+         BSONElement elt = obj.getFieldDotted( spec.extraBefore(n) );
+         if ( elt.eoo() )
+             elt = nullElt;
+         leading.push_back( elt );
+         extraSize += elt.size();
+     }
+     for ( unsigned n = 0; n < spec.numExtraAfter(); ++n ) {
+         BSONElement elt = obj.getFieldDotted( spec.extraAfter(n) );
+         if ( elt.eoo() )
+             elt = nullElt;
+         trailing.push_back( elt );
+         extraSize += elt.size();
+     }
+
+     TermFrequencyMap scores;
+     spec.scoreDocument( obj, &scores );
+
+     // create index keys from raw scores, only 1 per string
+     TermFrequencyMap::const_iterator it = scores.begin();
+     while ( it != scores.end() ) {
+
+         const string& term = it->first;
+         double weight = it->second;
+
+         // guess the total size of the btree entry based on the size of the weight, term tuple
+         int guess =
+             5 /* bson overhead */ +
+             10 /* weight */ +
+             8 /* term overhead */ +
+             term.size() +
+             extraSize;
+
+         BSONObjBuilder keyBuilder(guess); // guess is the builder's initial size
+         for ( unsigned k = 0; k < leading.size(); ++k )
+             keyBuilder.appendAs( leading[k], "" );
+         _appendIndexKey( keyBuilder, weight, term );
+         for ( unsigned k = 0; k < trailing.size(); ++k )
+             keyBuilder.appendAs( trailing[k], "" );
+         BSONObj key = keyBuilder.obj();
+
+         // the guess must never underestimate the real key size
+         verify( guess >= key.objsize() );
+
+         keys->insert( key );
+
+         ++it;
+     }
+ }
+
+ /*
+  * Builds a key made of the indexPrefix values (field names stripped),
+  * followed by the term and its weight.
+  */
+ BSONObj FTSIndexFormat::getIndexKey( double weight,
+                                      const string& term,
+                                      const BSONObj& indexPrefix ) {
+     BSONObjBuilder keyBuilder;
+
+     // copy each prefix element under an empty field name
+     for ( BSONObjIterator prefixIt( indexPrefix ); prefixIt.more(); ) {
+         keyBuilder.appendAs( prefixIt.next(), "" );
+     }
+
+     _appendIndexKey( keyBuilder, weight, term );
+     return keyBuilder.obj();
+ }
+ // Appends the term and then its weight to b, both under an empty field name.
+ void FTSIndexFormat::_appendIndexKey( BSONObjBuilder& b, double weight, const string& term ) {
+ verify( weight >= 0 && weight <= MAX_WEIGHT ); // MAX_WEIGHT is declared outside this file
+ b.append( "", term );
+ b.append( "", weight );
+ }
+ }
+}
View
55 src/mongo/db/fts/fts_index_format.h
@@ -0,0 +1,55 @@
+// fts_index_format.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "mongo/db/fts/fts_spec.h"
+
+namespace mongo {
+
+ namespace fts {
+ // Static helpers that define how text-index keys are laid out.
+ class FTSIndexFormat {
+ public:
+ // Inserts into keys one index key per scored term of document.
+ static void getKeys( const FTSSpec& spec,
+ const BSONObj& document,
+ BSONObjSet* keys );
+
+ /*
+ * Helper method to get return entry from the FTSIndex as a BSONObj
+ * @param weight, the weight of the term in the entry
+ * @param term, the string term in the entry
+ * @param indexPrefix, the fields that go in the index first
+ */
+ static BSONObj getIndexKey( double weight,
+ const string& term,
+ const BSONObj& indexPrefix );
+
+ private:
+ /*
+ * Appends the (term, weight) pair of an index entry to an existing builder
+ * @param b, reference to the BSONOBjBuilder
+ * @param weight, the weight of the term in the entry
+ * @param term, the string term in the entry
+ */
+ static void _appendIndexKey( BSONObjBuilder& b, double weight, const string& term );
+ };
+
+ }
+}
View
96 src/mongo/db/fts/fts_index_format_test.cpp
@@ -0,0 +1,96 @@
+// fts_index_format_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_index_format.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ // A two-word string in the only text field yields two keys, each with two
+ // fields, the first of which is a string.
+ TEST( FTSIndexFormat, Simple1 ) {
+     FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) ) ) );
+
+     BSONObjSet keys;
+     FTSIndexFormat::getKeys( spec, BSON( "data" << "cat sat" ), &keys );
+     ASSERT_EQUALS( 2U, keys.size() );
+
+     BSONObjSet::const_iterator it = keys.begin();
+     while ( it != keys.end() ) {
+         BSONObj key = *it;
+         ASSERT_EQUALS( 2, key.nFields() );
+         ASSERT_EQUALS( String, key.firstElement().type() );
+         ++it;
+     }
+ }
+
+ // With an extra key field after the text field, the key is
+ // [term, weight, extra value].
+ TEST( FTSIndexFormat, ExtraBack1 ) {
+     FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" <<
+                                                        "x" << 1 ) ) ) );
+
+     BSONObjSet keys;
+     FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys );
+     ASSERT_EQUALS( 1U, keys.size() );
+
+     BSONObj key = *(keys.begin());
+     ASSERT_EQUALS( 3, key.nFields() );
+
+     // pull the three fields in key order before checking them
+     BSONObjIterator fields( key );
+     BSONElement term = fields.next();
+     BSONElement weight = fields.next();
+     BSONElement extra = fields.next();
+     ASSERT_EQUALS( StringData("cat"), term.valuestr() );
+     ASSERT( weight.numberDouble() > 0 );
+     ASSERT_EQUALS( 5, extra.numberInt() );
+ }
+
+ /*
+ TEST( FTSIndexFormat, ExtraBackArray1 ) {
+ FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" <<
+ "x.y" << 1 ) ) ) );
+ BSONObjSet keys;
+ FTSIndexFormat::getKeys( spec,
+ BSON( "data" << "cat" <<
+ "x" << BSON_ARRAY( BSON( "y" << 1 ) <<
+ BSON( "y" << 2 ) ) ),
+ &keys );
+
+ ASSERT_EQUALS( 1U, keys.size() );
+ BSONObj key = *(keys.begin());
+ log() << "e: " << key << endl;
+ ASSERT_EQUALS( 3, key.nFields() );
+ BSONObjIterator i( key );
+ ASSERT_EQUALS( StringData("cat"), i.next().valuestr() );
+ ASSERT( i.next().numberDouble() > 0 );
+ ASSERT_EQUALS( 5, i.next().numberInt() );
+ }
+ */
+
+ // With an extra key field before the text field, the key is
+ // [extra value, term, weight].
+ TEST( FTSIndexFormat, ExtraFront1 ) {
+     FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << 1 <<
+                                                        "data" << "text" ) ) ) );
+
+     BSONObjSet keys;
+     FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys );
+     ASSERT_EQUALS( 1U, keys.size() );
+
+     BSONObj key = *(keys.begin());
+     ASSERT_EQUALS( 3, key.nFields() );
+
+     // pull the three fields in key order before checking them
+     BSONObjIterator fields( key );
+     BSONElement extra = fields.next();
+     BSONElement term = fields.next();
+     BSONElement weight = fields.next();
+     ASSERT_EQUALS( 5, extra.numberInt() );
+     ASSERT_EQUALS( StringData("cat"), term.valuestr() );
+     ASSERT( weight.numberDouble() > 0 );
+ }
+
+
+ }
+}
View
247 src/mongo/db/fts/fts_matcher.cpp
@@ -0,0 +1,247 @@
+// fts_matcher.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_matcher.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ // Stores copies of query and spec and builds the stemmer from the query's language.
+ FTSMatcher::FTSMatcher( const FTSQuery& query, const FTSSpec& spec )
+ : _query( query ),
+ _spec( spec ),
+ _stemmer( query.getLanguage() ){
+ }
+
+ /*
+  * Reports whether obj contains a term the query has flagged for exclusion,
+  * e.g. "world" for the search "hello -world".
+  * @param obj, object to be checked
+  * @return true if any negated term is found, false otherwise
+  */
+ bool FTSMatcher::hasNegativeTerm(const BSONObj& obj ) const {
+     // nothing is negated, so nothing can be found
+     if ( _query.getNegatedTerms().size() == 0 )
+         return false;
+
+     // wildcard index: walk the whole document
+     if ( _spec.wildcard() )
+         return _hasNegativeTerm_recurse(obj);
+
+     // otherwise only look at the fields that have weights defined
+     const Weights& weights = _spec.weights();
+     for ( Weights::const_iterator w = weights.begin(); w != weights.end(); ++w ) {
+         // leftOverName is handed to getFieldDottedOrArray and reused for
+         // the per-member lookup when the result is an array
+         const char * leftOverName = w->first.c_str();
+         BSONElement e = obj.getFieldDottedOrArray(leftOverName);
+
+         if ( e.type() == String ) {
+             if ( _hasNegativeTerm_string( e.String() ) )
+                 return true;
+             continue;
+         }
+
+         if ( e.type() != Array )
+             continue;
+
+         BSONObjIterator members( e.Obj() );
+         while ( members.more() ) {
+             BSONElement x = members.next();
+             if ( leftOverName[0] && x.isABSONObj() )
+                 x = x.Obj().getFieldDotted( leftOverName );
+             if ( x.type() == String && _hasNegativeTerm_string( x.String() ) )
+                 return true;
+         }
+     }
+
+     return false;
+ }
+
+ /*
+  * Wildcard-index variant of hasNegativeTerm: scans every string in obj,
+  * descending into nested objects/arrays. Top-level fields named like the
+  * language override field are skipped.
+  */
+ bool FTSMatcher::_hasNegativeTerm_recurse(const BSONObj& obj ) const {
+     for ( BSONObjIterator fields( obj ); fields.more(); ) {
+         BSONElement field = fields.next();
+
+         if ( _spec.languageOverrideField() == field.fieldName())
+             continue;
+
+         if ( field.type() == String ) {
+             if ( _hasNegativeTerm_string( field.String() ) )
+                 return true;
+             continue;
+         }
+
+         if ( !field.isABSONObj() )
+             continue;
+
+         // Direct children are inspected here (without the language override
+         // check); anything nested deeper goes through the recursive call.
+         for ( BSONObjIterator children( field.Obj() ); children.more(); ) {
+             BSONElement child = children.next();
+             if ( child.type() == String ) {
+                 if ( _hasNegativeTerm_string( child.String() ) )
+                     return true;
+             }
+             else if ( child.isABSONObj() ) {
+                 if ( _hasNegativeTerm_recurse( child.Obj() ) )
+                     return true;
+             }
+         }
+     }
+     return false;
+ }
+
+ /*
+  * Tokenizes raw and checks each stemmed, lower-cased text token against
+  * the query's negated terms.
+  * @param raw, the raw string to be tokenized
+  * @return true as soon as one token is a negated term
+  */
+ bool FTSMatcher::_hasNegativeTerm_string( const string& raw ) const {
+
+     Tokenizer tokens( _query.getLanguage(), raw );
+     while ( tokens.more() ) {
+         Token t = tokens.next();
+         // only text tokens can be terms
+         if ( t.type == Token::TEXT ) {
+             string word = tolowerString( _stemmer.stem( t.data ) );
+             if ( _query.getNegatedTerms().count( word ) > 0 )
+                 return true;
+         }
+     }
+     return false;
+ }
+
+
+ /*
+  * @return true if obj contains every phrase of the query and none of its
+  * negated phrases
+  */
+ bool FTSMatcher::phrasesMatch( const BSONObj& obj ) const {
+     unsigned idx = 0;
+
+     // every required phrase must be present
+     while ( idx < _query.getPhr().size() ) {
+         if ( !phraseMatch( _query.getPhr()[idx], obj ) )
+             return false;
+         ++idx;
+     }
+
+     // no negated phrase may be present
+     idx = 0;
+     while ( idx < _query.getNegatedPhr().size() ) {
+         if ( phraseMatch( _query.getNegatedPhr()[idx], obj ) )
+             return false;
+         ++idx;
+     }
+
+     return true;
+ }
+
+
+ /**
+  * Checks whether phrase occurs in one of the indexed string fields of obj.
+  * @param phrase, the string to be matched
+  * @param obj, document in the collection to match against
+  * @return true if some indexed string contains the phrase, false otherwise
+  */
+ bool FTSMatcher::phraseMatch( const string& phrase, const BSONObj& obj ) const {
+
+     // case where everything is indexed (all fields)
+     if ( _spec.wildcard() )
+         return _phraseRecurse( phrase, obj );
+
+     const Weights& weights = _spec.weights();
+     for ( Weights::const_iterator w = weights.begin(); w != weights.end(); ++w ) {
+
+         // figure out what the indexed field is.. ie. is it "field" or "field.subfield" etc.
+         const char * leftOverName = w->first.c_str();
+         BSONElement e = obj.getFieldDottedOrArray(leftOverName);
+
+         if ( e.type() == String ) {
+             if ( _phraseMatches( phrase, e.String() ) )
+                 return true;
+             continue;
+         }
+
+         if ( e.type() != Array )
+             continue;
+
+         // array: test each member, resolving the rest of the path on sub-objects
+         BSONObjIterator members( e.Obj() );
+         while ( members.more() ) {
+             BSONElement x = members.next();
+
+             if ( leftOverName[0] && x.isABSONObj() )
+                 x = x.Obj().getFieldDotted( leftOverName );
+
+             if ( x.type() == String && _phraseMatches( phrase, x.String() ) )
+                 return true;
+         }
+     }
+
+     return false;
+ }
+
+
+ /*
+  * Wildcard-index variant of phraseMatch: tests every string in obj,
+  * descending into nested objects/arrays. Top-level fields named like the
+  * language override field are skipped.
+  * @param phrase, string to be matched
+  * @param obj, object to matched against
+  */
+ bool FTSMatcher::_phraseRecurse( const string& phrase, const BSONObj& obj ) const {
+     for ( BSONObjIterator fields( obj ); fields.more(); ) {
+         BSONElement field = fields.next();
+
+         if ( _spec.languageOverrideField() == field.fieldName() )
+             continue;
+
+         if ( field.type() == String ) {
+             if ( _phraseMatches( phrase, field.String() ) )
+                 return true;
+             continue;
+         }
+
+         if ( !field.isABSONObj() )
+             continue;
+
+         // Direct children are inspected here (without the language override
+         // check); anything nested deeper goes through the recursive call.
+         for ( BSONObjIterator children( field.Obj() ); children.more(); ) {
+             BSONElement child = children.next();
+
+             if ( child.type() == String ) {
+                 if ( _phraseMatches( phrase, child.String() ) )
+                     return true;
+             }
+             else if ( child.isABSONObj() ) {
+                 if ( _phraseRecurse( phrase, child.Obj() ) )
+                     return true;
+             }
+         }
+     }
+
+     return false;
+ }
+
+
+ /*
+  * Looks for phrase anywhere inside haystack, ignoring case
+  * @param phrase, phrase to match
+  * @param haystack, raw string to search in
+  * @return true if haystack contains phrase
+  */
+ bool FTSMatcher::_phraseMatches( const string& phrase, const string& haystack ) const {
+#ifdef _WIN32
+     // windows doesn't have strcasestr
+     // for now, doing something very slow, but correct:
+     // run makeLower on copies of both strings, then do a plain substring search
+     string p = phrase;
+     string h = haystack;
+     makeLower( &p );
+     makeLower( &h );
+     // compare against NULL explicitly; ordering a pointer against 0 is ill-formed
+     return strstr( h.c_str(), p.c_str() ) != NULL;
+#else
+     return strcasestr( haystack.c_str(), phrase.c_str() ) != NULL;
+#endif
+ }
+
+
+ }
+}
View
67 src/mongo/db/fts/fts_matcher.h
@@ -0,0 +1,67 @@
+// fts_matcher.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/tokenizer.h"
+
+namespace mongo {
+
+ namespace fts {
+ // Post-filter for text search: checks documents against a query's negated terms and phrases.
+ class FTSMatcher {
+ public:
+ FTSMatcher( const FTSQuery& query, const FTSSpec& spec );
+
+ /**
+ * @return true if obj has a negated term
+ */
+ bool hasNegativeTerm(const BSONObj& obj ) const;
+
+ /**
+ * @return true if obj is ok by all phrases
+ * so all full phrases and no negated
+ */
+ bool phrasesMatch( const BSONObj& obj ) const;
+ // @return true if phrase occurs in an indexed string field of obj
+ bool phraseMatch( const string& phrase, const BSONObj& obj ) const;
+ // @return true if obj has no negated term and satisfies all phrase constraints
+ bool matchesNonTerm( const BSONObj& obj ) const {
+ return !hasNegativeTerm( obj ) && phrasesMatch( obj );
+ }
+
+ private:
+ bool _hasNegativeTerm_recurse(const BSONObj& obj ) const;
+
+ /**
+ * @return true if raw has a negated term
+ */
+ bool _hasNegativeTerm_string( const string& raw ) const;
+ // Wildcard-index walk over obj, and the substring test applied to each string.
+ bool _phraseRecurse( const string& phrase, const BSONObj& obj ) const;
+ bool _phraseMatches( const string& phrase, const string& haystack ) const;
+ // Held by value; the constructor copies query and spec.
+ FTSQuery _query;
+ FTSSpec _spec;
+ Stemmer _stemmer;
+ };
+
+ }
+}
View
63 src/mongo/db/fts/fts_matcher_test.cpp
@@ -0,0 +1,63 @@
+// fts_matcher_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_matcher.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+ namespace fts {
+
+ // A negated term nested inside a sub-object is found under a wildcard index.
+ TEST( FTSMatcher, NegWild1 ) {
+     FTSQuery query;
+     query.parse( "foo -bar", "english" );
+
+     FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "fts" ) ) ) );
+     FTSMatcher matcher( query, spec );
+
+     // NOTE(review): the two assertions are identical; the second was
+     // presumably meant to cover a different document shape - confirm.
+     ASSERT( matcher.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) );
+     ASSERT( matcher.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) );
+ }
+
+ // Phrase matching under a wildcard index: exact substring hits match,
+ // near misses and interrupted phrases do not.
+ TEST( FTSMatcher, Phrase1 ) {
+     FTSQuery query;
+     query.parse( "foo \"table top\"", "english" );
+
+     FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "fts" ) ) ) );
+     FTSMatcher matcher( query, spec );
+
+     // single-phrase checks
+     ASSERT( matcher.phraseMatch( "table top", BSON( "x" << "table top" ) ) );
+     ASSERT( matcher.phraseMatch( "table top", BSON( "x" << " asd table top asd" ) ) );
+     ASSERT( !matcher.phraseMatch( "table top", BSON( "x" << "tablz top" ) ) );
+     ASSERT( !matcher.phraseMatch( "table top", BSON( "x" << " asd tablz top asd" ) ) );
+
+     // checks against the phrases parsed from the query
+     ASSERT( matcher.phrasesMatch( BSON( "x" << "table top" ) ) );
+     ASSERT( !matcher.phrasesMatch( BSON( "x" << "table a top" ) ) );
+
+ }
+
+ // Phrase matching on a named field whose value is an array of strings.
+ TEST( FTSMatcher, Phrase2 ) {
+     FTSQuery query;
+     query.parse( "foo \"table top\"", "english" );
+
+     FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "fts" ) ) ) );
+     FTSMatcher matcher( query, spec );
+
+     ASSERT( matcher.phraseMatch( "table top",
+                                  BSON( "x" << BSON_ARRAY( "table top" ) ) ) );
+ }
+
+ }
+}
View
173 src/mongo/db/fts/fts_query.cpp
@@ -0,0 +1,173 @@
+// fts_query.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ using namespace mongoutils;
+
+ /**
+  * Parses a raw search string into terms, negated terms, phrases and negated phrases.
+  *
+  * Rules implemented below:
+  *  - every text token is handed to _addTerm(), except the words of a negated phrase;
+  *  - a '-' that is preceded by whitespace and is NOT inside a quoted phrase negates
+  *    the next term, or the next phrase if a '"' follows;
+  *  - the text tokens between two '"' are joined with single spaces, lower-cased and
+  *    stored as a phrase (or negated phrase);
+  *  - a phrase whose closing '"' is missing is not stored.
+  *
+  * @param query     the search string to parse; also remembered as-is in _search
+  * @param language  passed to the stop word list, the stemmer and the tokenizer
+  * @return Status::OK(); a token that is neither TEXT nor DELIMITER aborts the process
+  */
+ Status FTSQuery::parse(const string& query, const string& language) {
+     _search = query;
+     _language = language;
+
+     const StopWords* stopWords = StopWords::getStopWords( language );
+     Stemmer stemmer( language );
+
+     bool inNegation = false;
+     bool inPhrase = false;
+
+     // accumulates the words of the phrase currently being read
+     str::stream phrase;
+
+     Tokenizer i( _language, query );
+     while ( i.more() ) {
+         Token t = i.next();
+
+         if ( t.type == Token::TEXT ) {
+             string s = t.data.toString();
+
+             if ( inPhrase ) {
+                 if ( phrase.ss.len() > 0 )
+                     phrase << ' ';
+                 phrase << s;
+             }
+
+             if ( inPhrase && inNegation ) {
+                 // words of a negated phrase are not search terms
+             }
+             else {
+                 _addTerm( stopWords, stemmer, s, inNegation );
+             }
+
+             // outside of a phrase a negation only covers the single next term
+             if ( inNegation && !inPhrase )
+                 inNegation = false;
+         }
+         else if ( t.type == Token::DELIMITER ) {
+             char c = t.data[0];
+             if ( c == '-' ) {
+                 // Phrases can be negated and terms outside of phrases can be negated,
+                 // but a '-' inside a phrase must not start a negation: it would file
+                 // the whole enclosing phrase under the negated phrases at the closing
+                 // quote and drop the remaining words of the phrase as terms.
+                 if ( !inPhrase && t.previousWhiteSpace )
+                     inNegation = true;
+             }
+             else if ( c == '"' ) {
+                 if ( inPhrase ) {
+                     // end of a phrase
+                     if ( inNegation )
+                         _negatedPhrases.push_back( tolowerString( phrase ) );
+                     else
+                         _phrases.push_back( tolowerString( phrase ) );
+                     inNegation = false;
+                     inPhrase = false;
+                 }
+                 else {
+                     // start of a phrase
+                     inPhrase = true;
+                     phrase.ss.reset();
+                 }
+             }
+         }
+         else {
+             // unknown token type
+             abort();
+         }
+     }
+
+     return Status::OK();
+ }
+
+ /**
+  * Normalizes one word and records it as a search term.
+  *
+  * The word is lower-cased; if the result is a stop word it is ignored. Otherwise
+  * it is stemmed and stored in _negatedTerms (when 'negated') or appended to _terms.
+  */
+ void FTSQuery::_addTerm( const StopWords* sw, Stemmer& stemmer, const string& term, bool negated ) {
+     string lowered = tolowerString( term );
+     if ( sw->isStopWord( lowered ) )
+         return;
+
+     string stemmed = stemmer.stem( lowered );
+     if ( !negated ) {
+         _terms.push_back( stemmed );
+         return;
+     }
+     _negatedTerms.insert( stemmed );
+ }
+
+ namespace {
+     /**
+      * Streams the elements of 's' into 'ss' in the set's iteration order,
+      * writing 'sep' between consecutive elements (nothing for an empty set).
+      */
+     void _debugHelp( stringstream& ss, const set<string>& s, const string& sep ) {
+         set<string>::const_iterator it = s.begin();
+         if ( it == s.end() )
+             return;
+
+         ss << *it;
+         for ( ++it; it != s.end(); ++it )
+             ss << sep << *it;
+     }
+
+     /**
+      * Same output as the set overload: the vector's elements are first copied
+      * into a std::set, so they come out ordered and without duplicates.
+      */
+     void _debugHelp( stringstream& ss, const vector<string>& v, const string& sep ) {
+         set<string> ordered;
+         ordered.insert( v.begin(), v.end() );
+         _debugHelp( ss, ordered, sep );
+     }
+
+     /**
+      * Same output as the set overload: the elements are first copied into a
+      * std::set, so they are written in that set's order.
+      */
+     void _debugHelp( stringstream& ss, const unordered_set<string>& v, const string& sep ) {
+         set<string> ordered;
+         ordered.insert( v.begin(), v.end() );
+         _debugHelp( ss, ordered, sep );
+     }
+
+ }
+
+ /**
+  * Multi-line, human readable dump of the parsed query: a "FTSQuery" header
+  * followed by one labelled, comma separated line each for the terms, negated
+  * terms, phrases and negated phrases.
+  */
+ string FTSQuery::toString() const {
+     const string separator = ", ";
+     stringstream out;
+
+     out << "FTSQuery\n" << " terms: ";
+     _debugHelp( out, getTerms(), separator );
+
+     out << "\n" << " negated terms: ";
+     _debugHelp( out, getNegatedTerms(), separator );
+
+     out << "\n" << " phrases: ";
+     _debugHelp( out, getPhr(), separator );
+
+     out << "\n" << " negated phrases: ";
+     _debugHelp( out, getNegatedPhr(), separator );
+
+     out << "\n";
+     return out.str();
+ }
+
+ /**
+  * Compact single-line dump of the parsed query. The four groups (terms, negated
+  * terms, phrases, negated phrases) are separated by "||" and the entries inside
+  * a group by "|".
+  */
+ string FTSQuery::debugString() const {
+     const string entrySep = "|";
+     const char* const groupSep = "||";
+     stringstream out;
+
+     _debugHelp( out, getTerms(), entrySep );
+     out << groupSep;
+     _debugHelp( out, getNegatedTerms(), entrySep );
+     out << groupSep;
+     _debugHelp( out, getPhr(), entrySep );
+     out << groupSep;
+     _debugHelp( out, getNegatedPhr(), entrySep );
+
+     return out.str();
+ }
+ }
+}
View
80 src/mongo/db/fts/fts_query.h
@@ -0,0 +1,80 @@
+// fts_query.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "mongo/base/status.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/platform/unordered_set.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ using std::string;
+ using std::vector;
+ using std::set;
+
+ /**
+  * Holds a text search string together with the pieces it was parsed into:
+  * terms, negated terms, phrases and negated phrases.
+  */
+ class FTSQuery {
+
+ public:
+     /**
+      * Parses 'query' using 'language' and fills in the term and phrase containers.
+      */
+     Status parse(const string& query, const string& language);
+
+     /** @return the non-negated terms */
+     const vector<string>& getTerms() const { return _terms; }
+     /** @return the negated terms */
+     const unordered_set<string>& getNegatedTerms() const { return _negatedTerms; }
+
+     /** @return the non-negated phrases */
+     const vector<string>& getPhr() const { return _phrases; }
+     /** @return the negated phrases */
+     const vector<string>& getNegatedPhr() const { return _negatedPhrases; }
+
+     /**
+      * @return true if the query contains anything besides plain terms, i.e. at
+      *         least one negated term, phrase or negated phrase
+      */
+     bool hasNonTermPieces() const {
+         return !_negatedTerms.empty()
+             || !_phrases.empty()
+             || !_negatedPhrases.empty();
+     }
+
+     /** @return a copy of the stored search string */
+     string getSearch() const { return _search; }
+     /** @return a copy of the stored language name */
+     string getLanguage() const { return _language; }
+
+     /** Multi-line, human readable description of the parsed query. */
+     string toString() const;
+
+     /** Compact single-line description of the parsed query. */
+     string debugString() const;
+
+ protected:
+     string _search;
+     string _language;
+     vector<string> _terms;
+     unordered_set<string> _negatedTerms;
+     vector<string> _phrases;
+     vector<string> _negatedPhrases;
+
+ private:
+     /**
+      * Records 'term' as a term, or as a negated term when 'negated' is set.
+      */
+     void _addTerm( const StopWords* sw, Stemmer& stemmer, const string& term, bool negated );
+ };
+
+ }
+}
+
View
73 src/mongo/db/fts/fts_query_test.cpp
@@ -0,0 +1,73 @@
+// fts_query_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+ namespace fts {
+
+ TEST( FTSQuery, Basic1 ) {
+ FTSQuery q;
+ ASSERT( q.parse( "this is fun", "english" ).isOK() );
+
+ ASSERT_EQUALS( 1U, q.getTerms().size() );
+ ASSERT_EQUALS( "fun", q.getTerms()[0] );
+ ASSERT_EQUALS( 0U, q.getNegatedTerms().size() );
+ ASSERT_EQUALS( 0U, q.getPhr().size() );
+ ASSERT_EQUALS