Permalink
Browse files

SERVER-380: Add snowball stemmer

  • Loading branch information...
erh committed Dec 25, 2012
1 parent 13a61cb commit d2df300721805ace411b5d1a87cb4bf6d8a51ff3
Showing with 28,945 additions and 2 deletions.
  1. +1 −0 SConstruct
  2. +37 −1 distsrc/THIRD-PARTY-NOTICES
  3. +7 −1 src/mongo/SConscript
  4. +2 −0 src/third_party/SConscript
  5. +72 −0 src/third_party/libstemmer_c/MANIFEST
  6. +9 −0 src/third_party/libstemmer_c/Makefile
  7. +125 −0 src/third_party/libstemmer_c/README
  8. +46 −0 src/third_party/libstemmer_c/SConscript
  9. +209 −0 src/third_party/libstemmer_c/examples/stemwords.c
  10. +79 −0 src/third_party/libstemmer_c/include/libstemmer.h
  11. +95 −0 src/third_party/libstemmer_c/libstemmer/libstemmer.c
  12. +95 −0 src/third_party/libstemmer_c/libstemmer/libstemmer_c.in
  13. +95 −0 src/third_party/libstemmer_c/libstemmer/libstemmer_utf8.c
  14. +190 −0 src/third_party/libstemmer_c/libstemmer/modules.h
  15. +50 −0 src/third_party/libstemmer_c/libstemmer/modules.txt
  16. +121 −0 src/third_party/libstemmer_c/libstemmer/modules_utf8.h
  17. +49 −0 src/third_party/libstemmer_c/libstemmer/modules_utf8.txt
  18. +82 −0 src/third_party/libstemmer_c/mkinc.mak
  19. +52 −0 src/third_party/libstemmer_c/mkinc_utf8.mak
  20. +66 −0 src/third_party/libstemmer_c/runtime/api.c
  21. +26 −0 src/third_party/libstemmer_c/runtime/api.h
  22. +58 −0 src/third_party/libstemmer_c/runtime/header.h
  23. +478 −0 src/third_party/libstemmer_c/runtime/utilities.c
  24. +337 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_danish.c
  25. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_danish.h
  26. +624 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c
  27. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h
  28. +1,117 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_english.c
  29. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_english.h
  30. +762 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c
  31. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h
  32. +1,246 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_french.c
  33. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_french.h
  34. +521 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_german.c
  35. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_german.h
  36. +1,230 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c
  37. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h
  38. +1,065 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_italian.c
  39. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_italian.h
  40. +297 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c
  41. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h
  42. +749 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_porter.c
  43. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_porter.h
  44. +1,017 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c
  45. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h
  46. +1,093 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c
  47. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h
  48. +307 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c
  49. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h
  50. +998 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c
  51. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h
  52. +700 −0 src/third_party/libstemmer_c/src_c/stem_KOI8_R_russian.c
  53. +16 −0 src/third_party/libstemmer_c/src_c/stem_KOI8_R_russian.h
  54. +339 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_danish.c
  55. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_danish.h
  56. +634 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_dutch.c
  57. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_dutch.h
  58. +1,125 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_english.c
  59. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_english.h
  60. +768 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_finnish.c
  61. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_finnish.h
  62. +1,256 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_french.c
  63. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_french.h
  64. +527 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_german.c
  65. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_german.h
  66. +1,234 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_hungarian.c
  67. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_hungarian.h
  68. +1,073 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_italian.c
  69. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_italian.h
  70. +299 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_norwegian.c
  71. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_norwegian.h
  72. +755 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_porter.c
  73. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_porter.h
  74. +1,023 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_portuguese.c
  75. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_portuguese.h
  76. +1,004 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_romanian.c
  77. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_romanian.h
  78. +694 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_russian.c
  79. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_russian.h
  80. +1,097 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_spanish.c
  81. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_spanish.h
  82. +309 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_swedish.c
  83. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_swedish.h
  84. +2,205 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_turkish.c
  85. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_turkish.h
View
@@ -780,6 +780,7 @@ if not use_system_version_of_library("boost"):
CPPDEFINES=['BOOST_ALL_NO_LIB'])
env.Prepend(CPPPATH=['$BUILD_DIR/third_party/s2'])
env.Prepend(CPPPATH=['$BUILD_DIR/third_party/libstemmer_c/include'])
env.Append( CPPPATH=['$EXTRACPPPATH'],
LIBPATH=['$EXTRALIBPATH'] )
@@ -373,4 +373,40 @@ For applicable files:
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
End
10) License notice for Snowball
Copyright (c) 2001, Dr Martin Porter
All rights reserved.
THE "BSD" LICENCE
-----------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the name of Google
Inc. nor the names of their contributors may be used to endorse or
promote products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
End
View
@@ -16,6 +16,7 @@ Import("darwin windows solaris linux nix")
env.SConscript(['base/SConscript',
'db/auth/SConscript',
'db/fts/SConscript',
'db/ops/SConscript',
'platform/SConscript',
's/SConscript',
@@ -464,7 +465,11 @@ mongosLibraryFiles = [
"s/version_manager.cpp",
]
env.Library( "mongoscore" , mongosLibraryFiles, LIBDEPS=['db/auth/authmongos'] )
env.Library( "mongoscore",
mongosLibraryFiles,
LIBDEPS=['db/auth/authmongos',
'db/fts/ftsmongos'
] )
env.CppUnitTest( "balancer_policy_test" , [ "s/balancer_policy_tests.cpp" ] ,
LIBDEPS=["mongoscore", "coreshard", "mongocommon","coreserver","coredb","dbcmdline","mongodandmongos"] ,
@@ -532,6 +537,7 @@ env.CppUnitTest("geoparser_test", [ "db/geo/geoparser_test.cpp" ], LIBDEPS = ["g
env.StaticLibrary("serveronly", serverOnlyFiles,
LIBDEPS=["coreshard",
"db/auth/authmongod",
"db/fts/ftsmongod",
"dbcmdline",
"defaultversion",
"geoparser",
@@ -59,3 +59,5 @@ else:
env.SConscript('gperftools-2.0/SConscript')
env.StaticLibrary('shim_allocator', 'shim_allocator.cpp',
LIBDEPS=['gperftools-2.0/tcmalloc_minimal'])
env.SConscript('libstemmer_c/SConscript')
@@ -0,0 +1,72 @@
README
src_c/stem_ISO_8859_1_danish.c
src_c/stem_ISO_8859_1_danish.h
src_c/stem_ISO_8859_1_dutch.c
src_c/stem_ISO_8859_1_dutch.h
src_c/stem_ISO_8859_1_english.c
src_c/stem_ISO_8859_1_english.h
src_c/stem_ISO_8859_1_finnish.c
src_c/stem_ISO_8859_1_finnish.h
src_c/stem_ISO_8859_1_french.c
src_c/stem_ISO_8859_1_french.h
src_c/stem_ISO_8859_1_german.c
src_c/stem_ISO_8859_1_german.h
src_c/stem_ISO_8859_1_hungarian.c
src_c/stem_ISO_8859_1_hungarian.h
src_c/stem_ISO_8859_1_italian.c
src_c/stem_ISO_8859_1_italian.h
src_c/stem_ISO_8859_1_norwegian.c
src_c/stem_ISO_8859_1_norwegian.h
src_c/stem_ISO_8859_1_porter.c
src_c/stem_ISO_8859_1_porter.h
src_c/stem_ISO_8859_1_portuguese.c
src_c/stem_ISO_8859_1_portuguese.h
src_c/stem_ISO_8859_1_spanish.c
src_c/stem_ISO_8859_1_spanish.h
src_c/stem_ISO_8859_1_swedish.c
src_c/stem_ISO_8859_1_swedish.h
src_c/stem_ISO_8859_2_romanian.c
src_c/stem_ISO_8859_2_romanian.h
src_c/stem_KOI8_R_russian.c
src_c/stem_KOI8_R_russian.h
src_c/stem_UTF_8_danish.c
src_c/stem_UTF_8_danish.h
src_c/stem_UTF_8_dutch.c
src_c/stem_UTF_8_dutch.h
src_c/stem_UTF_8_english.c
src_c/stem_UTF_8_english.h
src_c/stem_UTF_8_finnish.c
src_c/stem_UTF_8_finnish.h
src_c/stem_UTF_8_french.c
src_c/stem_UTF_8_french.h
src_c/stem_UTF_8_german.c
src_c/stem_UTF_8_german.h
src_c/stem_UTF_8_hungarian.c
src_c/stem_UTF_8_hungarian.h
src_c/stem_UTF_8_italian.c
src_c/stem_UTF_8_italian.h
src_c/stem_UTF_8_norwegian.c
src_c/stem_UTF_8_norwegian.h
src_c/stem_UTF_8_porter.c
src_c/stem_UTF_8_porter.h
src_c/stem_UTF_8_portuguese.c
src_c/stem_UTF_8_portuguese.h
src_c/stem_UTF_8_romanian.c
src_c/stem_UTF_8_romanian.h
src_c/stem_UTF_8_russian.c
src_c/stem_UTF_8_russian.h
src_c/stem_UTF_8_spanish.c
src_c/stem_UTF_8_spanish.h
src_c/stem_UTF_8_swedish.c
src_c/stem_UTF_8_swedish.h
src_c/stem_UTF_8_turkish.c
src_c/stem_UTF_8_turkish.h
runtime/api.c
runtime/api.h
runtime/header.h
runtime/utilities.c
libstemmer/libstemmer.c
libstemmer/libstemmer_utf8.c
libstemmer/modules.h
libstemmer/modules_utf8.h
include/libstemmer.h
@@ -0,0 +1,9 @@
# Pull in the shared makefile fragment that lists the library's source and
# header files (it is expected to define $(snowball_sources), used below).
include mkinc.mak
# Let the compiler find the public header kept in the "include" directory.
CFLAGS=-Iinclude
# Default target: build the library archive and the example program.
all: libstemmer.o stemwords
# Bundle every compiled snowball source into libstemmer.o. Note that the
# output is produced with $(AR), i.e. it is an archive despite the .o suffix.
libstemmer.o: $(snowball_sources:.c=.o)
$(AR) -cru $@ $^
# Link the stemwords example program against the library archive.
stemwords: examples/stemwords.o libstemmer.o
$(CC) -o $@ $^
# Remove the example binary and the object files left in each source directory.
clean:
rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o
@@ -0,0 +1,125 @@
libstemmer_c
============
This document pertains to the C version of the libstemmer distribution,
available for download from:
http://snowball.tartarus.org/dist/libstemmer_c.tgz
Compiling the library
=====================
A simple makefile is provided for Unix style systems. On such systems, it
should be possible simply to run "make", and the file "libstemmer.o"
and the example program "stemwords" will be generated.
If this doesn't work on your system, you need to write your own build
system (or call the compiler directly). The files to compile are
all contained in the "libstemmer", "runtime" and "src_c" directories,
and the public header file is contained in the "include" directory.
The library comes in two flavours: UTF-8 only, and UTF-8 plus other character
sets. To use the UTF-8-only flavour, compile "libstemmer_utf8.c" instead of
"libstemmer.c".
For convenience "mkinc.mak" is a makefile fragment listing the source files and
header files used to compile the standard version of the library.
"mkinc_utf8.mak" is a comparable makefile fragment listing just the source
files for the UTF-8 only version of the library.
Using the library
=================
The library provides a simple C API. Essentially, a new stemmer can
be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
used to stem a word, "sb_stemmer_length" returns the stemmed
length of the last word processed, and "sb_stemmer_delete" is
used to delete a stemmer.
Creating a stemmer is a relatively expensive operation - the expected
usage pattern is that a new stemmer is created when needed, used
to stem many words, and deleted after some time.
Stemmers are re-entrant, but not threadsafe. In other words, if
you wish to access the same stemmer object from multiple threads,
you must ensure that all access is protected by a mutex or similar
device.
libstemmer does not currently incorporate any mechanism for caching the results
of stemming operations. Such caching can greatly increase the performance of a
stemmer in certain situations, so suitable patches will be considered for
inclusion.
The standard libstemmer sources contain an algorithm for each of the supported
languages. The algorithm may be selected using the English name of the
language, or using the 2 or 3 letter ISO 639 language codes. In addition,
the traditional "Porter" stemming algorithm for English is included for
backwards compatibility purposes, but we recommend use of the "English"
stemmer in preference for new projects.
(Some minor algorithms which are included only as curiosities in the snowball
website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
included in the standard libstemmer sources. These are not really supported by
the snowball project, but it would be possible to compile a modified libstemmer
library containing these if desired.)
The stemwords example
=====================
The stemwords example program allows you to run any of the stemmers
compiled into the libstemmer library on a sample vocabulary. For
details on how to use it, run it with the "-h" command line option.
Using the library in a larger system
====================================
If you are incorporating the library into the build system of a larger
program, I recommend copying the unpacked tarball without modification into
a subdirectory of the sources of your program. Future versions of the
library are intended to keep the same structure, so this will keep the
work required to move to a new version of the library to a minimum.
As an additional convenience, the list of source and header files used
in the library is detailed in mkinc.mak - a file which is in a suitable
format for inclusion by a Makefile. By including this file in your build
system, you can link the snowball system into your program with a few
extra rules.
Using the library in a system using GNU autotools
=================================================
The libstemmer_c library can be integrated into a larger system which uses the
GNU autotools framework (and in particular, automake and autoconf) as follows:
1) Unpack libstemmer_c.tgz in the top level project directory so that there is
a libstemmer_c subdirectory of the top level directory of the project.
2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
noinst_LTLIBRARIES = libstemmer.la
include $(srcdir)/mkinc.mak
noinst_HEADERS = $(snowball_headers)
libstemmer_la_SOURCES = $(snowball_sources)
(You may also need to add other lines to this, for example, if you are using
compiler options which are not compatible with compiling the libstemmer
library.)
3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's
configure.ac file.
4) Add to the top level makefile the following lines (or modify existing
assignments to these variables appropriately):
AUTOMAKE_OPTIONS = subdir-objects
AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
SUBDIRS=libstemmer_c
<name>_LIBADD = libstemmer_c/libstemmer.la
(Where <name> is the name of the library or executable which links against
libstemmer.)
@@ -0,0 +1,46 @@
# -*- mode: python -*-

# SCons build script for the vendored Snowball stemmer sources.
# Produces the static library "stemmer" from the runtime support code, the
# UTF-8 library front end, and one generated C file per stemmer module.

Import("env")

# Stemmer modules to compile, named "<encoding>_<language>". Each entry
# corresponds to the generated source file src_c/stem_<entry>.c.
stemmer_modules = [
    "ISO_8859_1_danish",
    "ISO_8859_1_french",
    "ISO_8859_1_norwegian",
    "ISO_8859_1_swedish",
    "UTF_8_dutch",
    "UTF_8_german",
    "UTF_8_porter",
    "UTF_8_spanish",
    "ISO_8859_1_dutch",
    "ISO_8859_1_german",
    "ISO_8859_1_porter",
    "ISO_8859_2_romanian",
    "UTF_8_english",
    "UTF_8_hungarian",
    "UTF_8_portuguese",
    "UTF_8_swedish",
    "ISO_8859_1_english",
    "ISO_8859_1_hungarian",
    "ISO_8859_1_portuguese",
    "KOI8_R_russian",
    "UTF_8_finnish",
    "UTF_8_italian",
    "UTF_8_romanian",
    "UTF_8_turkish",
    "ISO_8859_1_finnish",
    "ISO_8859_1_italian",
    "ISO_8859_1_spanish",
    "UTF_8_danish",
    "UTF_8_french",
    "UTF_8_norwegian",
    "UTF_8_russian",
]

# Hand-written runtime support plus the library entry point.
core_sources = [
    'runtime/api.c',
    'runtime/utilities.c',
    'libstemmer/libstemmer_utf8.c',
]

# One generated translation unit per module, in the order listed above.
module_sources = ['src_c/stem_' + module + '.c' for module in stemmer_modules]

# NOTE(review): only libstemmer_utf8.c is compiled here, which the bundled
# README describes as the UTF-8-only flavour; the ISO_8859_* and KOI8_R
# modules are presumably never referenced by it - confirm before pruning.

env.StaticLibrary("stemmer", core_sources + module_sources)
Oops, something went wrong.

0 comments on commit d2df300

Please sign in to comment.