Permalink
Browse files

SERVER-380: Add snowball stemmer

  • Loading branch information...
1 parent 13a61cb commit d2df300721805ace411b5d1a87cb4bf6d8a51ff3 @erh erh committed Dec 25, 2012
Showing with 19,092 additions and 2 deletions.
  1. +1 −0 SConstruct
  2. +37 −1 distsrc/THIRD-PARTY-NOTICES
  3. +7 −1 src/mongo/SConscript
  4. +2 −0 src/third_party/SConscript
  5. +72 −0 src/third_party/libstemmer_c/MANIFEST
  6. +9 −0 src/third_party/libstemmer_c/Makefile
  7. +125 −0 src/third_party/libstemmer_c/README
  8. +46 −0 src/third_party/libstemmer_c/SConscript
  9. +209 −0 src/third_party/libstemmer_c/examples/stemwords.c
  10. +79 −0 src/third_party/libstemmer_c/include/libstemmer.h
  11. +95 −0 src/third_party/libstemmer_c/libstemmer/libstemmer.c
  12. +95 −0 src/third_party/libstemmer_c/libstemmer/libstemmer_c.in
  13. +95 −0 src/third_party/libstemmer_c/libstemmer/libstemmer_utf8.c
  14. +190 −0 src/third_party/libstemmer_c/libstemmer/modules.h
  15. +50 −0 src/third_party/libstemmer_c/libstemmer/modules.txt
  16. +121 −0 src/third_party/libstemmer_c/libstemmer/modules_utf8.h
  17. +49 −0 src/third_party/libstemmer_c/libstemmer/modules_utf8.txt
  18. +82 −0 src/third_party/libstemmer_c/mkinc.mak
  19. +52 −0 src/third_party/libstemmer_c/mkinc_utf8.mak
  20. +66 −0 src/third_party/libstemmer_c/runtime/api.c
  21. +26 −0 src/third_party/libstemmer_c/runtime/api.h
  22. +58 −0 src/third_party/libstemmer_c/runtime/header.h
  23. +478 −0 src/third_party/libstemmer_c/runtime/utilities.c
  24. +337 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_danish.c
  25. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_danish.h
  26. +624 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c
  27. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h
  28. +1,117 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_english.c
  29. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_english.h
  30. +762 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c
  31. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h
  32. +1,246 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_french.c
  33. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_french.h
  34. +521 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_german.c
  35. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_german.h
  36. +1,230 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c
  37. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h
  38. +1,065 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_italian.c
  39. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_italian.h
  40. +297 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c
  41. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h
  42. +749 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_porter.c
  43. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_porter.h
  44. +1,017 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c
  45. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h
  46. +1,093 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c
  47. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h
  48. +307 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c
  49. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h
  50. +998 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c
  51. +16 −0 src/third_party/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h
  52. +700 −0 src/third_party/libstemmer_c/src_c/stem_KOI8_R_russian.c
  53. +16 −0 src/third_party/libstemmer_c/src_c/stem_KOI8_R_russian.h
  54. +339 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_danish.c
  55. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_danish.h
  56. +634 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_dutch.c
  57. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_dutch.h
  58. +1,125 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_english.c
  59. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_english.h
  60. +768 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_finnish.c
  61. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_finnish.h
  62. +1,256 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_french.c
  63. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_french.h
  64. +527 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_german.c
  65. +16 −0 src/third_party/libstemmer_c/src_c/stem_UTF_8_german.h
Sorry, we could not display the entire diff because it was too big.
View
1 SConstruct
@@ -780,6 +780,7 @@ if not use_system_version_of_library("boost"):
CPPDEFINES=['BOOST_ALL_NO_LIB'])
env.Prepend(CPPPATH=['$BUILD_DIR/third_party/s2'])
+env.Prepend(CPPPATH=['$BUILD_DIR/third_party/libstemmer_c/include'])
env.Append( CPPPATH=['$EXTRACPPPATH'],
LIBPATH=['$EXTRALIBPATH'] )
View
38 distsrc/THIRD-PARTY-NOTICES
@@ -373,4 +373,40 @@ For applicable files:
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-End
+10) License notice for Snowball
+ Copyright (c) 2001, Dr Martin Porter
+ All rights reserved.
+
+THE "BSD" LICENCE
+-----------------
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the name of Google
+ Inc. nor the names of their contributors may be used to endorse or
+ promote products derived from this software without specific prior
+ written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+
+End
View
8 src/mongo/SConscript
@@ -16,6 +16,7 @@ Import("darwin windows solaris linux nix")
env.SConscript(['base/SConscript',
'db/auth/SConscript',
+ 'db/fts/SConscript',
'db/ops/SConscript',
'platform/SConscript',
's/SConscript',
@@ -464,7 +465,11 @@ mongosLibraryFiles = [
"s/version_manager.cpp",
]
-env.Library( "mongoscore" , mongosLibraryFiles, LIBDEPS=['db/auth/authmongos'] )
+env.Library( "mongoscore",
+ mongosLibraryFiles,
+ LIBDEPS=['db/auth/authmongos',
+ 'db/fts/ftsmongos'
+ ] )
env.CppUnitTest( "balancer_policy_test" , [ "s/balancer_policy_tests.cpp" ] ,
LIBDEPS=["mongoscore", "coreshard", "mongocommon","coreserver","coredb","dbcmdline","mongodandmongos"] ,
@@ -532,6 +537,7 @@ env.CppUnitTest("geoparser_test", [ "db/geo/geoparser_test.cpp" ], LIBDEPS = ["g
env.StaticLibrary("serveronly", serverOnlyFiles,
LIBDEPS=["coreshard",
"db/auth/authmongod",
+ "db/fts/ftsmongod",
"dbcmdline",
"defaultversion",
"geoparser",
View
2 src/third_party/SConscript
@@ -59,3 +59,5 @@ else:
env.SConscript('gperftools-2.0/SConscript')
env.StaticLibrary('shim_allocator', 'shim_allocator.cpp',
LIBDEPS=['gperftools-2.0/tcmalloc_minimal'])
+
+env.SConscript('libstemmer_c/SConscript')
View
72 src/third_party/libstemmer_c/MANIFEST
@@ -0,0 +1,72 @@
+README
+src_c/stem_ISO_8859_1_danish.c
+src_c/stem_ISO_8859_1_danish.h
+src_c/stem_ISO_8859_1_dutch.c
+src_c/stem_ISO_8859_1_dutch.h
+src_c/stem_ISO_8859_1_english.c
+src_c/stem_ISO_8859_1_english.h
+src_c/stem_ISO_8859_1_finnish.c
+src_c/stem_ISO_8859_1_finnish.h
+src_c/stem_ISO_8859_1_french.c
+src_c/stem_ISO_8859_1_french.h
+src_c/stem_ISO_8859_1_german.c
+src_c/stem_ISO_8859_1_german.h
+src_c/stem_ISO_8859_1_hungarian.c
+src_c/stem_ISO_8859_1_hungarian.h
+src_c/stem_ISO_8859_1_italian.c
+src_c/stem_ISO_8859_1_italian.h
+src_c/stem_ISO_8859_1_norwegian.c
+src_c/stem_ISO_8859_1_norwegian.h
+src_c/stem_ISO_8859_1_porter.c
+src_c/stem_ISO_8859_1_porter.h
+src_c/stem_ISO_8859_1_portuguese.c
+src_c/stem_ISO_8859_1_portuguese.h
+src_c/stem_ISO_8859_1_spanish.c
+src_c/stem_ISO_8859_1_spanish.h
+src_c/stem_ISO_8859_1_swedish.c
+src_c/stem_ISO_8859_1_swedish.h
+src_c/stem_ISO_8859_2_romanian.c
+src_c/stem_ISO_8859_2_romanian.h
+src_c/stem_KOI8_R_russian.c
+src_c/stem_KOI8_R_russian.h
+src_c/stem_UTF_8_danish.c
+src_c/stem_UTF_8_danish.h
+src_c/stem_UTF_8_dutch.c
+src_c/stem_UTF_8_dutch.h
+src_c/stem_UTF_8_english.c
+src_c/stem_UTF_8_english.h
+src_c/stem_UTF_8_finnish.c
+src_c/stem_UTF_8_finnish.h
+src_c/stem_UTF_8_french.c
+src_c/stem_UTF_8_french.h
+src_c/stem_UTF_8_german.c
+src_c/stem_UTF_8_german.h
+src_c/stem_UTF_8_hungarian.c
+src_c/stem_UTF_8_hungarian.h
+src_c/stem_UTF_8_italian.c
+src_c/stem_UTF_8_italian.h
+src_c/stem_UTF_8_norwegian.c
+src_c/stem_UTF_8_norwegian.h
+src_c/stem_UTF_8_porter.c
+src_c/stem_UTF_8_porter.h
+src_c/stem_UTF_8_portuguese.c
+src_c/stem_UTF_8_portuguese.h
+src_c/stem_UTF_8_romanian.c
+src_c/stem_UTF_8_romanian.h
+src_c/stem_UTF_8_russian.c
+src_c/stem_UTF_8_russian.h
+src_c/stem_UTF_8_spanish.c
+src_c/stem_UTF_8_spanish.h
+src_c/stem_UTF_8_swedish.c
+src_c/stem_UTF_8_swedish.h
+src_c/stem_UTF_8_turkish.c
+src_c/stem_UTF_8_turkish.h
+runtime/api.c
+runtime/api.h
+runtime/header.h
+runtime/utilities.c
+libstemmer/libstemmer.c
+libstemmer/libstemmer_utf8.c
+libstemmer/modules.h
+libstemmer/modules_utf8.h
+include/libstemmer.h
View
9 src/third_party/libstemmer_c/Makefile
@@ -0,0 +1,9 @@
+include mkinc.mak
+CFLAGS=-Iinclude
+all: libstemmer.o stemwords
+libstemmer.o: $(snowball_sources:.c=.o)
+ $(AR) -cru $@ $^
+stemwords: examples/stemwords.o libstemmer.o
+ $(CC) -o $@ $^
+clean:
+ rm -f stemwords *.o src_c/*.o runtime/*.o libstemmer/*.o
View
125 src/third_party/libstemmer_c/README
@@ -0,0 +1,125 @@
+libstemmer_c
+============
+
+This document pertains to the C version of the libstemmer distribution,
+available for download from:
+
+http://snowball.tartarus.org/dist/libstemmer_c.tgz
+
+
+Compiling the library
+=====================
+
+A simple makefile is provided for Unix style systems. On such systems, it
+should be possible simply to run "make", and the file "libstemmer.o"
+and the example program "stemwords" will be generated.
+
+If this doesn't work on your system, you need to write your own build
+system (or call the compiler directly). The files to compile are
+all contained in the "libstemmer", "runtime" and "src_c" directories,
+and the public header file is contained in the "include" directory.
+
+The library comes in two flavours: UTF-8 only, and UTF-8 plus other character
+sets. To use the UTF-8 only flavour, compile "libstemmer_utf8.c" instead of
+"libstemmer.c".
+
+For convenience "mkinc.mak" is a makefile fragment listing the source files and
+header files used to compile the standard version of the library.
+"mkinc_utf8.mak" is a comparable makefile fragment listing just the source
+files for the UTF-8 only version of the library.
+
+
+Using the library
+=================
+
+The library provides a simple C API. Essentially, a new stemmer can
+be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
+used to stem a word, "sb_stemmer_length" returns the stemmed
+length of the last word processed, and "sb_stemmer_delete" is
+used to delete a stemmer.
+
+Creating a stemmer is a relatively expensive operation - the expected
+usage pattern is that a new stemmer is created when needed, used
+to stem many words, and deleted after some time.
+
+Stemmers are re-entrant, but not threadsafe. In other words, if
+you wish to access the same stemmer object from multiple threads,
+you must ensure that all access is protected by a mutex or similar
+device.
+
+libstemmer does not currently incorporate any mechanism for caching the results
+of stemming operations. Such caching can greatly increase the performance of a
+stemmer under certain situations, so suitable patches will be considered for
+inclusion.
+
+The standard libstemmer sources contain an algorithm for each of the supported
+languages. The algorithm may be selected using the English name of the
+language (written in lower case, for example "english"), or using the 2 or 3
+letter ISO 639 language codes. In addition, the traditional "Porter" stemming
+algorithm for English is included for backwards compatibility purposes, but
+we recommend use of the "English" stemmer in preference for new projects.
+
+(Some minor algorithms which are included only as curiosities in the snowball
+website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
+included in the standard libstemmer sources. These are not really supported by
+the snowball project, but it would be possible to compile a modified libstemmer
+library containing these if desired.)
+
+
+The stemwords example
+=====================
+
+The stemwords example program allows you to run any of the stemmers
+compiled into the libstemmer library on a sample vocabulary. For
+details on how to use it, run it with the "-h" command line option.
+
+
+Using the library in a larger system
+====================================
+
+If you are incorporating the library into the build system of a larger
+program, I recommend copying the unpacked tarball without modification into
+a subdirectory of the sources of your program. Future versions of the
+library are intended to keep the same structure, so this will keep the
+work required to move to a new version of the library to a minimum.
+
+As an additional convenience, the list of source and header files used
+in the library is detailed in mkinc.mak - a file which is in a suitable
+format for inclusion by a Makefile. By including this file in your build
+system, you can link the snowball system into your program with a few
+extra rules.
+
+Using the library in a system using GNU autotools
+=================================================
+
+The libstemmer_c library can be integrated into a larger system which uses the
+GNU autotool framework (and in particular, automake and autoconf) as follows:
+
+1) Unpack libstemmer_c.tgz in the top level project directory so that there is
+ a libstemmer_c subdirectory of the top level directory of the project.
+
+2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
+
+noinst_LTLIBRARIES = libstemmer.la
+include $(srcdir)/mkinc.mak
+noinst_HEADERS = $(snowball_headers)
+libstemmer_la_SOURCES = $(snowball_sources)
+
+(You may also need to add other lines to this, for example, if you are using
+compiler options which are not compatible with compiling the libstemmer
+library.)
+
+3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's
+ configure.ac file.
+
+4) Add to the top level makefile the following lines (or modify existing
+ assignments to these variables appropriately):
+
+AUTOMAKE_OPTIONS = subdir-objects
+AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
+SUBDIRS=libstemmer_c
+<name>_LIBADD = libstemmer_c/libstemmer.la
+
+(Where <name> is the name of the library or executable which links against
+libstemmer.)
+
View
46 src/third_party/libstemmer_c/SConscript
@@ -0,0 +1,46 @@
+# -*- mode: python -*-
+
+Import("env")
+
+stemming_packages = [  # <encoding>_<language> suffixes; each names a src_c/stem_<suffix>.c source
+ "ISO_8859_1_danish",
+ "ISO_8859_1_french",
+ "ISO_8859_1_norwegian",
+ "ISO_8859_1_swedish",
+ "UTF_8_dutch",
+ "UTF_8_german",
+ "UTF_8_porter",
+ "UTF_8_spanish",
+ "ISO_8859_1_dutch",
+ "ISO_8859_1_german",
+ "ISO_8859_1_porter",
+ "ISO_8859_2_romanian",
+ "UTF_8_english",
+ "UTF_8_hungarian",
+ "UTF_8_portuguese",
+ "UTF_8_swedish",
+ "ISO_8859_1_english",
+ "ISO_8859_1_hungarian",
+ "ISO_8859_1_portuguese",
+ "KOI8_R_russian",
+ "UTF_8_finnish",
+ "UTF_8_italian",
+ "UTF_8_romanian",
+ "UTF_8_turkish",
+ "ISO_8859_1_finnish",
+ "ISO_8859_1_italian",
+ "ISO_8859_1_spanish",
+ "UTF_8_danish",
+ "UTF_8_french",
+ "UTF_8_norwegian",
+ "UTF_8_russian",
+ ]
+
+stemmer_files = [  # every source handed to the library builder below
+ 'runtime/api.c',
+ 'runtime/utilities.c',
+ 'libstemmer/libstemmer_utf8.c',  # NOTE(review): UTF-8-only front end, yet ISO_8859_*/KOI8_R modules are listed above - confirm they are needed
+ ['src_c/stem_%s.c' % p for p in stemming_packages],  # nested list: one source path per package
+ ]
+
+env.StaticLibrary( "stemmer", stemmer_files )  # passes the file list to env.StaticLibrary under the name "stemmer"
View
209 src/third_party/libstemmer_c/examples/stemwords.c
@@ -0,0 +1,209 @@
+/* This is a simple program which uses libstemmer to provide a command
+ * line interface for stemming using any of the algorithms provided.
+ */
+
+#include <stdio.h>
+#include <stdlib.h> /* for malloc, free */
+#include <string.h> /* for memmove */
+#include <ctype.h> /* for isupper, tolower */
+
+#include "libstemmer.h"
+
+const char * progname;
+static int pretty = 1;
+
+/* Stem every line of f_in and write the results to f_out.
+ *
+ * Each input line (ended by '\n' or end of file) is treated as one word:
+ * bytes for which isupper() is true are lower-cased, the line is passed to
+ * sb_stemmer_stem(), and the stem is written followed by a newline. The
+ * global `pretty` selects the layout:
+ *   1     - "word -> stem"
+ *   2     - word, then (when the stem is non-empty) padding to a 30
+ *           character column, or a newline plus 30 spaces for long words
+ *   other - the stem only
+ *
+ * Returns normally at end of input. Any allocation failure (the line buffer
+ * or a NULL result from sb_stemmer_stem()) frees the buffer, reports
+ * "Out of memory" on stderr and exits with status 1.
+ */
+static void
+stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
+{
+#define INC 10
+    int lim = INC;
+    sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
+    /* Without this check the first b[i] store would dereference NULL. */
+    if (b == NULL) goto error;
+
+    while(1) {
+        int ch = getc(f_in);
+        if (ch == EOF) {
+            free(b); return;
+        }
+        {
+            int i = 0;
+            int inlen = 0;
+            while(1) {
+                if (ch == '\n' || ch == EOF) break;
+                if (i == lim) {
+                    /* Grow via a temporary so b is still valid (and can be
+                     * freed) when realloc fails. */
+                    sb_symbol * newb;
+                    newb = (sb_symbol *)
+                        realloc(b, (lim + INC) * sizeof(sb_symbol));
+                    if (newb == 0) goto error;
+                    b = newb;
+                    lim = lim + INC;
+                }
+                /* Update count of utf-8 characters: continuation bytes
+                 * (0x80..0xBF) are not counted. */
+                if (ch < 0x80 || ch > 0xBF) inlen += 1;
+                /* force lower case: */
+                if (isupper(ch)) ch = tolower(ch);
+
+                b[i] = ch;
+                i++;
+                ch = getc(f_in);
+            }
+
+            {
+                const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
+                if (stemmed == NULL) goto error;
+
+                if (pretty == 1) {
+                    fwrite(b, i, 1, f_out);
+                    fputs(" -> ", f_out);
+                } else if (pretty == 2) {
+                    fwrite(b, i, 1, f_out);
+                    if (sb_stemmer_length(stemmer) > 0) {
+                        int j;
+                        if (inlen < 30) {
+                            for (j = 30 - inlen; j > 0; j--)
+                                fputs(" ", f_out);
+                        } else {
+                            fputs("\n", f_out);
+                            for (j = 30; j > 0; j--)
+                                fputs(" ", f_out);
+                        }
+                    }
+                }
+
+                fputs((char *)stemmed, f_out);
+                putc('\n', f_out);
+            }
+        }
+    }
+error:
+    /* Single out-of-memory exit: free(NULL) is a no-op, so this is safe for
+     * the initial allocation failure as well. */
+    free(b);
+    fprintf(stderr, "Out of memory\n");
+    exit(1);
+}
+
+/** Display the command line syntax, and then exit.
+ * The text is written to stdout; the program name shown is the global
+ * `progname`. This function does not return.
+ * @param n The value to exit with.
+ */
+static void
+usage(int n)
+{
+    printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
+           "\n"
+           "The input file consists of a list of words to be stemmed, one per\n"
+           "line. Words should be in lower case, but (for English) A-Z letters\n"
+           "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
+           "used.\n"
+           "\n"
+           "If -c is given, the argument is the character encoding of the input\n"
+           "and output files. If it is omitted, the UTF-8 encoding is used.\n"
+           "\n"
+           "If -p is given the output file consists of each word of the input\n"
+           "file followed by \"->\" followed by its stemmed equivalent.\n"
+           "If -p2 is given the output file is a two column layout containing\n"
+           "the input words in the first column and the stemmed equivalents in\n"
+           "the second column.\n"
+           "Otherwise, the output file consists of the stemmed words, one per\n"
+           "line.\n"
+           "\n"
+           "-h displays this help\n",
+           progname);
+    exit(n);
+}
+/* Entry point of the stemwords example: parses the command line, opens the
+ * input and output streams, creates a stemmer, runs stem_file() and releases
+ * everything. Returns 0 on success; every error path calls exit(1) directly
+ * or through usage(1), and -h exits through usage(0).
+ */
+int
+main(int argc, char * argv[])
+{
+ char * in = 0; /* input path from -i; 0 means stdin */
+ char * out = 0; /* output path from -o; 0 means stdout */
+ FILE * f_in;
+ FILE * f_out;
+ struct sb_stemmer * stemmer;
+
+ char * language = "english"; /* default algorithm, overridden by -l */
+ char * charenc = NULL; /* overridden by -c; passed as-is to sb_stemmer_new() */
+
+ char * s;
+ int i = 1; /* index of the next argv entry to examine */
+ pretty = 0; /* plain output unless -p or -p2 is given */
+
+ progname = argv[0];
+
+ while(i < argc) {
+ s = argv[i++]; /* i now indexes the option's argument, if there is one */
+ if (s[0] == '-') {
+ if (strcmp(s, "-o") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ out = argv[i++];
+ } else if (strcmp(s, "-i") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ in = argv[i++];
+ } else if (strcmp(s, "-l") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ language = argv[i++];
+ } else if (strcmp(s, "-c") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ charenc = argv[i++];
+ } else if (strcmp(s, "-p2") == 0) {
+ pretty = 2;
+ } else if (strcmp(s, "-p") == 0) {
+ pretty = 1;
+ } else if (strcmp(s, "-h") == 0) {
+ usage(0);
+ } else {
+ fprintf(stderr, "option %s unknown\n", s);
+ usage(1);
+ }
+ } else {
+ fprintf(stderr, "unexpected parameter %s\n", s); /* positional arguments are not accepted */
+ usage(1);
+ }
+ }
+
+ /* prepare the files */
+ f_in = (in == 0) ? stdin : fopen(in, "r");
+ if (f_in == 0) {
+ fprintf(stderr, "file %s not found\n", in); /* printed for any fopen failure, not only a missing file */
+ exit(1);
+ }
+ f_out = (out == 0) ? stdout : fopen(out, "w");
+ if (f_out == 0) {
+ fprintf(stderr, "file %s cannot be opened\n", out);
+ exit(1);
+ }
+
+ /* do the stemming process: */
+ stemmer = sb_stemmer_new(language, charenc);
+ if (stemmer == 0) {
+ if (charenc == NULL) {
+ fprintf(stderr, "language `%s' not available for stemming\n", language);
+ exit(1);
+ } else {
+ fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
+ exit(1);
+ }
+ }
+ stem_file(stemmer, f_in, f_out);
+ sb_stemmer_delete(stemmer);
+
+ if (in != 0) (void) fclose(f_in); /* only streams opened here are closed; stdin/stdout are left alone */
+ if (out != 0) (void) fclose(f_out); /* return value deliberately discarded */
+
+ return 0;
+}
+
View
79 src/third_party/libstemmer_c/include/libstemmer.h
@@ -0,0 +1,79 @@
+
+/* Make header file work when included from C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct sb_stemmer;
+typedef unsigned char sb_symbol;
+
+/* FIXME - should be able to get a version number for each stemming
+ * algorithm (which will be incremented each time the output changes). */
+
+/** Returns an array of the names of the available stemming algorithms.
+ * Note that these are the canonical names - aliases (ie, other names for
+ * the same algorithm) will not be included in the list.
+ * The list is terminated with a null pointer.
+ *
+ * The list must not be modified in any way.
+ */
+const char ** sb_stemmer_list(void);
+
+/** Create a new stemmer object, using the specified algorithm, for the
+ * specified character encoding.
+ *
+ * All algorithms will usually be available in UTF-8, but may also be
+ * available in other character encodings.
+ *
+ * @param algorithm The algorithm name. This is either the english
+ * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the
+ * language. Note that case is significant in this parameter - the
+ * value should be supplied in lower case.
+ *
+ * @param charenc The character encoding. NULL may be passed as
+ * this value, in which case UTF-8 encoding will be assumed. Otherwise,
+ * the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
+ * "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian). Note that
+ * case is significant in this parameter.
+ *
+ * @return NULL if the specified algorithm is not recognised, or the
+ * algorithm is not available for the requested encoding. Otherwise,
+ * returns a pointer to a newly created stemmer for the requested algorithm.
+ * The returned pointer must be deleted by calling sb_stemmer_delete().
+ *
+ * @note NULL will also be returned if an out of memory error occurs.
+ */
+struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);
+
+/** Delete a stemmer object.
+ *
+ * This frees all resources allocated for the stemmer. After calling
+ * this function, the supplied stemmer may no longer be used in any way.
+ *
+ * It is safe to pass a null pointer to this function - this will have
+ * no effect.
+ */
+void sb_stemmer_delete(struct sb_stemmer * stemmer);
+
+/** Stem a word.
+ *
+ * The return value is owned by the stemmer - it must not be freed or
+ * modified, and it will become invalid when the stemmer is called again,
+ * or if the stemmer is freed.
+ *
+ * The length of the return value can be obtained using sb_stemmer_length().
+ *
+ * If an out-of-memory error occurs, this will return NULL.
+ */
+const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
+ const sb_symbol * word, int size);
+
+/** Get the length of the result of the last stemmed word.
+ * This should not be called before sb_stemmer_stem() has been called.
+ */
+int sb_stemmer_length(struct sb_stemmer * stemmer);
+
+#ifdef __cplusplus
+}
+#endif
+
View
95 src/third_party/libstemmer_c/libstemmer/libstemmer.c
@@ -0,0 +1,95 @@
+
+#include <stdlib.h>
+#include <string.h>
+#include "../include/libstemmer.h"
+#include "../runtime/api.h"
+#include "modules.h"
+
+struct sb_stemmer { /* one stemmer instance: the selected module's entry points plus its environment */
+ struct SN_env * (*create)(void); /* copied from the module table entry by sb_stemmer_new() */
+ void (*close)(struct SN_env *); /* called with env by sb_stemmer_delete() */
+ int (*stem)(struct SN_env *); /* called by sb_stemmer_stem(); a negative return is reported as NULL */
+
+ struct SN_env * env; /* result of create(); its p buffer and l length are read by the API functions */
+};
+/* Returns algorithm_names unchanged. That array is not defined in this file;
+ * presumably it comes from the modules header included above - confirm there.
+ */
+extern const char **
+sb_stemmer_list(void)
+{
+ return algorithm_names;
+}
+
+/* Translate a character-encoding name into its stemmer_encoding_t value.
+ * A NULL name selects ENC_UTF_8; a name that matches no entry of the
+ * encodings table (exact strcmp comparison) yields ENC_UNKNOWN.
+ */
+static stemmer_encoding_t
+sb_getenc(const char * charenc)
+{
+    struct stemmer_encoding * entry = encodings;
+
+    if (charenc == NULL) return ENC_UTF_8;
+
+    /* The table is terminated by an entry whose name is a null pointer. */
+    while (entry->name != NULL) {
+        if (strcmp(entry->name, charenc) == 0) return entry->enc;
+        entry++;
+    }
+    return ENC_UNKNOWN;
+}
+
+/* Create a stemmer for the named algorithm in the given character encoding.
+ *
+ * algorithm: name matched exactly (strcmp) against the modules table. A NULL
+ *            pointer is rejected here rather than being handed to strcmp().
+ * charenc:   encoding name resolved by sb_getenc(); NULL means UTF-8.
+ *
+ * Returns a new stemmer, to be released with sb_stemmer_delete(), or NULL
+ * when the algorithm/encoding pair is not found or an allocation fails.
+ */
+extern struct sb_stemmer *
+sb_stemmer_new(const char * algorithm, const char * charenc)
+{
+    stemmer_encoding_t enc;
+    struct stemmer_modules * module;
+    struct sb_stemmer * stemmer;
+
+    if (algorithm == NULL) return NULL;
+
+    enc = sb_getenc(charenc);
+    if (enc == ENC_UNKNOWN) return NULL;
+
+    /* The modules table ends with an entry whose name is a null pointer. */
+    for (module = modules; module->name != 0; module++) {
+        if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
+    }
+    if (module->name == NULL) return NULL;
+
+    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+    if (stemmer == NULL) return NULL;
+
+    stemmer->create = module->create;
+    stemmer->close = module->close;
+    stemmer->stem = module->stem;
+
+    stemmer->env = stemmer->create();
+    if (stemmer->env == NULL)
+    {
+        /* NOTE(review): this calls close() with a NULL env; that is assumed
+         * to be accepted by the module - confirm in the runtime sources. */
+        sb_stemmer_delete(stemmer);
+        return NULL;
+    }
+
+    return stemmer;
+}
+
+/* Release a stemmer obtained from sb_stemmer_new(). A NULL pointer is
+ * ignored. When a close function is set it is called on the environment;
+ * the stemmer structure itself is freed in every case, so a stemmer with a
+ * null close pointer is not leaked.
+ */
+void
+sb_stemmer_delete(struct sb_stemmer * stemmer)
+{
+    if (stemmer == 0) return;
+    if (stemmer->close != 0) {
+        stemmer->close(stemmer->env);
+        stemmer->close = 0;
+    }
+    free(stemmer);
+}
+
+/* Stem `size` symbols starting at `word`.
+ * The word is loaded into the stemmer's environment with SN_set_current();
+ * if that reports failure the environment length is reset to 0 and NULL is
+ * returned. NULL is also returned when the module's stem function returns a
+ * negative value. Otherwise a zero is stored at p[l] and the environment's
+ * buffer p is returned.
+ */
+const sb_symbol *
+sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
+{
+    struct SN_env * z = stemmer->env;
+
+    if (SN_set_current(z, size, (const symbol *)(word)) != 0)
+    {
+        z->l = 0;
+        return NULL;
+    }
+    if (stemmer->stem(z) < 0) return NULL;
+
+    z->p[z->l] = 0;
+    return (const sb_symbol *)(z->p);
+}
+/* Returns the l field of the stemmer's environment. sb_stemmer_stem() stores
+ * its terminating zero at p[l], and resets l to 0 when SN_set_current()
+ * reports failure. The stemmer pointer is dereferenced without a NULL check.
+ */
+int
+sb_stemmer_length(struct sb_stemmer * stemmer)
+{
+ return stemmer->env->l;
+}
View
95 src/third_party/libstemmer_c/libstemmer/libstemmer_c.in
@@ -0,0 +1,95 @@
+
+#include <stdlib.h>
+#include <string.h>
+#include "../include/libstemmer.h"
+#include "../runtime/api.h"
+#include "@MODULES_H@"
+
+struct sb_stemmer { /* one stemmer instance: the selected module's entry points plus its environment */
+ struct SN_env * (*create)(void); /* copied from the module table entry by sb_stemmer_new() */
+ void (*close)(struct SN_env *); /* called with env by sb_stemmer_delete() */
+ int (*stem)(struct SN_env *); /* called by sb_stemmer_stem(); a negative return is reported as NULL */
+
+ struct SN_env * env; /* result of create(); its p buffer and l length are read by the API functions */
+};
+/* Returns algorithm_names unchanged. That array is not defined in this file;
+ * presumably it comes from the modules header included above - confirm there.
+ */
+extern const char **
+sb_stemmer_list(void)
+{
+ return algorithm_names;
+}
+
+/* Translate a character-encoding name into its stemmer_encoding_t value.
+ * A NULL name selects ENC_UTF_8; a name that matches no entry of the
+ * encodings table (exact strcmp comparison) yields ENC_UNKNOWN.
+ */
+static stemmer_encoding_t
+sb_getenc(const char * charenc)
+{
+    struct stemmer_encoding * entry = encodings;
+
+    if (charenc == NULL) return ENC_UTF_8;
+
+    /* The table is terminated by an entry whose name is a null pointer. */
+    while (entry->name != NULL) {
+        if (strcmp(entry->name, charenc) == 0) return entry->enc;
+        entry++;
+    }
+    return ENC_UNKNOWN;
+}
+
+/* Create a stemmer for the named algorithm in the given character encoding.
+ *
+ * algorithm: name matched exactly (strcmp) against the modules table. A NULL
+ *            pointer is rejected here rather than being handed to strcmp().
+ * charenc:   encoding name resolved by sb_getenc(); NULL means UTF-8.
+ *
+ * Returns a new stemmer, to be released with sb_stemmer_delete(), or NULL
+ * when the algorithm/encoding pair is not found or an allocation fails.
+ */
+extern struct sb_stemmer *
+sb_stemmer_new(const char * algorithm, const char * charenc)
+{
+    stemmer_encoding_t enc;
+    struct stemmer_modules * module;
+    struct sb_stemmer * stemmer;
+
+    if (algorithm == NULL) return NULL;
+
+    enc = sb_getenc(charenc);
+    if (enc == ENC_UNKNOWN) return NULL;
+
+    /* The modules table ends with an entry whose name is a null pointer. */
+    for (module = modules; module->name != 0; module++) {
+        if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
+    }
+    if (module->name == NULL) return NULL;
+
+    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+    if (stemmer == NULL) return NULL;
+
+    stemmer->create = module->create;
+    stemmer->close = module->close;
+    stemmer->stem = module->stem;
+
+    stemmer->env = stemmer->create();
+    if (stemmer->env == NULL)
+    {
+        /* NOTE(review): this calls close() with a NULL env; that is assumed
+         * to be accepted by the module - confirm in the runtime sources. */
+        sb_stemmer_delete(stemmer);
+        return NULL;
+    }
+
+    return stemmer;
+}
+
+/* Release a stemmer obtained from sb_stemmer_new(). A NULL pointer is
+ * ignored. When a close function is set it is called on the environment;
+ * the stemmer structure itself is freed in every case, so a stemmer with a
+ * null close pointer is not leaked.
+ */
+void
+sb_stemmer_delete(struct sb_stemmer * stemmer)
+{
+    if (stemmer == 0) return;
+    if (stemmer->close != 0) {
+        stemmer->close(stemmer->env);
+        stemmer->close = 0;
+    }
+    free(stemmer);
+}
+
+/* Stem `size` symbols starting at `word`.
+ * The word is loaded into the stemmer's environment with SN_set_current();
+ * if that reports failure the environment length is reset to 0 and NULL is
+ * returned. NULL is also returned when the module's stem function returns a
+ * negative value. Otherwise a zero is stored at p[l] and the environment's
+ * buffer p is returned.
+ */
+const sb_symbol *
+sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
+{
+    struct SN_env * z = stemmer->env;
+
+    if (SN_set_current(z, size, (const symbol *)(word)) != 0)
+    {
+        z->l = 0;
+        return NULL;
+    }
+    if (stemmer->stem(z) < 0) return NULL;
+
+    z->p[z->l] = 0;
+    return (const sb_symbol *)(z->p);
+}
+/* Returns the l field of the stemmer's environment. sb_stemmer_stem() stores
+ * its terminating zero at p[l], and resets l to 0 when SN_set_current()
+ * reports failure. The stemmer pointer is dereferenced without a NULL check.
+ */
+int
+sb_stemmer_length(struct sb_stemmer * stemmer)
+{
+ return stemmer->env->l;
+}
View
95 src/third_party/libstemmer_c/libstemmer/libstemmer_utf8.c
@@ -0,0 +1,95 @@
+
+#include <stdlib.h>
+#include <string.h>
+#include "../include/libstemmer.h"
+#include "../runtime/api.h"
+#include "modules_utf8.h"
+
+struct sb_stemmer { /* one stemmer instance: the selected module's entry points plus its environment */
+ struct SN_env * (*create)(void); /* copied from the module table entry by sb_stemmer_new() */
+ void (*close)(struct SN_env *); /* called with env by sb_stemmer_delete() */
+ int (*stem)(struct SN_env *); /* called by sb_stemmer_stem(); a negative return is reported as NULL */
+
+ struct SN_env * env; /* result of create(); its p buffer and l length are read by the API functions */
+};
+/* Returns algorithm_names unchanged. That array is not defined in this file;
+ * presumably it comes from the modules header included above - confirm there.
+ */
+extern const char **
+sb_stemmer_list(void)
+{
+ return algorithm_names;
+}
+
+/* Translate a character-encoding name into its stemmer_encoding_t value.
+ * A NULL name selects ENC_UTF_8; a name that matches no entry of the
+ * encodings table (exact strcmp comparison) yields ENC_UNKNOWN.
+ */
+static stemmer_encoding_t
+sb_getenc(const char * charenc)
+{
+    struct stemmer_encoding * entry = encodings;
+
+    if (charenc == NULL) return ENC_UTF_8;
+
+    /* The table is terminated by an entry whose name is a null pointer. */
+    while (entry->name != NULL) {
+        if (strcmp(entry->name, charenc) == 0) return entry->enc;
+        entry++;
+    }
+    return ENC_UNKNOWN;
+}
+
+/* Create a stemmer for the named algorithm in the given character encoding.
+ *
+ * algorithm: name matched exactly (strcmp) against the modules table. A NULL
+ *            pointer is rejected here rather than being handed to strcmp().
+ * charenc:   encoding name resolved by sb_getenc(); NULL means UTF-8.
+ *
+ * Returns a new stemmer, to be released with sb_stemmer_delete(), or NULL
+ * when the algorithm/encoding pair is not found or an allocation fails.
+ */
+extern struct sb_stemmer *
+sb_stemmer_new(const char * algorithm, const char * charenc)
+{
+    stemmer_encoding_t enc;
+    struct stemmer_modules * module;
+    struct sb_stemmer * stemmer;
+
+    if (algorithm == NULL) return NULL;
+
+    enc = sb_getenc(charenc);
+    if (enc == ENC_UNKNOWN) return NULL;
+
+    /* The modules table ends with an entry whose name is a null pointer. */
+    for (module = modules; module->name != 0; module++) {
+        if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
+    }
+    if (module->name == NULL) return NULL;
+
+    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+    if (stemmer == NULL) return NULL;
+
+    stemmer->create = module->create;
+    stemmer->close = module->close;
+    stemmer->stem = module->stem;
+
+    stemmer->env = stemmer->create();
+    if (stemmer->env == NULL)
+    {
+        /* NOTE(review): this calls close() with a NULL env; that is assumed
+         * to be accepted by the module - confirm in the runtime sources. */
+        sb_stemmer_delete(stemmer);
+        return NULL;
+    }
+
+    return stemmer;
+}
+
+/* Release a stemmer obtained from sb_stemmer_new(). A NULL pointer is
+ * ignored. When a close function is set it is called on the environment;
+ * the stemmer structure itself is freed in every case, so a stemmer with a
+ * null close pointer is not leaked.
+ */
+void
+sb_stemmer_delete(struct sb_stemmer * stemmer)
+{
+    if (stemmer == 0) return;
+    if (stemmer->close != 0) {
+        stemmer->close(stemmer->env);
+        stemmer->close = 0;
+    }
+    free(stemmer);
+}
+
+/* Stem `size` symbols starting at `word`.
+ * The word is loaded into the stemmer's environment with SN_set_current();
+ * if that reports failure the environment length is reset to 0 and NULL is
+ * returned. NULL is also returned when the module's stem function returns a
+ * negative value. Otherwise a zero is stored at p[l] and the environment's
+ * buffer p is returned.
+ */
+const sb_symbol *
+sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
+{
+    struct SN_env * z = stemmer->env;
+
+    if (SN_set_current(z, size, (const symbol *)(word)) != 0)
+    {
+        z->l = 0;
+        return NULL;
+    }
+    if (stemmer->stem(z) < 0) return NULL;
+
+    z->p[z->l] = 0;
+    return (const sb_symbol *)(z->p);
+}
+/* Returns the l field of the stemmer's environment. sb_stemmer_stem() stores
+ * its terminating zero at p[l], and resets l to 0 when SN_set_current()
+ * reports failure. The stemmer pointer is dereferenced without a NULL check.
+ */
+int
+sb_stemmer_length(struct sb_stemmer * stemmer)
+{
+ return stemmer->env->l;
+}
View
190 src/third_party/libstemmer_c/libstemmer/modules.h
@@ -0,0 +1,190 @@
+/* libstemmer/modules.h: List of stemming modules.
+ *
+ * This file is generated by mkmodules.pl from a list of module names.
+ * Do not edit manually.
+ *
+ * Modules included by this file are: danish, dutch, english, finnish, french,
+ * german, hungarian, italian, norwegian, porter, portuguese, romanian,
+ * russian, spanish, swedish, turkish
+ */
+
+#include "../src_c/stem_ISO_8859_1_danish.h"
+#include "../src_c/stem_UTF_8_danish.h"
+#include "../src_c/stem_ISO_8859_1_dutch.h"
+#include "../src_c/stem_UTF_8_dutch.h"
+#include "../src_c/stem_ISO_8859_1_english.h"
+#include "../src_c/stem_UTF_8_english.h"
+#include "../src_c/stem_ISO_8859_1_finnish.h"
+#include "../src_c/stem_UTF_8_finnish.h"
+#include "../src_c/stem_ISO_8859_1_french.h"
+#include "../src_c/stem_UTF_8_french.h"
+#include "../src_c/stem_ISO_8859_1_german.h"
+#include "../src_c/stem_UTF_8_german.h"
+#include "../src_c/stem_ISO_8859_1_hungarian.h"
+#include "../src_c/stem_UTF_8_hungarian.h"
+#include "../src_c/stem_ISO_8859_1_italian.h"
+#include "../src_c/stem_UTF_8_italian.h"
+#include "../src_c/stem_ISO_8859_1_norwegian.h"
+#include "../src_c/stem_UTF_8_norwegian.h"
+#include "../src_c/stem_ISO_8859_1_porter.h"
+#include "../src_c/stem_UTF_8_porter.h"
+#include "../src_c/stem_ISO_8859_1_portuguese.h"
+#include "../src_c/stem_UTF_8_portuguese.h"
+#include "../src_c/stem_ISO_8859_2_romanian.h"
+#include "../src_c/stem_UTF_8_romanian.h"
+#include "../src_c/stem_KOI8_R_russian.h"
+#include "../src_c/stem_UTF_8_russian.h"
+#include "../src_c/stem_ISO_8859_1_spanish.h"
+#include "../src_c/stem_UTF_8_spanish.h"
+#include "../src_c/stem_ISO_8859_1_swedish.h"
+#include "../src_c/stem_UTF_8_swedish.h"
+#include "../src_c/stem_UTF_8_turkish.h"
+
+typedef enum {
+ ENC_UNKNOWN=0,
+ ENC_ISO_8859_1,
+ ENC_ISO_8859_2,
+ ENC_KOI8_R,
+ ENC_UTF_8
+} stemmer_encoding_t;
+
+struct stemmer_encoding {
+ const char * name;
+ stemmer_encoding_t enc;
+};
+static struct stemmer_encoding encodings[] = {
+ {"ISO_8859_1", ENC_ISO_8859_1},
+ {"ISO_8859_2", ENC_ISO_8859_2},
+ {"KOI8_R", ENC_KOI8_R},
+ {"UTF_8", ENC_UTF_8},
+ {0,ENC_UNKNOWN}
+};
+
+struct stemmer_modules {
+ const char * name;
+ stemmer_encoding_t enc;
+ struct SN_env * (*create)(void);
+ void (*close)(struct SN_env *);
+ int (*stem)(struct SN_env *);
+};
+static struct stemmer_modules modules[] = {
+ {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
+ {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+ {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
+ {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+ {"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
+ {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+ {"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
+ {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+ {"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
+ {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+ {"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
+ {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+ {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
+ {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+ {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
+ {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+ {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
+ {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+ {"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
+ {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+ {"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
+ {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+ {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
+ {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+ {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
+ {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+ {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
+ {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+ {"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
+ {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+ {"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
+ {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+ {"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
+ {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+ {"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
+ {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+ {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
+ {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+ {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
+ {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+ {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
+ {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+ {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
+ {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+ {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
+ {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+ {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
+ {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+ {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
+ {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+ {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
+ {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+ {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
+ {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+ {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
+ {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+ {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
+ {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+ {"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
+ {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+ {"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
+ {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+ {"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
+ {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+ {"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
+ {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+ {"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
+ {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
+ {"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
+ {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+ {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
+ {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+ {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
+ {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+ {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
+ {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+ {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
+ {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+ {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
+ {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+ {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
+ {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+ {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
+ {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+ {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
+ {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+ {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
+ {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+ {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
+ {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+ {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
+ {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+ {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
+ {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+ {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
+ {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+ {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+ {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+ {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+ {0,ENC_UNKNOWN,0,0,0}
+};
+static const char * algorithm_names[] = {
+ "danish",
+ "dutch",
+ "english",
+ "finnish",
+ "french",
+ "german",
+ "hungarian",
+ "italian",
+ "norwegian",
+ "porter",
+ "portuguese",
+ "romanian",
+ "russian",
+ "spanish",
+ "swedish",
+ "turkish",
+ 0
+};
View
50 src/third_party/libstemmer_c/libstemmer/modules.txt
@@ -0,0 +1,50 @@
+# This file contains a list of stemmers to include in the distribution.
+# The format is a set of space separated lines - on each line:
+# First item is name of stemmer.
+# Second item is comma separated list of character sets.
+# Third item is comma separated list of names to refer to the stemmer by.
+#
+# Lines starting with a #, or blank lines, are ignored.
+
+# List all the main algorithms for each language, in UTF-8, and also with
+# the most commonly used encoding.
+
+danish UTF_8,ISO_8859_1 danish,da,dan
+dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
+english UTF_8,ISO_8859_1 english,en,eng
+finnish UTF_8,ISO_8859_1 finnish,fi,fin
+french UTF_8,ISO_8859_1 french,fr,fre,fra
+german UTF_8,ISO_8859_1 german,de,ger,deu
+hungarian UTF_8,ISO_8859_1 hungarian,hu,hun
+italian UTF_8,ISO_8859_1 italian,it,ita
+norwegian UTF_8,ISO_8859_1 norwegian,no,nor
+portuguese UTF_8,ISO_8859_1 portuguese,pt,por
+romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
+russian UTF_8,KOI8_R russian,ru,rus
+spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
+swedish UTF_8,ISO_8859_1 swedish,sv,swe
+turkish UTF_8 turkish,tr,tur
+
+# Also include the traditional porter algorithm for english.
+# The porter algorithm is included in the libstemmer distribution to assist
+# with backwards compatibility, but for new systems the english algorithm
+# should be used in preference.
+porter UTF_8,ISO_8859_1 porter
+
+# Some other stemmers in the snowball project are not included in the standard
+# distribution. To compile a libstemmer with them in, add them to this list,
+# and regenerate the distribution. (You will need a full source checkout for
+# this.) They are included in the snowball website as curiosities, but are not
+# intended for general use, and use of them is not fully supported. These
+# algorithms are:
+#
+# german2 - This is a slight modification of the german stemmer.
+#german2 UTF_8,ISO_8859_1 german2
+#
+# kraaij_pohlmann - This is a different dutch stemmer.
+#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann
+#
+# lovins - This is an english stemmer, but fairly outdated, and
+# only really applicable to a restricted type of input text
+# (keywords in academic publications).
+#lovins UTF_8,ISO_8859_1 lovins
View
121 src/third_party/libstemmer_c/libstemmer/modules_utf8.h
@@ -0,0 +1,121 @@
+/* libstemmer/modules_utf8.h: List of stemming modules.
+ *
+ * This file is generated by mkmodules.pl from a list of module names.
+ * Do not edit manually.
+ *
+ * Modules included by this file are: danish, dutch, english, finnish, french,
+ * german, hungarian, italian, norwegian, porter, portuguese, romanian,
+ * russian, spanish, swedish, turkish
+ */
+
+#include "../src_c/stem_UTF_8_danish.h"
+#include "../src_c/stem_UTF_8_dutch.h"
+#include "../src_c/stem_UTF_8_english.h"
+#include "../src_c/stem_UTF_8_finnish.h"
+#include "../src_c/stem_UTF_8_french.h"
+#include "../src_c/stem_UTF_8_german.h"
+#include "../src_c/stem_UTF_8_hungarian.h"
+#include "../src_c/stem_UTF_8_italian.h"
+#include "../src_c/stem_UTF_8_norwegian.h"
+#include "../src_c/stem_UTF_8_porter.h"
+#include "../src_c/stem_UTF_8_portuguese.h"
+#include "../src_c/stem_UTF_8_romanian.h"
+#include "../src_c/stem_UTF_8_russian.h"
+#include "../src_c/stem_UTF_8_spanish.h"
+#include "../src_c/stem_UTF_8_swedish.h"
+#include "../src_c/stem_UTF_8_turkish.h"
+
+typedef enum {
+ ENC_UNKNOWN=0,
+ ENC_UTF_8
+} stemmer_encoding_t;
+
+struct stemmer_encoding {
+ const char * name;
+ stemmer_encoding_t enc;
+};
+static struct stemmer_encoding encodings[] = {
+ {"UTF_8", ENC_UTF_8},
+ {0,ENC_UNKNOWN}
+};
+
+struct stemmer_modules {
+ const char * name;
+ stemmer_encoding_t enc;
+ struct SN_env * (*create)(void);
+ void (*close)(struct SN_env *);
+ int (*stem)(struct SN_env *);
+};
+static struct stemmer_modules modules[] = {
+ {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+ {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+ {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
+ {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+ {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+ {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+ {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+ {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+ {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+ {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
+ {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+ {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+ {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+ {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+ {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
+ {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+ {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+ {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+ {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
+ {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+ {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
+ {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+ {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+ {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
+ {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+ {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+ {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
+ {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+ {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
+ {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+ {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+ {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
+ {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+ {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
+ {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+ {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
+ {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+ {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+ {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+ {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+ {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
+ {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+ {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
+ {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+ {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
+ {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+ {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+ {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
+ {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+ {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+ {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
+ {0,ENC_UNKNOWN,0,0,0}
+};
+static const char * algorithm_names[] = {
+ "danish",
+ "dutch",
+ "english",
+ "finnish",
+ "french",
+ "german",
+ "hungarian",
+ "italian",
+ "norwegian",
+ "porter",
+ "portuguese",
+ "romanian",
+ "russian",
+ "spanish",
+ "swedish",
+ "turkish",
+ 0
+};
View
49 src/third_party/libstemmer_c/libstemmer/modules_utf8.txt
@@ -0,0 +1,49 @@
+# This file contains a list of stemmers to include in the distribution.
+# The format is a set of space separated lines - on each line:
+# First item is name of stemmer.
+# Second item is comma separated list of character sets.
+# Third item is comma separated list of names to refer to the stemmer by.
+#
+# Lines starting with a #, or blank lines, are ignored.
+
+# List all the main algorithms for each language, in UTF-8.
+
+danish UTF_8 danish,da,dan
+dutch UTF_8 dutch,nl,dut,nld
+english UTF_8 english,en,eng
+finnish UTF_8 finnish,fi,fin
+french UTF_8 french,fr,fre,fra
+german UTF_8 german,de,ger,deu
+hungarian UTF_8 hungarian,hu,hun
+italian UTF_8 italian,it,ita
+norwegian UTF_8 norwegian,no,nor
+portuguese UTF_8 portuguese,pt,por
+romanian UTF_8 romanian,ro,rum,ron
+russian UTF_8 russian,ru,rus
+spanish UTF_8 spanish,es,esl,spa
+swedish UTF_8 swedish,sv,swe
+turkish UTF_8 turkish,tr,tur
+
+# Also include the traditional porter algorithm for english.
+# The porter algorithm is included in the libstemmer distribution to assist
+# with backwards compatibility, but for new systems the english algorithm
+# should be used in preference.
+porter UTF_8 porter
+
+# Some other stemmers in the snowball project are not included in the standard
+# distribution. To compile a libstemmer with them in, add them to this list,
+# and regenerate the distribution. (You will need a full source checkout for
+# this.) They are included in the snowball website as curiosities, but are not
+# intended for general use, and use of them is not fully supported. These
+# algorithms are:
+#
+# german2 - This is a slight modification of the german stemmer.
+#german2 UTF_8 german2
+#
+# kraaij_pohlmann - This is a different dutch stemmer.
+#kraaij_pohlmann UTF_8 kraaij_pohlmann
+#
+# lovins - This is an english stemmer, but fairly outdated, and
+# only really applicable to a restricted type of input text
+# (keywords in academic publications).
+#lovins UTF_8 lovins
View
82 src/third_party/libstemmer_c/mkinc.mak
@@ -0,0 +1,82 @@
+# libstemmer/mkinc.mak: List of stemming module source files
+#
+# This file is generated by mkmodules.pl from a list of module names.
+# Do not edit manually.
+#
+# Modules included by this file are: danish, dutch, english, finnish, french,
+# german, hungarian, italian, norwegian, porter, portuguese, romanian,
+# russian, spanish, swedish, turkish
+
+snowball_sources= \
+ src_c/stem_ISO_8859_1_danish.c \
+ src_c/stem_UTF_8_danish.c \
+ src_c/stem_ISO_8859_1_dutch.c \
+ src_c/stem_UTF_8_dutch.c \
+ src_c/stem_ISO_8859_1_english.c \
+ src_c/stem_UTF_8_english.c \
+ src_c/stem_ISO_8859_1_finnish.c \
+ src_c/stem_UTF_8_finnish.c \
+ src_c/stem_ISO_8859_1_french.c \
+ src_c/stem_UTF_8_french.c \
+ src_c/stem_ISO_8859_1_german.c \
+ src_c/stem_UTF_8_german.c \
+ src_c/stem_ISO_8859_1_hungarian.c \
+ src_c/stem_UTF_8_hungarian.c \
+ src_c/stem_ISO_8859_1_italian.c \
+ src_c/stem_UTF_8_italian.c \
+ src_c/stem_ISO_8859_1_norwegian.c \
+ src_c/stem_UTF_8_norwegian.c \
+ src_c/stem_ISO_8859_1_porter.c \
+ src_c/stem_UTF_8_porter.c \
+ src_c/stem_ISO_8859_1_portuguese.c \
+ src_c/stem_UTF_8_portuguese.c \
+ src_c/stem_ISO_8859_2_romanian.c \
+ src_c/stem_UTF_8_romanian.c \
+ src_c/stem_KOI8_R_russian.c \
+ src_c/stem_UTF_8_russian.c \
+ src_c/stem_ISO_8859_1_spanish.c \
+ src_c/stem_UTF_8_spanish.c \
+ src_c/stem_ISO_8859_1_swedish.c \
+ src_c/stem_UTF_8_swedish.c \
+ src_c/stem_UTF_8_turkish.c \
+ runtime/api.c \
+ runtime/utilities.c \
+ libstemmer/libstemmer.c
+
+snowball_headers= \
+ src_c/stem_ISO_8859_1_danish.h \
+ src_c/stem_UTF_8_danish.h \
+ src_c/stem_ISO_8859_1_dutch.h \
+ src_c/stem_UTF_8_dutch.h \
+ src_c/stem_ISO_8859_1_english.h \
+ src_c/stem_UTF_8_english.h \
+ src_c/stem_ISO_8859_1_finnish.h \
+ src_c/stem_UTF_8_finnish.h \
+ src_c/stem_ISO_8859_1_french.h \
+ src_c/stem_UTF_8_french.h \
+ src_c/stem_ISO_8859_1_german.h \
+ src_c/stem_UTF_8_german.h \
+ src_c/stem_ISO_8859_1_hungarian.h \
+ src_c/stem_UTF_8_hungarian.h \
+ src_c/stem_ISO_8859_1_italian.h \
+ src_c/stem_UTF_8_italian.h \
+ src_c/stem_ISO_8859_1_norwegian.h \
+ src_c/stem_UTF_8_norwegian.h \
+ src_c/stem_ISO_8859_1_porter.h \
+ src_c/stem_UTF_8_porter.h \
+ src_c/stem_ISO_8859_1_portuguese.h \
+ src_c/stem_UTF_8_portuguese.h \
+ src_c/stem_ISO_8859_2_romanian.h \
+ src_c/stem_UTF_8_romanian.h \
+ src_c/stem_KOI8_R_russian.h \
+ src_c/stem_UTF_8_russian.h \
+ src_c/stem_ISO_8859_1_spanish.h \
+ src_c/stem_UTF_8_spanish.h \
+ src_c/stem_ISO_8859_1_swedish.h \
+ src_c/stem_UTF_8_swedish.h \
+ src_c/stem_UTF_8_turkish.h \
+ include/libstemmer.h \
+ libstemmer/modules.h \
+ runtime/api.h \
+ runtime/header.h
+
View
52 src/third_party/libstemmer_c/mkinc_utf8.mak
@@ -0,0 +1,52 @@
+# libstemmer/mkinc_utf8.mak: List of stemming module source files
+#
+# This file is generated by mkmodules.pl from a list of module names.
+# Do not edit manually.
+#
+# Modules included by this file are: danish, dutch, english, finnish, french,
+# german, hungarian, italian, norwegian, porter, portuguese, romanian,
+# russian, spanish, swedish, turkish
+
+snowball_sources= \
+ src_c/stem_UTF_8_danish.c \
+ src_c/stem_UTF_8_dutch.c \
+ src_c/stem_UTF_8_english.c \
+ src_c/stem_UTF_8_finnish.c \
+ src_c/stem_UTF_8_french.c \
+ src_c/stem_UTF_8_german.c \
+ src_c/stem_UTF_8_hungarian.c \
+ src_c/stem_UTF_8_italian.c \
+ src_c/stem_UTF_8_norwegian.c \
+ src_c/stem_UTF_8_porter.c \
+ src_c/stem_UTF_8_portuguese.c \
+ src_c/stem_UTF_8_romanian.c \
+ src_c/stem_UTF_8_russian.c \
+ src_c/stem_UTF_8_spanish.c \
+ src_c/stem_UTF_8_swedish.c \
+ src_c/stem_UTF_8_turkish.c \
+ runtime/api.c \
+ runtime/utilities.c \
+ libstemmer/libstemmer_utf8.c
+
+snowball_headers= \
+ src_c/stem_UTF_8_danish.h \
+ src_c/stem_UTF_8_dutch.h \
+ src_c/stem_UTF_8_english.h \
+ src_c/stem_UTF_8_finnish.h \
+ src_c/stem_UTF_8_french.h \
+ src_c/stem_UTF_8_german.h \
+ src_c/stem_UTF_8_hungarian.h \
+ src_c/stem_UTF_8_italian.h \
+ src_c/stem_UTF_8_norwegian.h \
+ src_c/stem_UTF_8_porter.h \
+ src_c/stem_UTF_8_portuguese.h \
+ src_c/stem_UTF_8_romanian.h \
+ src_c/stem_UTF_8_russian.h \
+ src_c/stem_UTF_8_spanish.h \
+ src_c/stem_UTF_8_swedish.h \
+ src_c/stem_UTF_8_turkish.h \
+ include/libstemmer.h \
+ libstemmer/modules_utf8.h \
+ runtime/api.h \
+ runtime/header.h
+
View
66 src/third_party/libstemmer_c/runtime/api.c
@@ -0,0 +1,66 @@
+
+#include <stdlib.h> /* for calloc, free */
+#include "header.h"
+
+/* Allocate a zero-initialised stemming environment.
+ *
+ * S_size - number of extra strings to create in env->S.
+ * I_size - number of ints to allocate in env->I.
+ * B_size - number of bytes to allocate in env->B.
+ *
+ * Returns the new environment, or NULL if any allocation fails; in that
+ * case whatever was built so far is handed to SN_close_env().
+ */
+extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
+{
+    struct SN_env * env = (struct SN_env *) calloc(1, sizeof(struct SN_env));
+    if (env == NULL) return NULL;
+
+    env->p = create_s();
+    if (env->p == NULL) goto fail;
+
+    if (S_size != 0)
+    {
+        int k = 0;
+        env->S = (symbol * *) calloc(S_size, sizeof(symbol *));
+        if (env->S == NULL) goto fail;
+
+        while (k < S_size)
+        {
+            env->S[k] = create_s();
+            if (env->S[k] == NULL) goto fail;
+            k++;
+        }
+    }
+
+    if (I_size != 0)
+    {
+        env->I = (int *) calloc(I_size, sizeof(int));
+        if (env->I == NULL) goto fail;
+    }
+
+    if (B_size != 0)
+    {
+        env->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
+        if (env->B == NULL) goto fail;
+    }
+
+    return env;
+
+fail:
+    SN_close_env(env, S_size);
+    return NULL;
+}
+
+/* Release an environment obtained from SN_create_env().
+ *
+ * z      - environment to free; NULL is accepted and ignored.
+ * S_size - number of entries in z->S (the value given to SN_create_env).
+ *
+ * This is also reached from SN_create_env's failure path, where z may be
+ * only partly built: z->S can still be NULL although S_size is non-zero.
+ * The string array is therefore walked only when it was actually allocated.
+ */
+extern void SN_close_env(struct SN_env * z, int S_size)
+{
+    if (z == NULL) return;
+    if (z->S != NULL)
+    {
+        int i;
+        for (i = 0; i < S_size; i++)
+        {
+            lose_s(z->S[i]);   /* lose_s ignores NULL slots */
+        }
+        free(z->S);
+    }
+    free(z->I);
+    free(z->B);
+    lose_s(z->p);              /* lose_s ignores NULL */
+    free(z);
+}
+
+/* Replace the environment's whole current text (positions 0 .. z->l) with
+ * the size symbols at s, then move the cursor back to 0.
+ * Returns whatever replace_s() returned. */
+extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
+{
+    const int status = replace_s(z, 0, z->l, size, s, NULL);
+    z->c = 0;
+    return status;
+}
+
View
26 src/third_party/libstemmer_c/runtime/api.h
@@ -0,0 +1,26 @@
+
+typedef unsigned char symbol;
+
+/* Or replace 'char' above with 'short' for 16 bit characters.
+
+ More precisely, replace 'char' with whatever type guarantees the
+ character width you need. Note however that sizeof(symbol) should divide
+ HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise
+ there is an alignment problem. In the unlikely event of a problem here,
+ consult Martin Porter.
+
+*/
+
+/* Working state shared by the runtime routines and the generated stemmers. */
+struct SN_env {
+    symbol * p;         /* text buffer holding the word being stemmed */
+    int c;              /* cursor; SN_set_current resets it to 0 */
+    int l;              /* limit: length of the text currently in p */
+    int lb;             /* presumably the backward limit -- TODO confirm */
+    int bra;            /* presumably the start of the current slice -- TODO confirm */
+    int ket;            /* presumably the end of the current slice -- TODO confirm */
+    symbol * * S;       /* extra strings (S_size of them), or NULL */
+    int * I;            /* extra integers (I_size of them), or NULL */
+    unsigned char * B;  /* extra bytes (B_size of them), or NULL */
+};
+
+extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
+extern void SN_close_env(struct SN_env * z, int S_size);
+
+extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
+
View
58 src/third_party/libstemmer_c/runtime/header.h
@@ -0,0 +1,58 @@
+
+#include <limits.h>
+
+#include "api.h"
+
+#define MAXINT INT_MAX
+#define MININT INT_MIN
+
+/* Size in bytes of the two-int header stored in front of every symbol
+ * buffer. Parenthesised so it stays one operand inside larger expressions
+ * (e.g. x / HEAD). */
+#define HEAD (2*sizeof(int))
+
+/* Accessors for that header, addressed backwards from the buffer pointer:
+ * slot [-1] holds the size and slot [-2] the capacity. Each expansion is
+ * fully parenthesised; SIZE and CAPACITY remain usable as lvalues. */
+#define SIZE(p)        (((int *)(p))[-1])
+#define SET_SIZE(p, n) (((int *)(p))[-1] = (n))
+#define CAPACITY(p)    (((int *)(p))[-2])
+
+/* One entry of a table searched by find_among() / find_among_b(). */
+struct among
+{
+    int s_size;                         /* number of chars in string */
+    const symbol * s;                   /* search string */
+    int substring_i;                    /* index to longest matching substring */
+    int result;                         /* result of the lookup */
+    int (* function)(struct SN_env *);  /* optional callback taking the environment */
+};
+
+extern symbol * create_s(void);
+extern void lose_s(symbol * p);
+
+extern int skip_utf8(const symbol * p, int c, int lb, int l, int n);
+
+extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+
+extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+
+extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
+extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
+extern int eq_v(struct SN_env * z, const symbol * p);
+extern int eq_v_b(struct SN_env * z, const symbol * p);
+
+extern int find_among(struct SN_env * z, const struct among * v, int v_size);
+extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
+
+extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
+extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
+extern int slice_from_v(struct SN_env * z, const symbol * p);
+extern int slice_del(struct SN_env * z);
+
+extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
+extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
+
+extern symbol * slice_to(struct SN_env * z, symbol * p);
+extern symbol * assign_to(struct SN_env * z, symbol * p);
+
+extern void debug(struct SN_env * z, int number, int line_count);
+
View
478 src/third_party/libstemmer_c/runtime/utilities.c
@@ -0,0 +1,478 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "header.h"
+
+/* `unless (C) stmt` executes stmt only when C is false. */
+#define unless(C) if(!(C))
+
+/* Capacity, in symbols, given to a string made by create_s(). */
+#define CREATE_SIZE 1
+
+/*
+ * Allocate a new, empty string.
+ *
+ * The allocation is HEAD bytes of bookkeeping followed by room for
+ * CREATE_SIZE + 1 symbols; the returned pointer addresses the first symbol,
+ * so it must be released with lose_s(), never with free() directly.
+ *
+ * The size is set to 0: no symbol has been written yet, so reporting a
+ * non-zero size would let eq_v() / replace_s() read uninitialised memory.
+ *
+ * Returns NULL if the allocation fails.
+ */
+extern symbol * create_s(void) {
+    symbol * p;
+    void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
+    if (mem == NULL) return NULL;
+    p = (symbol *) (HEAD + (char *) mem);
+    CAPACITY(p) = CREATE_SIZE;
+    SET_SIZE(p, 0);
+    return p;
+}
+
+/*
+ * Release a string obtained from create_s(). The block handed to free()
+ * starts HEAD bytes before p. A NULL p is ignored.
+ */
+extern void lose_s(symbol * p) {
+    if (p != NULL) {
+        char * block = (char *) p - HEAD;
+        free(block);
+    }
+}
+
+/*
+ new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
+ if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
+ position, or -1 on failure (the limit l, or lb when going backwards, is
+ reached before n characters have been skipped).
+
+ -- used to implement hop and next in the utf8 case.
+*/
+
+extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
+    int b;
+    if (n < 0) {
+        /* Backwards: step over trailing bytes until a lead byte is found. */
+        while (n < 0) {
+            if (c <= lb) return -1;
+            c--;
+            b = p[c];
+            if (b >= 0x80) {   /* 1000 0000: part of a multi-byte character */
+                while (c > lb) {
+                    b = p[c];
+                    if (b >= 0xC0) break;   /* 1100 0000: lead byte reached */
+                    c--;
+                }
+            }
+            n++;
+        }
+        return c;
+    }
+    /* Forwards: consume a lead byte, then every 10------ byte after it. */
+    while (n > 0) {
+        if (c >= l) return -1;
+        b = p[c];
+        c++;
+        if (b >= 0xC0) {   /* 1100 0000 */
+            while (c < l) {
+                b = p[c];
+                if (b < 0x80 || b >= 0xC0) break;
+                c++;
+            }
+        }
+        n--;
+    }
+    return c;
+}
+
+/* Code for character groupings: utf8 cases */
+
+/*
+ * Decode the UTF-8 character that starts at p[c], reading no further than l.
+ * The value is stored in *slot and the number of bytes used (1, 2 or 3) is
+ * returned; 0 is returned, and *slot left alone, when c is already at l.
+ * A sequence cut short by l is decoded from the bytes that are available.
+ */
+static int get_utf8(const symbol * p, int c, int l, int * slot) {
+    int lead, second;
+    if (c >= l) return 0;
+    lead = p[c];
+    c++;
+    if (lead < 0xC0 || c == l) {   /* 1100 0000 */
+        *slot = lead;
+        return 1;
+    }
+    second = p[c];
+    c++;
+    if (lead < 0xE0 || c == l) {   /* 1110 0000 */
+        *slot = ((lead & 0x1F) << 6) | (second & 0x3F);
+        return 2;
+    }
+    *slot = ((lead & 0xF) << 12) | ((second & 0x3F) << 6) | (p[c] & 0x3F);
+    return 3;
+}
+
+/*
+ * Decode the UTF-8 character that ends just before p[c], reading no further
+ * back than lb. The value is stored in *slot and the number of bytes used
+ * (1, 2 or 3) is returned; 0 is returned, and *slot left alone, when c is
+ * already at lb.
+ */
+static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
+    int b0, b1;
+    if (c <= lb) return 0;
+    b0 = p[--c];
+    if (b0 < 0x80 || c == lb) {   /* 1000 0000 */
+        *slot = b0;
+        return 1;
+    }
+    b1 = p[--c];
+    if (b1 >= 0xC0 || c == lb) {   /* 1100 0000: b1 is the lead byte */
+        *slot = (b1 & 0x1F) << 6 | (b0 & 0x3F);
+        return 2;
+    }
+    /* b1 is a continuation byte, so the lead byte sits one position further
+       back. c still indexes b1 here; reading p[c] would use b1 twice. */
+    *slot = (p[c - 1] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F);
+    return 3;
+}
+
+/*
+ * Advance the cursor over UTF-8 characters that belong to the grouping
+ * described by the bitmap s (bit ch - min set for each member, min..max).
+ * Returns -1 when no character is left before z->l, the byte width of the
+ * first character that is not in the grouping (cursor left on it), or 0
+ * after consuming one member when repeat is 0.
+ */
+extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        int bit;
+        int width = get_utf8(z->p, z->c, z->l, &ch);
+        if (width == 0) return -1;
+        if (ch > max || ch < min) return width;
+        bit = ch - min;
+        if ((s[bit >> 3] & (0X1 << (bit & 0X7))) == 0) return width;
+        z->c += width;
+        if (!repeat) return 0;
+    }
+}
+
+/*
+ * Backward form of in_grouping_U: move the cursor back over UTF-8
+ * characters that belong to the grouping in bitmap s (min..max).
+ * Returns -1 when no character is left after z->lb, the byte width of the
+ * first character that is not in the grouping (cursor unchanged for it), or
+ * 0 after consuming one member when repeat is 0.
+ */
+extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        int bit;
+        int width = get_b_utf8(z->p, z->c, z->lb, &ch);
+        if (width == 0) return -1;
+        if (ch > max || ch < min) return width;
+        bit = ch - min;
+        if ((s[bit >> 3] & (0X1 << (bit & 0X7))) == 0) return width;
+        z->c -= width;
+        if (!repeat) return 0;
+    }
+}
+
+/*
+ * Advance the cursor over UTF-8 characters that are NOT in the grouping
+ * described by bitmap s (min..max).
+ * Returns -1 when no character is left before z->l, the byte width of the
+ * first character that is in the grouping (cursor left on it), or 0 after
+ * consuming one non-member when repeat is 0.
+ */
+extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        int width = get_utf8(z->p, z->c, z->l, &ch);
+        if (width == 0) return -1;
+        if (ch <= max && ch >= min) {
+            int bit = ch - min;
+            if ((s[bit >> 3] & (0X1 << (bit & 0X7))) != 0) return width;
+        }
+        z->c += width;
+        if (!repeat) return 0;
+    }
+}
+
+/*
+ * Backward form of out_grouping_U: move the cursor back over UTF-8
+ * characters that are NOT in the grouping in bitmap s (min..max).
+ * Returns -1 when no character is left after z->lb, the byte width of the
+ * first character that is in the grouping (cursor unchanged for it), or 0
+ * after consuming one non-member when repeat is 0.
+ */
+extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        int width = get_b_utf8(z->p, z->c, z->lb, &ch);
+        if (width == 0) return -1;
+        if (ch <= max && ch >= min) {
+            int bit = ch - min;
+            if ((s[bit >> 3] & (0X1 << (bit & 0X7))) != 0) return width;
+        }
+        z->c -= width;
+        if (!repeat) return 0;
+    }
+}
+
+/* Code for character groupings: non-utf8 cases */
+
+/*
+ * Advance the cursor over symbols that belong to the grouping described by
+ * bitmap s (bit ch - min set for each member, min..max).
+ * Returns -1 at the limit z->l, 1 when the symbol at the cursor is not in
+ * the grouping (cursor left on it), or 0 after consuming one member when
+ * repeat is 0.
+ */
+extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        int bit;
+        if (z->c >= z->l) return -1;
+        ch = z->p[z->c];
+        if (ch > max || ch < min) return 1;
+        bit = ch - min;
+        if ((s[bit >> 3] & (0X1 << (bit & 0X7))) == 0) return 1;
+        z->c++;
+        if (!repeat) return 0;
+    }
+}
+
+/*
+ * Backward form of in_grouping: move the cursor back over symbols that
+ * belong to the grouping in bitmap s (min..max).
+ * Returns -1 at the limit z->lb, 1 when the symbol before the cursor is not
+ * in the grouping (cursor unchanged), or 0 after consuming one member when
+ * repeat is 0.
+ */
+extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        int bit;
+        if (z->c <= z->lb) return -1;
+        ch = z->p[z->c - 1];
+        if (ch > max || ch < min) return 1;
+        bit = ch - min;
+        if ((s[bit >> 3] & (0X1 << (bit & 0X7))) == 0) return 1;
+        z->c--;
+        if (!repeat) return 0;
+    }
+}
+
+/*
+ * Advance the cursor over symbols that are NOT in the grouping described by
+ * bitmap s (min..max).
+ * Returns -1 at the limit z->l, 1 when the symbol at the cursor is in the
+ * grouping (cursor left on it), or 0 after consuming one non-member when
+ * repeat is 0.
+ */
+extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        if (z->c >= z->l) return -1;
+        ch = z->p[z->c];
+        if (ch <= max && ch >= min) {
+            int bit = ch - min;
+            if ((s[bit >> 3] & (0X1 << (bit & 0X7))) != 0) return 1;
+        }
+        z->c++;
+        if (!repeat) return 0;
+    }
+}
+
+/*
+ * Backward form of out_grouping: move the cursor back over symbols that are
+ * NOT in the grouping in bitmap s (min..max).
+ * Returns -1 at the limit z->lb, 1 when the symbol before the cursor is in
+ * the grouping (cursor unchanged), or 0 after consuming one non-member when
+ * repeat is 0.
+ */
+extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        if (z->c <= z->lb) return -1;
+        ch = z->p[z->c - 1];
+        if (ch <= max && ch >= min) {
+            int bit = ch - min;
+            if ((s[bit >> 3] & (0X1 << (bit & 0X7))) != 0) return 1;
+        }
+        z->c--;
+        if (!repeat) return 0;
+    }
+}
+
+/*
+ * Test whether the s_size symbols at s occur at the cursor, within z->l.
+ * On a match the cursor moves past them and 1 is returned; otherwise the
+ * cursor is untouched and 0 is returned.
+ */
+extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
+    const int available = z->l - z->c;
+    if (available < s_size) return 0;
+    if (memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
+    z->c += s_size;
+    return 1;
+}
+
+/*
+ * Backward form of eq_s: test whether the s_size symbols at s end at the
+ * cursor, without going before z->lb. On a match the cursor moves back over
+ * them and 1 is returned; otherwise the cursor is untouched and 0 is returned.
+ */
+extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
+    const int available = z->c - z->lb;
+    if (available < s_size) return 0;
+    if (memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
+    z->c -= s_size;
+    return 1;
+}
+
+/* eq_s() for a sized string p: its length is taken from SIZE(p). */
+extern int eq_v(struct SN_env * z, const symbol * p) {
+    const int p_size = SIZE(p);
+    return eq_s(z, p_size, p);
+}
+
+/* eq_s_b() for a sized string p: its length is taken from SIZE(p). */
+extern int eq_v_b(struct SN_env * z, const symbol * p) {
+    const int p_size = SIZE(p);
+    return eq_s_b(z, p_size, p);
+}
+/* Search the among table v (v_size entries) for an entry whose key s matches
+ * the symbols that start at the cursor z->c, never reading at or past z->l.
+ *
+ * A bisection over v selects a candidate entry; the bisection only makes
+ * sense if v is sorted by key (NOTE(review): confirm with the generator).
+ * Candidates are then tried from that entry along the substring_i links
+ * until one has its whole key matched. For such an entry z->c is set just
+ * past the key and, if the entry has a function, the function must also
+ * return non-zero for the entry to be accepted.
+ *
+ * Returns the result field of the accepted entry, or 0 if none is accepted.
+ */
+extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
+
+ int i = 0;
+ int j = v_size;
+
+ int c = z->c; int l = z->l;
+ symbol * q = z->p + c;
+
+ const struct among * w;
+
+ int common_i = 0; /* symbols already known to match for entry i */
+ int common_j = 0; /* symbols already known to match for entry j */
+
+ int first_key_inspected = 0;
+
+ while(1) {
+ int k = i + ((j - i) >> 1); /* midpoint of the open range [i, j) */
+ int diff = 0;
+ int common = common_i < common_j ? common_i : common_j; /* smaller */
+ w = v + k;
+ {
+ int i2; for (i2 = common; i2 < w->s_size; i2++) {
+ if (c + common == l) { diff = -1; break; } /* input exhausted: treat as sorting before the key */
+ diff = q[common] - w->s[i2];
+ if (diff != 0) break;
+ common++;
+ }
+ }
+ if (diff < 0) { j = k; common_j = common; }
+ else { i = k; common_i = common; }
+ if (j - i <= 1) {
+ if (i > 0) break; /* v->s has been inspected */
+ if (j == i) break; /* only one item in v */
+
+ /* - but now we need to go round once more to get
+ v->s inspected. This looks messy, but is actually
+ the optimal approach. */
+
+ if (first_key_inspected) break;
+ first_key_inspected = 1;
+ }
+ }
+ while(1) {
+ w = v + i;
+ if (common_i >= w->s_size) { /* the whole key of entry i matched */
+ z->c = c + w->s_size;
+ if (w->function == 0) return w->result;
+ {
+ int res = w->function(z);
+ z->c = c + w->s_size; /* put the cursor back after the key, whatever the function did to it */
+ if (res) return w->result;
+ }
+ }
+ i = w->substring_i; /* next candidate; presumably a shorter key contained in this one - TODO confirm */
+ if (i < 0) return 0;
+ }
+}
+
+/* find_among_b is for backwards processing. Same comments apply.
+ *
+ * Keys are compared from their last symbol towards their first, against the
+ * symbols that end at the cursor z->c, never reading before z->lb. For an
+ * entry whose whole key matched, z->c is set to the start of the key and,
+ * if the entry has a function, the function must also return non-zero.
+ *
+ * Returns the result field of the accepted entry, or 0 if none is accepted.
+ */
+
+extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
+
+ int i = 0;
+ int j = v_size;
+
+ int c = z->c; int lb = z->lb;
+ symbol * q = z->p + c - 1; /* last symbol before the cursor */
+
+ const struct among * w;
+
+ int common_i = 0; /* symbols already known to match for entry i */
+ int common_j = 0; /* symbols already known to match for entry j */
+
+ int first_key_inspected = 0;
+
+ while(1) {
+ int k = i + ((j - i) >> 1); /* midpoint of the open range [i, j) */
+ int diff = 0;
+ int common = common_i < common_j ? common_i : common_j;
+ w = v + k;
+ {
+ int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
+ if (c - common == lb) { diff = -1; break; } /* input exhausted: treat as sorting before the key */
+ diff = q[- common] - w->s[i2];
+ if (diff != 0) break;
+ common++;
+ }
+ }
+ if (diff < 0) { j = k; common_j = common; }
+ else { i = k; common_i = common; }
+ if (j - i <= 1) {
+ if (i > 0) break; /* v->s has been inspected */
+ if (j == i) break; /* only one item in v */
+ if (first_key_inspected) break; /* the extra pass for v->s is done */
+ first_key_inspected = 1;
+ }
+ }
+ while(1) {
+ w = v + i;
+ if (common_i >= w->s_size) { /* the whole key of entry i matched */
+ z->c = c - w->s_size;
+ if (w->function == 0) return w->result;
+ {
+ int res = w->function(z);
+ z->c = c - w->s_size; /* put the cursor back before the key, whatever the function did to it */
+ if (res) return w->result;
+ }
+ }
+ i = w->substring_i; /* next candidate; presumably a shorter key contained in this one - TODO confirm */
+ if (i < 0) return 0;
+ }
+}
+
+
+/* Grow the buffer behind p so that it can hold at least n symbols (20 spare
+ * symbols are added on top of n). Returns the possibly relocated string.
+ * If memory runs out, the old buffer is freed and NULL is returned.
+ */
+static symbol * increase_size(symbol * p, int n) {
+    const int capacity = n + 20;
+    symbol * grown;
+    void * block = realloc((char *) p - HEAD,
+                           HEAD + (capacity + 1) * sizeof(symbol));
+    if (block == NULL) {
+        /* realloc left the old block allocated; release it ourselves. */
+        lose_s(p);
+        return NULL;
+    }
+    grown = (symbol *) ((char *) block + HEAD);
+    CAPACITY(grown) = capacity;
+    return grown;
+}
+
+/* Replace the symbols of z->p between c_bra and c_ket by the s_size symbols
+   at s, creating z->p first if it is NULL.
+   z->l and z->c are adjusted for the change in length: a cursor at or after
+   c_ket moves with the tail, a cursor inside the replaced span goes to c_bra.
+   If adjptr is not NULL, the change in length is stored in *adjptr.
+   Returns 0 on success, -1 on error; on error z->p has been freed and is NULL.
+*/
+extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr)
+{
+    int delta;
+    int old_len;
+
+    if (z->p == NULL) {
+        z->p = create_s();
+        if (z->p == NULL) return -1;
+    }
+
+    delta = s_size - (c_ket - c_bra);
+    old_len = SIZE(z->p);
+
+    if (delta != 0) {
+        const int new_len = delta + old_len;
+        if (new_len > CAPACITY(z->p)) {
+            z->p = increase_size(z->p, new_len);
+            if (z->p == NULL) return -1;
+        }
+        /* Slide the tail that follows c_ket to its new position. */
+        memmove(z->p + c_ket + delta,
+                z->p + c_ket,
+                (old_len - c_ket) * sizeof(symbol));
+        SET_SIZE(z->p, new_len);
+        z->l += delta;
+        if (z->c >= c_ket) {
+            z->c += delta;
+        } else if (z->c > c_bra) {
+            z->c = c_bra;
+        }
+    }
+
+    if (s_size != 0) {
+        memmove(z->p + c_bra, s, s_size * sizeof(symbol));
+    }
+    if (adjptr != NULL) {
+        *adjptr = delta;
+    }
+    return 0;
+}
+
+/*
+ * Validate the slice markers of z: 0 <= bra <= ket <= l, z->p present, and
+ * l within SIZE(z->p). Returns 0 when the slice is usable, -1 otherwise.
+ */
+static int slice_check(struct SN_env * z) {
+    const int usable = z->bra >= 0 &&
+                       z->bra <= z->ket &&
+                       z->ket <= z->l &&
+                       z->p != NULL &&
+                       z->l <= SIZE(z->p);   /* this test could be removed */
+    if (usable) return 0;
+#if 0
+    fprintf(stderr, "faulty slice operation:\n");
+    debug(z, -1, 0);
+#endif
+    return -1;
+}
+
+/*
+ * Replace the current slice z->bra .. z->ket by the s_size symbols at s.
+ * Returns 0 on success, -1 if the slice is invalid or replace_s() fails.
+ */
+extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
+    if (slice_check(z) != 0) {
+        return -1;
+    }
+    return replace_s(z, z->bra, z->ket, s_size, s, NULL);
+}
+
+/* slice_from_s() for a sized string p: its length is taken from SIZE(p). */
+extern int slice_from_v(struct SN_env * z, const symbol * p) {
+    const int p_size = SIZE(p);
+    return slice_from_s(z, p_size, p);
+}
+
+/* Delete the current slice by replacing it with an empty string. */
+extern int slice_del(struct SN_env * z) {
+    return slice_from_s(z, 0, NULL);
+}
+
+/*
+ * Replace the symbols between bra and ket by the s_size symbols at s, then
+ * shift z->bra and z->ket by the change in length when they lie at or after
+ * bra. Returns 0 on success, -1 if replace_s() fails.
+ */
+extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
+    int delta;
+    if (replace_s(z, bra, ket, s_size, s, &delta) != 0) {
+        return -1;
+    }
+    if (z->bra >= bra) z->bra += delta;
+    if (z->ket >= bra) z->ket += delta;
+    return 0;
+}
+
+/* insert_s() for a sized string p: its length is taken from SIZE(p). */
+extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
+    const int p_size = SIZE(p);
+    return insert_s(z, bra, ket, p_size, p);
+}
+
+/*
+ * Copy the current slice z->bra .. z->ket into the string p, growing p when
+ * its capacity is too small. Returns the possibly relocated p. On an invalid
+ * slice or allocation failure p is freed and NULL is returned.
+ */
+extern symbol * slice_to(struct SN_env * z, symbol * p) {
+    int count;
+    if (slice_check(z) != 0) {
+        lose_s(p);
+        return NULL;
+    }
+    count = z->ket - z->bra;
+    if (CAPACITY(p) < count) {
+        p = increase_size(p, count);
+        if (p == NULL) return NULL;
+    }
+    memmove(p, z->p + z->bra, count * sizeof(symbol));
+    SET_SIZE(p, count);
+    return p;
+}
+
+/*
+ * Copy the first z->l symbols of z->p into the string p, growing p when its
+ * capacity is too small. Returns the possibly relocated p, or NULL (with p
+ * freed) if the buffer cannot be grown.
+ */
+extern symbol * assign_to(struct SN_env * z, symbol * p) {
+    const int count = z->l;
+    if (CAPACITY(p) < count) {
+        p = increase_size(p, count);
+        if (p == NULL) return NULL;
+    }
+    memmove(p, z->p, count * sizeof(symbol));
+    SET_SIZE(p, count);
+    return p;
+}
+
+#if 0
+/*
+ * Print z->p with markers at the positions of lb '{', bra '[', c '|',
+ * ket ']' and l '}'; zero symbols are shown as '#'. When number >= 0 the
+ * output is prefixed by number, line_count and the size of z->p.
+ * Compiled out; kept as a debugging aid.
+ */
+extern void debug(struct SN_env * z, int number, int line_count) {
+    int pos;
+    const int size = SIZE(z->p);
+    if (number >= 0) {
+        printf("%3d (line %4d): [%d]'", number, line_count, size);
+    }
+    for (pos = 0; pos <= size; pos++) {
+        if (pos == z->lb) printf("{");
+        if (pos == z->bra) printf("[");
+        if (pos == z->c) printf("|");
+        if (pos == z->ket) printf("]");
+        if (pos == z->l) printf("}");
+        if (pos < size) {
+            int ch = z->p[pos];
+            printf("%c", ch == 0 ? '#' : ch);
+        }
+    }
+    printf("'\n");
+}
+#endif
View
337 src/third_party/libstemmer_c/src_c/stem_ISO_8859_1_danish.c
@@ -0,0 +1,337 @@
+
+/* This file was generated automatically by the Snowball to ANSI C compiler */
+
+#include "../runtime/header.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern int danish_ISO_8859_1_stem(struct SN_env * z); /* exported with C linkage; presumably the stemmer entry point - definition not in view */
+#ifdef __cplusplus
+}
+#endif
+static int r_undouble(struct SN_env * z); /* file-local routines, declared ahead of their definitions */
+static int r_other_suffix(struct SN_env * z);
+static int r_consonant_pair(struct SN_env * z);
+static int r_main_suffix(struct SN_env * z);
+static int r_mark_regions(struct SN_env * z);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Exported with C linkage: environment set-up and tear-down (definitions not in view). */
+extern struct SN_env * danish_ISO_8859_1_create_env(void);
+extern void danish_ISO_8859_1_close_env(struct SN_env * z);
+
+
+#ifdef __cplusplus
+}
+#endif
+static const symbol s_0_0[3] = { 'h', 'e', 'd' }; /* s_0_*: key strings of a_0; not NUL-terminated, the lengths are repeated in the table */
+static const symbol s_0_1[5] = { 'e', 't', 'h', 'e', 'd' };
+static const symbol s_0_2[4] = { 'e', 'r', 'e', 'd' };
+static const symbol s_0_3[1] = { 'e' };
+static const symbol s_0_4[5] = { 'e', 'r', 'e', 'd', 'e' };
+static const symbol s_0_5[4] = { 'e', 'n', 'd', 'e' };
+static const symbol s_0_6[6] = { 'e', 'r', 'e', 'n', 'd', 'e' };
+static const symbol s_0_7[3] = { 'e', 'n', 'e' };
+static const symbol s_0_8[4] = { 'e', 'r', 'n', 'e' };
+static const symbol s_0_9[3] = { 'e', 'r', 'e' };
+static const symbol s_0_10[2] = { 'e', 'n' };
+static const symbol s_0_11[5] = { 'h', 'e', 'd', 'e', 'n' };
+static const symbol s_0_12[4] = { 'e', 'r', 'e', 'n' };
+static const symbol s_0_13[2] = { 'e', 'r' };
+static const symbol s_0_14[5] = { 'h', 'e', 'd', 'e', 'r' };
+static const symbol s_0_15[4] = { 'e', 'r', 'e', 'r' };
+static const symbol s_0_16[1] = { 's' };
+static const symbol s_0_17[4] = { 'h', 'e', 'd', 's' };
+static const symbol s_0_18[2] = { 'e', 's' };
+static const symbol s_0_19[5] = { 'e', 'n', 'd', 'e', 's' };
+static const symbol s_0_20[7] = { 'e', 'r', 'e', 'n', 'd', 'e', 's' };
+static const symbol s_0_21[4] = { 'e', 'n', 'e', 's' };
+static const symbol s_0_22[5] = { 'e', 'r', 'n', 'e', 's' };
+static const symbol s_0_23[4] = { 'e', 'r', 'e', 's' };
+static const symbol s_0_24[3] = { 'e', 'n', 's' };
+static const symbol s_0_25[6] = { 'h', 'e', 'd', 'e', 'n', 's' };
+static const symbol s_0_26[5] = { 'e', 'r', 'e', 'n', 's' };
+static const symbol s_0_27[3] = { 'e', 'r', 's' };
+static const symbol s_0_28[3] = { 'e', 't', 's' };
+static const symbol s_0_29[5] = { 'e', 'r', 'e', 't', 's' };
+static const symbol s_0_30[2] = { 'e', 't' };
+static const symbol s_0_31[4] = { 'e', 'r', 'e', 't' };
+/* a_0: among table over the s_0_* keys. Each row is presumably
+ * { s_size, s, substring_i, result, function } - TODO confirm the field
+ * order against struct among in header.h. Every row has a null function;
+ * row 16 ("s") has result 2, all other rows have result 1.
+ */
+static const struct among a_0[32] =
+{
+/* 0 */ { 3, s_0_0, -1, 1, 0},
+/* 1 */ { 5, s_0_1, 0, 1, 0},
+/* 2 */ { 4, s_0_2, -1, 1, 0},
+/* 3 */ { 1, s_0_3, -1, 1, 0},
+/* 4 */ { 5, s_0_4, 3, 1, 0},
+/* 5 */ { 4, s_0_5, 3, 1, 0},
+/* 6 */ { 6, s_0_6, 5, 1, 0},
+/* 7 */ { 3, s_0_7, 3, 1, 0},
+/* 8 */ { 4, s_0_8, 3, 1, 0},
+/* 9 */ { 3, s_0_9, 3, 1, 0},
+/* 10 */ { 2, s_0_10, -1, 1, 0},
+/* 11 */ { 5, s_0_11, 10, 1, 0},
+/* 12 */ { 4, s_0_12, 10, 1, 0},
+/* 13 */ { 2, s_0_13, -1, 1, 0},
+/* 14 */ { 5, s_0_14, 13, 1, 0},
+/* 15 */ { 4, s_0_15, 13, 1, 0},
+/* 16 */ { 1, s_0_16, -1, 2, 0},
+/* 17 */ { 4, s_0_17, 16, 1, 0},
+/* 18 */ { 2, s_0_18, 16, 1, 0},
+/* 19 */ { 5, s_0_19, 18, 1, 0},
+/* 20 */ { 7, s_0_20, 19, 1, 0},
+/* 21 */ { 4, s_0_21, 18, 1, 0},
+/* 22 */ { 5, s_0_22, 18, 1, 0},
+/* 23 */ { 4, s_0_23, 18, 1, 0},
+/* 24 */ { 3, s_0_24, 16, 1, 0},
+/* 25 */ { 6, s_0_25, 24, 1, 0},
+/* 26 */ { 5, s_0_26, 24, 1, 0},
+/* 27 */ { 3, s_0_27, 16, 1, 0},
+/* 28 */ { 3, s_0_28, 16, 1, 0},
+/* 29 */ { 5, s_0_29, 28, 1, 0},
+/* 30 */ { 2, s_0_30, -1, 1, 0},
+/* 31 */ { 4, s_0_31, 30, 1, 0}
+};
+/* a_1: among table of four two-letter keys; every row has result -1 and a null function. */
+static const symbol s_1_0[2] = { 'g', 'd' };
+static const symbol s_1_1[2] = { 'd', 't' };
+static const symbol s_1_2[2] = { 'g', 't' };
+static const symbol s_1_3[2] = { 'k', 't' };
+
+static const struct among a_1[4] =
+{
+/* 0 */ { 2, s_1_0, -1, -1, 0},
+/* 1 */ { 2, s_1_1, -1, -1, 0},
+/* 2 */ { 2, s_1_2, -1, -1, 0},
+/* 3 */ { 2, s_1_3, -1, -1, 0}
+};
+/* a_2: among table of five keys; rows 0-3 have result 1, row 4 has result 2; all functions are null. */
+static const symbol s_2_0[2] = { 'i', 'g' };
+static const symbol s_2_1[3] = { 'l', 'i', 'g' };
+static const symbol s_2_2[4] = { 'e', 'l', 'i', 'g' };
+static const symbol s_2_3[3] = { 'e', 'l', 's' };
+static const symbol s_2_4[4] = { 'l', 0xF8, 's', 't' }; /* 0xF8 is o-with-stroke in ISO-8859-1 */
+
+static const struct among a_2[5] =
+{
+/* 0 */ { 2, s_2_0, -1, 1, 0},
+/* 1 */ { 3, s_2_1, 0, 1, 0},
+/* 2 */ { 4, s_2_2, 1, 1, 0},
+/* 3 */ { 3, s_2_3, -1, 1, 0},
+/* 4 */ { 4, s_2_4, -1, 2, 0}
+};
+/* Byte arrays that look like grouping bitmaps - presumably passed as s to in_grouping / out_grouping with matching min and max; the callers are outside this view, TODO confirm. */
+static const unsigned char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 };
+
+static const unsigned char g_s_ending[] = { 239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 };
+
+static const symbol s_0[] = { 's', 't' }; /* s_0..s_2: literal symbol strings without NUL terminator; their users are outside this view */
+static const symbol s_1[] = { 'i', 'g' };
+static const symbol s_2[] = { 'l', 0xF8, 's' }; /* 0xF8 is o-with-stroke in ISO-8859-1 */
+
+static int r_mark_regions(struct SN_env * z) {