Fixes GH-85: Encoded sequences can clash with separator byte and caus…

…e assertion errors
morfologik · Nov 7, 2016 · efc065b · efc065b
1 parent a321569
commit efc065b
Show file tree

Hide file tree

Showing 9 changed files with 145 additions and 9 deletions.
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java
@@ -72,7 +72,8 @@ public WordData next() {
     /*
      * Find the next separator byte's position splitting word form and tag.
      */
-    sepPos = 0;
+    assert sequenceEncoder.prefixBytes() <= bbSize : sequenceEncoder.getClass() + " >? " + bbSize;
+    sepPos = sequenceEncoder.prefixBytes();
     for (; sepPos < bbSize; sepPos++) {
       if (ba[sepPos] == separator)
         break;

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java
@@ -129,6 +129,7 @@ public DictionaryLookup(Dictionary dictionary)
   @Override
   public List<WordData> lookup(CharSequence word) {
     final byte separator = dictionaryMetadata.getSeparator();
+    final int prefixBytes = sequenceEncoder.prefixBytes();
 
     if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
       word = applyReplacements(word, dictionaryMetadata.getInputConversionPairs());
@@ -204,8 +205,9 @@ public List<WordData> lookup(CharSequence word) {
            * Find the separator byte's position splitting the inflection instructions
            * from the tag.
            */
+          assert prefixBytes <= bbSize : sequenceEncoder.getClass() + " >? " + bbSize;
           int sepPos;
-          for (sepPos = 0; sepPos < bbSize; sepPos++) {
+          for (sepPos = prefixBytes; sepPos < bbSize; sepPos++) {
             if (ba[sepPos] == separator) {
               break;
             }

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/ISequenceEncoder.java b/morfologik-stemming/src/main/java/morfologik/stemming/ISequenceEncoder.java
@@ -34,4 +34,14 @@ public interface ISequenceEncoder {
    * @return Returns the {@link ByteBuffer} with decoded <code>target</code>.
    */
   public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded);
+
+  /**
+   * The number of encoded form's prefix bytes that should be ignored (needed for separator lookup). 
+   * An ugly workaround for GH-85, should be fixed by prior knowledge of whether the dictionary contains tags;
+   * then we can scan for separator right-to-left.
+   * 
+   * @see "https://github.com/morfologik/morfologik-stemming/issues/85"
+   */
+  @Deprecated
+  public int prefixBytes();
 }
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/NoEncoder.java b/morfologik-stemming/src/main/java/morfologik/stemming/NoEncoder.java
@@ -30,6 +30,11 @@ public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded
     return reuse;
   }
 
+  @Override
+  public int prefixBytes() {
+    return 0;
+  }
+
   @Override
   public String toString() {
     return getClass().getSimpleName();

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/TrimInfixAndSuffixEncoder.java b/morfologik-stemming/src/main/java/morfologik/stemming/TrimInfixAndSuffixEncoder.java
@@ -109,6 +109,11 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
     return reuse;
   }
 
+  @Override
+  public int prefixBytes() {
+    return 3;
+  }
+
   public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) {
     assert encoded.remaining() >= 3;
 

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/TrimPrefixAndSuffixEncoder.java b/morfologik-stemming/src/main/java/morfologik/stemming/TrimPrefixAndSuffixEncoder.java
@@ -73,6 +73,11 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
 
     return reuse;
   }
+
+  @Override
+  public int prefixBytes() {
+    return 2;
+  }
 
   public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) {
     assert encoded.remaining() >= 2;

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/TrimSuffixEncoder.java b/morfologik-stemming/src/main/java/morfologik/stemming/TrimSuffixEncoder.java
@@ -55,7 +55,12 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
 
     return reuse;
   }
-
+
+  @Override
+  public int prefixBytes() {
+    return 1;
+  }
+
   public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) {
     assert encoded.remaining() >= 1;
 
@@ -84,7 +89,7 @@ public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded
 
     return reuse;
   }
-
+  
   @Override
   public String toString() {
     return getClass().getSimpleName();

diff --git a/morfologik-tools/src/test/java/morfologik/tools/DictCompileBug.java b/morfologik-tools/src/test/java/morfologik/tools/DictCompileBug.java
@@ -0,0 +1,63 @@
+package morfologik.tools;
+
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.DictionaryMetadata;
+import morfologik.stemming.EncoderType;
+import morfologik.stemming.WordData;
+
+public class DictCompileBug extends RandomizedTest {
+  @Test
+  public void testSeparatorInEncoded() throws Exception {
+    final Path input = newTempDir().toPath().resolve("dictionary.input");
+    final Path metadata = DictionaryMetadata.getExpectedMetadataLocation(input);
+
+    char separator = '_';
+    try (Writer writer = Files.newBufferedWriter(metadata, StandardCharsets.UTF_8)) {
+      DictionaryMetadata.builder()
+          .separator(separator)
+          .encoder(EncoderType.SUFFIX)
+          .encoding(StandardCharsets.UTF_8)
+          .build()
+          .write(writer);
+    }
+
+    Set<String> sequences = new LinkedHashSet<>();
+    for (int seqs = randomIntBetween(0, 100); --seqs >= 0;) {
+      sequences.add("anfragen_anfragen|VER:1:PLU:KJ1:SFT:NEB");
+      sequences.add("Anfragen_anfragen|VER:1:PLU:KJ1:SFT:NEB");
+    }
+
+    try (Writer writer = Files.newBufferedWriter(input, StandardCharsets.UTF_8)) {
+      for (String in : sequences) {
+        writer.write(in);
+        writer.write('\n');
+      }
+    }
+
+    Assertions.assertThat(new DictCompile(input, false, true, false, false, false).call())
+      .isEqualTo(ExitStatus.SUCCESS);
+
+    Path dict = input.resolveSibling("dictionary.dict");
+    Assertions.assertThat(dict).isRegularFile();
+
+    // Verify the dictionary is valid.
+
+    DictionaryLookup dictionaryLookup = new DictionaryLookup(Dictionary.read(dict));
+    for (WordData wd : dictionaryLookup) {
+      System.out.println(wd);
+    }
+  }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/DictCompileTest.java b/morfologik-tools/src/test/java/morfologik/tools/DictCompileTest.java
@@ -19,15 +19,21 @@
 
 import com.carrotsearch.randomizedtesting.RandomizedTest;
 import com.carrotsearch.randomizedtesting.annotations.Repeat;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 
 public class DictCompileTest extends RandomizedTest {
   @Test
-  @Repeat(iterations = 50)
+  @Repeat(iterations = 200)
   public void testRoundTrip() throws Exception {
     final Path input = newTempDir().toPath().resolve("dictionary.input");
     final Path metadata = DictionaryMetadata.getExpectedMetadataLocation(input);
 
-    char separator = '|';
+    char separator = RandomPicks.randomFrom(getRandom(), new Character [] {
+        '|',
+        ',',
+        '\t',
+    });
+
     try (Writer writer = Files.newBufferedWriter(metadata, StandardCharsets.UTF_8)) {
       DictionaryMetadata.builder()
           .separator(separator)
@@ -41,10 +47,44 @@ public void testRoundTrip() throws Exception {
 
     Set<String> sequences = new LinkedHashSet<>();
     for (int seqs = randomIntBetween(0, 100); --seqs >= 0;) {
+      String base;
+      switch (randomIntBetween(0, 5)) {
+        case 0:
+          base = randomAsciiOfLength(('A' - separator) & 0xff);
+          break;
+
+        default:
+          base = randomAsciiOfLengthBetween(1, 100);
+          break;
+      }
+
+      String inflected;
+      switch (randomIntBetween(0, 5)) {
+        case 0:
+          inflected = base;
+          break;
+
+        case 1:
+          inflected = randomAsciiOfLengthBetween(0, 5) + base;
+          break;
+
+        case 3: 
+          inflected = base + randomAsciiOfLengthBetween(0, 5);
+          break;
+
+        case 4:
+          inflected = randomAsciiOfLengthBetween(0, 5) + base + randomAsciiOfLengthBetween(0, 5);
+          break;
+
+        default:
+          inflected = randomAsciiOfLengthBetween(0, 200);
+            break;
+      }
+
+
       sequences.add(
-          randomAsciiOfLengthBetween(1, 10) + 
-          separator + 
-          randomAsciiOfLengthBetween(0, 10) +
+          base + separator + 
+          inflected +
           (useTags ? (separator + randomAsciiOfLengthBetween(0, 10)) : ""));
     }