Skip to content

Commit

Permalink
Fixes GH-85: Encoded sequences can clash with separator byte and caus…
Browse files Browse the repository at this point in the history
…e assertion errors
  • Loading branch information
dweiss committed Nov 7, 2016
1 parent a321569 commit efc065b
Show file tree
Hide file tree
Showing 9 changed files with 145 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ public WordData next() {
/*
* Find the next separator byte's position splitting word form and tag.
*/
sepPos = 0;
assert sequenceEncoder.prefixBytes() <= bbSize : sequenceEncoder.getClass() + " >? " + bbSize;
sepPos = sequenceEncoder.prefixBytes();
for (; sepPos < bbSize; sepPos++) {
if (ba[sepPos] == separator)
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ public DictionaryLookup(Dictionary dictionary)
@Override
public List<WordData> lookup(CharSequence word) {
final byte separator = dictionaryMetadata.getSeparator();
final int prefixBytes = sequenceEncoder.prefixBytes();

if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
word = applyReplacements(word, dictionaryMetadata.getInputConversionPairs());
Expand Down Expand Up @@ -204,8 +205,9 @@ public List<WordData> lookup(CharSequence word) {
* Find the separator byte's position splitting the inflection instructions
* from the tag.
*/
assert prefixBytes <= bbSize : sequenceEncoder.getClass() + " >? " + bbSize;
int sepPos;
for (sepPos = 0; sepPos < bbSize; sepPos++) {
for (sepPos = prefixBytes; sepPos < bbSize; sepPos++) {
if (ba[sepPos] == separator) {
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,14 @@ public interface ISequenceEncoder {
* @return Returns the {@link ByteBuffer} with decoded <code>target</code>.
*/
public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded);

/**
* The number of encoded form's prefix bytes that should be ignored (needed for separator lookup).
* An ugly workaround for GH-85, should be fixed by prior knowledge of whether the dictionary contains tags;
* then we can scan for separator right-to-left.
*
* @see "https://github.com/morfologik/morfologik-stemming/issues/85"
*/
@Deprecated
public int prefixBytes();
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded
return reuse;
}

@Override
public int prefixBytes() {
return 0;
}

@Override
public String toString() {
return getClass().getSimpleName();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
return reuse;
}

@Override
public int prefixBytes() {
return 3;
}

public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) {
assert encoded.remaining() >= 3;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)

return reuse;
}

@Override
public int prefixBytes() {
return 2;
}

public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) {
assert encoded.remaining() >= 2;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,12 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)

return reuse;
}


@Override
public int prefixBytes() {
return 1;
}

public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) {
assert encoded.remaining() >= 1;

Expand Down Expand Up @@ -84,7 +89,7 @@ public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded

return reuse;
}

@Override
public String toString() {
return getClass().getSimpleName();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package morfologik.tools;

import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.LinkedHashSet;
import java.util.Set;

import org.assertj.core.api.Assertions;
import org.junit.Test;

import com.carrotsearch.randomizedtesting.RandomizedTest;

import morfologik.stemming.Dictionary;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.DictionaryMetadata;
import morfologik.stemming.EncoderType;
import morfologik.stemming.WordData;

public class DictCompileBug extends RandomizedTest {
@Test
public void testSeparatorInEncoded() throws Exception {
final Path input = newTempDir().toPath().resolve("dictionary.input");
final Path metadata = DictionaryMetadata.getExpectedMetadataLocation(input);

char separator = '_';
try (Writer writer = Files.newBufferedWriter(metadata, StandardCharsets.UTF_8)) {
DictionaryMetadata.builder()
.separator(separator)
.encoder(EncoderType.SUFFIX)
.encoding(StandardCharsets.UTF_8)
.build()
.write(writer);
}

Set<String> sequences = new LinkedHashSet<>();
for (int seqs = randomIntBetween(0, 100); --seqs >= 0;) {
sequences.add("anfragen_anfragen|VER:1:PLU:KJ1:SFT:NEB");
sequences.add("Anfragen_anfragen|VER:1:PLU:KJ1:SFT:NEB");
}

try (Writer writer = Files.newBufferedWriter(input, StandardCharsets.UTF_8)) {
for (String in : sequences) {
writer.write(in);
writer.write('\n');
}
}

Assertions.assertThat(new DictCompile(input, false, true, false, false, false).call())
.isEqualTo(ExitStatus.SUCCESS);

Path dict = input.resolveSibling("dictionary.dict");
Assertions.assertThat(dict).isRegularFile();

// Verify the dictionary is valid.

DictionaryLookup dictionaryLookup = new DictionaryLookup(Dictionary.read(dict));
for (WordData wd : dictionaryLookup) {
System.out.println(wd);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,21 @@

import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.carrotsearch.randomizedtesting.annotations.Repeat;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;

public class DictCompileTest extends RandomizedTest {
@Test
@Repeat(iterations = 50)
@Repeat(iterations = 200)
public void testRoundTrip() throws Exception {
final Path input = newTempDir().toPath().resolve("dictionary.input");
final Path metadata = DictionaryMetadata.getExpectedMetadataLocation(input);

char separator = '|';
char separator = RandomPicks.randomFrom(getRandom(), new Character [] {
'|',
',',
'\t',
});

try (Writer writer = Files.newBufferedWriter(metadata, StandardCharsets.UTF_8)) {
DictionaryMetadata.builder()
.separator(separator)
Expand All @@ -41,10 +47,44 @@ public void testRoundTrip() throws Exception {

Set<String> sequences = new LinkedHashSet<>();
for (int seqs = randomIntBetween(0, 100); --seqs >= 0;) {
String base;
switch (randomIntBetween(0, 5)) {
case 0:
base = randomAsciiOfLength(('A' - separator) & 0xff);
break;

default:
base = randomAsciiOfLengthBetween(1, 100);
break;
}

String inflected;
switch (randomIntBetween(0, 5)) {
case 0:
inflected = base;
break;

case 1:
inflected = randomAsciiOfLengthBetween(0, 5) + base;
break;

case 3:
inflected = base + randomAsciiOfLengthBetween(0, 5);
break;

case 4:
inflected = randomAsciiOfLengthBetween(0, 5) + base + randomAsciiOfLengthBetween(0, 5);
break;

default:
inflected = randomAsciiOfLengthBetween(0, 200);
break;
}


sequences.add(
randomAsciiOfLengthBetween(1, 10) +
separator +
randomAsciiOfLengthBetween(0, 10) +
base + separator +
inflected +
(useTags ? (separator + randomAsciiOfLengthBetween(0, 10)) : ""));
}

Expand Down

0 comments on commit efc065b

Please sign in to comment.