Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow custom definitions of in-word characters #2

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.neosearch.stringsearcher;

import java.util.Collection;
import java.util.function.Predicate;

/**
* Builder class to create a StringMatcher instance. Several algorithms can be
Expand Down Expand Up @@ -115,6 +116,18 @@ public SimpleStringSearcherBuilder stopOnHit() {
return this;
}

/**
 * Sets the predicate that decides which characters count as part of a word.
 * The predicate is handed unchanged to the wrapped {@code StringSearcherBuilder}
 * via {@code setInWordCharacters}.
 *
 * @param isInWordCharacter predicate returning {@code true} for every character
 *        that is considered an in-word character
 * @return This builder.
 */
public SimpleStringSearcherBuilder setIsInWordCharacter(Predicate<Character> isInWordCharacter) {
    stringSearcherBuilder.setInWordCharacters(isInWordCharacter);
    return this;
}

/**
* Configure the PayloadTrie based on the builder settings.
*
Expand All @@ -123,4 +136,4 @@ public SimpleStringSearcherBuilder stopOnHit() {
public StringSearcher<String> build() {
return this.stringSearcherBuilder.build();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import java.util.LinkedList;
import java.util.Map.Entry;
import java.util.Queue;

import java.util.function.Predicate;
import org.neosearch.stringsearcher.trie.Trie;

/**
Expand Down Expand Up @@ -167,6 +167,17 @@ public StringSearcherBuilder<T> onlyWholeWordsWhiteSpaceSeparated() {
return this;
}

/**
 * Sets the predicate that decides which characters count as part of a word.
 * The predicate is stored in the builder's configuration through
 * {@code setIsInWordCharacter}.
 *
 * @param isInWordCharacter predicate returning {@code true} for every character
 *        that is considered an in-word character
 * @return This builder.
 */
public StringSearcherBuilder<T> setInWordCharacters(Predicate<Character> isInWordCharacter) {
    config.setIsInWordCharacter(isInWordCharacter);
    return this;
}

/**
* Configure the Trie to stop after the first keyword is found in the text.
*
Expand All @@ -193,4 +204,4 @@ public StringSearcher<T> build() {
}
return null;
}
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.neosearch.stringsearcher;

import java.util.function.Predicate;

/**
* Configures options for matching strings.
*
Expand All @@ -11,12 +13,10 @@ public class StringSearcherConfig {

private boolean allowOverlaps = true;

private boolean onlyWholeWords = false;

private boolean onlyWholeWordsWhiteSpaceSeparated = false;

private boolean stopOnHit = false;

private Predicate<Character> isInWordCharacter = null;

/**
* Returns true if the matching should be case insensitive.
*/
Expand All @@ -42,7 +42,7 @@ public boolean isStopOnHit() {

/**
* Configures if the StringSearcher should stop on hit.
* @param stopOnHit true, if the StringSearcher should stop on hit. False otherwise.
* @param stopOnHit true, if the StringSearcher should stop on hit. False otherwise.
*/
public void setStopOnHit(boolean stopOnHit) {
this.stopOnHit = stopOnHit;
Expand All @@ -56,20 +56,25 @@ public void setAllowOverlaps(boolean allowOverlaps) {
this.allowOverlaps = allowOverlaps;
}

public boolean isOnlyWholeWords() {
return onlyWholeWords;
/**
 * Switches whole-word matching on alphabetic boundaries on or off.
 *
 * <p>The in-word predicate is replaced in both cases: with an alphabetic-character
 * test when enabled, with {@code null} when disabled. Any predicate stored earlier
 * is overwritten.
 *
 * @param onlyWholeWords {@code true} to treat alphabetic characters as in-word
 *        characters, {@code false} to clear the in-word predicate
 */
public void setOnlyWholeWords(boolean onlyWholeWords) {
    if (onlyWholeWords) {
        this.isInWordCharacter = ch -> Character.isAlphabetic(ch);
    } else {
        this.isInWordCharacter = null;
    }
}

public void setOnlyWholeWords(boolean onlyWholeWords) {
this.onlyWholeWords = onlyWholeWords;
/**
 * Switches whole-word matching on whitespace boundaries on or off.
 *
 * <p>The in-word predicate is replaced in both cases: with a "not whitespace" test
 * when enabled, with {@code null} when disabled. Any predicate stored earlier is
 * overwritten.
 *
 * @param onlyWholeWordsWhiteSpaceSeparated {@code true} to treat every
 *        non-whitespace character as an in-word character, {@code false} to clear
 *        the in-word predicate
 */
public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) {
    if (onlyWholeWordsWhiteSpaceSeparated) {
        this.isInWordCharacter = ch -> !Character.isWhitespace(ch);
    } else {
        this.isInWordCharacter = null;
    }
}

public boolean isOnlyWholeWordsWhiteSpaceSeparated() {
return onlyWholeWordsWhiteSpaceSeparated;
/**
 * Stores the predicate that decides which characters are in-word characters.
 * The value is assigned as given; {@code null} leaves no predicate configured.
 *
 * @param isInWordCharacter predicate returning true for in-word characters
 */
public void setIsInWordCharacter(Predicate<Character> isInWordCharacter) {
this.isInWordCharacter = isInWordCharacter;
}

public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) {
this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated;
/**
 * Returns the currently configured in-word character predicate.
 *
 * @return the stored predicate, or {@code null} if none is configured
 */
public Predicate<Character> isInWordCharacter() {
    return isInWordCharacter;
}

/**
 * Tells whether an in-word character predicate is configured. This is the case
 * for the alphabetic variant, the whitespace-separated variant and any custom
 * predicate alike, since all of them are stored in the same field.
 *
 * @return {@code true} if an in-word predicate is set, {@code false} otherwise
 */
public boolean isOnlyWholeWords() {
    return this.isInWordCharacter != null;
}

}
31 changes: 3 additions & 28 deletions src/main/java/org/neosearch/stringsearcher/trie/Trie.java
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
package org.neosearch.stringsearcher.trie;

import static java.lang.Character.isWhitespace;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Queue;

import org.neosearch.stringsearcher.Emit;
import org.neosearch.stringsearcher.EmitHandler;
import org.neosearch.stringsearcher.FragmentToken;
Expand Down Expand Up @@ -155,14 +152,9 @@ public Collection<Emit<T>> parseText(final CharSequence text, final StatefulEmit

final List<Emit<T>> collectedEmits = emitHandler.getEmits();

if (trieConfig.isOnlyWholeWords()) {
if (trieConfig.isInWordCharacter() != null) {
removePartialMatches(text, collectedEmits);
}

if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
}

if (!trieConfig.isAllowOverlaps()) {
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
Expand Down Expand Up @@ -258,8 +250,8 @@ public Emit<T> firstMatch(final CharSequence text) {
}

private boolean isPartialMatch(final CharSequence searchText, final Emit<T> emit) {
return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1)))
|| (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
return (emit.getStart() != 0 && trieConfig.isInWordCharacter().test(searchText.charAt(emit.getStart() - 1)))
|| (emit.getEnd() + 1 != searchText.length() && trieConfig.isInWordCharacter().test(searchText.charAt(emit.getEnd() + 1)));
}

private void removePartialMatches(final CharSequence searchText, final List<Emit<T>> collectedEmits) {
Expand All @@ -276,23 +268,6 @@ public boolean remove(Emit<T> emit) {
ListElementRemoval.removeIf(collectedEmits, predicate);
}

/**
 * Removes every emit that is not delimited by whitespace, or by the text
 * boundaries, on both sides.
 *
 * <p>An emit is kept only if it starts at index 0 or is preceded by a whitespace
 * character, and ends at the last index of the text or is followed by a
 * whitespace character. Filtering happens in a single pass over the list instead
 * of collecting the rejects first and removing them one by one.
 *
 * @param searchText the text the emits refer to
 * @param collectedEmits the emits to filter; the list is modified in place
 */
private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText, final List<Emit<T>> collectedEmits) {
    final int textLength = searchText.length();

    // Short-circuit evaluation mirrors the bounds guards: charAt is only called
    // when the neighbouring index is known to lie inside the text.
    collectedEmits.removeIf(emit ->
            !((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1)))
                    && (emit.getEnd() + 1 == textLength
                            || isWhitespace(searchText.charAt(emit.getEnd() + 1)))));
}

private State<T> getState(State<T> currentState, final Character character) {
State<T> newCurrentState = currentState.nextState(character);

Expand Down
61 changes: 61 additions & 0 deletions src/test/java/org/neosearch/stringsearcher/StringBoundaryTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package org.neosearch.stringsearcher;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import java.util.Iterator;
import java.util.function.Predicate;
import org.junit.Test;

/**
 * Tests for whole-word matching with a caller-supplied in-word character
 * predicate, configured through {@code setIsInWordCharacter}.
 */
public class StringBoundaryTest {

    /** Letters, digits, hyphens and underscores are treated as part of a word. */
    private static final Predicate<Character> IN_WORD_CHARACTERS =
            ch -> Character.isAlphabetic(ch) || Character.isDigit(ch) || ch == '-' || ch == '_';

    /** Digits are in-word characters, so "C" must not match inside "C2" or "C3". */
    @Test
    public void testWordBoundariesForNumbers() {
        final String text = "Plida C2 / TELC C2 C3 and the programming language C.";

        StringSearcher<String> searcher = StringSearcher.builder().addSearchString("C")
                .addSearchString("C2").setIsInWordCharacter(IN_WORD_CHARACTERS).build();
        Iterator<Emit<String>> resultIterator = searcher.parseText(text).iterator();
        checkEmit(resultIterator.next(), 6, 7, "C2");
        checkEmit(resultIterator.next(), 16, 17, "C2");
        checkEmit(resultIterator.next(), 51, 51, "C");
        assertFalse("The iterator shouldn't have found more elements", resultIterator.hasNext());
    }

    /** Punctuation such as ':' and ';' is not in-word and therefore delimits keywords. */
    @Test
    public void testWordBoundariesWithPunctuation() {
        StringSearcher<String> searcher =
                StringSearcher.builder().addSearchString("MySQL").addSearchString("MariaDB")
                        .addSearchString("Database").addSearchString("Database Systems")
                        .ignoreOverlaps().setIsInWordCharacter(IN_WORD_CHARACTERS).build();
        Iterator<Emit<String>> resultIterator =
                searcher.parseText("Database Systems: MariaDB;MySQL").iterator();
        checkEmit(resultIterator.next(), 0, 15, "Database Systems");
        checkEmit(resultIterator.next(), 18, 24, "MariaDB");
        checkEmit(resultIterator.next(), 26, 30, "MySQL");
        assertFalse("The iterator shouldn't have found more elements", resultIterator.hasNext());
    }

    /** Keywords may themselves contain hyphens and spaces. */
    @Test
    public void testWordsWithSpacesAndHyphens() {
        StringSearcher<String> searcher = StringSearcher.builder().addSearchString("ER-Models")
                .addSearchString("Database").addSearchString("Database Systems").ignoreOverlaps()
                .setIsInWordCharacter(IN_WORD_CHARACTERS).build();
        Iterator<Emit<String>> resultIterator =
                searcher.parseText("Knowledge of ER-Models and Database Systems:-)").iterator();
        checkEmit(resultIterator.next(), 13, 21, "ER-Models");
        checkEmit(resultIterator.next(), 27, 42, "Database Systems");
        assertFalse("The iterator shouldn't have found more elements", resultIterator.hasNext());
    }

    /**
     * Asserts that the given emit covers the expected range and keyword.
     *
     * @param next the emit to check
     * @param expectedStart expected value of {@code getStart()}
     * @param expectedEnd expected value of {@code getEnd()}
     * @param expectedKeyword expected value of {@code getSearchString()}
     */
    private void checkEmit(Emit<String> next, int expectedStart, int expectedEnd,
            String expectedKeyword) {
        assertEquals("Start of emit should have been " + expectedStart, expectedStart,
                next.getStart());
        assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
        assertEquals("Keyword of emit should be " + expectedKeyword, expectedKeyword,
                next.getSearchString());
    }

}